diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 98071b536a336a..784dce0f11b2b5 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -109,8 +109,6 @@ jobs: if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }} run: | ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel - env: - NODE_OPTIONS: "--max-old-space-size=4096" - name: Gradle compile (jdk8) for legacy Spark if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | @@ -157,4 +155,4 @@ jobs: uses: actions/upload-artifact@v3 with: name: Event File - path: ${{ github.event_path }} \ No newline at end of file + path: ${{ github.event_path }} diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index cf7a2ecfde3025..80e5a9d056b3d4 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -444,8 +444,6 @@ jobs: run: | ./gradlew :datahub-frontend:dist -x test -x yarnTest -x yarnLint --parallel mv ./datahub-frontend/build/distributions/datahub-frontend-*.zip datahub-frontend.zip - env: - NODE_OPTIONS: "--max-old-space-size=4096" - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataTransformLogicMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataTransformLogicMapper.java new file mode 100644 index 00000000000000..04602e7ff6dde9 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataTransformLogicMapper.java @@ -0,0 +1,73 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.DataTransform; +import com.linkedin.datahub.graphql.generated.DataTransformLogic; +import com.linkedin.datahub.graphql.generated.QueryLanguage; +import com.linkedin.datahub.graphql.generated.QueryStatement; +import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +public class DataTransformLogicMapper + implements ModelMapper< + com.linkedin.common.DataTransformLogic, + com.linkedin.datahub.graphql.generated.DataTransformLogic> { + + public static final DataTransformLogicMapper INSTANCE = new DataTransformLogicMapper(); + + public static DataTransformLogic map( + @Nullable final QueryContext context, + @Nonnull final com.linkedin.common.DataTransformLogic input) { + return INSTANCE.apply(context, input); + } + + @Override + public DataTransformLogic apply( + @Nullable final QueryContext context, + @Nonnull final com.linkedin.common.DataTransformLogic input) { + + final DataTransformLogic result = new DataTransformLogic(); + + // Map transforms array using DataTransformMapper + result.setTransforms( + input.getTransforms().stream() + .map(transform -> DataTransformMapper.map(context, transform)) + .collect(Collectors.toList())); + + return result; + } +} + +class DataTransformMapper + implements ModelMapper< + com.linkedin.common.DataTransform, com.linkedin.datahub.graphql.generated.DataTransform> { + + public static final DataTransformMapper INSTANCE = new DataTransformMapper(); + + public static DataTransform map( + @Nullable final QueryContext context, + 
@Nonnull final com.linkedin.common.DataTransform input) { + return INSTANCE.apply(context, input); + } + + @Override + public DataTransform apply( + @Nullable final QueryContext context, + @Nonnull final com.linkedin.common.DataTransform input) { + + final DataTransform result = new DataTransform(); + + // Map query statement if present + if (input.hasQueryStatement()) { + QueryStatement statement = + new QueryStatement( + input.getQueryStatement().getValue(), + QueryLanguage.valueOf(input.getQueryStatement().getLanguage().toString())); + result.setQueryStatement(statement); + } + + return result; + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/QueryPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/QueryPropertiesMapper.java new file mode 100644 index 00000000000000..e29bea5b3943ce --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/QueryPropertiesMapper.java @@ -0,0 +1,61 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import com.linkedin.data.template.GetMode; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.*; +import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.query.QueryProperties; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +public class QueryPropertiesMapper + implements ModelMapper< + QueryProperties, com.linkedin.datahub.graphql.generated.QueryProperties> { + + public static final QueryPropertiesMapper INSTANCE = new QueryPropertiesMapper(); + + public static com.linkedin.datahub.graphql.generated.QueryProperties map( + @Nullable final QueryContext context, @Nonnull final QueryProperties input) { + return INSTANCE.apply(context, input); + } + + @Override + public com.linkedin.datahub.graphql.generated.QueryProperties apply( + @Nullable final QueryContext context, @Nonnull final QueryProperties input) { + + final com.linkedin.datahub.graphql.generated.QueryProperties result = + new com.linkedin.datahub.graphql.generated.QueryProperties(); + + // Map Query Source + result.setSource(QuerySource.valueOf(input.getSource().toString())); + + // Map Query Statement + result.setStatement( + new QueryStatement( + input.getStatement().getValue(), + QueryLanguage.valueOf(input.getStatement().getLanguage().toString()))); + + // Map optional fields + result.setName(input.getName(GetMode.NULL)); + result.setDescription(input.getDescription(GetMode.NULL)); + + // Map origin if present + if (input.hasOrigin() && input.getOrigin() != null) { + result.setOrigin(UrnToEntityMapper.map(context, input.getOrigin())); + } + + // Map created audit stamp + AuditStamp created = new AuditStamp(); + created.setTime(input.getCreated().getTime()); + created.setActor(input.getCreated().getActor(GetMode.NULL).toString()); + result.setCreated(created); + + // Map last modified audit stamp + AuditStamp lastModified = new AuditStamp(); + lastModified.setTime(input.getLastModified().getTime()); + lastModified.setActor(input.getLastModified().getActor(GetMode.NULL).toString()); + result.setLastModified(lastModified); + + return result; + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java index b32832a28d5d57..8d55ca6dbf7ac9 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/DataJobType.java @@ -79,7 +79,8 @@ public class DataJobType BROWSE_PATHS_V2_ASPECT_NAME, SUB_TYPES_ASPECT_NAME, STRUCTURED_PROPERTIES_ASPECT_NAME, - FORMS_ASPECT_NAME); + FORMS_ASPECT_NAME, + DATA_TRANSFORM_LOGIC_ASPECT_NAME); private static final Set FACET_FIELDS = ImmutableSet.of("flow"); private final EntityClient _entityClient; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java index 772871d77f2175..ec57c95ce151e2 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/datajob/mappers/DataJobMapper.java @@ -4,16 +4,7 @@ import static com.linkedin.metadata.Constants.*; import com.google.common.collect.ImmutableList; -import com.linkedin.common.BrowsePathsV2; -import com.linkedin.common.DataPlatformInstance; -import com.linkedin.common.Deprecation; -import com.linkedin.common.Forms; -import com.linkedin.common.GlobalTags; -import com.linkedin.common.GlossaryTerms; -import com.linkedin.common.InstitutionalMemory; -import com.linkedin.common.Ownership; -import com.linkedin.common.Status; -import com.linkedin.common.SubTypes; +import com.linkedin.common.*; import com.linkedin.common.urn.Urn; import com.linkedin.data.DataMap; import com.linkedin.datahub.graphql.QueryContext; @@ -26,15 +17,7 @@ import com.linkedin.datahub.graphql.generated.DataJobProperties; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.EntityType; -import com.linkedin.datahub.graphql.types.common.mappers.BrowsePathsV2Mapper; -import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; -import com.linkedin.datahub.graphql.types.common.mappers.DataPlatformInstanceAspectMapper; -import com.linkedin.datahub.graphql.types.common.mappers.DeprecationMapper; -import com.linkedin.datahub.graphql.types.common.mappers.FineGrainedLineagesMapper; -import com.linkedin.datahub.graphql.types.common.mappers.InstitutionalMemoryMapper; -import com.linkedin.datahub.graphql.types.common.mappers.OwnershipMapper; -import com.linkedin.datahub.graphql.types.common.mappers.StatusMapper; -import com.linkedin.datahub.graphql.types.common.mappers.SubTypesMapper; +import com.linkedin.datahub.graphql.types.common.mappers.*; import com.linkedin.datahub.graphql.types.common.mappers.util.SystemMetadataUtils; import com.linkedin.datahub.graphql.types.domain.DomainAssociationMapper; import com.linkedin.datahub.graphql.types.form.FormsMapper; @@ -139,6 +122,9 @@ public DataJob apply( context, new StructuredProperties(data), entityUrn)); } else if (FORMS_ASPECT_NAME.equals(name)) { result.setForms(FormsMapper.map(new Forms(data), entityUrn.toString())); + } else if (DATA_TRANSFORM_LOGIC_ASPECT_NAME.equals(name)) { + result.setDataTransformLogic( + DataTransformLogicMapper.map(context, new DataTransformLogic(data))); } }); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryMapper.java index e71b569e9ae238..916ebc772f545f 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryMapper.java @@ -5,18 +5,13 @@ import com.linkedin.common.DataPlatformInstance; import com.linkedin.common.urn.Urn; import com.linkedin.data.DataMap; -import com.linkedin.data.template.GetMode; import com.linkedin.datahub.graphql.QueryContext; -import com.linkedin.datahub.graphql.generated.AuditStamp; import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.generated.QueryEntity; -import com.linkedin.datahub.graphql.generated.QueryLanguage; -import com.linkedin.datahub.graphql.generated.QuerySource; -import com.linkedin.datahub.graphql.generated.QueryStatement; import com.linkedin.datahub.graphql.generated.QuerySubject; -import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; +import com.linkedin.datahub.graphql.types.common.mappers.QueryPropertiesMapper; import com.linkedin.datahub.graphql.types.common.mappers.util.MappingHelper; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; import com.linkedin.entity.EntityResponse; @@ -48,7 +43,10 @@ public QueryEntity apply( result.setType(EntityType.QUERY); EnvelopedAspectMap aspectMap = entityResponse.getAspects(); MappingHelper mappingHelper = new MappingHelper<>(aspectMap, result); - mappingHelper.mapToResult(context, QUERY_PROPERTIES_ASPECT_NAME, this::mapQueryProperties); + mappingHelper.mapToResult( + QUERY_PROPERTIES_ASPECT_NAME, + (entity, dataMap) -> + entity.setProperties(QueryPropertiesMapper.map(context, new QueryProperties(dataMap)))); mappingHelper.mapToResult(QUERY_SUBJECTS_ASPECT_NAME, this::mapQuerySubjects); mappingHelper.mapToResult(DATA_PLATFORM_INSTANCE_ASPECT_NAME, this::mapPlatform); return mappingHelper.getResult(); @@ -64,37 +62,6 @@ private void mapPlatform(@Nonnull QueryEntity query, @Nonnull DataMap dataMap) { } } - private void mapQueryProperties( - @Nullable final QueryContext context, @Nonnull QueryEntity query, @Nonnull DataMap dataMap) { - QueryProperties queryProperties = new QueryProperties(dataMap); - com.linkedin.datahub.graphql.generated.QueryProperties res = - new com.linkedin.datahub.graphql.generated.QueryProperties(); - - // Query Source must be kept in sync. 
- res.setSource(QuerySource.valueOf(queryProperties.getSource().toString())); - res.setStatement( - new QueryStatement( - queryProperties.getStatement().getValue(), - QueryLanguage.valueOf(queryProperties.getStatement().getLanguage().toString()))); - res.setName(queryProperties.getName(GetMode.NULL)); - res.setDescription(queryProperties.getDescription(GetMode.NULL)); - if (queryProperties.hasOrigin() && queryProperties.getOrigin() != null) { - res.setOrigin(UrnToEntityMapper.map(context, queryProperties.getOrigin())); - } - - AuditStamp created = new AuditStamp(); - created.setTime(queryProperties.getCreated().getTime()); - created.setActor(queryProperties.getCreated().getActor(GetMode.NULL).toString()); - res.setCreated(created); - - AuditStamp lastModified = new AuditStamp(); - lastModified.setTime(queryProperties.getLastModified().getTime()); - lastModified.setActor(queryProperties.getLastModified().getActor(GetMode.NULL).toString()); - res.setLastModified(lastModified); - - query.setProperties(res); - } - @Nonnull private void mapQuerySubjects(@Nonnull QueryEntity query, @Nonnull DataMap dataMap) { QuerySubjects querySubjects = new QuerySubjects(dataMap); diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 9abf4e16f12dd7..a5cb0893a64fae 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -6569,6 +6569,11 @@ type DataJob implements EntityWithRelationships & Entity & BrowsableEntity { The forms associated with the Dataset """ forms: Forms + + """ + Data Transform Logic associated with the Data Job + """ + dataTransformLogic: DataTransformLogic } """ @@ -6786,6 +6791,26 @@ type DataJobInputOutput { fineGrainedLineages: [FineGrainedLineage!] } +""" +Information about a transformation applied to data assets +""" +type DataTransform { + """ + The transformation may be defined by a query statement + """ + queryStatement: QueryStatement +} + +""" +Information about transformations applied to data assets +""" +type DataTransformLogic { + """ + List of transformations applied + """ + transforms: [DataTransform!]! 
+} + """ Information about individual user usage of a Dataset """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/DataTransformLogicMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/DataTransformLogicMapperTest.java new file mode 100644 index 00000000000000..f94738ff049efb --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/DataTransformLogicMapperTest.java @@ -0,0 +1,103 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; + +import com.linkedin.common.DataTransform; +import com.linkedin.common.DataTransformArray; +import com.linkedin.common.DataTransformLogic; +import com.linkedin.query.QueryLanguage; +import com.linkedin.query.QueryStatement; +import java.util.Arrays; +import org.testng.annotations.Test; + +public class DataTransformLogicMapperTest { + + @Test + public void testMapWithQueryStatement() throws Exception { + // Create test data + DataTransformLogic input = new DataTransformLogic(); + + // Create a transform with query statement + DataTransform transform1 = new DataTransform(); + QueryStatement statement = new QueryStatement(); + statement.setValue("SELECT * FROM source_table"); + statement.setLanguage(QueryLanguage.SQL); + transform1.setQueryStatement(statement); + + // Create another transform + DataTransform transform2 = new DataTransform(); + QueryStatement statement2 = new QueryStatement(); + statement2.setValue("INSERT INTO target_table SELECT * FROM temp_table"); + statement2.setLanguage(QueryLanguage.SQL); + transform2.setQueryStatement(statement2); + + // Set transforms + input.setTransforms(new DataTransformArray(Arrays.asList(transform1, transform2))); + + // Map the object + com.linkedin.datahub.graphql.generated.DataTransformLogic result = + DataTransformLogicMapper.map(null, input); + + // Verify result + assertNotNull(result); + assertEquals(result.getTransforms().size(), 2); + + // Verify first transform + com.linkedin.datahub.graphql.generated.DataTransform resultTransform1 = + result.getTransforms().get(0); + assertNotNull(resultTransform1.getQueryStatement()); + assertEquals(resultTransform1.getQueryStatement().getValue(), "SELECT * FROM source_table"); + assertEquals(resultTransform1.getQueryStatement().getLanguage().toString(), "SQL"); + + // Verify second transform + com.linkedin.datahub.graphql.generated.DataTransform resultTransform2 = + result.getTransforms().get(1); + assertNotNull(resultTransform2.getQueryStatement()); + assertEquals( + resultTransform2.getQueryStatement().getValue(), + "INSERT INTO target_table SELECT * FROM temp_table"); + assertEquals(resultTransform2.getQueryStatement().getLanguage().toString(), "SQL"); + } + + @Test + public void testMapWithoutQueryStatement() throws Exception { + // Create test data + DataTransformLogic input = new DataTransformLogic(); + + // Create a transform without query statement + DataTransform transform = new DataTransform(); + + // Set transforms + input.setTransforms(new DataTransformArray(Arrays.asList(transform))); + + // Map the object + com.linkedin.datahub.graphql.generated.DataTransformLogic result = + DataTransformLogicMapper.map(null, input); + + // Verify result + assertNotNull(result); + assertEquals(result.getTransforms().size(), 1); + + // Verify transform + 
com.linkedin.datahub.graphql.generated.DataTransform resultTransform = + result.getTransforms().get(0); + assertNull(resultTransform.getQueryStatement()); + } + + @Test + public void testMapWithEmptyTransforms() throws Exception { + // Create test data + DataTransformLogic input = new DataTransformLogic(); + input.setTransforms(new DataTransformArray(Arrays.asList())); + + // Map the object + com.linkedin.datahub.graphql.generated.DataTransformLogic result = + DataTransformLogicMapper.map(null, input); + + // Verify result + assertNotNull(result); + assertEquals(result.getTransforms().size(), 0); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/QueryPropertiesMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/QueryPropertiesMapperTest.java new file mode 100644 index 00000000000000..a0251adca78f9d --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/QueryPropertiesMapperTest.java @@ -0,0 +1,117 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; + +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.query.QueryLanguage; +import com.linkedin.query.QueryProperties; +import com.linkedin.query.QuerySource; +import com.linkedin.query.QueryStatement; +import org.testng.annotations.Test; + +public class QueryPropertiesMapperTest { + + @Test + public void testMapWithRequiredFields() throws Exception { + // Create test data + QueryProperties input = new QueryProperties(); + + // Set required fields + QueryStatement statement = new QueryStatement(); + statement.setValue("SELECT * FROM table"); + statement.setLanguage(QueryLanguage.SQL); + input.setStatement(statement); + + input.setSource(QuerySource.MANUAL); + + Urn userUrn = Urn.createFromString("urn:li:corpuser:test"); + + AuditStamp created = new AuditStamp(); + created.setTime(1000L); + created.setActor(userUrn); + input.setCreated(created); + + AuditStamp lastModified = new AuditStamp(); + lastModified.setTime(2000L); + lastModified.setActor(userUrn); + input.setLastModified(lastModified); + + // Map the object + com.linkedin.datahub.graphql.generated.QueryProperties result = + QueryPropertiesMapper.map(null, input); + + // Verify required fields + assertNotNull(result); + assertEquals(result.getSource().toString(), "MANUAL"); + assertEquals(result.getStatement().getValue(), "SELECT * FROM table"); + assertEquals(result.getStatement().getLanguage().toString(), "SQL"); + + // Verify audit stamps + assertEquals(result.getCreated().getTime().longValue(), 1000L); + assertEquals(result.getCreated().getActor(), userUrn.toString()); + assertEquals(result.getLastModified().getTime().longValue(), 2000L); + assertEquals(result.getLastModified().getActor(), userUrn.toString()); + + // Verify optional fields are null + assertNull(result.getName()); + assertNull(result.getDescription()); + assertNull(result.getOrigin()); + } + + @Test + public void testMapWithOptionalFields() throws Exception { + // Create test data + QueryProperties input = new QueryProperties(); + + // Set required fields + QueryStatement statement = new QueryStatement(); + statement.setValue("SELECT * FROM table"); + statement.setLanguage(QueryLanguage.SQL); + input.setStatement(statement); + + input.setSource(QuerySource.SYSTEM); + + 
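+    // Create URNs for the audit stamp actor and the query origin used below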
Urn userUrn = Urn.createFromString("urn:li:corpuser:test"); + Urn originUrn = Urn.createFromString("urn:li:dataset:test"); + + AuditStamp created = new AuditStamp(); + created.setTime(1000L); + created.setActor(userUrn); + input.setCreated(created); + + AuditStamp lastModified = new AuditStamp(); + lastModified.setTime(2000L); + lastModified.setActor(userUrn); + input.setLastModified(lastModified); + + // Set optional fields + input.setName("Test Query"); + input.setDescription("Test Description"); + input.setOrigin(originUrn); + + // Map the object + com.linkedin.datahub.graphql.generated.QueryProperties result = + QueryPropertiesMapper.map(null, input); + + // Verify required fields + assertNotNull(result); + assertEquals(result.getSource().toString(), "SYSTEM"); + assertEquals(result.getStatement().getValue(), "SELECT * FROM table"); + assertEquals(result.getStatement().getLanguage().toString(), "SQL"); + + // Verify audit stamps + assertEquals(result.getCreated().getTime().longValue(), 1000L); + assertEquals(result.getCreated().getActor(), userUrn.toString()); + assertEquals(result.getLastModified().getTime().longValue(), 2000L); + assertEquals(result.getLastModified().getActor(), userUrn.toString()); + + // Verify optional fields + assertEquals(result.getName(), "Test Query"); + assertEquals(result.getDescription(), "Test Description"); + assertNotNull(result.getOrigin()); + assertEquals(result.getOrigin().getUrn(), originUrn.toString()); + } +} diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle index bf1aa401e3f560..3dad778a2b3038 100644 --- a/datahub-web-react/build.gradle +++ b/datahub-web-react/build.gradle @@ -79,7 +79,7 @@ task yarnServe(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { task yarnTest(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { // Explicitly runs in non-watch mode. - args = ['run', 'test', 'run'] + args = ['run', project.hasProperty('withCoverage') ? 'test-coverage' : 'test', 'run'] } task yarnLint(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json index 2d1d667a89f14a..a608698d7602c4 100644 --- a/datahub-web-react/package.json +++ b/datahub-web-react/package.json @@ -89,10 +89,11 @@ "scripts": { "analyze": "source-map-explorer 'dist/assets/*.js'", "start": "yarn run generate && vite", - "ec2-dev": "yarn run generate && CI=true;export CI;vite", - "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=4096 --openssl-legacy-provider' CI=false vite build", - "test": "vitest", - "generate": "graphql-codegen --config codegen.yml", + "ec2-dev": "yarn run generate && CI=true vite", + "build": "yarn run generate && CI=false NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' vite build", + "test": "NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' vitest", + "test-coverage": "yarn test run --coverage", + "generate": "NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' graphql-codegen --config codegen.yml", "lint": "eslint . 
--ext .ts,.tsx --quiet && yarn format-check && yarn type-check", "lint-fix": "eslint '*/**/*.{ts,tsx}' --quiet --fix && yarn format", "format-check": "prettier --check src", @@ -100,7 +101,7 @@ "type-check": "tsc --noEmit", "type-watch": "tsc -w --noEmit", "storybook": "storybook dev -p 6006", - "build-storybook": "storybook build" + "build-storybook": "NODE_OPTIONS='--max-old-space-size=5120 --openssl-legacy-provider' storybook build" }, "browserslist": { "production": [ @@ -135,6 +136,7 @@ "@typescript-eslint/eslint-plugin": "^5.38.1", "@typescript-eslint/parser": "^5.38.1", "@vitejs/plugin-react": "^4.1.1", + "@vitest/coverage-v8": "^0.34.6", "eslint": "^8.2.0", "eslint-config-airbnb": "19.0.4", "eslint-config-airbnb-typescript": "^17.0.0", diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index 73a789030ce6fb..2da9e733eb4072 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -1714,6 +1714,7 @@ export const mlModel = { }, tags: [], properties: { + name: 'trust model', description: 'a ml trust model', date: null, version: '1', diff --git a/datahub-web-react/vite.config.ts b/datahub-web-react/vite.config.ts index 2532b24067754d..c43470dee031a8 100644 --- a/datahub-web-react/vite.config.ts +++ b/datahub-web-react/vite.config.ts @@ -68,6 +68,11 @@ export default defineConfig(({ mode }) => { envPrefix: 'REACT_APP_', build: { outDir: 'dist', + target: 'esnext', + minify: 'esbuild', + reportCompressedSize: false, + // Limit number of worker threads to reduce CPU pressure + workers: 3, // default is number of CPU cores }, server: { open: false, @@ -92,8 +97,11 @@ export default defineConfig(({ mode }) => { css: true, // reporters: ['verbose'], coverage: { + enabled: true, + provider: 'v8', reporter: ['text', 'json', 'html'], include: ['src/**/*'], + reportsDirectory: '../build/coverage-reports/datahub-web-react/', exclude: [], }, }, diff --git a/datahub-web-react/yarn.lock b/datahub-web-react/yarn.lock index dc7260efd183fd..f16e8aa506e2cc 100644 --- a/datahub-web-react/yarn.lock +++ b/datahub-web-react/yarn.lock @@ -20,6 +20,14 @@ "@jridgewell/gen-mapping" "^0.3.0" "@jridgewell/trace-mapping" "^0.3.9" +"@ampproject/remapping@^2.2.1": + version "2.3.0" + resolved "https://registry.yarnpkg.com/@ampproject/remapping/-/remapping-2.3.0.tgz#ed441b6fa600072520ce18b43d2c8cc8caecc7f4" + integrity sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw== + dependencies: + "@jridgewell/gen-mapping" "^0.3.5" + "@jridgewell/trace-mapping" "^0.3.24" + "@analytics/amplitude@0.0.3": version "0.0.3" resolved "https://registry.yarnpkg.com/@analytics/amplitude/-/amplitude-0.0.3.tgz#15ccb76094d6f1003979a4f3aa5d3263781bd776" @@ -1017,6 +1025,11 @@ "@babel/helper-validator-identifier" "^7.22.20" to-fast-properties "^2.0.0" +"@bcoe/v8-coverage@^0.2.3": + version "0.2.3" + resolved "https://registry.yarnpkg.com/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz#75a2e8b51cb758a7553d6804a5932d7aace75c39" + integrity sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw== + "@ctrl/tinycolor@^3.3.1", "@ctrl/tinycolor@^3.4.0": version "3.4.0" resolved "https://registry.yarnpkg.com/@ctrl/tinycolor/-/tinycolor-3.4.0.tgz#c3c5ae543c897caa9c2a68630bed355be5f9990f" @@ -1941,6 +1954,11 @@ resolved "https://registry.yarnpkg.com/@icons/material/-/material-0.2.4.tgz#e90c9f71768b3736e76d7dd6783fc6c2afa88bc8" integrity 
sha512-QPcGmICAPbGLGb6F/yNf/KzKqvFx8z5qx3D1yFqVAjoFmXK35EgyW+cJ57Te3CNsmzblwtzakLGFqHPqrfb4Tw== +"@istanbuljs/schema@^0.1.2": + version "0.1.3" + resolved "https://registry.yarnpkg.com/@istanbuljs/schema/-/schema-0.1.3.tgz#e45e384e4b8ec16bce2fd903af78450f6bf7ec98" + integrity sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA== + "@jest/schemas@^29.6.3": version "29.6.3" resolved "https://registry.yarnpkg.com/@jest/schemas/-/schemas-29.6.3.tgz#430b5ce8a4e0044a7e3819663305a7b3091c8e03" @@ -2009,6 +2027,14 @@ resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz#d7c6e6755c78567a951e04ab52ef0fd26de59f32" integrity sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg== +"@jridgewell/trace-mapping@^0.3.12", "@jridgewell/trace-mapping@^0.3.24", "@jridgewell/trace-mapping@^0.3.25": + version "0.3.25" + resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz#15f190e98895f3fc23276ee14bc76b675c2e50f0" + integrity sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ== + dependencies: + "@jridgewell/resolve-uri" "^3.1.0" + "@jridgewell/sourcemap-codec" "^1.4.14" + "@jridgewell/trace-mapping@^0.3.17", "@jridgewell/trace-mapping@^0.3.9": version "0.3.18" resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.18.tgz#25783b2086daf6ff1dcb53c9249ae480e4dd4cd6" @@ -2017,14 +2043,6 @@ "@jridgewell/resolve-uri" "3.1.0" "@jridgewell/sourcemap-codec" "1.4.14" -"@jridgewell/trace-mapping@^0.3.24", "@jridgewell/trace-mapping@^0.3.25": - version "0.3.25" - resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz#15f190e98895f3fc23276ee14bc76b675c2e50f0" - integrity sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ== - dependencies: - "@jridgewell/resolve-uri" "^3.1.0" - "@jridgewell/sourcemap-codec" "^1.4.14" - "@linaria/core@3.0.0-beta.13": version "3.0.0-beta.13" resolved "https://registry.yarnpkg.com/@linaria/core/-/core-3.0.0-beta.13.tgz#049c5be5faa67e341e413a0f6b641d5d78d91056" @@ -3974,6 +3992,11 @@ "@types/react" "*" hoist-non-react-statics "^3.3.0" +"@types/istanbul-lib-coverage@^2.0.1": + version "2.0.6" + resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.6.tgz#7739c232a1fee9b4d3ce8985f314c0c6d33549d7" + integrity sha512-2QF/t/auWm0lsy8XtKVPG19v3sSOQlJe/YHZgfjb/KBBHOGSV+J2q/S671rcq9uTBrLAXmZpqJiaQbMT+zNU1w== + "@types/js-cookie@^2.2.6": version "2.2.6" resolved "https://registry.yarnpkg.com/@types/js-cookie/-/js-cookie-2.2.6.tgz#f1a1cb35aff47bc5cfb05cb0c441ca91e914c26f" @@ -4714,6 +4737,23 @@ "@types/babel__core" "^7.20.3" react-refresh "^0.14.0" +"@vitest/coverage-v8@^0.34.6": + version "0.34.6" + resolved "https://registry.yarnpkg.com/@vitest/coverage-v8/-/coverage-v8-0.34.6.tgz#931d9223fa738474e00c08f52b84e0f39cedb6d1" + integrity sha512-fivy/OK2d/EsJFoEoxHFEnNGTg+MmdZBAVK9Ka4qhXR2K3J0DS08vcGVwzDtXSuUMabLv4KtPcpSKkcMXFDViw== + dependencies: + "@ampproject/remapping" "^2.2.1" + "@bcoe/v8-coverage" "^0.2.3" + istanbul-lib-coverage "^3.2.0" + istanbul-lib-report "^3.0.1" + istanbul-lib-source-maps "^4.0.1" + istanbul-reports "^3.1.5" + magic-string "^0.30.1" + picocolors "^1.0.0" + std-env "^3.3.3" + test-exclude "^6.0.0" + v8-to-istanbul "^9.1.0" + "@vitest/expect@0.34.6": version "0.34.6" resolved 
"https://registry.yarnpkg.com/@vitest/expect/-/expect-0.34.6.tgz#608a7b7a9aa3de0919db99b4cc087340a03ea77e" @@ -7282,6 +7322,18 @@ glob@^7.0.5, glob@^7.1.1, glob@^7.1.3, glob@^7.1.6: once "^1.3.0" path-is-absolute "^1.0.0" +glob@^7.1.4: + version "7.2.3" + resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b" + integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q== + dependencies: + fs.realpath "^1.0.0" + inflight "^1.0.4" + inherits "2" + minimatch "^3.1.1" + once "^1.3.0" + path-is-absolute "^1.0.0" + globals@^11.1.0: version "11.12.0" resolved "https://registry.yarnpkg.com/globals/-/globals-11.12.0.tgz#ab8795338868a0babd8525758018c2a7eb95c42e" @@ -7612,6 +7664,11 @@ html-encoding-sniffer@^3.0.0: dependencies: whatwg-encoding "^2.0.0" +html-escaper@^2.0.0: + version "2.0.2" + resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" + integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== + html-void-elements@^1.0.0: version "1.0.5" resolved "https://registry.yarnpkg.com/html-void-elements/-/html-void-elements-1.0.5.tgz#ce9159494e86d95e45795b166c2021c2cfca4483" @@ -8154,6 +8211,37 @@ isomorphic.js@^0.2.4: resolved "https://registry.yarnpkg.com/isomorphic.js/-/isomorphic.js-0.2.5.tgz#13eecf36f2dba53e85d355e11bf9d4208c6f7f88" integrity sha512-PIeMbHqMt4DnUP3MA/Flc0HElYjMXArsw1qwJZcm9sqR8mq3l8NYizFMty0pWwE/tzIGH3EKK5+jes5mAr85yw== +istanbul-lib-coverage@^3.0.0, istanbul-lib-coverage@^3.2.0: + version "3.2.2" + resolved "https://registry.yarnpkg.com/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz#2d166c4b0644d43a39f04bf6c2edd1e585f31756" + integrity sha512-O8dpsF+r0WV/8MNRKfnmrtCWhuKjxrq2w+jpzBL5UZKTi2LeVWnWOmWRxFlesJONmc+wLAGvKQZEOanko0LFTg== + +istanbul-lib-report@^3.0.0, istanbul-lib-report@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/istanbul-lib-report/-/istanbul-lib-report-3.0.1.tgz#908305bac9a5bd175ac6a74489eafd0fc2445a7d" + integrity sha512-GCfE1mtsHGOELCU8e/Z7YWzpmybrx/+dSTfLrvY8qRmaY6zXTKWn6WQIjaAFw069icm6GVMNkgu0NzI4iPZUNw== + dependencies: + istanbul-lib-coverage "^3.0.0" + make-dir "^4.0.0" + supports-color "^7.1.0" + +istanbul-lib-source-maps@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/istanbul-lib-source-maps/-/istanbul-lib-source-maps-4.0.1.tgz#895f3a709fcfba34c6de5a42939022f3e4358551" + integrity sha512-n3s8EwkdFIJCG3BPKBYvskgXGoy88ARzvegkitk60NxRdwltLOTaH7CUiMRXvwYorl0Q712iEjcWB+fK/MrWVw== + dependencies: + debug "^4.1.1" + istanbul-lib-coverage "^3.0.0" + source-map "^0.6.1" + +istanbul-reports@^3.1.5: + version "3.1.7" + resolved "https://registry.yarnpkg.com/istanbul-reports/-/istanbul-reports-3.1.7.tgz#daed12b9e1dca518e15c056e1e537e741280fa0b" + integrity sha512-BewmUXImeuRk2YY0PVbxgKAysvhRPUQE0h5QRM++nVWyubKGV0l8qQ5op8+B2DOmwSe63Jivj0BjkPQVf8fP5g== + dependencies: + html-escaper "^2.0.0" + istanbul-lib-report "^3.0.0" + jake@^10.8.5: version "10.8.7" resolved "https://registry.yarnpkg.com/jake/-/jake-10.8.7.tgz#63a32821177940c33f356e0ba44ff9d34e1c7d8f" @@ -8542,6 +8630,13 @@ make-dir@^2.1.0: pify "^4.0.1" semver "^5.6.0" +make-dir@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-4.0.0.tgz#c3c2307a771277cd9638305f915c29ae741b614e" + integrity sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw== + dependencies: + semver "^7.5.3" + 
make-error@^1.3.6: version "1.3.6" resolved "https://registry.yarnpkg.com/make-error/-/make-error-1.3.6.tgz#2eb2e37ea9b67c4891f684a1394799af484cf7a2" @@ -8819,7 +8914,7 @@ min-indent@^1.0.0, min-indent@^1.0.1: resolved "https://registry.yarnpkg.com/min-indent/-/min-indent-1.0.1.tgz#a63f681673b30571fbe8bc25686ae746eefa9869" integrity sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg== -minimatch@3.0.5, minimatch@^3.0.4, minimatch@^3.0.5, minimatch@^3.1.2, minimatch@^4.2.3, minimatch@^5.0.1: +minimatch@3.0.5, minimatch@^3.0.4, minimatch@^3.0.5, minimatch@^3.1.1, minimatch@^3.1.2, minimatch@^4.2.3, minimatch@^5.0.1: version "3.0.5" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.5.tgz#4da8f1290ee0f0f8e83d60ca69f8f134068604a3" integrity sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw== @@ -10891,6 +10986,11 @@ semver@^7.3.7: dependencies: lru-cache "^6.0.0" +semver@^7.5.3, semver@^7.6.2: + version "7.6.3" + resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.3.tgz#980f7b5550bc175fb4dc09403085627f9eb33143" + integrity sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A== + semver@^7.5.4: version "7.5.4" resolved "https://registry.yarnpkg.com/semver/-/semver-7.5.4.tgz#483986ec4ed38e1c6c48c34894a9182dbff68a6e" @@ -10898,11 +10998,6 @@ semver@^7.5.4: dependencies: lru-cache "^6.0.0" -semver@^7.6.2: - version "7.6.3" - resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.3.tgz#980f7b5550bc175fb4dc09403085627f9eb33143" - integrity sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A== - sentence-case@^3.0.4: version "3.0.4" resolved "https://registry.yarnpkg.com/sentence-case/-/sentence-case-3.0.4.tgz#3645a7b8c117c787fde8702056225bb62a45131f" @@ -11381,6 +11476,15 @@ temp@^0.9.4: mkdirp "^0.5.1" rimraf "~2.6.2" +test-exclude@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/test-exclude/-/test-exclude-6.0.0.tgz#04a8698661d805ea6fa293b6cb9e63ac044ef15e" + integrity sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w== + dependencies: + "@istanbuljs/schema" "^0.1.2" + glob "^7.1.4" + minimatch "^3.0.4" + text-table@^0.2.0: version "0.2.0" resolved "https://registry.yarnpkg.com/text-table/-/text-table-0.2.0.tgz#7f5ee823ae805207c00af2df4a84ec3fcfa570b4" @@ -11873,6 +11977,15 @@ uuid@^9.0.0: resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30" integrity sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA== +v8-to-istanbul@^9.1.0: + version "9.3.0" + resolved "https://registry.yarnpkg.com/v8-to-istanbul/-/v8-to-istanbul-9.3.0.tgz#b9572abfa62bd556c16d75fdebc1a411d5ff3175" + integrity sha512-kiGUalWN+rgBJ/1OHZsBtU4rXZOfj/7rKQxULKlIzwzQSvMJUUNgPwJEEh7gU6xEVxC0ahoOBvN2YI8GH6FNgA== + dependencies: + "@jridgewell/trace-mapping" "^0.3.12" + "@types/istanbul-lib-coverage" "^2.0.1" + convert-source-map "^2.0.0" + value-equal@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/value-equal/-/value-equal-1.0.1.tgz#1e0b794c734c5c0cade179c437d356d931a34d6c" diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index c18d8671318f64..bcb06affedcff0 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -780,6 +780,11 @@ module.exports = { label: "Getting Started With GraphQL", id: "docs/api/graphql/getting-started", }, + { + type: 
"doc", + label: "GraphQL Best Practices", + id: "docs/api/graphql/graphql-best-practices", + }, { type: "doc", label: "Access Token Management", diff --git a/docs-website/static/img/hero.png b/docs-website/static/img/hero.png index 44b11c2781143f..a520a6d8a6b6ff 100644 Binary files a/docs-website/static/img/hero.png and b/docs-website/static/img/hero.png differ diff --git a/docs/api/graphql/graphql-best-practices.md b/docs/api/graphql/graphql-best-practices.md new file mode 100644 index 00000000000000..d4c85d52e29f98 --- /dev/null +++ b/docs/api/graphql/graphql-best-practices.md @@ -0,0 +1,1022 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# GraphQL Best Practices + +## Introduction: + +DataHub’s GraphQL API is designed to power the UI. The following guidelines are written with this use-case in mind. + +## General Best Practices + +### Query Optimizations + +> One of GraphQL's biggest advantages over a traditional REST API is its support for **declarative data fetching**. Each component can (and should) query exactly the fields it requires to render, with no superfluous data sent over the network. If instead your root component executes a single, enormous query to obtain data for all of its children, it might query on behalf of components that *aren't even rendered* given the current state. This can result in a delayed response, and it drastically reduces the likelihood that the query's result can be reused by a **server-side response cache**. [[ref](https://www.apolloGraphQL.com/docs/react/data/operation-best-practices#query-only-the-data-you-need-where-you-need-it)] +> +1. Minimize over-fetching by only requesting data needed to be displayed. +2. Limit result counts and use pagination (additionally see section below on `Deep Pagination`). +3. Avoid deeply nested queries and instead break out queries into separate requests for the nested objects. + +### Client-side Caching + +Clients, such as Apollo Client (javascript, python `apollo-client-python`), offer [client-side caching](https://www.apolloGraphQL.com/docs/react/caching/overview) to prevent requests to the service and are able to understand the content of the GraphQL query. This enables more advanced caching vs HTTP response caching. + +### Reuse Pieces of Query Logic with Fragments + +One powerful feature of GraphQL that we recommend you use is [fragments](https://hygraph.com/learn/GraphQL/fragments). Fragments allow you to define pieces of a query that you can reuse across any client-side query that you define. Basically, you can define a set of fields that you want to query, and reuse it in multiple places. + +This technique makes maintaining your GraphQL queries much more doable. For example, if you want to request a new field for an entity type across many queries, you’re able to update it in one place if you’re leveraging fragments. + +## Search Query Best Practices + +### Deep Pagination: search* vs scroll* APIs + +`search*` APIs such as [`searchAcrossEntities`](https://datahubproject.io/docs/GraphQL/queries/#searchacrossentities) are designed for minimal pagination (< ~50). They do not perform well for deep pagination requests. Use the equivalent `scroll*` APIs such as [`scrollAcrossEntities`](https://datahubproject.io/docs/GraphQL/queries/#scrollacrossentities) when expecting the need to paginate deeply into the result set. + +Note: that it is impossible to use `search*` for paginating beyond 10k results. 
+ +#### Examples + +In the following examples we demonstrate pagination for both `scroll*` and `search*` requests. This particular request is searching for two entities, Datasets and Charts, that +contain `pet` in the entities' name or title. The results will only include the URN for the entities. + + + +Page 1 Request: + +```graphql +{ + scrollAcrossEntities( + input: { + types: [DATASET, CHART] + count: 2 + query: "*" + orFilters: [ + { and: [{ field: "name", condition: CONTAIN, values: ["pet"] }] }, + { and: [{ field: "title", condition: CONTAIN, values: ["pet"] }] } + ] + } + ) { + nextScrollId + searchResults { + entity { + ... on Dataset { + urn + } + ... on Chart { + urn + } + } + } + } +} +``` + +Page 1 Result: + +```json +{ + "data": { + "scrollAcrossEntities": { + "nextScrollId": "eyJzb3J0IjpbMi4wNzk2ODc2LCJ1cm46bGk6ZGF0YXNldDoodXJuOmxpOmRhdGFQbGF0Zm9ybTpzbm93Zmxha2UsbG9uZ190YWlsX2NvbXBhbmlvbnMuYWRvcHRpb24ucGV0X3Byb2ZpbGVzLFBST0QpIl0sInBpdElkIjpudWxsLCJleHBpcmF0aW9uVGltZSI6MH0=", + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)" + } + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pet_profiles,PROD)" + } + } + ] + } + }, + "extensions": {} +} +``` + +Page 2 Request: + +```graphql +{ + scrollAcrossEntities( + input: { + scrollId: "eyJzb3J0IjpbMi4wNzk2ODc2LCJ1cm46bGk6ZGF0YXNldDoodXJuOmxpOmRhdGFQbGF0Zm9ybTpzbm93Zmxha2UsbG9uZ190YWlsX2NvbXBhbmlvbnMuYWRvcHRpb24ucGV0X3Byb2ZpbGVzLFBST0QpIl0sInBpdElkIjpudWxsLCJleHBpcmF0aW9uVGltZSI6MH0=" + types: [DATASET, CHART] + count: 2 + query: "*" + orFilters: [ + { and: [{ field: "name", condition: CONTAIN, values: ["pet"] }] }, + { and: [{ field: "title", condition: CONTAIN, values: ["pet"] }] } + ] + } + ) { + nextScrollId + searchResults { + entity { + ... on Dataset { + urn + } + ... on Chart { + urn + } + } + } + } +} +``` + +Page 2 Result: + +```json +{ + "data": { + "scrollAcrossEntities": { + "nextScrollId": "eyJzb3J0IjpbMS43MTg3NSwidXJuOmxpOmRhdGFzZXQ6KHVybjpsaTpkYXRhUGxhdGZvcm06c25vd2ZsYWtlLGxvbmdfdGFpbF9jb21wYW5pb25zLmFkb3B0aW9uLnBldHMsUFJPRCkiXSwicGl0SWQiOm51bGwsImV4cGlyYXRpb25UaW1lIjowfQ==", + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_status_history,PROD)" + } + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pets,PROD)" + } + } + ] + } + }, + "extensions": {} +} +``` + + + +Page 1 Request: + +```graphql +{ + searchAcrossEntities( + input: { + types: [DATASET, CHART] + count: 2, + start: 0 + query: "*" + orFilters: [ + { and: [{ field: "name", condition: CONTAIN, values: ["pet"] }] }, + { and: [{ field: "title", condition: CONTAIN, values: ["pet"] }] } + ] + } + ) { + searchResults { + entity { + ... on Dataset { + urn + } + ... 
on Chart { + urn + } + } + } + } +} +``` + +Page 1 Response: + +```json +{ + "data": { + "searchAcrossEntities": { + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)" + } + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pet_profiles,PROD)" + } + } + ] + } + }, + "extensions": {} +} +``` + +Page 2 Request: + +```graphql +{ + searchAcrossEntities( + input: { + types: [DATASET, CHART] + count: 2, + start: 2 + query: "*" + orFilters: [ + { and: [{ field: "name", condition: CONTAIN, values: ["pet"] }] }, + { and: [{ field: "title", condition: CONTAIN, values: ["pet"] }] } + ] + } + ) { + searchResults { + entity { + ... on Dataset { + urn + } + ... on Chart { + urn + } + } + } + } +} +``` + +Page 2 Response: + +```json +{ + "data": { + "searchAcrossEntities": { + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_status_history,PROD)" + } + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pets,PROD)" + } + } + ] + } + }, + "extensions": {} +} +``` + + + +### SearchFlags: Highlighting and Aggregation + +When performing queries which accept [`searchFlags`](https://datahubproject.io/docs/GraphQL/inputObjects#searchflags) and highlighting and aggregation is not needed, be sure to disable these flags. + +- skipHighlighting: true +- skipAggregates: true + +As a fallback, if only certain fields require highlighting use `customHighlightingFields` to limit highlighting to the specific fields. + + + + +Example for skipping highlighting and aggregates, typically used for scrolling search requests. + +```graphql +{ + scrollAcrossEntities( + input: {types: [DATASET], count: 2, query: "pet", searchFlags: {skipAggregates: true, skipHighlighting: true}} + ) { + searchResults { + entity { + ... on Dataset { + urn + } + } + matchedFields { + name + value + } + } + facets { + displayName + aggregations { + value + count + } + } + } +} +``` + +Response: + +Note that a few `matchedFields` are still returned by default [`urn`, `customProperties`] + +```json +{ + "data": { + "scrollAcrossEntities": { + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)" + }, + "matchedFields": [ + { + "name": "urn", + "value": "" + }, + { + "name": "customProperties", + "value": "" + } + ] + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD)" + }, + "matchedFields": [ + { + "name": "urn", + "value": "" + }, + { + "name": "customProperties", + "value": "" + } + ] + } + ], + "facets": [] + } + }, + "extensions": {} +} +``` + + + + + +Custom highlighting can be used for searchAcrossEntities when only a limited number of fields are useful for highlighting. In this example we specifically request highlighting for `description`. + +```graphql +{ + searchAcrossEntities( + input: {types: [DATASET], count: 2, query: "pet", searchFlags: {customHighlightingFields: ["description"]}} + ) { + searchResults { + entity { + ... 
on Dataset { + urn + } + } + matchedFields { + name + value + } + } + } +} +``` + +Response: + +```json +{ + "data": { + "searchAcrossEntities": { + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)" + }, + "matchedFields": [ + { + "name": "urn", + "value": "" + }, + { + "name": "customProperties", + "value": "" + }, + { + "name": "description", + "value": "Table with all pet-related details" + } + ] + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD)" + }, + "matchedFields": [ + { + "name": "urn", + "value": "" + }, + { + "name": "customProperties", + "value": "" + } + ] + } + ] + } + }, + "extensions": {} +} +``` + + + + + +### Aggregation + +When aggregation is required with `searchAcrossEntities`, it is possible to set the `count` to 0 to avoid fetching the top search hits, only returning the aggregations. Alternatively [aggregateAcrossEntities](https://datahubproject.io/docs/GraphQL/queries#aggregateacrossentities) provides counts and can provide faster results from server-side caching. + +Request: + +```graphql +{ + searchAcrossEntities( + input: {types: [DATASET], count: 0, query: "pet", searchFlags: {skipHighlighting: true}} + ) { + searchResults { + entity { + ... on Dataset { + urn + } + } + matchedFields { + name + value + } + } + facets { + displayName + aggregations { + value + count + } + } + } +} +``` + +Response: + +```json +{ + "data": { + "searchAcrossEntities": { + "searchResults": [], + "facets": [ + { + "displayName": "Container", + "aggregations": [ + { + "value": "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", + "count": 4 + }, + { + "value": "urn:li:container:701919de0ec93cb338fe9bac0b35403c", + "count": 3 + } + ] + }, + { + "displayName": "Sub Type", + "aggregations": [ + { + "value": "table", + "count": 9 + }, + { + "value": "view", + "count": 6 + }, + { + "value": "explore", + "count": 5 + }, + { + "value": "source", + "count": 4 + }, + { + "value": "incremental", + "count": 1 + } + ] + }, + { + "displayName": "Type", + "aggregations": [ + { + "value": "DATASET", + "count": 24 + } + ] + }, + { + "displayName": "Environment", + "aggregations": [ + { + "value": "PROD", + "count": 24 + } + ] + }, + { + "displayName": "Glossary Term", + "aggregations": [ + { + "value": "urn:li:glossaryTerm:Adoption.DaysInStatus", + "count": 1 + }, + { + "value": "urn:li:glossaryTerm:Ecommerce.HighRisk", + "count": 1 + }, + { + "value": "urn:li:glossaryTerm:Classification.Confidential", + "count": 1 + } + ] + }, + { + "displayName": "Domain", + "aggregations": [ + { + "value": "urn:li:domain:094dc54b-0ebc-40a6-a4cf-e1b75e8b8089", + "count": 6 + }, + { + "value": "urn:li:domain:7d64d0fa-66c3-445c-83db-3a324723daf8", + "count": 2 + } + ] + }, + { + "displayName": "Owned By", + "aggregations": [ + { + "value": "urn:li:corpGroup:Adoption", + "count": 5 + }, + { + "value": "urn:li:corpuser:shannon@longtail.com", + "count": 4 + }, + { + "value": "urn:li:corpuser:admin", + "count": 2 + }, + { + "value": "urn:li:corpGroup:Analytics Engineering", + "count": 2 + }, + { + "value": "urn:li:corpuser:avigdor@longtail.com", + "count": 1 + }, + { + "value": "urn:li:corpuser:prentiss@longtail.com", + "count": 1 + }, + { + "value": "urn:li:corpuser:tasha@longtail.com", + "count": 1 + }, + { + "value": "urn:li:corpuser:ricca@longtail.com", + "count": 1 + }, + { + "value": "urn:li:corpuser:emilee@longtail.com", + "count": 1 + } + ] + }, + { 
+ "displayName": "Platform", + "aggregations": [ + { + "value": "urn:li:dataPlatform:looker", + "count": 8 + }, + { + "value": "urn:li:dataPlatform:dbt", + "count": 7 + }, + { + "value": "urn:li:dataPlatform:snowflake", + "count": 7 + }, + { + "value": "urn:li:dataPlatform:s3", + "count": 1 + }, + { + "value": "urn:li:dataPlatform:mongodb", + "count": 1 + } + ] + }, + { + "displayName": "Tag", + "aggregations": [ + { + "value": "urn:li:tag:prod_model", + "count": 3 + }, + { + "value": "urn:li:tag:pii", + "count": 2 + }, + { + "value": "urn:li:tag:business critical", + "count": 2 + }, + { + "value": "urn:li:tag:business_critical", + "count": 2 + }, + { + "value": "urn:li:tag:Tier1", + "count": 1 + }, + { + "value": "urn:li:tag:prod", + "count": 1 + } + ] + }, + { + "displayName": "Type", + "aggregations": [ + { + "value": "DATASET", + "count": 24 + } + ] + } + ] + } + }, + "extensions": {} +} +``` + +### Limit Search Entity Types + +When querying for specific entities, enumerate only the entity types required using `types` , for example [`DATASET` , `CHART`] + +### Limit Results + +Limit search results based on the amount of information being requested. For example, a minimal number of attributes can fetch 1,000 - 2,000 results in a single page, however as the number of attributes increases (especially nested objects) the `count` should be lowered, 20-25 for very complex requests. + +## Lineage Query Best Practices + +There are two primary ways to query lineage: + +### Search Across Lineage + +`searchAcrossLineage` / `scrollAcrossLineage` root query: + +- Recommended for all lineage queries +- Only the shortest path is guaranteed to show up in `paths` +- Supports querying indirect lineage (depth > 1) + - Depending on the fanout of the lineage, 3+ hops may not return data, use 1-hop queries for the fastest response times. + - Specify using a filter with name `"degree"` and values `"1"` , `"2"`, and / or `"3+"` + +The following examples are demonstrated using sample data for `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`. + +
+<!-- Screenshot: lineage graph around SampleHiveDataset used in the examples below -->
+ + + + +The following example queries show UPSTREAM lineage with progressively higher degrees, first with degree `["1"]` and then `["1","2"]`. + +1-Hop Upstreams: + +Request: + +```graphql +{ + searchAcrossLineage( + input: {urn: "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", query: "*", count: 10, start: 0, direction: UPSTREAM, orFilters: [{and: [{field: "degree", condition: EQUAL, values: ["1"]}]}], searchFlags: {skipAggregates: true, skipHighlighting: true}} + ) { + start + count + total + searchResults { + entity { + urn + type + ... on Dataset { + name + } + } + paths { + path { + ... on Dataset { + urn + } + } + } + degree + } + } +} +``` + +Response: + +```json +{ + "data": { + "searchAcrossLineage": { + "start": 0, + "count": 10, + "total": 1, + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", + "type": "DATASET", + "name": "SampleHdfsDataset" + }, + "paths": [ + { + "path": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)" + } + ] + } + ], + "degree": 1 + } + ] + } + }, + "extensions": {} +} +``` + + + +1-Hop & 2-Hop Upstreams: + +Request: + +```graphql +{ + searchAcrossLineage( + input: {urn: "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", query: "*", count: 10, start: 0, direction: UPSTREAM, orFilters: [{and: [{field: "degree", condition: EQUAL, values: ["1","2"]}]}], searchFlags: {skipAggregates: true, skipHighlighting: true}} + ) { + start + count + total + searchResults { + entity { + urn + type + ... on Dataset { + name + } + } + paths { + path { + ... on Dataset { + urn + } + } + } + degree + } + } +} +``` + +```json +{ + "data": { + "searchAcrossLineage": { + "start": 0, + "count": 10, + "total": 2, + "searchResults": [ + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", + "type": "DATASET", + "name": "SampleHdfsDataset" + }, + "paths": [ + { + "path": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)" + } + ] + } + ], + "degree": 1 + }, + { + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", + "type": "DATASET", + "name": "SampleKafkaDataset" + }, + "paths": [ + { + "path": [ + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)" + }, + { + "urn": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)" + } + ] + } + ], + "degree": 2 + } + ] + } + }, + "extensions": {} +} +``` + + + + +### Lineage Subquery + +The previous query requires a root or starting node in the lineage graph. The following request offers a way to request lineage for multiple nodes at once with a few limitations. + +`lineage` query on `EntityWithRelationship` entities: + +- A more direct reflection of the graph index +- 1-hop lineage only +- Multiple URNs +- Should not be requested too many times in a single request. 20 is a tested limit + +The following examples are based on the sample lineage graph shown here: + +

+_[Image: sample lineage graph for the `dag_abc` Airflow DAG, showing `task_123` and `task_456` with their upstream and downstream Hive datasets.]_
+
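+Because each bulk request should cover only a limited number of URNs (roughly 20 is the tested limit), larger URN lists can be chunked client-side before issuing the request shown below. A minimal sketch, assuming the same illustrative endpoint and token as above and 1-hop upstream lineage on data jobs:
+
+```python
+# Minimal sketch: batch URNs so each entities(urns: ...) lineage request stays
+# within the ~20-URN tested limit. GMS_URL and TOKEN are illustrative placeholders.
+import requests
+
+GMS_URL = "http://localhost:8080"  # assumption: local deployment
+TOKEN = "<personal-access-token>"  # assumption: replace with a real token
+
+BULK_LINEAGE_QUERY = """
+query bulkLineage($urns: [String!]!) {
+  entities(urns: $urns) {
+    urn
+    type
+    ... on DataJob {
+      jobId
+      upstream: lineage(input: {direction: UPSTREAM, start: 0, count: 10}) {
+        total
+        relationships { type entity { urn type } }
+      }
+    }
+  }
+}
+"""
+
+
+def bulk_upstream_lineage(urns, batch_size=20):
+    """Yield the entities payload for each batch of at most batch_size URNs."""
+    for start in range(0, len(urns), batch_size):
+        batch = urns[start : start + batch_size]
+        response = requests.post(
+            f"{GMS_URL}/api/graphql",
+            headers={"Authorization": f"Bearer {TOKEN}"},
+            json={"query": BULK_LINEAGE_QUERY, "variables": {"urns": batch}},
+        )
+        response.raise_for_status()
+        yield from response.json()["data"]["entities"]
+```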
+ +Example Request: + +```graphql +query getBulkEntityLineageV2($urns: [String!]! = ["urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)"]) { + entities(urns: $urns) { + urn + type + ... on DataJob { + jobId + dataFlow { + flowId + } + properties { + name + } + upstream: lineage(input: {direction: UPSTREAM, start: 0, count: 10}) { + total + relationships { + type + entity { + urn + type + } + } + } + downstream: lineage(input: {direction: DOWNSTREAM, start: 0, count: 10}) { + total + relationships { + type + entity { + urn + type + } + } + } + } + } +} +``` + +Example Response: + +```json +{ + "data": { + "entities": [ + { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", + "type": "DATA_JOB", + "jobId": "task_123", + "dataFlow": { + "flowId": "dag_abc" + }, + "properties": { + "name": "User Creations" + }, + "upstream": { + "total": 1, + "relationships": [ + { + "type": "Consumes", + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "type": "DATASET" + } + } + ] + }, + "downstream": { + "total": 2, + "relationships": [ + { + "type": "DownstreamOf", + "entity": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", + "type": "DATA_JOB" + } + }, + { + "type": "Produces", + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)", + "type": "DATASET" + } + } + ] + } + }, + { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", + "type": "DATA_JOB", + "jobId": "task_456", + "dataFlow": { + "flowId": "dag_abc" + }, + "properties": { + "name": "User Deletions" + }, + "upstream": { + "total": 2, + "relationships": [ + { + "type": "DownstreamOf", + "entity": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", + "type": "DATA_JOB" + } + }, + { + "type": "Consumes", + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "type": "DATASET" + } + } + ] + }, + "downstream": { + "total": 1, + "relationships": [ + { + "type": "Produces", + "entity": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)", + "type": "DATASET" + } + } + ] + } + } + ] + }, + "extensions": {} +} +``` \ No newline at end of file diff --git a/docs/api/graphql/how-to-set-up-graphql.md b/docs/api/graphql/how-to-set-up-graphql.md index 2be2f935b12b10..8a6db7f6c105a2 100644 --- a/docs/api/graphql/how-to-set-up-graphql.md +++ b/docs/api/graphql/how-to-set-up-graphql.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # How To Set Up GraphQL ## Preparing Local Datahub Deployment @@ -29,6 +32,7 @@ DataHub provides a browser-based GraphQL Explorer Tool ([GraphiQL](https://githu This interface allows you to easily craft queries and mutations against real metadata stored in your live DataHub deployment. To experiment with GraphiQL before deploying it in your live DataHub deployment, you can access a demo site provided by DataHub at https://demo.datahubproject.io/api/graphiql. + For instance, you can create a tag by posting the following query: ```json @@ -43,6 +47,29 @@ mutation createTag { For a detailed usage guide, check out [How to use GraphiQL](https://www.gatsbyjs.com/docs/how-to/querying-data/running-queries-with-graphiql/). +To navigate to `GraphiQL` on the demo site or your local instance, select `GraphiQL` from the user profile drop-down menu as +shown below. + + + +

+_[Screenshot: selecting `GraphiQL` from the user profile drop-down menu.]_
+
+This link will then display the following interface for exploring GraphQL queries.
+
+_[Screenshot: the GraphiQL query explorer interface.]_
+
+ ### CURL CURL is a command-line tool used for transferring data using various protocols including HTTP, HTTPS, and others. diff --git a/docs/developers.md b/docs/developers.md index 401169490dd4b6..0d398e6232b23d 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -9,6 +9,7 @@ title: "Local Development" - [Java 17 JDK](https://openjdk.org/projects/jdk/17/) - [Python 3.10](https://www.python.org/downloads/release/python-3100/) - [Docker](https://www.docker.com/) +- [Node 22.x](https://nodejs.org/en/about/previous-releases) - [Docker Compose >=2.20](https://docs.docker.com/compose/) - Docker engine with at least 8GB of memory to run tests. diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index 09f873ebf7bc96..42080e4e17596e 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -127,6 +127,7 @@ public class Constants { public static final String EMBED_ASPECT_NAME = "embed"; public static final String INCIDENTS_SUMMARY_ASPECT_NAME = "incidentsSummary"; public static final String DOCUMENTATION_ASPECT_NAME = "documentation"; + public static final String DATA_TRANSFORM_LOGIC_ASPECT_NAME = "dataTransformLogic"; // User public static final String CORP_USER_KEY_ASPECT_NAME = "corpUserKey"; diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py index 28def68ccf3f55..c143a8b49f4b7c 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py @@ -146,12 +146,55 @@ def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> Non aspect_value=source_info_aspect, ) + @staticmethod + def _convert_sets_to_lists(obj: Any) -> Any: + """ + Recursively converts all sets to lists in a Python object. + Works with nested dictionaries, lists, and sets. 
+ + Args: + obj: Any Python object that might contain sets + + Returns: + The object with all sets converted to lists + """ + if isinstance(obj, dict): + return { + key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value) + for key, value in obj.items() + } + elif isinstance(obj, list): + return [ + DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element) + for element in obj + ] + elif isinstance(obj, set): + return [ + DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element) + for element in obj + ] + elif isinstance(obj, tuple): + return tuple( + DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element) + for element in obj + ) + else: + return obj + def _get_recipe_to_report(self, ctx: PipelineContext) -> str: assert ctx.pipeline_config if not self.report_recipe or not ctx.pipeline_config.get_raw_dict(): return "" else: - return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict())) + redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict()) + # This is required otherwise json dumps will fail + # with a TypeError: Object of type set is not JSON serializable + converted_recipe = ( + DatahubIngestionRunSummaryProvider._convert_sets_to_lists( + redacted_recipe + ) + ) + return json.dumps(converted_recipe) def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None: self.sink.write_record_async( diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 3f7a1fc453bcdb..64c1a0ad0bfbad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -167,7 +167,7 @@ class DataJobEntity: class DataProcessCleanupReport(SourceReport): num_aspects_removed: int = 0 num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict) - sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field( + sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field( default_factory=TopKDict ) num_data_flows_found: int = 0 @@ -286,9 +286,9 @@ def delete_entity(self, urn: str, type: str) -> None: self.report.num_aspect_removed_by_type[type] = ( self.report.num_aspect_removed_by_type.get(type, 0) + 1 ) - if type not in self.report.sample_removed_aspects_by_type: - self.report.sample_removed_aspects_by_type[type] = LossyList() - self.report.sample_removed_aspects_by_type[type].append(urn) + if type not in self.report.sample_soft_deleted_aspects_by_type: + self.report.sample_soft_deleted_aspects_by_type[type] = LossyList() + self.report.sample_soft_deleted_aspects_by_type[type].append(urn) if self.dry_run: logger.info( diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index 93f004ab675edc..4c0355834f9b4f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -1,9 +1,10 @@ import logging import time -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait from dataclasses import dataclass, field from datetime import datetime, timezone -from typing import List, Optional +from threading import Lock +from typing import Dict, Iterable, List, 
Optional from pydantic import Field @@ -18,12 +19,28 @@ logger = logging.getLogger(__name__) +QUERY_QUERY_ENTITY = """ +query listQueries($input: ScrollAcrossEntitiesInput!) { + scrollAcrossEntities(input: $input) { + nextScrollId + count + searchResults { + entity { + ... on QueryEntity { + urn + } + } + } + } +} +""" + class SoftDeletedEntitiesCleanupConfig(ConfigModel): enabled: bool = Field( default=True, description="Whether to do soft deletion cleanup." ) - retention_days: Optional[int] = Field( + retention_days: int = Field( 10, description="Number of days to retain metadata in DataHub", ) @@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel): default=None, description="Query to filter entities", ) + limit_entities_delete: Optional[int] = Field( 25000, description="Max number of entities to delete." ) - runtime_limit_seconds: Optional[int] = Field( - None, + futures_max_at_time: int = Field( + 1000, description="Max number of futures to have at a time." + ) + + runtime_limit_seconds: int = Field( + 7200, # 2 hours by default description="Runtime limit in seconds", ) @dataclass class SoftDeletedEntitiesReport(SourceReport): - num_soft_deleted_entity_removed: int = 0 - num_soft_deleted_entity_removed_by_type: TopKDict[str, int] = field( - default_factory=TopKDict - ) - sample_soft_deleted_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field( + num_queries_found: int = 0 + num_soft_deleted_entity_processed: int = 0 + num_soft_deleted_retained_due_to_age: int = 0 + num_soft_deleted_entity_removal_started: int = 0 + num_hard_deleted: int = 0 + num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict) + sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field( default_factory=TopKDict ) @@ -103,48 +127,53 @@ def __init__( self.config = config self.report = report self.dry_run = dry_run + self.start_time = 0.0 + self._report_lock: Lock = Lock() + self.last_print_time = 0.0 + + def _increment_retained_count(self) -> None: + """Thread-safe method to update report fields""" + with self._report_lock: + self.report.num_soft_deleted_retained_due_to_age += 1 + + def _increment_removal_started_count(self) -> None: + """Thread-safe method to update report fields""" + with self._report_lock: + self.report.num_soft_deleted_entity_removal_started += 1 + + def _update_report(self, urn: str, entity_type: str) -> None: + """Thread-safe method to update report fields""" + with self._report_lock: + self.report.num_hard_deleted += 1 + + current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0) + self.report.num_hard_deleted_by_type[entity_type] = current_count + 1 + if entity_type not in self.report.sample_hard_deleted_aspects_by_type: + self.report.sample_hard_deleted_aspects_by_type[ + entity_type + ] = LossyList() + self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn) def delete_entity(self, urn: str) -> None: assert self.ctx.graph entity_urn = Urn.from_string(urn) - self.report.num_soft_deleted_entity_removed += 1 - self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = ( - self.report.num_soft_deleted_entity_removed_by_type.get( - entity_urn.entity_type, 0 - ) - + 1 - ) - if ( - entity_urn.entity_type - not in self.report.sample_soft_deleted_removed_aspects_by_type - ): - self.report.sample_soft_deleted_removed_aspects_by_type[ - entity_urn.entity_type - ] = LossyList() - self.report.sample_soft_deleted_removed_aspects_by_type[ - entity_urn.entity_type - ].append(urn) - if 
self.dry_run: logger.info( f"Dry run is on otherwise it would have deleted {urn} with hard deletion" ) return - + self._increment_removal_started_count() self.ctx.graph.delete_entity(urn=urn, hard=True) self.ctx.graph.delete_references_to_urn( urn=urn, dry_run=False, ) + self._update_report(urn, entity_urn.entity_type) def delete_soft_deleted_entity(self, urn: str) -> None: assert self.ctx.graph - if self.config.retention_days is None: - logger.info("Retention days is not set, skipping soft delete cleanup") - return - retention_time = ( int(datetime.now(timezone.utc).timestamp()) - self.config.retention_days * 24 * 60 * 60 @@ -157,15 +186,85 @@ def delete_soft_deleted_entity(self, urn: str) -> None: ]["created"]["time"] < (retention_time * 1000): logger.debug(f"Hard deleting {urn}") self.delete_entity(urn) + else: + self._increment_retained_count() + + def _print_report(self) -> None: + time_taken = round(time.time() - self.last_print_time, 1) + # Print report every 2 minutes + if time_taken > 120: + self.last_print_time = time.time() + logger.info(f"\n{self.report.as_string()}") + + def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]: + done, not_done = wait(futures, return_when=FIRST_COMPLETED) + futures = {future: urn for future, urn in futures.items() if future in not_done} + + for future in done: + self._print_report() + if future.exception(): + logger.error( + f"Failed to delete entity {futures[future]}: {future.exception()}" + ) + self.report.failure( + f"Failed to delete entity {futures[future]}", + exc=future.exception(), + ) + self.report.num_soft_deleted_entity_processed += 1 + if ( + self.report.num_soft_deleted_entity_processed % self.config.batch_size + == 0 + ): + if self.config.delay: + logger.debug( + f"Sleeping for {self.config.delay} seconds before further processing batch" + ) + time.sleep(self.config.delay) + return futures - def cleanup_soft_deleted_entities(self) -> None: - if not self.config.enabled: - return + def _get_soft_deleted_queries(self) -> Iterable[str]: assert self.ctx.graph - start_time = time.time() - - deleted_count_retention = 0 - urns = self.ctx.graph.get_urns_by_filter( + scroll_id: Optional[str] = None + while True: + try: + result = self.ctx.graph.execute_graphql( + QUERY_QUERY_ENTITY, + { + "input": { + "types": ["QUERY"], + "query": "*", + "scrollId": scroll_id if scroll_id else None, + "count": self.config.batch_size, + "orFilters": [ + { + "and": [ + { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + } + ] + } + ], + } + }, + ) + except Exception as e: + self.report.failure( + f"While trying to get queries with {scroll_id}", exc=e + ) + break + scroll_across_entities = result.get("scrollAcrossEntities") + if not scroll_across_entities: + break + scroll_id = scroll_across_entities.get("nextScrollId") + self.report.num_queries_found += scroll_across_entities.get("count") + for query in scroll_across_entities.get("searchResults"): + yield query["entity"]["urn"] + + def _get_urns(self) -> Iterable[str]: + assert self.ctx.graph + yield from self.ctx.graph.get_urns_by_filter( entity_types=self.config.entity_types, platform=self.config.platform, env=self.config.env, @@ -173,52 +272,41 @@ def cleanup_soft_deleted_entities(self) -> None: status=RemovedStatusFilter.ONLY_SOFT_DELETED, batch_size=self.config.batch_size, ) + yield from self._get_soft_deleted_queries() + + def cleanup_soft_deleted_entities(self) -> None: + if not self.config.enabled: + return + self.start_time = time.time() - futures = {} + 
futures: Dict[Future, str] = dict() with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor: - num_urns_submitted = 0 - for urn in urns: - num_urns_submitted += 1 + for urn in self._get_urns(): + self._print_report() + while len(futures) >= self.config.futures_max_at_time: + futures = self._process_futures(futures) if ( self.config.limit_entities_delete - and num_urns_submitted > self.config.limit_entities_delete + and self.report.num_hard_deleted > self.config.limit_entities_delete ): logger.info( - f"Limit of {self.config.limit_entities_delete} entities reached. Stopping" + f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more." ) break if ( self.config.runtime_limit_seconds - and time.time() - start_time > self.config.runtime_limit_seconds + and time.time() - self.start_time + > self.config.runtime_limit_seconds ): logger.info( - f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping" + f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures." ) break future = executor.submit(self.delete_soft_deleted_entity, urn) futures[future] = urn - if not futures: - return - for future in as_completed(futures): - if future.exception(): - logger.error( - f"Failed to delete entity {futures[future]}: {future.exception()}" - ) - self.report.failure( - f"Failed to delete entity {futures[future]}", - exc=future.exception(), - ) - deleted_count_retention += 1 - - if deleted_count_retention % self.config.batch_size == 0: - logger.info( - f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far" - ) - - if self.config.delay: - logger.debug( - f"Sleeping for {self.config.delay} seconds before getting next batch" - ) - time.sleep(self.config.delay) + logger.info(f"Waiting for {len(futures)} futures to complete") + while len(futures) > 0: + self._print_report() + futures = self._process_futures(futures) diff --git a/metadata-ingestion/tests/unit/reporting/test_datahub_ingestion_reporter.py b/metadata-ingestion/tests/unit/reporting/test_datahub_ingestion_reporter.py index 749ea03a7f20a8..2ab6208e2dcc68 100644 --- a/metadata-ingestion/tests/unit/reporting/test_datahub_ingestion_reporter.py +++ b/metadata-ingestion/tests/unit/reporting/test_datahub_ingestion_reporter.py @@ -1,3 +1,5 @@ +from typing import Any, Dict, List, Set, Tuple, Union + import pytest from datahub.ingestion.reporting.datahub_ingestion_run_summary_provider import ( @@ -50,3 +52,136 @@ def test_default_config(): typed_config = DatahubIngestionRunSummaryProviderConfig.parse_obj({}) assert typed_config.sink is None assert typed_config.report_recipe is True + + +def test_simple_set() -> None: + """Test conversion of a simple set""" + input_data: Set[int] = {1, 2, 3} + expected: List[int] = [1, 2, 3] + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + assert sorted(result) == sorted(expected) + assert isinstance(result, list) + + +def test_nested_dict_with_sets() -> None: + """Test conversion of nested dictionary containing sets""" + input_data: Dict[str, Union[Set[int], Dict[str, Set[str]]]] = { + "set1": {1, 2, 3}, + "dict1": {"set2": {"a", "b"}}, + } + expected = { + "set1": [1, 2, 3], + "dict1": {"set2": ["a", "b"]}, + } + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + + def sort_nested_lists(d): + return { + k: ( + sorted(v) + if isinstance(v, list) + else (sort_nested_lists(v) if 
isinstance(v, dict) else v) + ) + for k, v in d.items() + } + + assert sort_nested_lists(result) == sort_nested_lists(expected) + + +def test_nested_lists_with_sets() -> None: + """Test conversion of nested lists containing sets""" + input_data = [{1, 2}, [{3, 4}, {5, 6}]] + expected = [[1, 2], [[3, 4], [5, 6]]] + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + assert [ + sorted(x) + if isinstance(x, list) and len(x) > 0 and not isinstance(x[0], list) + else x + for x in result + ] == [ + sorted(x) + if isinstance(x, list) and len(x) > 0 and not isinstance(x[0], list) + else x + for x in expected + ] + + +def test_tuple_with_sets() -> None: + """Test conversion of tuples containing sets""" + input_data = (1, {2, 3}, 4) + expected = (1, [2, 3], 4) + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + assert (result[0], sorted(result[1]), result[2]) == ( + expected[0], + sorted(expected[1]), + expected[2], + ) + assert isinstance(result, tuple) + + +def test_mixed_nested_structure() -> None: + """Test conversion of a complex nested structure""" + input_data = { + "simple_set": {1, 2, 3}, + "nested_dict": { + "another_set": {"a", "b", "c"}, + "mixed_list": [1, {2, 3}, {"x", "y"}], + }, + "tuple_with_set": (1, {4, 5}, 6), + "list_of_sets": [{1, 2}, {3, 4}], + } + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + + # Verify structure types + assert isinstance(result["simple_set"], list) + assert isinstance(result["nested_dict"]["another_set"], list) + assert isinstance(result["nested_dict"]["mixed_list"][1], list) + assert isinstance(result["nested_dict"]["mixed_list"][2], list) + assert isinstance(result["tuple_with_set"], tuple) + assert isinstance(result["tuple_with_set"][1], list) + assert isinstance(result["list_of_sets"][0], list) + + +def test_non_set_data() -> None: + """Test that non-set data remains unchanged""" + input_data = { + "string": "hello", + "int": 42, + "float": 3.14, + "bool": True, + "none": None, + "list": [1, 2, 3], + "dict": {"a": 1, "b": 2}, + } + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + assert result == input_data + + +def test_empty_structures() -> None: + """Test handling of empty structures""" + input_data: Dict[ + str, Union[Set[Any], Dict[Any, Any], List[Any], Tuple[Any, ...]] + ] = {"empty_set": set(), "empty_dict": {}, "empty_list": [], "empty_tuple": ()} + expected: Dict[ + str, Union[List[Any], Dict[Any, Any], List[Any], Tuple[Any, ...]] + ] = {"empty_set": [], "empty_dict": {}, "empty_list": [], "empty_tuple": ()} + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + assert result == expected + + +def test_json_serializable() -> None: + """Test that the converted structure is JSON serializable""" + import json + + input_data = { + "set": {1, 2, 3}, + "nested": {"set": {"a", "b"}}, + "mixed": [1, {2, 3}, {"x"}], + } + result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data) + try: + json.dumps(result) + serializable = True + except TypeError: + serializable = False + assert serializable diff --git a/metadata-ingestion/tests/unit/test_aws_common.py b/metadata-ingestion/tests/unit/test_aws_common.py index 9291fb91134b1c..dd1f06cf9bfd55 100644 --- a/metadata-ingestion/tests/unit/test_aws_common.py +++ b/metadata-ingestion/tests/unit/test_aws_common.py @@ -17,6 +17,14 @@ ) +@pytest.fixture +def mock_disable_ec2_metadata(): + """Disable EC2 metadata detection""" + with 
patch("requests.put") as mock_put: + mock_put.return_value.status_code = 404 + yield mock_put + + @pytest.fixture def mock_aws_config(): return AwsConnectionConfig( @@ -27,17 +35,19 @@ def mock_aws_config(): class TestAwsCommon: - def test_environment_detection_no_environment(self): + def test_environment_detection_no_environment(self, mock_disable_ec2_metadata): """Test environment detection when no AWS environment is present""" with patch.dict(os.environ, {}, clear=True): assert detect_aws_environment() == AwsEnvironment.UNKNOWN - def test_environment_detection_lambda(self): + def test_environment_detection_lambda(self, mock_disable_ec2_metadata): """Test Lambda environment detection""" with patch.dict(os.environ, {"AWS_LAMBDA_FUNCTION_NAME": "test-function"}): assert detect_aws_environment() == AwsEnvironment.LAMBDA - def test_environment_detection_lambda_cloudformation(self): + def test_environment_detection_lambda_cloudformation( + self, mock_disable_ec2_metadata + ): """Test CloudFormation Lambda environment detection""" with patch.dict( os.environ, @@ -48,7 +58,7 @@ def test_environment_detection_lambda_cloudformation(self): ): assert detect_aws_environment() == AwsEnvironment.CLOUD_FORMATION - def test_environment_detection_eks(self): + def test_environment_detection_eks(self, mock_disable_ec2_metadata): """Test EKS environment detection""" with patch.dict( os.environ, @@ -59,19 +69,19 @@ def test_environment_detection_eks(self): ): assert detect_aws_environment() == AwsEnvironment.EKS - def test_environment_detection_app_runner(self): + def test_environment_detection_app_runner(self, mock_disable_ec2_metadata): """Test App Runner environment detection""" with patch.dict(os.environ, {"AWS_APP_RUNNER_SERVICE_ID": "service-id"}): assert detect_aws_environment() == AwsEnvironment.APP_RUNNER - def test_environment_detection_ecs(self): + def test_environment_detection_ecs(self, mock_disable_ec2_metadata): """Test ECS environment detection""" with patch.dict( os.environ, {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2/v4"} ): assert detect_aws_environment() == AwsEnvironment.ECS - def test_environment_detection_beanstalk(self): + def test_environment_detection_beanstalk(self, mock_disable_ec2_metadata): """Test Elastic Beanstalk environment detection""" with patch.dict(os.environ, {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}): assert detect_aws_environment() == AwsEnvironment.BEANSTALK @@ -103,6 +113,7 @@ def test_ec2_metadata_token_failure(self, mock_put): @patch("requests.put") def test_is_running_on_ec2(self, mock_put, mock_get): """Test EC2 instance detection with IMDSv2""" + # Explicitly mock EC2 metadata responses mock_put.return_value.status_code = 200 mock_put.return_value.text = "token123" mock_get.return_value.status_code = 200 @@ -322,7 +333,9 @@ def test_aws_connection_config_validation_error(self): ), ], ) - def test_environment_detection_parametrized(self, env_vars, expected_environment): + def test_environment_detection_parametrized( + self, mock_disable_ec2_metadata, env_vars, expected_environment + ): """Parametrized test for environment detection with different configurations""" with patch.dict(os.environ, env_vars, clear=True): assert detect_aws_environment() == expected_environment diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java index 7a8c5c76c31c3a..0d5bdd9ff64286 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.utils.metrics.MetricUtils; +import com.linkedin.util.Pair; import java.sql.Timestamp; import java.util.List; import java.util.Map; @@ -155,6 +156,16 @@ default Map getNextVersions( long getMaxVersion(@Nonnull final String urn, @Nonnull final String aspectName); + /** + * Return the min/max version for the given URN & aspect + * + * @param urn the urn + * @param aspectName the aspect + * @return the range of versions, if they do not exist -1 is returned + */ + @Nonnull + Pair getVersionRange(@Nonnull final String urn, @Nonnull final String aspectName); + void setWritable(boolean canWrite); @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 8ae09111204cab..75f16ae4d981d2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -2229,8 +2229,9 @@ public Set exists( } /** Does not emit MCL */ + @VisibleForTesting @Nullable - private RollbackResult deleteAspectWithoutMCL( + RollbackResult deleteAspectWithoutMCL( @Nonnull OperationContext opContext, String urn, String aspectName, @@ -2288,11 +2289,14 @@ private RollbackResult deleteAspectWithoutMCL( // 4. Fetch all preceding aspects, that match List aspectsToDelete = new ArrayList<>(); - long maxVersion = aspectDao.getMaxVersion(urn, aspectName); + Pair versionRange = aspectDao.getVersionRange(urn, aspectName); + long minVersion = Math.max(0, versionRange.getFirst()); + long maxVersion = Math.max(0, versionRange.getSecond()); + EntityAspect.EntitySystemAspect survivingAspect = null; - String previousMetadata = null; + boolean filterMatch = true; - while (maxVersion > 0 && filterMatch) { + while (maxVersion > minVersion && filterMatch) { EntityAspect.EntitySystemAspect candidateAspect = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( @@ -2305,11 +2309,13 @@ private RollbackResult deleteAspectWithoutMCL( previousSysMetadata != null && filterMatch(previousSysMetadata, conditions); if (filterMatch) { aspectsToDelete.add(candidateAspect.getEntityAspect()); - maxVersion = maxVersion - 1; + } else if (candidateAspect == null) { + // potential gap + filterMatch = true; } else { survivingAspect = candidateAspect; - previousMetadata = survivingAspect.getMetadataRaw(); } + maxVersion = maxVersion - 1; } // Delete validation hooks diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java index 4d177d50ea44de..c5a6615ac4face 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraAspectDao.java @@ -36,6 +36,7 @@ import com.linkedin.metadata.query.ExtraInfo; import com.linkedin.metadata.query.ExtraInfoArray; import com.linkedin.metadata.query.ListResultMetadata; +import com.linkedin.util.Pair; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.sql.Timestamp; @@ -110,7 +111,14 @@ public Map> getLatestAspects( @Override public long getMaxVersion(@Nonnull 
final String urn, @Nonnull final String aspectName) { validateConnection(); - Map result = getMaxVersions(urn, ImmutableSet.of(aspectName)); + Map> result = getVersionRanges(urn, ImmutableSet.of(aspectName)); + return result.get(aspectName).getSecond(); + } + + @Override + @Nonnull + public Pair getVersionRange(@Nonnull String urn, @Nonnull String aspectName) { + Map> result = getVersionRanges(urn, ImmutableSet.of(aspectName)); return result.get(aspectName); } @@ -148,15 +156,17 @@ public boolean checkIfAspectExists(@Nonnull String aspectName) { return rs.one() != null; } - private Map getMaxVersions( + private Map> getVersionRanges( @Nonnull final String urn, @Nonnull final Set aspectNames) { SimpleStatement ss = selectFrom(CassandraAspect.TABLE_NAME) .selectors( Selector.column(CassandraAspect.URN_COLUMN), Selector.column(CassandraAspect.ASPECT_COLUMN), + Selector.function("min", Selector.column(CassandraAspect.VERSION_COLUMN)) + .as("min_version"), Selector.function("max", Selector.column(CassandraAspect.VERSION_COLUMN)) - .as(CassandraAspect.VERSION_COLUMN)) + .as("max_version")) .whereColumn(CassandraAspect.URN_COLUMN) .isEqualTo(literal(urn)) .whereColumn(CassandraAspect.ASPECT_COLUMN) @@ -168,21 +178,21 @@ private Map getMaxVersions( .build(); ResultSet rs = _cqlSession.execute(ss); - Map aspectVersions = + Map> aspectVersionRanges = rs.all().stream() .collect( Collectors.toMap( row -> row.getString(CassandraAspect.ASPECT_COLUMN), - row -> row.getLong(CassandraAspect.VERSION_COLUMN))); + row -> Pair.of(row.getLong("min_version"), row.getLong("max_version")))); - // For each requested aspect that didn't come back from DB, add a version -1 + // For each requested aspect that didn't come back from DB, add a version range of (-1, -1) for (String aspect : aspectNames) { - if (!aspectVersions.containsKey(aspect)) { - aspectVersions.put(aspect, -1L); + if (!aspectVersionRanges.containsKey(aspect)) { + aspectVersionRanges.put(aspect, Pair.of(-1L, -1L)); } } - return aspectVersions; + return aspectVersionRanges; } @Override @@ -551,11 +561,12 @@ public Map> getNextVersions(Map> u Map> result = new HashMap<>(); for (Map.Entry> aspectNames : urnAspectMap.entrySet()) { - Map maxVersions = getMaxVersions(aspectNames.getKey(), aspectNames.getValue()); + Map> maxVersions = + getVersionRanges(aspectNames.getKey(), aspectNames.getValue()); Map nextVersions = new HashMap<>(); for (String aspectName : aspectNames.getValue()) { - long latestVersion = maxVersions.get(aspectName); + long latestVersion = maxVersions.get(aspectName).getSecond(); long nextVal = latestVersion < 0 ? 
ASPECT_LATEST_VERSION : latestVersion + 1L; nextVersions.put(aspectName, nextVal); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index ea580a97c51886..ad8333407a2760 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -38,6 +38,8 @@ import io.ebean.Query; import io.ebean.RawSql; import io.ebean.RawSqlBuilder; +import io.ebean.SqlQuery; +import io.ebean.SqlRow; import io.ebean.Transaction; import io.ebean.TxScope; import io.ebean.annotation.TxIsolation; @@ -247,10 +249,18 @@ private void saveEbeanAspect( @Nonnull final EbeanAspectV2 ebeanAspect, final boolean insert) { validateConnection(); - if (insert) { - _server.insert(ebeanAspect, txContext.tx()); + if (txContext != null && txContext.tx() != null) { + if (insert) { + _server.insert(ebeanAspect, txContext.tx()); + } else { + _server.update(ebeanAspect, txContext.tx()); + } } else { - _server.update(ebeanAspect, txContext.tx()); + if (insert) { + _server.insert(ebeanAspect); + } else { + _server.update(ebeanAspect); + } } } @@ -864,20 +874,33 @@ public T runInTransactionWithRetryUnlocked( } @Override - public long getMaxVersion(@Nonnull final String urn, @Nonnull final String aspectName) { + @Nonnull + public Pair getVersionRange( + @Nonnull final String urn, @Nonnull final String aspectName) { validateConnection(); - final List result = - _server - .find(EbeanAspectV2.class) - .where() - .eq(EbeanAspectV2.URN_COLUMN, urn.toString()) - .eq(EbeanAspectV2.ASPECT_COLUMN, aspectName) - .orderBy() - .desc(EbeanAspectV2.VERSION_COLUMN) - .setMaxRows(1) - .findIds(); - return result.isEmpty() ? 
-1 : result.get(0).getVersion(); + // Use SQL aggregation to get both min and max in a single query + SqlQuery query = + _server.sqlQuery( + "SELECT MIN(version) as min_version, MAX(version) as max_version " + + "FROM metadata_aspect_v2 " + + "WHERE urn = :urn AND aspect = :aspect"); + + query.setParameter("urn", urn); + query.setParameter("aspect", aspectName); + + SqlRow result = query.findOne(); + + if (result == null) { + return Pair.of(-1L, -1L); + } + + return Pair.of(result.getLong("min_version"), result.getLong("max_version")); + } + + @Override + public long getMaxVersion(@Nonnull final String urn, @Nonnull final String aspectName) { + return getVersionRange(urn, aspectName).getSecond(); } /** diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index c00632e5cf5424..6eda210baf7d4a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -84,6 +84,8 @@ import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.annotation.Nonnull; +import java.sql.Timestamp; +import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -2670,6 +2672,109 @@ public void testPatchAddNonExistent() throws Exception { "Expected all tags"); } + @Test + public void testDeleteUrnWithRunIdFilterNonMatch() throws Exception { + Urn entityUrn = UrnUtils.getUrn("urn:li:corpuser:deleteWithFilterNonMatch"); + + // Create aspects with different run IDs + SystemMetadata metadata1 = AspectGenerationUtils.createSystemMetadata(); + metadata1.setRunId("run-123"); + + SystemMetadata metadata2 = AspectGenerationUtils.createSystemMetadata(); + metadata2.setRunId("run-456"); // Different run ID + + String aspectName = AspectGenerationUtils.getAspectName(new CorpUserInfo()); + + // First ingest the aspect that should survive (run-456) + CorpUserInfo writeAspect1 = AspectGenerationUtils.createCorpUserInfo("first@test.com"); + List> firstPair = new ArrayList<>(); + firstPair.add(getAspectRecordPair(writeAspect1, CorpUserInfo.class)); + _entityServiceImpl.ingestAspects(opContext, entityUrn, firstPair, TEST_AUDIT_STAMP, metadata2); + + // Then ingest the aspect that should be deleted (run-123) + CorpUserInfo writeAspect2 = AspectGenerationUtils.createCorpUserInfo("second@test.com"); + List> secondPair = new ArrayList<>(); + secondPair.add(getAspectRecordPair(writeAspect2, CorpUserInfo.class)); + _entityServiceImpl.ingestAspects(opContext, entityUrn, secondPair, TEST_AUDIT_STAMP, metadata1); + + // When we try to delete with runId=run-123, the version with runId=run-456 should survive + RollbackResult result = + _entityServiceImpl.deleteAspectWithoutMCL( + opContext, + entityUrn.toString(), + aspectName, + Collections.singletonMap("runId", "run-123"), + true); + + // The aspect with run-456 should still exist + RecordTemplate survivingAspect = + _entityServiceImpl.getLatestAspect(opContext, entityUrn, aspectName); + assertTrue(DataTemplateUtil.areEqual(writeAspect1, survivingAspect)); + + // Verify the RollbackResult details + assertNotNull(result); + assertEquals(result.getUrn(), entityUrn); + assertEquals(result.getEntityName(), "corpuser"); + assertEquals(result.getAspectName(), aspectName); + } + + @Test + public void 
testDeleteUrnWithRunIdFilterNonMatchVersionGap() throws Exception { + Urn entityUrn = UrnUtils.getUrn("urn:li:corpuser:deleteWithFilterNonMatch"); + String aspectName = AspectGenerationUtils.getAspectName(new CorpUserInfo()); + + // Metadata that should be preserved (run-456) + SystemMetadata metadata456 = AspectGenerationUtils.createSystemMetadata(); + metadata456.setRunId("run-456"); // Different run ID + CorpUserInfo writeAspect456 = AspectGenerationUtils.createCorpUserInfo("first@test.com"); + List> firstPair = new ArrayList<>(); + firstPair.add(getAspectRecordPair(writeAspect456, CorpUserInfo.class)); + _entityServiceImpl.ingestAspects( + opContext, entityUrn, firstPair, TEST_AUDIT_STAMP, metadata456); + + // Metadata that should be deleted (run-123) + SystemMetadata metadata123 = AspectGenerationUtils.createSystemMetadata(); + metadata123.setRunId("run-123"); + CorpUserInfo writeAspect123 = AspectGenerationUtils.createCorpUserInfo("second@test.com"); + List> secondPair = new ArrayList<>(); + secondPair.add(getAspectRecordPair(writeAspect123, CorpUserInfo.class)); + _entityServiceImpl.ingestAspects( + opContext, entityUrn, secondPair, TEST_AUDIT_STAMP, metadata123); + + // Then insert another run-123 with version gap + _aspectDao.saveAspect( + null, + entityUrn.toString(), + aspectName, + RecordUtils.toJsonString(writeAspect123), + TEST_AUDIT_STAMP.getActor().toString(), + null, + Timestamp.from(Instant.ofEpochMilli(TEST_AUDIT_STAMP.getTime())), + RecordUtils.toJsonString(metadata123), + 10L, + true); + + // When we try to delete with runId=run-123, the version with runId=run-456 should survive + RollbackResult result = + _entityServiceImpl.deleteAspectWithoutMCL( + opContext, + entityUrn.toString(), + aspectName, + Collections.singletonMap("runId", "run-123"), + true); + + // The aspect with run-456 should still exist + RecordTemplate survivingAspect = + _entityServiceImpl.getLatestAspect(opContext, entityUrn, aspectName); + assertTrue(DataTemplateUtil.areEqual(writeAspect456, survivingAspect)); + + // Verify the RollbackResult details + assertNotNull(result); + assertEquals(result.getUrn(), entityUrn); + assertEquals(result.getEntityName(), "corpuser"); + assertEquals(result.getAspectName(), aspectName); + } + @Nonnull protected com.linkedin.entity.Entity createCorpUserEntity(Urn entityUrn, String email) throws Exception { diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/DataTransform.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/DataTransform.pdl new file mode 100644 index 00000000000000..adc8d693b28e24 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/common/DataTransform.pdl @@ -0,0 +1,13 @@ +namespace com.linkedin.common + +import com.linkedin.query.QueryStatement + +/** + * Information about a transformation. It may be a query, + */ +record DataTransform { + /** + * The data transform may be defined by a query statement + */ + queryStatement: optional QueryStatement +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/DataTransformLogic.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/DataTransformLogic.pdl new file mode 100644 index 00000000000000..431cebf436ffbc --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/common/DataTransformLogic.pdl @@ -0,0 +1,14 @@ +namespace com.linkedin.common + +/** + * Information about a Query against one or more data assets (e.g. Tables or Views). 
+ */ +@Aspect = { + "name": "dataTransformLogic" +} +record DataTransformLogic { + /** + * List of transformations applied + */ + transforms: array[DataTransform], +} diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 4fe170ced69f33..0193e5e2c5c6c3 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -76,6 +76,7 @@ entities: - subTypes - incidentsSummary - testResults + - dataTransformLogic - name: dataFlow category: core keyAspect: dataFlowKey diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py index 33c67a923c278d..fa7274158f9e23 100644 --- a/smoke-test/tests/cypress/integration_test.py +++ b/smoke-test/tests/cypress/integration_test.py @@ -212,9 +212,6 @@ def test_run_cypress(auth_session): else: record_arg = " " - rest_specs = set(os.listdir("tests/cypress/cypress/e2e")) - cypress_suite1_specs = {"mutations", "search", "views"} - rest_specs.difference_update(set(cypress_suite1_specs)) print(f"test strategy is {test_strategy}") test_spec_arg = "" specs_str = ",".join([f"**/{f}" for f in _get_cypress_tests_batch()])