From 18495bb17eb70021bf3eb2d325500eba16301207 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Fri, 3 Jan 2025 23:26:20 -0600 Subject: [PATCH] feat(telemetry): cross-component async write tracing * created TraceContext for opentelemetry spans * added tracing header/cookies to control logging trace info * support legacy dropwizard tracing using opentelemetry * added smoke-tests for tracing conditions --- build.gradle | 7 +- .../resolvers/group/EntityCountsResolver.java | 2 +- .../mutate/MutableTypeBatchResolver.java | 41 +- .../ListRecommendationsResolver.java | 2 +- .../resolvers/search/SearchResolver.java | 2 +- .../upgrade/UpgradeCliApplication.java | 6 +- .../upgrade/config/SystemUpdateConfig.java | 1 + .../upgrade/impl/DefaultUpgradeManager.java | 93 +- .../linkedin/metadata/aspect/ReadItem.java | 7 + .../metadata/aspect/SystemAspect.java | 6 + .../metadata/aspect/batch/MCLItem.java | 5 + .../test/metadata/aspect/batch/TestMCP.java | 2 +- .../dao/producer/KafkaEventProducer.java | 72 +- .../java/com/linkedin/mxe/ConsumerGroups.java | 8 + .../com/linkedin/metadata/EventUtils.java | 24 +- metadata-io/build.gradle | 1 + .../entity/ebean/batch/AspectsBatchImpl.java | 8 +- .../entity/ebean/batch/ChangeItemImpl.java | 28 +- .../entity/ebean/batch/DeleteItemImpl.java | 6 + .../entity/ebean/batch/PatchItemImpl.java | 10 +- .../entity/ebean/batch/ProposedItem.java | 5 + .../aspect/utils/DefaultAspectsUtil.java | 14 +- .../metadata/client/JavaEntityClient.java | 2 +- .../metadata/entity/EntityServiceImpl.java | 904 ++++++++++-------- .../cassandra/CassandraRetentionService.java | 2 +- .../entity/ebean/EbeanRetentionService.java | 2 +- .../entity/validation/ValidationUtils.java | 237 ++--- .../metadata/event/EventProducer.java | 88 +- .../graph/elastic/ESGraphQueryDAO.java | 81 +- .../elastic/ElasticSearchGraphService.java | 2 +- .../graph/neo4j/Neo4jGraphService.java | 39 +- .../candidatesource/MostPopularSource.java | 42 +- .../candidatesource/RecentlyEditedSource.java | 42 +- .../candidatesource/RecentlyViewedSource.java | 42 +- .../metadata/search/LineageSearchService.java | 2 +- .../metadata/search/SearchService.java | 15 +- .../search/cache/CacheableSearcher.java | 138 +-- .../search/cache/EntityDocCountCache.java | 2 +- .../client/CachingEntitySearchService.java | 250 ++--- .../elasticsearch/query/ESBrowseDAO.java | 96 +- .../elasticsearch/query/ESSearchDAO.java | 214 +++-- .../request/AggregationQueryBuilder.java | 2 +- .../query/request/SearchRequestHandler.java | 2 +- .../service/UpdateIndicesService.java | 75 +- .../systemmetadata/ESSystemMetadataDAO.java | 48 +- .../ElasticSearchSystemMetadataService.java | 79 +- .../SystemMetadataMappingsBuilder.java | 3 + .../ElasticSearchTimeseriesAspectService.java | 69 +- .../timeseries/elastic/UsageServiceUtil.java | 28 +- .../metadata/trace/KafkaTraceReader.java | 460 +++++++++ .../metadata/trace/MCLTraceReader.java | 43 + .../metadata/trace/MCPFailedTraceReader.java | 45 + .../metadata/trace/MCPTraceReader.java | 43 + .../metadata/trace/TraceServiceImpl.java | 484 ++++++++++ .../metadata/entity/EntityServiceTest.java | 121 ++- .../graph/neo4j/Neo4jGraphServiceTest.java | 3 +- .../trace/BaseKafkaTraceReaderTest.java | 239 +++++ .../metadata/trace/MCLTraceReaderTest.java | 93 ++ .../trace/MCPFailedTraceReaderTest.java | 98 ++ .../metadata/trace/MCPTraceReaderTest.java | 93 ++ .../metadata/trace/TraceServiceImplTest.java | 350 +++++++ .../kafka/MAEOpenTelemetryConfig.java | 21 + metadata-jobs/mae-consumer/build.gradle | 1 + 
.../kafka/DataHubUsageEventsProcessor.java | 59 +- .../metadata/kafka/MCLKafkaListener.java | 92 +- .../MCLSpringCommonTestConfiguration.java | 1 + .../kafka/MCEOpenTelemetryConfig.java | 21 + metadata-jobs/mce-consumer/build.gradle | 1 + .../kafka/MetadataChangeEventsProcessor.java | 55 +- .../MetadataChangeProposalsProcessor.java | 81 +- ...BatchMetadataChangeProposalsProcessor.java | 107 ++- .../kafka/util/KafkaListenerUtil.java | 42 - .../datahub/event/PlatformEventProcessor.java | 100 +- .../metadata/run/AspectRowSummary.pdl | 1 + metadata-operation-context/build.gradle | 4 +- .../metadata/context/OperationContext.java | 95 +- .../metadata/context/RequestContext.java | 8 + .../metadata/context/TraceContext.java | 414 ++++++++ .../metadata/context/TraceIdGenerator.java | 48 + .../metadata/exception/TraceException.java | 40 + .../context/TestOperationContexts.java | 1 + .../context/OperationContextTest.java | 1 + .../metadata/context/TraceContextTest.java | 293 ++++++ .../authorization/DataHubAuthorizerTest.java | 1 + .../src/main/resources/application.yaml | 3 + metadata-service/factories/build.gradle | 5 + .../common/Neo4jGraphServiceFactory.java | 10 +- .../SystemOperationContextFactory.java | 9 +- .../kafka/common/AdminClientFactory.java | 30 + .../kafka/throttle/KafkaThrottleFactory.java | 25 +- .../kafka/trace/KafkaTraceReaderFactory.java | 196 ++++ .../OpenTelemetryBaseFactory.java | 81 ++ .../factory/trace/TraceServiceFactory.java | 37 + .../kafka/DataHubUpgradeKafkaListener.java | 85 +- .../OpenAPIAnalyticsTestConfiguration.java | 2 + .../OpenAPIEntityTestConfiguration.java | 8 + metadata-service/openapi-servlet/build.gradle | 4 + .../openapi-servlet/models/build.gradle | 1 + .../openapi/v1/models/TraceRequestV1.java | 17 + .../openapi/v1/models/TraceResponseV1.java | 22 + .../openapi/config/SpringWebConfig.java | 9 + .../openapi/config/TracingInterceptor.java | 93 ++ .../controller/GenericEntitiesController.java | 98 +- .../operations/v1/TraceController.java | 149 +++ .../openapi/util/MappingUtil.java | 3 - .../openapi/util/RequestInputUtil.java | 136 +++ .../v1/entities/EntitiesController.java | 6 +- .../RelationshipsController.java | 4 +- .../v3/controller/EntityController.java | 13 + .../java/entities/EntitiesControllerTest.java | 3 +- .../v3/controller/EntityControllerTest.java | 4 + .../com.linkedin.entity.aspects.snapshot.json | 38 +- ...com.linkedin.entity.entities.snapshot.json | 56 +- .../com.linkedin.entity.runs.snapshot.json | 42 +- ...nkedin.operations.operations.snapshot.json | 38 +- ...m.linkedin.platform.platform.snapshot.json | 52 +- .../resources/entity/AspectResource.java | 12 +- .../entity/BatchIngestionRunResource.java | 8 +- .../resources/entity/EntityResource.java | 40 +- .../resources/entity/EntityV2Resource.java | 6 +- .../entity/EntityVersionedV2Resource.java | 4 +- .../resources/lineage/Relationships.java | 6 +- .../operations/OperationsResource.java | 10 +- .../resources/platform/PlatformResource.java | 2 +- .../resources/restli/RestliUtils.java | 30 +- .../metadata/resources/usage/UsageStats.java | 8 +- .../resources/entity/AspectResourceTest.java | 75 +- .../metadata/entity/IngestAspectsResult.java | 62 ++ .../metadata/entity/IngestProposalResult.java | 11 - .../RecommendationsService.java | 2 +- .../EntitySearchAggregationSource.java | 2 +- .../RecentlySearchedSource.java | 38 +- .../candidatesource/RecommendationSource.java | 2 +- .../systemmetadata/SystemMetadataService.java | 4 + .../metadata/systemmetadata/TraceService.java | 38 + 
.../metadata/systemmetadata/TraceStatus.java | 16 + .../systemmetadata/TraceStorageStatus.java | 52 + .../systemmetadata/TraceWriteStatus.java | 18 + .../linkedin/gms/CommonApplicationConfig.java | 4 +- .../config/GMSOpenTelemetryConfig.java | 21 + metadata-utils/build.gradle | 1 + .../utils/metrics/MetricSpanExporter.java | 58 ++ .../metadata/utils/metrics/MetricUtils.java | 21 +- smoke-test/requirements.txt | 3 +- smoke-test/tests/trace/__init__.py | 0 smoke-test/tests/trace/test_api_trace.py | 486 ++++++++++ 146 files changed, 6929 insertions(+), 1805 deletions(-) create mode 100644 metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/ConsumerGroups.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/trace/KafkaTraceReader.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/trace/MCLTraceReader.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/trace/MCPFailedTraceReader.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/trace/MCPTraceReader.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/trace/BaseKafkaTraceReaderTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/trace/MCLTraceReaderTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/trace/MCPFailedTraceReaderTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/trace/MCPTraceReaderTest.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java create mode 100644 metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MAEOpenTelemetryConfig.java create mode 100644 metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MCEOpenTelemetryConfig.java create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceContext.java create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceIdGenerator.java create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/TraceException.java create mode 100644 metadata-operation-context/src/test/java/io/datahubproject/metadata/context/TraceContextTest.java create mode 100644 metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/common/AdminClientFactory.java create mode 100644 metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/trace/KafkaTraceReaderFactory.java create mode 100644 metadata-service/factories/src/main/java/com/linkedin/gms/factory/system_telemetry/OpenTelemetryBaseFactory.java create mode 100644 metadata-service/factories/src/main/java/com/linkedin/gms/factory/trace/TraceServiceFactory.java create mode 100644 metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceRequestV1.java create mode 100644 metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceResponseV1.java create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/TracingInterceptor.java create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/v1/TraceController.java create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/RequestInputUtil.java create mode 100644 
metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestAspectsResult.java delete mode 100644 metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestProposalResult.java create mode 100644 metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceService.java create mode 100644 metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStatus.java create mode 100644 metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStorageStatus.java create mode 100644 metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceWriteStatus.java create mode 100644 metadata-service/war/src/main/java/com/linkedin/gms/factory/config/GMSOpenTelemetryConfig.java create mode 100644 metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricSpanExporter.java create mode 100644 smoke-test/tests/trace/__init__.py create mode 100644 smoke-test/tests/trace/test_api_trace.py diff --git a/build.gradle b/build.gradle index e3259a8df342e1..62ffb35c1e9db5 100644 --- a/build.gradle +++ b/build.gradle @@ -38,7 +38,7 @@ buildscript { ext.springVersion = '6.1.14' ext.springBootVersion = '3.2.9' ext.springKafkaVersion = '3.1.6' - ext.openTelemetryVersion = '1.18.0' + ext.openTelemetryVersion = '1.45.0' ext.neo4jVersion = '5.14.0' ext.neo4jTestVersion = '5.14.0' ext.neo4jApocVersion = '5.14.0' @@ -218,7 +218,10 @@ project.ext.externalDependency = [ 'neo4jApocCore': 'org.neo4j.procedure:apoc-core:' + neo4jApocVersion, 'neo4jApocCommon': 'org.neo4j.procedure:apoc-common:' + neo4jApocVersion, 'opentelemetryApi': 'io.opentelemetry:opentelemetry-api:' + openTelemetryVersion, - 'opentelemetryAnnotations': 'io.opentelemetry:opentelemetry-extension-annotations:' + openTelemetryVersion, + 'opentelemetrySdk': 'io.opentelemetry:opentelemetry-sdk:' + openTelemetryVersion, + 'opentelemetrySdkTrace': 'io.opentelemetry:opentelemetry-sdk-trace:' + openTelemetryVersion, + 'opentelemetryAutoConfig': 'io.opentelemetry:opentelemetry-sdk-extension-autoconfigure:' + openTelemetryVersion, + 'opentelemetryAnnotations': 'io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations:2.11.0', 'opentracingJdbc':'io.opentracing.contrib:opentracing-jdbc:0.2.15', 'parquet': 'org.apache.parquet:parquet-avro:1.12.3', 'parquetHadoop': 'org.apache.parquet:parquet-hadoop:1.13.1', diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java index ac195ca5d82520..d97141b84588c6 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java @@ -12,7 +12,7 @@ import com.linkedin.metadata.service.ViewService; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MutableTypeBatchResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MutableTypeBatchResolver.java index d647374b8e1efc..b343a78412ccc6 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MutableTypeBatchResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/MutableTypeBatchResolver.java @@ -2,13 +2,14 @@ import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*; -import com.codahale.metrics.Timer; +import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.concurrency.GraphQLConcurrencyUtils; import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.types.BatchMutableType; import com.linkedin.metadata.utils.metrics.MetricUtils; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; +import io.datahubproject.metadata.context.OperationContext; import java.util.List; import java.util.concurrent.CompletableFuture; import org.slf4j.Logger; @@ -33,25 +34,29 @@ public MutableTypeBatchResolver(final BatchMutableType batchMutableType @Override public CompletableFuture> get(DataFetchingEnvironment environment) throws Exception { + final QueryContext context = environment.getContext(); + final OperationContext opContext = context.getOperationContext(); + final B[] input = bindArgument(environment.getArgument("input"), _batchMutableType.batchInputClass()); - return GraphQLConcurrencyUtils.supplyAsync( - () -> { - Timer.Context timer = MetricUtils.timer(this.getClass(), "batchMutate").time(); - - try { - return _batchMutableType.batchUpdate(input, environment.getContext()); - } catch (AuthorizationException e) { - throw e; - } catch (Exception e) { - _logger.error("Failed to perform batchUpdate", e); - throw new IllegalArgumentException(e); - } finally { - timer.stop(); - } - }, - this.getClass().getSimpleName(), - "get"); + return opContext.withSpan( + "batchMutate", + () -> + GraphQLConcurrencyUtils.supplyAsync( + () -> { + try { + return _batchMutableType.batchUpdate(input, environment.getContext()); + } catch (AuthorizationException e) { + throw e; + } catch (Exception e) { + _logger.error("Failed to perform batchUpdate", e); + throw new IllegalArgumentException(e); + } + }, + this.getClass().getSimpleName(), + "get"), + MetricUtils.DROPWIZARD_METRIC, + "true"); } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java index 77f6eb285ecc5b..e613f4fc5849e7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java @@ -27,7 +27,7 @@ import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Collections; import java.util.List; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java index 45751fc6eb8cb2..bbf59234247e95 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java +++ 
b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java @@ -19,7 +19,7 @@ import com.linkedin.metadata.query.SearchFlags; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.Collections; import java.util.concurrent.CompletableFuture; import lombok.RequiredArgsConstructor; diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/UpgradeCliApplication.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/UpgradeCliApplication.java index e17ac6be79face..b1f601761212a4 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/UpgradeCliApplication.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/UpgradeCliApplication.java @@ -5,7 +5,9 @@ import com.linkedin.gms.factory.graphql.GraphQLEngineFactory; import com.linkedin.gms.factory.kafka.KafkaEventConsumerFactory; import com.linkedin.gms.factory.kafka.SimpleKafkaConsumerFactory; +import com.linkedin.gms.factory.kafka.trace.KafkaTraceReaderFactory; import com.linkedin.gms.factory.telemetry.ScheduledAnalyticsFactory; +import com.linkedin.gms.factory.trace.TraceServiceFactory; import org.springframework.boot.WebApplicationType; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.elasticsearch.ElasticsearchRestClientAutoConfiguration; @@ -30,7 +32,9 @@ DataHubAuthorizerFactory.class, SimpleKafkaConsumerFactory.class, KafkaEventConsumerFactory.class, - GraphQLEngineFactory.class + GraphQLEngineFactory.class, + KafkaTraceReaderFactory.class, + TraceServiceFactory.class }) }) public class UpgradeCliApplication { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index d0493019a40af2..2600ea2300dc9c 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -195,6 +195,7 @@ protected OperationContext javaSystemOperationContext( .alternateValidation( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build(), + null, true); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/impl/DefaultUpgradeManager.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/impl/DefaultUpgradeManager.java index 443042049e8856..8142c04ddf600d 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/impl/DefaultUpgradeManager.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/impl/DefaultUpgradeManager.java @@ -1,7 +1,6 @@ package com.linkedin.datahub.upgrade.impl; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.datahub.upgrade.Upgrade; import com.linkedin.datahub.upgrade.UpgradeCleanupStep; import com.linkedin.datahub.upgrade.UpgradeContext; @@ -119,44 +118,60 @@ private UpgradeResult executeInternal(UpgradeContext context) { } private UpgradeStepResult executeStepInternal(UpgradeContext context, UpgradeStep step) { - int retryCount = step.retryCount(); - UpgradeStepResult result = null; - int maxAttempts = retryCount + 1; - for (int i = 0; i < 
maxAttempts; i++) { - try (Timer.Context completionTimer = - MetricUtils.timer(MetricRegistry.name(step.id(), "completionTime")).time()) { - try (Timer.Context executionTimer = - MetricUtils.timer(MetricRegistry.name(step.id(), "executionTime")).time()) { - result = step.executable().apply(context); - } - - if (result == null) { - // Failed to even retrieve a result. Create a default failure result. - result = new DefaultUpgradeStepResult(step.id(), DataHubUpgradeState.FAILED); - context - .report() - .addLine(String.format("Retrying %s more times...", maxAttempts - (i + 1))); - MetricUtils.counter(MetricRegistry.name(step.id(), "retry")).inc(); - } - - if (DataHubUpgradeState.SUCCEEDED.equals(result.result())) { - MetricUtils.counter(MetricRegistry.name(step.id(), "succeeded")).inc(); - break; - } - } catch (Exception e) { - log.error("Caught exception during attempt {} of Step with id {}", i, step.id(), e); - context - .report() - .addLine( - String.format( - "Caught exception during attempt %s of Step with id %s: %s", i, step.id(), e)); - MetricUtils.counter(MetricRegistry.name(step.id(), "failed")).inc(); - result = new DefaultUpgradeStepResult(step.id(), DataHubUpgradeState.FAILED); - context.report().addLine(String.format("Retrying %s more times...", maxAttempts - (i + 1))); - } - } - - return result; + return context + .opContext() + .withSpan( + "completionTime", + () -> { + int retryCount = step.retryCount(); + UpgradeStepResult result = null; + int maxAttempts = retryCount + 1; + for (int i = 0; i < maxAttempts; i++) { + try { + result = + context + .opContext() + .withSpan( + "executionTime", + () -> step.executable().apply(context), + "step.id", + step.id(), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(step.id(), "executionTime")); + + if (result == null) { + // Failed to even retrieve a result. Create a default failure result. 
+ result = new DefaultUpgradeStepResult(step.id(), DataHubUpgradeState.FAILED); + context + .report() + .addLine(String.format("Retrying %s more times...", maxAttempts - (i + 1))); + MetricUtils.counter(MetricRegistry.name(step.id(), "retry")).inc(); + } + + if (DataHubUpgradeState.SUCCEEDED.equals(result.result())) { + MetricUtils.counter(MetricRegistry.name(step.id(), "succeeded")).inc(); + break; + } + } catch (Exception e) { + log.error( + "Caught exception during attempt {} of Step with id {}", i, step.id(), e); + context + .report() + .addLine( + String.format( + "Caught exception during attempt %s of Step with id %s: %s", + i, step.id(), e)); + MetricUtils.counter(MetricRegistry.name(step.id(), "failed")).inc(); + result = new DefaultUpgradeStepResult(step.id(), DataHubUpgradeState.FAILED); + context + .report() + .addLine(String.format("Retrying %s more times...", maxAttempts - (i + 1))); + } + } + return result; + }, + MetricUtils.DROPWIZARD_METRIC, + "true"); } private void executeCleanupInternal(UpgradeContext context, UpgradeResult result) { diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java index 341dec4d4741c7..46e6bf45d654e5 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java @@ -64,6 +64,13 @@ static T getAspect(Class clazz, @Nullable RecordTemplate recordTemplate) @Nullable SystemMetadata getSystemMetadata(); + /** + * Set system metadata on the item + * + * @param systemMetadata + */ + void setSystemMetadata(@Nonnull SystemMetadata systemMetadata); + /** * The entity's schema * diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/SystemAspect.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/SystemAspect.java index 4c9bf3d4fdbc78..abbc82833c55f8 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/SystemAspect.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/SystemAspect.java @@ -6,6 +6,7 @@ import java.sql.Timestamp; import java.util.Optional; import javax.annotation.Nonnull; +import org.apache.commons.lang3.NotImplementedException; /** * An aspect along with system metadata and creation timestamp. 
Represents an aspect as stored in @@ -36,4 +37,9 @@ default Optional getSystemMetadataVersion() { .map(SystemMetadata::getVersion) .map(Long::parseLong); } + + @Override + default void setSystemMetadata(@Nonnull SystemMetadata systemMetadata) { + throw new NotImplementedException(); + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/MCLItem.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/MCLItem.java index 09da0a52ff0c30..1a9d66581ad523 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/MCLItem.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/MCLItem.java @@ -37,6 +37,11 @@ default SystemMetadata getSystemMetadata() { return getMetadataChangeLog().getSystemMetadata(); } + @Override + default void setSystemMetadata(@Nonnull SystemMetadata systemMetadata) { + getMetadataChangeLog().setSystemMetadata(systemMetadata); + } + default SystemMetadata getPreviousSystemMetadata() { return getMetadataChangeLog().getPreviousSystemMetadata(); } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java index d7dd1fab2b6acf..aacd09adc4036a 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/batch/TestMCP.java @@ -116,7 +116,7 @@ public static Set ofOneMCP( private Urn urn; private RecordTemplate recordTemplate; - private SystemMetadata systemMetadata; + @Setter private SystemMetadata systemMetadata; private AuditStamp auditStamp; private ChangeType changeType; @Nonnull private final EntitySpec entitySpec; diff --git a/metadata-dao-impl/kafka-producer/src/main/java/com/linkedin/metadata/dao/producer/KafkaEventProducer.java b/metadata-dao-impl/kafka-producer/src/main/java/com/linkedin/metadata/dao/producer/KafkaEventProducer.java index 26b48449c1c2ff..f2434f07dd11c8 100644 --- a/metadata-dao-impl/kafka-producer/src/main/java/com/linkedin/metadata/dao/producer/KafkaEventProducer.java +++ b/metadata-dao-impl/kafka-producer/src/main/java/com/linkedin/metadata/dao/producer/KafkaEventProducer.java @@ -6,13 +6,17 @@ import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.mxe.DataHubUpgradeHistoryEvent; +import com.linkedin.mxe.FailedMetadataChangeProposal; import com.linkedin.mxe.MetadataChangeLog; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.PlatformEvent; import com.linkedin.mxe.TopicConvention; import com.linkedin.mxe.TopicConventionImpl; -import io.opentelemetry.extension.annotations.WithSpan; +import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.io.IOException; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.Future; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -29,7 +33,7 @@ * delimiter of an underscore (_). 
*/ @Slf4j -public class KafkaEventProducer implements EventProducer { +public class KafkaEventProducer extends EventProducer { private final Producer _producer; private final TopicConvention _topicConvention; @@ -69,13 +73,19 @@ record = EventUtils.pegasusToAvroMCL(metadataChangeLog); throw new ModelConversionException("Failed to convert Pegasus MAE to Avro", e); } + String topic = getMetadataChangeLogTopicName(aspectSpec); + return _producer.send( + new ProducerRecord(topic, urn.toString(), record), + _kafkaHealthChecker.getKafkaCallBack("MCL", urn.toString())); + } + + @Override + public String getMetadataChangeLogTopicName(@Nonnull AspectSpec aspectSpec) { String topic = _topicConvention.getMetadataChangeLogVersionedTopicName(); if (aspectSpec.isTimeseries()) { topic = _topicConvention.getMetadataChangeLogTimeseriesTopicName(); } - return _producer.send( - new ProducerRecord(topic, urn.toString(), record), - _kafkaHealthChecker.getKafkaCallBack("MCL", urn.toString())); + return topic; } @Override @@ -102,6 +112,42 @@ record = EventUtils.pegasusToAvroMCP(metadataChangeProposal); _kafkaHealthChecker.getKafkaCallBack("MCP", urn.toString())); } + @Override + public String getMetadataChangeProposalTopicName() { + return _topicConvention.getMetadataChangeProposalTopicName(); + } + + @Override + public Future produceFailedMetadataChangeProposalAsync( + @Nonnull OperationContext opContext, + @Nonnull MetadataChangeProposal mcp, + @Nonnull Set throwables) { + + try { + String topic = _topicConvention.getFailedMetadataChangeProposalTopicName(); + final FailedMetadataChangeProposal failedMetadataChangeProposal = + createFailedMCPEvent(opContext, mcp, throwables); + + final GenericRecord record = EventUtils.pegasusToAvroFailedMCP(failedMetadataChangeProposal); + log.debug( + "Sending FailedMessages to topic - {}", + _topicConvention.getFailedMetadataChangeProposalTopicName()); + log.info( + "Error while processing FMCP: FailedMetadataChangeProposal - {}", + failedMetadataChangeProposal); + + return _producer.send( + new ProducerRecord(topic, mcp.getEntityUrn().toString(), record), + _kafkaHealthChecker.getKafkaCallBack("FMCP", mcp.getEntityUrn().toString())); + } catch (IOException e) { + log.error( + "Error while sending FailedMetadataChangeProposal: Exception - {}, FailedMetadataChangeProposal - {}", + e.getStackTrace(), + mcp); + return CompletableFuture.failedFuture(e); + } + } + @Override public Future producePlatformEvent( @Nonnull String name, @Nullable String key, @Nonnull PlatformEvent event) { @@ -121,6 +167,11 @@ record = EventUtils.pegasusToAvroPE(event); _kafkaHealthChecker.getKafkaCallBack("Platform Event", name)); } + @Override + public String getPlatformEventTopicName() { + return _topicConvention.getPlatformEventTopicName(); + } + @Override public void produceDataHubUpgradeHistoryEvent(@Nonnull DataHubUpgradeHistoryEvent event) { GenericRecord record; @@ -141,4 +192,15 @@ record = EventUtils.pegasusToAvroDUHE(event); _kafkaHealthChecker.getKafkaCallBack( "History Event", "Event Version: " + event.getVersion())); } + + @Nonnull + private static FailedMetadataChangeProposal createFailedMCPEvent( + @Nonnull OperationContext opContext, + @Nonnull MetadataChangeProposal event, + @Nonnull Set throwables) { + final FailedMetadataChangeProposal fmcp = new FailedMetadataChangeProposal(); + fmcp.setError(opContext.traceException(throwables)); + fmcp.setMetadataChangeProposal(event); + return fmcp; + } } diff --git 
a/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/ConsumerGroups.java b/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/ConsumerGroups.java new file mode 100644 index 00000000000000..565be1d56e5fd4 --- /dev/null +++ b/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/ConsumerGroups.java @@ -0,0 +1,8 @@ +package com.linkedin.mxe; + +public class ConsumerGroups { + private ConsumerGroups() {} + + public static final String MCP_CONSUMER_GROUP_ID_VALUE = + "${METADATA_CHANGE_PROPOSAL_KAFKA_CONSUMER_GROUP_ID:generic-mce-consumer-job-client}"; +} diff --git a/metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java b/metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java index 18005dfb7b2a5d..e40124b6abba82 100644 --- a/metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java +++ b/metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java @@ -38,6 +38,9 @@ public class EventUtils { private static final RecordDataSchema MCP_PEGASUS_SCHEMA = new MetadataChangeProposal().schema(); + private static final RecordDataSchema FMCP_PEGASUS_SCHEMA = + new FailedMetadataChangeProposal().schema(); + private static final RecordDataSchema MCL_PEGASUS_SCHEMA = new MetadataChangeLog().schema(); private static final RecordDataSchema PE_PEGASUS_SCHEMA = new PlatformEvent().schema(); @@ -60,7 +63,7 @@ public class EventUtils { private static final Schema ORIGINAL_MCL_AVRO_SCHEMA = getAvroSchemaFromResource("avro/com/linkedin/mxe/MetadataChangeLog.avsc"); - private static final Schema ORIGINAL_FMCL_AVRO_SCHEMA = + private static final Schema ORIGINAL_FMCP_AVRO_SCHEMA = getAvroSchemaFromResource("avro/com/linkedin/mxe/FailedMetadataChangeProposal.avsc"); private static final Schema ORIGINAL_PE_AVRO_SCHEMA = @@ -175,6 +178,23 @@ public static MetadataChangeProposal avroToPegasusMCP(@Nonnull GenericRecord rec ORIGINAL_MCP_AVRO_SCHEMA)); } + /** + * Converts a {@link GenericRecord} Failed MCP into the equivalent Pegasus model. + * + * @param record the {@link GenericRecord} that contains the MCP in com.linkedin.pegasus2avro + * namespace + * @return the Pegasus {@link FailedMetadataChangeProposal} model + */ + @Nonnull + public static FailedMetadataChangeProposal avroToPegasusFailedMCP(@Nonnull GenericRecord record) + throws IOException { + return new FailedMetadataChangeProposal( + DataTranslator.genericRecordToDataMap( + renameSchemaNamespace(record, RENAMED_FMCP_AVRO_SCHEMA, ORIGINAL_FMCP_AVRO_SCHEMA), + FMCP_PEGASUS_SCHEMA, + ORIGINAL_FMCP_AVRO_SCHEMA)); + } + /** * Converts a {@link GenericRecord} PE into the equivalent Pegasus model. 
* @@ -323,7 +343,7 @@ public static GenericRecord pegasusToAvroFailedMCP( DataTranslator.dataMapToGenericRecord( failedMetadataChangeProposal.data(), failedMetadataChangeProposal.schema(), - ORIGINAL_FMCL_AVRO_SCHEMA); + ORIGINAL_FMCP_AVRO_SCHEMA); return renameSchemaNamespace(original, RENAMED_FMCP_AVRO_SCHEMA); } diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index aab29101b30f71..c49ae056ac4721 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -57,6 +57,7 @@ dependencies { implementation externalDependency.ebeanDdl implementation externalDependency.ebeanAgent implementation externalDependency.opentelemetryAnnotations + implementation externalDependency.opentelemetrySdkTrace implementation externalDependency.resilience4j // Newer Spring libraries require JDK17 classes, allow for JDK11 compileOnly externalDependency.springBootAutoconfigureJdk11 diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java index 1af9fc1565a456..9b993ab5fc4abb 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java @@ -154,8 +154,8 @@ private static BatchItem patchDiscriminator(MCPItem mcpItem, AspectRetriever asp mcpItem.getAuditStamp(), aspectRetriever.getEntityRegistry()); } - return ChangeItemImpl.ChangeItemImplBuilder.build( - mcpItem.getMetadataChangeProposal(), mcpItem.getAuditStamp(), aspectRetriever); + return ChangeItemImpl.builder() + .build(mcpItem.getMetadataChangeProposal(), mcpItem.getAuditStamp(), aspectRetriever); } public static class AspectsBatchImplBuilder { @@ -208,8 +208,8 @@ public AspectsBatchImplBuilder mcps( auditStamp, retrieverContext.getAspectRetriever().getEntityRegistry()); } else { - return ChangeItemImpl.ChangeItemImplBuilder.build( - mcp, auditStamp, retrieverContext.getAspectRetriever()); + return ChangeItemImpl.builder() + .build(mcp, auditStamp, retrieverContext.getAspectRetriever()); } } catch (IllegalArgumentException e) { log.error("Invalid proposal, skipping and proceeding with batch: {}", mcp, e); diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java index 64263859e4aadb..94f71ff0897c63 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java @@ -150,6 +150,14 @@ public MetadataChangeProposal getMetadataChangeProposal() { } } + @Override + public void setSystemMetadata(@Nonnull SystemMetadata systemMetadata) { + this.systemMetadata = systemMetadata; + if (this.metadataChangeProposal != null) { + this.metadataChangeProposal.setSystemMetadata(systemMetadata); + } + } + @Override public Map getHeaders() { return Optional.ofNullable(metadataChangeProposal) @@ -183,6 +191,10 @@ public ChangeItemImpl build(AspectRetriever aspectRetriever) { this.headers = Map.of(); } + if (this.urn == null && this.metadataChangeProposal != null) { + this.urn = this.metadataChangeProposal.getEntityUrn(); + } + ValidationApiUtils.validateUrn(aspectRetriever.getEntityRegistry(), this.urn); 
log.debug("entity type = {}", this.urn.getEntityType()); @@ -210,7 +222,7 @@ public ChangeItemImpl build(AspectRetriever aspectRetriever) { this.headers); } - public static ChangeItemImpl build( + public ChangeItemImpl build( MetadataChangeProposal mcp, AuditStamp auditStamp, AspectRetriever aspectRetriever) { log.debug("entity type = {}", mcp.getEntityType()); @@ -303,15 +315,17 @@ public String toString() { return "ChangeItemImpl{" + "changeType=" + changeType - + ", urn=" - + urn + + ", auditStamp=" + + auditStamp + + ", systemMetadata=" + + systemMetadata + + ", recordTemplate=" + + recordTemplate + ", aspectName='" + aspectName + '\'' - + ", recordTemplate=" - + recordTemplate - + ", systemMetadata=" - + systemMetadata + + ", urn=" + + urn + '}'; } } diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/DeleteItemImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/DeleteItemImpl.java index 40bcb0fa8ed2d1..935227e55b6638 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/DeleteItemImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/DeleteItemImpl.java @@ -23,6 +23,7 @@ import lombok.Setter; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.NotImplementedException; @Slf4j @Getter @@ -61,6 +62,11 @@ public SystemMetadata getSystemMetadata() { return null; } + @Override + public void setSystemMetadata(@Nonnull SystemMetadata systemMetadata) { + throw new NotImplementedException(); + } + @Nullable @Override public MetadataChangeProposal getMetadataChangeProposal() { diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/PatchItemImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/PatchItemImpl.java index 2543d99ac6af37..5e4e36cfe6fbd8 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/PatchItemImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/PatchItemImpl.java @@ -59,7 +59,7 @@ public class PatchItemImpl implements PatchMCP { private final Urn urn; // aspectName name of the aspect being inserted private final String aspectName; - private final SystemMetadata systemMetadata; + private SystemMetadata systemMetadata; private final AuditStamp auditStamp; private final JsonPatch patch; @@ -105,6 +105,14 @@ public MetadataChangeProposal getMetadataChangeProposal() { } } + @Override + public void setSystemMetadata(@Nonnull SystemMetadata systemMetadata) { + this.systemMetadata = systemMetadata; + if (this.metadataChangeProposal != null) { + this.metadataChangeProposal.setSystemMetadata(systemMetadata); + } + } + public ChangeItemImpl applyPatch(RecordTemplate recordTemplate, AspectRetriever aspectRetriever) { ChangeItemImpl.ChangeItemImplBuilder builder = ChangeItemImpl.builder() diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java index 370f1f6f073e65..b5dec0bb06d0e2 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java @@ -81,6 +81,11 @@ public SystemMetadata 
getSystemMetadata() { return metadataChangeProposal.getSystemMetadata(); } + @Override + public void setSystemMetadata(@Nonnull SystemMetadata systemMetadata) { + metadataChangeProposal.setSystemMetadata(systemMetadata); + } + @Nonnull @Override public ChangeType getChangeType() { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index 82bc0ae1409c52..b98e2465e971a8 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -133,11 +133,15 @@ public static List getAdditionalChanges( return defaultAspects.stream() .map( entry -> - ChangeItemImpl.ChangeItemImplBuilder.build( - getProposalFromAspectForDefault( - entry.getKey(), entry.getValue(), entityKeyAspect, templateItem), - templateItem.getAuditStamp(), - opContext.getAspectRetriever())) + ChangeItemImpl.builder() + .build( + getProposalFromAspectForDefault( + entry.getKey(), + entry.getValue(), + entityKeyAspect, + templateItem), + templateItem.getAuditStamp(), + opContext.getAspectRetriever())) .filter(Objects::nonNull); }) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 35d133c74c0692..71e1aea59c711a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -58,7 +58,7 @@ import com.linkedin.r2.RemoteInvocationException; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.time.Clock; import java.util.ArrayList; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 75f16ae4d981d2..153aa0685f9b26 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -12,8 +12,8 @@ import static com.linkedin.metadata.utils.PegasusUtils.urnToEntityName; import static com.linkedin.metadata.utils.SystemMetadataUtils.createDefaultSystemMetadata; import static com.linkedin.metadata.utils.metrics.ExceptionUtils.collectMetrics; +import static com.linkedin.metadata.utils.metrics.MetricUtils.BATCH_SIZE_ATTR; -import com.codahale.metrics.Timer; import com.datahub.util.RecordUtils; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; @@ -48,6 +48,7 @@ import com.linkedin.metadata.aspect.batch.ChangeMCP; import com.linkedin.metadata.aspect.batch.MCLItem; import com.linkedin.metadata.aspect.batch.MCPItem; +import com.linkedin.metadata.aspect.plugins.validation.AspectValidationException; import com.linkedin.metadata.aspect.plugins.validation.ValidationExceptionCollection; import com.linkedin.metadata.aspect.utils.DefaultAspectsUtil; import com.linkedin.metadata.config.PreProcessHooks; @@ -84,7 +85,9 @@ import com.linkedin.r2.RemoteInvocationException; import com.linkedin.util.Pair; import 
io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.instrumentation.annotations.WithSpan; import jakarta.persistence.EntityNotFoundException; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; @@ -792,14 +795,17 @@ public List ingestAspects( // Handle throttling APIThrottle.evaluate(opContext, new HashSet<>(throttleEvents.values()), false); - List ingestResults = - ingestAspectsToLocalDB(opContext, aspectsBatch, overwrite); - - List mclResults = emitMCL(opContext, ingestResults, emitMCL); + IngestAspectsResult ingestResults = ingestAspectsToLocalDB(opContext, aspectsBatch, overwrite); + // Produce MCLs & run side effects + List mclResults = + emitMCL(opContext, ingestResults.getUpdateAspectResults(), emitMCL); processPostCommitMCLSideEffects( opContext, mclResults.stream().map(UpdateAspectResult::toMCL).collect(Collectors.toList())); + // Produce FailedMCPs for tracing + produceFailedMCPs(opContext, ingestResults); + return mclResults; } @@ -827,6 +833,7 @@ private void processPostCommitMCLSideEffects( sideEffects -> { long count = ingestProposalAsync( + opContext, AspectsBatchImpl.builder() .items(sideEffects) .retrieverContext(opContext.getRetrieverContext()) @@ -847,254 +854,299 @@ private void processPostCommitMCLSideEffects( * @return Details about the new and old version of the aspect */ @Nonnull - private List ingestAspectsToLocalDB( + private IngestAspectsResult ingestAspectsToLocalDB( @Nonnull OperationContext opContext, @Nonnull final AspectsBatch inputBatch, boolean overwrite) { - if (inputBatch.containsDuplicateAspects()) { - log.warn("Batch contains duplicates: {}", inputBatch.duplicateAspects()); - MetricUtils.counter(EntityServiceImpl.class, "batch_with_duplicate").inc(); - } - - return aspectDao - .runInTransactionWithRetry( - (txContext) -> { - // Generate default aspects within the transaction (they are re-calculated on retry) - AspectsBatch batchWithDefaults = - DefaultAspectsUtil.withAdditionalChanges( - opContext, inputBatch, this, enableBrowseV2); - - // Read before write is unfortunate, however batch it - final Map> urnAspects = batchWithDefaults.getUrnAspectsMap(); - - // read #1 - // READ COMMITED is used in conjunction with SELECT FOR UPDATE (read lock) in order - // to ensure that the aspect's version is not modified outside the transaction. - // We rely on the retry mechanism if the row is modified and will re-read (require the - // lock) - Map> databaseAspects = - aspectDao.getLatestAspects(urnAspects, true); - - final Map> batchAspects = - EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects); - - // read #2 (potentially) - final Map> nextVersions = - EntityUtils.calculateNextVersions(txContext, aspectDao, batchAspects, urnAspects); - - // 1. Convert patches to full upserts - // 2. 
Run any entity/aspect level hooks - Pair>, List> updatedItems = - batchWithDefaults.toUpsertBatchItems(batchAspects, nextVersions); - - // Fetch additional information if needed - final List changeMCPs; - - if (!updatedItems.getFirst().isEmpty()) { - // These items are new items from side effects - Map> sideEffects = updatedItems.getFirst(); - - final Map> updatedLatestAspects; - final Map> updatedNextVersions; - - Map> newLatestAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext(), - aspectDao.getLatestAspects(updatedItems.getFirst(), true)); - // merge - updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); - - Map> newNextVersions = - EntityUtils.calculateNextVersions( - txContext, aspectDao, updatedLatestAspects, updatedItems.getFirst()); - // merge - updatedNextVersions = AspectsBatch.merge(nextVersions, newNextVersions); - - changeMCPs = - updatedItems.getSecond().stream() - .peek( - changeMCP -> { - // Add previous version to each side-effect - if (sideEffects - .getOrDefault( - changeMCP.getUrn().toString(), Collections.emptySet()) - .contains(changeMCP.getAspectName())) { - - AspectsBatch.incrementBatchVersion( - changeMCP, updatedLatestAspects, updatedNextVersions); - } - }) - .collect(Collectors.toList()); - } else { - changeMCPs = updatedItems.getSecond(); - } - - // No changes, return - if (changeMCPs.isEmpty()) { - MetricUtils.counter(EntityServiceImpl.class, "batch_empty").inc(); - return Collections.emptyList(); - } - - // do final pre-commit checks with previous aspect value - ValidationExceptionCollection exceptions = - AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); - - if (exceptions.hasFatalExceptions()) { - // IF this is a client request/API request we fail the `transaction batch` - if (opContext.getRequestContext() != null) { - MetricUtils.counter(EntityServiceImpl.class, "batch_request_validation_exception") - .inc(); - throw new ValidationException(collectMetrics(exceptions).toString()); - } + return opContext.withSpan( + "ingestAspectsToLocalDB", + () -> { + if (inputBatch.containsDuplicateAspects()) { + log.warn("Batch contains duplicates: {}", inputBatch.duplicateAspects()); + MetricUtils.counter(EntityServiceImpl.class, "batch_with_duplicate").inc(); + } - MetricUtils.counter(EntityServiceImpl.class, "batch_consumer_validation_exception") - .inc(); - log.error("mce-consumer batch exceptions: {}", collectMetrics(exceptions)); - } + return aspectDao + .runInTransactionWithRetry( + (txContext) -> { + // Generate default aspects within the transaction (they are re-calculated on + // retry) + AspectsBatch batchWithDefaults = + DefaultAspectsUtil.withAdditionalChanges( + opContext, inputBatch, this, enableBrowseV2); + + // Read before write is unfortunate, however batch it + final Map> urnAspects = + batchWithDefaults.getUrnAspectsMap(); + + // read #1 + // READ COMMITED is used in conjunction with SELECT FOR UPDATE (read lock) in + // order + // to ensure that the aspect's version is not modified outside the transaction. + // We rely on the retry mechanism if the row is modified and will re-read + // (require the + // lock) + Map> databaseAspects = + aspectDao.getLatestAspects(urnAspects, true); + + final Map> batchAspects = + EntityUtils.toSystemAspects( + opContext.getRetrieverContext(), databaseAspects); + + // read #2 (potentially) + final Map> nextVersions = + EntityUtils.calculateNextVersions( + txContext, aspectDao, batchAspects, urnAspects); + + // 1. 
Convert patches to full upserts + // 2. Run any entity/aspect level hooks + Pair>, List> updatedItems = + batchWithDefaults.toUpsertBatchItems(batchAspects, nextVersions); + + // Fetch additional information if needed + final List changeMCPs; + + if (!updatedItems.getFirst().isEmpty()) { + // These items are new items from side effects + Map> sideEffects = updatedItems.getFirst(); + + final Map> updatedLatestAspects; + final Map> updatedNextVersions; + + Map> newLatestAspects = + EntityUtils.toSystemAspects( + opContext.getRetrieverContext(), + aspectDao.getLatestAspects(updatedItems.getFirst(), true)); + // merge + updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); + + Map> newNextVersions = + EntityUtils.calculateNextVersions( + txContext, aspectDao, updatedLatestAspects, updatedItems.getFirst()); + // merge + updatedNextVersions = AspectsBatch.merge(nextVersions, newNextVersions); + + changeMCPs = + updatedItems.getSecond().stream() + .peek( + changeMCP -> { + // Add previous version to each side-effect + if (sideEffects + .getOrDefault( + changeMCP.getUrn().toString(), Collections.emptySet()) + .contains(changeMCP.getAspectName())) { + + AspectsBatch.incrementBatchVersion( + changeMCP, updatedLatestAspects, updatedNextVersions); + } + }) + .collect(Collectors.toList()); + } else { + changeMCPs = updatedItems.getSecond(); + } - // Database Upsert successfully validated results - log.info( - "Ingesting aspects batch to database: {}", - AspectsBatch.toAbbreviatedString(changeMCPs, 2048)); - Timer.Context ingestToLocalDBTimer = - MetricUtils.timer(this.getClass(), "ingestAspectsToLocalDB").time(); - List upsertResults = - exceptions - .streamSuccessful(changeMCPs.stream()) - .map( - writeItem -> { - - /* - database*Aspect - should be used for comparisons of before batch operation information - */ - final EntityAspect databaseAspect = - databaseAspects - .getOrDefault(writeItem.getUrn().toString(), Map.of()) - .get(writeItem.getAspectName()); - final EntityAspect.EntitySystemAspect databaseSystemAspect = - databaseAspect == null - ? 
null - : EntityAspect.EntitySystemAspect.builder() - .build( - writeItem.getEntitySpec(), - writeItem.getAspectSpec(), - databaseAspect); - - /* - This condition is specifically for an older conditional write ingestAspectIfNotPresent() - overwrite is always true otherwise - */ - if (overwrite || databaseAspect == null) { - return Optional.ofNullable( - ingestAspectToLocalDB( - txContext, writeItem, databaseSystemAspect)) - .map( - optResult -> optResult.toBuilder().request(writeItem).build()) - .orElse(null); - } + // No changes, return + if (changeMCPs.isEmpty()) { + MetricUtils.counter(EntityServiceImpl.class, "batch_empty").inc(); + return IngestAspectsResult.EMPTY; + } - return null; - }) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + // do final pre-commit checks with previous aspect value + ValidationExceptionCollection exceptions = + AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); + + List>> failedUpsertResults = + new ArrayList<>(); + if (exceptions.hasFatalExceptions()) { + // IF this is a client request/API request we fail the `transaction batch` + if (opContext.getRequestContext() != null) { + MetricUtils.counter( + EntityServiceImpl.class, "batch_request_validation_exception") + .inc(); + throw new ValidationException(collectMetrics(exceptions).toString()); + } + + MetricUtils.counter( + EntityServiceImpl.class, "batch_consumer_validation_exception") + .inc(); + log.error("mce-consumer batch exceptions: {}", collectMetrics(exceptions)); + failedUpsertResults = + exceptions + .streamExceptions(changeMCPs.stream()) + .map( + writeItem -> + Pair.of( + writeItem, + exceptions.get( + Pair.of( + writeItem.getUrn(), writeItem.getAspectName())))) + .collect(Collectors.toList()); + } - if (!upsertResults.isEmpty()) { - // commit upserts prior to retention or kafka send, if supported by impl - if (txContext != null) { - txContext.commitAndContinue(); - } - long took = TimeUnit.NANOSECONDS.toMillis(ingestToLocalDBTimer.stop()); - if (took > DB_TIMER_LOG_THRESHOLD_MS) { - log.info("Ingestion of aspects batch to database took {} ms", took); - } + // Database Upsert successfully validated results + log.info( + "Ingesting aspects batch to database: {}", + AspectsBatch.toAbbreviatedString(changeMCPs, 2048)); - // Retention optimization and tx - if (retentionService != null) { - List retentionBatch = - upsertResults.stream() - // Only consider retention when there was a previous version - .filter( - result -> - batchAspects.containsKey(result.getUrn().toString()) - && batchAspects - .get(result.getUrn().toString()) - .containsKey(result.getRequest().getAspectName())) - .filter( - result -> { - RecordTemplate oldAspect = result.getOldValue(); - RecordTemplate newAspect = result.getNewValue(); - // Apply retention policies if there was an update to existing - // aspect - // value - return oldAspect != newAspect - && oldAspect != null - && retentionService != null; - }) - .map( - result -> - RetentionService.RetentionContext.builder() - .urn(result.getUrn()) - .aspectName(result.getRequest().getAspectName()) - .maxVersion(Optional.of(result.getMaxVersion())) - .build()) - .collect(Collectors.toList()); - retentionService.applyRetentionWithPolicyDefaults(opContext, retentionBatch); - } else { - log.warn("Retention service is missing!"); - } - } else { - MetricUtils.counter(EntityServiceImpl.class, "batch_empty_transaction").inc(); - // This includes no-op batches. i.e. 
patch removing non-existent items - log.debug("Empty transaction detected"); - } + List upsertResults = + exceptions + .streamSuccessful(changeMCPs.stream()) + .map( + writeItem -> { + + /* + database*Aspect - should be used for comparisons of before batch operation information + */ + final EntityAspect databaseAspect = + databaseAspects + .getOrDefault(writeItem.getUrn().toString(), Map.of()) + .get(writeItem.getAspectName()); + final EntityAspect.EntitySystemAspect databaseSystemAspect = + databaseAspect == null + ? null + : EntityAspect.EntitySystemAspect.builder() + .build( + writeItem.getEntitySpec(), + writeItem.getAspectSpec(), + databaseAspect); + + /* + This condition is specifically for an older conditional write ingestAspectIfNotPresent() + overwrite is always true otherwise + */ + if (overwrite || databaseAspect == null) { + return Optional.ofNullable( + ingestAspectToLocalDB( + opContext, + txContext, + writeItem, + databaseSystemAspect)) + .map( + optResult -> + optResult.toBuilder().request(writeItem).build()) + .orElse(null); + } + + return null; + }) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + + if (!upsertResults.isEmpty()) { + // commit upserts prior to retention or kafka send, if supported by impl + if (txContext != null) { + txContext.commitAndContinue(); + } + + // Retention optimization and tx + if (retentionService != null) { + opContext.withSpan( + "retentionService", + () -> { + List retentionBatch = + upsertResults.stream() + // Only consider retention when there was a previous version + .filter( + result -> + batchAspects.containsKey(result.getUrn().toString()) + && batchAspects + .get(result.getUrn().toString()) + .containsKey( + result.getRequest().getAspectName())) + .filter( + result -> { + RecordTemplate oldAspect = result.getOldValue(); + RecordTemplate newAspect = result.getNewValue(); + // Apply retention policies if there was an update to + // existing + // aspect + // value + return oldAspect != newAspect + && oldAspect != null + && retentionService != null; + }) + .map( + result -> + RetentionService.RetentionContext.builder() + .urn(result.getUrn()) + .aspectName(result.getRequest().getAspectName()) + .maxVersion(Optional.of(result.getMaxVersion())) + .build()) + .collect(Collectors.toList()); + retentionService.applyRetentionWithPolicyDefaults( + opContext, retentionBatch); + }, + BATCH_SIZE_ATTR, + String.valueOf(upsertResults.size())); + } else { + log.warn("Retention service is missing!"); + } + } else { + MetricUtils.counter(EntityServiceImpl.class, "batch_empty_transaction").inc(); + // This includes no-op batches. i.e. patch removing non-existent items + log.debug("Empty transaction detected"); + } - return upsertResults; - }, - inputBatch, - DEFAULT_MAX_TRANSACTION_RETRY) - .stream() - .filter(Objects::nonNull) - .flatMap(List::stream) - .collect(Collectors.toList()); + return IngestAspectsResult.builder() + .updateAspectResults(upsertResults) + .failedUpdateAspectResults(failedUpsertResults) + .build(); + }, + inputBatch, + DEFAULT_MAX_TRANSACTION_RETRY) + .stream() + .reduce(IngestAspectsResult.EMPTY, IngestAspectsResult::combine); + }, + BATCH_SIZE_ATTR, + String.valueOf(inputBatch.getItems().size()), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "ingestAspectsToLocalDB")); } @Nonnull private List emitMCL( @Nonnull OperationContext opContext, List sqlResults, boolean emitMCL) { - List withEmitMCL = - sqlResults.stream() - .map(result -> emitMCL ? 
conditionallyProduceMCLAsync(opContext, result) : result) - .collect(Collectors.toList()); - - // join futures messages, capture error state - List> statusPairs = - withEmitMCL.stream() - .filter(result -> result.getMclFuture() != null) - .map( - result -> { - try { - result.getMclFuture().get(); - return Pair.of(true, result); - } catch (InterruptedException | ExecutionException e) { - return Pair.of(false, result); - } - }) - .collect(Collectors.toList()); - if (statusPairs.stream().anyMatch(p -> !p.getFirst())) { - log.error( - "Failed to produce MCLs: {}", - statusPairs.stream() - .filter(p -> !p.getFirst()) - .map(Pair::getValue) - .map(v -> v.getRequest().toString()) - .collect(Collectors.toList())); - // TODO restoreIndices? - throw new RuntimeException("Failed to produce MCLs"); - } + return opContext.withSpan( + "emitMCL", + () -> { + List withEmitMCL = + sqlResults.stream() + .map(result -> emitMCL ? conditionallyProduceMCLAsync(opContext, result) : result) + .collect(Collectors.toList()); + + // join futures messages, capture error state + List> statusPairs = + withEmitMCL.stream() + .filter(result -> result.getMclFuture() != null) + .map( + result -> { + try { + result.getMclFuture().get(); + return Pair.of(true, result); + } catch (InterruptedException | ExecutionException e) { + return Pair.of(false, result); + } + }) + .collect(Collectors.toList()); + + if (statusPairs.stream().anyMatch(p -> !p.getFirst())) { + log.error( + "Failed to produce MCLs: {}", + statusPairs.stream() + .filter(p -> !p.getFirst()) + .map(Pair::getValue) + .map(v -> v.getRequest().toString()) + .collect(Collectors.toList())); + // TODO restoreIndices? + throw new RuntimeException("Failed to produce MCLs"); + } - return withEmitMCL; + return withEmitMCL; + }, + BATCH_SIZE_ATTR, + String.valueOf(sqlResults.size())); } /** @@ -1193,7 +1245,9 @@ public List ingestProposal( Stream timeseriesIngestResults = ingestTimeseriesProposal(opContext, aspectsBatch, async); Stream nonTimeseriesIngestResults = - async ? ingestProposalAsync(aspectsBatch) : ingestProposalSync(opContext, aspectsBatch); + async + ? 
ingestProposalAsync(opContext, aspectsBatch) + : ingestProposalSync(opContext, aspectsBatch); return Stream.concat(nonTimeseriesIngestResults, timeseriesIngestResults) .collect(Collectors.toList()); @@ -1222,89 +1276,99 @@ private Stream ingestTimeseriesProposal( + unsupported.stream().map(BatchItem::getChangeType).collect(Collectors.toSet())); } - if (!async) { - // Handle throttling - APIThrottle.evaluate(opContext, new HashSet<>(throttleEvents.values()), true); + return opContext.withSpan( + "ingestTimeseriesProposal", + () -> { + if (!async) { + // Handle throttling + APIThrottle.evaluate(opContext, new HashSet<>(throttleEvents.values()), true); + + // Create default non-timeseries aspects for timeseries aspects + List timeseriesKeyAspects = + aspectsBatch.getMCPItems().stream() + .filter( + item -> item.getAspectSpec() != null && item.getAspectSpec().isTimeseries()) + .map( + item -> + ChangeItemImpl.builder() + .urn(item.getUrn()) + .aspectName(item.getEntitySpec().getKeyAspectName()) + .changeType(ChangeType.UPSERT) + .entitySpec(item.getEntitySpec()) + .aspectSpec(item.getEntitySpec().getKeyAspectSpec()) + .auditStamp(item.getAuditStamp()) + .systemMetadata(item.getSystemMetadata()) + .recordTemplate( + EntityApiUtils.buildKeyAspect( + opContext.getEntityRegistry(), item.getUrn())) + .build(opContext.getAspectRetriever())) + .collect(Collectors.toList()); + + ingestProposalSync( + opContext, + AspectsBatchImpl.builder() + .retrieverContext(aspectsBatch.getRetrieverContext()) + .items(timeseriesKeyAspects) + .build()); + } - // Create default non-timeseries aspects for timeseries aspects - List timeseriesKeyAspects = - aspectsBatch.getMCPItems().stream() - .filter(item -> item.getAspectSpec() != null && item.getAspectSpec().isTimeseries()) + // Emit timeseries MCLs + List, Boolean>>>> timeseriesResults = + aspectsBatch.getItems().stream() + .filter( + item -> item.getAspectSpec() != null && item.getAspectSpec().isTimeseries()) + .map(item -> (MCPItem) item) + .map( + item -> + Pair.of( + item, + conditionallyProduceMCLAsync( + opContext, + null, + null, + item.getRecordTemplate(), + item.getSystemMetadata(), + item.getMetadataChangeProposal(), + item.getUrn(), + item.getAuditStamp(), + item.getAspectSpec()))) + .collect(Collectors.toList()); + + return timeseriesResults.stream() .map( - item -> - ChangeItemImpl.builder() - .urn(item.getUrn()) - .aspectName(item.getEntitySpec().getKeyAspectName()) - .changeType(ChangeType.UPSERT) - .entitySpec(item.getEntitySpec()) - .aspectSpec(item.getEntitySpec().getKeyAspectSpec()) - .auditStamp(item.getAuditStamp()) - .systemMetadata(item.getSystemMetadata()) - .recordTemplate( - EntityApiUtils.buildKeyAspect( - opContext.getEntityRegistry(), item.getUrn())) - .build(opContext.getAspectRetriever())) - .collect(Collectors.toList()); - - ingestProposalSync( - opContext, - AspectsBatchImpl.builder() - .retrieverContext(aspectsBatch.getRetrieverContext()) - .items(timeseriesKeyAspects) - .build()); - } - - // Emit timeseries MCLs - List, Boolean>>>> timeseriesResults = - aspectsBatch.getItems().stream() - .filter(item -> item.getAspectSpec() != null && item.getAspectSpec().isTimeseries()) - .map(item -> (MCPItem) item) - .map( - item -> - Pair.of( - item, - conditionallyProduceMCLAsync( - opContext, - null, - null, - item.getRecordTemplate(), - item.getSystemMetadata(), - item.getMetadataChangeProposal(), - item.getUrn(), - item.getAuditStamp(), - item.getAspectSpec()))) - .collect(Collectors.toList()); - - return timeseriesResults.stream() - .map( 
- result -> { - MCPItem item = result.getFirst(); - Optional, Boolean>> emissionStatus = result.getSecond(); - - emissionStatus.ifPresent( - status -> { - try { - status.getFirst().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } + result -> { + MCPItem item = result.getFirst(); + Optional, Boolean>> emissionStatus = result.getSecond(); + + emissionStatus.ifPresent( + status -> { + try { + status.getFirst().get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); + + return IngestResult.builder() + .urn(item.getUrn()) + .request(item) + .result( + UpdateAspectResult.builder() + .urn(item.getUrn()) + .newValue(item.getRecordTemplate()) + .auditStamp(item.getAuditStamp()) + .newSystemMetadata(item.getSystemMetadata()) + .build()) + .publishedMCL( + emissionStatus.map(status -> status.getFirst() != null).orElse(false)) + .processedMCL(emissionStatus.map(Pair::getSecond).orElse(false)) + .build(); }); - - return IngestResult.builder() - .urn(item.getUrn()) - .request(item) - .result( - UpdateAspectResult.builder() - .urn(item.getUrn()) - .newValue(item.getRecordTemplate()) - .auditStamp(item.getAuditStamp()) - .newSystemMetadata(item.getSystemMetadata()) - .build()) - .publishedMCL( - emissionStatus.map(status -> status.getFirst() != null).orElse(false)) - .processedMCL(emissionStatus.map(Pair::getSecond).orElse(false)) - .build(); - }); + }, + "async", + String.valueOf(async), + BATCH_SIZE_ATTR, + String.valueOf(aspectsBatch.getItems().size())); } /** @@ -1313,81 +1377,100 @@ private Stream ingestTimeseriesProposal( * @param aspectsBatch non-timeseries ingest aspects * @return produced items to the MCP topic */ - private Stream ingestProposalAsync(AspectsBatch aspectsBatch) { - List nonTimeseries = - aspectsBatch.getMCPItems().stream() - .filter(item -> item.getAspectSpec() == null || !item.getAspectSpec().isTimeseries()) - .collect(Collectors.toList()); - - List> futures = - nonTimeseries.stream() - .map( - item -> - // When async is turned on, we write to proposal log and return without waiting - producer.produceMetadataChangeProposal( - item.getUrn(), item.getMetadataChangeProposal())) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + private Stream ingestProposalAsync( + OperationContext opContext, AspectsBatch aspectsBatch) { + return opContext.withSpan( + "ingestProposalAsync", + () -> { + List nonTimeseries = + aspectsBatch.getMCPItems().stream() + .filter( + item -> item.getAspectSpec() == null || !item.getAspectSpec().isTimeseries()) + .collect(Collectors.toList()); + + List> futures = + nonTimeseries.stream() + .map( + item -> { + // When async is turned on, we write to proposal log and return without + // waiting + return producer.produceMetadataChangeProposal( + opContext, item.getUrn(), item); + }) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + + futures.forEach( + f -> { + try { + f.get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); - try { - return nonTimeseries.stream() - .map( - item -> - IngestResult.builder() - .urn(item.getUrn()) - .request(item) - .publishedMCP(true) - .build()); - } finally { - futures.forEach( - f -> { - try { - f.get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); - } + return nonTimeseries.stream() + .map( + item -> + IngestResult.builder() + .urn(item.getUrn()) + .request(item) + .publishedMCP(true) + 
.build()); + }, + BATCH_SIZE_ATTR, + String.valueOf(aspectsBatch.getItems().size())); } private Stream ingestProposalSync( @Nonnull OperationContext opContext, AspectsBatch aspectsBatch) { - AspectsBatchImpl nonTimeseries = - AspectsBatchImpl.builder() - .retrieverContext(aspectsBatch.getRetrieverContext()) - .items( - aspectsBatch.getItems().stream() - .filter(item -> !item.getAspectSpec().isTimeseries()) - .collect(Collectors.toList())) - .build(); + return opContext.withSpan( + "ingestProposalSync", + () -> { + AspectsBatchImpl nonTimeseries = + AspectsBatchImpl.builder() + .retrieverContext(aspectsBatch.getRetrieverContext()) + .items( + aspectsBatch.getItems().stream() + .filter(item -> !item.getAspectSpec().isTimeseries()) + .collect(Collectors.toList())) + .build(); - List unsupported = - nonTimeseries.getMCPItems().stream() - .filter(item -> !MCPItem.isValidChangeType(item.getChangeType(), item.getAspectSpec())) - .collect(Collectors.toList()); - if (!unsupported.isEmpty()) { - throw new UnsupportedOperationException( - "ChangeType not supported: " - + unsupported.stream().map(item -> item.getChangeType()).collect(Collectors.toSet())); - } + List unsupported = + nonTimeseries.getMCPItems().stream() + .filter( + item -> + !MCPItem.isValidChangeType(item.getChangeType(), item.getAspectSpec())) + .collect(Collectors.toList()); + if (!unsupported.isEmpty()) { + throw new UnsupportedOperationException( + "ChangeType not supported: " + + unsupported.stream() + .map(item -> item.getChangeType()) + .collect(Collectors.toSet())); + } - List upsertResults = ingestAspects(opContext, nonTimeseries, true, true); + List upsertResults = + ingestAspects(opContext, nonTimeseries, true, true); - return upsertResults.stream() - .map( - result -> { - ChangeMCP item = result.getRequest(); - - return IngestResult.builder() - .urn(item.getUrn()) - .request(item) - .result(result) - .publishedMCL(result.getMclFuture() != null) - .sqlCommitted(true) - .isUpdate(result.getOldValue() != null) - .build(); - }); + return upsertResults.stream() + .map( + result -> { + ChangeMCP item = result.getRequest(); + + return IngestResult.builder() + .urn(item.getUrn()) + .request(item) + .result(result) + .publishedMCL(result.getMclFuture() != null) + .sqlCommitted(true) + .isUpdate(result.getOldValue() != null) + .build(); + }); + }, + BATCH_SIZE_ATTR, + String.valueOf(aspectsBatch.getItems().size())); } @Override @@ -1768,8 +1851,10 @@ public Pair, Boolean> alwaysProduceMCLAsync( @Nonnull final Urn urn, @Nonnull final AspectSpec aspectSpec, @Nonnull final MetadataChangeLog metadataChangeLog) { - Future future = producer.produceMetadataChangeLog(urn, aspectSpec, metadataChangeLog); - return Pair.of(future, preprocessEvent(opContext, metadataChangeLog)); + boolean preprocessed = preprocessEvent(opContext, metadataChangeLog); + Future future = + producer.produceMetadataChangeLog(opContext, urn, aspectSpec, metadataChangeLog); + return Pair.of(future, preprocessed); } @Override @@ -1830,6 +1915,19 @@ public Optional, Boolean>> conditionallyProduceMCLAsync( log.debug("Serialized MCL event: {}", metadataChangeLog); Pair, Boolean> emissionStatus = alwaysProduceMCLAsync(opContext, entityUrn, aspectSpec, metadataChangeLog); + + // for tracing propagate properties to system meta + if (newSystemMetadata != null && metadataChangeLog.getSystemMetadata().hasProperties()) { + if (!newSystemMetadata.hasProperties()) { + newSystemMetadata.setProperties( + metadataChangeLog.getSystemMetadata().getProperties(), SetMode.IGNORE_NULL); + } 
else { + newSystemMetadata + .getProperties() + .putAll(metadataChangeLog.getSystemMetadata().getProperties()); + } + } + return emissionStatus.getFirst() != null ? Optional.of(emissionStatus) : Optional.empty(); } else { log.info( @@ -1865,6 +1963,35 @@ private UpdateAspectResult conditionallyProduceMCLAsync( .orElse(result); } + public void produceFailedMCPs( + @Nonnull OperationContext opContext, @Nonnull IngestAspectsResult ingestAspectsResult) { + + if (!ingestAspectsResult.getFailedUpdateAspectResults().isEmpty()) { + Span currentSpan = Span.current(); + currentSpan.recordException( + new IllegalStateException("Batch contains failed aspect validations.")); + currentSpan.setStatus(StatusCode.ERROR, "Batch contains failed aspect validations."); + currentSpan.setAttribute(MetricUtils.ERROR_TYPE, IllegalStateException.class.getName()); + + List> futures = + ingestAspectsResult.getFailedUpdateAspectResults().stream() + .map( + failedItem -> + producer.produceFailedMetadataChangeProposalAsync( + opContext, failedItem.getFirst(), new HashSet<>(failedItem.getSecond()))) + .collect(Collectors.toList()); + + futures.forEach( + f -> { + try { + f.get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); + } + } + @Override public void ingestEntities( @Nonnull OperationContext opContext, @@ -2528,6 +2655,7 @@ private Map getEnvelopedAspects( */ @Nullable private UpdateAspectResult ingestAspectToLocalDB( + @Nonnull OperationContext opContext, @Nullable TransactionContext txContext, @Nonnull final ChangeMCP writeItem, @Nullable final EntityAspect.EntitySystemAspect databaseAspect) { @@ -2597,7 +2725,7 @@ private UpdateAspectResult ingestAspectToLocalDB( previousBatchAspect.getCreatedBy(), null, previousBatchAspect.getCreatedOn(), - RecordUtils.toJsonString(latestSystemMetadataDiff.get()), + RecordUtils.toJsonString(opContext.withTraceId(latestSystemMetadataDiff.get())), previousBatchAspect.getVersion(), false); @@ -2663,7 +2791,7 @@ private UpdateAspectResult ingestAspectToLocalDB( ? 
writeItem.getAuditStamp().getImpersonator().toString() : null, new Timestamp(writeItem.getAuditStamp().getTime()), - EntityApiUtils.toJsonAspect(writeItem.getSystemMetadata()), + EntityApiUtils.toJsonAspect(opContext.withTraceId(writeItem.getSystemMetadata())), writeItem.getNextAspectVersion()); // metrics diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java index c595e3e07b8342..eb3c5b9cca0671 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java @@ -31,7 +31,7 @@ import com.linkedin.retention.TimeBasedRetention; import com.linkedin.retention.VersionBasedRetention; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.sql.Timestamp; import java.time.Clock; import java.util.List; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java index 74d0d8b0964de0..5fbac036bd05e6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java @@ -26,7 +26,7 @@ import io.ebean.TxScope; import io.ebeaninternal.server.expression.Op; import io.ebeaninternal.server.expression.SimpleExpression; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.sql.Timestamp; import java.time.Clock; import java.util.List; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/validation/ValidationUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/validation/ValidationUtils.java index 6ecac70e13c7e5..7a792efd2984d2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/validation/ValidationUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/validation/ValidationUtils.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.entity.validation; -import com.codahale.metrics.Timer; import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.AbstractArrayTemplate; @@ -38,33 +37,36 @@ public static SearchResult validateSearchResult( @Nonnull OperationContext opContext, final SearchResult searchResult, @Nonnull final EntityService entityService) { - try (Timer.Context ignored = - MetricUtils.timer(ValidationUtils.class, "validateSearchResult").time()) { - if (searchResult == null) { - return null; - } - Objects.requireNonNull(entityService, "entityService must not be null"); - - SearchResult validatedSearchResult = - new SearchResult() - .setFrom(searchResult.getFrom()) - .setMetadata(searchResult.getMetadata()) - .setPageSize(searchResult.getPageSize()) - .setNumEntities(searchResult.getNumEntities()); - - SearchEntityArray validatedEntities = - validateSearchUrns( - opContext, - searchResult.getEntities(), - SearchEntity::getEntity, - entityService, - true, - true) - .collect(Collectors.toCollection(SearchEntityArray::new)); - validatedSearchResult.setEntities(validatedEntities); - - return validatedSearchResult; - } + return opContext.withSpan( 
+ "validateSearchResult", + () -> { + if (searchResult == null) { + return null; + } + Objects.requireNonNull(entityService, "entityService must not be null"); + + SearchResult validatedSearchResult = + new SearchResult() + .setFrom(searchResult.getFrom()) + .setMetadata(searchResult.getMetadata()) + .setPageSize(searchResult.getPageSize()) + .setNumEntities(searchResult.getNumEntities()); + + SearchEntityArray validatedEntities = + validateSearchUrns( + opContext, + searchResult.getEntities(), + SearchEntity::getEntity, + entityService, + true, + true) + .collect(Collectors.toCollection(SearchEntityArray::new)); + validatedSearchResult.setEntities(validatedEntities); + + return validatedSearchResult; + }, + MetricUtils.DROPWIZARD_METRIC, + MetricUtils.name(ValidationUtils.class, "validateSearchResult")); } public static ScrollResult validateScrollResult( @@ -104,102 +106,113 @@ public static BrowseResult validateBrowseResult( @Nonnull OperationContext opContext, final BrowseResult browseResult, @Nonnull final EntityService entityService) { - try (Timer.Context ignored = - MetricUtils.timer(ValidationUtils.class, "validateBrowseResult").time()) { - if (browseResult == null) { - return null; - } - Objects.requireNonNull(entityService, "entityService must not be null"); - - BrowseResult validatedBrowseResult = - new BrowseResult() - .setGroups(browseResult.getGroups()) - .setMetadata(browseResult.getMetadata()) - .setFrom(browseResult.getFrom()) - .setPageSize(browseResult.getPageSize()) - .setNumGroups(browseResult.getNumGroups()) - .setNumEntities(browseResult.getNumEntities()) - .setNumElements(browseResult.getNumElements()); - - BrowseResultEntityArray validatedEntities = - validateSearchUrns( - opContext, - browseResult.getEntities(), - BrowseResultEntity::getUrn, - entityService, - true, - true) - .collect(Collectors.toCollection(BrowseResultEntityArray::new)); - validatedBrowseResult.setEntities(validatedEntities); - - return validatedBrowseResult; - } + return opContext.withSpan( + "validateBrowseResult", + () -> { + if (browseResult == null) { + return null; + } + Objects.requireNonNull(entityService, "entityService must not be null"); + + BrowseResult validatedBrowseResult = + new BrowseResult() + .setGroups(browseResult.getGroups()) + .setMetadata(browseResult.getMetadata()) + .setFrom(browseResult.getFrom()) + .setPageSize(browseResult.getPageSize()) + .setNumGroups(browseResult.getNumGroups()) + .setNumEntities(browseResult.getNumEntities()) + .setNumElements(browseResult.getNumElements()); + + BrowseResultEntityArray validatedEntities = + validateSearchUrns( + opContext, + browseResult.getEntities(), + BrowseResultEntity::getUrn, + entityService, + true, + true) + .collect(Collectors.toCollection(BrowseResultEntityArray::new)); + validatedBrowseResult.setEntities(validatedEntities); + + return validatedBrowseResult; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(ValidationUtils.class, "validateBrowseResult")); } public static ListResult validateListResult( @Nonnull OperationContext opContext, final ListResult listResult, @Nonnull final EntityService entityService) { - try (Timer.Context ignored = - MetricUtils.timer(ValidationUtils.class, "validateListResult").time()) { - if (listResult == null) { - return null; - } - Objects.requireNonNull(entityService, "entityService must not be null"); - - ListResult validatedListResult = - new ListResult() - .setStart(listResult.getStart()) - .setCount(listResult.getCount()) - .setTotal(listResult.getTotal()); - - UrnArray 
validatedEntities = - validateSearchUrns( - opContext, - listResult.getEntities(), - Function.identity(), - entityService, - true, - true) - .collect(Collectors.toCollection(UrnArray::new)); - validatedListResult.setEntities(validatedEntities); - - return validatedListResult; - } + + return opContext.withSpan( + "validateListResult", + () -> { + if (listResult == null) { + return null; + } + Objects.requireNonNull(entityService, "entityService must not be null"); + + ListResult validatedListResult = + new ListResult() + .setStart(listResult.getStart()) + .setCount(listResult.getCount()) + .setTotal(listResult.getTotal()); + + UrnArray validatedEntities = + validateSearchUrns( + opContext, + listResult.getEntities(), + Function.identity(), + entityService, + true, + true) + .collect(Collectors.toCollection(UrnArray::new)); + validatedListResult.setEntities(validatedEntities); + + return validatedListResult; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(ValidationUtils.class, "validateListResult")); } public static LineageSearchResult validateLineageSearchResult( @Nonnull OperationContext opContext, final LineageSearchResult lineageSearchResult, @Nonnull final EntityService entityService) { - try (Timer.Context ignored = - MetricUtils.timer(ValidationUtils.class, "validateLineageResult").time()) { - if (lineageSearchResult == null) { - return null; - } - Objects.requireNonNull(entityService, "entityService must not be null"); - - LineageSearchResult validatedLineageSearchResult = - new LineageSearchResult() - .setMetadata(lineageSearchResult.getMetadata()) - .setFrom(lineageSearchResult.getFrom()) - .setPageSize(lineageSearchResult.getPageSize()) - .setNumEntities(lineageSearchResult.getNumEntities()); - - LineageSearchEntityArray validatedEntities = - validateSearchUrns( - opContext, - lineageSearchResult.getEntities(), - LineageSearchEntity::getEntity, - entityService, - true, - true) - .collect(Collectors.toCollection(LineageSearchEntityArray::new)); - validatedLineageSearchResult.setEntities(validatedEntities); - - log.debug("Returning validated lineage search results"); - return validatedLineageSearchResult; - } + + return opContext.withSpan( + "validateLineageResult", + () -> { + if (lineageSearchResult == null) { + return null; + } + Objects.requireNonNull(entityService, "entityService must not be null"); + + LineageSearchResult validatedLineageSearchResult = + new LineageSearchResult() + .setMetadata(lineageSearchResult.getMetadata()) + .setFrom(lineageSearchResult.getFrom()) + .setPageSize(lineageSearchResult.getPageSize()) + .setNumEntities(lineageSearchResult.getNumEntities()); + + LineageSearchEntityArray validatedEntities = + validateSearchUrns( + opContext, + lineageSearchResult.getEntities(), + LineageSearchEntity::getEntity, + entityService, + true, + true) + .collect(Collectors.toCollection(LineageSearchEntityArray::new)); + validatedLineageSearchResult.setEntities(validatedEntities); + + log.debug("Returning validated lineage search results"); + return validatedLineageSearchResult; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(ValidationUtils.class, "validateLineageResult")); } public static EntityLineageResult validateEntityLineageResult( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/event/EventProducer.java b/metadata-io/src/main/java/com/linkedin/metadata/event/EventProducer.java index a809c7f9a3e31b..15017231eee907 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/event/EventProducer.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/event/EventProducer.java @@ -1,19 +1,28 @@ package com.linkedin.metadata.event; import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.SetMode; +import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.DataHubUpgradeHistoryEvent; import com.linkedin.mxe.MetadataChangeLog; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.PlatformEvent; -import io.opentelemetry.extension.annotations.WithSpan; +import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.instrumentation.annotations.WithSpan; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; +import lombok.extern.slf4j.Slf4j; /** Interface implemented by producers of {@link com.linkedin.mxe.MetadataAuditEvent}s. */ -public interface EventProducer { +@Slf4j +public abstract class EventProducer { /** * Produces a {@link com.linkedin.mxe.MetadataChangeLog} from a new & previous aspect. @@ -23,22 +32,85 @@ public interface EventProducer { * @param metadataChangeLog metadata change log to push into MCL kafka topic * @return A {@link Future} object that reports when the message has been produced. */ - Future produceMetadataChangeLog( + public Future produceMetadataChangeLog( + @Nonnull OperationContext opContext, + @Nonnull final Urn urn, + @Nonnull AspectSpec aspectSpec, + @Nonnull final MetadataChangeLog metadataChangeLog) { + metadataChangeLog.setSystemMetadata( + opContext.withProducerTrace( + "produceMetadataChangeLog", + metadataChangeLog.getSystemMetadata(), + getMetadataChangeLogTopicName(aspectSpec)), + SetMode.IGNORE_NULL); + return produceMetadataChangeLog(urn, aspectSpec, metadataChangeLog); + } + + public abstract Future produceMetadataChangeLog( @Nonnull final Urn urn, @Nonnull AspectSpec aspectSpec, @Nonnull final MetadataChangeLog metadataChangeLog); + public abstract String getMetadataChangeLogTopicName(@Nonnull AspectSpec aspectSpec); + /** * Produces a {@link com.linkedin.mxe.MetadataChangeProposal} as an async update to an entity * * @param urn the urn associated with the change proposal. - * @param metadataChangeProposal metadata change proposal to push into MCP kafka topic. + * @param item Item which includes the metadata change proposal to push into MCP kafka topic. * @return A {@link Future} object that reports when the message has been produced. 
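The EventProducer hunk above follows a template-method shape: the new OperationContext-aware overloads stamp trace information into the message's SystemMetadata (via withProducerTrace) and then delegate to topic-specific abstract methods that concrete producers supply. A minimal self-contained sketch of that shape, using invented class names rather than the patch's actual KafkaEventProducer, looks roughly like this:

import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;

// Illustrative analogue only: the public entry point enriches metadata with trace
// context, then delegates to an abstract, topic-specific send method that
// subclasses implement.
abstract class TracingProducer {

  public final Future<?> produce(String topic, Map<String, String> systemMetadata, String payload) {
    // Stand-in for opContext.withProducerTrace(...): attach a trace identifier
    // to the metadata that travels with the message.
    Map<String, String> enriched = new HashMap<>(systemMetadata);
    enriched.put("traceId", UUID.randomUUID().toString());
    return send(topic, enriched, payload);
  }

  protected abstract Future<?> send(String topic, Map<String, String> systemMetadata, String payload);
}

class LoggingProducer extends TracingProducer {
  @Override
  protected Future<?> send(String topic, Map<String, String> systemMetadata, String payload) {
    // A real producer would publish to Kafka here; this sketch completes immediately.
    System.out.println("send " + topic + " " + systemMetadata + " " + payload);
    return CompletableFuture.completedFuture(null);
  }
}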
*/ + public Future produceMetadataChangeProposal( + @Nonnull OperationContext opContext, @Nonnull final Urn urn, @Nonnull MCPItem item) { + item.setSystemMetadata( + opContext.withProducerTrace( + "produceMetadataChangeProposal", + item.getSystemMetadata(), + getMetadataChangeProposalTopicName())); + return produceMetadataChangeProposal(urn, item.getMetadataChangeProposal()); + } + @WithSpan - Future produceMetadataChangeProposal( + public abstract Future produceMetadataChangeProposal( @Nonnull final Urn urn, @Nonnull MetadataChangeProposal metadataChangeProposal); + public abstract String getMetadataChangeProposalTopicName(); + + public Future produceFailedMetadataChangeProposalAsync( + @Nonnull OperationContext opContext, + @Nonnull MCPItem item, + @Nonnull Set throwables) { + return produceFailedMetadataChangeProposalAsync( + opContext, item.getMetadataChangeProposal(), throwables); + } + + public void produceFailedMetadataChangeProposal( + @Nonnull OperationContext opContext, + @Nonnull List mcps, + @Nonnull Throwable throwable) { + List> futures = + mcps.stream() + .map( + event -> + produceFailedMetadataChangeProposalAsync(opContext, event, Set.of(throwable))) + .collect(Collectors.toList()); + + futures.forEach( + f -> { + try { + f.get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); + } + + @WithSpan + public abstract Future produceFailedMetadataChangeProposalAsync( + @Nonnull OperationContext opContext, + @Nonnull MetadataChangeProposal mcp, + @Nonnull Set throwables); + /** * Produces a generic platform "event". * @@ -50,14 +122,16 @@ Future produceMetadataChangeProposal( * system event. * @return A {@link Future} object that reports when the message has been produced. */ - Future producePlatformEvent( + public abstract Future producePlatformEvent( @Nonnull String name, @Nullable String key, @Nonnull PlatformEvent payload); + public abstract String getPlatformEventTopicName(); + /** * Creates an entry on the history log of when the indices were last rebuilt with the latest * configuration. 
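The producer paths added above repeatedly collect the returned Futures and call get() on each so that send failures surface to the caller. A standalone sketch of that join-and-rethrow idiom follows; restoring the interrupt status is an extra precaution taken in this sketch, not something the hunk itself does:

import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

final class ProducerFutures {
  private ProducerFutures() {}

  // Wait for every future; wrap checked failures in an unchecked exception,
  // mirroring the forEach/get blocks used throughout this patch.
  static void joinAll(List<? extends Future<?>> futures) {
    for (Future<?> f : futures) {
      try {
        f.get();
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // sketch-only addition: preserve interrupt status
        throw new RuntimeException(e);
      } catch (ExecutionException e) {
        throw new RuntimeException(e);
      }
    }
  }
}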
* * @param event the history event to send to the DataHub Upgrade history topic */ - void produceDataHubUpgradeHistoryEvent(@Nonnull DataHubUpgradeHistoryEvent event); + public abstract void produceDataHubUpgradeHistoryEvent(@Nonnull DataHubUpgradeHistoryEvent event); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java index a801cab81c952f..30a4a1d878995c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java @@ -3,7 +3,6 @@ import static com.linkedin.metadata.aspect.models.graph.Edge.*; import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.*; -import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; @@ -36,7 +35,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -135,7 +134,10 @@ private static void addFilterToQueryBuilder( } private SearchResponse executeLineageSearchQuery( - @Nonnull final QueryBuilder query, final int offset, final int count) { + @Nonnull OperationContext opContext, + @Nonnull final QueryBuilder query, + final int offset, + final int count) { SearchRequest searchRequest = new SearchRequest(); SearchSourceBuilder searchSourceBuilder = sharedSourceBuilder(query, offset, count); @@ -144,13 +146,19 @@ private SearchResponse executeLineageSearchQuery( searchRequest.indices(indexConvention.getIndexName(INDEX_NAME)); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esQuery").time()) { - MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); - return client.search(searchRequest, RequestOptions.DEFAULT); - } catch (Exception e) { - log.error("Search query failed", e); - throw new ESQueryException("Search query failed:", e); - } + return opContext.withSpan( + "esQuery", + () -> { + try { + MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); + return client.search(searchRequest, RequestOptions.DEFAULT); + } catch (Exception e) { + log.error("Search query failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esQuery")); } private SearchSourceBuilder sharedSourceBuilder( @@ -168,6 +176,7 @@ private SearchSourceBuilder sharedSourceBuilder( } private SearchResponse executeGroupByLineageSearchQuery( + @Nonnull final OperationContext opContext, @Nonnull final QueryBuilder query, final int offset, final int count, @@ -232,14 +241,19 @@ private SearchResponse executeGroupByLineageSearchQuery( searchRequest.source(searchSourceBuilder); searchRequest.indices(indexConvention.getIndexName(INDEX_NAME)); - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "esLineageGroupByQuery").time()) { - MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); - return client.search(searchRequest, RequestOptions.DEFAULT); - } catch (Exception e) { - log.error("Search query failed", e); - 
throw new ESQueryException("Search query failed:", e); - } + return opContext.withSpan( + "esLineageGroupByQuery", + () -> { + try { + MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); + return client.search(searchRequest, RequestOptions.DEFAULT); + } catch (Exception e) { + log.error("Search query failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esLineageGroupByQuery")); } private static BoolQueryBuilder getAggregationFilter( @@ -289,7 +303,7 @@ public SearchResponse getSearchResponse( relationshipTypes, relationshipFilter); - return executeLineageSearchQuery(finalQuery, offset, count); + return executeLineageSearchQuery(opContext, finalQuery, offset, count); } public static BoolQueryBuilder buildQuery( @@ -664,7 +678,7 @@ private List getLineageRelationships( if (lineageFlags != null && lineageFlags.getEntitiesExploredPerHopLimit() != null) { response = executeGroupByLineageSearchQuery( - finalQuery, 0, lineageFlags.getEntitiesExploredPerHopLimit(), validEdges); + opContext, finalQuery, 0, lineageFlags.getEntitiesExploredPerHopLimit(), validEdges); return extractRelationshipsGroupByQuery( entityUrnSet, response, @@ -676,7 +690,9 @@ private List getLineageRelationships( existingPaths, exploreMultiplePaths); } else { - response = executeLineageSearchQuery(finalQuery, 0, graphQueryConfiguration.getMaxResult()); + response = + executeLineageSearchQuery( + opContext, finalQuery, 0, graphQueryConfiguration.getMaxResult()); return extractRelationships( entityUrnSet, response, @@ -1378,10 +1394,11 @@ public SearchResponse getSearchResponse( relationshipTypes, relationshipFilter); - return executeScrollSearchQuery(finalQuery, sortCriteria, scrollId, count); + return executeScrollSearchQuery(opContext, finalQuery, sortCriteria, scrollId, count); } private SearchResponse executeScrollSearchQuery( + @Nonnull final OperationContext opContext, @Nonnull final QueryBuilder query, @Nonnull List sortCriteria, @Nullable String scrollId, @@ -1405,13 +1422,19 @@ private SearchResponse executeScrollSearchQuery( searchRequest.indices(indexConvention.getIndexName(INDEX_NAME)); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esQuery").time()) { - MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); - return client.search(searchRequest, RequestOptions.DEFAULT); - } catch (Exception e) { - log.error("Search query failed", e); - throw new ESQueryException("Search query failed:", e); - } + return opContext.withSpan( + "esQuery", + () -> { + try { + MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); + return client.search(searchRequest, RequestOptions.DEFAULT); + } catch (Exception e) { + log.error("Search query failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esQuery")); } private static void applyExcludeSoftDelete( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index 1068fae9478e1b..6fd741d30062c5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -39,7 +39,7 @@ import com.linkedin.structured.StructuredPropertyDefinition; import 
com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java index ef748ebd232789..42f241186d2a34 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.graph.neo4j; -import com.codahale.metrics.Timer; import com.datahub.util.Statement; import com.datahub.util.exception.RetryLimitReached; import com.google.common.annotations.VisibleForTesting; @@ -66,26 +65,32 @@ public class Neo4jGraphService implements GraphService { private static final int MAX_TRANSACTION_RETRY = 3; - private final LineageRegistry _lineageRegistry; - private final Driver _driver; - private SessionConfig _sessionConfig; + private final LineageRegistry lineageRegistry; + private final Driver driver; + private final OperationContext systemOperationContext; + private SessionConfig sessionConfig; - public Neo4jGraphService(@Nonnull LineageRegistry lineageRegistry, @Nonnull Driver driver) { - this(lineageRegistry, driver, SessionConfig.defaultConfig()); + public Neo4jGraphService( + @Nonnull final OperationContext systemOperationContext, + @Nonnull LineageRegistry lineageRegistry, + @Nonnull Driver driver) { + this(systemOperationContext, lineageRegistry, driver, SessionConfig.defaultConfig()); } public Neo4jGraphService( + @Nonnull final OperationContext systemOperationContext, @Nonnull LineageRegistry lineageRegistry, @Nonnull Driver driver, @Nonnull SessionConfig sessionConfig) { - this._lineageRegistry = lineageRegistry; - this._driver = driver; - this._sessionConfig = sessionConfig; + this.systemOperationContext = systemOperationContext; + this.lineageRegistry = lineageRegistry; + this.driver = driver; + this.sessionConfig = sessionConfig; } @Override public LineageRegistry getLineageRegistry() { - return _lineageRegistry; + return lineageRegistry; } @Override @@ -329,7 +334,7 @@ private String getPathFindingRelationshipFilter( final var filterComponents = new HashSet(); for (final var entityName : entityNames) { if (direction != null) { - for (final var edgeInfo : _lineageRegistry.getLineageRelationships(entityName, direction)) { + for (final var edgeInfo : lineageRegistry.getLineageRelationships(entityName, direction)) { final var type = edgeInfo.getType(); if (edgeInfo.getDirection() == RelationshipDirection.INCOMING) { filterComponents.add("<" + type); @@ -342,7 +347,7 @@ private String getPathFindingRelationshipFilter( for (final var direction1 : List.of(LineageDirection.UPSTREAM, LineageDirection.DOWNSTREAM)) { for (final var edgeInfo : - _lineageRegistry.getLineageRelationships(entityName, direction1)) { + lineageRegistry.getLineageRelationships(entityName, direction1)) { filterComponents.add(edgeInfo.getType()); } } @@ -736,7 +741,7 @@ private ExecutionResult executeStatements(@Nonnull List statements) { final StopWatch stopWatch = new StopWatch(); stopWatch.start(); int retry = 0; - try (final Session session = _driver.session(_sessionConfig)) { + try (final Session session = driver.session(sessionConfig)) { for (retry = 0; retry <= 
MAX_TRANSACTION_RETRY; retry++) { try { session.executeWrite( @@ -773,9 +778,11 @@ private ExecutionResult executeStatements(@Nonnull List statements) { @Nonnull private Result runQuery(@Nonnull Statement statement) { log.debug(String.format("Running Neo4j query %s", statement.toString())); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "runQuery").time()) { - return _driver.session(_sessionConfig).run(statement.getCommandText(), statement.getParams()); - } + return systemOperationContext.withSpan( + "runQuery", + () -> driver.session(sessionConfig).run(statement.getCommandText(), statement.getParams()), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "runQuery")); } // Returns "key:value" String, if value is not primitive, then use toString() and double quote it diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java index fea3fafdc845ad..565de23aafb92c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.recommendation.candidatesource; -import com.codahale.metrics.Timer; import com.datahub.authorization.config.ViewAuthorizationConfiguration; import com.datahub.util.exception.ESQueryException; import com.google.common.collect.ImmutableSet; @@ -17,7 +16,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.io.IOException; import java.util.List; import java.util.Optional; @@ -105,22 +104,29 @@ public List getRecommendations( @Nonnull RecommendationRequestContext requestContext, @Nullable Filter filter) { SearchRequest searchRequest = buildSearchRequest(opContext); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getMostPopular").time()) { - final SearchResponse searchResponse = - _searchClient.search(searchRequest, RequestOptions.DEFAULT); - // extract results - ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); - List bucketUrns = - parsedTerms.getBuckets().stream() - .map(MultiBucketsAggregation.Bucket::getKeyAsString) - .collect(Collectors.toList()); - return buildContent(opContext, bucketUrns, _entityService) - .limit(MAX_CONTENT) - .collect(Collectors.toList()); - } catch (Exception e) { - log.error("Search query to get most popular entities failed", e); - throw new ESQueryException("Search query failed:", e); - } + + return opContext.withSpan( + "getMostPopular", + () -> { + try { + final SearchResponse searchResponse = + _searchClient.search(searchRequest, RequestOptions.DEFAULT); + // extract results + ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); + List bucketUrns = + parsedTerms.getBuckets().stream() + .map(MultiBucketsAggregation.Bucket::getKeyAsString) + .collect(Collectors.toList()); + return buildContent(opContext, bucketUrns, _entityService) + .limit(MAX_CONTENT) + .collect(Collectors.toList()); + } catch (Exception e) { + log.error("Search query to get most popular entities failed", e); + throw new ESQueryException("Search 
query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getMostPopular")); } @Override diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java index afdce0d7145133..e03293c11b36c2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyEditedSource.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.recommendation.candidatesource; -import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; import com.google.common.collect.ImmutableSet; import com.linkedin.common.urn.Urn; @@ -18,7 +17,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.io.IOException; import java.util.List; import java.util.Set; @@ -107,22 +106,29 @@ public List getRecommendations( SearchRequest searchRequest = buildSearchRequest( opContext.getSessionActorContext().getActorUrn(), opContext.getAspectRetriever()); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getRecentlyEdited").time()) { - final SearchResponse searchResponse = - _searchClient.search(searchRequest, RequestOptions.DEFAULT); - // extract results - ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); - List bucketUrns = - parsedTerms.getBuckets().stream() - .map(MultiBucketsAggregation.Bucket::getKeyAsString) - .collect(Collectors.toList()); - return buildContent(opContext, bucketUrns, _entityService) - .limit(MAX_CONTENT) - .collect(Collectors.toList()); - } catch (Exception e) { - log.error("Search query to get most recently edited entities failed", e); - throw new ESQueryException("Search query failed:", e); - } + + return opContext.withSpan( + "getRecentlyEdited", + () -> { + try { + final SearchResponse searchResponse = + _searchClient.search(searchRequest, RequestOptions.DEFAULT); + // extract results + ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); + List bucketUrns = + parsedTerms.getBuckets().stream() + .map(MultiBucketsAggregation.Bucket::getKeyAsString) + .collect(Collectors.toList()); + return buildContent(opContext, bucketUrns, _entityService) + .limit(MAX_CONTENT) + .collect(Collectors.toList()); + } catch (Exception e) { + log.error("Search query to get most recently edited entities failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getRecentlyEdited")); } @Override diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java index f282470193ae5e..ea3a80c5f60381 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.recommendation.candidatesource; 
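The recommendation sources in this file and its neighbors all apply the same mechanical conversion: a Dropwizard Timer.Context try-with-resources block becomes a lambda handed to opContext.withSpan, with the old metric name passed as a MetricUtils.DROPWIZARD_NAME attribute so the legacy timer can still be derived. The actual withSpan implementation lives in TraceContext/OperationContext and is not shown in this hunk; the following is only a plausible standalone reading of its contract, built directly on the OpenTelemetry API, not the patch's own code:

import io.opentelemetry.api.GlobalOpenTelemetry;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.context.Scope;
import java.util.function.Supplier;

final class SpanHelper {
  private static final Tracer TRACER = GlobalOpenTelemetry.getTracer("datahub-sketch");

  // Hypothetical analogue of opContext.withSpan(name, supplier, key1, value1, ...):
  // run the supplier inside a span, attach the key/value pairs as attributes,
  // and record failures before rethrowing.
  static <T> T withSpan(String name, Supplier<T> work, String... attributes) {
    Span span = TRACER.spanBuilder(name).startSpan();
    for (int i = 0; i + 1 < attributes.length; i += 2) {
      span.setAttribute(attributes[i], attributes[i + 1]);
    }
    try (Scope ignored = span.makeCurrent()) {
      return work.get();
    } catch (RuntimeException e) {
      span.recordException(e);
      span.setStatus(StatusCode.ERROR);
      throw e;
    } finally {
      span.end();
    }
  }
}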
-import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; import com.google.common.collect.ImmutableSet; import com.linkedin.common.urn.Urn; @@ -18,7 +17,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.io.IOException; import java.util.List; import java.util.Set; @@ -107,22 +106,29 @@ public List getRecommendations( SearchRequest searchRequest = buildSearchRequest( opContext.getSessionActorContext().getActorUrn(), opContext.getAspectRetriever()); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getRecentlyViewed").time()) { - final SearchResponse searchResponse = - _searchClient.search(searchRequest, RequestOptions.DEFAULT); - // extract results - ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); - List bucketUrns = - parsedTerms.getBuckets().stream() - .map(MultiBucketsAggregation.Bucket::getKeyAsString) - .collect(Collectors.toList()); - return buildContent(opContext, bucketUrns, _entityService) - .limit(MAX_CONTENT) - .collect(Collectors.toList()); - } catch (Exception e) { - log.error("Search query to get most recently viewed entities failed", e); - throw new ESQueryException("Search query failed:", e); - } + + return opContext.withSpan( + "getRecentlyViewed", + () -> { + try { + final SearchResponse searchResponse = + _searchClient.search(searchRequest, RequestOptions.DEFAULT); + // extract results + ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); + List bucketUrns = + parsedTerms.getBuckets().stream() + .map(MultiBucketsAggregation.Bucket::getKeyAsString) + .collect(Collectors.toList()); + return buildContent(opContext, bucketUrns, _entityService) + .limit(MAX_CONTENT) + .collect(Collectors.toList()); + } catch (Exception e) { + log.error("Search query to get most recently viewed entities failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getRecentlyViewed")); } @Override diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java index 67ebdf8882b80c..f77b5097db80c9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java @@ -33,7 +33,7 @@ import com.linkedin.metadata.search.utils.FilterUtils; import com.linkedin.metadata.search.utils.SearchUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Collections; import java.util.HashMap; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java index ecded1bb9c3846..731517ba3290f1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java @@ -2,7 +2,6 @@ import static com.linkedin.metadata.utils.SearchUtil.*; -import 
com.codahale.metrics.Timer; import com.linkedin.data.template.LongMap; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; @@ -215,20 +214,18 @@ public SearchResult searchAcrossEntities( */ public List getEntitiesToSearch( @Nonnull OperationContext opContext, @Nonnull Collection inputEntities, int size) { - List nonEmptyEntities; List lowercaseEntities = inputEntities.stream().map(String::toLowerCase).collect(Collectors.toList()); if (lowercaseEntities.isEmpty()) { - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "getNonEmptyEntities").time()) { - nonEmptyEntities = _entityDocCountCache.getNonEmptyEntities(opContext); - } - } else { - nonEmptyEntities = lowercaseEntities; + return opContext.withSpan( + "getNonEmptyEntities", + () -> _entityDocCountCache.getNonEmptyEntities(opContext), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getNonEmptyEntities")); } - return nonEmptyEntities; + return lowercaseEntities; } /** diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/cache/CacheableSearcher.java b/metadata-io/src/main/java/com/linkedin/metadata/search/cache/CacheableSearcher.java index 28efa29c9fffa2..28a308c7f16d23 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/cache/CacheableSearcher.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/cache/CacheableSearcher.java @@ -1,13 +1,14 @@ package com.linkedin.metadata.search.cache; import static com.datahub.util.RecordUtils.*; +import static com.linkedin.metadata.utils.metrics.MetricUtils.CACHE_HIT_ATTR; -import com.codahale.metrics.Timer; import com.linkedin.metadata.search.SearchEntity; import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.api.trace.Span; import java.io.Serializable; import java.util.ArrayList; import java.util.List; @@ -42,41 +43,46 @@ public static class QueryPagination implements Serializable { * corresponds to) */ public SearchResult getSearchResults(@Nonnull OperationContext opContext, int from, int size) { - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getSearchResults").time()) { - int resultsSoFar = 0; - int batchId = 0; - boolean foundStart = false; - List resultEntities = new ArrayList<>(); - SearchResult batchedResult; - // Use do-while to make sure we run at least one batch to fetch metadata - do { - batchedResult = getBatch(opContext, batchId); - int currentBatchSize = batchedResult.getEntities().size(); - // If the number of results in this batch is 0, no need to continue - if (currentBatchSize == 0) { - break; - } - if (resultsSoFar + currentBatchSize > from) { - int startInBatch = foundStart ? 0 : from - resultsSoFar; - int endInBatch = Math.min(currentBatchSize, startInBatch + size - resultEntities.size()); - resultEntities.addAll(batchedResult.getEntities().subList(startInBatch, endInBatch)); - foundStart = true; - } - // If current batch is smaller than the requested batch size, the next batch will return - // empty. 
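The getSearchResults windowing logic (unchanged here apart from the new span wrapper) is easiest to follow with concrete numbers. A worked example, assuming batchSize = 100 and a caller asking for from = 150, size = 30:

  batch 0: 100 hits; resultsSoFar 0 + 100 <= 150, so nothing is copied; resultsSoFar -> 100
  batch 1: 100 hits; 100 + 100 > 150, so startInBatch = 150 - 100 = 50,
           endInBatch = min(100, 50 + 30 - 0) = 80; entities [50, 80) are copied
  resultsSoFar -> 200 >= from + size = 180, so the loop exits

Two backend batches are fetched and exactly the requested 30-entity window is returned.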
- if (currentBatchSize < batchSize) { - break; - } - resultsSoFar += currentBatchSize; - batchId++; - } while (resultsSoFar < from + size); - return new SearchResult() - .setEntities(new SearchEntityArray(resultEntities)) - .setMetadata(batchedResult.getMetadata()) - .setFrom(from) - .setPageSize(size) - .setNumEntities(batchedResult.getNumEntities()); - } + return opContext.withSpan( + "getSearchResults", + () -> { + int resultsSoFar = 0; + int batchId = 0; + boolean foundStart = false; + List resultEntities = new ArrayList<>(); + SearchResult batchedResult; + // Use do-while to make sure we run at least one batch to fetch metadata + do { + batchedResult = getBatch(opContext, batchId); + int currentBatchSize = batchedResult.getEntities().size(); + // If the number of results in this batch is 0, no need to continue + if (currentBatchSize == 0) { + break; + } + if (resultsSoFar + currentBatchSize > from) { + int startInBatch = foundStart ? 0 : from - resultsSoFar; + int endInBatch = + Math.min(currentBatchSize, startInBatch + size - resultEntities.size()); + resultEntities.addAll(batchedResult.getEntities().subList(startInBatch, endInBatch)); + foundStart = true; + } + // If current batch is smaller than the requested batch size, the next batch will return + // empty. + if (currentBatchSize < batchSize) { + break; + } + resultsSoFar += currentBatchSize; + batchId++; + } while (resultsSoFar < from + size); + return new SearchResult() + .setEntities(new SearchEntityArray(resultEntities)) + .setMetadata(batchedResult.getMetadata()) + .setFrom(from) + .setPageSize(size) + .setNumEntities(batchedResult.getNumEntities()); + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getSearchResults")); } private QueryPagination getBatchQuerySize(int batchId) { @@ -84,37 +90,41 @@ private QueryPagination getBatchQuerySize(int batchId) { } private SearchResult getBatch(@Nonnull OperationContext opContext, int batchId) { - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getBatch").time()) { - QueryPagination batch = getBatchQuerySize(batchId); - SearchResult result; - if (enableCache) { - K cacheKey = cacheKeyGenerator.apply(batch); - if ((opContext.getSearchContext().getSearchFlags().isSkipCache() == null - || !opContext.getSearchContext().getSearchFlags().isSkipCache())) { - try (Timer.Context ignored2 = - MetricUtils.timer(this.getClass(), "getBatch_cache").time()) { - Timer.Context cacheAccess = - MetricUtils.timer(this.getClass(), "getBatch_cache_access").time(); - String json = cache.get(cacheKey, String.class); - result = json != null ? toRecordTemplate(SearchResult.class, json) : null; - cacheAccess.stop(); - if (result == null) { - Timer.Context cacheMiss = - MetricUtils.timer(this.getClass(), "getBatch_cache_miss").time(); + + return opContext.withSpan( + "getBatch", + () -> { + QueryPagination batch = getBatchQuerySize(batchId); + SearchResult result; + if (enableCache) { + K cacheKey = cacheKeyGenerator.apply(batch); + if ((opContext.getSearchContext().getSearchFlags().isSkipCache() == null + || !opContext.getSearchContext().getSearchFlags().isSkipCache())) { + + String json = cache.get(cacheKey, String.class); + result = json != null ? 
toRecordTemplate(SearchResult.class, json) : null; + + if (result == null) { + Span.current().setAttribute(CACHE_HIT_ATTR, false); + result = searcher.apply(batch); + cache.put(cacheKey, toJsonString(result)); + MetricUtils.counter(this.getClass(), "getBatch_cache_miss_count").inc(); + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, true); + } + + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, false); result = searcher.apply(batch); cache.put(cacheKey, toJsonString(result)); - cacheMiss.stop(); - MetricUtils.counter(this.getClass(), "getBatch_cache_miss_count").inc(); } + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, false); + result = searcher.apply(batch); } - } else { - result = searcher.apply(batch); - cache.put(cacheKey, toJsonString(result)); - } - } else { - result = searcher.apply(batch); - } - return result; - } + return result; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getBatch")); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/cache/EntityDocCountCache.java b/metadata-io/src/main/java/com/linkedin/metadata/search/cache/EntityDocCountCache.java index 1efaeb2b12f45f..7cea787583b54d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/cache/EntityDocCountCache.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/cache/EntityDocCountCache.java @@ -7,7 +7,7 @@ import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.utils.ConcurrencyUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java index cb062e0e3f4483..7272809bb1221a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java @@ -2,8 +2,8 @@ import static com.datahub.util.RecordUtils.toJsonString; import static com.datahub.util.RecordUtils.toRecordTemplate; +import static com.linkedin.metadata.utils.metrics.MetricUtils.CACHE_HIT_ATTR; -import com.codahale.metrics.Timer; import com.linkedin.metadata.browse.BrowseResult; import com.linkedin.metadata.query.AutoCompleteResult; import com.linkedin.metadata.query.SearchFlags; @@ -15,6 +15,7 @@ import com.linkedin.metadata.search.cache.CacheableSearcher; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.api.trace.Span; import java.util.List; import java.util.Optional; import javax.annotation.Nonnull; @@ -184,40 +185,42 @@ public AutoCompleteResult getCachedAutoCompleteResults( @Nullable String field, @Nullable Filter filters, int limit) { - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "getCachedAutoCompleteResults").time()) { - Cache cache = cacheManager.getCache(ENTITY_SEARCH_SERVICE_AUTOCOMPLETE_CACHE_NAME); - AutoCompleteResult result; - if (enableCache(opContext.getSearchContext().getSearchFlags())) { - try (Timer.Context ignored2 = - MetricUtils.timer(this.getClass(), "getCachedAutoCompleteResults_cache").time()) { - Timer.Context cacheAccess = - 
MetricUtils.timer(this.getClass(), "autocomplete_cache_access").time(); - Object cacheKey = - Sextet.with( - opContext.getSearchContextId(), - entityName, - input, - field, - filters != null ? toJsonString(filters) : null, - limit); - String json = cache.get(cacheKey, String.class); - result = json != null ? toRecordTemplate(AutoCompleteResult.class, json) : null; - cacheAccess.stop(); - if (result == null) { - Timer.Context cacheMiss = - MetricUtils.timer(this.getClass(), "autocomplete_cache_miss").time(); + + return opContext.withSpan( + "getAutoCompleteResults", + () -> { + Cache cache = cacheManager.getCache(ENTITY_SEARCH_SERVICE_AUTOCOMPLETE_CACHE_NAME); + AutoCompleteResult result; + if (enableCache(opContext.getSearchContext().getSearchFlags())) { + + Object cacheKey = + Sextet.with( + opContext.getSearchContextId(), + entityName, + input, + field, + filters != null ? toJsonString(filters) : null, + limit); + String json = cache.get(cacheKey, String.class); + result = json != null ? toRecordTemplate(AutoCompleteResult.class, json) : null; + + if (result == null) { + result = + getRawAutoCompleteResults(opContext, entityName, input, field, filters, limit); + cache.put(cacheKey, toJsonString(result)); + Span.current().setAttribute(CACHE_HIT_ATTR, false); + MetricUtils.counter(this.getClass(), "autocomplete_cache_miss_count").inc(); + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, true); + } + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, false); result = getRawAutoCompleteResults(opContext, entityName, input, field, filters, limit); - cache.put(cacheKey, toJsonString(result)); - cacheMiss.stop(); - MetricUtils.counter(this.getClass(), "autocomplete_cache_miss_count").inc(); } - } - } else { - result = getRawAutoCompleteResults(opContext, entityName, input, field, filters, limit); - } - return result; - } + return result; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getCachedAutoCompleteResults")); } /** Returns cached browse results. */ @@ -228,40 +231,40 @@ public BrowseResult getCachedBrowseResults( @Nullable Filter filters, int from, int size) { - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "getCachedBrowseResults").time()) { - Cache cache = cacheManager.getCache(ENTITY_SEARCH_SERVICE_BROWSE_CACHE_NAME); - BrowseResult result; - if (enableCache(opContext.getSearchContext().getSearchFlags())) { - try (Timer.Context ignored2 = - MetricUtils.timer(this.getClass(), "getCachedBrowseResults_cache").time()) { - Timer.Context cacheAccess = - MetricUtils.timer(this.getClass(), "browse_cache_access").time(); - Object cacheKey = - Sextet.with( - opContext.getSearchContextId(), - entityName, - path, - filters != null ? toJsonString(filters) : null, - from, - size); - String json = cache.get(cacheKey, String.class); - result = json != null ? toRecordTemplate(BrowseResult.class, json) : null; - cacheAccess.stop(); - if (result == null) { - Timer.Context cacheMiss = - MetricUtils.timer(this.getClass(), "browse_cache_miss").time(); + + return opContext.withSpan( + "getBrowseResults", + () -> { + Cache cache = cacheManager.getCache(ENTITY_SEARCH_SERVICE_BROWSE_CACHE_NAME); + BrowseResult result; + if (enableCache(opContext.getSearchContext().getSearchFlags())) { + Object cacheKey = + Sextet.with( + opContext.getSearchContextId(), + entityName, + path, + filters != null ? toJsonString(filters) : null, + from, + size); + String json = cache.get(cacheKey, String.class); + result = json != null ? 
toRecordTemplate(BrowseResult.class, json) : null; + + if (result == null) { + result = getRawBrowseResults(opContext, entityName, path, filters, from, size); + cache.put(cacheKey, toJsonString(result)); + Span.current().setAttribute(CACHE_HIT_ATTR, false); + MetricUtils.counter(this.getClass(), "browse_cache_miss_count").inc(); + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, true); + } + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, false); result = getRawBrowseResults(opContext, entityName, path, filters, from, size); - cache.put(cacheKey, toJsonString(result)); - cacheMiss.stop(); - MetricUtils.counter(this.getClass(), "browse_cache_miss_count").inc(); } - } - } else { - result = getRawBrowseResults(opContext, entityName, path, filters, from, size); - } - return result; - } + return result; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getCachedBrowseResults")); } /** Returns cached scroll results. */ @@ -274,62 +277,67 @@ public ScrollResult getCachedScrollResults( @Nullable String scrollId, @Nullable String keepAlive, int size) { - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "getCachedScrollResults").time()) { - boolean isFullText = - Boolean.TRUE.equals( - Optional.ofNullable(opContext.getSearchContext().getSearchFlags()) - .orElse(new SearchFlags()) - .isFulltext()); - Cache cache = cacheManager.getCache(ENTITY_SEARCH_SERVICE_SCROLL_CACHE_NAME); - ScrollResult result; - if (enableCache(opContext.getSearchContext().getSearchFlags())) { - Timer.Context cacheAccess = - MetricUtils.timer(this.getClass(), "scroll_cache_access").time(); - Object cacheKey = - Septet.with( - opContext.getSearchContextId(), - entities, - query, - filters != null ? toJsonString(filters) : null, - CollectionUtils.isNotEmpty(sortCriteria) ? toJsonString(sortCriteria) : null, - scrollId, - size); - String json = cache.get(cacheKey, String.class); - result = json != null ? toRecordTemplate(ScrollResult.class, json) : null; - cacheAccess.stop(); - if (result == null) { - Timer.Context cacheMiss = MetricUtils.timer(this.getClass(), "scroll_cache_miss").time(); - result = - getRawScrollResults( - opContext, - entities, - query, - filters, - sortCriteria, - scrollId, - keepAlive, - size, - isFullText); - cache.put(cacheKey, toJsonString(result)); - cacheMiss.stop(); - MetricUtils.counter(this.getClass(), "scroll_cache_miss_count").inc(); - } - } else { - result = - getRawScrollResults( - opContext, - entities, - query, - filters, - sortCriteria, - scrollId, - keepAlive, - size, - isFullText); - } - return result; - } + + return opContext.withSpan( + "getScrollResults", + () -> { + boolean isFullText = + Boolean.TRUE.equals( + Optional.ofNullable(opContext.getSearchContext().getSearchFlags()) + .orElse(new SearchFlags()) + .isFulltext()); + Cache cache = cacheManager.getCache(ENTITY_SEARCH_SERVICE_SCROLL_CACHE_NAME); + ScrollResult result; + if (enableCache(opContext.getSearchContext().getSearchFlags())) { + + Object cacheKey = + Septet.with( + opContext.getSearchContextId(), + entities, + query, + filters != null ? toJsonString(filters) : null, + CollectionUtils.isNotEmpty(sortCriteria) ? toJsonString(sortCriteria) : null, + scrollId, + size); + String json = cache.get(cacheKey, String.class); + result = json != null ? 
toRecordTemplate(ScrollResult.class, json) : null; + + if (result == null) { + result = + getRawScrollResults( + opContext, + entities, + query, + filters, + sortCriteria, + scrollId, + keepAlive, + size, + isFullText); + cache.put(cacheKey, toJsonString(result)); + Span.current().setAttribute(CACHE_HIT_ATTR, false); + MetricUtils.counter(this.getClass(), "scroll_cache_miss_count").inc(); + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, true); + } + } else { + Span.current().setAttribute(CACHE_HIT_ATTR, false); + result = + getRawScrollResults( + opContext, + entities, + query, + filters, + sortCriteria, + scrollId, + keepAlive, + size, + isFullText); + } + return result; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getCachedScrollResults")); } /** Executes the expensive search query using the {@link EntitySearchService} */ diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java index 35f133cc794f2a..3a84d1cb2de489 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java @@ -3,7 +3,6 @@ import static com.linkedin.metadata.search.utils.ESUtils.applyDefaultSearchFilters; import static com.linkedin.metadata.search.utils.SearchUtils.applyDefaultSearchFlags; -import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; import com.google.common.annotations.VisibleForTesting; import com.linkedin.common.urn.Urn; @@ -30,6 +29,7 @@ import com.linkedin.metadata.utils.SearchUtil; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; +import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; @@ -139,13 +139,21 @@ public BrowseResult browse( .getIndexConvention() .getIndexName(opContext.getEntityRegistry().getEntitySpec(entityName)); - final SearchResponse groupsResponse; - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esGroupSearch").time()) { - groupsResponse = - client.search( - constructGroupsSearchRequest(finalOpContext, indexName, path, requestMap), - RequestOptions.DEFAULT); - } + final SearchResponse groupsResponse = + opContext.withSpan( + "esGroupSearch", + () -> { + try { + return client.search( + constructGroupsSearchRequest(finalOpContext, indexName, path, requestMap), + RequestOptions.DEFAULT); + } catch (IOException e) { + throw new RuntimeException(e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esGroupSearch")); + final BrowseGroupsResult browseGroupsResult = extractGroupsResponse(groupsResponse, path, from, size); final int numGroups = browseGroupsResult.getTotalGroups(); @@ -156,14 +164,22 @@ public BrowseResult browse( // if numGroups <= from, we should only return entities int entityFrom = Math.max(from - numGroups, 0); int entitySize = Math.min(Math.max(from + size - numGroups, 0), size); - final SearchResponse entitiesResponse; - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esEntitiesSearch").time()) { - entitiesResponse = - client.search( - constructEntitiesSearchRequest( - finalOpContext, indexName, path, requestMap, entityFrom, entitySize), - RequestOptions.DEFAULT); - } + final SearchResponse entitiesResponse = + opContext.withSpan( + 
"esEntitiesSearch", + () -> { + try { + return client.search( + constructEntitiesSearchRequest( + finalOpContext, indexName, path, requestMap, entityFrom, entitySize), + RequestOptions.DEFAULT); + } catch (IOException e) { + throw new RuntimeException(e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esEntitiesSearch")); + final int numEntities = (int) entitiesResponse.getHits().getTotalHits().value; final List browseResultEntityList = extractEntitiesResponse(entitiesResponse, path); @@ -441,19 +457,25 @@ public BrowseResultV2 browseV2( int start, int count) { try { - final SearchResponse groupsResponse; final OperationContext finalOpContext = opContext.withSearchFlags( flags -> applyDefaultSearchFlags(flags, path, DEFAULT_BROWSE_SEARCH_FLAGS)); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esGroupSearch").time()) { - final String finalInput = input.isEmpty() ? "*" : input; - groupsResponse = - client.search( - constructGroupsSearchRequestV2( - finalOpContext, entityName, path, filter, finalInput), - RequestOptions.DEFAULT); - } + final SearchResponse groupsResponse = + opContext.withSpan( + "esGroupSearch", + () -> { + try { + return client.search( + constructGroupsSearchRequestV2( + finalOpContext, entityName, path, filter, input.isEmpty() ? "*" : input), + RequestOptions.DEFAULT); + } catch (IOException e) { + throw new RuntimeException(e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esGroupSearch")); final BrowseGroupsResultV2 browseGroupsResult = extractGroupsResponseV2(groupsResponse, path, start, count); @@ -483,19 +505,25 @@ public BrowseResultV2 browseV2( int start, int count) { try { - final SearchResponse groupsResponse; final OperationContext finalOpContext = opContext.withSearchFlags( flags -> applyDefaultSearchFlags(flags, path, DEFAULT_BROWSE_SEARCH_FLAGS)); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esGroupSearch").time()) { - final String finalInput = input.isEmpty() ? "*" : input; - groupsResponse = - client.search( - constructGroupsSearchRequestBrowseAcrossEntities( - finalOpContext, entities, path, filter, finalInput), - RequestOptions.DEFAULT); - } + final SearchResponse groupsResponse = + opContext.withSpan( + "esGroupSearch", + () -> { + try { + return client.search( + constructGroupsSearchRequestBrowseAcrossEntities( + finalOpContext, entities, path, filter, input.isEmpty() ? 
"*" : input), + RequestOptions.DEFAULT); + } catch (IOException e) { + throw new RuntimeException(e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "esGroupSearch")); final BrowseGroupsResultV2 browseGroupsResult = extractGroupsResponseV2(groupsResponse, path, start, count); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index 2d7db075e676ff..a720e0bf815c26 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -4,7 +4,6 @@ import static com.linkedin.metadata.aspect.patch.template.TemplateUtil.*; import static com.linkedin.metadata.utils.SearchUtil.*; -import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; @@ -31,7 +30,7 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -114,12 +113,19 @@ public long docCount( filter, entitySpec.getSearchableFieldTypes(), queryFilterRewriteChain)); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "docCount").time()) { - return client.count(countRequest, RequestOptions.DEFAULT).getCount(); - } catch (IOException e) { - log.error("Count query failed:" + e.getMessage()); - throw new ESQueryException("Count query failed:", e); - } + + return opContext.withSpan( + "docCount", + () -> { + try { + return client.count(countRequest, RequestOptions.DEFAULT).getCount(); + } catch (IOException e) { + log.error("Count query failed:" + e.getMessage()); + throw new ESQueryException("Count query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "docCount")); } @Nonnull @@ -132,26 +138,33 @@ private SearchResult executeAndExtract( int from, int size) { long id = System.currentTimeMillis(); - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "executeAndExtract_search").time()) { - log.debug("Executing request {}: {}", id, searchRequest); - final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT); - // extract results, validated against document model as well - return transformIndexIntoEntityName( - opContext.getSearchContext().getIndexConvention(), - SearchRequestHandler.getBuilder( - opContext.getEntityRegistry(), - entitySpec, - searchConfiguration, - customSearchConfiguration, - queryFilterRewriteChain) - .extractResult(opContext, searchResponse, filter, from, size)); - } catch (Exception e) { - log.error("Search query failed", e); - throw new ESQueryException("Search query failed:", e); - } finally { - log.debug("Returning from request {}.", id); - } + + return opContext.withSpan( + "executeAndExtract_search", + () -> { + try { + log.debug("Executing request {}: {}", id, searchRequest); + final SearchResponse searchResponse = + client.search(searchRequest, RequestOptions.DEFAULT); + // extract results, validated against document model as well + return 
transformIndexIntoEntityName( + opContext.getSearchContext().getIndexConvention(), + SearchRequestHandler.getBuilder( + opContext.getEntityRegistry(), + entitySpec, + searchConfiguration, + customSearchConfiguration, + queryFilterRewriteChain) + .extractResult(opContext, searchResponse, filter, from, size)); + } catch (Exception e) { + log.error("Search query failed", e); + throw new ESQueryException("Search query failed:", e); + } finally { + log.debug("Returning from request {}.", id); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "executeAndExtract_search")); } private String transformIndexToken( @@ -234,24 +247,30 @@ private ScrollResult executeAndExtract( @Nullable Filter filter, @Nullable String keepAlive, int size) { - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "executeAndExtract_scroll").time()) { - final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT); - // extract results, validated against document model as well - return transformIndexIntoEntityName( - opContext.getSearchContext().getIndexConvention(), - SearchRequestHandler.getBuilder( - opContext.getEntityRegistry(), - entitySpecs, - searchConfiguration, - customSearchConfiguration, - queryFilterRewriteChain) - .extractScrollResult( - opContext, searchResponse, filter, keepAlive, size, supportsPointInTime())); - } catch (Exception e) { - log.error("Search query failed: {}", searchRequest, e); - throw new ESQueryException("Search query failed:", e); - } + return opContext.withSpan( + "executeAndExtract_scroll", + () -> { + try { + final SearchResponse searchResponse = + client.search(searchRequest, RequestOptions.DEFAULT); + // extract results, validated against document model as well + return transformIndexIntoEntityName( + opContext.getSearchContext().getIndexConvention(), + SearchRequestHandler.getBuilder( + opContext.getEntityRegistry(), + entitySpecs, + searchConfiguration, + customSearchConfiguration, + queryFilterRewriteChain) + .extractScrollResult( + opContext, searchResponse, filter, keepAlive, size, supportsPointInTime())); + } catch (Exception e) { + log.error("Search query failed: {}", searchRequest, e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "executeAndExtract_scroll")); } /** @@ -279,26 +298,32 @@ public SearchResult search( int size, @Nullable List facets) { final String finalInput = input.isEmpty() ? 
"*" : input; - Timer.Context searchRequestTimer = MetricUtils.timer(this.getClass(), "searchRequest").time(); List entitySpecs = entityNames.stream() .map(name -> opContext.getEntityRegistry().getEntitySpec(name)) .collect(Collectors.toList()); IndexConvention indexConvention = opContext.getSearchContext().getIndexConvention(); Filter transformedFilters = transformFilterForEntities(postFilters, indexConvention); + // Step 1: construct the query final SearchRequest searchRequest = - SearchRequestHandler.getBuilder( - opContext.getEntityRegistry(), - entitySpecs, - searchConfiguration, - customSearchConfiguration, - queryFilterRewriteChain) - .getSearchRequest( - opContext, finalInput, transformedFilters, sortCriteria, from, size, facets); - searchRequest.indices( - entityNames.stream().map(indexConvention::getEntityIndexName).toArray(String[]::new)); - searchRequestTimer.stop(); + opContext.withSpan( + "searchRequest", + () -> + SearchRequestHandler.getBuilder( + opContext.getEntityRegistry(), + entitySpecs, + searchConfiguration, + customSearchConfiguration, + queryFilterRewriteChain) + .getSearchRequest( + opContext, finalInput, transformedFilters, sortCriteria, from, size, facets) + .indices( + entityNames.stream() + .map(indexConvention::getEntityIndexName) + .toArray(String[]::new)), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "searchRequest")); if (testLoggingEnabled) { testLog(opContext.getObjectMapper(), searchRequest); @@ -437,15 +462,21 @@ public Map aggregateByValue( searchRequest.indices(stream.toArray(String[]::new)); } - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "aggregateByValue_search").time()) { - final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT); - // extract results, validated against document model as well - return AggregationQueryBuilder.extractAggregationsFromResponse(searchResponse, field); - } catch (Exception e) { - log.error("Aggregation query failed", e); - throw new ESQueryException("Aggregation query failed:", e); - } + return opContext.withSpan( + "aggregateByValue_search", + () -> { + try { + final SearchResponse searchResponse = + client.search(searchRequest, RequestOptions.DEFAULT); + // extract results, validated against document model as well + return AggregationQueryBuilder.extractAggregationsFromResponse(searchResponse, field); + } catch (Exception e) { + log.error("Aggregation query failed", e); + throw new ESQueryException("Aggregation query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "aggregateByValue_search")); } /** @@ -476,33 +507,40 @@ public ScrollResult scroll( IndexConvention indexConvention = opContext.getSearchContext().getIndexConvention(); String[] indexArray = entities.stream().map(indexConvention::getEntityIndexName).toArray(String[]::new); - Timer.Context scrollRequestTimer = MetricUtils.timer(this.getClass(), "scrollRequest").time(); List entitySpecs = entities.stream() .map(name -> opContext.getEntityRegistry().getEntitySpec(name)) .collect(Collectors.toList()); Filter transformedFilters = transformFilterForEntities(postFilters, indexConvention); - // TODO: Align scroll and search using facets - final SearchRequest searchRequest = - getScrollRequest( - opContext, - scrollId, - keepAlive, - indexArray, - size, - transformedFilters, - entitySpecs, - finalInput, - sortCriteria, - null); - // PIT specifies indices in creation so it doesn't support specifying indices on the request, so - // we only specify if 
not using PIT - if (!supportsPointInTime()) { - searchRequest.indices(indexArray); - } - - scrollRequestTimer.stop(); + final SearchRequest searchRequest = + opContext.withSpan( + "scrollRequest", + () -> { + // TODO: Align scroll and search using facets + final SearchRequest req = + getScrollRequest( + opContext, + scrollId, + keepAlive, + indexArray, + size, + transformedFilters, + entitySpecs, + finalInput, + sortCriteria, + null); + + // PIT specifies indices in creation so it doesn't support specifying indices on the + // request, so + // we only specify if not using PIT + if (!supportsPointInTime()) { + req.indices(indexArray); + } + return req; + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "scrollRequest")); if (testLoggingEnabled) { testLog(opContext.getObjectMapper(), searchRequest); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java index 8b83439a3008c1..8ff0759b2e41a5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilder.java @@ -22,7 +22,7 @@ import com.linkedin.metadata.utils.SearchUtil; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.time.OffsetDateTime; import java.time.format.DateTimeParseException; import java.util.ArrayList; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 01a1e9cb159844..0ecf1a932e4bfd 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -43,7 +43,7 @@ import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index 635d4472305c93..7578426100c400 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -131,13 +131,20 @@ public void handleChangeEvent( Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) { MetadataChangeLog hookEvent = mclItem.getMetadataChangeLog(); if (UPDATE_CHANGE_TYPES.contains(hookEvent.getChangeType())) { - handleUpdateChangeEvent(opContext, mclItem); + // non-system metadata + handleUpdateChangeEvent(opContext, mclItem, false); + // graph update + updateGraphIndicesService.handleChangeEvent(opContext, event); + // system metadata is last for tracing + handleUpdateChangeEvent(opContext, 
mclItem, true); } else if (hookEvent.getChangeType() == ChangeType.DELETE) { - handleDeleteChangeEvent(opContext, mclItem); + // non-system metadata + handleDeleteChangeEvent(opContext, mclItem, false); + // graph update + updateGraphIndicesService.handleChangeEvent(opContext, event); + // system metadata is last for tracing + handleDeleteChangeEvent(opContext, mclItem, true); } - - // graph update - updateGraphIndicesService.handleChangeEvent(opContext, event); } } catch (IOException e) { throw new RuntimeException(e); @@ -154,7 +161,8 @@ public void handleChangeEvent( * @param event the change event to be processed. */ private void handleUpdateChangeEvent( - @Nonnull OperationContext opContext, @Nonnull final MCLItem event) throws IOException { + @Nonnull OperationContext opContext, @Nonnull final MCLItem event, boolean forSystemMetadata) + throws IOException { final EntitySpec entitySpec = event.getEntitySpec(); final AspectSpec aspectSpec = event.getAspectSpec(); @@ -163,32 +171,34 @@ private void handleUpdateChangeEvent( RecordTemplate aspect = event.getRecordTemplate(); RecordTemplate previousAspect = event.getPreviousRecordTemplate(); - // Step 0. If the aspect is timeseries, add to its timeseries index. - if (aspectSpec.isTimeseries()) { - updateTimeseriesFields( - opContext, - urn.getEntityType(), - event.getAspectName(), - urn, - aspect, - aspectSpec, - event.getSystemMetadata()); - } else { + if (!forSystemMetadata) { + // Step 0. If the aspect is timeseries, add to its timeseries index. + if (aspectSpec.isTimeseries()) { + updateTimeseriesFields( + opContext, + urn.getEntityType(), + event.getAspectName(), + urn, + aspect, + aspectSpec, + event.getSystemMetadata()); + } + + try { + // Step 1. Handle StructuredProperties Index Mapping changes + updateIndexMappings(urn, entitySpec, aspectSpec, aspect, previousAspect); + } catch (Exception e) { + log.error("Issue with updating index mappings for structured property change", e); + } + + // Step 2. For all aspects, attempt to update Search + updateSearchService(opContext, event); + } else if (forSystemMetadata && !aspectSpec.isTimeseries()) { // Inject into the System Metadata Index when an aspect is non-timeseries only. // TODO: Verify whether timeseries aspects can be dropped into System Metadata as well // without impacting rollbacks. updateSystemMetadata(event.getSystemMetadata(), urn, aspectSpec, aspect); } - - try { - // Step 1. Handle StructuredProperties Index Mapping changes - updateIndexMappings(urn, entitySpec, aspectSpec, aspect, previousAspect); - } catch (Exception e) { - log.error("Issue with updating index mappings for structured property change", e); - } - - // Step 2. For all aspects, attempt to update Search - updateSearchService(opContext, event); } public void updateIndexMappings( @@ -245,7 +255,9 @@ public void updateIndexMappings( * @param event the change event to be processed. 
*/ private void handleDeleteChangeEvent( - @Nonnull OperationContext opContext, @Nonnull final MCLItem event) { + @Nonnull OperationContext opContext, + @Nonnull final MCLItem event, + boolean forSystemMetadata) { final EntitySpec entitySpec = event.getEntitySpec(); final Urn urn = event.getUrn(); @@ -262,8 +274,11 @@ private void handleDeleteChangeEvent( Boolean isDeletingKey = event.getAspectName().equals(entitySpec.getKeyAspectName()); if (!aspectSpec.isTimeseries()) { - deleteSystemMetadata(urn, aspectSpec, isDeletingKey); - deleteSearchData(opContext, urn, entitySpec.getName(), aspectSpec, aspect, isDeletingKey); + if (!forSystemMetadata) { + deleteSearchData(opContext, urn, entitySpec.getName(), aspectSpec, aspect, isDeletingKey); + } else { + deleteSystemMetadata(urn, aspectSpec, isDeletingKey); + } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java index a5c2fb04b5ce39..594b1852740c7d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java @@ -1,8 +1,11 @@ package com.linkedin.metadata.systemmetadata; +import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.FIELD_ASPECT; +import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.FIELD_URN; import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.INDEX_NAME; import com.google.common.collect.ImmutableList; +import com.linkedin.metadata.search.elasticsearch.query.request.SearchAfterWrapper; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; @@ -103,8 +106,8 @@ public BulkByScrollResponse deleteByUrn(@Nonnull final String urn) { public BulkByScrollResponse deleteByUrnAspect( @Nonnull final String urn, @Nonnull final String aspect) { BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); - finalQuery.must(QueryBuilders.termQuery("urn", urn)); - finalQuery.must(QueryBuilders.termQuery("aspect", aspect)); + finalQuery.filter(QueryBuilders.termQuery("urn", urn)); + finalQuery.filter(QueryBuilders.termQuery("aspect", aspect)); final Optional deleteResponse = bulkProcessor.deleteByQuery(finalQuery, indexConvention.getIndexName(INDEX_NAME)); @@ -121,7 +124,7 @@ public SearchResponse findByParams( BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); for (String key : searchParams.keySet()) { - finalQuery.must(QueryBuilders.termQuery(key, searchParams.get(key))); + finalQuery.filter(QueryBuilders.termQuery(key, searchParams.get(key))); } if (!includeSoftDeleted) { @@ -161,7 +164,7 @@ public SearchResponse findByParams( BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); for (String key : searchParams.keySet()) { - finalQuery.must(QueryBuilders.termQuery(key, searchParams.get(key))); + finalQuery.filter(QueryBuilders.termQuery(key, searchParams.get(key))); } if (!includeSoftDeleted) { @@ -186,6 +189,43 @@ public SearchResponse findByParams( return null; } + public SearchResponse scroll( + BoolQueryBuilder queryBuilder, + boolean includeSoftDeleted, + @Nullable String scrollId, + @Nullable String pitId, + @Nullable String keepAlive, + int size) { + SearchRequest searchRequest = new SearchRequest(); + + SearchSourceBuilder 
searchSourceBuilder = new SearchSourceBuilder(); + + if (!includeSoftDeleted) { + queryBuilder.mustNot(QueryBuilders.termQuery("removed", "true")); + } + + Object[] sort = null; + if (scrollId != null) { + SearchAfterWrapper searchAfterWrapper = SearchAfterWrapper.fromScrollId(scrollId); + sort = searchAfterWrapper.getSort(); + } + + searchSourceBuilder.query(queryBuilder); + ESUtils.setSearchAfter(searchSourceBuilder, sort, pitId, keepAlive); + searchSourceBuilder.size(size); + searchSourceBuilder.sort(FIELD_URN).sort(FIELD_ASPECT); + + searchRequest.source(searchSourceBuilder); + searchRequest.indices(indexConvention.getIndexName(INDEX_NAME)); + + try { + return client.search(searchRequest, RequestOptions.DEFAULT); + } catch (IOException e) { + log.error("Error while searching by params.", e); + } + return null; + } + public SearchResponse findByRegistry( String registryName, String registryVersion, boolean includeSoftDeleted, int from, int size) { Map params = new HashMap<>(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index fe79ba75cb1d14..dfef592b7be943 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -1,9 +1,12 @@ package com.linkedin.metadata.systemmetadata; +import static io.datahubproject.metadata.context.TraceContext.TELEMETRY_TRACE_KEY; + import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.ImmutableMap; import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.SetMode; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; @@ -36,6 +39,7 @@ import lombok.extern.slf4j.Slf4j; import org.opensearch.action.search.SearchResponse; import org.opensearch.client.tasks.GetTaskResponse; +import org.opensearch.index.query.BoolQueryBuilder; import org.opensearch.index.query.QueryBuilders; import org.opensearch.search.SearchHits; import org.opensearch.search.aggregations.bucket.filter.ParsedFilter; @@ -56,10 +60,10 @@ public class ElasticSearchSystemMetadataService private static final String DOC_DELIMETER = "--"; public static final String INDEX_NAME = "system_metadata_service_v1"; - private static final String FIELD_URN = "urn"; - private static final String FIELD_ASPECT = "aspect"; + public static final String FIELD_URN = "urn"; + public static final String FIELD_ASPECT = "aspect"; private static final String FIELD_RUNID = "runId"; - private static final String FIELD_LAST_UPDATED = "lastUpdated"; + public static final String FIELD_LAST_UPDATED = "lastUpdated"; private static final String FIELD_REGISTRY_NAME = "registryName"; private static final String FIELD_REGISTRY_VERSION = "registryVersion"; private static final Set INDEX_FIELD_SET = @@ -82,6 +86,10 @@ private String toDocument(SystemMetadata systemMetadata, String urn, String aspe document.put("registryName", systemMetadata.getRegistryName()); document.put("registryVersion", systemMetadata.getRegistryVersion()); document.put("removed", false); + if (systemMetadata.getProperties() != null + && 
systemMetadata.getProperties().containsKey(TELEMETRY_TRACE_KEY)) { + document.put(TELEMETRY_TRACE_KEY, systemMetadata.getProperties().get(TELEMETRY_TRACE_KEY)); + } return document.toString(); } @@ -160,31 +168,18 @@ public List findByParams( Map systemMetaParams, boolean includeSoftDeleted, int from, int size) { SearchResponse searchResponse = _esDAO.findByParams(systemMetaParams, includeSoftDeleted, from, size); - if (searchResponse != null) { - SearchHits hits = searchResponse.getHits(); - List summaries = - Arrays.stream(hits.getHits()) - .map( - hit -> { - Map values = hit.getSourceAsMap(); - AspectRowSummary summary = new AspectRowSummary(); - summary.setRunId((String) values.get(FIELD_RUNID)); - summary.setAspectName((String) values.get(FIELD_ASPECT)); - summary.setUrn((String) values.get(FIELD_URN)); - Object timestamp = values.get(FIELD_LAST_UPDATED); - if (timestamp instanceof Long) { - summary.setTimestamp((Long) timestamp); - } else if (timestamp instanceof Integer) { - summary.setTimestamp(Long.valueOf((Integer) timestamp)); - } - summary.setKeyAspect(((String) values.get(FIELD_ASPECT)).endsWith("Key")); - return summary; - }) - .collect(Collectors.toList()); - return summaries; - } else { - return Collections.emptyList(); - } + return toAspectRowSummary(searchResponse); + } + + @Override + public List findAspectsByUrn( + @Nonnull Urn urn, @Nonnull List aspects, boolean includeSoftDeleted) { + BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery(); + boolQueryBuilder.filter(QueryBuilders.termQuery(FIELD_URN, urn.toString())); + boolQueryBuilder.filter(QueryBuilders.termsQuery(FIELD_ASPECT, aspects)); + SearchResponse searchResponse = + _esDAO.scroll(boolQueryBuilder, includeSoftDeleted, null, null, null, aspects.size()); + return toAspectRowSummary(searchResponse); } @Override @@ -254,4 +249,32 @@ public void clear() { _esBulkProcessor.deleteByQuery( QueryBuilders.matchAllQuery(), true, _indexConvention.getIndexName(INDEX_NAME)); } + + private static List toAspectRowSummary(SearchResponse searchResponse) { + if (searchResponse != null) { + SearchHits hits = searchResponse.getHits(); + return Arrays.stream(hits.getHits()) + .map( + hit -> { + Map values = hit.getSourceAsMap(); + AspectRowSummary summary = new AspectRowSummary(); + summary.setRunId((String) values.get(FIELD_RUNID)); + summary.setAspectName((String) values.get(FIELD_ASPECT)); + summary.setUrn((String) values.get(FIELD_URN)); + Object timestamp = values.get(FIELD_LAST_UPDATED); + if (timestamp instanceof Long) { + summary.setTimestamp((Long) timestamp); + } else if (timestamp instanceof Integer) { + summary.setTimestamp(Long.valueOf((Integer) timestamp)); + } + summary.setKeyAspect(((String) values.get(FIELD_ASPECT)).endsWith("Key")); + summary.setTelemetryTraceId( + (String) values.get(TELEMETRY_TRACE_KEY), SetMode.IGNORE_NULL); + return summary; + }) + .collect(Collectors.toList()); + } else { + return Collections.emptyList(); + } + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataMappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataMappingsBuilder.java index 6623580548706b..9b8c50a81f8510 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataMappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataMappingsBuilder.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.systemmetadata; +import static 
io.datahubproject.metadata.context.TraceContext.TELEMETRY_TRACE_KEY; + import com.google.common.collect.ImmutableMap; import java.util.HashMap; import java.util.Map; @@ -19,6 +21,7 @@ public static Map getMappings() { mappings.put("registryVersion", getMappingsForKeyword()); mappings.put("registryName", getMappingsForKeyword()); mappings.put("removed", getMappingsForRemoved()); + mappings.put(TELEMETRY_TRACE_KEY, getMappingsForKeyword()); return ImmutableMap.of("properties", mappings); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index 4d940c229dc9af..0862077f4d1fa7 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -3,7 +3,6 @@ import static com.linkedin.metadata.Constants.*; import static com.linkedin.metadata.utils.CriterionUtils.buildCriterion; -import com.codahale.metrics.Timer; import com.datahub.util.RecordUtils; import com.datahub.util.exception.ESQueryException; import com.fasterxml.jackson.core.JsonProcessingException; @@ -46,7 +45,6 @@ import com.linkedin.timeseries.TimeseriesIndexSizeResult; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; -import io.datahubproject.metadata.context.SearchContext; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -404,19 +402,24 @@ public List getAspectValues( searchRequest.indices(indexName); log.debug("Search request is: " + searchRequest); - SearchHits hits; - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "searchAspectValues_search").time()) { - final SearchResponse searchResponse = - searchClient.search(searchRequest, RequestOptions.DEFAULT); - hits = searchResponse.getHits(); - } catch (Exception e) { - log.error("Search query failed:", e); - throw new ESQueryException("Search query failed:", e); - } - return Arrays.stream(hits.getHits()) - .map(ElasticSearchTimeseriesAspectService::parseDocument) - .collect(Collectors.toList()); + return opContext.withSpan( + "searchAspectValues_search", + () -> { + SearchHits hits; + try { + final SearchResponse searchResponse = + searchClient.search(searchRequest, RequestOptions.DEFAULT); + hits = searchResponse.getHits(); + } catch (Exception e) { + log.error("Search query failed:", e); + throw new ESQueryException("Search query failed:", e); + } + return Arrays.stream(hits.getHits()) + .map(ElasticSearchTimeseriesAspectService::parseDocument) + .collect(Collectors.toList()); + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "searchAspectValues_search")); } @Nonnull @@ -681,13 +684,7 @@ public TimeseriesScrollResult scrollAspects( SearchResponse response = executeScrollSearchQuery( - opContext.getSearchContext(), - entityName, - aspectName, - filterQueryBuilder, - sortCriteria, - scrollId, - count); + opContext, entityName, aspectName, filterQueryBuilder, sortCriteria, scrollId, count); int totalCount = (int) response.getHits().getTotalHits().value; List> resultPairs = @@ -704,7 +701,7 @@ public TimeseriesScrollResult scrollAspects( } private SearchResponse executeScrollSearchQuery( - @Nonnull SearchContext searchContext, + @Nonnull OperationContext opContext, @Nonnull final String entityName, 
@Nonnull final String aspectName, @Nonnull final QueryBuilder query, @@ -729,14 +726,22 @@ private SearchResponse executeScrollSearchQuery( ESUtils.setSearchAfter(searchSourceBuilder, sort, null, null); searchRequest.indices( - searchContext.getIndexConvention().getTimeseriesAspectIndexName(entityName, aspectName)); - - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), "scrollAspects_search").time()) { - return searchClient.search(searchRequest, RequestOptions.DEFAULT); - } catch (Exception e) { - log.error("Search query failed", e); - throw new ESQueryException("Search query failed:", e); - } + opContext + .getSearchContext() + .getIndexConvention() + .getTimeseriesAspectIndexName(entityName, aspectName)); + + return opContext.withSpan( + "scrollAspects_search", + () -> { + try { + return searchClient.search(searchRequest, RequestOptions.DEFAULT); + } catch (Exception e) { + log.error("Search query failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "scrollAspects_search")); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/UsageServiceUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/UsageServiceUtil.java index 54f97f45219ac5..671b142b13150f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/UsageServiceUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/UsageServiceUtil.java @@ -41,7 +41,6 @@ import java.time.Instant; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.TimeUnit; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; @@ -122,25 +121,22 @@ public static UsageQueryResult query( long took; // 2. Get buckets. - timer = MetricUtils.timer(UsageServiceUtil.class, "getBuckets").time(); UsageAggregationArray buckets = - getBuckets(opContext, timeseriesAspectService, filter, resource, duration); - took = timer.stop(); - log.info( - "Usage stats for resource {} returned {} buckets in {} ms", - resource, - buckets.size(), - TimeUnit.NANOSECONDS.toMillis(took)); + opContext.withSpan( + "getBuckets", + () -> getBuckets(opContext, timeseriesAspectService, filter, resource, duration), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(UsageServiceUtil.class, "getBuckets")); + log.info("Usage stats for resource {} returned {} buckets", resource, buckets.size()); // 3. Get aggregations. - timer = MetricUtils.timer(UsageServiceUtil.class, "getAggregations").time(); UsageQueryResultAggregations aggregations = - getAggregations(opContext, timeseriesAspectService, filter); - took = timer.stop(); - log.info( - "Usage stats aggregation for resource {} took {} ms", - resource, - TimeUnit.NANOSECONDS.toMillis(took)); + opContext.withSpan( + "getAggregations", + () -> getAggregations(opContext, timeseriesAspectService, filter), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(UsageServiceUtil.class, "getAggregations")); + log.info("Usage stats aggregation for resource {}", resource); // 4. Compute totalSqlQuery count from the buckets itself. // We want to avoid issuing an additional query with a sum aggregation. 
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/trace/KafkaTraceReader.java b/metadata-io/src/main/java/com/linkedin/metadata/trace/KafkaTraceReader.java new file mode 100644 index 00000000000000..9b054ba1d0dfa1 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/trace/KafkaTraceReader.java @@ -0,0 +1,460 @@ +package com.linkedin.metadata.trace; + +import static io.datahubproject.metadata.context.TraceContext.TELEMETRY_TRACE_KEY; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.metadata.systemmetadata.TraceStorageStatus; +import com.linkedin.metadata.systemmetadata.TraceWriteStatus; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import java.time.Duration; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.experimental.SuperBuilder; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.ListConsumerGroupOffsetsResult; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetAndTimestamp; +import org.apache.kafka.clients.producer.internals.DefaultPartitioner; +import org.apache.kafka.common.Cluster; +import org.apache.kafka.common.Node; +import org.apache.kafka.common.PartitionInfo; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.protocol.types.SchemaException; + +@Slf4j +@SuperBuilder +public abstract class KafkaTraceReader { + private final AdminClient adminClient; + private final Supplier> consumerSupplier; + private final int pollDurationMs; + private final int pollMaxAttempts; + + @Nonnull private final ExecutorService executorService; + private final long timeoutSeconds; + + private final Cache topicPartitionCache = + Caffeine.newBuilder() + .maximumSize(1_000) // Maximum number of entries + .expireAfterWrite(Duration.ofHours(1)) // expire entries after 1 hour + .build(); + private final Cache offsetCache = + Caffeine.newBuilder() + .maximumSize(100) // unlikely to have more than 100 partitions + .expireAfterWrite(Duration.ofMinutes(5)) // Shorter expiry for offsets + .build(); + + public KafkaTraceReader( + AdminClient adminClient, + Supplier> consumerSupplier, + int pollDurationMillis, + int pollMaxAttempts, + ExecutorService executorService, + long timeoutSeconds) { + this.adminClient = adminClient; + this.consumerSupplier = consumerSupplier; + this.pollDurationMs = pollDurationMillis; + this.pollMaxAttempts = pollMaxAttempts; + this.executorService = executorService; + this.timeoutSeconds = timeoutSeconds; + } + + @Nonnull + protected abstract String getTopicName(); + + @Nullable + protected abstract String getConsumerGroupId(); + + 
public abstract Optional read(@Nullable GenericRecord genericRecord); + + protected abstract Optional, SystemMetadata>> + matchConsumerRecord( + ConsumerRecord consumerRecord, String traceId, String aspectName); + + /** + * Determines the write status of a trace by comparing consumer offset with message offset. + * + * @return PENDING if the message exists but hasn't been consumed yet, UNKNOWN if no consumer + * offset exists, ERROR in other cases + */ + public Map> tracePendingStatuses( + Map> urnAspectPairs, String traceId, Long traceTimestampMillis) { + return tracePendingStatuses(urnAspectPairs, traceId, traceTimestampMillis, false); + } + + public Map> tracePendingStatuses( + Map> urnAspectPairs, + String traceId, + Long traceTimestampMillis, + boolean skipCache) { + + List>>> futures = + urnAspectPairs.entrySet().stream() + .map( + entry -> + CompletableFuture.supplyAsync( + () -> { + try { + Map result = + tracePendingStatuses( + entry.getKey(), + entry.getValue(), + traceId, + traceTimestampMillis, + skipCache); + return Map.entry(entry.getKey(), result); + } catch (Exception e) { + log.error( + "Error processing trace status for URN: {}", entry.getKey(), e); + return Map.entry( + entry.getKey(), Collections.emptyMap()); + } + }, + executorService)) + .collect(Collectors.toList()); + + try { + List>> results = + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])) + .thenApply( + v -> futures.stream().map(CompletableFuture::join).collect(Collectors.toList())) + .get(timeoutSeconds, TimeUnit.SECONDS); + + return results.stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, Map.Entry::getValue, (existing, replacement) -> existing)); + } catch (Exception e) { + log.error("Error processing parallel trace status requests", e); + throw new RuntimeException("Failed to process parallel trace status requests", e); + } + } + + /** + * Find messages in the kafka topic by urn, aspect names, and trace id using the timestamp to seek + * to the expected location. 
+ * + * @return Map of aspect name to matching record pair, containing only the aspects that were found + */ + public Map, SystemMetadata>>> + findMessages( + Map> urnAspectPairs, String traceId, Long traceTimestampMillis) { + + List< + CompletableFuture< + Map.Entry< + Urn, Map, SystemMetadata>>>>> + futures = + urnAspectPairs.entrySet().stream() + .map( + entry -> + CompletableFuture.supplyAsync( + () -> { + try { + Map< + String, + Pair, SystemMetadata>> + result = + findMessages( + entry.getKey(), + entry.getValue(), + traceId, + traceTimestampMillis); + return Map.entry(entry.getKey(), result); + } catch (Exception e) { + log.error("Error processing trace for URN: {}", entry.getKey(), e); + return Map.entry( + entry.getKey(), + Collections + ., + SystemMetadata>> + emptyMap()); + } + }, + executorService)) + .collect(Collectors.toList()); + + try { + List, SystemMetadata>>>> + results = + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])) + .thenApply( + v -> + futures.stream() + .map(CompletableFuture::join) + .collect(Collectors.toList())) + .get(timeoutSeconds, TimeUnit.SECONDS); + + return results.stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, Map.Entry::getValue, (existing, replacement) -> existing)); + } catch (Exception e) { + log.error("Error processing parallel trace requests", e); + throw new RuntimeException("Failed to process parallel trace requests", e); + } + } + + private Map tracePendingStatuses( + Urn urn, + Collection aspectNames, + String traceId, + Long traceTimestampMillis, + boolean skipCache) { + try { + TopicPartition topicPartition = getTopicPartition(urn); + Optional offsetMetadata = getOffsetAndMetadata(topicPartition, skipCache); + if (offsetMetadata.isEmpty()) { + log.warn("No consumer offset to compare with."); + return aspectNames.stream() + .collect( + Collectors.toMap( + aspectName -> aspectName, + aspectName -> + TraceStorageStatus.ok( + TraceWriteStatus.UNKNOWN, "Missing consumer offsets."))); + } + + Map, SystemMetadata>> messages = + findMessages(urn, aspectNames, traceId, traceTimestampMillis); + + return aspectNames.stream() + .collect( + Collectors.toMap( + aspectName -> aspectName, + aspectName -> { + Pair, SystemMetadata> message = + messages.get(aspectName); + if (message != null + && offsetMetadata.get().offset() < message.getFirst().offset()) { + return TraceStorageStatus.ok( + TraceWriteStatus.PENDING, "Consumer has not processed offset."); + } + return TraceStorageStatus.fail( + TraceWriteStatus.ERROR, "Consumer has processed past the offset."); + })); + } catch (ExecutionException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + /** + * Get the offset metadata for a specific TopicPartition from the consumer group. This method is + * now the primary interface for offset lookup and uses caching. 
+ */ + private Optional getOffsetAndMetadata( + TopicPartition topicPartition, boolean skipCache) { + if (skipCache) { + offsetCache.invalidate(topicPartition); + } + + return Optional.ofNullable( + offsetCache.get( + topicPartition, + tp -> { + final String consumerGroupId = Objects.requireNonNull(getConsumerGroupId()); + + try { + ListConsumerGroupOffsetsResult offsetsResult = + adminClient.listConsumerGroupOffsets(consumerGroupId); + + if (offsetsResult == null) { + log.error("Failed to get consumer group offsets for group: {}", consumerGroupId); + return null; + } + + Map offsets = + offsetsResult.partitionsToOffsetAndMetadata().get(); + + if (offsets == null) { + log.error("Null offsets returned for consumer group: {}", consumerGroupId); + return null; + } + + OffsetAndMetadata offsetAndMetadata = offsets.get(tp); + if (offsetAndMetadata == null) { + log.warn( + "No committed offset found for Topic: {}, Partition: {}, Group: {}", + tp.topic(), + tp.partition(), + consumerGroupId); + return null; + } + + log.debug( + "Found offset metadata {} for Topic: {}, Partition: {}, Group: {}", + offsetAndMetadata, + tp.topic(), + tp.partition(), + consumerGroupId); + + return offsetAndMetadata; + } catch (SchemaException e) { + log.error("Schema error when fetching consumer group offsets", e); + return null; + } catch (Exception e) { + log.error("Error fetching consumer group offsets", e); + return null; + } + })); + } + + private Map, SystemMetadata>> findMessages( + Urn urn, Collection aspectNames, String traceId, Long traceTimestampMillis) + throws ExecutionException, InterruptedException { + + TopicPartition topicPartition = getTopicPartition(urn); + + try (Consumer consumer = consumerSupplier.get()) { + // Assign the partition we want to read from + consumer.assign(Collections.singleton(topicPartition)); + + // Get offset for timestamp + OffsetAndTimestamp offsetAndTimestamp = + getOffsetByTime(consumer, topicPartition, traceTimestampMillis); + + if (offsetAndTimestamp == null) { + log.debug( + "No offset found for timestamp {} in partition {}", + traceTimestampMillis, + topicPartition); + return Collections.emptyMap(); + } + + // Seek to the offset for the timestamp + consumer.seek(topicPartition, offsetAndTimestamp.offset()); + log.debug( + "Seeking to timestamp-based offset {} for partition {}", + offsetAndTimestamp.offset(), + topicPartition); + + // Poll with a maximum number of attempts + int attempts = 0; + long lastProcessedOffset = -1; + Map, SystemMetadata>> results = + new HashMap<>(); + + while (attempts < pollMaxAttempts && results.size() < aspectNames.size()) { + var records = consumer.poll(java.time.Duration.ofMillis(pollDurationMs)); + attempts++; + + if (records.isEmpty()) { + break; + } + + // Check if we're making progress + long currentOffset = consumer.position(topicPartition); + if (currentOffset == lastProcessedOffset) { + break; + } + lastProcessedOffset = currentOffset; + + // Process records for each aspect name we haven't found yet + for (String aspectName : aspectNames) { + if (!results.containsKey(aspectName)) { + var matchingRecord = + records.records(topicPartition).stream() + .filter(record -> record.key().equals(urn.toString())) + .flatMap(record -> matchConsumerRecord(record, traceId, aspectName).stream()) + .findFirst(); + + matchingRecord.ifPresent(pair -> results.put(aspectName, pair)); + } + } + } + + return results; + } + } + + protected static boolean traceIdMatch(@Nullable SystemMetadata systemMetadata, String traceId) { + return systemMetadata != 
null + && systemMetadata.getProperties() != null + && traceId.equals(systemMetadata.getProperties().get(TELEMETRY_TRACE_KEY)); + } + + private TopicPartition getTopicPartition(Urn urn) { + return topicPartitionCache.get( + urn.toString(), + key -> { + try { + DefaultPartitioner partitioner = new DefaultPartitioner(); + + TopicDescription topicDescription = + adminClient + .describeTopics(Collections.singletonList(getTopicName())) + .all() + .get() + .get(getTopicName()); + + if (topicDescription == null) { + throw new IllegalStateException("Topic " + getTopicName() + " not found"); + } + + List partitions = + topicDescription.partitions().stream() + .map( + p -> + new PartitionInfo( + getTopicName(), + p.partition(), + p.leader(), + p.replicas().toArray(new Node[0]), + p.isr().toArray(new Node[0]), + p.replicas().toArray(new Node[0]))) + .collect(Collectors.toList()); + + List nodes = + partitions.stream() + .map(PartitionInfo::leader) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + + Cluster cluster = + new Cluster( + null, nodes, partitions, Collections.emptySet(), Collections.emptySet()); + + int partition = + partitioner.partition(getTopicName(), key, key.getBytes(), null, null, cluster); + + return new TopicPartition(getTopicName(), partition); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException("Failed to get topic partition for " + key, e); + } + }); + } + + private static OffsetAndTimestamp getOffsetByTime( + Consumer consumer, + TopicPartition topicPartition, + Long traceTimestampMillis) { + // If we have a timestamp, first seek to that approximate location + Map timestampsToSearch = + Collections.singletonMap(topicPartition, traceTimestampMillis); + + return consumer.offsetsForTimes(timestampsToSearch).get(topicPartition); + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/trace/MCLTraceReader.java b/metadata-io/src/main/java/com/linkedin/metadata/trace/MCLTraceReader.java new file mode 100644 index 00000000000000..98da8f87d3b5ad --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/trace/MCLTraceReader.java @@ -0,0 +1,43 @@ +package com.linkedin.metadata.trace; + +import com.linkedin.metadata.EventUtils; +import com.linkedin.mxe.MetadataChangeLog; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import java.io.IOException; +import java.util.Optional; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.experimental.SuperBuilder; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +@Getter +@SuperBuilder +public class MCLTraceReader extends KafkaTraceReader { + @Nonnull private final String topicName; + @Nullable private final String consumerGroupId; + + @Override + public Optional read(@Nullable GenericRecord genericRecord) { + try { + return Optional.ofNullable( + genericRecord == null ? 
null : EventUtils.avroToPegasusMCL(genericRecord)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + protected Optional, SystemMetadata>> + matchConsumerRecord( + ConsumerRecord consumerRecord, String traceId, String aspectName) { + return read(consumerRecord.value()) + .filter( + event -> + traceIdMatch(event.getSystemMetadata(), traceId) + && aspectName.equals(event.getAspectName())) + .map(event -> Pair.of(consumerRecord, event.getSystemMetadata())); + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/trace/MCPFailedTraceReader.java b/metadata-io/src/main/java/com/linkedin/metadata/trace/MCPFailedTraceReader.java new file mode 100644 index 00000000000000..bb3520d6630d84 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/trace/MCPFailedTraceReader.java @@ -0,0 +1,45 @@ +package com.linkedin.metadata.trace; + +import com.linkedin.metadata.EventUtils; +import com.linkedin.mxe.FailedMetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import java.io.IOException; +import java.util.Optional; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.experimental.SuperBuilder; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +@Getter +@SuperBuilder +public class MCPFailedTraceReader extends KafkaTraceReader { + @Nonnull private final String topicName; + @Nullable private final String consumerGroupId; + + @Override + public Optional read(@Nullable GenericRecord genericRecord) { + try { + return Optional.ofNullable( + genericRecord == null ? null : EventUtils.avroToPegasusFailedMCP(genericRecord)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + protected Optional, SystemMetadata>> + matchConsumerRecord( + ConsumerRecord consumerRecord, String traceId, String aspectName) { + return read(consumerRecord.value()) + .filter( + event -> + traceIdMatch(event.getMetadataChangeProposal().getSystemMetadata(), traceId) + && aspectName.equals(event.getMetadataChangeProposal().getAspectName())) + .map( + event -> + Pair.of(consumerRecord, event.getMetadataChangeProposal().getSystemMetadata())); + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/trace/MCPTraceReader.java b/metadata-io/src/main/java/com/linkedin/metadata/trace/MCPTraceReader.java new file mode 100644 index 00000000000000..99781e80416de9 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/trace/MCPTraceReader.java @@ -0,0 +1,43 @@ +package com.linkedin.metadata.trace; + +import com.linkedin.metadata.EventUtils; +import com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import java.io.IOException; +import java.util.Optional; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.experimental.SuperBuilder; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +@Getter +@SuperBuilder +public class MCPTraceReader extends KafkaTraceReader { + @Nonnull private final String topicName; + @Nullable private final String consumerGroupId; + + @Override + public Optional read(@Nullable GenericRecord genericRecord) { + try { + return Optional.ofNullable( + genericRecord == null ? 
null : EventUtils.avroToPegasusMCP(genericRecord)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + protected Optional, SystemMetadata>> + matchConsumerRecord( + ConsumerRecord consumerRecord, String traceId, String aspectName) { + return read(consumerRecord.value()) + .filter( + event -> + traceIdMatch(event.getSystemMetadata(), traceId) + && aspectName.equals(event.getAspectName())) + .map(event -> Pair.of(consumerRecord, event.getSystemMetadata())); + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java new file mode 100644 index 00000000000000..51e30bd6f6658d --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java @@ -0,0 +1,484 @@ +package com.linkedin.metadata.trace; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.run.AspectRowSummary; +import com.linkedin.metadata.systemmetadata.SystemMetadataService; +import com.linkedin.metadata.systemmetadata.TraceService; +import com.linkedin.metadata.systemmetadata.TraceStatus; +import com.linkedin.metadata.systemmetadata.TraceStorageStatus; +import com.linkedin.metadata.systemmetadata.TraceWriteStatus; +import com.linkedin.mxe.FailedMetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.TraceContext; +import io.datahubproject.metadata.context.TraceIdGenerator; +import io.datahubproject.metadata.exception.TraceException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.Builder; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +@Builder +@Slf4j +public class TraceServiceImpl implements TraceService { + private final EntityRegistry entityRegistry; + private final SystemMetadataService systemMetadataService; + private final EntityService entityService; + private final MCPTraceReader mcpTraceReader; + private final MCPFailedTraceReader mcpFailedTraceReader; + private final MCLTraceReader mclVersionedTraceReader; + private final MCLTraceReader mclTimeseriesTraceReader; + + public TraceServiceImpl( + EntityRegistry entityRegistry, + SystemMetadataService systemMetadataService, + EntityService entityService, + MCPTraceReader mcpTraceReader, + MCPFailedTraceReader mcpFailedTraceReader, + MCLTraceReader mclVersionedTraceReader, + MCLTraceReader mclTimeseriesTraceReader) { + this.entityRegistry = entityRegistry; + this.systemMetadataService = systemMetadataService; + this.entityService = entityService; + this.mcpTraceReader = mcpTraceReader; + this.mcpFailedTraceReader = mcpFailedTraceReader; + 
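Aside (not part of the diff): the four readers injected in this constructor are combined with the existing entity and system-metadata services; a hedged assembly sketch using the class's @Builder, assuming all dependencies are supplied by existing factories elsewhere.

import com.linkedin.metadata.entity.EntityService;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.metadata.systemmetadata.SystemMetadataService;
import com.linkedin.metadata.systemmetadata.TraceService;

class TraceServiceWiringSketch {
  // Hypothetical helper: assembles the TraceService facade from its dependencies.
  static TraceService traceService(
      EntityRegistry entityRegistry,
      SystemMetadataService systemMetadataService,
      EntityService<?> entityService,
      MCPTraceReader mcpTraceReader,
      MCPFailedTraceReader mcpFailedTraceReader,
      MCLTraceReader mclVersionedTraceReader,
      MCLTraceReader mclTimeseriesTraceReader) {
    return TraceServiceImpl.builder()
        .entityRegistry(entityRegistry)
        .systemMetadataService(systemMetadataService)
        .entityService(entityService)
        .mcpTraceReader(mcpTraceReader)
        .mcpFailedTraceReader(mcpFailedTraceReader)
        .mclVersionedTraceReader(mclVersionedTraceReader)
        .mclTimeseriesTraceReader(mclTimeseriesTraceReader)
        .build();
  }
}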
this.mclVersionedTraceReader = mclVersionedTraceReader; + this.mclTimeseriesTraceReader = mclTimeseriesTraceReader; + } + + @Nonnull + @Override + public Map> trace( + @Nonnull OperationContext opContext, + @Nonnull String traceId, + @Nonnull Map> aspectNames, + boolean onlyIncludeErrors, + boolean detailed, + boolean skipCache) { + + long traceTimestampMillis = TraceIdGenerator.getTimestampMillis(traceId); + + // Get primary status for all URNs + Map> primaryStatuses = + tracePrimaryInParallel( + opContext, traceId, traceTimestampMillis, aspectNames, detailed, skipCache); + + // Get search status for all URNs using primary results + Map> searchStatuses = + traceSearchInParallel( + opContext, traceId, traceTimestampMillis, aspectNames, primaryStatuses, skipCache); + + // Merge and filter results for each URN + Map> mergedResults = + aspectNames.keySet().stream() + .collect( + Collectors.toMap( + urn -> urn, + urn -> + mergeStatus( + primaryStatuses.getOrDefault(urn, new LinkedHashMap<>()), + searchStatuses.getOrDefault(urn, new LinkedHashMap<>()), + onlyIncludeErrors))); + + // Remove URNs with empty aspect maps (when filtering for errors) + return mergedResults.entrySet().stream() + .filter(entry -> !entry.getValue().isEmpty()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + private Map> tracePrimaryInParallel( + @Nonnull OperationContext opContext, + @Nonnull String traceId, + long traceTimestampMillis, + @Nonnull Map> aspectNames, + boolean detailed, + boolean skipCache) { + + // Group aspects by whether they are timeseries + Map> timeseriesResults = new HashMap<>(); + Map> nonTimeseriesAspects = new HashMap<>(); + + for (Map.Entry> entry : aspectNames.entrySet()) { + Urn urn = entry.getKey(); + EntitySpec entitySpec = entityRegistry.getEntitySpec(urn.getEntityType()); + + Map timeseriesStatuses = new LinkedHashMap<>(); + Set remainingAspects = new HashSet<>(); + + for (String aspectName : entry.getValue()) { + if (entitySpec.getAspectSpec(aspectName).isTimeseries()) { + timeseriesStatuses.put(aspectName, TraceStorageStatus.NO_OP); + } else { + remainingAspects.add(aspectName); + } + } + + if (!timeseriesStatuses.isEmpty()) { + timeseriesResults.put(urn, timeseriesStatuses); + } + if (!remainingAspects.isEmpty()) { + nonTimeseriesAspects.put(urn, remainingAspects); + } + } + + // Process non-timeseries aspects using SQL + Map> sqlResults = new HashMap<>(); + if (!nonTimeseriesAspects.isEmpty()) { + try { + Map responses = + entityService.getEntitiesV2( + opContext, + nonTimeseriesAspects.keySet().iterator().next().getEntityType(), + nonTimeseriesAspects.keySet(), + nonTimeseriesAspects.values().stream() + .flatMap(Collection::stream) + .collect(Collectors.toSet()), + false); + + for (Map.Entry entry : responses.entrySet()) { + Map aspectStatuses = new LinkedHashMap<>(); + for (Map.Entry aspectEntry : + entry.getValue().getAspects().entrySet()) { + long createdOnMillis = aspectEntry.getValue().getCreated().getTime(); + SystemMetadata systemMetadata = aspectEntry.getValue().getSystemMetadata(); + String systemTraceId = extractTraceId(systemMetadata); + Optional aspectLastUpdated = extractLastUpdated(systemMetadata); + String aspectName = aspectEntry.getKey(); + + if (traceId.equals(systemTraceId)) { + aspectStatuses.put(aspectName, TraceStorageStatus.ok(TraceWriteStatus.ACTIVE_STATE)); + } else if (traceTimestampMillis <= extractTimestamp(systemTraceId, createdOnMillis)) { + aspectStatuses.put( + aspectName, 
TraceStorageStatus.ok(TraceWriteStatus.HISTORIC_STATE)); + } else if (createdOnMillis < traceTimestampMillis + && traceTimestampMillis < aspectLastUpdated.orElse(traceTimestampMillis)) { + aspectStatuses.put(aspectName, TraceStorageStatus.ok(TraceWriteStatus.NO_OP)); + } + } + sqlResults.put(entry.getKey(), aspectStatuses); + } + } catch (Exception e) { + log.error("Error getting entities", e); + } + } + + // Account for sql results + Map> remainingAspects = new HashMap<>(); + for (Map.Entry> entry : nonTimeseriesAspects.entrySet()) { + Set foundAspects = + sqlResults.getOrDefault(entry.getKey(), Collections.emptyMap()).keySet(); + Set remaining = new HashSet<>(entry.getValue()); + remaining.removeAll(foundAspects); + if (!remaining.isEmpty()) { + remainingAspects.put(entry.getKey(), new ArrayList<>(remaining)); + } + } + + // Get remaining aspects from Kafka + Map> kafkaResults = + mcpTraceReader.tracePendingStatuses( + remainingAspects, traceId, traceTimestampMillis, skipCache); + + // Merge all results + Map> finalResults = new HashMap<>(); + for (Urn urn : aspectNames.keySet()) { + LinkedHashMap merged = new LinkedHashMap<>(); + merged.putAll(timeseriesResults.getOrDefault(urn, Collections.emptyMap())); + merged.putAll(sqlResults.getOrDefault(urn, Collections.emptyMap())); + merged.putAll(kafkaResults.getOrDefault(urn, Collections.emptyMap())); + finalResults.put(urn, merged); + } + + if (detailed) { + handleFailedMCP(opContext, finalResults, traceId, traceTimestampMillis); + } + + return finalResults; + } + + private Optional extractLastUpdated(@Nullable SystemMetadata systemMetadata) { + return Optional.ofNullable(systemMetadata) + .flatMap(sysMeta -> Optional.ofNullable(sysMeta.getLastObserved())); + } + + private void handleFailedMCP( + @Nonnull OperationContext opContext, + Map> finalResults, + @Nonnull String traceId, + long traceTimestampMillis) { + // Create a map of URNs and aspects that need to be checked in the failed topic + Map> aspectsToCheck = new HashMap<>(); + + // Filter for aspects with ERROR, NO_OP, or UNKNOWN status that might be in the failed topic + for (Map.Entry> entry : + finalResults.entrySet()) { + Urn urn = entry.getKey(); + EntitySpec entitySpec = entityRegistry.getEntitySpec(urn.getEntityType()); + + /* + * ERROR - to fetch exception + * NO_OP - to validate there wasn't a failure during an expected NO_OP + * UNKNOWN - ambiguous case resolution + */ + List aspectsToVerify = + entry.getValue().entrySet().stream() + .filter(aspect -> !entitySpec.getAspectSpec(aspect.getKey()).isTimeseries()) + .filter( + aspect -> + Set.of( + TraceWriteStatus.ERROR, + TraceWriteStatus.NO_OP, + TraceWriteStatus.UNKNOWN) + .contains(aspect.getValue().getWriteStatus())) + .map(Map.Entry::getKey) + .collect(Collectors.toList()); + + if (!aspectsToVerify.isEmpty()) { + aspectsToCheck.put(entry.getKey(), aspectsToVerify); + } + } + + // If there are no aspects to check, return early + if (aspectsToCheck.isEmpty()) { + return; + } + + try { + // Find messages in the failed topic for these URNs and aspects + Map, SystemMetadata>>> + failedMessages = + mcpFailedTraceReader.findMessages(aspectsToCheck, traceId, traceTimestampMillis); + + // Update the status for any aspects found in the failed topic + for (Map.Entry, SystemMetadata>>> + entry : failedMessages.entrySet()) { + Urn urn = entry.getKey(); + LinkedHashMap urnStatuses = finalResults.get(urn); + + if (urnStatuses != null) { + for (Map.Entry, SystemMetadata>> + aspectEntry : entry.getValue().entrySet()) { + String aspectName = 
aspectEntry.getKey(); + + // If we found the message in the failed topic, update its status (possible transition + // from UNKNOWN) + TraceStorageStatus.TraceStorageStatusBuilder builder = + TraceStorageStatus.builder().writeStatus(TraceWriteStatus.ERROR); + + // Populate the exception if possible + mcpFailedTraceReader + .read(aspectEntry.getValue().getFirst().value()) + .ifPresent( + failedMCP -> + builder.writeExceptions(extractTraceExceptions(opContext, failedMCP))); + + urnStatuses.put(aspectName, builder.build()); + } + } + } + } catch (Exception e) { + log.error("Error processing failed MCP messages", e); + } + } + + private Map> traceSearchInParallel( + @Nonnull OperationContext opContext, + @Nonnull String traceId, + long traceTimestampMillis, + @Nonnull Map> aspectNames, + @Nonnull Map> primaryStatuses, + boolean skipCache) { + + Map> aspectsToResolve = new HashMap<>(); + Map> finalResults = new HashMap<>(); + + // 1. Consider status of primary storage write + for (Map.Entry> entry : aspectNames.entrySet()) { + Urn urn = entry.getKey(); + EntitySpec entitySpec = entityRegistry.getEntitySpec(urn.getEntityType()); + LinkedHashMap finalResponse = new LinkedHashMap<>(); + List remaining = new ArrayList<>(); + + Map primaryStatus = + primaryStatuses.getOrDefault(urn, new LinkedHashMap<>()); + + for (String aspectName : entry.getValue()) { + TraceWriteStatus status = primaryStatus.get(aspectName).getWriteStatus(); + if (status == TraceWriteStatus.PENDING) { + finalResponse.put( + aspectName, + TraceStorageStatus.ok(TraceWriteStatus.PENDING, "Pending primary storage write.")); + } else if (status == TraceWriteStatus.NO_OP) { + if (entitySpec.getAspectSpec(aspectName).isTimeseries()) { + finalResponse.put( + aspectName, TraceStorageStatus.ok(TraceWriteStatus.TRACE_NOT_IMPLEMENTED)); + } else { + finalResponse.put(aspectName, TraceStorageStatus.NO_OP); + } + } else if (status == TraceWriteStatus.ERROR) { + finalResponse.put( + aspectName, + TraceStorageStatus.fail(TraceWriteStatus.ERROR, "Primary storage write failed.")); + } else if (status == TraceWriteStatus.TRACE_NOT_IMPLEMENTED + || status == TraceWriteStatus.UNKNOWN) { + finalResponse.put( + aspectName, + TraceStorageStatus.ok( + TraceWriteStatus.UNKNOWN, "Primary storage write indeterminate.")); + } else { + remaining.add(aspectName); + } + } + + if (!remaining.isEmpty()) { + aspectsToResolve.put(urn, remaining); + } + if (!finalResponse.isEmpty()) { + finalResults.put(urn, finalResponse); + } + } + + // 2. 
Check implied search write using system metadata + if (!aspectsToResolve.isEmpty()) { + // Get system metadata & group by URN + Map> summariesByUrn = + aspectsToResolve.entrySet().stream() + .flatMap( + entry -> + systemMetadataService + .findAspectsByUrn(entry.getKey(), entry.getValue(), true) + .stream()) + .collect(Collectors.groupingBy(summary -> UrnUtils.getUrn(summary.getUrn()))); + + // Process each URN's summaries + for (Map.Entry> entry : aspectsToResolve.entrySet()) { + Urn urn = entry.getKey(); + List remaining = new ArrayList<>(entry.getValue()); + LinkedHashMap response = + finalResults.computeIfAbsent(urn, k -> new LinkedHashMap<>()); + + for (AspectRowSummary summary : summariesByUrn.getOrDefault(urn, Collections.emptyList())) { + if (traceId.equals(summary.getTelemetryTraceId())) { + response.put( + summary.getAspectName(), TraceStorageStatus.ok(TraceWriteStatus.ACTIVE_STATE)); + remaining.remove(summary.getAspectName()); + } else if (summary.hasTimestamp() + && summary.getTimestamp() > 0 + && traceTimestampMillis <= summary.getTimestamp()) { + response.put( + summary.getAspectName(), TraceStorageStatus.ok(TraceWriteStatus.HISTORIC_STATE)); + remaining.remove(summary.getAspectName()); + } + } + + // update remaining + aspectsToResolve.put(urn, remaining); + } + + // Get remaining from Kafka + Map> kafkaResults = + mcpTraceReader.tracePendingStatuses( + aspectsToResolve, traceId, traceTimestampMillis, skipCache); + + // Merge Kafka results + kafkaResults.forEach( + (urn, statuses) -> + finalResults.computeIfAbsent(urn, k -> new LinkedHashMap<>()).putAll(statuses)); + } + + return finalResults; + } + + private static Map mergeStatus( + LinkedHashMap primaryAspectStatus, + LinkedHashMap searchAspectStatus, + boolean onlyIncludeErrors) { + + return primaryAspectStatus.entrySet().stream() + .map( + storageEntry -> { + String aspectName = storageEntry.getKey(); + TraceStorageStatus primaryStatus = storageEntry.getValue(); + TraceStorageStatus searchStatus = searchAspectStatus.get(aspectName); + TraceStatus traceStatus = + TraceStatus.builder() + .primaryStorage(primaryStatus) + .searchStorage(searchStatus) + .success(isSuccess(primaryStatus, searchStatus)) + .build(); + + // Only include this aspect if we're not filtering for errors + // or if either storage has an ERROR status + if (!onlyIncludeErrors + || TraceWriteStatus.ERROR.equals(primaryStatus.getWriteStatus()) + || TraceWriteStatus.ERROR.equals(searchStatus.getWriteStatus())) { + return Map.entry(aspectName, traceStatus); + } + return null; + }) + .filter(Objects::nonNull) + .collect( + Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (existing, replacement) -> existing, + LinkedHashMap::new)); + } + + private static boolean isSuccess( + TraceStorageStatus primaryStatus, TraceStorageStatus searchStatus) { + return !TraceWriteStatus.ERROR.equals(primaryStatus.getWriteStatus()) + && !TraceWriteStatus.ERROR.equals(searchStatus.getWriteStatus()); + } + + @Nullable + private static String extractTraceId(@Nullable SystemMetadata systemMetadata) { + if (systemMetadata != null && systemMetadata.getProperties() != null) { + return systemMetadata.getProperties().get(TraceContext.TELEMETRY_TRACE_KEY); + } + return null; + } + + private static long extractTimestamp(@Nullable String traceId, long createOnMillis) { + return Optional.ofNullable(traceId) + .map(TraceIdGenerator::getTimestampMillis) + .orElse(createOnMillis); + } + + private List extractTraceExceptions( + @Nonnull OperationContext opContext, 
FailedMetadataChangeProposal fmcp) { + if (!fmcp.getError().isEmpty()) { + try { + if (fmcp.getError().startsWith("[") && fmcp.getError().endsWith("]")) { + return opContext.getObjectMapper().readValue(fmcp.getError(), new TypeReference<>() {}); + } + } catch (Exception e) { + log.warn("Failed to deserialize: {}", fmcp.getError()); + } + return List.of(new TraceException(fmcp.getError())); + } + return List.of(new TraceException("Unable to extract trace exception")); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 6eda210baf7d4a..2971d3a37e7979 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -190,7 +190,8 @@ public void testIngestGetEntity() throws Exception { ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); MetadataChangeLog mcl = mclCaptor.getValue(); assertEquals(mcl.getEntityType(), "corpuser"); assertNull(mcl.getPreviousAspectValue()); @@ -236,7 +237,8 @@ public void testAddKey() throws Exception { ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); MetadataChangeLog mcl = mclCaptor.getValue(); assertEquals(mcl.getEntityType(), "corpuser"); assertNull(mcl.getPreviousAspectValue()); @@ -324,7 +326,11 @@ public void testIngestGetEntities() throws Exception { ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn1), Mockito.any(), mclCaptor.capture()); + .produceMetadataChangeLog( + any(OperationContext.class), + Mockito.eq(entityUrn1), + Mockito.any(), + mclCaptor.capture()); MetadataChangeLog mcl = mclCaptor.getValue(); assertEquals(mcl.getEntityType(), "corpuser"); assertNull(mcl.getPreviousAspectValue()); @@ -332,7 +338,11 @@ public void testIngestGetEntities() throws Exception { assertEquals(mcl.getChangeType(), ChangeType.UPSERT); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn2), Mockito.any(), mclCaptor.capture()); + .produceMetadataChangeLog( + any(OperationContext.class), + Mockito.eq(entityUrn2), + Mockito.any(), + mclCaptor.capture()); mcl = mclCaptor.getValue(); assertEquals(mcl.getEntityType(), "corpuser"); assertNull(mcl.getPreviousAspectValue()); @@ -405,10 +415,12 @@ public void testIngestGetEntitiesV2() throws Exception { DataTemplateUtil.areEqual(expectedKey2, new CorpUserKey(envelopedKey2.getValue().data()))); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn1), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn1), Mockito.any(), Mockito.any()); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn2), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn2), Mockito.any(), 
Mockito.any()); verifyNoMoreInteractions(_mockProducer); } @@ -476,10 +488,12 @@ public void testIngestGetEntitiesVersionedV2() throws Exception { DataTemplateUtil.areEqual(expectedKey2, new CorpUserKey(envelopedKey2.getValue().data()))); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn1), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn1), Mockito.any(), Mockito.any()); verify(_mockProducer, times(2)) - .produceMetadataChangeLog(Mockito.eq(entityUrn2), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn2), Mockito.any(), Mockito.any()); verifyNoMoreInteractions(_mockProducer); } @@ -510,7 +524,8 @@ public void testIngestAspectsGetLatestAspects() throws Exception { assertTrue(DataTemplateUtil.areEqual(writeAspect2, latestAspects.get(aspectName2))); verify(_mockProducer, times(3)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.any(), Mockito.any()); verifyNoMoreInteractions(_mockProducer); } @@ -566,7 +581,10 @@ public void testReingestAspectsGetLatestAspects() throws Exception { verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.any(), Mockito.eq(initialChangeLog)); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.any(), + Mockito.eq(initialChangeLog)); // Mockito detects the previous invocation and throws an error in verifying the second call // unless invocations are cleared @@ -576,7 +594,8 @@ public void testReingestAspectsGetLatestAspects() throws Exception { opContext, entityUrn, pairToIngest, TEST_AUDIT_STAMP, metadata1); verify(_mockProducer, times(0)) - .produceMetadataChangeLog(Mockito.any(), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.any(), Mockito.any(), Mockito.any()); verifyNoMoreInteractions(_mockProducer); } @@ -645,7 +664,10 @@ public void testReingestLineageAspect() throws Exception { verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.any(), Mockito.eq(initialChangeLog)); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.any(), + Mockito.eq(initialChangeLog)); // Mockito detects the previous invocation and throws an error in verifying the second call // unless invocations are cleared @@ -656,7 +678,10 @@ public void testReingestLineageAspect() throws Exception { verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.any(), Mockito.eq(restateChangeLog)); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.any(), + Mockito.eq(restateChangeLog)); verifyNoMoreInteractions(_mockProducer); } @@ -718,7 +743,10 @@ public void testReingestLineageProposal() throws Exception { verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.any(), Mockito.eq(initialChangeLog)); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.any(), + Mockito.eq(initialChangeLog)); // Mockito detects the previous invocation and throws an error in verifying the second call // unless invocations are cleared @@ -728,7 +756,10 @@ public void testReingestLineageProposal() throws Exception { verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.any(), Mockito.eq(restateChangeLog)); + any(OperationContext.class), + 
Mockito.eq(entityUrn), + Mockito.any(), + Mockito.eq(restateChangeLog)); verifyNoMoreInteractions(_mockProducer); } @@ -773,9 +804,15 @@ public void testAsyncProposalVersioned() throws Exception { gmce.setAspect(genericAspect); _entityServiceImpl.ingestProposal(opContext, gmce, TEST_AUDIT_STAMP, true); verify(_mockProducer, times(0)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.any(), Mockito.any()); + + ChangeItemImpl item = + ChangeItemImpl.builder().build(gmce, TEST_AUDIT_STAMP, opContext.getAspectRetriever()); + verify(_mockProducer, times(1)) - .produceMetadataChangeProposal(Mockito.eq(entityUrn), Mockito.eq(gmce)); + .produceMetadataChangeProposal( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq(item)); } @Test @@ -798,7 +835,8 @@ public void testAsyncProposalTimeseries() throws Exception { gmce.setAspect(genericAspect); _entityServiceImpl.ingestProposal(opContext, gmce, TEST_AUDIT_STAMP, true); verify(_mockProducer, times(1)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.any(), Mockito.any()); verify(_mockProducer, times(0)) .produceMetadataChangeProposal(Mockito.eq(entityUrn), Mockito.eq(gmce)); } @@ -823,7 +861,10 @@ public void testUpdateGetAspect() throws AssertionError { assertTrue(DataTemplateUtil.areEqual(writeAspect, readAspect1)); verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.eq(corpUserInfoSpec), Mockito.any()); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.eq(corpUserInfoSpec), + Mockito.any()); // Ingest CorpUserInfo Aspect #2 writeAspect.setEmail("newemail@test.com"); @@ -836,10 +877,14 @@ public void testUpdateGetAspect() throws AssertionError { assertTrue(DataTemplateUtil.areEqual(writeAspect, readAspect2)); verify(_mockProducer, times(2)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.eq(corpUserInfoSpec), Mockito.any()); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.eq(corpUserInfoSpec), + Mockito.any()); verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -877,7 +922,10 @@ public void testGetAspectAtVersion() throws AssertionError { assertTrue(DataTemplateUtil.areEqual(writtenVersionedAspect1, readAspect1)); verify(_mockProducer, times(1)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.eq(corpUserInfoSpec), Mockito.any()); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.eq(corpUserInfoSpec), + Mockito.any()); readAspect1 = _entityServiceImpl.getVersionedAspect(opContext, entityUrn, aspectName, -1); assertTrue(DataTemplateUtil.areEqual(writtenVersionedAspect1, readAspect1)); @@ -896,7 +944,10 @@ public void testGetAspectAtVersion() throws AssertionError { assertTrue(DataTemplateUtil.areEqual(writtenVersionedAspect2, readAspectVersion2)); verify(_mockProducer, times(2)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), Mockito.eq(corpUserInfoSpec), Mockito.any()); + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.eq(corpUserInfoSpec), + Mockito.any()); readAspect1 = _entityServiceImpl.getVersionedAspect(opContext, entityUrn, aspectName, -1); assertFalse(DataTemplateUtil.areEqual(writtenVersionedAspect1, readAspect1)); @@ -904,6 +955,7 @@ public void 
testGetAspectAtVersion() throws AssertionError { // check key aspect verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1228,6 +1280,7 @@ public void testIngestGetLatestAspect() throws AssertionError { ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1243,6 +1296,7 @@ public void testIngestGetLatestAspect() throws AssertionError { verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1291,7 +1345,8 @@ public void testIngestGetLatestAspect() throws AssertionError { EntityApiUtils.parseSystemMetadata(readAspectDao1.getSystemMetadata()), metadata1)); verify(_mockProducer, times(1)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); mcl = mclCaptor.getValue(); assertEquals(mcl.getEntityType(), "corpuser"); assertNotNull(mcl.getPreviousAspectValue()); @@ -1376,6 +1431,7 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { verify(_mockProducer, times(2)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1386,6 +1442,7 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1435,6 +1492,7 @@ public void testIngestSameAspect() throws AssertionError { verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1446,6 +1504,7 @@ public void testIngestSameAspect() throws AssertionError { ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(entityUrn), Mockito.eq( opContext @@ -1506,7 +1565,8 @@ public void testIngestSameAspect() throws AssertionError { EntityApiUtils.parseSystemMetadata(readAspectDao2.getSystemMetadata()), metadata3)); verify(_mockProducer, times(0)) - .produceMetadataChangeLog(Mockito.any(), Mockito.any(), Mockito.any()); + .produceMetadataChangeLog( + any(OperationContext.class), Mockito.any(), Mockito.any(), Mockito.any()); verifyNoMoreInteractions(_mockProducer); } @@ -1781,7 +1841,11 @@ public void testRestoreIndices() throws Exception { ArgumentCaptor mclCaptor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(1)) - .produceMetadataChangeLog(Mockito.eq(entityUrn), Mockito.any(), mclCaptor.capture()); + .produceMetadataChangeLog( + any(OperationContext.class), + Mockito.eq(entityUrn), + Mockito.any(), + mclCaptor.capture()); MetadataChangeLog mcl = mclCaptor.getValue(); assertEquals(mcl.getEntityType(), "dataset"); assertNull(mcl.getPreviousAspectValue()); @@ -1919,7 +1983,10 @@ public void testUIPreProcessedProposal() throws Exception { ArgumentCaptor aspectSpecCaptor = ArgumentCaptor.forClass(AspectSpec.class); verify(_mockProducer, times(4)) .produceMetadataChangeLog( - Mockito.eq(entityUrn), aspectSpecCaptor.capture(), captor.capture()); + any(OperationContext.class), + Mockito.eq(entityUrn), + aspectSpecCaptor.capture(), + 
captor.capture()); assertEquals(UI_SOURCE, captor.getValue().getSystemMetadata().getProperties().get(APP_SOURCE)); assertEquals( aspectSpecCaptor.getAllValues().stream() @@ -1963,6 +2030,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { ArgumentCaptor captor = ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(firstPropertyUrn), Mockito.eq(structuredPropertiesDefinitionAspect), captor.capture()); @@ -2050,6 +2118,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { ArgumentCaptor.forClass(MetadataChangeLog.class); verify(_mockProducer, times(1)) .produceMetadataChangeLog( + any(OperationContext.class), Mockito.eq(secondPropertyUrn), Mockito.eq(structuredPropertiesDefinitionAspect), secondCaptor.capture()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java index d47652e2dbca55..f2d3446392bf10 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java @@ -56,7 +56,8 @@ public void init() { _serverBuilder.newServer(); _driver = GraphDatabase.driver(_serverBuilder.boltURI()); _client = - new Neo4jGraphService(new LineageRegistry(SnapshotEntityRegistry.getInstance()), _driver); + new Neo4jGraphService( + operationContext, new LineageRegistry(SnapshotEntityRegistry.getInstance()), _driver); _client.clear(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/trace/BaseKafkaTraceReaderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/trace/BaseKafkaTraceReaderTest.java new file mode 100644 index 00000000000000..c5217475362295 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/trace/BaseKafkaTraceReaderTest.java @@ -0,0 +1,239 @@ +package com.linkedin.metadata.trace; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyCollection; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.data.template.StringMap; +import com.linkedin.metadata.systemmetadata.TraceStorageStatus; +import com.linkedin.metadata.systemmetadata.TraceWriteStatus; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import io.datahubproject.metadata.context.TraceContext; +import java.io.IOException; +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import javax.annotation.Nullable; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.DescribeTopicsResult; +import org.apache.kafka.clients.admin.ListConsumerGroupOffsetsResult; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import 
org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetAndTimestamp; +import org.apache.kafka.common.KafkaFuture; +import org.apache.kafka.common.Node; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.TopicPartitionInfo; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public abstract class BaseKafkaTraceReaderTest { + protected static final String TOPIC_NAME = "test-topic"; + protected static final String CONSUMER_GROUP = "test-group"; + protected static final String TRACE_ID = "test-trace-id"; + protected static final String ASPECT_NAME = "status"; + protected static final Urn TEST_URN = UrnUtils.getUrn("urn:li:container:123"); + + @Mock protected AdminClient adminClient; + @Mock protected Consumer consumer; + protected ExecutorService executorService; + protected KafkaTraceReader traceReader; + + abstract KafkaTraceReader buildTraceReader(); + + abstract M buildMessage(@Nullable SystemMetadata systemMetadata); + + abstract GenericRecord toGenericRecord(M message) throws IOException; + + abstract M fromGenericRecord(GenericRecord genericRecord) throws IOException; + + @BeforeMethod(alwaysRun = true) + public void setup() { + MockitoAnnotations.openMocks(this); + executorService = Executors.newSingleThreadExecutor(); + traceReader = buildTraceReader(); + setupDefaultMocks(); + } + + protected void setupDefaultMocks() { + // Mock topic description + Node mockNode = new Node(0, "localhost", 9092); + TopicPartitionInfo partitionInfo = + new TopicPartitionInfo( + 0, mockNode, Collections.singletonList(mockNode), Collections.singletonList(mockNode)); + TopicDescription topicDescription = + new TopicDescription(TOPIC_NAME, false, Collections.singletonList(partitionInfo)); + + DescribeTopicsResult mockDescribeTopicsResult = mock(DescribeTopicsResult.class); + when(mockDescribeTopicsResult.all()) + .thenReturn( + KafkaFuture.completedFuture(Collections.singletonMap(TOPIC_NAME, topicDescription))); + when(adminClient.describeTopics(anyCollection())).thenReturn(mockDescribeTopicsResult); + + // Mock consumer group offset lookup + ListConsumerGroupOffsetsResult mockOffsetResult = mock(ListConsumerGroupOffsetsResult.class); + when(adminClient.listConsumerGroupOffsets(CONSUMER_GROUP)).thenReturn(mockOffsetResult); + when(mockOffsetResult.partitionsToOffsetAndMetadata()) + .thenReturn( + KafkaFuture.completedFuture( + Collections.singletonMap( + new TopicPartition(TOPIC_NAME, 0), new OffsetAndMetadata(100L)))); + + // Mock consumer behavior + when(consumer.poll(any(Duration.class))).thenReturn(mock(ConsumerRecords.class)); + } + + @Test + public void testRead_WithValidGenericRecord() throws Exception { + // Arrange + M expectedMessage = buildMessage(null); + GenericRecord genericRecord = toGenericRecord(expectedMessage); + + // Act + Optional result = traceReader.read(genericRecord); + + // Assert + assertTrue(result.isPresent()); + assertEquals(result.get(), expectedMessage); + } + + @Test + public void testRead_WithNullGenericRecord() { + Optional result = traceReader.read(null); + assertFalse(result.isPresent()); + } + + @Test + public void testMatchConsumerRecord_WithMatchingTraceAndAspect() throws IOException { + // Arrange + ConsumerRecord mockConsumerRecord = mock(ConsumerRecord.class); + + SystemMetadata systemMetadata = new SystemMetadata(); + Map properties = 
new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TRACE_ID); + systemMetadata.setProperties(new StringMap(properties)); + + GenericRecord genericRecord = toGenericRecord(buildMessage(systemMetadata)); + when(mockConsumerRecord.value()).thenReturn(genericRecord); + + // Act + Optional, SystemMetadata>> result = + traceReader.matchConsumerRecord(mockConsumerRecord, TRACE_ID, ASPECT_NAME); + + // Assert + assertTrue(result.isPresent()); + assertEquals(result.get().getFirst(), mockConsumerRecord); + assertEquals(result.get().getSecond(), systemMetadata); + } + + @Test + public void testTracePendingStatuses() throws IOException { + // Arrange + List aspectNames = Collections.singletonList(ASPECT_NAME); + Map> urnAspectPairs = Collections.singletonMap(TEST_URN, aspectNames); + long timestamp = System.currentTimeMillis(); + + // Mock topic partition + TopicPartition topicPartition = new TopicPartition(TOPIC_NAME, 0); + + // Mock consumer group offset lookup (lower offset) + OffsetAndMetadata offsetAndMetadata = new OffsetAndMetadata(100L); + ListConsumerGroupOffsetsResult mockOffsetResult = mock(ListConsumerGroupOffsetsResult.class); + when(adminClient.listConsumerGroupOffsets(CONSUMER_GROUP)).thenReturn(mockOffsetResult); + when(mockOffsetResult.partitionsToOffsetAndMetadata()) + .thenReturn( + KafkaFuture.completedFuture( + Collections.singletonMap(topicPartition, offsetAndMetadata))); + + // Mock offset lookup by timestamp + when(consumer.offsetsForTimes(any())) + .thenReturn( + Collections.singletonMap(topicPartition, new OffsetAndTimestamp(150L, timestamp))); + + // Create system metadata with trace ID + SystemMetadata systemMetadata = new SystemMetadata(); + Map properties = new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TRACE_ID); + systemMetadata.setProperties(new StringMap(properties)); + + // Build message with metadata + M message = buildMessage(systemMetadata); + GenericRecord genericRecord = toGenericRecord(message); + + // Mock consumer record fetch with higher offset than consumer offset + ConsumerRecord mockRecord = + new ConsumerRecord<>(TOPIC_NAME, 0, 150L, TEST_URN.toString(), genericRecord); + ConsumerRecords mockRecords = mock(ConsumerRecords.class); + when(mockRecords.isEmpty()).thenReturn(false); + when(mockRecords.records(any(TopicPartition.class))) + .thenReturn(Collections.singletonList(mockRecord)); + when(consumer.poll(any(Duration.class))).thenReturn(mockRecords); + + // Act + Map> result = + traceReader.tracePendingStatuses(urnAspectPairs, TRACE_ID, timestamp); + + // Assert + assertTrue(result.containsKey(TEST_URN)); + assertTrue(result.get(TEST_URN).containsKey(ASPECT_NAME)); + assertEquals(result.get(TEST_URN).get(ASPECT_NAME).getWriteStatus(), TraceWriteStatus.PENDING); + } + + @Test + public void testFindMessages() throws Exception { + // Arrange + List aspectNames = Collections.singletonList(ASPECT_NAME); + Map> urnAspectPairs = Collections.singletonMap(TEST_URN, aspectNames); + long timestamp = System.currentTimeMillis(); + + // Mock topic partition assignment and offsets + TopicPartition topicPartition = new TopicPartition(TOPIC_NAME, 0); + OffsetAndTimestamp offsetAndTimestamp = new OffsetAndTimestamp(100L, timestamp); + when(consumer.offsetsForTimes(any())) + .thenReturn(Collections.singletonMap(topicPartition, offsetAndTimestamp)); + + // Mock system metadata + SystemMetadata systemMetadata = new SystemMetadata(); + Map properties = new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TRACE_ID); + 
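// (Aside, not part of this patch) The trace readers correlate a Kafka message with a
// request by reading SystemMetadata.properties under TraceContext.TELEMETRY_TRACE_KEY;
// this test stamps the key the same way the write path is presumably expected to, and
// KafkaTraceReader.traceIdMatch() matches on exactly this property.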
systemMetadata.setProperties(new StringMap(properties)); + M message = buildMessage(systemMetadata); + + // Mock consumer record fetch + ConsumerRecord mockRecord = + new ConsumerRecord<>(TOPIC_NAME, 0, 100L, TEST_URN.toString(), toGenericRecord(message)); + ConsumerRecords mockRecords = mock(ConsumerRecords.class); + when(mockRecords.records(any(TopicPartition.class))) + .thenReturn(Collections.singletonList(mockRecord)); + when(consumer.poll(any(Duration.class))).thenReturn(mockRecords); + + // Act + Map, SystemMetadata>>> result = + traceReader.findMessages(urnAspectPairs, TRACE_ID, timestamp); + + // Assert + assertTrue(result.containsKey(TEST_URN)); + assertTrue(result.get(TEST_URN).containsKey(ASPECT_NAME)); + assertEquals(result.get(TEST_URN).get(ASPECT_NAME).getFirst(), mockRecord); + assertEquals(result.get(TEST_URN).get(ASPECT_NAME).getSecond(), systemMetadata); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/trace/MCLTraceReaderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/trace/MCLTraceReaderTest.java new file mode 100644 index 00000000000000..9a4afd74917d4b --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/trace/MCLTraceReaderTest.java @@ -0,0 +1,93 @@ +package com.linkedin.metadata.trace; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +import com.linkedin.data.template.SetMode; +import com.linkedin.data.template.StringMap; +import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.EventUtils; +import com.linkedin.mxe.MetadataChangeLog; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import io.datahubproject.metadata.context.TraceContext; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.jetbrains.annotations.Nullable; +import org.testng.annotations.Test; + +public class MCLTraceReaderTest extends BaseKafkaTraceReaderTest { + @Override + KafkaTraceReader buildTraceReader() { + return MCLTraceReader.builder() + .adminClient(adminClient) + .consumerSupplier(() -> consumer) + .pollDurationMs(100) + .pollMaxAttempts(3) + .executorService(executorService) + .timeoutSeconds(5) + .topicName(TOPIC_NAME) + .consumerGroupId(CONSUMER_GROUP) + .build(); + } + + @Override + MetadataChangeLog buildMessage(@Nullable SystemMetadata systemMetadata) { + return new MetadataChangeLog() + .setAspectName(ASPECT_NAME) + .setEntityType(TEST_URN.getEntityType()) + .setChangeType(ChangeType.UPSERT) + .setEntityUrn(TEST_URN) + .setSystemMetadata(systemMetadata, SetMode.IGNORE_NULL); + } + + @Override + GenericRecord toGenericRecord(MetadataChangeLog message) throws IOException { + return EventUtils.pegasusToAvroMCL(message); + } + + @Override + MetadataChangeLog fromGenericRecord(GenericRecord genericRecord) throws IOException { + return EventUtils.avroToPegasusMCL(genericRecord); + } + + @Test + public void testMCLRead() throws Exception { + MetadataChangeLog expectedMCL = buildMessage(null); + + GenericRecord genericRecord = toGenericRecord(expectedMCL); + + Optional result = traceReader.read(genericRecord); + + assertTrue(result.isPresent()); + assertEquals(result.get().getAspectName(), ASPECT_NAME); + } + + @Test + public void testMCLMatchConsumerRecord() throws Exception { + ConsumerRecord 
+ ConsumerRecord<String, GenericRecord> mockConsumerRecord = mock(ConsumerRecord.class);
+
+ SystemMetadata systemMetadata = new SystemMetadata();
+ Map<String, String> properties = new HashMap<>();
+ properties.put(TraceContext.TELEMETRY_TRACE_KEY, TRACE_ID);
+ systemMetadata.setProperties(new StringMap(properties));
+
+ MetadataChangeLog mcl = buildMessage(systemMetadata);
+
+ GenericRecord genericRecord = toGenericRecord(mcl);
+ when(mockConsumerRecord.value()).thenReturn(genericRecord);
+
+ Optional<Pair<ConsumerRecord<String, GenericRecord>, SystemMetadata>> result =
+ traceReader.matchConsumerRecord(mockConsumerRecord, TRACE_ID, ASPECT_NAME);
+
+ assertTrue(result.isPresent());
+ assertEquals(result.get().getFirst(), mockConsumerRecord);
+ assertEquals(result.get().getSecond(), systemMetadata);
+ }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/trace/MCPFailedTraceReaderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/trace/MCPFailedTraceReaderTest.java
new file mode 100644
index 00000000000000..8d8fedfc98c2b3
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/trace/MCPFailedTraceReaderTest.java
@@ -0,0 +1,98 @@
+package com.linkedin.metadata.trace;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import com.linkedin.data.template.SetMode;
+import com.linkedin.data.template.StringMap;
+import com.linkedin.events.metadata.ChangeType;
+import com.linkedin.metadata.EventUtils;
+import com.linkedin.mxe.FailedMetadataChangeProposal;
+import com.linkedin.mxe.MetadataChangeProposal;
+import com.linkedin.mxe.SystemMetadata;
+import com.linkedin.util.Pair;
+import io.datahubproject.metadata.context.TraceContext;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.jetbrains.annotations.Nullable;
+import org.testng.annotations.Test;
+
+public class MCPFailedTraceReaderTest
+ extends BaseKafkaTraceReaderTest<FailedMetadataChangeProposal> {
+ @Override
+ KafkaTraceReader<FailedMetadataChangeProposal> buildTraceReader() {
+ return MCPFailedTraceReader.builder()
+ .adminClient(adminClient)
+ .consumerSupplier(() -> consumer)
+ .pollDurationMs(100)
+ .pollMaxAttempts(3)
+ .executorService(executorService)
+ .timeoutSeconds(5)
+ .topicName(TOPIC_NAME)
+ .consumerGroupId(CONSUMER_GROUP)
+ .build();
+ }
+
+ @Override
+ FailedMetadataChangeProposal buildMessage(@Nullable SystemMetadata systemMetadata) {
+ return new FailedMetadataChangeProposal()
+ .setError("Test failure error")
+ .setMetadataChangeProposal(
+ new MetadataChangeProposal()
+ .setAspectName(ASPECT_NAME)
+ .setEntityType(TEST_URN.getEntityType())
+ .setChangeType(ChangeType.UPSERT)
+ .setEntityUrn(TEST_URN)
+ .setSystemMetadata(systemMetadata, SetMode.IGNORE_NULL));
+ }
+
+ @Override
+ GenericRecord toGenericRecord(FailedMetadataChangeProposal message) throws IOException {
+ return EventUtils.pegasusToAvroFailedMCP(message);
+ }
+
+ @Override
+ FailedMetadataChangeProposal fromGenericRecord(GenericRecord genericRecord) throws IOException {
+ return EventUtils.avroToPegasusFailedMCP(genericRecord);
+ }
+
+ @Test
+ public void testFailedMCPRead() throws Exception {
+ FailedMetadataChangeProposal expectedMCP = buildMessage(null);
+
+ GenericRecord genericRecord = toGenericRecord(expectedMCP);
+
+ Optional<FailedMetadataChangeProposal> result = traceReader.read(genericRecord);
+
+ assertTrue(result.isPresent());
+ assertEquals(result.get().getMetadataChangeProposal().getAspectName(),
+ ASPECT_NAME);
+ }
+
+ @Test
+ public void testFailedMCPMatchConsumerRecord() throws Exception {
+ ConsumerRecord<String, GenericRecord> mockConsumerRecord = mock(ConsumerRecord.class);
+
+ SystemMetadata systemMetadata = new SystemMetadata();
+ Map<String, String> properties = new HashMap<>();
+ properties.put(TraceContext.TELEMETRY_TRACE_KEY, TRACE_ID);
+ systemMetadata.setProperties(new StringMap(properties));
+
+ FailedMetadataChangeProposal fmcp = buildMessage(systemMetadata);
+
+ GenericRecord genericRecord = toGenericRecord(fmcp);
+ when(mockConsumerRecord.value()).thenReturn(genericRecord);
+
+ Optional<Pair<ConsumerRecord<String, GenericRecord>, SystemMetadata>> result =
+ traceReader.matchConsumerRecord(mockConsumerRecord, TRACE_ID, ASPECT_NAME);
+
+ assertTrue(result.isPresent());
+ assertEquals(result.get().getFirst(), mockConsumerRecord);
+ assertEquals(result.get().getSecond(), systemMetadata);
+ }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/trace/MCPTraceReaderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/trace/MCPTraceReaderTest.java
new file mode 100644
index 00000000000000..e2aa9730267ddf
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/trace/MCPTraceReaderTest.java
@@ -0,0 +1,93 @@
+package com.linkedin.metadata.trace;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import com.linkedin.data.template.SetMode;
+import com.linkedin.data.template.StringMap;
+import com.linkedin.events.metadata.ChangeType;
+import com.linkedin.metadata.EventUtils;
+import com.linkedin.mxe.MetadataChangeProposal;
+import com.linkedin.mxe.SystemMetadata;
+import com.linkedin.util.Pair;
+import io.datahubproject.metadata.context.TraceContext;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.jetbrains.annotations.Nullable;
+import org.testng.annotations.Test;
+
+public final class MCPTraceReaderTest extends BaseKafkaTraceReaderTest<MetadataChangeProposal> {
+ @Override
+ KafkaTraceReader<MetadataChangeProposal> buildTraceReader() {
+ return MCPTraceReader.builder()
+ .adminClient(adminClient)
+ .consumerSupplier(() -> consumer)
+ .pollDurationMs(100)
+ .pollMaxAttempts(3)
+ .executorService(executorService)
+ .timeoutSeconds(5)
+ .topicName(TOPIC_NAME)
+ .consumerGroupId(CONSUMER_GROUP)
+ .build();
+ }
+
+ @Override
+ MetadataChangeProposal buildMessage(@Nullable SystemMetadata systemMetadata) {
+ return new MetadataChangeProposal()
+ .setAspectName(ASPECT_NAME)
+ .setEntityType(TEST_URN.getEntityType())
+ .setChangeType(ChangeType.UPSERT)
+ .setEntityUrn(TEST_URN)
+ .setSystemMetadata(systemMetadata, SetMode.IGNORE_NULL);
+ }
+
+ @Override
+ GenericRecord toGenericRecord(MetadataChangeProposal message) throws IOException {
+ return EventUtils.pegasusToAvroMCP(message);
+ }
+
+ @Override
+ MetadataChangeProposal fromGenericRecord(GenericRecord genericRecord) throws IOException {
+ return EventUtils.avroToPegasusMCP(genericRecord);
+ }
+
+ @Test
+ public void testMCPRead() throws Exception {
+ MetadataChangeProposal expectedMCP = buildMessage(null);
+
+ GenericRecord genericRecord = toGenericRecord(expectedMCP);
+
+ Optional<MetadataChangeProposal> result = traceReader.read(genericRecord);
+
+ assertTrue(result.isPresent());
+ assertEquals(result.get().getAspectName(), ASPECT_NAME);
+ }
+
+ @Test
+ public void testMCPMatchConsumerRecord() throws Exception {
+ ConsumerRecord<String, GenericRecord> mockConsumerRecord =
mock(ConsumerRecord.class); + + SystemMetadata systemMetadata = new SystemMetadata(); + Map properties = new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TRACE_ID); + systemMetadata.setProperties(new StringMap(properties)); + + MetadataChangeProposal mcp = buildMessage(systemMetadata); + + GenericRecord genericRecord = toGenericRecord(mcp); + when(mockConsumerRecord.value()).thenReturn(genericRecord); + + Optional, SystemMetadata>> result = + traceReader.matchConsumerRecord(mockConsumerRecord, TRACE_ID, ASPECT_NAME); + + assertTrue(result.isPresent()); + assertEquals(result.get().getFirst(), mockConsumerRecord); + assertEquals(result.get().getSecond(), systemMetadata); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java new file mode 100644 index 00000000000000..abe65d48b3410a --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java @@ -0,0 +1,350 @@ +package com.linkedin.metadata.trace; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.anySet; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; + +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.StringMap; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.EventUtils; +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.run.AspectRowSummary; +import com.linkedin.metadata.systemmetadata.SystemMetadataService; +import com.linkedin.metadata.systemmetadata.TraceStatus; +import com.linkedin.metadata.systemmetadata.TraceStorageStatus; +import com.linkedin.metadata.systemmetadata.TraceWriteStatus; +import com.linkedin.mxe.FailedMetadataChangeProposal; +import com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.TraceContext; +import io.datahubproject.metadata.context.TraceIdGenerator; +import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.time.Instant; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class TraceServiceImplTest { + private static final String TEST_TRACE_ID_FUTURE = + TraceContext.TRACE_ID_GENERATOR.generateTraceId(Instant.now().toEpochMilli() + 1000); + private static final String TEST_TRACE_ID = 
TraceContext.TRACE_ID_GENERATOR.generateTraceId(); + protected static final String ASPECT_NAME = "status"; + protected static final String TIMESERIES_ASPECT_NAME = "datasetProfile"; + protected static final Urn TEST_URN = + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:kafka,PageViewEvent,PROD)"); + + @Mock private SystemMetadataService systemMetadataService; + @Mock private EntityService entityService; + @Mock private MCPTraceReader mcpTraceReader; + @Mock private MCPFailedTraceReader mcpFailedTraceReader; + @Mock private MCLTraceReader mclVersionedTraceReader; + @Mock private MCLTraceReader mclTimeseriesTraceReader; + + private TraceServiceImpl traceService; + private static final OperationContext operationContext = + TestOperationContexts.systemContextNoSearchAuthorization(); + + @BeforeMethod + public void setup() throws Exception { + MockitoAnnotations.openMocks(this); + + traceService = + TraceServiceImpl.builder() + .entityRegistry(operationContext.getEntityRegistry()) + .systemMetadataService(systemMetadataService) + .entityService(entityService) + .mcpTraceReader(mcpTraceReader) + .mcpFailedTraceReader(mcpFailedTraceReader) + .mclVersionedTraceReader(mclVersionedTraceReader) + .mclTimeseriesTraceReader(mclTimeseriesTraceReader) + .build(); + } + + @Test + public void testTraceWithActiveState() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(ASPECT_NAME)); + + // Mock entityService response for primary storage + SystemMetadata systemMetadata = new SystemMetadata(); + Map properties = new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TEST_TRACE_ID); + systemMetadata.setProperties(new StringMap(properties)); + + EnvelopedAspect envelopedAspect = new EnvelopedAspect(); + envelopedAspect.setCreated(new AuditStamp().setTime(Instant.now().toEpochMilli())); + envelopedAspect.setSystemMetadata(systemMetadata); + + EntityResponse entityResponse = new EntityResponse(); + entityResponse.setAspects( + new EnvelopedAspectMap(Collections.singletonMap(ASPECT_NAME, envelopedAspect))); + entityResponse.setEntityName(TEST_URN.getEntityType()); + entityResponse.setUrn(TEST_URN); + + when(entityService.getEntitiesV2(any(), anyString(), anySet(), anySet(), anyBoolean())) + .thenReturn(Collections.singletonMap(TEST_URN, entityResponse)); + + // Mock search storage response + AspectRowSummary summary = mock(AspectRowSummary.class); + when(summary.getUrn()).thenReturn(TEST_URN.toString()); + when(summary.getAspectName()).thenReturn(ASPECT_NAME); + when(summary.getTelemetryTraceId()).thenReturn(TEST_TRACE_ID); + when(systemMetadataService.findAspectsByUrn(eq(TEST_URN), anyList(), eq(true))) + .thenReturn(Collections.singletonList(summary)); + + // Act + Map> result = + traceService.trace(operationContext, TEST_TRACE_ID, aspectNames, false, false); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(ASPECT_NAME)); + + TraceStatus status = urnStatus.get(ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.ACTIVE_STATE); + assertEquals(status.getSearchStorage().getWriteStatus(), TraceWriteStatus.ACTIVE_STATE); + assertTrue(status.isSuccess()); + } + + @Test + public void testTraceWithPendingStatus() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(ASPECT_NAME)); + + // Mock empty entity response (not in SQL) + 
when(entityService.getEntitiesV2(any(), anyString(), anySet(), anySet(), anyBoolean())) + .thenReturn(Collections.emptyMap()); + + // Mock pending status from Kafka + Map pendingStatus = new LinkedHashMap<>(); + pendingStatus.put( + ASPECT_NAME, + TraceStorageStatus.ok(TraceWriteStatus.PENDING, "Consumer has not processed offset.")); + + when(mcpTraceReader.tracePendingStatuses(any(), eq(TEST_TRACE_ID), any(), anyBoolean())) + .thenReturn(Collections.singletonMap(TEST_URN, pendingStatus)); + + // Act + Map> result = + traceService.trace(operationContext, TEST_TRACE_ID, aspectNames, false, false); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(ASPECT_NAME)); + + TraceStatus status = urnStatus.get(ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.PENDING); + assertTrue(status.isSuccess()); + } + + @Test + public void testTraceWithErrorStatus() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(ASPECT_NAME)); + + // Mock empty entity response + when(entityService.getEntitiesV2(any(), anyString(), anySet(), anySet(), anyBoolean())) + .thenReturn(Collections.emptyMap()); + + // Mock error status from Kafka + Map errorStatus = new LinkedHashMap<>(); + errorStatus.put( + ASPECT_NAME, TraceStorageStatus.fail(TraceWriteStatus.ERROR, "Failed to process message.")); + + when(mcpTraceReader.tracePendingStatuses(any(), eq(TEST_TRACE_ID), any(), anyBoolean())) + .thenReturn(Collections.singletonMap(TEST_URN, errorStatus)); + + // Act + Map> result = + traceService.trace(operationContext, TEST_TRACE_ID, aspectNames, true, true); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(ASPECT_NAME)); + + TraceStatus status = urnStatus.get(ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.ERROR); + assertFalse(status.isSuccess()); + } + + @Test + public void testTraceWithTimeseriesAspect() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(TIMESERIES_ASPECT_NAME)); + + // Act + Map> result = + traceService.trace(operationContext, TEST_TRACE_ID, aspectNames, false, false); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(TIMESERIES_ASPECT_NAME)); + + TraceStatus status = urnStatus.get(TIMESERIES_ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.NO_OP); + assertEquals( + status.getSearchStorage().getWriteStatus(), TraceWriteStatus.TRACE_NOT_IMPLEMENTED); + assertTrue(status.isSuccess()); + } + + @Test + public void testTraceWithHistoricState() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(ASPECT_NAME)); + + // Mock primary storage with historic state + SystemMetadata systemMetadata = new SystemMetadata(); + Map properties = new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TEST_TRACE_ID_FUTURE); + systemMetadata.setProperties(new StringMap(properties)); + + EnvelopedAspect envelopedAspect = new EnvelopedAspect(); + envelopedAspect.setCreated( + new AuditStamp() + .setTime( + TraceIdGenerator.getTimestampMillis(TEST_TRACE_ID_FUTURE))); // Future timestamp + 
envelopedAspect.setSystemMetadata(systemMetadata); + + EntityResponse entityResponse = new EntityResponse(); + entityResponse.setAspects( + new EnvelopedAspectMap(Collections.singletonMap(ASPECT_NAME, envelopedAspect))); + entityResponse.setEntityName(TEST_URN.getEntityType()); + entityResponse.setUrn(TEST_URN); + + when(entityService.getEntitiesV2(any(), anyString(), anySet(), anySet(), anyBoolean())) + .thenReturn(Collections.singletonMap(TEST_URN, entityResponse)); + + // Mock search storage with historic state + AspectRowSummary summary = mock(AspectRowSummary.class); + when(summary.getUrn()).thenReturn(TEST_URN.toString()); + when(summary.getAspectName()).thenReturn(ASPECT_NAME); + when(summary.hasTimestamp()).thenReturn(true); + when(summary.getTimestamp()) + .thenReturn(TraceIdGenerator.getTimestampMillis(TEST_TRACE_ID_FUTURE)); // Future timestamp + when(summary.getTelemetryTraceId()).thenReturn(TEST_TRACE_ID_FUTURE); + + when(systemMetadataService.findAspectsByUrn(eq(TEST_URN), anyList(), eq(true))) + .thenReturn(Collections.singletonList(summary)); + + // Act + Map> result = + traceService.trace(operationContext, TEST_TRACE_ID, aspectNames, false, false); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(ASPECT_NAME)); + + TraceStatus status = urnStatus.get(ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.HISTORIC_STATE); + assertEquals(status.getSearchStorage().getWriteStatus(), TraceWriteStatus.HISTORIC_STATE); + assertTrue(status.isSuccess()); + } + + @Test + public void testTraceWithFailedMessage() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(ASPECT_NAME)); + + // Mock primary storage with ERROR status + Map errorStatus = new LinkedHashMap<>(); + errorStatus.put(ASPECT_NAME, TraceStorageStatus.fail(TraceWriteStatus.ERROR, "Initial error")); + + when(mcpTraceReader.tracePendingStatuses(any(), eq(TEST_TRACE_ID), any(), anyBoolean())) + .thenReturn(Collections.singletonMap(TEST_URN, errorStatus)); + + // Mock the failed message in MCPFailedTraceReader + SystemMetadata failedMetadata = new SystemMetadata(); + Map properties = new HashMap<>(); + properties.put(TraceContext.TELEMETRY_TRACE_KEY, TEST_TRACE_ID); + failedMetadata.setProperties(new StringMap(properties)); + + FailedMetadataChangeProposal failedMCP = + new FailedMetadataChangeProposal() + .setError( + "[{\"message\":\"Processing failed: Test error message\",\"exceptionClass\":\"java.lang.IllegalArgumentException\"}]") + .setMetadataChangeProposal( + new MetadataChangeProposal() + .setEntityUrn(TEST_URN) + .setChangeType(ChangeType.UPSERT) + .setAspectName(ASPECT_NAME) + .setEntityType(TEST_URN.getEntityType()) + .setSystemMetadata(failedMetadata)); + + GenericRecord genericRecord = EventUtils.pegasusToAvroFailedMCP(failedMCP); + ConsumerRecord failedRecord = mock(ConsumerRecord.class); + when(failedRecord.value()).thenReturn(genericRecord); + + Map, SystemMetadata>> failedMessages = + Collections.singletonMap(ASPECT_NAME, Pair.of(failedRecord, failedMetadata)); + + when(mcpFailedTraceReader.findMessages(any(), eq(TEST_TRACE_ID), any())) + .thenReturn(Collections.singletonMap(TEST_URN, failedMessages)); + + // Mock failed record read with error message + when(mcpFailedTraceReader.read(eq(genericRecord))).thenReturn(Optional.of(failedMCP)); + + // Act + Map> result = + traceService.trace(operationContext, 
TEST_TRACE_ID, aspectNames, true, true); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(ASPECT_NAME)); + + TraceStatus status = urnStatus.get(ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.ERROR); + assertNotNull(status.getPrimaryStorage().getWriteExceptions()); + assertEquals(status.getPrimaryStorage().getWriteExceptions().size(), 1); + assertEquals( + status.getPrimaryStorage().getWriteExceptions().get(0).getMessage(), + "Processing failed: Test error message"); + assertEquals( + status.getPrimaryStorage().getWriteExceptions().get(0).getExceptionClass(), + "java.lang.IllegalArgumentException"); + assertFalse(status.isSuccess()); + } +} diff --git a/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MAEOpenTelemetryConfig.java b/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MAEOpenTelemetryConfig.java new file mode 100644 index 00000000000000..e1761755fd3adc --- /dev/null +++ b/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MAEOpenTelemetryConfig.java @@ -0,0 +1,21 @@ +package com.linkedin.metadata.kafka; + +import com.linkedin.gms.factory.system_telemetry.OpenTelemetryBaseFactory; +import io.datahubproject.metadata.context.TraceContext; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class MAEOpenTelemetryConfig extends OpenTelemetryBaseFactory { + + @Override + protected String getApplicationComponent() { + return "datahub-mae-consumer"; + } + + @Bean + @Override + protected TraceContext traceContext() { + return super.traceContext(); + } +} diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle index b4990e289b10df..08c712f6167b40 100644 --- a/metadata-jobs/mae-consumer/build.gradle +++ b/metadata-jobs/mae-consumer/build.gradle @@ -33,6 +33,7 @@ dependencies { implementation externalDependency.springKafka implementation externalDependency.annotationApi + implementation externalDependency.opentelemetrySdkTrace implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/DataHubUsageEventsProcessor.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/DataHubUsageEventsProcessor.java index ce7376f1f8d662..e561a36eaca8fd 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/DataHubUsageEventsProcessor.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/DataHubUsageEventsProcessor.java @@ -2,7 +2,6 @@ import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.events.metadata.ChangeType; import com.linkedin.gms.factory.kafka.SimpleKafkaConsumerFactory; import com.linkedin.metadata.kafka.config.DataHubUsageEventsProcessorCondition; @@ -12,11 +11,13 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.mxe.Topics; +import io.datahubproject.metadata.context.OperationContext; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.Optional; import lombok.extern.slf4j.Slf4j; import org.apache.kafka.clients.consumer.ConsumerRecord; +import 
org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Conditional; import org.springframework.context.annotation.Import; import org.springframework.kafka.annotation.EnableKafka; @@ -33,6 +34,7 @@ public class DataHubUsageEventsProcessor { private final ElasticsearchConnector elasticSearchConnector; private final DataHubUsageEventTransformer dataHubUsageEventTransformer; private final String indexName; + private final OperationContext systemOperationContext; private final Histogram kafkaLagStats = MetricUtils.get().histogram(MetricRegistry.name(this.getClass(), "kafkaLag")); @@ -40,10 +42,12 @@ public class DataHubUsageEventsProcessor { public DataHubUsageEventsProcessor( ElasticsearchConnector elasticSearchConnector, DataHubUsageEventTransformer dataHubUsageEventTransformer, - IndexConvention indexConvention) { + IndexConvention indexConvention, + @Qualifier("systemOperationContext") OperationContext systemOperationContext) { this.elasticSearchConnector = elasticSearchConnector; this.dataHubUsageEventTransformer = dataHubUsageEventTransformer; this.indexName = indexConvention.getIndexName("datahub_usage_event"); + this.systemOperationContext = systemOperationContext; } @KafkaListener( @@ -51,31 +55,36 @@ public DataHubUsageEventsProcessor( topics = "${DATAHUB_USAGE_EVENT_NAME:" + Topics.DATAHUB_USAGE_EVENT + "}", containerFactory = "simpleKafkaConsumer") public void consume(final ConsumerRecord consumerRecord) { - try (Timer.Context i = MetricUtils.timer(this.getClass(), "consume").time()) { - kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); - final String record = consumerRecord.value(); + systemOperationContext.withSpan( + "consume", + () -> { + kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); + final String record = consumerRecord.value(); - log.info( - "Got DHUE event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", - consumerRecord.key(), - consumerRecord.topic(), - consumerRecord.partition(), - consumerRecord.offset(), - consumerRecord.serializedValueSize(), - consumerRecord.timestamp()); + log.info( + "Got DHUE event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", + consumerRecord.key(), + consumerRecord.topic(), + consumerRecord.partition(), + consumerRecord.offset(), + consumerRecord.serializedValueSize(), + consumerRecord.timestamp()); - Optional eventDocument = - dataHubUsageEventTransformer.transformDataHubUsageEvent(record); - if (eventDocument.isEmpty()) { - log.warn("Failed to apply usage events transform to record: {}", record); - return; - } - JsonElasticEvent elasticEvent = new JsonElasticEvent(eventDocument.get().getDocument()); - elasticEvent.setId(generateDocumentId(eventDocument.get().getId(), consumerRecord.offset())); - elasticEvent.setIndex(indexName); - elasticEvent.setActionType(ChangeType.CREATE); - elasticSearchConnector.feedElasticEvent(elasticEvent); - } + Optional eventDocument = + dataHubUsageEventTransformer.transformDataHubUsageEvent(record); + if (eventDocument.isEmpty()) { + log.warn("Failed to apply usage events transform to record: {}", record); + return; + } + JsonElasticEvent elasticEvent = new JsonElasticEvent(eventDocument.get().getDocument()); + elasticEvent.setId( + generateDocumentId(eventDocument.get().getId(), consumerRecord.offset())); + elasticEvent.setIndex(indexName); + elasticEvent.setActionType(ChangeType.CREATE); + elasticSearchConnector.feedElasticEvent(elasticEvent); 
+ }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "consume")); } /** diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListener.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListener.java index a2d59023ba5ce2..ec33ea12209310 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListener.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MCLKafkaListener.java @@ -7,7 +7,6 @@ import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.common.urn.Urn; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.EventUtils; @@ -15,6 +14,8 @@ import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.mxe.MetadataChangeLog; import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -33,11 +34,13 @@ public class MCLKafkaListener { private final String consumerGroupId; private final List hooks; + private final OperationContext systemOperationContext; public MCLKafkaListener( OperationContext systemOperationContext, String consumerGroup, List hooks) { + this.systemOperationContext = systemOperationContext; this.consumerGroupId = consumerGroup; this.hooks = hooks; this.hooks.forEach(hook -> hook.init(systemOperationContext)); @@ -49,7 +52,7 @@ public MCLKafkaListener( } public void consume(final ConsumerRecord consumerRecord) { - try (Timer.Context i = MetricUtils.timer(this.getClass(), "consume").time()) { + try { kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); final GenericRecord record = consumerRecord.value(); log.debug( @@ -85,38 +88,61 @@ public void consume(final ConsumerRecord consumerRecord) MDC.put( MDC_CHANGE_TYPE, Optional.ofNullable(changeType).map(ChangeType::toString).orElse("")); - log.info( - "Invoking MCL hooks for consumer: {} urn: {}, aspect name: {}, entity type: {}, change type: {}", - consumerGroupId, - entityUrn, - aspectName, - entityType, - changeType); + systemOperationContext.withQueueSpan( + "consume", + event.getSystemMetadata(), + consumerRecord.topic(), + () -> { + log.info( + "Invoking MCL hooks for consumer: {} urn: {}, aspect name: {}, entity type: {}, change type: {}", + consumerGroupId, + entityUrn, + aspectName, + entityType, + changeType); - // Here - plug in additional "custom processor hooks" - for (MetadataChangeLogHook hook : this.hooks) { - log.debug( - "Invoking MCL hook {} for urn: {}", - hook.getClass().getSimpleName(), - event.getEntityUrn()); - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), hook.getClass().getSimpleName() + "_latency") - .time()) { - hook.invoke(event); - } catch (Exception e) { - // Just skip this hook and continue. - Note that this represents "at most once"// - // processing. - MetricUtils.counter(this.getClass(), hook.getClass().getSimpleName() + "_failure").inc(); - log.error( - "Failed to execute MCL hook with name {}", hook.getClass().getCanonicalName(), e); - } - } - // TODO: Manually commit kafka offsets after full processing. 
- MetricUtils.counter(this.getClass(), consumerGroupId + "_consumed_mcl_count").inc(); - log.info( - "Successfully completed MCL hooks for consumer: {} urn: {}", - consumerGroupId, - event.getEntityUrn()); + // Here - plug in additional "custom processor hooks" + for (MetadataChangeLogHook hook : this.hooks) { + systemOperationContext.withSpan( + hook.getClass().getSimpleName(), + () -> { + log.debug( + "Invoking MCL hook {} for urn: {}", + hook.getClass().getSimpleName(), + event.getEntityUrn()); + try { + hook.invoke(event); + } catch (Exception e) { + // Just skip this hook and continue. - Note that this represents "at most + // once"// + // processing. + MetricUtils.counter( + this.getClass(), hook.getClass().getSimpleName() + "_failure") + .inc(); + log.error( + "Failed to execute MCL hook with name {}", + hook.getClass().getCanonicalName(), + e); + + Span currentSpan = Span.current(); + currentSpan.recordException(e); + currentSpan.setStatus(StatusCode.ERROR, e.getMessage()); + currentSpan.setAttribute(MetricUtils.ERROR_TYPE, e.getClass().getName()); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), hook.getClass().getSimpleName() + "_latency")); + } + + // TODO: Manually commit kafka offsets after full processing. + MetricUtils.counter(this.getClass(), consumerGroupId + "_consumed_mcl_count").inc(); + log.info( + "Successfully completed MCL hooks for consumer: {} urn: {}", + consumerGroupId, + event.getEntityUrn()); + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "consume")); } finally { MDC.clear(); } diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index c92749385145de..017570cfcf7afb 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -96,6 +96,7 @@ public OperationContext operationContext( indexConvention, TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), mock(ValidationContext.class), + null, true); } diff --git a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MCEOpenTelemetryConfig.java b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MCEOpenTelemetryConfig.java new file mode 100644 index 00000000000000..9ae6aa5f50c1f4 --- /dev/null +++ b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MCEOpenTelemetryConfig.java @@ -0,0 +1,21 @@ +package com.linkedin.metadata.kafka; + +import com.linkedin.gms.factory.system_telemetry.OpenTelemetryBaseFactory; +import io.datahubproject.metadata.context.TraceContext; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class MCEOpenTelemetryConfig extends OpenTelemetryBaseFactory { + + @Override + protected String getApplicationComponent() { + return "datahub-mce-consumer"; + } + + @Bean + @Override + protected TraceContext traceContext() { + return super.traceContext(); + } +} diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle index 2da3957c4bb218..21951106ca6b24 100644 --- a/metadata-jobs/mce-consumer/build.gradle +++ 
b/metadata-jobs/mce-consumer/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation externalDependency.protobuf implementation externalDependency.springKafka + implementation externalDependency.opentelemetrySdkTrace implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java index 5d2f6452e69197..1ddb7a576f42b9 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeEventsProcessor.java @@ -4,7 +4,6 @@ import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.entity.Entity; import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.gms.factory.entityclient.RestliEntityClientFactory; @@ -65,35 +64,39 @@ public class MetadataChangeEventsProcessor { containerFactory = DEFAULT_EVENT_CONSUMER_NAME) @Deprecated public void consume(final ConsumerRecord consumerRecord) { - try (Timer.Context i = MetricUtils.timer(this.getClass(), "consume").time()) { - kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); - final GenericRecord record = consumerRecord.value(); + systemOperationContext.withSpan( + "consume", + () -> { + kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); + final GenericRecord record = consumerRecord.value(); - log.info( - "Got MCE event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", - consumerRecord.key(), - consumerRecord.topic(), - consumerRecord.partition(), - consumerRecord.offset(), - consumerRecord.serializedValueSize(), - consumerRecord.timestamp()); + log.info( + "Got MCE event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", + consumerRecord.key(), + consumerRecord.topic(), + consumerRecord.partition(), + consumerRecord.offset(), + consumerRecord.serializedValueSize(), + consumerRecord.timestamp()); - log.debug("Record {}", record); + log.debug("Record {}", record); - MetadataChangeEvent event = new MetadataChangeEvent(); + MetadataChangeEvent event = new MetadataChangeEvent(); - try { - event = EventUtils.avroToPegasusMCE(record); - log.debug("MetadataChangeEvent {}", event); - if (event.hasProposedSnapshot()) { - processProposedSnapshot(event); - } - } catch (Throwable throwable) { - log.error("MCE Processor Error", throwable); - log.error("Message: {}", record); - sendFailedMCE(event, throwable); - } - } + try { + event = EventUtils.avroToPegasusMCE(record); + log.debug("MetadataChangeEvent {}", event); + if (event.hasProposedSnapshot()) { + processProposedSnapshot(event); + } + } catch (Throwable throwable) { + log.error("MCE Processor Error", throwable); + log.error("Message: {}", record); + sendFailedMCE(event, throwable); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "consume")); } private void sendFailedMCE(@Nonnull MetadataChangeEvent event, @Nonnull Throwable throwable) { diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java index 4e356f5fb3670a..4cc2f33049e836 
100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeProposalsProcessor.java @@ -5,10 +5,10 @@ import static com.linkedin.metadata.Constants.MDC_ENTITY_TYPE; import static com.linkedin.metadata.Constants.MDC_ENTITY_URN; import static com.linkedin.metadata.config.kafka.KafkaConfiguration.MCP_EVENT_CONSUMER_NAME; +import static com.linkedin.mxe.ConsumerGroups.MCP_CONSUMER_GROUP_ID_VALUE; import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.common.urn.Urn; import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.events.metadata.ChangeType; @@ -16,20 +16,23 @@ import com.linkedin.gms.factory.entityclient.RestliEntityClientFactory; import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.dao.throttle.ThrottleSensor; +import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.kafka.config.MetadataChangeProposalProcessorCondition; import com.linkedin.metadata.kafka.util.KafkaListenerUtil; import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.Topics; import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import java.io.IOException; +import java.util.List; import java.util.Optional; import javax.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.producer.Producer; import org.slf4j.MDC; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; @@ -47,12 +50,9 @@ @EnableKafka @RequiredArgsConstructor public class MetadataChangeProposalsProcessor { - private static final String CONSUMER_GROUP_ID_VALUE = - "${METADATA_CHANGE_PROPOSAL_KAFKA_CONSUMER_GROUP_ID:generic-mce-consumer-job-client}"; - private final OperationContext systemOperationContext; private final SystemEntityClient entityClient; - private final Producer kafkaProducer; + private final EventProducer kafkaProducer; @Qualifier("kafkaThrottle") private final ThrottleSensor kafkaThrottle; @@ -69,7 +69,7 @@ public class MetadataChangeProposalsProcessor { + "}") private String fmcpTopicName; - @Value(CONSUMER_GROUP_ID_VALUE) + @Value(MCP_CONSUMER_GROUP_ID_VALUE) private String mceConsumerGroupId; @PostConstruct @@ -78,11 +78,11 @@ public void registerConsumerThrottle() { } @KafkaListener( - id = CONSUMER_GROUP_ID_VALUE, + id = MCP_CONSUMER_GROUP_ID_VALUE, topics = "${METADATA_CHANGE_PROPOSAL_TOPIC_NAME:" + Topics.METADATA_CHANGE_PROPOSAL + "}", containerFactory = MCP_EVENT_CONSUMER_NAME) public void consume(final ConsumerRecord consumerRecord) { - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "consume").time()) { + try { kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); final GenericRecord record = consumerRecord.value(); @@ -99,29 +99,50 @@ public void consume(final ConsumerRecord consumerRecord) log.debug("Record {}", record); } - MetadataChangeProposal event = new MetadataChangeProposal(); + final MetadataChangeProposal event; try { event = 
EventUtils.avroToPegasusMCP(record); - Urn entityUrn = event.getEntityUrn(); - String aspectName = event.hasAspectName() ? event.getAspectName() : null; - String entityType = event.hasEntityType() ? event.getEntityType() : null; - ChangeType changeType = event.hasChangeType() ? event.getChangeType() : null; - MDC.put(MDC_ENTITY_URN, Optional.ofNullable(entityUrn).map(Urn::toString).orElse("")); - MDC.put(MDC_ASPECT_NAME, aspectName); - MDC.put(MDC_ENTITY_TYPE, entityType); - MDC.put( - MDC_CHANGE_TYPE, Optional.ofNullable(changeType).map(ChangeType::toString).orElse("")); - - if (log.isDebugEnabled()) { - log.debug("MetadataChangeProposal {}", event); - } - String urn = entityClient.ingestProposal(systemOperationContext, event, false); - log.info("Successfully processed MCP event urn: {}", urn); - } catch (Throwable throwable) { - log.error("MCP Processor Error", throwable); - log.error("Message: {}", record); - KafkaListenerUtil.sendFailedMCP(event, throwable, fmcpTopicName, kafkaProducer); + systemOperationContext.withQueueSpan( + "consume", + event.getSystemMetadata(), + consumerRecord.topic(), + () -> { + try { + Urn entityUrn = event.getEntityUrn(); + String aspectName = event.hasAspectName() ? event.getAspectName() : null; + String entityType = event.hasEntityType() ? event.getEntityType() : null; + ChangeType changeType = event.hasChangeType() ? event.getChangeType() : null; + MDC.put( + MDC_ENTITY_URN, Optional.ofNullable(entityUrn).map(Urn::toString).orElse("")); + MDC.put(MDC_ASPECT_NAME, aspectName); + MDC.put(MDC_ENTITY_TYPE, entityType); + MDC.put( + MDC_CHANGE_TYPE, + Optional.ofNullable(changeType).map(ChangeType::toString).orElse("")); + + if (log.isDebugEnabled()) { + log.debug("MetadataChangeProposal {}", event); + } + String urn = entityClient.ingestProposal(systemOperationContext, event, false); + log.info("Successfully processed MCP event urn: {}", urn); + } catch (Throwable throwable) { + log.error("MCP Processor Error", throwable); + log.error("Message: {}", record); + Span currentSpan = Span.current(); + currentSpan.recordException(throwable); + currentSpan.setStatus(StatusCode.ERROR, throwable.getMessage()); + currentSpan.setAttribute(MetricUtils.ERROR_TYPE, throwable.getClass().getName()); + + kafkaProducer.produceFailedMetadataChangeProposal( + systemOperationContext, List.of(event), throwable); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "consume")); + } catch (IOException e) { + log.error( + "Unrecoverable message deserialization error. 
Cannot forward to failure topic.", e); } } finally { MDC.clear(); diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/batch/BatchMetadataChangeProposalsProcessor.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/batch/BatchMetadataChangeProposalsProcessor.java index fed93628fe4d79..ae30986294cf03 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/batch/BatchMetadataChangeProposalsProcessor.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/batch/BatchMetadataChangeProposalsProcessor.java @@ -1,28 +1,34 @@ package com.linkedin.metadata.kafka.batch; +import static com.linkedin.metadata.config.kafka.KafkaConfiguration.MCP_EVENT_CONSUMER_NAME; +import static com.linkedin.metadata.utils.metrics.MetricUtils.BATCH_SIZE_ATTR; +import static com.linkedin.mxe.ConsumerGroups.MCP_CONSUMER_GROUP_ID_VALUE; + import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.entity.client.SystemEntityClient; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.entityclient.RestliEntityClientFactory; import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.dao.throttle.ThrottleSensor; +import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.kafka.config.batch.BatchMetadataChangeProposalProcessorCondition; import com.linkedin.metadata.kafka.util.KafkaListenerUtil; import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; import com.linkedin.mxe.Topics; import io.datahubproject.metadata.context.OperationContext; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import javax.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.producer.Producer; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Conditional; @@ -39,12 +45,9 @@ @EnableKafka @RequiredArgsConstructor public class BatchMetadataChangeProposalsProcessor { - private static final String CONSUMER_GROUP_ID_VALUE = - "${METADATA_CHANGE_PROPOSAL_KAFKA_CONSUMER_GROUP_ID:generic-mce-consumer-job-client}"; - private final OperationContext systemOperationContext; private final SystemEntityClient entityClient; - private final Producer kafkaProducer; + private final EventProducer kafkaProducer; @Qualifier("kafkaThrottle") private final ThrottleSensor kafkaThrottle; @@ -61,7 +64,7 @@ public class BatchMetadataChangeProposalsProcessor { + "}") private String fmcpTopicName; - @Value(CONSUMER_GROUP_ID_VALUE) + @Value(MCP_CONSUMER_GROUP_ID_VALUE) private String mceConsumerGroupId; @PostConstruct @@ -70,47 +73,67 @@ public void registerConsumerThrottle() { } @KafkaListener( - id = CONSUMER_GROUP_ID_VALUE, + id = MCP_CONSUMER_GROUP_ID_VALUE, topics = "${METADATA_CHANGE_PROPOSAL_TOPIC_NAME:" + Topics.METADATA_CHANGE_PROPOSAL + "}", - containerFactory = "kafkaEventConsumer", + containerFactory = MCP_EVENT_CONSUMER_NAME, batch = "true") public void consume(final List> 
consumerRecords) { - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "consume").time()) { - List metadataChangeProposals = - new ArrayList<>(consumerRecords.size()); - for (ConsumerRecord consumerRecord : consumerRecords) { - kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); - final GenericRecord record = consumerRecord.value(); - - log.info( - "Got MCP event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", - consumerRecord.key(), - consumerRecord.topic(), - consumerRecord.partition(), - consumerRecord.offset(), - consumerRecord.serializedValueSize(), - consumerRecord.timestamp()); - - MetadataChangeProposal event = new MetadataChangeProposal(); - try { - event = EventUtils.avroToPegasusMCP(record); - } catch (Throwable throwable) { - log.error("MCP Processor Error", throwable); - log.error("Message: {}", record); - KafkaListenerUtil.sendFailedMCP(event, throwable, fmcpTopicName, kafkaProducer); - } - metadataChangeProposals.add(event); + List metadataChangeProposals = new ArrayList<>(consumerRecords.size()); + String topicName = null; + + for (ConsumerRecord consumerRecord : consumerRecords) { + kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); + final GenericRecord record = consumerRecord.value(); + + log.info( + "Got MCP event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", + consumerRecord.key(), + consumerRecord.topic(), + consumerRecord.partition(), + consumerRecord.offset(), + consumerRecord.serializedValueSize(), + consumerRecord.timestamp()); + + if (topicName == null) { + topicName = consumerRecord.topic(); } + final MetadataChangeProposal event; try { - List urns = - entityClient.batchIngestProposals( - systemOperationContext, metadataChangeProposals, false); - log.info("Successfully processed MCP event urns: {}", urns); - } catch (Exception e) { - // Java client should never throw this - log.error("Exception in batch ingest", e); + event = EventUtils.avroToPegasusMCP(record); + metadataChangeProposals.add(event); + } catch (IOException e) { + log.error( + "Unrecoverable message deserialization error. 
Cannot forward to failure topic.", e); } } + + List systemMetadataList = + metadataChangeProposals.stream().map(MetadataChangeProposal::getSystemMetadata).toList(); + systemOperationContext.withQueueSpan( + "consume", + systemMetadataList, + topicName, + () -> { + try { + List urns = + entityClient.batchIngestProposals( + systemOperationContext, metadataChangeProposals, false); + log.info("Successfully processed MCP event urns: {}", urns); + } catch (Throwable throwable) { + log.error("MCP Processor Error", throwable); + Span currentSpan = Span.current(); + currentSpan.recordException(throwable); + currentSpan.setStatus(StatusCode.ERROR, throwable.getMessage()); + currentSpan.setAttribute(MetricUtils.ERROR_TYPE, throwable.getClass().getName()); + + kafkaProducer.produceFailedMetadataChangeProposal( + systemOperationContext, metadataChangeProposals, throwable); + } + }, + BATCH_SIZE_ATTR, + String.valueOf(metadataChangeProposals.size()), + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "consume")); } } diff --git a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/util/KafkaListenerUtil.java b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/util/KafkaListenerUtil.java index 874a45c995e911..c9dcb55d8e3d33 100644 --- a/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/util/KafkaListenerUtil.java +++ b/metadata-jobs/mce-consumer/src/main/java/com/linkedin/metadata/kafka/util/KafkaListenerUtil.java @@ -1,20 +1,10 @@ package com.linkedin.metadata.kafka.util; import com.linkedin.gms.factory.config.ConfigurationProvider; -import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.dao.throttle.ThrottleControl; import com.linkedin.metadata.dao.throttle.ThrottleSensor; -import com.linkedin.mxe.FailedMetadataChangeProposal; -import com.linkedin.mxe.MetadataChangeProposal; -import java.io.IOException; import java.util.Optional; -import javax.annotation.Nonnull; import lombok.extern.slf4j.Slf4j; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.commons.lang.exception.ExceptionUtils; -import org.apache.kafka.clients.producer.Producer; -import org.apache.kafka.clients.producer.ProducerRecord; import org.springframework.kafka.config.KafkaListenerEndpointRegistry; import org.springframework.kafka.listener.MessageListenerContainer; @@ -61,36 +51,4 @@ public static void registerThrottle( log.info("MCE Consumer Throttle Disabled"); } } - - public static void sendFailedMCP( - @Nonnull MetadataChangeProposal event, - @Nonnull Throwable throwable, - String fmcpTopicName, - Producer kafkaProducer) { - final FailedMetadataChangeProposal failedMetadataChangeProposal = - createFailedMCPEvent(event, throwable); - try { - final GenericRecord genericFailedMCERecord = - EventUtils.pegasusToAvroFailedMCP(failedMetadataChangeProposal); - log.debug("Sending FailedMessages to topic - {}", fmcpTopicName); - log.info( - "Error while processing FMCP: FailedMetadataChangeProposal - {}", - failedMetadataChangeProposal); - kafkaProducer.send(new ProducerRecord<>(fmcpTopicName, genericFailedMCERecord)); - } catch (IOException e) { - log.error( - "Error while sending FailedMetadataChangeProposal: Exception - {}, FailedMetadataChangeProposal - {}", - e.getStackTrace(), - failedMetadataChangeProposal); - } - } - - @Nonnull - public static FailedMetadataChangeProposal createFailedMCPEvent( - @Nonnull MetadataChangeProposal event, @Nonnull Throwable throwable) { - final 
FailedMetadataChangeProposal fmcp = new FailedMetadataChangeProposal(); - fmcp.setError(ExceptionUtils.getStackTrace(throwable)); - fmcp.setMetadataChangeProposal(event); - return fmcp; - } } diff --git a/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java b/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java index 5d11697bed93d2..ed0268d6dca829 100644 --- a/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java +++ b/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/PlatformEventProcessor.java @@ -4,7 +4,6 @@ import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.datahub.event.hook.PlatformEventHook; import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; @@ -60,53 +59,66 @@ public PlatformEventProcessor( topics = {"${PLATFORM_EVENT_TOPIC_NAME:" + Topics.PLATFORM_EVENT + "}"}, containerFactory = PE_EVENT_CONSUMER_NAME) public void consume(final ConsumerRecord consumerRecord) { - try (Timer.Context i = MetricUtils.timer(this.getClass(), "consume").time()) { - log.debug("Consuming a Platform Event"); + systemOperationContext.withSpan( + "consume", + () -> { + log.debug("Consuming a Platform Event"); - kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); - final GenericRecord record = consumerRecord.value(); - log.info( - "Got PE event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", - consumerRecord.key(), - consumerRecord.topic(), - consumerRecord.partition(), - consumerRecord.offset(), - consumerRecord.serializedValueSize(), - consumerRecord.timestamp()); - MetricUtils.counter(this.getClass(), "received_pe_count").inc(); + kafkaLagStats.update(System.currentTimeMillis() - consumerRecord.timestamp()); + final GenericRecord record = consumerRecord.value(); + log.info( + "Got PE event key: {}, topic: {}, partition: {}, offset: {}, value size: {}, timestamp: {}", + consumerRecord.key(), + consumerRecord.topic(), + consumerRecord.partition(), + consumerRecord.offset(), + consumerRecord.serializedValueSize(), + consumerRecord.timestamp()); + MetricUtils.counter(this.getClass(), "received_pe_count").inc(); - PlatformEvent event; - try { - event = EventUtils.avroToPegasusPE(record); - log.debug("Successfully converted Avro PE to Pegasus PE. name: {}", event.getName()); - } catch (Exception e) { - MetricUtils.counter(this.getClass(), "avro_to_pegasus_conversion_failure").inc(); - log.error("Error deserializing message due to: ", e); - log.error("Message: {}", record.toString()); - return; - } + PlatformEvent event; + try { + event = EventUtils.avroToPegasusPE(record); + log.debug("Successfully converted Avro PE to Pegasus PE. 
name: {}", event.getName()); + } catch (Exception e) { + MetricUtils.counter(this.getClass(), "avro_to_pegasus_conversion_failure").inc(); + log.error("Error deserializing message due to: ", e); + log.error("Message: {}", record.toString()); + return; + } - log.info("Invoking PE hooks for event name {}", event.getName()); + log.info("Invoking PE hooks for event name {}", event.getName()); - for (PlatformEventHook hook : this.hooks) { - log.info( - "Invoking PE hook {} for event name {}", - hook.getClass().getSimpleName(), - event.getName()); - try (Timer.Context ignored = - MetricUtils.timer(this.getClass(), hook.getClass().getSimpleName() + "_latency") - .time()) { - hook.invoke(systemOperationContext, event); - } catch (Exception e) { - // Just skip this hook and continue. - MetricUtils.counter(this.getClass(), hook.getClass().getSimpleName() + "_failure").inc(); - log.error( - "Failed to execute PE hook with name {}", hook.getClass().getCanonicalName(), e); - } - } - MetricUtils.counter(this.getClass(), "consumed_pe_count").inc(); - log.info("Successfully completed PE hooks for event with name {}", event.getName()); - } + for (PlatformEventHook hook : this.hooks) { + log.info( + "Invoking PE hook {} for event name {}", + hook.getClass().getSimpleName(), + event.getName()); + + systemOperationContext.withSpan( + hook.getClass().getSimpleName(), + () -> { + try { + hook.invoke(systemOperationContext, event); + } catch (Exception e) { + // Just skip this hook and continue. + MetricUtils.counter( + this.getClass(), hook.getClass().getSimpleName() + "_failure") + .inc(); + log.error( + "Failed to execute PE hook with name {}", + hook.getClass().getCanonicalName(), + e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), hook.getClass().getSimpleName() + "_latency")); + } + MetricUtils.counter(this.getClass(), "consumed_pe_count").inc(); + log.info("Successfully completed PE hooks for event with name {}", event.getName()); + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "consume")); } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/run/AspectRowSummary.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/run/AspectRowSummary.pdl index 7ebc97e73877b2..ad7015afdf827d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/run/AspectRowSummary.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/run/AspectRowSummary.pdl @@ -11,4 +11,5 @@ record AspectRowSummary { version: long keyAspect: boolean aspect: optional Aspect + telemetryTraceId: optional string } diff --git a/metadata-operation-context/build.gradle b/metadata-operation-context/build.gradle index 71b61528d187bd..a02d47790f2ed6 100644 --- a/metadata-operation-context/build.gradle +++ b/metadata-operation-context/build.gradle @@ -10,9 +10,11 @@ dependencies { implementation externalDependency.slf4jApi implementation externalDependency.servletApi implementation spec.product.pegasus.restliServer + implementation externalDependency.opentelemetryApi + implementation externalDependency.opentelemetrySdkTrace compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok testImplementation externalDependency.testng - testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline } \ No newline at end of file diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java 
b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 30255f7ebcac36..dd8769ba427325 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -5,6 +5,7 @@ import com.datahub.authorization.AuthorizationSession; import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableSet; import com.linkedin.common.AuditStamp; @@ -16,17 +17,23 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import com.linkedin.mxe.SystemMetadata; import io.datahubproject.metadata.exception.ActorAccessException; import io.datahubproject.metadata.exception.OperationContextException; +import io.datahubproject.metadata.exception.TraceException; import java.util.Collection; +import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.Builder; import lombok.Getter; +import lombok.extern.slf4j.Slf4j; /** * These contexts define a read/write context which allows more flexibility when reading and writing @@ -41,6 +48,7 @@ */ @Builder(toBuilder = true) @Getter +@Slf4j public class OperationContext implements AuthorizationSession { /** @@ -153,6 +161,7 @@ public static OperationContext asSystem( @Nullable IndexConvention indexConvention, @Nullable RetrieverContext retrieverContext, @Nonnull ValidationContext validationContext, + @Nullable TraceContext traceContext, boolean enforceExistenceEnabled) { return asSystem( config, @@ -163,6 +172,7 @@ public static OperationContext asSystem( retrieverContext, validationContext, ObjectMapperContext.DEFAULT, + traceContext, enforceExistenceEnabled); } @@ -175,6 +185,7 @@ public static OperationContext asSystem( @Nullable RetrieverContext retrieverContext, @Nonnull ValidationContext validationContext, @Nonnull ObjectMapperContext objectMapperContext, + @Nullable TraceContext traceContext, boolean enforceExistenceEnabled) { ActorContext systemActorContext = @@ -202,6 +213,7 @@ public static OperationContext asSystem( .retrieverContext(retrieverContext) .objectMapperContext(objectMapperContext) .validationContext(validationContext) + .traceContext(traceContext) .build(systemAuthentication, false); } catch (OperationContextException e) { throw new RuntimeException(e); @@ -219,6 +231,7 @@ public static OperationContext asSystem( @Nonnull private final RetrieverContext retrieverContext; @Nonnull private final ObjectMapperContext objectMapperContext; @Nonnull private final ValidationContext validationContext; + @Nullable private final TraceContext traceContext; public OperationContext withSearchFlags( @Nonnull Function flagDefaults) { @@ -343,6 +356,84 @@ public AuthorizationResult authorize( return authorizationContext.authorize(getSessionActorContext(), privilege, resourceSpec); } + @Nullable + public SystemMetadata withTraceId(@Nullable SystemMetadata systemMetadata) { + if (systemMetadata != null && traceContext != null) { + return 
traceContext.withTraceId(systemMetadata); + } + return systemMetadata; + } + + public SystemMetadata withProducerTrace( + String operationName, @Nullable SystemMetadata systemMetadata, String topicName) { + if (systemMetadata != null && traceContext != null) { + return traceContext.withProducerTrace(operationName, systemMetadata, topicName); + } + return systemMetadata; + } + + /** + * Generic method to capture spans + * + * @param name name of the span + * @param operation the actual logic + * @param attributes additional attributes + * @return the output from the logic + * @param generic + */ + public T withSpan(String name, Supplier operation, String... attributes) { + if (traceContext != null) { + return traceContext.withSpan(name, operation, attributes); + } else { + return operation.get(); + } + } + + public void withSpan(String name, Runnable operation, String... attributes) { + if (traceContext != null) { + traceContext.withSpan(name, operation, attributes); + } else { + operation.run(); + } + } + + public void withQueueSpan( + String name, + SystemMetadata systemMetadata, + String topicName, + Runnable operation, + String... attributes) { + if (systemMetadata != null) { + withQueueSpan(name, List.of(systemMetadata), topicName, operation, attributes); + } else { + operation.run(); + } + } + + public void withQueueSpan( + String name, + List systemMetadata, + String topicName, + Runnable operation, + String... attributes) { + if (traceContext != null) { + traceContext.withQueueSpan(name, systemMetadata, topicName, operation, attributes); + } else { + operation.run(); + } + } + + public String traceException(Set throwables) { + try { + return getObjectMapper() + .writeValueAsString( + throwables.stream().map(TraceException::new).collect(Collectors.toList())); + } catch (JsonProcessingException e) { + log.error("Error creating trace.", e); + } + return throwables.stream().map(Throwable::getMessage).collect(Collectors.joining("\n")); + } + /** * Return a unique id for this context. Typically useful for building cache keys. We combine the * different context components to create a single string representation of the hashcode across @@ -371,6 +462,7 @@ public String getGlobalContextId() { .add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext()) .add(getRetrieverContext()) .add(getObjectMapperContext()) + .add(getTraceContext() == null ? EmptyContext.EMPTY : getTraceContext()) .build() .stream() .map(ContextInterface::getCacheKeyComponent) @@ -513,7 +605,8 @@ public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCa this.requestContext, this.retrieverContext, this.objectMapperContext != null ? 
this.objectMapperContext : ObjectMapperContext.DEFAULT, - this.validationContext); + this.validationContext, + this.traceContext); } private OperationContext build() { diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java index 779c418a56142f..022a75945b5049 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java @@ -2,6 +2,7 @@ import com.google.common.net.HttpHeaders; import com.linkedin.restli.server.ResourceContext; +import io.opentelemetry.api.trace.Span; import jakarta.servlet.http.HttpServletRequest; import java.util.Arrays; import java.util.Collection; @@ -59,6 +60,13 @@ public Optional getCacheKeyComponent() { public static class RequestContextBuilder { private RequestContext build() { + + // Add context for tracing + Span.current() + .setAttribute("user.id", this.actorUrn) + .setAttribute("request.api", this.requestAPI.toString()) + .setAttribute("request.id", this.requestID); + return new RequestContext( this.actorUrn, this.sourceIP, this.requestAPI, this.requestID, this.userAgent); } diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceContext.java new file mode 100644 index 00000000000000..d881020aed0528 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceContext.java @@ -0,0 +1,414 @@ +package io.datahubproject.metadata.context; + +import static com.linkedin.metadata.utils.metrics.MetricUtils.BATCH_SIZE_ATTR; +import static com.linkedin.metadata.utils.metrics.MetricUtils.QUEUE_DURATION_MS_ATTR; +import static com.linkedin.metadata.utils.metrics.MetricUtils.QUEUE_ENQUEUED_AT_ATTR; + +import com.linkedin.data.template.StringMap; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.metadata.utils.metrics.MetricUtils; +import com.linkedin.mxe.SystemMetadata; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanBuilder; +import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.api.trace.SpanKind; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.TraceFlags; +import io.opentelemetry.api.trace.TraceState; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import io.opentelemetry.sdk.common.CompletableResultCode; +import io.opentelemetry.sdk.trace.data.SpanData; +import io.opentelemetry.sdk.trace.export.SpanExporter; +import jakarta.servlet.http.Cookie; +import jakarta.servlet.http.HttpServletRequest; +import java.time.Instant; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.Builder; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Getter +@Builder +public class TraceContext implements ContextInterface { + // trace logging + public static final String TRACE_HEADER = "X-Enable-Trace-Log"; + public static final String TRACE_COOKIE = "enable-trace-log"; + // system metadata properties + public static final String 
TELEMETRY_TRACE_KEY = "telemetryTraceId"; + public static final String TELEMETRY_QUEUE_SPAN_KEY = "telemetryQueueSpanId"; + public static final String TELEMETRY_LOG_KEY = "telemetryLog"; + public static final String TELEMETRY_ENQUEUED_AT = "telemetryEnqueuedAt"; + + public static final TraceIdGenerator TRACE_ID_GENERATOR = new TraceIdGenerator(); + public static final SpanExporter LOG_SPAN_EXPORTER = new ConditionalLogSpanExporter(); + + private static final ThreadLocal logTracingEnabled = new ThreadLocal<>(); + + public static boolean isLogTracingEnabled() { + Boolean enabled = logTracingEnabled.get(); + return enabled != null && enabled; + } + + public static void clear() { + logTracingEnabled.remove(); + } + + public static void enableLogTracing(HttpServletRequest request) { + // Check header + String headerValue = request.getHeader(TRACE_HEADER); + if ("true".equalsIgnoreCase(headerValue)) { + logTracingEnabled.set(true); + return; + } + + // Check cookie + Cookie[] cookies = request.getCookies(); + if (cookies != null) { + for (Cookie cookie : cookies) { + if (TRACE_COOKIE.equals(cookie.getName()) && "true".equalsIgnoreCase(cookie.getValue())) { + logTracingEnabled.set(true); + return; + } + } + } + + logTracingEnabled.set(false); + } + + @Getter @Nonnull private final Tracer tracer; + + @Override + public Optional getCacheKeyComponent() { + return Optional.empty(); + } + + /** + * Generic method to capture spans + * + * @param name name of the span + * @param operation the actual logic + * @param attributes additional attributes + * @return the output from the logic + * @param generic + */ + public T withSpan(String name, Supplier operation, String... attributes) { + Span span = tracer.spanBuilder(name).startSpan(); + try (var scope = span.makeCurrent()) { + for (int i = 0; i < attributes.length; i += 2) { + span.setAttribute(attributes[i], attributes[i + 1]); + } + return operation.get(); + } catch (Exception e) { + span.setStatus(StatusCode.ERROR, e.getMessage()); + span.recordException(e); + span.setAttribute(MetricUtils.ERROR_TYPE, e.getClass().getName()); + throw e; + } finally { + span.end(); + } + } + + protected void withSpan(String name, Runnable operation, String... attributes) { + Span span = tracer.spanBuilder(name).startSpan(); + try (var scope = span.makeCurrent()) { + for (int i = 0; i < attributes.length; i += 2) { + span.setAttribute(attributes[i], attributes[i + 1]); + } + operation.run(); + } catch (Exception e) { + span.setStatus(StatusCode.ERROR, e.getMessage()); + span.recordException(e); + span.setAttribute(MetricUtils.ERROR_TYPE, e.getClass().getName()); + throw e; + } finally { + span.end(); + } + } + + /** + * Handle multiple messages with different trace ids processed from a queue + * + * @param name name of the processing of the queue + * @param batchSystemMetadata batch of system metadata + * @param operation actual processing logic + * @param attributes span attributes + */ + protected void withQueueSpan( + String name, + List batchSystemMetadata, + String topicName, + Runnable operation, + String... 
attributes) { + + List tracingEnabledSystemMetadata = + batchSystemMetadata.stream() + .filter( + sysMeta -> + Objects.nonNull(sysMeta) + && sysMeta.getProperties() != null + && sysMeta.getProperties().get(TELEMETRY_TRACE_KEY) != null + && sysMeta.getProperties().get(TELEMETRY_QUEUE_SPAN_KEY) != null) + .collect(Collectors.toList()); + + // resume log tracing + logTracingEnabled.set( + tracingEnabledSystemMetadata.stream() + .anyMatch( + sysMeta -> + Boolean.parseBoolean( + sysMeta.getProperties().getOrDefault(TELEMETRY_LOG_KEY, "false")))); + + // Create the span builder, close queue span and add links + SpanBuilder spanBuilder = + tracer.spanBuilder(name).setAttribute(BATCH_SIZE_ATTR, batchSystemMetadata.size()); + + List originalSpanContexts = + tracingEnabledSystemMetadata.stream() + .map(sysMeta -> closeQueueSpan(name, sysMeta, topicName)) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList()); + + Span span; + if (originalSpanContexts.size() == 1) { + // set parent if there is only a single original trace in the batch + spanBuilder.setParent(Context.current().with(Span.wrap(originalSpanContexts.get(0)))); + span = spanBuilder.startSpan(); + } else { + // otherwise link the current trace to all original traces + originalSpanContexts.forEach(spanBuilder::addLink); + span = spanBuilder.startSpan(); + + // log linked traces + if (isLogTracingEnabled()) { + log.info( + "Trace: {}, Linked Traces: {}", + span.getSpanContext().getTraceId(), + originalSpanContexts.stream().map(SpanContext::getTraceId)); + } + } + + try (var scope = span.makeCurrent()) { + // Set additional attributes + for (int i = 0; i < attributes.length; i += 2) { + span.setAttribute(attributes[i], attributes[i + 1]); + } + + operation.run(); + } catch (Exception e) { + span.setStatus(StatusCode.ERROR, e.getMessage()); + span.recordException(e); + span.setAttribute(MetricUtils.ERROR_TYPE, e.getClass().getName()); + throw e; + } finally { + span.end(); + } + } + + public SystemMetadata withTraceId(@Nonnull SystemMetadata systemMetadata) { + if (systemMetadata.getProperties() == null + || !systemMetadata.getProperties().containsKey(TELEMETRY_TRACE_KEY)) { + SpanContext currentSpanContext = Span.current().getSpanContext(); + + if (currentSpanContext.isValid()) { + SystemMetadata copy = GenericRecordUtils.copy(systemMetadata, SystemMetadata.class); + + if (!copy.hasProperties() || copy.getProperties() == null) { + copy.setProperties(new StringMap()); + } + + copy.getProperties().putAll(Map.of(TELEMETRY_TRACE_KEY, currentSpanContext.getTraceId())); + + return copy; + } + } + + return systemMetadata; + } + + /** Method to capture the current trace and span ids in systemMetadata */ + public SystemMetadata withProducerTrace( + String operationName, @Nonnull SystemMetadata systemMetadata, String topicName) { + SpanContext currentSpanContext = Span.current().getSpanContext(); + + if (currentSpanContext.isValid()) { + SystemMetadata copy = GenericRecordUtils.copy(systemMetadata, SystemMetadata.class); + + if (!copy.hasProperties() || copy.getProperties() == null) { + copy.setProperties(new StringMap()); + } + + // Create the queue span that will be closed by consumer + Span queueSpan = + tracer + .spanBuilder(operationName) + .setParent(Context.current()) + .setSpanKind(SpanKind.PRODUCER) + .setAttribute(MetricUtils.MESSAGING_SYSTEM, "kafka") + .setAttribute(MetricUtils.MESSAGING_DESTINATION, topicName) + .setAttribute(MetricUtils.MESSAGING_DESTINATION_KIND, "topic") + .setAttribute(MetricUtils.MESSAGING_OPERATION, 
"publish") + .startSpan(); + + long enqueuedAt = Instant.now().toEpochMilli(); + if (!copy.getProperties().containsKey(TELEMETRY_TRACE_KEY)) { + copy.getProperties() + .putAll( + Map.of( + TELEMETRY_TRACE_KEY, currentSpanContext.getTraceId(), + TELEMETRY_QUEUE_SPAN_KEY, queueSpan.getSpanContext().getSpanId())); + } + + copy.getProperties() + .putAll( + Map.of( + TELEMETRY_LOG_KEY, String.valueOf(isLogTracingEnabled()), + TELEMETRY_ENQUEUED_AT, String.valueOf(enqueuedAt))); + + // It will be mirrored by consumer with enqueued time + queueSpan.setAttribute(QUEUE_ENQUEUED_AT_ATTR, enqueuedAt).end(); + + return copy; + } + + return systemMetadata; + } + + /** + * When processing from queue - create new span with stored parent context + * + * @param systemMetadata systemMetadata with trace/span ids to restore + */ + @Nullable + private Span queueConsumerTrace( + String operationName, @Nonnull SystemMetadata systemMetadata, String topicName) { + + SpanContext queueSpanContext = closeQueueSpan(operationName, systemMetadata, topicName); + + if (queueSpanContext != null) { + // Create the processing span with the queue span as parent + return tracer + .spanBuilder(operationName) + .setParent( + Context.current() + .with(Span.wrap(queueSpanContext))) // Use queue span context as parent + .startSpan(); + } + + return null; + } + + @Nullable + private SpanContext closeQueueSpan( + String operationName, SystemMetadata metadata, String topicName) { + if (metadata != null && metadata.getProperties() != null) { + // resume log tracing + logTracingEnabled.set( + Boolean.parseBoolean(metadata.getProperties().getOrDefault(TELEMETRY_LOG_KEY, "false"))); + + String traceId = metadata.getProperties().get(TELEMETRY_TRACE_KEY); + String queueSpanId = metadata.getProperties().get(TELEMETRY_QUEUE_SPAN_KEY); + + if (traceId != null && queueSpanId != null) { + + SpanContext queueSpanContext = + SpanContext.createFromRemoteParent( + traceId, queueSpanId, TraceFlags.getSampled(), TraceState.getDefault()); + + // Get the span and end it with duration + SpanBuilder queueSpanBuilder = + tracer + .spanBuilder(operationName) + .setParent(Context.current().with(Span.wrap(queueSpanContext))) + .setSpanKind(SpanKind.CONSUMER); + + Span queueSpan = + queueSpanBuilder + .startSpan() + .setAttribute(MetricUtils.MESSAGING_SYSTEM, "kafka") + .setAttribute(MetricUtils.MESSAGING_DESTINATION, topicName) + .setAttribute(MetricUtils.MESSAGING_DESTINATION_KIND, "topic") + .setAttribute(MetricUtils.MESSAGING_OPERATION, "receive"); + + // calculate duration + if (metadata.getProperties().containsKey(TELEMETRY_ENQUEUED_AT)) { + long enqueuedAt = Long.parseLong(metadata.getProperties().get(TELEMETRY_ENQUEUED_AT)); + long queueTimeMillis = Instant.now().toEpochMilli() - enqueuedAt; + queueSpan + .setAttribute(QUEUE_ENQUEUED_AT_ATTR, enqueuedAt) + .setAttribute(QUEUE_DURATION_MS_ATTR, queueTimeMillis); + } + + queueSpan.end(); + + return queueSpanContext; + } + } + + return null; + } + + private static class ConditionalLogSpanExporter implements SpanExporter { + + @Override + public CompletableResultCode export(Collection spans) { + if (isLogTracingEnabled()) { + spans.forEach( + span -> { + log.info( + "Trace: {}, SpanId: {}, ParentId: {}, Name: {}, Duration: {} ms", + span.getTraceId(), + span.getSpanId(), + span.getParentSpanId(), + span.getName(), + String.format( + "%.2f", (span.getEndEpochNanos() - span.getStartEpochNanos()) / 1_000_000.0)); + + if (!span.getAttributes().isEmpty()) { + log.info("Trace: {}, Attributes: {}", 
span.getTraceId(), span.getAttributes()); + } + + if (!span.getEvents().isEmpty()) { + log.info("Trace: {}, Events: {}", span.getTraceId(), span.getEvents()); + } + + // Add logging for links + if (!span.getLinks().isEmpty()) { + span.getLinks() + .forEach( + link -> { + log.info( + "Trace: {}, Linked TraceId: {}, Linked SpanId: {}, Link Attributes: {}", + span.getTraceId(), + link.getSpanContext().getTraceId(), + link.getSpanContext().getSpanId(), + link.getAttributes()); + }); + } + }); + } + + return CompletableResultCode.ofSuccess(); + } + + @Override + public CompletableResultCode flush() { + return CompletableResultCode.ofSuccess(); + } + + @Override + public CompletableResultCode shutdown() { + return CompletableResultCode.ofSuccess(); + } + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceIdGenerator.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceIdGenerator.java new file mode 100644 index 00000000000000..9011a0e28d38a6 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/TraceIdGenerator.java @@ -0,0 +1,48 @@ +package io.datahubproject.metadata.context; + +import com.google.common.annotations.VisibleForTesting; +import io.opentelemetry.sdk.trace.IdGenerator; +import java.time.Instant; +import java.util.concurrent.ThreadLocalRandom; + +public class TraceIdGenerator implements IdGenerator { + private final IdGenerator defaultGenerator; + + public TraceIdGenerator() { + this.defaultGenerator = IdGenerator.random(); + } + + @VisibleForTesting + public String generateTraceId(long epochMillis) { + // First 8 bytes (16 hex chars) as timestamp in micros + long timestampMicros = epochMillis * 1000; + // Last 8 bytes as random to ensure uniqueness + long randomBits = ThreadLocalRandom.current().nextLong(); + + return String.format("%016x%016x", timestampMicros, randomBits); + } + + @Override + public String generateTraceId() { + return generateTraceId(Instant.now().toEpochMilli()); + } + + @Override + public String generateSpanId() { + // Use default random generation for span IDs + return defaultGenerator.generateSpanId(); + } + + // Utility method to extract timestamp + private static long getTimestampMicros(String traceId) { + if (traceId == null || traceId.length() < 16) { + throw new IllegalArgumentException("Invalid trace ID format"); + } + return Long.parseUnsignedLong(traceId.substring(0, 16), 16); + } + + // Convert to milliseconds for easier comparison + public static long getTimestampMillis(String traceId) { + return getTimestampMicros(traceId) / 1000; + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/TraceException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/TraceException.java new file mode 100644 index 00000000000000..85b32d6ca06d16 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/TraceException.java @@ -0,0 +1,40 @@ +package io.datahubproject.metadata.exception; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import java.util.Arrays; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Value; +import lombok.extern.jackson.Jacksonized; + +@Value +@Builder +@Jacksonized +@AllArgsConstructor +@JsonInclude(Include.NON_NULL) +public class TraceException { + String message; + String exceptionClass; + String[] stackTrace; + 
TraceException cause; + + public TraceException(Throwable throwable) { + this.message = throwable.getMessage(); + this.exceptionClass = throwable.getClass().getName(); + this.stackTrace = + Arrays.stream(throwable.getStackTrace()) + .map(StackTraceElement::toString) + .toArray(String[]::new); + + // Handle nested cause + this.cause = throwable.getCause() != null ? new TraceException(throwable.getCause()) : null; + } + + public TraceException(String message) { + this.message = message; + this.exceptionClass = null; + this.stackTrace = null; + this.cause = null; + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 92d62d42295b92..6e2d4ee6c5b924 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -261,6 +261,7 @@ public static OperationContext systemContext( indexConvention, retrieverContext, validationContext, + null, true); if (postConstruct != null) { diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index a2575c1c562209..15e189c6cc3679 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -28,6 +28,7 @@ public void testSystemPrivilegeEscalation() { null, TestOperationContexts.emptyActiveUsersRetrieverContext(null), mock(ValidationContext.class), + null, true); OperationContext opContext = diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/TraceContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/TraceContextTest.java new file mode 100644 index 00000000000000..8adf540c570b17 --- /dev/null +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/TraceContextTest.java @@ -0,0 +1,293 @@ +package io.datahubproject.metadata.context; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.atLeast; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertSame; +import static org.testng.Assert.assertTrue; + +import com.linkedin.data.template.StringMap; +import com.linkedin.mxe.SystemMetadata; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanBuilder; +import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import io.opentelemetry.context.Scope; +import jakarta.servlet.http.Cookie; +import jakarta.servlet.http.HttpServletRequest; +import java.util.ArrayList; +import java.util.List; +import 
java.util.concurrent.atomic.AtomicBoolean; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class TraceContextTest { + @Mock private Tracer tracer; + @Mock private HttpServletRequest request; + @Mock private Span span; + @Mock private SpanContext spanContext; + + private TraceContext traceContext; + + @BeforeMethod + public void setup() { + MockitoAnnotations.openMocks(this); + traceContext = TraceContext.builder().tracer(tracer).build(); + + // Clear any existing thread local state + TraceContext.clear(); + } + + @Test + public void testEnableLogTracingWithHeader() { + when(request.getHeader(TraceContext.TRACE_HEADER)).thenReturn("true"); + TraceContext.enableLogTracing(request); + assertTrue(TraceContext.isLogTracingEnabled()); + } + + @Test + public void testEnableLogTracingWithCookie() { + when(request.getHeader(TraceContext.TRACE_HEADER)).thenReturn(null); + Cookie cookie = new Cookie(TraceContext.TRACE_COOKIE, "true"); + when(request.getCookies()).thenReturn(new Cookie[] {cookie}); + TraceContext.enableLogTracing(request); + assertTrue(TraceContext.isLogTracingEnabled()); + } + + @Test + public void testEnableLogTracingDisabled() { + when(request.getHeader(TraceContext.TRACE_HEADER)).thenReturn("false"); + when(request.getCookies()).thenReturn(null); + TraceContext.enableLogTracing(request); + assertFalse(TraceContext.isLogTracingEnabled()); + } + + @Test + public void testWithTraceIdValidSpanContext() { + SystemMetadata systemMetadata = new SystemMetadata(); + when(span.getSpanContext()).thenReturn(spanContext); + when(spanContext.isValid()).thenReturn(true); + when(spanContext.getTraceId()).thenReturn("test-trace-id"); + + try (var mockedStatic = mockStatic(Span.class)) { + mockedStatic.when(Span::current).thenReturn(span); + SystemMetadata result = traceContext.withTraceId(systemMetadata); + assertNotNull(result.getProperties()); + assertEquals(result.getProperties().get(TraceContext.TELEMETRY_TRACE_KEY), "test-trace-id"); + } + } + + @Test + public void testWithTraceIdInvalidSpanContext() { + SystemMetadata systemMetadata = new SystemMetadata(); + when(span.getSpanContext()).thenReturn(spanContext); + when(spanContext.isValid()).thenReturn(false); + + try (var mockedStatic = mockStatic(Span.class)) { + mockedStatic.when(Span::current).thenReturn(span); + SystemMetadata result = traceContext.withTraceId(systemMetadata); + assertSame(result, systemMetadata); + } + } + + @Test + public void testWithQueueSpanBatch() { + // Setup + List batchMetadata = new ArrayList<>(); + SystemMetadata metadata1 = new SystemMetadata(); + metadata1.setProperties(new StringMap()); + metadata1.getProperties().put(TraceContext.TELEMETRY_TRACE_KEY, "trace-1"); + metadata1.getProperties().put(TraceContext.TELEMETRY_QUEUE_SPAN_KEY, "span-1"); + metadata1.getProperties().put(TraceContext.TELEMETRY_LOG_KEY, "true"); + metadata1 + .getProperties() + .put(TraceContext.TELEMETRY_ENQUEUED_AT, String.valueOf(System.currentTimeMillis())); + + SystemMetadata metadata2 = new SystemMetadata(); + metadata2.setProperties(new StringMap()); + metadata2.getProperties().put(TraceContext.TELEMETRY_TRACE_KEY, "trace-2"); + metadata2.getProperties().put(TraceContext.TELEMETRY_QUEUE_SPAN_KEY, "span-2"); + metadata2.getProperties().put(TraceContext.TELEMETRY_LOG_KEY, "false"); + metadata2 + .getProperties() + .put(TraceContext.TELEMETRY_ENQUEUED_AT, String.valueOf(System.currentTimeMillis())); + + 
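// Two distinct trace ids in one batch: exercises the link-based (multi-parent) branch of withQueueSpan rather than the single-parent branch. +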
batchMetadata.add(metadata1); + batchMetadata.add(metadata2); + + // Mock span builder chain for both consumer and processing spans + io.opentelemetry.api.trace.SpanBuilder mockSpanBuilder = + mock(io.opentelemetry.api.trace.SpanBuilder.class); + when(mockSpanBuilder.setParent(any(Context.class))).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setSpanKind(any())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setAttribute(anyString(), anyString())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setAttribute(anyString(), anyLong())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.addLink(any())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.startSpan()).thenReturn(span); + + when(tracer.spanBuilder(anyString())).thenReturn(mockSpanBuilder); + when(span.setAttribute(anyString(), anyString())).thenReturn(span); + when(span.setAttribute(anyString(), anyLong())).thenReturn(span); + when(span.getSpanContext()).thenReturn(spanContext); + + // Execute & Verify - mainly checking that no exceptions are thrown + traceContext.withQueueSpan( + "test-operation", + batchMetadata, + "test-topic", + () -> { + // Do nothing + }); + } + + @Test + public void testWithSpanSupplier() { + SpanBuilder mockSpanBuilder = mock(SpanBuilder.class); + when(mockSpanBuilder.setAttribute(anyString(), anyString())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.startSpan()).thenReturn(span); + when(tracer.spanBuilder(anyString())).thenReturn(mockSpanBuilder); + + when(span.setAttribute(anyString(), anyString())).thenReturn(span); + when(span.setStatus(any())).thenReturn(span); + when(span.makeCurrent()).thenReturn(mock(Scope.class)); + + // Execute + String result = traceContext.withSpan("test-operation", () -> "test-result", "attr1", "value1"); + + // Verify + assertEquals(result, "test-result"); + verify(mockSpanBuilder).startSpan(); + verify(span).end(); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testWithSpanSupplierException() { + io.opentelemetry.api.trace.SpanBuilder mockSpanBuilder = + mock(io.opentelemetry.api.trace.SpanBuilder.class); + when(mockSpanBuilder.setAttribute(anyString(), anyString())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.startSpan()).thenReturn(span); + when(tracer.spanBuilder(anyString())).thenReturn(mockSpanBuilder); + + when(span.setAttribute(anyString(), anyString())).thenReturn(span); + when(span.setStatus(any(), anyString())).thenReturn(span); + when(span.recordException(any(RuntimeException.class))).thenReturn(span); + when(span.makeCurrent()).thenReturn(mock(Scope.class)); + + try { + traceContext.withSpan( + "test-operation", + () -> { + throw new RuntimeException("test-exception"); + }, + "attr1", + "value1"); + } finally { + verify(mockSpanBuilder).startSpan(); + verify(span).setStatus(StatusCode.ERROR, "test-exception"); + verify(span).recordException(any(RuntimeException.class)); + verify(span).end(); + } + } + + @Test + public void testWithSpanRunnable() { + SpanBuilder mockSpanBuilder = mock(SpanBuilder.class); + when(mockSpanBuilder.setAttribute(anyString(), anyString())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.startSpan()).thenReturn(span); + when(tracer.spanBuilder(anyString())).thenReturn(mockSpanBuilder); + + when(span.setAttribute(anyString(), anyString())).thenReturn(span); + when(span.setStatus(any())).thenReturn(span); + when(span.makeCurrent()).thenReturn(mock(Scope.class)); + + AtomicBoolean executed = new AtomicBoolean(false); + + traceContext.withSpan("test-operation", () -> 
executed.set(true), "attr1", "value1"); + + assertTrue(executed.get()); + verify(mockSpanBuilder).startSpan(); + verify(span).end(); + } + + @Test + public void testWithSingleQueueSpan() { + SystemMetadata metadata = new SystemMetadata(); + metadata.setProperties(new StringMap()); + metadata.getProperties().put(TraceContext.TELEMETRY_TRACE_KEY, "trace-1"); + metadata.getProperties().put(TraceContext.TELEMETRY_QUEUE_SPAN_KEY, "span-1"); + metadata.getProperties().put(TraceContext.TELEMETRY_LOG_KEY, "true"); + metadata + .getProperties() + .put(TraceContext.TELEMETRY_ENQUEUED_AT, String.valueOf(System.currentTimeMillis())); + + io.opentelemetry.api.trace.SpanBuilder mockSpanBuilder = + mock(io.opentelemetry.api.trace.SpanBuilder.class); + when(mockSpanBuilder.setParent(any(Context.class))).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setSpanKind(any())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setAttribute(anyString(), anyString())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setAttribute(anyString(), anyLong())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.startSpan()).thenReturn(span); + + when(tracer.spanBuilder(anyString())).thenReturn(mockSpanBuilder); + when(span.setAttribute(anyString(), anyString())).thenReturn(span); + when(span.setAttribute(anyString(), anyLong())).thenReturn(span); + when(span.makeCurrent()).thenReturn(mock(Scope.class)); + + AtomicBoolean executed = new AtomicBoolean(false); + + traceContext.withQueueSpan( + "test-operation", List.of(metadata), "test-topic", () -> executed.set(true)); + + assertTrue(executed.get()); + verify(mockSpanBuilder, atLeast(1)).startSpan(); + verify(span, atLeast(1)).end(); + } + + @Test + public void testWithProducerTrace() { + SystemMetadata systemMetadata = new SystemMetadata(); + + io.opentelemetry.api.trace.SpanBuilder mockSpanBuilder = + mock(io.opentelemetry.api.trace.SpanBuilder.class); + when(mockSpanBuilder.setParent(any(Context.class))).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setSpanKind(any())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setAttribute(anyString(), anyString())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.setAttribute(anyString(), anyLong())).thenReturn(mockSpanBuilder); + when(mockSpanBuilder.startSpan()).thenReturn(span); + + when(tracer.spanBuilder(anyString())).thenReturn(mockSpanBuilder); + when(span.setAttribute(anyString(), anyString())).thenReturn(span); + when(span.setAttribute(anyString(), anyLong())).thenReturn(span); + when(span.getSpanContext()).thenReturn(spanContext); + when(spanContext.getSpanId()).thenReturn("test-span-id"); + when(spanContext.getTraceId()).thenReturn("test-trace-id"); + when(spanContext.isValid()).thenReturn(true); + + try (var mockedStatic = mockStatic(Span.class)) { + mockedStatic.when(Span::current).thenReturn(span); + + SystemMetadata result = + traceContext.withProducerTrace("test-operation", systemMetadata, "test-topic"); + + assertNotNull(result.getProperties()); + assertTrue(result.getProperties().containsKey(TraceContext.TELEMETRY_TRACE_KEY)); + assertTrue(result.getProperties().containsKey(TraceContext.TELEMETRY_QUEUE_SPAN_KEY)); + assertTrue(result.getProperties().containsKey(TraceContext.TELEMETRY_LOG_KEY)); + assertTrue(result.getProperties().containsKey(TraceContext.TELEMETRY_ENQUEUED_AT)); + verify(mockSpanBuilder).startSpan(); + verify(span).end(); + } + } +} diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java 
b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index ce9c636be16ac7..74232efc84d9c6 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -321,6 +321,7 @@ public void setupTest() throws Exception { mock(IndexConvention.class), mock(RetrieverContext.class), mock(ValidationContext.class), + null, true); _dataHubAuthorizer = diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 69b86962442b91..43449a7e70ad85 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -1,3 +1,6 @@ +# Name of the data hub component or container (used for tracing) +spring.application.name: ${APPLICATION_NAME:datahub-gms} + # The base URL where DataHub is accessible to users. baseUrl: ${DATAHUB_BASE_URL:http://localhost:9002} diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 501c46d64d6f9c..3fbbcebdae7b04 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -51,6 +51,11 @@ dependencies { implementation externalDependency.jline implementation externalDependency.commonsIo + implementation externalDependency.opentelemetryApi + implementation externalDependency.opentelemetrySdk + implementation externalDependency.opentelemetrySdkTrace + implementation externalDependency.opentelemetryAutoConfig + testImplementation externalDependency.springBootTest testImplementation externalDependency.mockito testImplementation externalDependency.testng diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java index 136c31aa0693da..0de389834927c7 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.graph.neo4j.Neo4jGraphService; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.LineageRegistry; +import io.datahubproject.metadata.context.OperationContext; import javax.annotation.Nonnull; import org.neo4j.driver.Driver; import org.neo4j.driver.SessionConfig; @@ -28,9 +29,14 @@ public class Neo4jGraphServiceFactory { @Bean(name = "graphService") @Nonnull - protected GraphService getInstance(final EntityRegistry entityRegistry) { + protected GraphService getInstance( + @Qualifier("systemOperationContext") OperationContext systemOperationContext, + final EntityRegistry entityRegistry) { LineageRegistry lineageRegistry = new LineageRegistry(entityRegistry); return new Neo4jGraphService( - lineageRegistry, neo4jDriver, SessionConfig.forDatabase(neo4jDatabase)); + systemOperationContext, + lineageRegistry, + neo4jDriver, + SessionConfig.forDatabase(neo4jDatabase)); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index 
78107cc0ecc900..f5e26714a7f6aa 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -16,6 +16,7 @@ import io.datahubproject.metadata.context.OperationContextConfig; import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; +import io.datahubproject.metadata.context.TraceContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.metadata.services.RestrictedService; import javax.annotation.Nonnull; @@ -46,7 +47,8 @@ protected OperationContext javaSystemOperationContext( @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, @Nonnull final ConfigurationProvider configurationProvider, - @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) { + @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient, + @Nonnull final TraceContext traceContext) { EntityServiceAspectRetriever entityServiceAspectRetriever = EntityServiceAspectRetriever.builder() @@ -80,6 +82,7 @@ protected OperationContext javaSystemOperationContext( .alternateValidation( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build(), + traceContext, configurationProvider.getAuthentication().isEnforceExistenceEnabled()); entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); @@ -109,7 +112,8 @@ protected OperationContext restliSystemOperationContext( @Nonnull final SearchService searchService, @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, - @Nonnull final ConfigurationProvider configurationProvider) { + @Nonnull final ConfigurationProvider configurationProvider, + @Nonnull final TraceContext traceContext) { EntityClientAspectRetriever entityClientAspectRetriever = EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); @@ -136,6 +140,7 @@ protected OperationContext restliSystemOperationContext( .alternateValidation( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build(), + traceContext, configurationProvider.getAuthentication().isEnforceExistenceEnabled()); entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/common/AdminClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/common/AdminClientFactory.java new file mode 100644 index 00000000000000..53b3a86011e7e0 --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/common/AdminClientFactory.java @@ -0,0 +1,30 @@ +package com.linkedin.gms.factory.kafka.common; + +import com.linkedin.metadata.config.kafka.KafkaConfiguration; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.KafkaAdminClient; +import org.springframework.boot.autoconfigure.kafka.KafkaProperties; + +public class AdminClientFactory { + public static AdminClient buildKafkaAdminClient( + KafkaConfiguration kafkaConfiguration, + final KafkaProperties kafkaProperties, + String clientId) { + Map 
adminProperties = new HashMap<>(kafkaProperties.buildAdminProperties(null)); + adminProperties.put(AdminClientConfig.CLIENT_ID_CONFIG, clientId); + + // KAFKA_BOOTSTRAP_SERVER has precedence over SPRING_KAFKA_BOOTSTRAP_SERVERS + if (kafkaConfiguration.getBootstrapServers() != null + && !kafkaConfiguration.getBootstrapServers().isEmpty()) { + adminProperties.put( + AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, + Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); + } // else we rely on KafkaProperties which defaults to localhost:9092 or environment variables + + return KafkaAdminClient.create(adminProperties); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaThrottleFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaThrottleFactory.java index e2cdca8a065c03..f6d9fcefe46461 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaThrottleFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaThrottleFactory.java @@ -1,5 +1,7 @@ package com.linkedin.gms.factory.kafka.throttle; +import static com.linkedin.gms.factory.kafka.common.AdminClientFactory.buildKafkaAdminClient; + import com.datahub.metadata.dao.throttle.KafkaThrottleSensor; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.metadata.config.MetadataChangeProposalConfig; @@ -8,13 +10,7 @@ import com.linkedin.metadata.dao.throttle.ThrottleSensor; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.Topics; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; import lombok.extern.slf4j.Slf4j; -import org.apache.kafka.clients.admin.AdminClient; -import org.apache.kafka.clients.admin.AdminClientConfig; -import org.apache.kafka.clients.admin.KafkaAdminClient; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.kafka.KafkaProperties; @@ -47,7 +43,7 @@ public ThrottleSensor kafkaThrottle( if (mcpConfig.getThrottle().getUpdateIntervalMs() > 0) { return KafkaThrottleSensor.builder() .entityRegistry(entityRegistry) - .kafkaAdmin(kafkaAdmin(kafkaConfiguration, kafkaProperties)) + .kafkaAdmin(buildKafkaAdminClient(kafkaConfiguration, kafkaProperties, "throttle-sensor")) .config(mcpConfig.getThrottle()) .mclConsumerGroupId(maeConsumerGroupId) .timeseriesTopicName(timeseriesTopicName) @@ -58,19 +54,4 @@ public ThrottleSensor kafkaThrottle( return new NoOpSensor(); } } - - private static AdminClient kafkaAdmin( - KafkaConfiguration kafkaConfiguration, final KafkaProperties kafkaProperties) { - Map adminProperties = new HashMap<>(kafkaProperties.buildAdminProperties(null)); - - // KAFKA_BOOTSTRAP_SERVER has precedence over SPRING_KAFKA_BOOTSTRAP_SERVERS - if (kafkaConfiguration.getBootstrapServers() != null - && !kafkaConfiguration.getBootstrapServers().isEmpty()) { - adminProperties.put( - AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, - Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); - } // else we rely on KafkaProperties which defaults to localhost:9092 or environment variables - - return KafkaAdminClient.create(adminProperties); - } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/trace/KafkaTraceReaderFactory.java 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/trace/KafkaTraceReaderFactory.java new file mode 100644 index 00000000000000..6ba7dedce8ff50 --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/trace/KafkaTraceReaderFactory.java @@ -0,0 +1,196 @@ +package com.linkedin.gms.factory.kafka.trace; + +import static com.linkedin.gms.factory.kafka.common.AdminClientFactory.buildKafkaAdminClient; +import static com.linkedin.mxe.ConsumerGroups.MCP_CONSUMER_GROUP_ID_VALUE; + +import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.metadata.trace.MCLTraceReader; +import com.linkedin.metadata.trace.MCPFailedTraceReader; +import com.linkedin.metadata.trace.MCPTraceReader; +import com.linkedin.mxe.Topics; +import jakarta.annotation.PreDestroy; +import java.util.Properties; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.kafka.KafkaProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.kafka.core.DefaultKafkaConsumerFactory; + +@Configuration +public class KafkaTraceReaderFactory { + private static final Properties TRACE_CONSUMER_PROPERTIES = new Properties(); + + static { + TRACE_CONSUMER_PROPERTIES.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); + } + + @Value("${trace.pollMaxAttempts:5}") + private int pollMaxAttempts; + + @Value("${trace.pollDurationMs:1000}") + private int pollDurationMs; + + @Value(MCP_CONSUMER_GROUP_ID_VALUE) + private String mceConsumerGroupId; + + @Value("${METADATA_CHANGE_PROPOSAL_TOPIC_NAME:" + Topics.METADATA_CHANGE_PROPOSAL + "}") + private String mcpTopicName; + + @Value( + "${FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME:" + + Topics.FAILED_METADATA_CHANGE_PROPOSAL + + "}") + private String mcpFailedTopicName; + + @Value("${METADATA_CHANGE_LOG_KAFKA_CONSUMER_GROUP_ID:generic-mae-consumer-job-client}") + private String maeConsumerGroupId; + + @Value("${METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME:" + Topics.METADATA_CHANGE_LOG_VERSIONED + "}") + private String mclVersionedTopicName; + + @Value( + "${METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME:" + Topics.METADATA_CHANGE_LOG_TIMESERIES + "}") + private String mclTimeseriesTopicName; + + @Value("${trace.executor.thread-pool-size:10}") + private int threadPoolSize; + + @Value("${trace.executor.shutdown-timeout-seconds:60}") + private int shutdownTimeoutSeconds; + + @Value("${trace.timeout-seconds:30}") + private long traceTimeoutSeconds; + + @Bean("traceAdminClient") + public AdminClient traceAdminClient( + @Qualifier("configurationProvider") ConfigurationProvider provider, + final KafkaProperties kafkaProperties) { + return buildKafkaAdminClient(provider.getKafka(), kafkaProperties, "trace-reader"); + } + + private ExecutorService traceExecutorService; + + @Bean("traceExecutorService") + public ExecutorService traceExecutorService() { + traceExecutorService = Executors.newFixedThreadPool(threadPoolSize); + return traceExecutorService; + } + + @Bean("mcpTraceReader") + public MCPTraceReader 
mcpTraceReader( + @Qualifier("traceAdminClient") AdminClient adminClient, + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("traceExecutorService") ExecutorService traceExecutorService) { + return MCPTraceReader.builder() + .adminClient(adminClient) + .topicName(mcpTopicName) + .consumerGroupId(mceConsumerGroupId) + .consumerSupplier( + () -> createConsumerWithUniqueId(kafkaConsumerFactory, "trace-reader-mcp")) + .pollDurationMs(pollDurationMs) + .pollMaxAttempts(pollMaxAttempts) + .timeoutSeconds(traceTimeoutSeconds) + .executorService(traceExecutorService) + .build(); + } + + @Bean("mcpFailedTraceReader") + public MCPFailedTraceReader mcpFailedTraceReader( + @Qualifier("traceAdminClient") AdminClient adminClient, + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("traceExecutorService") ExecutorService traceExecutorService) { + return MCPFailedTraceReader.builder() + .adminClient(adminClient) + .topicName(mcpFailedTopicName) + .consumerSupplier( + () -> createConsumerWithUniqueId(kafkaConsumerFactory, "trace-reader-mcp-failed")) + .pollDurationMs(pollDurationMs) + .pollMaxAttempts(pollMaxAttempts) + .timeoutSeconds(traceTimeoutSeconds) + .executorService(traceExecutorService) + .build(); + } + + @Bean("mclVersionedTraceReader") + public MCLTraceReader mclVersionedTraceReader( + @Qualifier("traceAdminClient") AdminClient adminClient, + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("traceExecutorService") ExecutorService traceExecutorService) { + return MCLTraceReader.builder() + .adminClient(adminClient) + .topicName(mclVersionedTopicName) + .consumerGroupId(maeConsumerGroupId) + .consumerSupplier( + () -> createConsumerWithUniqueId(kafkaConsumerFactory, "trace-reader-mcl-versioned")) + .pollDurationMs(pollDurationMs) + .pollMaxAttempts(pollMaxAttempts) + .timeoutSeconds(traceTimeoutSeconds) + .executorService(traceExecutorService) + .build(); + } + + @Bean("mclTimeseriesTraceReader") + public MCLTraceReader mclTimeseriesTraceReader( + @Qualifier("traceAdminClient") AdminClient adminClient, + @Qualifier("kafkaConsumerFactory") + DefaultKafkaConsumerFactory kafkaConsumerFactory, + @Qualifier("traceExecutorService") ExecutorService traceExecutorService) { + return MCLTraceReader.builder() + .adminClient(adminClient) + .topicName(mclTimeseriesTopicName) + .consumerGroupId(maeConsumerGroupId) + .consumerSupplier( + () -> createConsumerWithUniqueId(kafkaConsumerFactory, "trace-reader-mcl-timeseries")) + .pollDurationMs(pollDurationMs) + .pollMaxAttempts(pollMaxAttempts) + .timeoutSeconds(traceTimeoutSeconds) + .executorService(traceExecutorService) + .build(); + } + + private Consumer createConsumerWithUniqueId( + DefaultKafkaConsumerFactory kafkaConsumerFactory, + String baseClientId) { + Properties consumerProps = new Properties(); + consumerProps.putAll(TRACE_CONSUMER_PROPERTIES); + // Add a unique suffix to the client.id + consumerProps.put( + ConsumerConfig.CLIENT_ID_CONFIG, + baseClientId + "-" + Thread.currentThread().getId() + "-" + System.nanoTime()); + + return kafkaConsumerFactory.createConsumer( + baseClientId, // groupId prefix + null, // groupId suffix (using default) + null, // assignor + consumerProps); + } + + @PreDestroy + public void shutdown() { + if (traceExecutorService != null) { + traceExecutorService.shutdown(); + try { + if (!traceExecutorService.awaitTermination(shutdownTimeoutSeconds, TimeUnit.SECONDS)) { + 
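// Graceful shutdown timed out: force-cancel in-flight trace reads and report if the pool still refuses to stop. +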
traceExecutorService.shutdownNow(); + if (!traceExecutorService.awaitTermination(shutdownTimeoutSeconds, TimeUnit.SECONDS)) { + System.err.println("ExecutorService did not terminate"); + } + } + } catch (InterruptedException e) { + traceExecutorService.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/system_telemetry/OpenTelemetryBaseFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/system_telemetry/OpenTelemetryBaseFactory.java new file mode 100644 index 00000000000000..c6b3219a623e9c --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/system_telemetry/OpenTelemetryBaseFactory.java @@ -0,0 +1,81 @@ +package com.linkedin.gms.factory.system_telemetry; + +import com.linkedin.metadata.utils.metrics.MetricSpanExporter; +import io.datahubproject.metadata.context.TraceContext; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.api.trace.propagation.W3CTraceContextPropagator; +import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; +import io.opentelemetry.sdk.resources.Resource; +import io.opentelemetry.sdk.trace.export.BatchSpanProcessor; +import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor; +import java.util.HashMap; +import java.util.Map; + +/** Common System OpenTelemetry */ +public abstract class OpenTelemetryBaseFactory { + private static final AttributeKey SERVICE_NAME = AttributeKey.stringKey("service.name"); + + protected abstract String getApplicationComponent(); + + protected TraceContext traceContext() { + return TraceContext.builder().tracer(tracer(openTelemetry())).build(); + } + + private Tracer tracer(OpenTelemetry openTelemetry) { + return openTelemetry.getTracer(getApplicationComponent()); + } + + private OpenTelemetry openTelemetry() { + return AutoConfiguredOpenTelemetrySdk.builder() + .addPropertiesCustomizer( + (configProperties) -> { + Map props = new HashMap<>(); + // override exporters to "none" if not specified + Map.of( + "OTEL_METRICS_EXPORTER", "otel.metrics.exporter", + "OTEL_TRACES_EXPORTER", "otel.traces.exporter", + "OTEL_LOGS_EXPORTER", "otel.logs.exporter") + .forEach( + (envVar, propKey) -> { + String value = System.getenv(envVar); + if (value == null || value.trim().isEmpty()) { + props.put(propKey, "none"); + } + }); + + return props; + }) + .addTracerProviderCustomizer( + (sdkTracerProviderBuilder, configProperties) -> + sdkTracerProviderBuilder + .addSpanProcessor(SimpleSpanProcessor.create(TraceContext.LOG_SPAN_EXPORTER)) + .addSpanProcessor(BatchSpanProcessor.builder(new MetricSpanExporter()).build()) + .setIdGenerator(TraceContext.TRACE_ID_GENERATOR) + .setResource( + Resource.getDefault() + .merge( + Resource.create( + Attributes.of(SERVICE_NAME, getApplicationComponent()))))) + .addPropagatorCustomizer( + (existingPropagator, configProperties) -> { + // If OTEL_PROPAGATORS is not set or doesn't include tracecontext, + // return W3C propagator, otherwise keep existing + String propagators = configProperties.getString("OTEL_PROPAGATORS"); + return (propagators == null || !propagators.contains("tracecontext")) + ? 
W3CTraceContextPropagator.getInstance() + : existingPropagator; + }) + .addMetricExporterCustomizer( + (metricExporter, configProperties) -> { + String metricsExporter = configProperties.getString("OTEL_METRICS_EXPORTER"); + return (metricsExporter == null || metricsExporter.trim().isEmpty()) + ? null // Return null to disable the exporter + : metricExporter; + }) + .build() + .getOpenTelemetrySdk(); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/trace/TraceServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/trace/TraceServiceFactory.java new file mode 100644 index 00000000000000..aadce86036f6b2 --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/trace/TraceServiceFactory.java @@ -0,0 +1,37 @@ +package com.linkedin.gms.factory.trace; + +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.systemmetadata.SystemMetadataService; +import com.linkedin.metadata.systemmetadata.TraceService; +import com.linkedin.metadata.trace.MCLTraceReader; +import com.linkedin.metadata.trace.MCPFailedTraceReader; +import com.linkedin.metadata.trace.MCPTraceReader; +import com.linkedin.metadata.trace.TraceServiceImpl; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class TraceServiceFactory { + + @Bean + public TraceService traceService( + @Qualifier("entityRegistry") EntityRegistry entityRegistry, + @Qualifier("entityService") EntityService entityService, + @Qualifier("systemMetadataService") SystemMetadataService systemMetadataService, + @Qualifier("mcpTraceReader") MCPTraceReader mcpTraceReader, + @Qualifier("mcpFailedTraceReader") MCPFailedTraceReader mcpFailedTraceReader, + @Qualifier("mclVersionedTraceReader") MCLTraceReader mclVersionedTraceReader, + @Qualifier("mclTimeseriesTraceReader") MCLTraceReader mclTimeseriesTraceReader) { + return TraceServiceImpl.builder() + .entityRegistry(entityRegistry) + .entityService(entityService) + .systemMetadataService(systemMetadataService) + .mcpTraceReader(mcpTraceReader) + .mcpFailedTraceReader(mcpFailedTraceReader) + .mclVersionedTraceReader(mclVersionedTraceReader) + .mclTimeseriesTraceReader(mclTimeseriesTraceReader) + .build(); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/kafka/DataHubUpgradeKafkaListener.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/kafka/DataHubUpgradeKafkaListener.java index e69ab342740e43..d95d9304b8514a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/kafka/DataHubUpgradeKafkaListener.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/kafka/DataHubUpgradeKafkaListener.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.boot.kafka; -import com.codahale.metrics.Timer; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.boot.dependencies.BootstrapDependency; @@ -8,6 +7,7 @@ import com.linkedin.metadata.version.GitVersion; import com.linkedin.mxe.DataHubUpgradeHistoryEvent; import com.linkedin.mxe.Topics; +import io.datahubproject.metadata.context.OperationContext; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; @@ -42,13 +42,13 
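Note on the factory above: OpenTelemetryBaseFactory is abstract so that each service component can register its own service.name while sharing the exporter, propagator, and trace-id-generator wiring. A minimal subclass might look like the following sketch; the class name and component string are illustrative only and not part of this patch.

    package com.linkedin.gms.factory.system_telemetry;

    import io.datahubproject.metadata.context.TraceContext;
    import org.springframework.context.annotation.Bean;
    import org.springframework.context.annotation.Configuration;

    // Hypothetical component-specific configuration reusing the shared OpenTelemetry setup.
    @Configuration
    public class ExampleOpenTelemetryConfig extends OpenTelemetryBaseFactory {

      @Override
      protected String getApplicationComponent() {
        // Used as both the tracer name and the service.name resource attribute.
        return "example-component";
      }

      @Bean
      public TraceContext traceContext() {
        // Builds the autoconfigured SDK with the log/metric span processors configured above.
        return super.traceContext();
      }
    }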
@@ public class DataHubUpgradeKafkaListener implements ConsumerSeekAware, Bootstrap public static final String TOPIC_NAME = "${DATAHUB_UPGRADE_HISTORY_TOPIC_NAME:" + Topics.DATAHUB_UPGRADE_HISTORY_TOPIC_NAME + "}"; - private final DefaultKafkaConsumerFactory _defaultKafkaConsumerFactory; + private final DefaultKafkaConsumerFactory defaultKafkaConsumerFactory; @Value("#{systemEnvironment['DATAHUB_REVISION'] ?: '0'}") private String revision; - private final GitVersion _gitVersion; - private final ConfigurationProvider _configurationProvider; + private final GitVersion gitVersion; + private final ConfigurationProvider configurationProvider; @Value(CONSUMER_GROUP) private String consumerGroup; @@ -56,6 +56,8 @@ public class DataHubUpgradeKafkaListener implements ConsumerSeekAware, Bootstrap @Value(TOPIC_NAME) private String topicName; + private final OperationContext systemOperationContext; + private static final AtomicBoolean IS_UPDATED = new AtomicBoolean(false); public DataHubUpgradeKafkaListener( @@ -63,11 +65,13 @@ public DataHubUpgradeKafkaListener( @Qualifier("duheKafkaConsumerFactory") DefaultKafkaConsumerFactory defaultKafkaConsumerFactory, GitVersion gitVersion, - ConfigurationProvider configurationProvider) { + ConfigurationProvider configurationProvider, + @Qualifier("systemOperationContext") OperationContext operationContext) { this.registry = registry; - this._defaultKafkaConsumerFactory = defaultKafkaConsumerFactory; - this._gitVersion = gitVersion; - this._configurationProvider = configurationProvider; + this.defaultKafkaConsumerFactory = defaultKafkaConsumerFactory; + this.gitVersion = gitVersion; + this.configurationProvider = configurationProvider; + this.systemOperationContext = operationContext; } // Constructs a consumer to read determine final offset to assign, prevents re-reading whole topic @@ -76,7 +80,7 @@ public DataHubUpgradeKafkaListener( public void onPartitionsAssigned( Map assignments, ConsumerSeekCallback callback) { try (Consumer kafkaConsumer = - _defaultKafkaConsumerFactory.createConsumer(consumerGroup, SUFFIX)) { + defaultKafkaConsumerFactory.createConsumer(consumerGroup, SUFFIX)) { final Map offsetMap = kafkaConsumer.endOffsets(assignments.keySet()); assignments.entrySet().stream() .filter(entry -> topicName.equals(entry.getKey().topic())) @@ -99,44 +103,49 @@ public void onPartitionsAssigned( containerFactory = "duheKafkaEventConsumer", concurrency = "1") public void checkSystemVersion(final ConsumerRecord consumerRecord) { - try (Timer.Context i = MetricUtils.timer(this.getClass(), "checkSystemVersion").time()) { - final GenericRecord record = consumerRecord.value(); - final String expectedVersion = String.format("%s-%s", _gitVersion.getVersion(), revision); - - DataHubUpgradeHistoryEvent event; - try { - event = EventUtils.avroToPegasusDUHE(record); - log.info("Latest system update version: {}", event.getVersion()); - if (expectedVersion.equals(event.getVersion())) { - IS_UPDATED.getAndSet(true); - } else if (!_configurationProvider.getSystemUpdate().isWaitForSystemUpdate()) { - log.warn("Wait for system update is disabled. Proceeding with startup."); - IS_UPDATED.getAndSet(true); - } else { - log.warn( - "System version is not up to date: {}. 
Waiting for datahub-upgrade to complete...", - expectedVersion); - } - } catch (Exception e) { - MetricUtils.counter(this.getClass(), "avro_to_pegasus_conversion_failure").inc(); - log.error("Error deserializing message due to: ", e); - log.error("Message: {}", record.toString()); - return; - } - } + systemOperationContext.withSpan( + "checkSystemVersion", + () -> { + final GenericRecord record = consumerRecord.value(); + final String expectedVersion = String.format("%s-%s", gitVersion.getVersion(), revision); + + DataHubUpgradeHistoryEvent event; + try { + event = EventUtils.avroToPegasusDUHE(record); + log.info("Latest system update version: {}", event.getVersion()); + if (expectedVersion.equals(event.getVersion())) { + IS_UPDATED.getAndSet(true); + } else if (!configurationProvider.getSystemUpdate().isWaitForSystemUpdate()) { + log.warn("Wait for system update is disabled. Proceeding with startup."); + IS_UPDATED.getAndSet(true); + } else { + log.warn( + "System version is not up to date: {}. Waiting for datahub-upgrade to complete...", + expectedVersion); + } + + } catch (Exception e) { + MetricUtils.counter(this.getClass(), "avro_to_pegasus_conversion_failure").inc(); + log.error("Error deserializing message due to: ", e); + log.error("Message: {}", record.toString()); + return; + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "checkSystemVersion")); } public void waitForUpdate() { - if (!_configurationProvider.getSystemUpdate().isWaitForSystemUpdate()) { + if (!configurationProvider.getSystemUpdate().isWaitForSystemUpdate()) { log.warn("Wait for system update is disabled. Proceeding with startup."); IS_UPDATED.getAndSet(true); } - int maxBackOffs = Integer.parseInt(_configurationProvider.getSystemUpdate().getMaxBackOffs()); + int maxBackOffs = Integer.parseInt(configurationProvider.getSystemUpdate().getMaxBackOffs()); long initialBackOffMs = - Long.parseLong(_configurationProvider.getSystemUpdate().getInitialBackOffMs()); + Long.parseLong(configurationProvider.getSystemUpdate().getInitialBackOffMs()); int backOffFactor = - Integer.parseInt(_configurationProvider.getSystemUpdate().getBackOffFactor()); + Integer.parseInt(configurationProvider.getSystemUpdate().getBackOffFactor()); long backOffMs = initialBackOffMs; for (int i = 0; i < maxBackOffs; i++) { diff --git a/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java b/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java index 615ef985ca05d7..ebd622a1f0cce1 100644 --- a/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java +++ b/metadata-service/openapi-analytics-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIAnalyticsTestConfiguration.java @@ -20,11 +20,13 @@ import org.mockito.Mockito; import org.opensearch.action.search.SearchResponse; import org.springframework.boot.test.context.TestConfiguration; +import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Primary; @TestConfiguration public class OpenAPIAnalyticsTestConfiguration { + @MockBean TracingInterceptor tracingInterceptor; @Bean(name = "systemOperationContext") public OperationContext systemOperationContext() { diff --git 
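The checkSystemVersion change above illustrates the conversion pattern used throughout this patch: the Dropwizard Timer.Context block is replaced by OperationContext.withSpan, with what appears to be the legacy Dropwizard metric name passed through as a trailing attribute pair. A rough sketch of the same conversion applied to another block follows; the method and metric names are illustrative, this is a fragment rather than a complete class, and the attribute semantics are inferred only from the usage shown here.

    // Before: try (Timer.Context ctx = MetricUtils.timer(this.getClass(), "processEvent").time()) { process(record); }
    // After (span-based, mirroring the call shape above):
    systemOperationContext.withSpan(
        "processEvent",
        () -> process(record),
        MetricUtils.DROPWIZARD_NAME,
        MetricUtils.name(this.getClass(), "processEvent"));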
a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java index 075501c1a10711..305a91072c683d 100644 --- a/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java +++ b/metadata-service/openapi-entity-servlet/src/test/java/io/datahubproject/openapi/config/OpenAPIEntityTestConfiguration.java @@ -28,6 +28,7 @@ import com.linkedin.metadata.systemmetadata.SystemMetadataService; import com.linkedin.metadata.timeline.TimelineService; import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.TraceContext; import io.datahubproject.openapi.dto.UrnResponseMap; import io.datahubproject.openapi.generated.EntityResponse; import io.datahubproject.openapi.v1.entities.EntitiesController; @@ -47,6 +48,13 @@ @TestConfiguration public class OpenAPIEntityTestConfiguration { + @MockBean TraceContext traceContext; + + @Bean + public TracingInterceptor tracingInterceptor(final TraceContext traceContext) { + return new TracingInterceptor(traceContext); + } + @Bean public ObjectMapper objectMapper() { return new ObjectMapper(new YAMLFactory()); diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle index 77679790f25dea..59dbd2408ccdd9 100644 --- a/metadata-service/openapi-servlet/build.gradle +++ b/metadata-service/openapi-servlet/build.gradle @@ -36,6 +36,10 @@ dependencies { annotationProcessor externalDependency.lombok + implementation externalDependency.opentelemetryApi + implementation externalDependency.opentelemetrySdk + implementation externalDependency.opentelemetrySdkTrace + testImplementation externalDependency.springBootTest testImplementation project(':mock-entity-registry') testImplementation externalDependency.springBoot diff --git a/metadata-service/openapi-servlet/models/build.gradle b/metadata-service/openapi-servlet/models/build.gradle index d75e656e5ecd6c..dbc51ca17e3388 100644 --- a/metadata-service/openapi-servlet/models/build.gradle +++ b/metadata-service/openapi-servlet/models/build.gradle @@ -6,6 +6,7 @@ dependencies { implementation project(':entity-registry') implementation project(':metadata-operation-context') implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:services') implementation externalDependency.jacksonDataBind implementation externalDependency.httpClient diff --git a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceRequestV1.java b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceRequestV1.java new file mode 100644 index 00000000000000..8fe811f23d958b --- /dev/null +++ b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceRequestV1.java @@ -0,0 +1,17 @@ +package io.datahubproject.openapi.v1.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.linkedin.common.urn.Urn; +import java.util.LinkedHashMap; +import java.util.List; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; + +@EqualsAndHashCode(callSuper = true) +@Data +@Builder +@JsonInclude(JsonInclude.Include.NON_NULL) +@AllArgsConstructor +public class TraceRequestV1 extends LinkedHashMap> {} diff --git 
a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceResponseV1.java b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceResponseV1.java new file mode 100644 index 00000000000000..5fc721c2812d41 --- /dev/null +++ b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v1/models/TraceResponseV1.java @@ -0,0 +1,22 @@ +package io.datahubproject.openapi.v1.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.systemmetadata.TraceStatus; +import java.util.LinkedHashMap; +import java.util.Map; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; + +@EqualsAndHashCode(callSuper = true) +@Data +@Builder +@JsonInclude(JsonInclude.Include.NON_NULL) +@AllArgsConstructor +public class TraceResponseV1 extends LinkedHashMap> { + public TraceResponseV1(Map> m) { + super(m); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java index c4b4431e77c4ef..3fe1399017e981 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java @@ -26,6 +26,7 @@ import java.util.stream.Stream; import javax.annotation.Nonnull; import org.springdoc.core.models.GroupedOpenApi; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -36,6 +37,7 @@ import org.springframework.http.converter.StringHttpMessageConverter; import org.springframework.http.converter.json.MappingJackson2HttpMessageConverter; import org.springframework.web.servlet.config.annotation.AsyncSupportConfigurer; +import org.springframework.web.servlet.config.annotation.InterceptorRegistry; import org.springframework.web.servlet.config.annotation.WebMvcConfigurer; @OpenAPIDefinition( @@ -55,6 +57,8 @@ public class SpringWebConfig implements WebMvcConfigurer { @Value("${datahub.gms.async.request-timeout-ms}") private long asyncTimeoutMilliseconds; + @Autowired private TracingInterceptor tracingInterceptor; + @Override public void configureMessageConverters(List> messageConverters) { messageConverters.add(new StringHttpMessageConverter()); @@ -173,4 +177,9 @@ public void configureAsyncSupport(@Nonnull AsyncSupportConfigurer configurer) { WebMvcConfigurer.super.configureAsyncSupport(configurer); configurer.setDefaultTimeout(asyncTimeoutMilliseconds); } + + @Override + public void addInterceptors(InterceptorRegistry registry) { + registry.addInterceptor(tracingInterceptor).addPathPatterns("/**"); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/TracingInterceptor.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/TracingInterceptor.java new file mode 100644 index 00000000000000..0ca913b7c65cfc --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/TracingInterceptor.java @@ -0,0 +1,93 @@ +package io.datahubproject.openapi.config; + +import io.datahubproject.metadata.context.TraceContext; 
+import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import javax.annotation.Nullable; +import org.slf4j.MDC; +import org.springframework.stereotype.Component; +import org.springframework.web.servlet.HandlerInterceptor; + +@Component +public class TracingInterceptor implements HandlerInterceptor { + @Nullable private final Tracer tracer; + + public TracingInterceptor(final TraceContext traceContext) { + this.tracer = traceContext.getTracer(); + } + + @Override + public boolean preHandle( + HttpServletRequest request, HttpServletResponse response, Object handler) { + + if (tracer != null) { + String spanName = request.getMethod() + " " + request.getRequestURI(); + Span span = + tracer + .spanBuilder(spanName) + .setAttribute("http.method", request.getMethod()) + .setAttribute("http.url", request.getRequestURI()) + .setParent(Context.root()) + .startSpan(); + + request.setAttribute("span", span); + span.makeCurrent(); + + TraceContext.enableLogTracing(request); + + if (span.getSpanContext().isValid()) { + SpanContext spanContext = span.getSpanContext(); + String traceId = spanContext.getTraceId(); + String spanId = spanContext.getSpanId(); + + // W3C Trace Context format + String flags = spanContext.getTraceFlags().isSampled() ? "01" : "00"; + response.setHeader("traceparent", String.format("00-%s-%s-%s", traceId, spanId, flags)); + + if (TraceContext.isLogTracingEnabled()) { + // Add trace context to MDC for logging + MDC.put("telemetryId", String.format("[%s-%s] ", traceId, spanId)); + } + } + } + + return true; + } + + @Override + public void afterCompletion( + HttpServletRequest request, HttpServletResponse response, Object handler, Exception ex) { + + if (tracer != null) { + Span span = (Span) request.getAttribute("span"); + if (span != null) { + try { + span.setAttribute("http.status_code", response.getStatus()); + + if (ex != null) { + span.setStatus(StatusCode.ERROR); + span.recordException(ex); + } else { + if (response.getStatus() >= 400) { + span.setStatus(StatusCode.ERROR); + } else { + span.setStatus(StatusCode.OK); + } + } + } finally { + span.end(); + } + } + + if (TraceContext.isLogTracingEnabled()) { + TraceContext.clear(); + MDC.clear(); + } + } + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 592d7bba4211fe..d3c67355d0f85d 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -55,6 +55,7 @@ import io.datahubproject.openapi.models.GenericAspect; import io.datahubproject.openapi.models.GenericEntity; import io.datahubproject.openapi.models.GenericEntityScrollResult; +import io.datahubproject.openapi.util.RequestInputUtil; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import jakarta.servlet.http.HttpServletRequest; @@ -128,7 +129,8 @@ protected List buildEntityList( throws URISyntaxException { LinkedHashMap> aspectSpecMap = - resolveAspectSpecs( + 
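The interceptor above echoes the active span back to callers as a standard W3C traceparent response header of the form 00-<traceId>-<spanId>-<flags>, and the trace-status endpoint added later in this patch accepts either that full header value or the bare 32-character trace id. A small client-side sketch for capturing the id; the helper is hypothetical and not part of the patch.

    import java.net.http.HttpResponse;

    final class TraceIds {
      private TraceIds() {}

      // "00-<32 hex trace id>-<16 hex span id>-<flags>" -> trace id, or null if absent/malformed.
      static String traceIdFrom(HttpResponse<?> response) {
        return response.headers().firstValue("traceparent")
            .map(tp -> tp.split("-"))
            .filter(parts -> parts.length >= 2)
            .map(parts -> parts[1])
            .orElse(null);
      }
    }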
RequestInputUtil.resolveAspectSpecs( + entityRegistry, urns.stream() .map( urn -> @@ -398,8 +400,11 @@ public ResponseEntity getAspect( buildEntityVersionedAspectList( opContext, List.of(urn), - resolveAspectSpecs( - new LinkedHashMap<>(Map.of(urn, Map.of(aspectName, version))), 0L, true), + RequestInputUtil.resolveAspectSpecs( + entityRegistry, + new LinkedHashMap<>(Map.of(urn, Map.of(aspectName, version))), + 0L, + true), withSystemMetadata, true); } @@ -634,7 +639,7 @@ public ResponseEntity createAspect( authentication.getActor().toUrnStr() + " is unauthorized to " + CREATE + " entities."); } - AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); + AspectSpec aspectSpec = RequestInputUtil.lookupAspectSpec(entitySpec, aspectName).get(); ChangeMCP upsert = toUpsertItem( opContext.getRetrieverContext().getAspectRetriever(), @@ -713,7 +718,7 @@ public ResponseEntity patchAspect( authentication.getActor().toUrnStr() + " is unauthorized to " + UPDATE + " entities."); } - AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); + AspectSpec aspectSpec = RequestInputUtil.lookupAspectSpec(entitySpec, aspectName).get(); RecordTemplate currentValue = entityService.getAspect(opContext, urn, aspectSpec.getName(), 0); GenericPatchTemplate genericPatchTemplate = @@ -761,69 +766,6 @@ protected Boolean exists( opContext, urn, aspect, includeSoftDelete != null ? includeSoftDelete : false); } - /** - * Given a map with aspect names from the API, normalized them into actual aspect names (casing - * fixes) - * - * @param requestedAspectNames requested aspects - * @param map values - * @param expandEmpty whether to expand empty aspect names to all aspect names - * @return updated map - */ - protected LinkedHashMap> resolveAspectSpecs( - LinkedHashMap> requestedAspectNames, - @Nonnull T defaultValue, - boolean expandEmpty) { - return requestedAspectNames.entrySet().stream() - .map( - entry -> { - final Urn urn = entry.getKey(); - if (expandEmpty && (entry.getValue().isEmpty() || entry.getValue().containsKey(""))) { - // All aspects specified - Set allNames = - new HashSet<>( - entityRegistry.getEntitySpec(urn.getEntityType()).getAspectSpecs()); - return Map.entry( - urn, - allNames.stream() - .map( - aspectName -> - Map.entry( - aspectName, entry.getValue().getOrDefault("", defaultValue))) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); - } else if (!entry.getValue().keySet().isEmpty()) { - final Map normalizedNames = - entry.getValue().keySet().stream() - .map( - requestAspectName -> - Map.entry( - requestAspectName, lookupAspectSpec(urn, requestAspectName))) - .filter(aspectSpecEntry -> aspectSpecEntry.getValue().isPresent()) - .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().get())); - return Map.entry( - urn, - entry.getValue().entrySet().stream() - .filter(reqEntry -> normalizedNames.containsKey(reqEntry.getKey())) - .map( - reqEntry -> - Map.entry( - normalizedNames.get(reqEntry.getKey()), reqEntry.getValue())) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); - } else { - return (Map.Entry>) null; - } - }) - .filter(Objects::nonNull) - .collect( - Collectors.toMap( - Map.Entry::getKey, - Map.Entry::getValue, - (a, b) -> { - throw new IllegalStateException("Duplicate key"); - }, - LinkedHashMap::new)); - } - protected static LinkedHashMap> aspectSpecsToAspectNames( LinkedHashMap> urnAspectSpecsMap, boolean timeseries) { return urnAspectSpecsMap.entrySet().stream() @@ -859,7 +801,8 @@ protected Map> toAspectMap( } 
protected Optional lookupAspectSpec(Urn urn, String aspectName) { - return lookupAspectSpec(entityRegistry.getEntitySpec(urn.getEntityType()), aspectName); + return RequestInputUtil.lookupAspectSpec( + entityRegistry.getEntitySpec(urn.getEntityType()), aspectName); } protected RecordTemplate toRecordTemplate( @@ -902,23 +845,6 @@ protected ChangeMCP toUpsertItem( aspectRetriever); } - /** - * Case-insensitive fallback - * - * @return - */ - protected static Optional lookupAspectSpec(EntitySpec entitySpec, String aspectName) { - if (entitySpec == null) { - return Optional.empty(); - } - - return entitySpec.getAspectSpec(aspectName) != null - ? Optional.of(entitySpec.getAspectSpec(aspectName)) - : entitySpec.getAspectSpecs().stream() - .filter(aspec -> aspec.getName().toLowerCase().equals(aspectName)) - .findFirst(); - } - protected static Urn validatedUrn(String urn) throws InvalidUrnException { try { return Urn.createFromString(urn); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/v1/TraceController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/v1/TraceController.java new file mode 100644 index 00000000000000..ad93a62695ce05 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/v1/TraceController.java @@ -0,0 +1,149 @@ +package io.datahubproject.openapi.operations.v1; + +import static com.linkedin.metadata.authorization.ApiOperation.READ; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthUtil; +import com.datahub.authorization.AuthorizerChain; +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.systemmetadata.TraceService; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.RequestContext; +import io.datahubproject.openapi.exception.UnauthorizedException; +import io.datahubproject.openapi.util.RequestInputUtil; +import io.datahubproject.openapi.v1.models.TraceRequestV1; +import io.datahubproject.openapi.v1.models.TraceResponseV1; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.media.Content; +import io.swagger.v3.oas.annotations.media.ExampleObject; +import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import lombok.extern.slf4j.Slf4j; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PathVariable; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/v1/trace") +@Slf4j +@Tag(name = "Tracing", description = "An API for tracing async operations.") +public class TraceController { + private final TraceService traceService; + private final AuthorizerChain authorizerChain; + private final OperationContext systemOperationContext; + + public TraceController( + TraceService traceService, + OperationContext systemOperationContext, + AuthorizerChain authorizerChain) { + 
this.traceService = traceService; + this.systemOperationContext = systemOperationContext; + this.authorizerChain = authorizerChain; + } + + @Tag(name = "Async Write Tracing") + @PostMapping(path = "/write/{traceId}", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation( + summary = "Trace an async write to the underlying storage.", + requestBody = + @io.swagger.v3.oas.annotations.parameters.RequestBody( + required = true, + content = + @Content( + mediaType = MediaType.APPLICATION_JSON_VALUE, + examples = { + @ExampleObject( + name = "Default", + value = + """ + { + "urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)": ["status", "datasetProperties"], + "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pet_profiles,PROD)": ["datasetProperties"] + } + """) + }))) + public ResponseEntity getTrace( + HttpServletRequest request, + @PathVariable("traceId") String traceId, + @RequestParam(value = "onlyIncludeErrors", defaultValue = "true") boolean onlyIncludeErrors, + @RequestParam(value = "detailed", defaultValue = "false") boolean detailed, + @RequestParam(value = "skipCache", defaultValue = "false") boolean skipCache, + @RequestBody @Nonnull TraceRequestV1 traceRequestV1) { + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + OperationContext opContext = + OperationContext.asSession( + systemOperationContext, + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getTrace", List.of()), + authorizerChain, + authentication, + true); + + if (!AuthUtil.isAPIAuthorizedEntityUrns(opContext, READ, traceRequestV1.keySet())) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " as least one of the requested URNs."); + } + + LinkedHashMap> normalizedInput = + traceRequestV1.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> + RequestInputUtil.resolveAspectNames( + opContext.getEntityRegistry(), e.getKey(), e.getValue(), true), + (v1, v2) -> v1, + LinkedHashMap::new)); + + return ResponseEntity.ok( + new TraceResponseV1( + traceService.trace( + opContext, + extractTraceId(traceId), + normalizedInput, + onlyIncludeErrors, + detailed, + skipCache))); + } + + private static String extractTraceId(String input) { + if (input == null || input.trim().isEmpty()) { + return null; + } + + // Clean the input + input = input.trim(); + + // Case 1: If it's a full traceparent header (containing hyphens) + if (input.contains("-")) { + String[] parts = input.split("-"); + if (parts.length >= 2) { + // The trace ID is the second part (index 1) + return parts[1]; + } + return null; + } + + // Case 2: If it's just the trace ID (32 hex characters) + if (input.length() == 32 && input.matches("[0-9a-fA-F]+")) { + return input; + } + + return null; + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index ca425810c87a09..72c39c792b355e 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -5,7 +5,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import 
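For completeness, here is a sketch of calling the new endpoint with the JDK HTTP client. The /openapi prefix, host/port, and bearer-token auth are assumptions about a typical deployment, and the trace id shown is illustrative; in practice it comes from the traceparent header returned by the original async write (the second hyphen-delimited field).

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class TraceStatusExample {
      public static void main(String[] args) throws Exception {
        String base = "http://localhost:8080/openapi"; // assumed mount point
        String traceId = "00062c44a004d5da84229c5f06ae92d8"; // illustrative 32-hex trace id
        String body =
            "{\"urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)\":"
                + " [\"status\", \"datasetProperties\"]}";

        HttpRequest request =
            HttpRequest.newBuilder()
                .uri(URI.create(base + "/v1/trace/write/" + traceId
                    + "?onlyIncludeErrors=false&detailed=true"))
                .header("Content-Type", "application/json")
                .header("Authorization", "Bearer <personal-access-token>") // assumed auth scheme
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();

        HttpResponse<String> response =
            HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body()); // per-urn, per-aspect trace status
      }
    }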
com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -480,7 +479,6 @@ public static List> ingestBatchProposal( boolean async) { // TODO: Use the actor present in the IC. - Timer.Context context = MetricUtils.timer("postEntity").time(); final com.linkedin.common.AuditStamp auditStamp = new com.linkedin.common.AuditStamp() .setTime(System.currentTimeMillis()) @@ -519,7 +517,6 @@ public static List> ingestBatchProposal( } else { MetricUtils.counter(MetricRegistry.name("postEntity", "success")).inc(); } - context.stop(); } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/RequestInputUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/RequestInputUtil.java new file mode 100644 index 00000000000000..fe1f217c0d8448 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/RequestInputUtil.java @@ -0,0 +1,136 @@ +package io.datahubproject.openapi.util; + +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.models.AspectSpec; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.registry.EntityRegistry; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; + +public class RequestInputUtil { + private RequestInputUtil() {} + + public static List resolveAspectNames( + EntityRegistry entityRegistry, Urn urn, List inputAspectNames, boolean expandEmpty) { + return resolveAspectSpecs(entityRegistry, urn, inputAspectNames, expandEmpty).stream() + .map(AspectSpec::getName) + .toList(); + } + + /** + * For a given urn and list of aspect names, resolve AspectSpecs + * + * @param entityRegistry + * @param urn + * @param inputAspectNames + * @param expandEmpty if empty return all AspectSpecs + * @return + */ + public static List resolveAspectSpecs( + EntityRegistry entityRegistry, Urn urn, List inputAspectNames, boolean expandEmpty) { + LinkedHashMap intermediateReq = + inputAspectNames.stream() + .map(name -> Map.entry(name, 0L)) + .collect( + Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (existing, replacement) -> existing, + LinkedHashMap::new)); + Map> intermediate = + resolveAspectSpecs( + entityRegistry, new LinkedHashMap<>(Map.of(urn, intermediateReq)), 0L, expandEmpty); + return new ArrayList<>(intermediate.getOrDefault(urn, Map.of()).keySet()); + } + + /** + * Given a map with aspect names from the API, normalized them into actual aspect names (casing + * fixes) + * + * @param requestedAspectNames requested aspects + * @param map values + * @param expandEmpty whether to expand empty aspect names to all aspect names + * @return updated map + */ + public static LinkedHashMap> resolveAspectSpecs( + EntityRegistry entityRegistry, + LinkedHashMap> requestedAspectNames, + @Nonnull T defaultValue, + boolean expandEmpty) { + return requestedAspectNames.entrySet().stream() + .map( + entry -> { + final Urn urn = entry.getKey(); + if (expandEmpty && (entry.getValue().isEmpty() || entry.getValue().containsKey(""))) { + // All aspects specified + Set allNames = + new HashSet<>( + entityRegistry.getEntitySpec(urn.getEntityType()).getAspectSpecs()); + return Map.entry( + urn, + allNames.stream() + .map( + 
aspectName -> + Map.entry( + aspectName, entry.getValue().getOrDefault("", defaultValue))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); + } else if (!entry.getValue().keySet().isEmpty()) { + final Map normalizedNames = + entry.getValue().keySet().stream() + .map( + requestAspectName -> + Map.entry( + requestAspectName, + lookupAspectSpec(entityRegistry, urn, requestAspectName))) + .filter(aspectSpecEntry -> aspectSpecEntry.getValue().isPresent()) + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().get())); + return Map.entry( + urn, + entry.getValue().entrySet().stream() + .filter(reqEntry -> normalizedNames.containsKey(reqEntry.getKey())) + .map( + reqEntry -> + Map.entry( + normalizedNames.get(reqEntry.getKey()), reqEntry.getValue())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); + } else { + return (Map.Entry>) null; + } + }) + .filter(Objects::nonNull) + .collect( + Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (a, b) -> { + throw new IllegalStateException("Duplicate key"); + }, + LinkedHashMap::new)); + } + + private static Optional lookupAspectSpec( + EntityRegistry entityRegistry, Urn urn, String aspectName) { + return lookupAspectSpec(entityRegistry.getEntitySpec(urn.getEntityType()), aspectName); + } + + /** Case-insensitive fallback */ + public static Optional lookupAspectSpec(EntitySpec entitySpec, String aspectName) { + if (entitySpec == null) { + return Optional.empty(); + } + + return entitySpec.getAspectSpec(aspectName) != null + ? Optional.of(entitySpec.getAspectSpec(aspectName)) + : entitySpec.getAspectSpecs().stream() + .filter(aspec -> aspec.getName().toLowerCase().equals(aspectName)) + .findFirst(); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java index 03050868efdcab..030f43152cd283 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/entities/EntitiesController.java @@ -7,7 +7,6 @@ import static com.linkedin.metadata.utils.PegasusUtils.urnToEntityName; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.authorization.AuthUtil; @@ -104,7 +103,7 @@ public ResponseEntity getEntities( @RequestParam(name = "aspectNames", required = false) @Nullable String[] aspectNames) { - Timer.Context context = MetricUtils.timer("getEntities").time(); + final Set entityUrns = Arrays.stream(urns) // Have to decode here because of frontend routing, does No-op for already unencoded @@ -167,7 +166,6 @@ public ResponseEntity getEntities( } else { MetricUtils.counter(MetricRegistry.name("getEntities", "success")).inc(); } - context.stop(); } } @@ -263,7 +261,7 @@ public ResponseEntity> deleteEntities( boolean soft, @RequestParam(required = false, name = "async") Boolean async) { Throwable exceptionally = null; - try (Timer.Context context = MetricUtils.timer("deleteEntities").time()) { + try { Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); diff --git 
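RequestInputUtil centralizes the aspect-name normalization that GenericEntitiesController previously did inline, which is what lets the trace controller reuse it. A short usage sketch follows; the helper class, operation context, and urn are illustrative.

    import com.linkedin.common.urn.UrnUtils;
    import com.linkedin.metadata.models.registry.EntityRegistry;
    import io.datahubproject.metadata.context.OperationContext;
    import io.datahubproject.openapi.util.RequestInputUtil;
    import java.util.List;

    final class AspectNameExample {
      private AspectNameExample() {}

      // Hypothetical helper: normalize caller-supplied aspect names for a dataset urn.
      static List<String> normalizedAspects(OperationContext opContext) {
        EntityRegistry registry = opContext.getEntityRegistry();
        return RequestInputUtil.resolveAspectNames(
            registry,
            UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)"),
            List.of("datasetproperties"), // lower-cased on purpose; the case-insensitive lookup resolves it
            true); // an empty list with expandEmpty=true would expand to every aspect of the entity
      }
    }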
a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/relationships/RelationshipsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/relationships/RelationshipsController.java index b2b7eb557ca32a..2caa4911081c94 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/relationships/RelationshipsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v1/relationships/RelationshipsController.java @@ -5,7 +5,6 @@ import static com.linkedin.metadata.search.utils.QueryUtils.*; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.authorization.AuthUtil; @@ -157,7 +156,7 @@ public ResponseEntity getRelationships( @RequestParam(name = "count", defaultValue = "200") @Nullable Integer count) { - Timer.Context context = MetricUtils.timer("getRelationships").time(); + // Have to decode here because of frontend routing, does No-op for already unencoded through // direct API access final Urn entityUrn = UrnUtils.getUrn(URLDecoder.decode(urn, Charset.forName("UTF-8"))); @@ -201,7 +200,6 @@ public ResponseEntity getRelationships( } else { MetricUtils.counter(MetricRegistry.name("getRelationships", "success")).inc(); } - context.stop(); } } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java index a4583082d57c7f..429e97eb615d86 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java @@ -709,6 +709,17 @@ protected ChangeMCP toUpsertItem( changeType = ChangeType.UPSERT; } + SystemMetadata systemMetadata = null; + if (jsonNode.has("systemMetadata")) { + systemMetadata = + EntityApiUtils.parseSystemMetadata( + objectMapper.writeValueAsString(jsonNode.get("systemMetadata"))); + } + Map headers = null; + if (jsonNode.has("headers")) { + headers = objectMapper.convertValue(jsonNode.get("headers"), new TypeReference<>() {}); + } + return ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectSpec.getName()) @@ -719,6 +730,8 @@ protected ChangeMCP toUpsertItem( ByteString.copyString(aspectJson, StandardCharsets.UTF_8), GenericRecordUtils.JSON, aspectSpec)) + .systemMetadata(systemMetadata) + .headers(headers) .build(aspectRetriever); } } diff --git a/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java b/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java index 8b530b218532d0..a5cff75c0c91a2 100644 --- a/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java +++ b/metadata-service/openapi-servlet/src/test/java/entities/EntitiesControllerTest.java @@ -14,6 +14,7 @@ import com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.AspectDao; +import com.linkedin.metadata.entity.IngestAspectsResult; import com.linkedin.metadata.entity.TransactionContext; import com.linkedin.metadata.entity.UpdateAspectResult; import com.linkedin.metadata.event.EventProducer; @@ -76,7 +77,7 @@ public void setup() 
.thenAnswer( i -> List.of( - ((Function>) i.getArgument(0)) + ((Function) i.getArgument(0)) .apply(TransactionContext.empty(Mockito.mock(Transaction.class), 0)))); EventProducer mockEntityEventProducer = Mockito.mock(EventProducer.class); diff --git a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java index e82ab50a0defeb..7365c9dac00e9d 100644 --- a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java +++ b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java @@ -56,8 +56,10 @@ import com.linkedin.metadata.utils.SearchUtil; import com.linkedin.mxe.GenericAspect; import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.TraceContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.openapi.config.SpringWebConfig; +import io.datahubproject.openapi.config.TracingInterceptor; import io.datahubproject.openapi.exception.InvalidUrnException; import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.servlet.ServletException; @@ -87,6 +89,7 @@ @ComponentScan(basePackages = {"io.datahubproject.openapi.v3.controller"}) @Import({ SpringWebConfig.class, + TracingInterceptor.class, EntityControllerTest.EntityControllerTestConfig.class, EntityVersioningServiceFactory.class }) @@ -398,6 +401,7 @@ public static class EntityControllerTestConfig { @MockBean public EntityServiceImpl entityService; @MockBean public SearchService searchService; @MockBean public TimeseriesAspectService timeseriesAspectService; + @MockBean public TraceContext traceContext; @Bean public ObjectMapper objectMapper() { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index af11532ccf4ece..3a5e907c1c0ea7 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3830,18 +3830,18 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model\r", + "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { "type" : "record", "name" : "MLModelLineageInfo", - "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups", "fields" : [ { "name" : "trainingJobs", "type" : { "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.", "optional" : true, "Relationship" : { "/*" : { @@ -3856,7 +3856,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "doc" : "List of jobs or process instances (if any) that use the model or group.", "optional" : true, "Relationship" : { "/*" : { @@ -3871,7 +3871,7 @@ "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModel\r", + "doc" : "Display name of the MLModel", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -3882,7 +3882,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel\r", + "doc" : "Documentation of the MLModel", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3891,28 +3891,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed\r", + "doc" : "Date when the MLModel was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Audit stamp containing who created this and when\r", + "doc" : "Audit stamp containing who created this and when", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModel was last modified\r", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel\r", + "doc" : "Version of the MLModel", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3928,7 +3928,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", + "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", "optional" : true }, { "name" : "hyperParams", @@ -3963,7 +3963,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel\r", + "doc" : "Hyperparameters of the MLModel", "optional" : true }, { "name" : "trainingMetrics", @@ -3998,7 +3998,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training\r", + "doc" : "Metrics of the MLModel used in training", "optional" : true }, { "name" : "onlineMetrics", @@ -4006,7 +4006,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production\r", + "doc" : "Metrics of the MLModel used in production", "optional" : true }, { "name" : "mlFeatures", @@ -4014,7 +4014,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training\r", + "doc" : "List of features used for MLModel training", "optional" : true, "Relationship" : { "/*" : { @@ -4029,7 +4029,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel\r", + "doc" : "Tags for the MLModel", "default" : [ ] }, { "name" : "deployments", @@ -4037,7 +4037,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel\r", + "doc" : "Deployments for 
the MLModel", "optional" : true, "Relationship" : { "/*" : { @@ -4051,7 +4051,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to\r", + "doc" : "Groups the model belongs to", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index f58d83dd1e5cb7..bde79e4d475cc2 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3988,18 +3988,18 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model\r", + "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { "type" : "record", "name" : "MLModelLineageInfo", - "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups", "fields" : [ { "name" : "trainingJobs", "type" : { "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.", "optional" : true, "Relationship" : { "/*" : { @@ -4014,7 +4014,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "doc" : "List of jobs or process instances (if any) that use the model or group.", "optional" : true, "Relationship" : { "/*" : { @@ -4029,7 +4029,7 @@ "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModel\r", + "doc" : "Display name of the MLModel", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -4040,7 +4040,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel\r", + "doc" : "Documentation of the MLModel", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4049,28 +4049,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed\r", + "doc" : "Date when the MLModel was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Audit stamp containing who created this and when\r", + "doc" : "Audit stamp containing who created this and when", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModel was last modified\r", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel\r", + "doc" : "Version of the MLModel", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as 
whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4086,7 +4086,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", + "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", "optional" : true }, { "name" : "hyperParams", @@ -4121,7 +4121,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel\r", + "doc" : "Hyperparameters of the MLModel", "optional" : true }, { "name" : "trainingMetrics", @@ -4156,7 +4156,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training\r", + "doc" : "Metrics of the MLModel used in training", "optional" : true }, { "name" : "onlineMetrics", @@ -4164,7 +4164,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production\r", + "doc" : "Metrics of the MLModel used in production", "optional" : true }, { "name" : "mlFeatures", @@ -4172,7 +4172,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training\r", + "doc" : "List of features used for MLModel training", "optional" : true, "Relationship" : { "/*" : { @@ -4187,7 +4187,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel\r", + "doc" : "Tags for the MLModel", "default" : [ ] }, { "name" : "deployments", @@ -4195,7 +4195,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel\r", + "doc" : "Deployments for the MLModel", "optional" : true, "Relationship" : { "/*" : { @@ -4209,7 +4209,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to\r", + "doc" : "Groups the model belongs to", "optional" : true, "Relationship" : { "/*" : { @@ -5012,12 +5012,12 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group\r", + "doc" : "Properties associated with an ML Model Group", "include" : [ "com.linkedin.common.CustomProperties", "MLModelLineageInfo" ], "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModelGroup\r", + "doc" : "Display name of the MLModelGroup", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -5028,7 +5028,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup\r", + "doc" : "Documentation of the MLModelGroup", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -5037,23 +5037,23 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed\r", + "doc" : "Date when the MLModelGroup was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Time and Actor who created the MLModelGroup\r", + "doc" : "Time and Actor who created the MLModelGroup", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModelGroup was last modified\r", + "doc" : "Date when the MLModelGroup was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup\r", + "doc" : "Version of the 
MLModelGroup", "optional" : true } ], "Aspect" : { @@ -6291,6 +6291,10 @@ "name" : "aspect", "type" : "com.linkedin.entity.Aspect", "optional" : true + }, { + "name" : "telemetryTraceId", + "type" : "string", + "optional" : true } ] }, { "type" : "record", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index 61c31f93987b88..a252d5c73591d4 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3554,18 +3554,18 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model\r", + "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { "type" : "record", "name" : "MLModelLineageInfo", - "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups", "fields" : [ { "name" : "trainingJobs", "type" : { "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.", "optional" : true, "Relationship" : { "/*" : { @@ -3580,7 +3580,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "doc" : "List of jobs or process instances (if any) that use the model or group.", "optional" : true, "Relationship" : { "/*" : { @@ -3595,7 +3595,7 @@ "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModel\r", + "doc" : "Display name of the MLModel", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -3606,7 +3606,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel\r", + "doc" : "Documentation of the MLModel", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3615,28 +3615,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed\r", + "doc" : "Date when the MLModel was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Audit stamp containing who created this and when\r", + "doc" : "Audit stamp containing who created this and when", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModel was last modified\r", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel\r", + "doc" : "Version of the MLModel", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes 
classifier, Convolutional Neural Network, etc\r", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3652,7 +3652,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", + "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", "optional" : true }, { "name" : "hyperParams", @@ -3687,7 +3687,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel\r", + "doc" : "Hyperparameters of the MLModel", "optional" : true }, { "name" : "trainingMetrics", @@ -3722,7 +3722,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training\r", + "doc" : "Metrics of the MLModel used in training", "optional" : true }, { "name" : "onlineMetrics", @@ -3730,7 +3730,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production\r", + "doc" : "Metrics of the MLModel used in production", "optional" : true }, { "name" : "mlFeatures", @@ -3738,7 +3738,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training\r", + "doc" : "List of features used for MLModel training", "optional" : true, "Relationship" : { "/*" : { @@ -3753,7 +3753,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel\r", + "doc" : "Tags for the MLModel", "default" : [ ] }, { "name" : "deployments", @@ -3761,7 +3761,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel\r", + "doc" : "Deployments for the MLModel", "optional" : true, "Relationship" : { "/*" : { @@ -3775,7 +3775,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to\r", + "doc" : "Groups the model belongs to", "optional" : true, "Relationship" : { "/*" : { @@ -3952,6 +3952,10 @@ "name" : "aspect", "type" : "com.linkedin.entity.Aspect", "optional" : true + }, { + "name" : "telemetryTraceId", + "type" : "string", + "optional" : true } ] }, { "type" : "record", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 75793be7331da4..29d72cd00e9c99 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3548,18 +3548,18 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model\r", + "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { "type" : "record", "name" : "MLModelLineageInfo", - "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups", "fields" : [ { "name" : "trainingJobs", "type" : { "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.", "optional" : true, "Relationship" : { "/*" : { @@ -3574,7 +3574,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "doc" : "List of jobs or process instances (if any) that use the model or group.", "optional" : true, "Relationship" : { "/*" : { @@ -3589,7 +3589,7 @@ "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModel\r", + "doc" : "Display name of the MLModel", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -3600,7 +3600,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel\r", + "doc" : "Documentation of the MLModel", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3609,28 +3609,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed\r", + "doc" : "Date when the MLModel was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Audit stamp containing who created this and when\r", + "doc" : "Audit stamp containing who created this and when", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModel was last modified\r", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel\r", + "doc" : "Version of the MLModel", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3646,7 +3646,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", + "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", "optional" : true }, { "name" : "hyperParams", @@ -3681,7 +3681,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel\r", + "doc" : "Hyperparameters of the MLModel", "optional" : true }, { "name" : "trainingMetrics", @@ -3716,7 +3716,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training\r", + "doc" : "Metrics of the MLModel used in training", "optional" : true }, { "name" : "onlineMetrics", @@ -3724,7 +3724,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production\r", + "doc" : "Metrics of the MLModel used in production", "optional" : true }, { "name" : "mlFeatures", @@ -3732,7 +3732,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training\r", + "doc" : "List of features used for MLModel training", "optional" : true, "Relationship" : { "/*" : { @@ -3747,7 +3747,7 @@ "type" : "array", "items" : "string" }, - "doc" : 
"Tags for the MLModel\r", + "doc" : "Tags for the MLModel", "default" : [ ] }, { "name" : "deployments", @@ -3755,7 +3755,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel\r", + "doc" : "Deployments for the MLModel", "optional" : true, "Relationship" : { "/*" : { @@ -3769,7 +3769,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to\r", + "doc" : "Groups the model belongs to", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 58ba2ad05dfe74..b4ede3617cacfb 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3982,18 +3982,18 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model\r", + "doc" : "Properties associated with a ML Model", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { "type" : "record", "name" : "MLModelLineageInfo", - "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups", "fields" : [ { "name" : "trainingJobs", "type" : { "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.", "optional" : true, "Relationship" : { "/*" : { @@ -4008,7 +4008,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "doc" : "List of jobs or process instances (if any) that use the model or group.", "optional" : true, "Relationship" : { "/*" : { @@ -4023,7 +4023,7 @@ "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModel\r", + "doc" : "Display name of the MLModel", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -4034,7 +4034,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel\r", + "doc" : "Documentation of the MLModel", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4043,28 +4043,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed\r", + "doc" : "Date when the MLModel was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Audit stamp containing who created this and when\r", + "doc" : "Audit stamp containing who created this and when", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModel was last modified\r", + "doc" : "Date when the MLModel was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel\r", + "doc" : "Version of the MLModel", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4080,7 +4080,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", + "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", "optional" : true }, { "name" : "hyperParams", @@ -4115,7 +4115,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel\r", + "doc" : "Hyperparameters of the MLModel", "optional" : true }, { "name" : "trainingMetrics", @@ -4150,7 +4150,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training\r", + "doc" : "Metrics of the MLModel used in training", "optional" : true }, { "name" : "onlineMetrics", @@ -4158,7 +4158,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production\r", + "doc" : "Metrics of the MLModel used in production", "optional" : true }, { "name" : "mlFeatures", @@ -4166,7 +4166,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training\r", + "doc" : "List of features used for MLModel training", "optional" : true, "Relationship" : { "/*" : { @@ -4181,7 +4181,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel\r", + "doc" : "Tags for the MLModel", "default" : [ ] }, { "name" : "deployments", @@ -4189,7 +4189,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel\r", + "doc" : "Deployments for 
the MLModel", "optional" : true, "Relationship" : { "/*" : { @@ -4203,7 +4203,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to\r", + "doc" : "Groups the model belongs to", "optional" : true, "Relationship" : { "/*" : { @@ -5006,12 +5006,12 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group\r", + "doc" : "Properties associated with an ML Model Group", "include" : [ "com.linkedin.common.CustomProperties", "MLModelLineageInfo" ], "fields" : [ { "name" : "name", "type" : "string", - "doc" : "Display name of the MLModelGroup\r", + "doc" : "Display name of the MLModelGroup", "optional" : true, "Searchable" : { "boostScore" : 10.0, @@ -5022,7 +5022,7 @@ }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup\r", + "doc" : "Documentation of the MLModelGroup", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -5031,23 +5031,23 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed\r", + "doc" : "Date when the MLModelGroup was developed", "optional" : true, "deprecated" : true }, { "name" : "created", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Time and Actor who created the MLModelGroup\r", + "doc" : "Time and Actor who created the MLModelGroup", "optional" : true }, { "name" : "lastModified", "type" : "com.linkedin.common.TimeStamp", - "doc" : "Date when the MLModelGroup was last modified\r", + "doc" : "Date when the MLModelGroup was last modified", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup\r", + "doc" : "Version of the MLModelGroup", "optional" : true } ], "Aspect" : { diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 30b187da00e91a..ecb37a8c80bb29 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -51,7 +51,7 @@ import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.time.Clock; @@ -145,7 +145,7 @@ public Task get( throws URISyntaxException { log.info("GET ASPECT urn: {} aspect: {} version: {}", urnStr, aspectName, version); final Urn urn = Urn.createFromString(urnStr); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { Authentication auth = AuthenticationContext.getAuthentication(); @@ -195,7 +195,7 @@ public Task getTimeseriesAspectValues( endTimeMillis, limit); final Urn urn = Urn.createFromString(urnStr); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { Authentication auth = AuthenticationContext.getAuthentication(); @@ -305,7 +305,7 @@ private Task ingestProposals( final AuditStamp auditStamp = new AuditStamp().setTime(_clock.millis()).setActor(Urn.createFromString(actorUrnStr)); - return 
RestliUtils.toTask(() -> { + return RestliUtils.toTask(systemOperationContext, () -> { log.debug("Proposals: {}", metadataChangeProposals); try { final AspectsBatch batch = AspectsBatchImpl.builder() @@ -342,7 +342,7 @@ private Task ingestProposals( public Task getCount( @ActionParam(PARAM_ASPECT) @Nonnull String aspectName, @ActionParam(PARAM_URN_LIKE) @Optional @Nullable String urnLike) { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { Authentication authentication = AuthenticationContext.getAuthentication(); @@ -374,7 +374,7 @@ public Task restoreIndices( @ActionParam("limit") @Optional @Nullable Integer limit, @ActionParam("gePitEpochMs") @Optional @Nullable Long gePitEpochMs, @ActionParam("lePitEpochMs") @Optional @Nullable Long lePitEpochMs) { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { Authentication authentication = AuthenticationContext.getAuthentication(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java index ebbfc6bb6c2983..3539a19ffd4702 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java @@ -36,7 +36,7 @@ import com.linkedin.restli.server.resources.CollectionResourceTaskTemplate; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -108,7 +108,7 @@ public Task rollback( "Both Safe & hardDelete flags were defined, honouring safe flag as hardDelete is deprecated"); } try { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { try { @@ -136,7 +136,7 @@ public Task list( @ActionParam("includeSoft") @Optional @Nullable Boolean includeSoft) { log.info("LIST RUNS offset: {} size: {}", pageOffset, pageSize); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { List summaries = systemMetadataService.listRuns( @@ -160,7 +160,7 @@ public Task describe( @ActionParam("includeAspect") @Optional @Nullable Boolean includeAspect) { log.info("DESCRIBE RUN runId: {}, start: {}, count: {}", runId, start, count); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { Authentication auth = AuthenticationContext.getAuthentication(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index 0c374c29cf958a..d05bf4a4598473 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -84,7 +84,7 @@ import com.linkedin.restli.server.annotations.RestMethod; import com.linkedin.restli.server.resources.CollectionResourceTaskTemplate; import 
com.linkedin.timeseries.DeleteAspectValuesResult; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.time.Clock; import java.util.ArrayList; @@ -207,7 +207,7 @@ public Task get( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity " + urn); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Set projectedAspects = aspectNames == null @@ -248,7 +248,7 @@ public Task> batchGet( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entities: " + urnStrs); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Set projectedAspects = aspectNames == null @@ -298,7 +298,7 @@ public Task ingest( // variables referenced in lambdas are required to be final final SystemMetadata finalSystemMetadata = systemMetadata; - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { entityService.ingestEntity(opContext, entity, auditStamp, finalSystemMetadata); return null; @@ -355,7 +355,7 @@ public Task batchIngest( .map(SystemMetadataUtils::generateSystemMetadataIfEmpty) .collect(Collectors.toList()); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { entityService.ingestEntities(opContext, Arrays.asList(entities), auditStamp, finalSystemMetadataList); @@ -396,7 +396,7 @@ public Task search( log.info("GET SEARCH RESULTS for {} with query {}", entityName, input); // TODO - change it to use _searchService once we are confident on it's latency - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final SearchResult result; // This API is not used by the frontend for search bars so we default to structured @@ -509,7 +509,7 @@ public Task scrollAcrossEntities( input, scrollId); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { ScrollResult result = searchService.scrollAcrossEntities( opContext, @@ -576,7 +576,7 @@ public Task searchAcrossLineage( direction, entityList, input); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> validateLineageSearchResult(opContext, lineageSearchService.searchAcrossLineage( opContext, urn, @@ -639,7 +639,7 @@ public Task scrollAcrossLineage( List sortCriterionList = getSortCriteria(sortCriteria, sortCriterion); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> validateLineageScrollResult(opContext, lineageSearchService.scrollAcrossLineage( @@ -686,7 +686,7 @@ public Task list( final Filter finalFilter = validateAndConvert(filter); log.info("GET LIST RESULTS for {} with filter {}", entityName, finalFilter); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { SearchResult result = entitySearchService.filter(opContext, entityName, finalFilter, sortCriterionList, start, count); if (!AuthUtil.isAPIAuthorizedResult( @@ -725,7 +725,7 @@ public Task autocomplete( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to search."); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { AutoCompleteResult result = entitySearchService.autoComplete(opContext, entityName, query, field, filter, limit); if (!isAPIAuthorizedResult( @@ -763,7 +763,7 @@ public Task browse( } log.info("GET BROWSE RESULTS for {} at path {}", entityName, path); - return RestliUtils.toTask( + return 
RestliUtils.toTask(systemOperationContext, () -> { BrowseResult result = entitySearchService.browse(opContext, entityName, path, filter, start, limit); if (!isAPIAuthorizedResult( @@ -799,7 +799,7 @@ public Task getBrowsePaths( } log.info("GET BROWSE PATHS for {}", urn); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> new StringArray(entitySearchService.getBrowsePaths(opContext, urnToEntityName(urn), urn)), MetricRegistry.name(this.getClass(), "getBrowsePaths")); } @@ -839,7 +839,7 @@ public Task deleteEntities( ComparableVersion finalRegistryVersion = registryVersion; String finalRegistryName1 = registryName; ComparableVersion finalRegistryVersion1 = registryVersion; - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { RollbackResponse response = new RollbackResponse(); List aspectRowsToDelete = @@ -921,7 +921,7 @@ public Task deleteEntity( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to delete entity: " + urnStr); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { // Find the timeseries aspects to delete. If aspectName is null, delete all. List timeseriesAspectNames = @@ -1041,7 +1041,7 @@ public Task deleteReferencesTo( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to delete entity " + urnStr); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> deleteEntityService.deleteReferencesTo(opContext, urn, dryRun), MetricRegistry.name(this.getClass(), "deleteReferences")); } @@ -1137,7 +1137,7 @@ public Task listUrns( } log.info("LIST URNS for {} with start {} and count {}", entityName, start, count); - return RestliUtils.toTask(() -> { + return RestliUtils.toTask(systemOperationContext, () -> { ListUrnsResult result = entityService.listUrns(opContext, entityName, start, count); if (!isAPIAuthorizedEntityUrns( opContext, @@ -1178,7 +1178,7 @@ public Task applyRetention( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to apply retention."); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> entityService.batchApplyRetention(opContext, start, count, attemptWithVersion, aspectName, urn), ACTION_APPLY_RETENTION); } @@ -1208,7 +1208,7 @@ public Task filter( List sortCriterionList = getSortCriteria(sortCriteria, sortCriterion); log.info("FILTER RESULTS for {} with filter {}", entityName, filter); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { SearchResult result = entitySearchService.filter(opContext.withSearchFlags(flags -> flags.setFulltext(true)), entityName, filter, sortCriterionList, start, count); @@ -1245,7 +1245,7 @@ public Task exists(@ActionParam(PARAM_URN) @Nonnull String urnStr, @Act log.info("EXISTS for {}", urnStr); final boolean includeRemoved = includeSoftDelete == null || includeSoftDelete; - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> entityService.exists(opContext, urn, includeRemoved), MetricRegistry.name(this.getClass(), "exists")); } } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 896d81d3cbecc3..6e05ce2ac82768 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ 
b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -27,7 +27,7 @@ import com.linkedin.restli.server.resources.CollectionResourceTaskTemplate; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Arrays; import java.util.Collections; @@ -83,7 +83,7 @@ public Task get( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity " + urn); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final String entityName = urnToEntityName(urn); final Set projectedAspects = @@ -133,7 +133,7 @@ public Task> batchGet( return Task.value(Collections.emptyMap()); } final String entityName = urnToEntityName(urns.iterator().next()); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Set projectedAspects = aspectNames == null diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java index 73b2d1a6c5cb87..1ce250300745fd 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java @@ -29,7 +29,7 @@ import com.linkedin.restli.server.resources.CollectionResourceTaskTemplate; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -96,7 +96,7 @@ public Task> batchGetVersioned( if (versionedUrnStrs.size() <= 0) { return Task.value(Collections.emptyMap()); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Set projectedAspects = aspectNames == null diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java index 16d5868443955a..fe3defe9658ca0 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java @@ -40,7 +40,7 @@ import com.linkedin.restli.server.resources.SimpleResourceTemplate; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Arrays; import java.util.List; @@ -133,7 +133,7 @@ public Task get( } RelationshipDirection direction = RelationshipDirection.valueOf(rawDirection); final List relationshipTypes = Arrays.asList(relationshipTypesParam); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final 
RelatedEntitiesResult relatedEntitiesResult = getRelatedEntities(rawUrn, relationshipTypes, direction, start, count); @@ -210,7 +210,7 @@ public Task getLineage( throw new RestLiServiceException( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to get entity lineage: " + urnStr); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> _graphService.getLineage(systemOperationContext, urn, diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java index 705089baed8f5e..efb20bdee44099 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/OperationsResource.java @@ -33,7 +33,7 @@ import com.linkedin.timeseries.TimeseriesIndicesSizesResult; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.ArrayList; import java.util.List; import javax.annotation.Nonnull; @@ -104,7 +104,7 @@ public Task restoreIndices( @ActionParam("limit") @Optional @Nullable Integer limit, @ActionParam("gePitEpochMs") @Optional @Nullable Long gePitEpochMs, @ActionParam("lePitEpochMs") @Optional @Nullable Long lePitEpochMs) { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> Utils.restoreIndices(systemOperationContext, getContext(), aspectName, urn, urnLike, start, batchSize, limit, gePitEpochMs, lePitEpochMs, _authorizer, _entityService), MetricRegistry.name(this.getClass(), "restoreIndices")); @@ -129,7 +129,7 @@ public Task getTaskStatus( @ActionParam(PARAM_NODE_ID) @Optional String nodeId, @ActionParam(PARAM_TASK_ID) @Optional("0") long taskId, @ActionParam(PARAM_TASK) @Optional String task) { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Authentication auth = AuthenticationContext.getAuthentication(); @@ -192,7 +192,7 @@ public Task getTaskStatus( @Nonnull @WithSpan public Task getIndexSizes() { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Authentication auth = AuthenticationContext.getAuthentication(); @@ -317,7 +317,7 @@ public Task truncateTimeseriesAspect( @ActionParam(PARAM_TIMEOUT_SECONDS) @Optional @Nullable Long timeoutSeconds, @ActionParam(PARAM_FORCE_DELETE_BY_QUERY) @Optional @Nullable Boolean forceDeleteByQuery, @ActionParam(PARAM_FORCE_REINDEX) @Optional @Nullable Boolean forceReindex) { - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> executeTruncateTimeseriesAspect( entityType, diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java index 4fea3b0a1aca68..7054da41173e59 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java @@ -23,7 +23,7 @@ import 
com.linkedin.restli.server.resources.CollectionResourceTaskTemplate; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import javax.annotation.Nonnull; import javax.inject.Inject; import javax.inject.Named; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index a2092405da3ff6..eec83b37e07d96 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -1,13 +1,13 @@ package com.linkedin.metadata.resources.restli; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.linkedin.metadata.dao.throttle.APIThrottleException; import com.linkedin.metadata.restli.NonExceptionHttpErrorResponse; import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; +import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.exception.ActorAccessException; import java.util.Optional; import java.util.function.Supplier; @@ -54,20 +54,20 @@ public static Task toTask(@Nonnull Supplier supplier) { } @Nonnull - public static Task toTask(@Nonnull Supplier supplier, String metricName) { - Timer.Context context = MetricUtils.timer(metricName).time(); - // Stop timer on success and failure - return toTask(supplier) - .transform( - orig -> { - context.stop(); - if (orig.isFailed()) { - MetricUtils.counter(MetricRegistry.name(metricName, "failed")).inc(); - } else { - MetricUtils.counter(MetricRegistry.name(metricName, "success")).inc(); - } - return orig; - }); + public static Task toTask(@Nonnull OperationContext opContext, @Nonnull Supplier supplier, String metricName) { + return opContext.withSpan(metricName, () -> { + // Stop timer on success and failure + return toTask(supplier) + .transform( + orig -> { + if (orig.isFailed()) { + MetricUtils.counter(MetricRegistry.name(metricName, "failed")).inc(); + } else { + MetricUtils.counter(MetricRegistry.name(metricName, "success")).inc(); + } + return orig; + }); + }, MetricUtils.DROPWIZARD_METRIC, "true"); } /** diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java index 426eff20c9c6eb..cc5c1b1059b1f7 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java @@ -44,7 +44,7 @@ import com.linkedin.usage.UserUsageCounts; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.Arrays; import java.util.Map; import java.util.Set; @@ -100,7 +100,7 @@ public class UsageStats extends SimpleResourceTemplate { @WithSpan 
public Task batchIngest(@ActionParam(PARAM_BUCKETS) @Nonnull UsageAggregation[] buckets) { log.info("Ingesting {} usage stats aggregations", buckets.length); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { final Authentication auth = AuthenticationContext.getAuthentication(); @@ -141,7 +141,7 @@ public Task query( log.info( "Querying usage stats for resource: {}, duration: {}, start time: {}, end time: {}, max buckets: {}", resource, duration, startTime, endTime, maxBuckets); - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> { Urn resourceUrn = UrnUtils.getUrn(resource); @@ -186,7 +186,7 @@ public Task queryRange( HttpStatus.S_403_FORBIDDEN, "User is unauthorized to query usage."); } - return RestliUtils.toTask( + return RestliUtils.toTask(systemOperationContext, () -> UsageServiceUtil.queryRange(opContext, _timeseriesAspectService, resource, duration, range), MetricRegistry.name(this.getClass(), "queryRange")); } diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java index 037b5b81fd4df0..265b8a35e840c8 100644 --- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java +++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.entity.IngestAspectsResult; import com.linkedin.metadata.entity.UpdateAspectResult; import com.linkedin.metadata.entity.ebean.batch.ChangeItemImpl; import com.linkedin.metadata.event.EventProducer; @@ -88,7 +89,8 @@ public void testAsyncDefaultAspects() throws URISyntaxException { Actor actor = new Actor(ActorType.USER, "user"); when(mockAuthentication.getActor()).thenReturn(actor); aspectResource.ingestProposal(mcp, "true"); - verify(producer, times(1)).produceMetadataChangeProposal(urn, mcp); + verify(producer, times(1)).produceMetadataChangeProposal(any(OperationContext.class), eq(urn), + argThat(arg -> arg.getMetadataChangeProposal().equals(mcp))); verifyNoMoreInteractions(producer); verifyNoMoreInteractions(aspectDao); @@ -101,42 +103,43 @@ public void testAsyncDefaultAspects() throws URISyntaxException { .auditStamp(new AuditStamp()) .metadataChangeProposal(mcp) .build(opContext.getAspectRetriever()); - when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt())) - .thenReturn( - List.of(List.of( - UpdateAspectResult.builder() - .urn(urn) - .newValue(new DatasetProperties().setName("name1")) - .auditStamp(new AuditStamp()) - .request(req) - .build(), - UpdateAspectResult.builder() - .urn(urn) - .newValue(new DatasetProperties().setName("name2")) - .auditStamp(new AuditStamp()) - .request(req) - .build(), - UpdateAspectResult.builder() - .urn(urn) - .newValue(new DatasetProperties().setName("name3")) - .auditStamp(new AuditStamp()) - .request(req) - .build(), - UpdateAspectResult.builder() - .urn(urn) - .newValue(new DatasetProperties().setName("name4")) - .auditStamp(new AuditStamp()) - .request(req) - .build(), - UpdateAspectResult.builder() - .urn(urn) - .newValue(new DatasetProperties().setName("name5")) - .auditStamp(new AuditStamp()) - .request(req) - .build()))); + 
IngestAspectsResult txResult = IngestAspectsResult.builder() + .updateAspectResults(List.of( + UpdateAspectResult.builder() + .urn(urn) + .newValue(new DatasetProperties().setName("name1")) + .auditStamp(new AuditStamp()) + .request(req) + .build(), + UpdateAspectResult.builder() + .urn(urn) + .newValue(new DatasetProperties().setName("name2")) + .auditStamp(new AuditStamp()) + .request(req) + .build(), + UpdateAspectResult.builder() + .urn(urn) + .newValue(new DatasetProperties().setName("name3")) + .auditStamp(new AuditStamp()) + .request(req) + .build(), + UpdateAspectResult.builder() + .urn(urn) + .newValue(new DatasetProperties().setName("name4")) + .auditStamp(new AuditStamp()) + .request(req) + .build(), + UpdateAspectResult.builder() + .urn(urn) + .newValue(new DatasetProperties().setName("name5")) + .auditStamp(new AuditStamp()) + .request(req) + .build())) + .build(); + when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt())).thenReturn(List.of(txResult)); aspectResource.ingestProposal(mcp, "false"); verify(producer, times(5)) - .produceMetadataChangeLog(eq(urn), any(AspectSpec.class), any(MetadataChangeLog.class)); + .produceMetadataChangeLog(any(OperationContext.class), eq(urn), any(AspectSpec.class), any(MetadataChangeLog.class)); verifyNoMoreInteractions(producer); } @@ -160,7 +163,7 @@ public void testNoValidateAsync() throws URISyntaxException { Actor actor = new Actor(ActorType.USER, "user"); when(mockAuthentication.getActor()).thenReturn(actor); aspectResource.ingestProposal(mcp, "true"); - verify(producer, times(1)).produceMetadataChangeProposal(urn, mcp); + verify(producer, times(1)).produceMetadataChangeProposal(any(OperationContext.class), eq(urn), argThat(arg -> arg.getMetadataChangeProposal().equals(mcp))); verifyNoMoreInteractions(producer); verifyNoMoreInteractions(aspectDao); reset(producer, aspectDao); diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestAspectsResult.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestAspectsResult.java new file mode 100644 index 00000000000000..d9b7091ac44b0f --- /dev/null +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestAspectsResult.java @@ -0,0 +1,62 @@ +package com.linkedin.metadata.entity; + +import com.linkedin.metadata.aspect.batch.ChangeMCP; +import com.linkedin.metadata.aspect.plugins.validation.AspectValidationException; +import com.linkedin.util.Pair; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import lombok.Builder; +import lombok.Value; + +@Builder(toBuilder = true) +@Value +public class IngestAspectsResult { + public static final IngestAspectsResult EMPTY = IngestAspectsResult.builder().build(); + + List updateAspectResults; + List>> failedUpdateAspectResults; + + public static IngestAspectsResult combine(IngestAspectsResult first, IngestAspectsResult second) { + if (first == null) { + return second != null ? 
second : IngestAspectsResult.builder().build(); + } + if (second == null) { + return first; + } + + List combinedResults = + Stream.concat( + first.getUpdateAspectResults().stream(), second.getUpdateAspectResults().stream()) + .collect(Collectors.toList()); + + List>> combinedFailedResults = + Stream.concat( + first.getFailedUpdateAspectResults().stream(), + second.getFailedUpdateAspectResults().stream()) + .collect(Collectors.toList()); + + return IngestAspectsResult.builder() + .updateAspectResults(combinedResults) + .failedUpdateAspectResults(combinedFailedResults) + .build(); + } + + public static class IngestAspectsResultBuilder { + public IngestAspectsResult build() { + if (this.failedUpdateAspectResults == null) { + this.failedUpdateAspectResults = Collections.emptyList(); + } + if (this.updateAspectResults == null) { + this.updateAspectResults = Collections.emptyList(); + } + + return new IngestAspectsResult( + this.updateAspectResults.stream().filter(Objects::nonNull).collect(Collectors.toList()), + this.failedUpdateAspectResults); + } + } +} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestProposalResult.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestProposalResult.java deleted file mode 100644 index 1ef818559faaec..00000000000000 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/IngestProposalResult.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.linkedin.metadata.entity; - -import com.linkedin.common.urn.Urn; -import lombok.Value; - -@Value -public class IngestProposalResult { - Urn urn; - boolean didUpdate; - boolean queued; -} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java index c554f2b919b063..8b0540a6e2d949 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java @@ -5,7 +5,7 @@ import com.linkedin.metadata.recommendation.ranker.RecommendationModuleRanker; import com.linkedin.metadata.utils.ConcurrencyUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.List; import java.util.Map; import java.util.Optional; diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java index c1593088a2dd71..03dbe1af149fd0 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java @@ -17,7 +17,7 @@ import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.utils.QueryUtils; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Collections; import java.util.Comparator; diff 
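For readers of this patch, a minimal sketch (illustrative only, not part of the change) of how the IngestAspectsResult accumulator introduced above is meant to compose per-batch results. The class, its EMPTY constant, combine method, and accessors are exactly as defined in the new file; the example class itself is hypothetical.

import com.linkedin.metadata.entity.IngestAspectsResult;

class IngestAspectsResultExample {
  static void demo() {
    // combine() treats null as absent, and the builder defaults both result
    // lists to empty, so EMPTY is a safe neutral value when folding batches.
    IngestAspectsResult merged =
        IngestAspectsResult.combine(IngestAspectsResult.EMPTY, null);

    // Both lists are present (never null) after build().
    assert merged.getUpdateAspectResults().isEmpty();
    assert merged.getFailedUpdateAspectResults().isEmpty();
  }
}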
--git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java index ca3d43762e0738..0d9b3ced8d6f33 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlySearchedSource.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.recommendation.candidatesource; -import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; import com.linkedin.common.urn.Urn; import com.linkedin.metadata.datahubusage.DataHubUsageEventConstants; @@ -85,21 +84,28 @@ public List getRecommendations( @Nullable Filter filter) { SearchRequest searchRequest = buildSearchRequest(opContext.getSessionActorContext().getActorUrn()); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getRecentlySearched").time()) { - final SearchResponse searchResponse = - _searchClient.search(searchRequest, RequestOptions.DEFAULT); - // extract results - ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); - return parsedTerms.getBuckets().stream() - .map(bucket -> buildContent(bucket.getKeyAsString())) - .filter(Optional::isPresent) - .map(Optional::get) - .limit(MAX_CONTENT) - .collect(Collectors.toList()); - } catch (Exception e) { - log.error("Search query to get most recently viewed entities failed", e); - throw new ESQueryException("Search query failed:", e); - } + + return opContext.withSpan( + "getRecentlySearched", + () -> { + try { + final SearchResponse searchResponse = + _searchClient.search(searchRequest, RequestOptions.DEFAULT); + // extract results + ParsedTerms parsedTerms = searchResponse.getAggregations().get(ENTITY_AGG_NAME); + return parsedTerms.getBuckets().stream() + .map(bucket -> buildContent(bucket.getKeyAsString())) + .filter(Optional::isPresent) + .map(Optional::get) + .limit(MAX_CONTENT) + .collect(Collectors.toList()); + } catch (Exception e) { + log.error("Search query to get most recently viewed entities failed", e); + throw new ESQueryException("Search query failed:", e); + } + }, + MetricUtils.DROPWIZARD_NAME, + MetricUtils.name(this.getClass(), "getRecentlySearched")); } private SearchRequest buildSearchRequest(@Nonnull Urn userUrn) { diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java index ddf203067f455e..5175177906e260 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java @@ -7,7 +7,7 @@ import com.linkedin.metadata.recommendation.RecommendationRenderType; import com.linkedin.metadata.recommendation.RecommendationRequestContext; import io.datahubproject.metadata.context.OperationContext; -import io.opentelemetry.extension.annotations.WithSpan; +import io.opentelemetry.instrumentation.annotations.WithSpan; import java.util.List; import java.util.Optional; import javax.annotation.Nonnull; diff --git 
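The hunk above replaces a Dropwizard Timer.Context block with OperationContext.withSpan, tagging the span with MetricUtils.DROPWIZARD_NAME so the legacy timer metric is still emitted alongside the OpenTelemetry span. Below is a minimal sketch of that calling pattern, assuming the same withSpan overload used elsewhere in this patch; the class and method names are placeholders, not part of the change.

import com.linkedin.metadata.utils.metrics.MetricUtils;
import io.datahubproject.metadata.context.OperationContext;
import java.util.List;

class WithSpanExample {
  // Placeholder operation; only the withSpan call shape mirrors the patch.
  List<String> timedLookup(OperationContext opContext) {
    return opContext.withSpan(
        "timedLookup",
        () -> List.of("result"), // the traced work; its return value is passed through
        MetricUtils.DROPWIZARD_NAME,
        MetricUtils.name(this.getClass(), "timedLookup"));
  }
}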
a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java index 3880ad1d8da119..6d55ab1252b33d 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/SystemMetadataService.java @@ -1,5 +1,6 @@ package com.linkedin.metadata.systemmetadata; +import com.linkedin.common.urn.Urn; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; import com.linkedin.mxe.SystemMetadata; @@ -40,6 +41,9 @@ List findByRegistry( List listRuns( Integer pageOffset, Integer pageSize, boolean includeSoftDeleted); + List findAspectsByUrn( + @Nonnull Urn urn, @Nonnull List aspects, boolean includeSoftDeleted); + default void configure() {} void clear(); diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceService.java new file mode 100644 index 00000000000000..1b6f32da1162fb --- /dev/null +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceService.java @@ -0,0 +1,38 @@ +package com.linkedin.metadata.systemmetadata; + +import com.linkedin.common.urn.Urn; +import io.datahubproject.metadata.context.OperationContext; +import java.util.List; +import java.util.Map; +import javax.annotation.Nonnull; + +public interface TraceService { + + @Nonnull + Map> trace( + @Nonnull OperationContext opContext, + @Nonnull String traceId, + @Nonnull Map> aspects, + boolean onlyIncludeErrors, + boolean detailed, + boolean skipCache); + + @Nonnull + default Map> trace( + @Nonnull OperationContext opContext, + @Nonnull String traceId, + @Nonnull Map> aspects, + boolean onlyIncludeErrors, + boolean detailed) { + return trace(opContext, traceId, aspects, onlyIncludeErrors, detailed, false); + } + + @Nonnull + default Map> traceDetailed( + @Nonnull OperationContext opContext, + @Nonnull String traceId, + @Nonnull Map> aspects, + boolean skipCache) { + return trace(opContext, traceId, aspects, false, true, skipCache); + } +} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStatus.java b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStatus.java new file mode 100644 index 00000000000000..35bdecee459e1e --- /dev/null +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStatus.java @@ -0,0 +1,16 @@ +package com.linkedin.metadata.systemmetadata; + +import com.fasterxml.jackson.annotation.JsonInclude; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; + +@Data +@Builder +@JsonInclude(JsonInclude.Include.NON_NULL) +@AllArgsConstructor +public class TraceStatus { + private boolean success; + private TraceStorageStatus primaryStorage; + private TraceStorageStatus searchStorage; +} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStorageStatus.java b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStorageStatus.java new file mode 100644 index 00000000000000..0def8785a6b823 --- /dev/null +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceStorageStatus.java @@ -0,0 +1,52 @@ +package 
com.linkedin.metadata.systemmetadata; + +import com.fasterxml.jackson.annotation.JsonInclude; +import io.datahubproject.metadata.exception.TraceException; +import java.util.List; +import javax.annotation.Nullable; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; + +@Data +@Builder +@JsonInclude(JsonInclude.Include.NON_NULL) +@AllArgsConstructor +public class TraceStorageStatus { + public static final TraceStorageStatus NO_OP = TraceStorageStatus.ok(TraceWriteStatus.NO_OP); + + public static TraceStorageStatus ok(TraceWriteStatus writeStatus) { + return TraceStorageStatus.builder().writeStatus(writeStatus).build(); + } + + public static TraceStorageStatus ok(TraceWriteStatus writeStatus, @Nullable String message) { + TraceStorageStatus.TraceStorageStatusBuilder builder = + TraceStorageStatus.builder().writeStatus(writeStatus); + if (message != null) { + builder.writeMessage(message); + } + return builder.build(); + } + + public static TraceStorageStatus fail(TraceWriteStatus writeStatus, @Nullable Throwable t) { + TraceStorageStatus.TraceStorageStatusBuilder builder = + TraceStorageStatus.builder().writeStatus(writeStatus); + if (t != null) { + builder.writeExceptions(List.of(new TraceException(t))); + } + return builder.build(); + } + + public static TraceStorageStatus fail(TraceWriteStatus writeStatus, @Nullable String message) { + TraceStorageStatus.TraceStorageStatusBuilder builder = + TraceStorageStatus.builder().writeStatus(writeStatus); + if (message != null) { + builder.writeMessage(message); + } + return builder.build(); + } + + private TraceWriteStatus writeStatus; + private String writeMessage; + @Nullable private List writeExceptions; +} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceWriteStatus.java b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceWriteStatus.java new file mode 100644 index 00000000000000..78bccd6bf1ccf7 --- /dev/null +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/systemmetadata/TraceWriteStatus.java @@ -0,0 +1,18 @@ +package com.linkedin.metadata.systemmetadata; + +public enum TraceWriteStatus { + // error occurred during processing + ERROR, + // write is queued + PENDING, + // write is the active value in the datastore + ACTIVE_STATE, + // write has been overwritten with a newer value. + HISTORIC_STATE, + // write is not required + NO_OP, + // Unknown status due to the fact that tracing is lost or potentially well outside the expected + // tracing range (i.e. 
last year) + UNKNOWN, + TRACE_NOT_IMPLEMENTED +} diff --git a/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java b/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java index 69fb9df2b04dbb..e8ba2d8f14f3d6 100644 --- a/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java +++ b/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java @@ -41,7 +41,9 @@ "com.linkedin.gms.factory.change", "com.datahub.event.hook", "com.linkedin.gms.factory.notifications", - "com.linkedin.gms.factory.telemetry" + "com.linkedin.gms.factory.telemetry", + "com.linkedin.gms.factory.trace", + "com.linkedin.gms.factory.kafka.trace", }) @PropertySource(value = "classpath:/application.yaml", factory = YamlPropertySourceFactory.class) @Configuration diff --git a/metadata-service/war/src/main/java/com/linkedin/gms/factory/config/GMSOpenTelemetryConfig.java b/metadata-service/war/src/main/java/com/linkedin/gms/factory/config/GMSOpenTelemetryConfig.java new file mode 100644 index 00000000000000..b0eab226619709 --- /dev/null +++ b/metadata-service/war/src/main/java/com/linkedin/gms/factory/config/GMSOpenTelemetryConfig.java @@ -0,0 +1,21 @@ +package com.linkedin.gms.factory.config; + +import com.linkedin.gms.factory.system_telemetry.OpenTelemetryBaseFactory; +import io.datahubproject.metadata.context.TraceContext; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class GMSOpenTelemetryConfig extends OpenTelemetryBaseFactory { + + @Override + protected String getApplicationComponent() { + return "datahub-gms"; + } + + @Bean + @Override + protected TraceContext traceContext() { + return super.traceContext(); + } +} diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 4b24eeac50b0b7..410641ef678e58 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -10,6 +10,7 @@ dependencies { implementation externalDependency.commonsLang api externalDependency.dropwizardMetricsCore implementation externalDependency.dropwizardMetricsJmx + implementation externalDependency.opentelemetrySdk api externalDependency.elasticSearchRest implementation externalDependency.httpClient api externalDependency.neo4jJavaDriver diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricSpanExporter.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricSpanExporter.java new file mode 100644 index 00000000000000..df2d164edc3c50 --- /dev/null +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricSpanExporter.java @@ -0,0 +1,58 @@ +package com.linkedin.metadata.utils.metrics; + +import static com.linkedin.metadata.utils.metrics.MetricUtils.DROPWIZARD_METRIC; +import static com.linkedin.metadata.utils.metrics.MetricUtils.DROPWIZARD_NAME; + +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.Timer; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.sdk.common.CompletableResultCode; +import io.opentelemetry.sdk.trace.data.SpanData; +import io.opentelemetry.sdk.trace.export.SpanExporter; +import java.util.Collection; +import java.util.concurrent.TimeUnit; + +/** Created to forward opentelemetry spans to dropwizard for backwards compatibility */ +public class MetricSpanExporter implements SpanExporter { + private static final AttributeKey DROPWIZARD_ATTR_KEY = + AttributeKey.stringKey(DROPWIZARD_METRIC); + 
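+ // Carries an explicit Dropwizard timer name supplied by callers (e.g. via MetricUtils.name); when absent, recordSpanMetric falls back to the span name.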
private static final AttributeKey DROPWIZARD_NAME_ATTR_KEY = + AttributeKey.stringKey(DROPWIZARD_NAME); + + @Override + public CompletableResultCode export(Collection spans) { + spans.stream().filter(this::shouldRecordMetric).forEach(this::recordSpanMetric); + + return CompletableResultCode.ofSuccess(); + } + + private boolean shouldRecordMetric(SpanData span) { + // Check for the recordMetric attribute + return Boolean.parseBoolean(span.getAttributes().get(DROPWIZARD_ATTR_KEY)) + || span.getAttributes().get(DROPWIZARD_NAME_ATTR_KEY) != null; + } + + private void recordSpanMetric(SpanData span) { + // Calculate duration in nanoseconds + long durationNanos = span.getEndEpochNanos() - span.getStartEpochNanos(); + String dropWizardName = span.getAttributes().get(DROPWIZARD_NAME_ATTR_KEY); + String dropWizardMetricName = + dropWizardName == null + ? MetricRegistry.name(span.getName()) + : MetricRegistry.name(dropWizardName); + + // Update timer with the span duration + Timer timer = MetricUtils.get().timer(dropWizardMetricName); + timer.update(durationNanos, TimeUnit.NANOSECONDS); + } + + @Override + public CompletableResultCode flush() { + return CompletableResultCode.ofSuccess(); + } + + @Override + public CompletableResultCode shutdown() { + return CompletableResultCode.ofSuccess(); + } +} diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java index 3a47c11f8d7489..963015c111f600 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java @@ -4,10 +4,21 @@ import com.codahale.metrics.Gauge; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.SharedMetricRegistries; -import com.codahale.metrics.Timer; import com.codahale.metrics.jmx.JmxReporter; public class MetricUtils { + public static final String DROPWIZARD_METRIC = "dwizMetric"; + public static final String DROPWIZARD_NAME = "dwizName"; + public static final String CACHE_HIT_ATTR = "cache.hit"; + public static final String BATCH_SIZE_ATTR = "batch.size"; + public static final String QUEUE_ENQUEUED_AT_ATTR = "queue.enqueued_at"; + public static final String QUEUE_DURATION_MS_ATTR = "queue.duration_ms"; + public static final String MESSAGING_SYSTEM = "messaging.system"; + public static final String MESSAGING_DESTINATION = "messaging.destination"; + public static final String MESSAGING_DESTINATION_KIND = "messaging.destination_kind"; + public static final String MESSAGING_OPERATION = "messaging.operation"; + public static final String ERROR_TYPE = "error.type"; + private MetricUtils() {} public static final String DELIMITER = "_"; @@ -41,12 +52,12 @@ public static Counter counter(String metricName) { return REGISTRY.counter(MetricRegistry.name(metricName)); } - public static Timer timer(Class klass, String metricName) { - return REGISTRY.timer(MetricRegistry.name(klass, metricName)); + public static String name(String name, String... names) { + return MetricRegistry.name(name, names); } - public static Timer timer(String metricName) { - return REGISTRY.timer(MetricRegistry.name(metricName)); + public static String name(Class clazz, String... 
names) { + return MetricRegistry.name(clazz.getName(), names); } public static > T gauge( diff --git a/smoke-test/requirements.txt b/smoke-test/requirements.txt index fadc3dbec1f2b5..f1fbdac68f067a 100644 --- a/smoke-test/requirements.txt +++ b/smoke-test/requirements.txt @@ -17,4 +17,5 @@ types-PyYAML # https://github.com/docker/docker-py/issues/3256 requests<=2.31.0 # Missing numpy requirement in 8.0.0 -deepdiff!=8.0.0 \ No newline at end of file +deepdiff!=8.0.0 +opensearch-py==2.6.0 \ No newline at end of file diff --git a/smoke-test/tests/trace/__init__.py b/smoke-test/tests/trace/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/smoke-test/tests/trace/test_api_trace.py b/smoke-test/tests/trace/test_api_trace.py new file mode 100644 index 00000000000000..4f8671decf8ae6 --- /dev/null +++ b/smoke-test/tests/trace/test_api_trace.py @@ -0,0 +1,486 @@ +import time + +import pytest +from opensearchpy import OpenSearch + +from tests.utils import delete_urns, wait_for_writes_to_sync + +es = OpenSearch(["http://localhost:9200"]) + + +generated_urns = { + "apiTraceHappyPath": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceHappyPath,PROD)", + "apiTraceMCPFail": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceMCPFail,PROD)", + "apiTraceDroppedElasticsearch": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceDroppedElasticsearch,PROD)", + "apiTraceOverwritten": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceOverwritten,PROD)", + "apiTraceTimeseries": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceTimeseries,PROD)", + "apiTraceNoop": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceNoop,PROD)", + "apiTraceNoopWithFMCP": "urn:li:dataset:(urn:li:dataPlatform:test,apiTraceNoopWithFMCP,PROD)", +} + + +@pytest.fixture(scope="module", autouse=True) +def test_setup(graph_client): + """Fixture to clean-up urns before and after a test is run""" + print("removing previous test data") + delete_urns(graph_client, list(generated_urns.values())) + wait_for_writes_to_sync() + yield + print("removing generated test data") + delete_urns(graph_client, list(generated_urns.values())) + wait_for_writes_to_sync() + + +def test_successful_async_write(auth_session): + urn = generated_urns["apiTraceHappyPath"] + aspect_name = "status" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[{"urn": urn, aspect_name: {"value": {"removed": False}}}], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "ACTIVE_STATE"}, + } + } + } + + +def test_mcp_fail_aspect_async_write(auth_session): + urn = generated_urns["apiTraceMCPFail"] + aspect_name = "glossaryTerms" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset/{urn}/{aspect_name}", + params={"async": "true", "systemMetadata": "true"}, + json={ + "value": { + "terms": [{"urn": "urn:li:glossaryTerm:someTerm"}], + "auditStamp": {"time": 0, "actor": "urn:li:corpuser:datahub"}, + }, + "headers": {"If-Version-Match": "-10000"}, + }, + ) + + trace_id = 
compare_trace_header_system_metadata( + resp, resp.json()[aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true", "skipCache": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json()[urn][aspect_name]["success"] is False + assert resp.json()[urn][aspect_name]["primaryStorage"]["writeStatus"] == "ERROR" + assert ( + resp.json()[urn][aspect_name]["primaryStorage"]["writeExceptions"][0]["message"] + == "Expected version -10000, actual version -1" + ) + assert resp.json()[urn][aspect_name]["searchStorage"] == { + "writeStatus": "ERROR", + "writeMessage": "Primary storage write failed.", + } + + +def test_overwritten_async_write(auth_session): + urn = generated_urns["apiTraceOverwritten"] + aspect_name = "datasetProperties" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[ + { + "urn": urn, + aspect_name: { + "value": {"name": "original", "customProperties": {}, "tags": []} + }, + } + ], + ) + + original_trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{original_trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "ACTIVE_STATE"}, + } + } + } + + # Perform 2nd write + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[ + { + "urn": urn, + aspect_name: { + "value": {"name": "updated", "customProperties": {}, "tags": []} + }, + } + ], + ) + + second_trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{second_trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true", "skipCache": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "ACTIVE_STATE"}, + } + } + } + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{original_trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "HISTORIC_STATE"}, + "searchStorage": {"writeStatus": "HISTORIC_STATE"}, + } + } + } + + +def test_missing_elasticsearch_async_write(auth_session): + urn = generated_urns["apiTraceDroppedElasticsearch"] + aspect_name = "status" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[{"urn": urn, aspect_name: {"value": {"removed": False}}}], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + 
params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "ACTIVE_STATE"}, + } + } + } + + # Simulate overwrite + delete_elasticsearch_trace(trace_id) + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true", "skipCache": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "HISTORIC_STATE"}, + } + } + } + + # Simulate dropped write + delete_elasticsearch_system_metadata(urn) + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true", "skipCache": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": False, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": { + "writeStatus": "ERROR", + "writeMessage": "Consumer has processed past the offset.", + }, + } + } + } + + +def test_timeseries_async_write(auth_session): + urn = generated_urns["apiTraceTimeseries"] + aspect_name = "datasetProfile" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[ + { + "urn": urn, + aspect_name: { + "value": { + "timestampMillis": time.time_ns() // 1_000_000, + "messageId": "test timeseries", + "rowCount": 1, + } + }, + } + ], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "NO_OP"}, + "searchStorage": {"writeStatus": "TRACE_NOT_IMPLEMENTED"}, + } + } + } + + +def test_noop_async_write(auth_session): + urn = generated_urns["apiTraceNoop"] + aspect_name = "status" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[{"urn": urn, aspect_name: {"value": {"removed": False}}}], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "ACTIVE_STATE"}, + } + } + } + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[{"urn": urn, aspect_name: {"value": {"removed": False}}}], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + 
params={"onlyIncludeErrors": "false", "detailed": "true", "skipCache": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "NO_OP"}, + "searchStorage": {"writeStatus": "NO_OP"}, + } + } + } + + +def test_noop_with_fmcp_async_write(auth_session): + urn = generated_urns["apiTraceNoopWithFMCP"] + aspect_name = "status" + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[{"urn": urn, aspect_name: {"value": {"removed": False}}}], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json() == { + urn: { + aspect_name: { + "success": True, + "primaryStorage": {"writeStatus": "ACTIVE_STATE"}, + "searchStorage": {"writeStatus": "ACTIVE_STATE"}, + } + } + } + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v3/entity/dataset", + params={"async": "true", "systemMetadata": "true"}, + json=[ + { + "urn": urn, + aspect_name: { + "value": {"removed": False}, + "headers": {"If-Version-Match": "-10000"}, + }, + } + ], + ) + + trace_id = compare_trace_header_system_metadata( + resp, resp.json()[0][aspect_name]["systemMetadata"] + ) + wait_for_writes_to_sync() + + resp = auth_session.post( + f"{auth_session.gms_url()}/openapi/v1/trace/write/{trace_id}", + params={"onlyIncludeErrors": "false", "detailed": "true", "skipCache": "true"}, + json={urn: [aspect_name]}, + ) + assert resp.json()[urn][aspect_name]["success"] is False + assert resp.json()[urn][aspect_name]["primaryStorage"]["writeStatus"] == "ERROR" + assert ( + resp.json()[urn][aspect_name]["primaryStorage"]["writeExceptions"][0]["message"] + == "Expected version -10000, actual version 1" + ) + assert resp.json()[urn][aspect_name]["searchStorage"] == { + "writeStatus": "ERROR", + "writeMessage": "Primary storage write failed.", + } + + +def compare_trace_header_system_metadata(resp, system_metadata): + header_trace_id = extract_trace_header(resp) + system_metadata_trace_id = extract_trace_system_metadata(system_metadata) + assert header_trace_id.startswith("00-" + system_metadata_trace_id) + return system_metadata_trace_id + + +def extract_trace_header(resp): + assert "traceparent" in resp.headers + return resp.headers["traceparent"] + + +def extract_trace_system_metadata(system_metadata): + assert "properties" in system_metadata + assert "telemetryTraceId" in system_metadata["properties"] + return system_metadata["properties"]["telemetryTraceId"] + + +def delete_elasticsearch_trace(trace_id, timeout=10, refresh_interval=1): + field_name = "telemetryTraceId" + index_name = "system_metadata_service_v1" + + update_body = { + "query": {"term": {field_name: trace_id}}, + "script": {"source": f"ctx._source.remove('{field_name}')"}, + } + + response = es.update_by_query( + index=index_name, + body=update_body, + conflicts="proceed", + timeout=timeout, + wait_for_completion=True, + ) + + if response.get("failures"): + raise Exception( + f"Update by query operation had failures: {response['failures']}" + ) + + time.sleep(refresh_interval) + + +def delete_elasticsearch_system_metadata(urn, timeout=10, refresh_interval=1): + index_name = "system_metadata_service_v1" + 
+ update_body = {"query": {"term": {"urn": urn}}} + + response = es.delete_by_query( + index=index_name, + body=update_body, + conflicts="proceed", + timeout=timeout, + wait_for_completion=True, + ) + + if response.get("failures"): + raise Exception( + f"Update by query operation had failures: {response['failures']}" + ) + + time.sleep(refresh_interval)