From d8d5ecd22f3f2f8a1a5f8aece133627146a3bbf9 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Wed, 18 Oct 2023 12:05:58 -0500 Subject: [PATCH] feat(kafka): increase kafka message size and enable compression --- build.gradle | 2 +- .../app/client/KafkaTrackingProducer.java | 10 ++++--- .../app/config/ConfigurationProvider.java | 6 ++++- docs/deploy/environment-vars.md | 26 +++++++++---------- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/build.gradle b/build.gradle index cf55a59cfe694..bd282535fa13c 100644 --- a/build.gradle +++ b/build.gradle @@ -39,7 +39,7 @@ buildscript { plugins { id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2' id 'com.github.johnrengelman.shadow' version '6.1.0' - id 'com.palantir.docker' version '0.35.0' + id 'com.palantir.docker' version '0.35.0' apply false // https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/ // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0" } diff --git a/datahub-frontend/app/client/KafkaTrackingProducer.java b/datahub-frontend/app/client/KafkaTrackingProducer.java index fab17f9215d4a..59e91a6d5a0f7 100644 --- a/datahub-frontend/app/client/KafkaTrackingProducer.java +++ b/datahub-frontend/app/client/KafkaTrackingProducer.java @@ -1,6 +1,8 @@ package client; +import com.linkedin.metadata.config.kafka.ProducerConfiguration; import com.typesafe.config.Config; +import config.ConfigurationProvider; import org.apache.kafka.clients.CommonClientConfigs; import org.apache.kafka.clients.producer.KafkaProducer; import org.apache.kafka.clients.producer.ProducerConfig; @@ -35,12 +37,12 @@ public class KafkaTrackingProducer { private final KafkaProducer _producer; @Inject - public KafkaTrackingProducer(@Nonnull Config config, ApplicationLifecycle lifecycle) { + public KafkaTrackingProducer(@Nonnull Config config, ApplicationLifecycle lifecycle, final ConfigurationProvider configurationProvider) { _isEnabled = !config.hasPath("analytics.enabled") || config.getBoolean("analytics.enabled"); if (_isEnabled) { _logger.debug("Analytics tracking is enabled"); - _producer = createKafkaProducer(config); + _producer = createKafkaProducer(config, configurationProvider.getKafka().getProducer()); lifecycle.addStopHook( () -> { @@ -62,13 +64,15 @@ public void send(ProducerRecord record) { _producer.send(record); } - private static KafkaProducer createKafkaProducer(Config config) { + private static KafkaProducer createKafkaProducer(Config config, ProducerConfiguration producerConfiguration) { final Properties props = new Properties(); props.put(ProducerConfig.CLIENT_ID_CONFIG, "datahub-frontend"); props.put(ProducerConfig.DELIVERY_TIMEOUT_MS_CONFIG, config.getString("analytics.kafka.delivery.timeout.ms")); props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("analytics.kafka.bootstrap.server")); props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); // Actor urn. props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); // JSON object. + props.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, producerConfiguration.getMaxRequestSize()); + props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, producerConfiguration.getCompressionType()); final String securityProtocolConfig = "analytics.kafka.security.protocol"; if (config.hasPath(securityProtocolConfig) diff --git a/datahub-frontend/app/config/ConfigurationProvider.java b/datahub-frontend/app/config/ConfigurationProvider.java index 00a5472ec3476..8f526c831b5c9 100644 --- a/datahub-frontend/app/config/ConfigurationProvider.java +++ b/datahub-frontend/app/config/ConfigurationProvider.java @@ -1,6 +1,7 @@ package config; import com.linkedin.metadata.config.cache.CacheConfiguration; +import com.linkedin.metadata.config.kafka.KafkaConfiguration; import com.linkedin.metadata.spring.YamlPropertySourceFactory; import lombok.Data; @@ -11,7 +12,6 @@ /** * Minimal sharing between metadata-service and frontend - * Initially for use of client caching configuration. * Does not use the factories module to avoid transitive dependencies. */ @EnableConfigurationProperties @@ -19,6 +19,10 @@ @ConfigurationProperties @Data public class ConfigurationProvider { + /** + * Kafka related configs. + */ + private KafkaConfiguration kafka; /** * Configuration for caching diff --git a/docs/deploy/environment-vars.md b/docs/deploy/environment-vars.md index 7c55e57adacfc..4c7b249349ca0 100644 --- a/docs/deploy/environment-vars.md +++ b/docs/deploy/environment-vars.md @@ -67,19 +67,19 @@ In general, there are **lots** of Kafka configuration environment variables for These environment variables follow the standard Spring representation of properties as environment variables. Simply replace the dot, `.`, with an underscore, `_`, and convert to uppercase. -| Variable | Default | Unit/Type | Components | Description | -|-----------------------------------------------------|----------------------------------------------|-----------|------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `KAFKA_LISTENER_CONCURRENCY` | 1 | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Number of Kafka consumer threads. Optimize throughput by matching to topic partitions. | -| `SPRING_KAFKA_PRODUCER_PROPERTIES_MAX_REQUEST_SIZE` | 1048576 | bytes | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. Note that the topic configuration is not controlled by this variable. | -| `SCHEMA_REGISTRY_TYPE` | `INTERNAL` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry implementation. One of `INTERNAL` or `KAFKA` or `AWS_GLUE` | -| `KAFKA_SCHEMAREGISTRY_URL` | `http://localhost:8080/schema-registry/api/` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry url. Used for `INTERNAL` and `KAFKA`. The default value is for the `GMS` component. The `MCE Consumer` and `MAE Consumer` should be the `GMS` hostname and port. | -| `AWS_GLUE_SCHEMA_REGISTRY_REGION` | `us-east-1` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry implementation. | -| `AWS_GLUE_SCHEMA_REGISTRY_NAME` | `` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry. | -| `USE_CONFLUENT_SCHEMA_REGISTRY` | `true` | boolean | [`kafka-setup`] | Enable Confluent schema registry configuration. | -| `KAFKA_PRODUCER_MAX_REQUEST_SIZE` | `5242880` | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. Note that the topic configuration is not controlled by this variable. | -| `KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES` | `5242880` | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | The maximum amount of data per-partition the server will return. Records are fetched in batches by the consumer. If the first record batch in the first non-empty partition of the fetch is larger than this limit, the batch will still be returned to ensure that the consumer can make progress. | -| `MAX_MESSAGE_BYTES` | `5242880` | integer | [`kafka-setup`] | Sets the max message size on the kakfa topics. | -| `KAFKA_PRODUCER_COMPRESSION_TYPE` | `snappy` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | The compression used by the producer. | +| Variable | Default | Unit/Type | Components | Description | +|-----------------------------------------------------|----------------------------------------------|-----------|--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `KAFKA_LISTENER_CONCURRENCY` | 1 | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Number of Kafka consumer threads. Optimize throughput by matching to topic partitions. | +| `SPRING_KAFKA_PRODUCER_PROPERTIES_MAX_REQUEST_SIZE` | 1048576 | bytes | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. Note that the topic configuration is not controlled by this variable. | +| `SCHEMA_REGISTRY_TYPE` | `INTERNAL` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry implementation. One of `INTERNAL` or `KAFKA` or `AWS_GLUE` | +| `KAFKA_SCHEMAREGISTRY_URL` | `http://localhost:8080/schema-registry/api/` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry url. Used for `INTERNAL` and `KAFKA`. The default value is for the `GMS` component. The `MCE Consumer` and `MAE Consumer` should be the `GMS` hostname and port. | +| `AWS_GLUE_SCHEMA_REGISTRY_REGION` | `us-east-1` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry implementation. | +| `AWS_GLUE_SCHEMA_REGISTRY_NAME` | `` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry. | +| `USE_CONFLUENT_SCHEMA_REGISTRY` | `true` | boolean | [`kafka-setup`] | Enable Confluent schema registry configuration. | +| `KAFKA_PRODUCER_MAX_REQUEST_SIZE` | `5242880` | integer | [`Frontend`, `GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. Note that the topic configuration is not controlled by this variable. | +| `KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES` | `5242880` | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | The maximum amount of data per-partition the server will return. Records are fetched in batches by the consumer. If the first record batch in the first non-empty partition of the fetch is larger than this limit, the batch will still be returned to ensure that the consumer can make progress. | +| `MAX_MESSAGE_BYTES` | `5242880` | integer | [`kafka-setup`] | Sets the max message size on the kakfa topics. | +| `KAFKA_PRODUCER_COMPRESSION_TYPE` | `snappy` | string | [`Frontend`, `GMS`, `MCE Consumer`, `MAE Consumer`] | The compression used by the producer. | ## Frontend