diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env index cc0dd6b4278b56..37b7ba1797af5b 100644 --- a/docker/datahub-gms/env/docker-without-neo4j.env +++ b/docker/datahub-gms/env/docker-without-neo4j.env @@ -23,6 +23,8 @@ PE_CONSUMER_ENABLED=true UI_INGESTION_ENABLED=true ENTITY_SERVICE_ENABLE_RETENTION=true +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-gms/env/docker.env b/docker/datahub-gms/env/docker.env index 59fc4bdde02ff4..0ecaa32c4cb123 100644 --- a/docker/datahub-gms/env/docker.env +++ b/docker/datahub-gms/env/docker.env @@ -27,6 +27,8 @@ MCE_CONSUMER_ENABLED=true PE_CONSUMER_ENABLED=true UI_INGESTION_ENABLED=true +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to enable Metadata Service Authentication METADATA_SERVICE_AUTH_ENABLED=false diff --git a/docker/datahub-mae-consumer/env/docker-without-neo4j.env b/docker/datahub-mae-consumer/env/docker-without-neo4j.env index b6899f7e6d63b2..6a82f235b29711 100644 --- a/docker/datahub-mae-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mae-consumer/env/docker-without-neo4j.env @@ -13,6 +13,8 @@ ES_BULK_REFRESH_POLICY=WAIT_UNTIL GRAPH_SERVICE_IMPL=elasticsearch ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-mae-consumer/env/docker.env b/docker/datahub-mae-consumer/env/docker.env index 5a6daa6eaeaed7..1f0ee4b05b3820 100644 --- a/docker/datahub-mae-consumer/env/docker.env +++ b/docker/datahub-mae-consumer/env/docker.env @@ -17,6 +17,8 @@ NEO4J_PASSWORD=datahub GRAPH_SERVICE_IMPL=neo4j ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-mce-consumer/env/docker-without-neo4j.env b/docker/datahub-mce-consumer/env/docker-without-neo4j.env index e7be7d8ed4ddc5..b0edfc0a75b669 100644 --- a/docker/datahub-mce-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mce-consumer/env/docker-without-neo4j.env @@ -24,6 +24,8 @@ MAE_CONSUMER_ENABLED=false PE_CONSUMER_ENABLED=false UI_INGESTION_ENABLED=false +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment # METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 diff --git a/docker/datahub-mce-consumer/env/docker.env b/docker/datahub-mce-consumer/env/docker.env index 8618f3f5f7af7a..c0f85ef667546e 100644 --- a/docker/datahub-mce-consumer/env/docker.env +++ b/docker/datahub-mce-consumer/env/docker.env @@ -24,6 +24,8 @@ MAE_CONSUMER_ENABLED=false PE_CONSUMER_ENABLED=false UI_INGESTION_ENABLED=false +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment # METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 834d55096468f6..a0f60d23710a07 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 47fb50f78e4f0c..11e33a9950ba9b 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 3fa13a9e56b421..2efa8959834183 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index a4211acedcf102..4f47a3da24eb1b 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -19,6 +19,7 @@ services: - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - GRAPH_SERVICE_IMPL=elasticsearch - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml + - ELASTIC_ID_HASH_ALGO=MD5 hostname: datahub-mae-consumer image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: @@ -37,6 +38,7 @@ services: - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index e7571e4baf8b4e..7dd7388b939884 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -26,6 +26,7 @@ services: - NEO4J_PASSWORD=datahub - GRAPH_SERVICE_IMPL=neo4j - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml + - ELASTIC_ID_HASH_ALGO=MD5 hostname: datahub-mae-consumer image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: @@ -47,6 +48,7 @@ services: - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index c63b6d1d61b030..f42ed1f40c2467 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java index ca5e6d01103848..3de09e599d99ed 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java @@ -72,8 +72,9 @@ public String toDocId() { } try { + String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); byte[] bytesOfRawDocID = rawDocId.toString().getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance("MD5"); + MessageDigest md = MessageDigest.getInstance(hashAlgo); byte[] thedigest = md.digest(bytesOfRawDocID); return Base64.getEncoder().encodeToString(thedigest); } catch (NoSuchAlgorithmException e) { diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ff29cb5fff47d2..17d9cb8cd14fee 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -130,6 +130,7 @@ test { // override, testng controlling parallelization // increasing >1 will merely run all tests extra times maxParallelForks = 1 + environment "ELASTIC_ID_HASH_ALGO", "MD5" } useTestNG() { suites 'src/test/resources/testng.xml' diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index 57002a3bfc59d5..cdfc4e985293f6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -86,10 +86,10 @@ private String toDocument(SystemMetadata systemMetadata, String urn, String aspe private String toDocId(@Nonnull final String urn, @Nonnull final String aspect) { String rawDocId = urn + DOC_DELIMETER + aspect; - + String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); try { byte[] bytesOfRawDocID = rawDocId.getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance("MD5"); + MessageDigest md = MessageDigest.getInstance(hashAlgo); byte[] thedigest = md.digest(bytesOfRawDocID); return Base64.getEncoder().encodeToString(thedigest); } catch (NoSuchAlgorithmException e) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java index c0f66acaaca5af..cf0a3f1466d254 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java @@ -257,7 +257,9 @@ private static Pair getTimeseriesFieldCollectionDocument( finalDocument); } - private static String getDocId(@Nonnull JsonNode document, String collectionId) { + private static String getDocId(@Nonnull JsonNode document, String collectionId) + throws IllegalArgumentException { + String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); String docId = document.get(MappingsBuilder.TIMESTAMP_MILLIS_FIELD).toString(); JsonNode eventGranularity = document.get(MappingsBuilder.EVENT_GRANULARITY); if (eventGranularity != null) { @@ -276,6 +278,11 @@ private static String getDocId(@Nonnull JsonNode document, String collectionId) docId += partitionSpec.toString(); } - return DigestUtils.md5Hex(docId); + if (hashAlgo.equalsIgnoreCase("SHA-256")) { + return DigestUtils.sha256Hex(docId); + } else if (hashAlgo.equalsIgnoreCase("MD5")) { + return DigestUtils.md5Hex(docId); + } + throw new IllegalArgumentException("Hash function not handled !"); } } diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 921c2b43dc36c6..2514060ff2d617 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -174,6 +174,7 @@ elasticsearch: opensearchUseAwsIamAuth: ${OPENSEARCH_USE_AWS_IAM_AUTH:false} region: ${AWS_REGION:#{null}} implementation: ${ELASTICSEARCH_IMPLEMENTATION:elasticsearch} # elasticsearch or opensearch, for handling divergent cases + idHashAlgo: ${ELASTIC_ID_HASH_ALGO:MD5} sslContext: # Required if useSSL is true protocol: ${ELASTICSEARCH_SSL_PROTOCOL:#{null}} secureRandomImplementation: ${ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL:#{null}} diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index 9800cf65fc4529..95f3ba8ed56d64 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -89,6 +89,7 @@ task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'no_cypress_suite0' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -101,6 +102,7 @@ task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'no_cypress_suite1' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -113,6 +115,7 @@ task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:inst environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'cypress_suite1' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -125,6 +128,7 @@ task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'cypress_rest' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -139,6 +143,7 @@ task cypressDev(type: Exec, dependsOn: [installDev, ':metadata-ingestion:install environment 'RUN_QUICKSTART', 'false' environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -154,6 +159,7 @@ task cypressData(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'RUN_UI', 'false' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index 2b31c574d05787..59346b26069059 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -12,6 +12,7 @@ source venv/bin/activate export KAFKA_BROKER_CONTAINER="datahub-kafka-broker-1" export KAFKA_BOOTSTRAP_SERVER="broker:9092" +export ELASTIC_ID_HASH_ALGO="MD5" python -c 'from tests.cypress.integration_test import ingest_data; ingest_data()' cd tests/cypress diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 05c321566d54a6..1923d42eb5e939 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,6 +15,7 @@ DATAHUB_SEARCH_TAG="${DATAHUB_SEARCH_TAG:=2.9.0}" XPACK_SECURITY_ENABLED="${XPACK_SECURITY_ENABLED:=plugins.security.disabled=true}" ELASTICSEARCH_USE_SSL="${ELASTICSEARCH_USE_SSL:=false}" USE_AWS_ELASTICSEARCH="${USE_AWS_ELASTICSEARCH:=true}" +ELASTIC_ID_HASH_ALGO="${ELASTIC_ID_HASH_ALGO:=MD5}" echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index fafb2076fe6990..c16865fe1e71ef 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -25,6 +25,7 @@ source venv/bin/activate source ./set-cypress-creds.sh export DATAHUB_GMS_URL=http://localhost:8080 +export ELASTIC_ID_HASH_ALGO="MD5" # no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest if [[ -z "${TEST_STRATEGY}" ]]; then