Skip to content

Commit

Permalink
feat(config): support alternate hashing algorithm for doc id (#10423)
Browse files Browse the repository at this point in the history
Co-authored-by: david-leifker <[email protected]>
Co-authored-by: John Joyce <[email protected]>
  • Loading branch information
3 people authored Aug 6, 2024
1 parent 63faeec commit a5a33f0
Show file tree
Hide file tree
Showing 21 changed files with 44 additions and 5 deletions.
2 changes: 2 additions & 0 deletions docker/datahub-gms/env/docker-without-neo4j.env
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ PE_CONSUMER_ENABLED=true
UI_INGESTION_ENABLED=true
ENTITY_SERVICE_ENABLE_RETENTION=true

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to disable persistence of client-side analytics events
# DATAHUB_ANALYTICS_ENABLED=false

Expand Down
2 changes: 2 additions & 0 deletions docker/datahub-gms/env/docker.env
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ MCE_CONSUMER_ENABLED=true
PE_CONSUMER_ENABLED=true
UI_INGESTION_ENABLED=true

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to enable Metadata Service Authentication
METADATA_SERVICE_AUTH_ENABLED=false

Expand Down
2 changes: 2 additions & 0 deletions docker/datahub-mae-consumer/env/docker-without-neo4j.env
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ ES_BULK_REFRESH_POLICY=WAIT_UNTIL
GRAPH_SERVICE_IMPL=elasticsearch
ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to disable persistence of client-side analytics events
# DATAHUB_ANALYTICS_ENABLED=false

Expand Down
2 changes: 2 additions & 0 deletions docker/datahub-mae-consumer/env/docker.env
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ NEO4J_PASSWORD=datahub
GRAPH_SERVICE_IMPL=neo4j
ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to disable persistence of client-side analytics events
# DATAHUB_ANALYTICS_ENABLED=false

Expand Down
2 changes: 2 additions & 0 deletions docker/datahub-mce-consumer/env/docker-without-neo4j.env
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ MAE_CONSUMER_ENABLED=false
PE_CONSUMER_ENABLED=false
UI_INGESTION_ENABLED=false

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to configure kafka topic names
# Make sure these names are consistent across the whole deployment
# METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1
Expand Down
2 changes: 2 additions & 0 deletions docker/datahub-mce-consumer/env/docker.env
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ MAE_CONSUMER_ENABLED=false
PE_CONSUMER_ENABLED=false
UI_INGESTION_ENABLED=false

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to configure kafka topic names
# Make sure these names are consistent across the whole deployment
# METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1
Expand Down
1 change: 1 addition & 0 deletions docker/quickstart/docker-compose-m1.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ services:
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- ELASTICSEARCH_PORT=9200
- ELASTIC_ID_HASH_ALGO=MD5
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ services:
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- ELASTICSEARCH_PORT=9200
- ELASTIC_ID_HASH_ALGO=MD5
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ services:
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- ELASTICSEARCH_PORT=9200
- ELASTIC_ID_HASH_ALGO=MD5
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ services:
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
- GRAPH_SERVICE_IMPL=elasticsearch
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml
- ELASTIC_ID_HASH_ALGO=MD5
hostname: datahub-mae-consumer
image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head}
ports:
Expand All @@ -37,6 +38,7 @@ services:
- EBEAN_DATASOURCE_USERNAME=datahub
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- ELASTIC_ID_HASH_ALGO=MD5
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
Expand Down
2 changes: 2 additions & 0 deletions docker/quickstart/docker-compose.consumers.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ services:
- NEO4J_PASSWORD=datahub
- GRAPH_SERVICE_IMPL=neo4j
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml
- ELASTIC_ID_HASH_ALGO=MD5
hostname: datahub-mae-consumer
image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head}
ports:
Expand All @@ -47,6 +48,7 @@ services:
- EBEAN_DATASOURCE_USERNAME=datahub
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- ELASTIC_ID_HASH_ALGO=MD5
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
Expand Down
1 change: 1 addition & 0 deletions docker/quickstart/docker-compose.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ services:
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- ELASTICSEARCH_PORT=9200
- ELASTIC_ID_HASH_ALGO=MD5
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
- ES_BULK_REFRESH_POLICY=WAIT_UNTIL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ public String toDocId() {
}

try {
String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO");
byte[] bytesOfRawDocID = rawDocId.toString().getBytes(StandardCharsets.UTF_8);
MessageDigest md = MessageDigest.getInstance("MD5");
MessageDigest md = MessageDigest.getInstance(hashAlgo);
byte[] thedigest = md.digest(bytesOfRawDocID);
return Base64.getEncoder().encodeToString(thedigest);
} catch (NoSuchAlgorithmException e) {
Expand Down
1 change: 1 addition & 0 deletions metadata-io/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ test {
// override, testng controlling parallelization
// increasing >1 will merely run all tests extra times
maxParallelForks = 1
environment "ELASTIC_ID_HASH_ALGO", "MD5"
}
useTestNG() {
suites 'src/test/resources/testng.xml'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ private String toDocument(SystemMetadata systemMetadata, String urn, String aspe

private String toDocId(@Nonnull final String urn, @Nonnull final String aspect) {
String rawDocId = urn + DOC_DELIMETER + aspect;

String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO");
try {
byte[] bytesOfRawDocID = rawDocId.getBytes(StandardCharsets.UTF_8);
MessageDigest md = MessageDigest.getInstance("MD5");
MessageDigest md = MessageDigest.getInstance(hashAlgo);
byte[] thedigest = md.digest(bytesOfRawDocID);
return Base64.getEncoder().encodeToString(thedigest);
} catch (NoSuchAlgorithmException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,9 @@ private static Pair<String, ObjectNode> getTimeseriesFieldCollectionDocument(
finalDocument);
}

private static String getDocId(@Nonnull JsonNode document, String collectionId) {
private static String getDocId(@Nonnull JsonNode document, String collectionId)
throws IllegalArgumentException {
String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO");
String docId = document.get(MappingsBuilder.TIMESTAMP_MILLIS_FIELD).toString();
JsonNode eventGranularity = document.get(MappingsBuilder.EVENT_GRANULARITY);
if (eventGranularity != null) {
Expand All @@ -276,6 +278,11 @@ private static String getDocId(@Nonnull JsonNode document, String collectionId)
docId += partitionSpec.toString();
}

return DigestUtils.md5Hex(docId);
if (hashAlgo.equalsIgnoreCase("SHA-256")) {
return DigestUtils.sha256Hex(docId);
} else if (hashAlgo.equalsIgnoreCase("MD5")) {
return DigestUtils.md5Hex(docId);
}
throw new IllegalArgumentException("Hash function not handled !");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ elasticsearch:
opensearchUseAwsIamAuth: ${OPENSEARCH_USE_AWS_IAM_AUTH:false}
region: ${AWS_REGION:#{null}}
implementation: ${ELASTICSEARCH_IMPLEMENTATION:elasticsearch} # elasticsearch or opensearch, for handling divergent cases
idHashAlgo: ${ELASTIC_ID_HASH_ALGO:MD5}
sslContext: # Required if useSSL is true
protocol: ${ELASTICSEARCH_SSL_PROTOCOL:#{null}}
secureRandomImplementation: ${ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL:#{null}}
Expand Down
6 changes: 6 additions & 0 deletions smoke-test/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in
environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/'
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1'
environment 'TEST_STRATEGY', 'no_cypress_suite0'
environment "ELASTIC_ID_HASH_ALGO", "MD5"

workingDir = project.projectDir
commandLine 'bash', '-c',
Expand All @@ -101,6 +102,7 @@ task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in
environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/'
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1'
environment 'TEST_STRATEGY', 'no_cypress_suite1'
environment "ELASTIC_ID_HASH_ALGO", "MD5"

workingDir = project.projectDir
commandLine 'bash', '-c',
Expand All @@ -113,6 +115,7 @@ task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:inst
environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/'
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1'
environment 'TEST_STRATEGY', 'cypress_suite1'
environment "ELASTIC_ID_HASH_ALGO", "MD5"

workingDir = project.projectDir
commandLine 'bash', '-c',
Expand All @@ -125,6 +128,7 @@ task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal
environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/'
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1'
environment 'TEST_STRATEGY', 'cypress_rest'
environment "ELASTIC_ID_HASH_ALGO", "MD5"

workingDir = project.projectDir
commandLine 'bash', '-c',
Expand All @@ -139,6 +143,7 @@ task cypressDev(type: Exec, dependsOn: [installDev, ':metadata-ingestion:install
environment 'RUN_QUICKSTART', 'false'
environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/'
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1'
environment "ELASTIC_ID_HASH_ALGO", "MD5"

workingDir = project.projectDir
commandLine 'bash', '-c',
Expand All @@ -154,6 +159,7 @@ task cypressData(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal
environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/'
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1'
environment 'RUN_UI', 'false'
environment "ELASTIC_ID_HASH_ALGO", "MD5"

workingDir = project.projectDir
commandLine 'bash', '-c',
Expand Down
1 change: 1 addition & 0 deletions smoke-test/cypress-dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ source venv/bin/activate

export KAFKA_BROKER_CONTAINER="datahub-kafka-broker-1"
export KAFKA_BOOTSTRAP_SERVER="broker:9092"
export ELASTIC_ID_HASH_ALGO="MD5"
python -c 'from tests.cypress.integration_test import ingest_data; ingest_data()'

cd tests/cypress
Expand Down
1 change: 1 addition & 0 deletions smoke-test/run-quickstart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ DATAHUB_SEARCH_TAG="${DATAHUB_SEARCH_TAG:=2.9.0}"
XPACK_SECURITY_ENABLED="${XPACK_SECURITY_ENABLED:=plugins.security.disabled=true}"
ELASTICSEARCH_USE_SSL="${ELASTICSEARCH_USE_SSL:=false}"
USE_AWS_ELASTICSEARCH="${USE_AWS_ELASTICSEARCH:=true}"
ELASTIC_ID_HASH_ALGO="${ELASTIC_ID_HASH_ALGO:=MD5}"

echo "DATAHUB_VERSION = $DATAHUB_VERSION"
DATAHUB_TELEMETRY_ENABLED=false \
Expand Down
1 change: 1 addition & 0 deletions smoke-test/smoke.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ source venv/bin/activate
source ./set-cypress-creds.sh

export DATAHUB_GMS_URL=http://localhost:8080
export ELASTIC_ID_HASH_ALGO="MD5"

# no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest
if [[ -z "${TEST_STRATEGY}" ]]; then
Expand Down

0 comments on commit a5a33f0

Please sign in to comment.