forked from datahub-project/datahub
feat(schematron): add java capabilities for schema translation (datah…
1 parent: 8e78b59 · commit: 7277fd5
Showing 28 changed files with 3,998 additions and 3 deletions.
@@ -350,6 +350,7 @@ allprojects {
      }
    }
  }

}

configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
@@ -0,0 +1,73 @@
# SchemaTron (Incubating)

> ⚠️ This is an incubating project in draft status. APIs and functionality may change significantly between releases.

SchemaTron is a schema translation toolkit that converts between various schema formats and DataHub's native schema representation. It currently provides robust support for Apache Avro schema translation with a focus on complex schema structures including unions, arrays, maps, and nested records.

## Modules

### CLI Module

Command-line interface for converting schemas and emitting them to DataHub.

```bash
# Execute from this directory
../../../gradlew :metadata-integration:java:datahub-schematron:cli:run --args="-i cli/src/test/resources/FlatUser.avsc"
```

#### CLI Options

- `-i, --input`: Input schema file or directory path
- `-p, --platform`: Data platform name (default: "avro")
- `-s, --server`: DataHub server URL (default: "http://localhost:8080")
- `-t, --token`: DataHub access token
- `--sink`: Output sink - "rest" or "file" (default: "rest")
- `--output-file`: Output file path when using file sink (default: "metadata.json")
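For example, the options above can be combined to write the converted schema to a local file instead of a running DataHub instance. This is an illustrative invocation that reuses the test schema shipped with this module:

```bash
# Illustrative: convert the test schema and write the output to metadata.json via the file sink
../../../gradlew :metadata-integration:java:datahub-schematron:cli:run \
  --args="-i cli/src/test/resources/FlatUser.avsc -p avro --sink file --output-file metadata.json"
```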
### Library Module

Core translation logic and models for schema conversion. Features include:

- Support for complex Avro schema structures:
  - Union types with multiple record options
  - Nested records and arrays
  - Optional fields with defaults
  - Logical types (date, timestamp, etc.)
  - Maps with various value types
  - Enum types
  - Custom metadata and documentation
- Comprehensive path handling for schema fields
- DataHub-compatible metadata generation
- Schema fingerprinting and versioning
## Example Schema Support

The library can handle sophisticated schema structures, including the following (an illustrative schema is sketched after this list):

- Customer profiles with multiple identification types (passport, driver's license, national ID)
- Contact information with primary and alternative contact methods
- Address validation with verification metadata
- Subscription history tracking
- Flexible preference and metadata storage
- Tagged customer attributes
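The schema below is only a sketch, not one of the shipped test resources, and all names in it are hypothetical. It shows the kind of structure listed above: a customer profile whose identification field is a union of two record types, together with an array of tags and a map of preferences with defaults.

```json
{
  "type": "record",
  "name": "CustomerProfile",
  "namespace": "com.example.customer",
  "fields": [
    {
      "name": "identification",
      "doc": "Exactly one identification document, chosen from a union of record types",
      "type": [
        {
          "type": "record",
          "name": "Passport",
          "fields": [
            {"name": "passportNumber", "type": "string"},
            {"name": "expiryDate", "type": {"type": "int", "logicalType": "date"}}
          ]
        },
        {
          "type": "record",
          "name": "DriversLicense",
          "fields": [
            {"name": "licenseNumber", "type": "string"},
            {"name": "issuingState", "type": "string"}
          ]
        }
      ]
    },
    {"name": "tags", "type": {"type": "array", "items": "string"}, "default": []},
    {"name": "preferences", "type": {"type": "map", "values": "string"}, "default": {}}
  ]
}
```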
## Development

The project includes extensive test coverage through:

- Unit tests for field path handling
- Schema translation comparison tests
- Integration tests with Python reference implementation

Test resources include example schemas demonstrating various Avro schema features and edge cases.
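A minimal sketch of running the suite through Gradle, assuming the metadata-ingestion Python environment can be built (the comparison tests call the Python reference implementation):

```bash
# Execute from this directory; per the CLI build file this also triggers :metadata-ingestion:installDev
../../../gradlew :metadata-integration:java:datahub-schematron:cli:test
```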
## Contributing

As this is an incubating project, we welcome contributions and feedback on:

- Additional schema format support
- Improved handling of complex schema patterns
- Enhanced metadata translation
- Documentation and examples
- Test coverage
110 changes: 110 additions & 0 deletions
metadata-integration/java/datahub-schematron/cli/build.gradle
@@ -0,0 +1,110 @@
plugins {
    id "application"
}
apply plugin: 'java'
apply plugin: 'jacoco'

ext {
    javaMainClass = "io.datahubproject.schematron.cli.SchemaTron"
}

application {
    mainClassName = javaMainClass
}

dependencies {
    // Existing dependencies remain unchanged
    implementation 'info.picocli:picocli:4.7.5'
    annotationProcessor 'info.picocli:picocli-codegen:4.7.5'
    implementation 'ch.qos.logback:logback-classic:1.2.11'
    implementation 'ch.qos.logback:logback-core:1.2.11'
    implementation project(':metadata-integration:java:datahub-client')
    implementation project(':metadata-integration:java:datahub-schematron:lib')
    implementation externalDependency.avro
    compileOnly externalDependency.lombok
    annotationProcessor externalDependency.lombok

    // Test dependencies
    testImplementation externalDependency.testng
    testImplementation externalDependency.mockito
}

test {
    useTestNG()

    testLogging {
        events "passed", "skipped", "failed"
        exceptionFormat "full"
        showStandardStreams = true
    }

    systemProperty 'python.venv.path', System.getProperty('python.venv.path', '../venv')
}

task validatePythonEnv {
    doFirst {
        def venvPath = System.getProperty('python.venv.path', '../../../../metadata-ingestion/venv')
        def isWindows = System.getProperty('os.name').toLowerCase().contains('windows')
        def pythonExe = isWindows ? "${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python"

        def result = exec {
            commandLine pythonExe, "-c", "import sys; print(sys.executable)"
            ignoreExitValue = true
            standardOutput = new ByteArrayOutputStream()
            errorOutput = new ByteArrayOutputStream()
        }

        if (result.exitValue != 0) {
            throw new GradleException("Python virtual environment not properly set up at ${venvPath}")
        }
    }
}

test.dependsOn tasks.getByPath(":metadata-ingestion:installDev")

jacocoTestReport {
    dependsOn test
}

test.finalizedBy jacocoTestReport

task updateGoldenFiles {
    dependsOn validatePythonEnv
    doLast {
        def venvPath = System.getProperty('python.venv.path', '../../../../metadata-ingestion/venv')
        def isWindows = System.getProperty('os.name').toLowerCase().contains('windows')
        def pythonExe = isWindows ? "${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python"
        def diffsDir = new File('src/test/resources/diffs')

        if (!diffsDir.exists()) {
            throw new GradleException("Diffs directory not found at ${diffsDir.absolutePath}")
        }

        // Find all json files in the diffs directory
        diffsDir.listFiles().findAll { it.name.endsWith('_diff.json') }.each { diffFile ->
            def baseName = diffFile.name.replace('_diff.json', '')
            def pythonOutput = "build/test-outputs/${baseName}_python.json"
            def javaOutput = "build/test-outputs/${baseName}_java.json"

            println "Updating golden file for ${baseName}..."

            exec {
                commandLine pythonExe,
                    'scripts/mce_diff.py',
                    '--update-golden-diff',
                    '--golden-diff-file',
                    diffFile.absolutePath,
                    pythonOutput,
                    javaOutput
                ignoreExitValue = true
                standardOutput = new ByteArrayOutputStream()
                errorOutput = new ByteArrayOutputStream()
            }
        }
    }
}

configurations {
    provided
    implementation.extendsFrom provided
}
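For reference, a sketch of how the `updateGoldenFiles` task above might be invoked when the golden diff files under `src/test/resources/diffs` need regenerating, assuming the metadata-ingestion virtualenv already exists at its default location:

```bash
# From the repository root: re-run the Python/Java comparison and refresh the golden diffs
./gradlew :metadata-integration:java:datahub-schematron:cli:updateGoldenFiles
```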
94 changes: 94 additions & 0 deletions
metadata-integration/java/datahub-schematron/cli/scripts/avro_schema_to_mce.py
@@ -0,0 +1,94 @@
from datahub.ingestion.extractor.schema_util import AvroToMceSchemaConverter
from avro.schema import parse as parse_avro, RecordSchema
from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter
import datahub.metadata.schema_classes as models
import click
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
import os
import hashlib
from datahub.ingestion.graph.client import get_default_graph


def get_schema_hash(schema):
    # Convert schema to string if it isn't already
    schema_str = str(schema)

    # Create MD5 hash
    schema_hash = hashlib.md5(schema_str.encode("utf-8")).hexdigest()

    return schema_hash


@click.command(name="avro2datahub")
@click.option("--input-file", "-i", type=click.Path(exists=True), required=True)
@click.option("--platform", type=str, required=True)
@click.option("--output-file", "-o", type=click.Path(), default="metadata.py.json")
@click.option("--to-file", "-f", is_flag=True, default=True)
@click.option("--to-server", "-s", is_flag=True, default=False)
def generate_schema_file_from_avro_schema(
    input_file: str, platform: str, output_file: str, to_file: bool, to_server: bool
):
    avro_schema_file = input_file
    output_file_name = output_file
    platform_urn = make_data_platform_urn(platform)
    converter = AvroToMceSchemaConverter(is_key_schema=False)

    # Delete the output file if it exists
    if os.path.exists(output_file_name):
        os.remove(output_file_name)

    with open(avro_schema_file) as f:
        raw_string = f.read()
        avro_schema = parse_avro(raw_string)
        # Get fingerprint bytes
        canonical_form = avro_schema.canonical_form
        print(
            f"Schema canonical form: Length ({len(canonical_form)}); {canonical_form}"
        )
        md5_bytes = avro_schema.fingerprint("md5")
        # Convert to hex string
        avro_schema_hash = md5_bytes.hex()
        assert isinstance(
            avro_schema, RecordSchema
        ), "This command only works for Avro records"
        dataset_urn = make_dataset_urn(
            platform=platform_urn,
            name=(
                f"{avro_schema.namespace}.{avro_schema.name}"
                if avro_schema.namespace
                else avro_schema.name
            ),
        )
        schema_fields = [
            f for f in converter.to_mce_fields(avro_schema, is_key_schema=False)
        ]
        schema_metadata = models.SchemaMetadataClass(
            schemaName=avro_schema.name,
            platform=platform_urn,
            version=0,
            hash=avro_schema_hash,
            platformSchema=models.OtherSchemaClass(rawSchema=raw_string),
            fields=schema_fields,
        )
        assert schema_metadata.validate()
        if to_file:
            with SynchronizedFileEmitter(output_file_name) as file_emitter:
                file_emitter.emit(
                    MetadataChangeProposalWrapper(
                        entityUrn=dataset_urn, aspect=schema_metadata
                    )
                )
        if to_server:
            with get_default_graph() as graph:
                graph.emit(
                    MetadataChangeProposalWrapper(
                        entityUrn=dataset_urn, aspect=schema_metadata
                    )
                )

    print(f"Wrote metadata to {output_file}")


if __name__ == "__main__":
    generate_schema_file_from_avro_schema()
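The script above drives DataHub's Python `AvroToMceSchemaConverter`, which serves as the reference implementation for the comparison tests. A sketch of a standalone invocation from the metadata-ingestion virtualenv (the input path is illustrative):

```bash
# Convert an Avro schema to a DataHub MCP file using the Python reference converter
python avro_schema_to_mce.py -i ../src/test/resources/FlatUser.avsc --platform avro -o metadata.py.json
```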