Commit fdf9a5e

Merge branch 'master' into docs/version-archive

hsheth2 authored Mar 4, 2024
2 parents 870b1be + d987707 commit fdf9a5e

Showing 80 changed files with 9,474 additions and 74 deletions.
3 changes: 3 additions & 0 deletions build.gradle
@@ -52,6 +52,8 @@ buildscript {
ext.hazelcastVersion = '5.3.6'
ext.ebeanVersion = '12.16.1'
ext.googleJavaFormatVersion = '1.18.1'
ext.openLineageVersion = '1.5.0'
ext.logbackClassicJava8 = '1.2.12'

ext.docker_registry = 'linkedin'

@@ -176,6 +178,7 @@ project.ext.externalDependency = [
'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion",
'snappy': 'org.xerial.snappy:snappy-java:1.1.10.4',
'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic",
'logbackClassicJava8' : "ch.qos.logback:logback-classic:$logbackClassicJava8",
'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion",
'log4jCore': "org.apache.logging.log4j:log4j-core:$log4jVersion",
'log4jApi': "org.apache.logging.log4j:log4j-api:$log4jVersion",
16 changes: 9 additions & 7 deletions docker/datahub-ingestion-base/Dockerfile
@@ -5,7 +5,7 @@ ARG BASE_IMAGE=base
ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
ARG GITHUB_REPO_URL=https://github.com
ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
ARG PIP_MIRROR_URL=null
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

FROM golang:1-alpine3.18 AS dockerize-binary

@@ -26,15 +26,18 @@ RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION

FROM python:3.10 as base

ARG DEBIAN_REPO_URL
ARG PIP_MIRROR_URL
ARG GITHUB_REPO_URL

ENV DEBIAN_FRONTEND noninteractive

# Optionally set corporate mirror for apk and pip
# Optionally set corporate mirror for deb
ARG DEBIAN_REPO_URL
RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi

# Optionally set corporate mirror for pip
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}

RUN apt-get update && apt-get install -y -qq \
python3-ldap \
Expand Down Expand Up @@ -67,8 +70,7 @@ USER datahub
ENV VIRTUAL_ENV=/datahub-ingestion/.venv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
RUN python3 -m venv $VIRTUAL_ENV && \
uv pip install --no-cache -r requirements.txt && \
pip uninstall -y acryl-datahub
uv pip install --no-cache -r requirements.txt

ENTRYPOINT [ "/entrypoint.sh" ]
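
For reference, the pip mirror is now applied only when PIP_MIRROR_URL is overridden away from the default public index, while UV_INDEX_URL points uv at whatever index PIP_MIRROR_URL specifies. A build against a corporate mirror might look like the following sketch; the mirror URLs and the build context are placeholders, not values from this commit:

docker build -f docker/datahub-ingestion-base/Dockerfile \
    --build-arg PIP_MIRROR_URL=https://mirror.example.com/pypi/simple \
    --build-arg DEBIAN_REPO_URL=https://mirror.example.com/debian \
    .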

18 changes: 11 additions & 7 deletions docker/datahub-ingestion/Dockerfile
@@ -2,12 +2,22 @@
ARG APP_ENV=full
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
ARG DOCKER_VERSION=head
ARG PIP_MIRROR_URL=null
ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

FROM $BASE_IMAGE:$DOCKER_VERSION as base

# Optionally set corporate mirror for deb
USER 0
ARG DEBIAN_REPO_URL
RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi
USER datahub

# Optionally set corporate mirror for pip
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}

COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin

@@ -19,23 +29,17 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS
cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__

FROM base as slim-install
ARG PIP_MIRROR_URL

RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ."

FROM base as full-install-build
ARG PIP_MIRROR_URL
ARG DEBIAN_REPO_URL

USER 0
RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi
RUN apt-get update && apt-get install -y -qq maven

USER datahub
COPY ./docker/datahub-ingestion/pyspark_jars.sh .

RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN uv pip install --no-cache "acryl-datahub[base,all] @ ." "acryl-datahub-airflow-plugin[plugin-v2] @ ./airflow-plugin" && \
datahub --version
RUN ./pyspark_jars.sh
19 changes: 9 additions & 10 deletions docker/datahub-ingestion/Dockerfile-slim-only
@@ -1,26 +1,25 @@
# Defining environment
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
ARG DOCKER_VERSION=head-slim
ARG PIP_MIRROR_URL=null
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

FROM $BASE_IMAGE:$DOCKER_VERSION as base
USER 0
USER datahub

# Optionally set corporate mirror for apk and pip
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}

COPY ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion /datahub-ingestion

ARG RELEASE_VERSION
WORKDIR /datahub-ingestion
RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
cat src/datahub/__init__.py && \
chown -R datahub /datahub-ingestion

USER datahub
cat src/datahub/__init__.py

FROM base as slim-install

ARG PIP_MIRROR_URL

RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ." && \
datahub --version

2 changes: 1 addition & 1 deletion docker/kafka-setup/Dockerfile
@@ -22,7 +22,7 @@ ARG ALPINE_REPO_URL
ARG APACHE_DOWNLOAD_URL
ARG GITHUB_REPO_URL

ENV KAFKA_VERSION 3.4.1
ENV KAFKA_VERSION 3.5.2
ENV SCALA_VERSION 2.13

LABEL name="kafka" version=${KAFKA_VERSION}
24 changes: 14 additions & 10 deletions docs-website/build.gradle
@@ -45,7 +45,7 @@ def projectMdFiles = project.fileTree("${project.projectDir}") {
include '**/*.ts'
exclude 'node_modules'
exclude '**/dist/**'
}
}

// Combine GraphQL schemas for documentation.
task generateGraphQLSchema(type: Exec) {
@@ -68,6 +68,16 @@ task yarnInstall(type: YarnTask) {
} else {
args = ['install']
}

// The node_modules directory can contain built artifacts, so
// it's not really safe to cache it.
outputs.cacheIf { false }

inputs.files(
file('yarn.lock'),
file('package.json'),
)
outputs.dir('node_modules')
}

task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall,
@@ -94,17 +104,11 @@ task fastReload(type: YarnTask) {
task yarnLint(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
inputs.files(projectMdFiles)
args = ['run', 'lint-check']
outputs.dir("dist")
// tell gradle to apply the build cache
outputs.cacheIf { true }
}

task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall]) {
inputs.files(projectMdFiles)
args = ['run', 'lint-fix']
outputs.dir("dist")
// tell gradle to apply the build cache
outputs.cacheIf { true }
}

task serve(type: YarnTask, dependsOn: [yarnInstall] ) {
@@ -123,11 +127,11 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHisto
outputs.cacheIf { true }
// See https://stackoverflow.com/questions/53230823/fatal-error-ineffective-mark-compacts-near-heap-limit-allocation-failed-java
// and https://github.com/facebook/docusaurus/issues/8329.
// TODO: As suggested in https://github.com/facebook/docusaurus/issues/4765, try switching to swc-loader.
// TODO: As suggested in https://github.com/facebook/docusaurus/issues/4765, try switching to swc-loader or esbuild minification.
if (project.hasProperty('useSystemNode') && project.getProperty('useSystemNode').toBoolean()) {
environment = ['NODE_OPTIONS': '--max-old-space-size=10248']
environment = ['NODE_OPTIONS': '--max-old-space-size=14336']
} else {
environment = ['NODE_OPTIONS': '--max-old-space-size=10248 --openssl-legacy-provider']
environment = ['NODE_OPTIONS': '--max-old-space-size=14336 --openssl-legacy-provider']
}
args = ['run', 'build']

4 changes: 4 additions & 0 deletions docs-website/sidebars.js
@@ -299,6 +299,7 @@ module.exports = {
"metadata-integration/java/spark-lineage/README",
"metadata-ingestion/integration_docs/great-expectations",
"metadata-integration/java/datahub-protobuf/README",
"metadata-integration/java/spark-lineage-beta/README",
//"metadata-ingestion/source-docs-template",
{
type: "autogenerated",
@@ -746,6 +747,9 @@ module.exports = {
//"docs/how/build-metadata-service",
//"docs/how/graph-onboarding",
//"docs/demo/graph-onboarding",
//"metadata-integration/java/spark-lineage/README",
// "metadata-integration/java/spark-lineage-beta/README.md
// "metadata-integration/java/openlineage-converter/README"
//"metadata-ingestion-modules/airflow-plugin/README"
// "metadata-ingestion/schedule_docs/datahub", // we can delete this
// TODO: change the titles of these, removing the "What is..." portion from the sidebar"
AbstractMultiFieldPatchBuilder.java
@@ -12,6 +12,7 @@
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nonnull;
import org.apache.commons.lang3.tuple.ImmutableTriple;

public abstract class AbstractMultiFieldPatchBuilder<T extends AbstractMultiFieldPatchBuilder<T>> {
@@ -64,6 +65,14 @@ public T urn(Urn urn) {
*/
protected abstract String getEntityType();

protected static String encodeValue(@Nonnull String value) {
return value.replace("~", "~0").replace("/", "~1");
}

protected static String encodeValueUrn(@Nonnull Urn urn) {
return encodeValue(urn.toString());
}

/**
* Overrides basic behavior to construct multiple patches based on properties
*
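For context, the new encodeValue / encodeValueUrn helpers apply JSON Pointer escaping as defined in RFC 6901: "~" becomes "~0" and "/" becomes "~1", so a full URN can be embedded as a single segment of a JSON Patch path. A minimal standalone sketch of the effect follows; the class name, URN, and path prefix are illustrative only, not part of this commit:

// Illustrative sketch: mirrors the escaping added in AbstractMultiFieldPatchBuilder above.
public class EncodeValueExample {

    // Same rule as the new encodeValue() helper (RFC 6901 escaping for path segments).
    static String encodeValue(String value) {
        return value.replace("~", "~0").replace("/", "~1");
    }

    public static void main(String[] args) {
        // Hypothetical dataset URN whose name contains '/' characters.
        String urn = "urn:li:dataset:(urn:li:dataPlatform:hdfs,/data/tables/foo,PROD)";
        // Unescaped, each '/' would be read as an extra path separator by a JSON Patch
        // processor; escaped, the whole URN remains a single path segment.
        System.out.println("/inputDatasetEdges/" + encodeValue(urn));
        // prints: /inputDatasetEdges/urn:li:dataset:(urn:li:dataPlatform:hdfs,~1data~1tables~1foo,PROD)
    }
}

This is why the DataJobInputOutputPatchBuilder changes below wrap each URN in encodeValue(...) before appending it to the patch path.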
DataJobInputOutputPatchBuilder.java
@@ -32,7 +32,7 @@ public DataJobInputOutputPatchBuilder addInputDatajobEdge(@Nonnull DataJobUrn da
pathValues.add(
ImmutableTriple.of(
PatchOperationType.ADD.getValue(),
INPUT_DATA_JOB_EDGES_PATH_START + dataJobUrn,
INPUT_DATA_JOB_EDGES_PATH_START + encodeValue(dataJobUrn.toString()),
value));
return this;
}
@@ -41,7 +41,7 @@ public DataJobInputOutputPatchBuilder removeInputDatajobEdge(@Nonnull DataJobUrn
pathValues.add(
ImmutableTriple.of(
PatchOperationType.REMOVE.getValue(),
INPUT_DATA_JOB_EDGES_PATH_START + dataJobUrn,
INPUT_DATA_JOB_EDGES_PATH_START + encodeValue(dataJobUrn.toString()),
null));
return this;
}
@@ -51,15 +51,17 @@ public DataJobInputOutputPatchBuilder addInputDatasetEdge(@Nonnull DatasetUrn da

pathValues.add(
ImmutableTriple.of(
PatchOperationType.ADD.getValue(), INPUT_DATASET_EDGES_PATH_START + datasetUrn, value));
PatchOperationType.ADD.getValue(),
INPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()),
value));
return this;
}

public DataJobInputOutputPatchBuilder removeInputDatasetEdge(@Nonnull DatasetUrn datasetUrn) {
pathValues.add(
ImmutableTriple.of(
PatchOperationType.REMOVE.getValue(),
INPUT_DATASET_EDGES_PATH_START + datasetUrn,
INPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()),
null));
return this;
}
@@ -70,7 +72,7 @@ public DataJobInputOutputPatchBuilder addOutputDatasetEdge(@Nonnull DatasetUrn d
pathValues.add(
ImmutableTriple.of(
PatchOperationType.ADD.getValue(),
OUTPUT_DATASET_EDGES_PATH_START + datasetUrn,
OUTPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()),
value));
return this;
}
@@ -79,7 +81,7 @@ public DataJobInputOutputPatchBuilder removeOutputDatasetEdge(@Nonnull DatasetUr
pathValues.add(
ImmutableTriple.of(
PatchOperationType.REMOVE.getValue(),
OUTPUT_DATASET_EDGES_PATH_START + datasetUrn,
OUTPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()),
null));
return this;
}
@@ -88,31 +90,39 @@ public DataJobInputOutputPatchBuilder addInputDatasetField(@Nonnull Urn urn) {
TextNode textNode = instance.textNode(urn.toString());
pathValues.add(
ImmutableTriple.of(
PatchOperationType.ADD.getValue(), INPUT_DATASET_FIELDS_PATH_START + urn, textNode));
PatchOperationType.ADD.getValue(),
INPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()),
textNode));

return this;
}

public DataJobInputOutputPatchBuilder removeInputDatasetField(@Nonnull Urn urn) {
pathValues.add(
ImmutableTriple.of(
PatchOperationType.REMOVE.getValue(), INPUT_DATASET_FIELDS_PATH_START + urn, null));
PatchOperationType.REMOVE.getValue(),
INPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()),
null));
return this;
}

public DataJobInputOutputPatchBuilder addOutputDatasetField(@Nonnull Urn urn) {
TextNode textNode = instance.textNode(urn.toString());
pathValues.add(
ImmutableTriple.of(
PatchOperationType.ADD.getValue(), OUTPUT_DATASET_FIELDS_PATH_START + urn, textNode));
PatchOperationType.ADD.getValue(),
OUTPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()),
textNode));

return this;
}

public DataJobInputOutputPatchBuilder removeOutputDatasetField(@Nonnull Urn urn) {
pathValues.add(
ImmutableTriple.of(
PatchOperationType.REMOVE.getValue(), OUTPUT_DATASET_FIELDS_PATH_START + urn, null));
PatchOperationType.REMOVE.getValue(),
OUTPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()),
null));
return this;
}

@@ -147,17 +157,17 @@ private String getEdgePath(@Nonnull Edge edge, LineageDirection direction) {

if (DATASET_ENTITY_NAME.equals(destinationUrn.getEntityType())
&& LineageDirection.UPSTREAM.equals(direction)) {
return INPUT_DATASET_EDGES_PATH_START + destinationUrn;
return INPUT_DATASET_EDGES_PATH_START + encodeValue(destinationUrn.toString());
}

if (DATASET_ENTITY_NAME.equals(destinationUrn.getEntityType())
&& LineageDirection.DOWNSTREAM.equals(direction)) {
return INPUT_DATASET_EDGES_PATH_START + destinationUrn;
return INPUT_DATASET_EDGES_PATH_START + encodeValue(destinationUrn.toString());
}

if (DATA_JOB_ENTITY_NAME.equals(destinationUrn.getEntityType())
&& LineageDirection.UPSTREAM.equals(direction)) {
return INPUT_DATA_JOB_EDGES_PATH_START + destinationUrn;
return INPUT_DATA_JOB_EDGES_PATH_START + encodeValue(destinationUrn.toString());
}

// TODO: Output Data Jobs not supported by aspect, add here if this changes
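
As a usage sketch, call sites are unchanged; the escaping happens inside the builder. The snippet below assumes the standard URN constructors (DataFlowUrn, DataJobUrn, DataPlatformUrn, DatasetUrn, FabricType) and the existing build() method returning a MetadataChangeProposal; imports for the DataHub classes are omitted since package locations are not shown in this diff, and the URN values are hypothetical:

// Hypothetical job and dataset URNs; the dataset name contains '/' characters,
// which previously produced malformed JSON Patch paths.
DataJobUrn job = new DataJobUrn(new DataFlowUrn("airflow", "daily_etl", "prod"), "load_task");
DatasetUrn input = new DatasetUrn(new DataPlatformUrn("hdfs"), "/data/raw/events", FabricType.PROD);

MetadataChangeProposal mcp = new DataJobInputOutputPatchBuilder()
    .urn(job)
    .addInputDatasetEdge(input)   // path segment is now encodeValue(...)-escaped
    .build();
// mcp can then be emitted through a REST or Kafka emitter as usual.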