From 9695adcf4ad3598238be373446bd246f7fa86231 Mon Sep 17 00:00:00 2001 From: David Leifker Date: Mon, 13 Nov 2023 14:50:00 -0600 Subject: [PATCH] fix(datahub-ingestion): prevent transitive deps, bump additional pyspark deps --- .dockerignore | 5 +-- datahub-frontend/build.gradle | 3 +- datahub-upgrade/build.gradle | 3 +- docker/datahub-ingestion-base/build.gradle | 8 +++-- docker/datahub-ingestion/Dockerfile | 4 +-- docker/datahub-ingestion/README.md | 7 ++++ docker/datahub-ingestion/build.gradle | 16 ++++----- docker/datahub-ingestion/pyspark_jars.sh | 40 +++++++++++++-------- docker/elasticsearch-setup/build.gradle | 5 +-- docker/kafka-setup/build.gradle | 3 +- docker/mysql-setup/build.gradle | 3 +- docker/postgres-setup/build.gradle | 3 +- metadata-jobs/mae-consumer-job/build.gradle | 3 +- metadata-jobs/mce-consumer-job/build.gradle | 3 +- metadata-service/war/build.gradle | 3 +- 15 files changed, 70 insertions(+), 39 deletions(-) diff --git a/.dockerignore b/.dockerignore index 29c6c45bb06536..701263f5fedded 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,7 @@ **/node_modules/ -datahub-frontend/build/ -metadata-ingestion/venv/ +*/build/ +*/*/build/ +*/venv/ out **/*.class # Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index fdf13bac0accc0..eb81b317455361 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -77,10 +77,11 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 
5d0edf3ee8427c..81e6e79c2a5e52 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -88,10 +88,11 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index 64635671343ef4..c4d8a962dcd325 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -10,18 +10,20 @@ ext { docker_repo = 'datahub-ingestion-base' docker_dir = 'datahub-ingestion-base' docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") + docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" revision = 2 // increment to trigger rebuild } docker { - name "${docker_registry}/${docker_repo}:v${version}-${docker_target}" - version "v${version}-${docker_target}" + name "${docker_registry}/${docker_repo}:v${docker_version}" + version "v${docker_version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } buildArgs([APP_ENV: docker_target]) } diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 2abd4e2f33befd..1aee79a428a98a 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh . 
RUN pip install --no-cache --user ".[base]" && \ pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \ - pip install --no-cache --user ".[all]" && \ - ./pyspark_jars.sh + pip install --no-cache --user ".[all]" +RUN ./pyspark_jars.sh FROM base as full-install diff --git a/docker/datahub-ingestion/README.md b/docker/datahub-ingestion/README.md index 6580199bcce216..ed856314c5cc0f 100644 --- a/docker/datahub-ingestion/README.md +++ b/docker/datahub-ingestion/README.md @@ -2,3 +2,10 @@ [![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml) Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service. + +## Slim vs Full Image Build + +There are two versions of this image. One includes pyspark and Oracle dependencies and is larger due to the java dependencies. + +Running the standard build results in the `slim` image without pyspark being generated by default. In order to build the full +image with pyspark use the following project property `-PdockerTarget=full`. diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index fed33752a4b816..247b896d6955cb 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -9,6 +9,8 @@ ext { docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry docker_repo = 'datahub-ingestion' docker_dir = 'datahub-ingestion' + docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") + docker_version = "${version}${docker_target == 'slim' ? 
'-slim' : ''}" revision = 2 // increment to trigger rebuild } @@ -19,21 +21,19 @@ dependencies { } docker { - name "${docker_registry}/${docker_repo}:v${version}-slim" - version "v${version}-slim" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only") + name "${docker_registry}/${docker_repo}:v${docker_version}" + version "v${docker_version}" + dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" include "metadata-ingestion/**" include "metadata-ingestion-modules/**" }.exclude { - i -> i.file.isHidden() || - i.file == buildDir || - i.file == project(':metadata-ingestion').buildDir || - i.file == project(':metadata-ingestion-modules').buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } buildArgs([DOCKER_VERSION: version, - RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')]) + RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]) } tasks.getByName('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker', diff --git a/docker/datahub-ingestion/pyspark_jars.sh b/docker/datahub-ingestion/pyspark_jars.sh index ecd24e78c41057..ab4b223f0358a5 100755 --- a/docker/datahub-ingestion/pyspark_jars.sh +++ b/docker/datahub-ingestion/pyspark_jars.sh @@ -2,21 +2,33 @@ set -ex -HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}" -ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars" -# Remove conflicting versions -echo "Removing version conflicts from $PYSPARK_JARS" -CONFLICTS="zookeeper hadoop- slf4j-" -for jar in $CONFLICTS; do - rm "$PYSPARK_JARS/$jar"*.jar -done +function replace_jar { + JAR_PREFIX=$1 + TRANSITIVE=$2 + DEPENDENCY=$3 -# 
Fetch dependencies -mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY" -mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY" + echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar" + ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true + rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true + rm -r "$HOME/.m2" || true -# Move to pyspark location -echo "Moving jars to $PYSPARK_JARS" -find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \; + if [ ! -z "$DEPENDENCY" ]; then + echo "Resolving $DEPENDENCY" + mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null + + echo "Moving jars to $PYSPARK_JARS" + find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \; + find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \; + fi +} + +replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" +replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}" +replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}" +replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}" +replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}" +replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}" +replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}" +replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}" diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index ffee3b9c65cf4f..ac935ca42fd12a 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -15,10 +15,11 @@ docker { version "v${version}" dockerfile 
file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" - include "metadata-service/restli-servlet-impl/src/main/resources/index/**" + include 'metadata-service/restli-servlet-impl/src/main/resources/index/**' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index 573ef21c88bf91..25f9847190de3c 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -15,9 +15,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 0d8941cce48339..1598866914c0ee 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -16,9 +16,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index 8a026be09d2b4c..e24e206c99145c 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -16,9 +16,10 @@ docker { version "v${version}" 
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index 51c758f4343280..5e735e118493cd 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -45,11 +45,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index daf41a1e0303ee..ef042188bc3d83 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -56,11 +56,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 54e95fdcfe5798..35730ad6dfa9f3 100644 --- 
a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -70,11 +70,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files war.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug")