Skip to content

Commit

Permalink
fix(datahub-ingestion): prevent transitive deps, bump additional pyspa…
Browse files Browse the repository at this point in the history
…rk deps
  • Loading branch information
david-leifker committed Nov 13, 2023
1 parent 7ba54fd commit 9695adc
Show file tree
Hide file tree
Showing 15 changed files with 70 additions and 39 deletions.
5 changes: 3 additions & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
**/node_modules/
datahub-frontend/build/
metadata-ingestion/venv/
*/build/
*/*/build/
*/venv/
out
**/*.class
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars
Expand Down
3 changes: 2 additions & 1 deletion datahub-frontend/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion datahub-upgrade/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,11 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
8 changes: 5 additions & 3 deletions docker/datahub-ingestion-base/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,20 @@ ext {
docker_repo = 'datahub-ingestion-base'
docker_dir = 'datahub-ingestion-base'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"

revision = 2 // increment to trigger rebuild
}

docker {
name "${docker_registry}/${docker_repo}:v${version}-${docker_target}"
version "v${version}-${docker_target}"
name "${docker_registry}/${docker_repo}:v${docker_version}"
version "v${docker_version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
buildArgs([APP_ENV: docker_target])
}
Expand Down
4 changes: 2 additions & 2 deletions docker/datahub-ingestion/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh .

RUN pip install --no-cache --user ".[base]" && \
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
pip install --no-cache --user ".[all]" && \
./pyspark_jars.sh
pip install --no-cache --user ".[all]"
RUN ./pyspark_jars.sh

FROM base as full-install

Expand Down
7 changes: 7 additions & 0 deletions docker/datahub-ingestion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,10 @@
[![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)

Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.

## Slim vs Full Image Build

There are two versions of this image: `slim` and `full`. The `full` image includes pyspark and Oracle dependencies and is larger due to the Java dependencies.

Running the standard build produces the `slim` image, which does not include pyspark. To build the full
image with pyspark, pass the project property `-PdockerTarget=full` to the build.
16 changes: 8 additions & 8 deletions docker/datahub-ingestion/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ ext {
docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
docker_repo = 'datahub-ingestion'
docker_dir = 'datahub-ingestion'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"

revision = 2 // increment to trigger rebuild
}
Expand All @@ -19,21 +21,19 @@ dependencies {
}

docker {
name "${docker_registry}/${docker_repo}:v${version}-slim"
version "v${version}-slim"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only")
name "${docker_registry}/${docker_repo}:v${docker_version}"
version "v${docker_version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
include "metadata-ingestion/**"
include "metadata-ingestion-modules/**"
}.exclude {
i -> i.file.isHidden() ||
i.file == buildDir ||
i.file == project(':metadata-ingestion').buildDir ||
i.file == project(':metadata-ingestion-modules').buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
buildArgs([DOCKER_VERSION: version,
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')])
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')])
}
tasks.getByName('docker').dependsOn(['build',
':docker:datahub-ingestion-base:docker',
Expand Down
40 changes: 26 additions & 14 deletions docker/datahub-ingestion/pyspark_jars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,33 @@

set -ex

HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}"
ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"

# Remove conflicting versions
echo "Removing version conflicts from $PYSPARK_JARS"
CONFLICTS="zookeeper hadoop- slf4j-"
for jar in $CONFLICTS; do
rm "$PYSPARK_JARS/$jar"*.jar
done
function replace_jar {
JAR_PREFIX=$1
TRANSITIVE=$2
DEPENDENCY=$3

# Fetch dependencies
mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY"
mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY"
echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm -r "$HOME/.m2" || true

# Move to pyspark location
echo "Moving jars to $PYSPARK_JARS"
find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \;
if [ ! -z "$DEPENDENCY" ]; then
echo "Resolving $DEPENDENCY"
mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null

echo "Moving jars to $PYSPARK_JARS"
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
fi
}

replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"
5 changes: 3 additions & 2 deletions docker/elasticsearch-setup/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
include "metadata-service/restli-servlet-impl/src/main/resources/index/**"
include 'metadata-service/restli-servlet-impl/src/main/resources/index/**'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion docker/kafka-setup/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion docker/mysql-setup/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion docker/postgres-setup/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion metadata-jobs/mae-consumer-job/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion metadata-jobs/mce-consumer-job/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down
3 changes: 2 additions & 1 deletion metadata-service/war/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files war.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

Expand Down

0 comments on commit 9695adc

Please sign in to comment.