diff --git a/.travis.yml b/.travis.yml index ba0c27d0e..188fa0fed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,11 +17,9 @@ if: NOT commit_message =~ /^Setting version to.*SNAPSHOT$/ env: global: - SPARK_VERSION=2.4.5 - - HADOOP_VERSION=2.9.2 - - HIVE1_VERSION=1.2.2 - - HUDI_HIVE1_VERSION=0.4.7 + - HADOOP_VERSION=3.2.1 - HIVE_VERSION=2.3.3 - - HUDI_VERSION=0.5.1-incubating + - HUDI_VERSION=0.5.2-incubating before_script: - export -f travis_fold - export -f travis_time_start diff --git a/README.md b/README.md index 2d0c33a3e..181c60b89 100644 --- a/README.md +++ b/README.md @@ -455,7 +455,7 @@ Metorikku supports reading/writing with [Apache Hudi](https://github.com/apache/ Hudi is a very exciting project that basically allows upserts and deletes directly on top of partitioned parquet data. In order to use Hudi with Metorikku you need to add to your classpath (via ```--jars``` or if running locally with ```-cp```) -an external JAR from here: https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar +an external JAR from here: https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar To run Hudi jobs you also have to make sure you have the following spark configuration (pass with ```--conf``` or ```-D```): ```properties diff --git a/build.sbt b/build.sbt index 5cd8b889e..d189e9035 100644 --- a/build.sbt +++ b/build.sbt @@ -62,7 +62,7 @@ libraryDependencies ++= Seq( "org.influxdb" % "influxdb-java" % "2.14", "org.apache.kafka" %% "kafka" % "2.2.0" % "provided", "za.co.absa" % "abris_2.11" % "3.1.1" % "provided" excludeAll(excludeAvro, excludeSpark), - "org.apache.hudi" %% "hudi-spark-bundle" % "0.5.1-incubating" % "provided" excludeAll excludeFasterXML, + "org.apache.hudi" %% "hudi-spark-bundle" % "0.5.2-incubating" % "provided" excludeAll excludeFasterXML, "org.apache.parquet" % "parquet-avro" % "1.10.1" % "provided", "org.apache.avro" % "avro" % "1.8.2" % "provided", "org.apache.hive" % "hive-jdbc" % "2.3.3" % "provided" excludeAll(excludeNetty, excludeNettyAll) diff --git a/docker/hive/Dockerfile b/docker/hive/Dockerfile index 7b777915d..9924455a3 100644 --- a/docker/hive/Dockerfile +++ b/docker/hive/Dockerfile @@ -23,7 +23,7 @@ ENV MYSQL_CONNECTOR_VERSION=5.1.47 RUN wget -q https://repo1.maven.org/maven2/mysql/mysql-connector-java/$MYSQL_CONNECTOR_VERSION/mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar \ && mv mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar $HIVE_HOME/lib -ENV HUDI_VERSION=0.5.1-incubating +ENV HUDI_VERSION=0.5.2-incubating RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hive-bundle/$HUDI_VERSION/hudi-hive-bundle-$HUDI_VERSION.jar \ && mv hudi-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hadoop-mr-bundle/$HUDI_VERSION/hudi-hadoop-mr-bundle-$HUDI_VERSION.jar \ diff --git a/docker/hive1/Dockerfile b/docker/hive1/Dockerfile deleted file mode 100644 index 3783818fb..000000000 --- a/docker/hive1/Dockerfile +++ /dev/null @@ -1,89 +0,0 @@ -FROM openjdk:8u212-b04-jre-stretch - -RUN mkdir /opt/atlas -ENV ATLAS_HOME=/opt/atlas -RUN mkdir -p $ATLAS_HOME/hook/hive -ENV HADOOP_HOME=/opt/hadoop -ENV HADOOP_VERSION=2.7.4 -RUN wget -q https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz \ - && tar -xzf hadoop-$HADOOP_VERSION.tar.gz \ - && mv hadoop-$HADOOP_VERSION $HADOOP_HOME \ - && rm 
hadoop-$HADOOP_VERSION.tar.gz - -ENV HIVE_HOME=/opt/hive -ENV HIVE_VERSION=1.2.2 -RUN wget -q https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz \ - && tar -xzf apache-hive-$HIVE_VERSION-bin.tar.gz \ - && mv apache-hive-$HIVE_VERSION-bin $HIVE_HOME \ - && rm apache-hive-$HIVE_VERSION-bin.tar.gz - -ENV MYSQL_CONNECTOR_VERSION=5.1.47 -RUN wget -q https://repo1.maven.org/maven2/mysql/mysql-connector-java/$MYSQL_CONNECTOR_VERSION/mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar \ - && mv mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar $HIVE_HOME/lib - -ENV HUDI_VERSION=0.4.7 -RUN apt-get update && apt-get install -y ant -RUN wget -q https://repo1.maven.org/maven2/com/uber/hoodie/hoodie-hive-bundle/$HUDI_VERSION/hoodie-hive-bundle-$HUDI_VERSION.jar \ - && mv hoodie-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib -RUN wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$HADOOP_VERSION/hadoop-aws-$HADOOP_VERSION.jar \ - && mv hadoop-aws-$HADOOP_VERSION.jar $HIVE_HOME/lib - -ENV AWS_JAVA_SDK_VERSION=1.7.4 -RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/$AWS_JAVA_SDK_VERSION/aws-java-sdk-$AWS_JAVA_SDK_VERSION.jar \ - && mv aws-java-sdk-$AWS_JAVA_SDK_VERSION.jar $HIVE_HOME/lib - -RUN wget -q https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar \ - && mv jsonevent-layout-1.7.jar $HIVE_HOME/lib -RUN wget -q https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar \ - && mv json-smart-1.1.1.jar $HIVE_HOME/lib - -# Apache Atlas HiveHook installation -ENV ATLAS_VERSION=2.0.0 -ENV HBASE_VERSION=2.0.2 -ENV JACKSON_VERSION=2.9.9 -ENV JERSEY_VERSION=1.19 -ENV JSR311_VERSION=1.1 -ENV KAFKA_2_1_1_VERSION=2.0.0 -ENV SCALA_LIBRARY_VERSION=2.11.12 -ENV COMMONS_CONFIG_VERSION=1.10 - -RUN mkdir -p $ATLAS_HOME/hook/hive/atlas-hive-plugin-impl - -RUN wget -P ${ATLAS_HOME}/hook/hive/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-plugin-classloader/$ATLAS_VERSION/atlas-plugin-classloader-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/ https://repo1.maven.org/maven2/org/apache/atlas/hive-bridge-shim/$ATLAS_VERSION/hive-bridge-shim-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-client-common/$ATLAS_VERSION/atlas-client-common-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-client-v1/$ATLAS_VERSION/atlas-client-v1-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-client-v2/$ATLAS_VERSION/atlas-client-v2-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-common/$ATLAS_VERSION/atlas-common-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-intg/$ATLAS_VERSION/atlas-intg-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/atlas-notification/$ATLAS_VERSION/atlas-notification-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/hdfs-model/$ATLAS_VERSION/hdfs-model-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ 
https://repo1.maven.org/maven2/org/apache/atlas/hive-bridge/$ATLAS_VERSION/hive-bridge-$ATLAS_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/hbase/hbase-common/$HBASE_VERSION/hbase-common-$HBASE_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/hbase/hbase-server/$HBASE_VERSION/hbase-server-$HBASE_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/$JACKSON_VERSION/jackson-annotations-$JACKSON_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/$JACKSON_VERSION/jackson-core-$JACKSON_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/$JACKSON_VERSION/jackson-databind-$JACKSON_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/fasterxml/jackson/module/jackson-module-jaxb-annotations/$JACKSON_VERSION/jackson-module-jaxb-annotations-$JACKSON_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/fasterxml/jackson/jaxrs/jackson-jaxrs-base/$JACKSON_VERSION/jackson-jaxrs-base-$JACKSON_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/fasterxml/jackson/jaxrs/jackson-jaxrs-json-provider/$JACKSON_VERSION/jackson-jaxrs-json-provider-$JACKSON_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/sun/jersey/jersey-json/$JERSEY_VERSION/jersey-json-$JERSEY_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/com/sun/jersey/contribs/jersey-multipart/$JERSEY_VERSION/jersey-multipart-$JERSEY_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/$KAFKA_2_1_1_VERSION/kafka-clients-$KAFKA_2_1_1_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/kafka/kafka_2.12/$KAFKA_2_1_1_VERSION/kafka_2.12-$KAFKA_2_1_1_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/scala-lang/scala-library/$SCALA_LIBRARY_VERSION/scala-library-$SCALA_LIBRARY_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/commons-configuration/commons-configuration/$COMMONS_CONFIG_VERSION/commons-configuration-$COMMONS_CONFIG_VERSION.jar -RUN wget -P ${ATLAS_HOME}/hook/hive/atlas-hive-plugin-impl/ https://repo1.maven.org/maven2/org/apache/atlas/hdfs-model/$ATLAS_VERSION/hdfs-model-$ATLAS_VERSION.jar - - -ADD https://github.com/ufoscout/docker-compose-wait/releases/download/2.5.0/wait /wait -RUN chmod +x /wait - -COPY start-hive.sh / -COPY log4j.json.properties . 
- -RUN mkdir -p $ATLAS_HOME/hook-bin/ -COPY atlas/import_hive.sh $ATLAS_HOME/hook-bin/ -RUN chmod +x /$ATLAS_HOME/hook-bin/import_hive.sh - -CMD /wait && /start-hive.sh \ No newline at end of file diff --git a/docker/hive1/atlas/import_hive.sh b/docker/hive1/atlas/import_hive.sh deleted file mode 100644 index 71408bf3b..000000000 --- a/docker/hive1/atlas/import_hive.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. -# -# resolve links - $0 may be a softlink -# Taken from Apache Atlas project repo, removed JAR_BIN and Removed Hoodie Jar from Classpath -PRG="${0}" - -[[ `uname -s` == *"CYGWIN"* ]] && CYGWIN=true - -while [ -h "${PRG}" ]; do - ls=`ls -ld "${PRG}"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "${PRG}"`/"$link" - fi -done - -BASEDIR=`dirname ${PRG}` -BASEDIR=`cd ${BASEDIR}/..;pwd` - -if test -z "${JAVA_HOME}" -then - JAVA_BIN=`which java` - JAR_BIN=`which jar` -else - JAVA_BIN="${JAVA_HOME}/bin/java" - JAR_BIN="${JAVA_HOME}/bin/jar" -fi -export JAVA_BIN - -if [[ ! -e "${JAVA_BIN}" ]]; then - echo "$JAVA_BIN and/or $JAR_BIN not found on the system. Please make sure java and jar commands are available." - exit 1 -fi - -# Construct Atlas classpath using jars from hook/hive/atlas-hive-plugin-impl/ directory. -for i in "${BASEDIR}/hook/hive/atlas-hive-plugin-impl/"*.jar; do - ATLASCPPATH="${ATLASCPPATH}:$i" -done - -# log dir for applications -ATLAS_LOG_DIR="${ATLAS_LOG_DIR:-$BASEDIR/logs}" -export ATLAS_LOG_DIR -LOGFILE="$ATLAS_LOG_DIR/import-hive.log" - -TIME=`date +%Y%m%d%H%M%s` - -#Add hive conf in classpath -if [ ! -z "$HIVE_CONF_DIR" ]; then - HIVE_CONF=$HIVE_CONF_DIR -elif [ ! -z "$HIVE_HOME" ]; then - HIVE_CONF="$HIVE_HOME/conf" -elif [ -e /etc/hive/conf ]; then - HIVE_CONF="/etc/hive/conf" -else - echo "Could not find a valid HIVE configuration" - exit 1 -fi - -echo Using Hive configuration directory ["$HIVE_CONF"] - - -if [ -f "${HIVE_CONF}/hive-env.sh" ]; then - . "${HIVE_CONF}/hive-env.sh" -fi - -if [ -z "$HIVE_HOME" ]; then - if [ -d "${BASEDIR}/../hive" ]; then - HIVE_HOME=${BASEDIR}/../hive - else - echo "Please set HIVE_HOME to the root of Hive installation" - exit 1 - fi -fi - -HIVE_CP="${HIVE_CONF}" - -for i in "${HIVE_HOME}/lib/"*.jar; do - if [[ $i == *"hoodie"* ]]; then - continue - fi - HIVE_CP="${HIVE_CP}:$i" -done - -#Add hadoop conf in classpath -if [ ! -z "$HADOOP_CLASSPATH" ]; then - HADOOP_CP=$HADOOP_CLASSPATH -elif [ ! -z "$HADOOP_HOME" ]; then - HADOOP_CP=`$HADOOP_HOME/bin/hadoop classpath` -elif [ $(command -v hadoop) ]; then - HADOOP_CP=`hadoop classpath` - echo $HADOOP_CP -else - echo "Environment variable HADOOP_CLASSPATH or HADOOP_HOME need to be set" - exit 1 -fi - -CP="${ATLASCPPATH}:${HIVE_CP}:${HADOOP_CP}" - -# If running in cygwin, convert pathnames and classpath to Windows format. 
-if [ "${CYGWIN}" == "true" ] -then - ATLAS_LOG_DIR=`cygpath -w ${ATLAS_LOG_DIR}` - LOGFILE=`cygpath -w ${LOGFILE}` - HIVE_CP=`cygpath -w ${HIVE_CP}` - HADOOP_CP=`cygpath -w ${HADOOP_CP}` - CP=`cygpath -w -p ${CP}` -fi - -JAVA_PROPERTIES="$ATLAS_OPTS -Datlas.log.dir=$ATLAS_LOG_DIR -Datlas.log.file=import-hive.log --Dlog4j.configuration=atlas-hive-import-log4j.xml" - -IMPORT_ARGS= -JVM_ARGS= - -while true -do - option=$1 - shift - - case "$option" in - -d) IMPORT_ARGS="$IMPORT_ARGS -d $1"; shift;; - -t) IMPORT_ARGS="$IMPORT_ARGS -t $1"; shift;; - -f) IMPORT_ARGS="$IMPORT_ARGS -f $1"; shift;; - --database) IMPORT_ARGS="$IMPORT_ARGS --database $1"; shift;; - --table) IMPORT_ARGS="$IMPORT_ARGS --table $1"; shift;; - --filename) IMPORT_ARGS="$IMPORT_ARGS --filename $1"; shift;; - "") break;; - *) JVM_ARGS="$JVM_ARGS $option" - esac -done - -JAVA_PROPERTIES="${JAVA_PROPERTIES} ${JVM_ARGS}" - -echo "Log file for import is $LOGFILE" - -"${JAVA_BIN}" ${JAVA_PROPERTIES} -cp "${CP}" org.apache.atlas.hive.bridge.HiveMetaStoreBridge $IMPORT_ARGS - -RETVAL=$? -[ $RETVAL -eq 0 ] && echo Hive Meta Data imported successfully!!! -[ $RETVAL -ne 0 ] && echo Failed to import Hive Meta Data!!! - -exit $RETVAL \ No newline at end of file diff --git a/docker/hive1/log4j.json.properties b/docker/hive1/log4j.json.properties deleted file mode 100644 index 2b31b0112..000000000 --- a/docker/hive1/log4j.json.properties +++ /dev/null @@ -1,4 +0,0 @@ -log4j.rootLogger=INFO, CONSOLE - -log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -log4j.appender.CONSOLE.layout=net.logstash.log4j.JSONEventLayoutV1 diff --git a/docker/hive1/start-hive.sh b/docker/hive1/start-hive.sh deleted file mode 100755 index dcccb6d92..000000000 --- a/docker/hive1/start-hive.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -CONNECTION_DRIVER_NAME=${CONNECTION_DRIVER_NAME:=com.mysql.jdbc.Driver} -HIVE_SERVER_PORT=${HIVE_SERVER_PORT:=10000} -SCHEMA_VERIFICATION=${SCHEMA_VERIFICATION:=false} -METASTORE_PORT=${METASTORE_PORT:=9083} -DEFAULT_FS=${DEFAULT_FS:=file:///} -DB_TYPE=${DB_TYPE:=mysql} -USE_ATLAS=${USE_ATLAS:=false} - -if [ ! -z ${JSON_LOG} ] ; then - echo "Setting Log type to JSON" - cat log4j.json.properties >> ${HIVE_HOME}/conf/hive-log4j.properties -fi - -cat >${HIVE_HOME}/conf/hive-site.xml < - - javax.jdo.option.ConnectionURL - ${CONNECTION_URL} - JDBC connect string for a JDBC metastore - - - javax.jdo.option.ConnectionDriverName - ${CONNECTION_DRIVER_NAME} - Driver class name for a JDBC metastore - - - javax.jdo.option.ConnectionUserName - ${CONNECTION_USER_NAME} - username to use against metastore database - - - javax.jdo.option.ConnectionPassword - ${CONNECTION_PASSWORD} - password to use against metastore database - - - hive.metastore.schema.verification - ${SCHEMA_VERIFICATION} - - - hive.metastore.warehouse.dir - ${WAREHOUSE_DIR} - - - - hive.metastore.uris - thrift://localhost:${METASTORE_PORT} - - - hive.server2.thrift.port - ${HIVE_SERVER_PORT} - - - fs.default.name - ${DEFAULT_FS} - - - fs.s3a.impl - org.apache.hadoop.fs.s3a.S3AFileSystem - - - fs.s3.impl - org.apache.hadoop.fs.s3a.S3AFileSystem - - - fs.s3n.awsAccessKeyId - ${AWS_ACCESS_KEY} - - - fs.s3n.awsSecretAccessKey - ${AWS_SECRET_KEY} - - - hive.security.authorization.enabled - false - -EOL - -if [[ ! 
-z ${USE_ATLAS} ]] ; then -cat >>${HIVE_HOME}/conf/hive-site.xml < - hive.exec.post.hooks - org.apache.atlas.hive.hook.HiveHook - - -EOL -# hive-env extra jars -cat >>${HIVE_HOME}/conf/hive-env.sh <${HIVE_HOME}/conf/atlas-application.properties <>${HIVE_HOME}/conf/hive-site.xml < -EOL -fi - -$HIVE_HOME/bin/schematool -dbType ${DB_TYPE} -initSchema - -nohup ${HIVE_HOME}/bin/hive --service metastore -p ${METASTORE_PORT} & -${HIVE_HOME}/bin/hiveserver2 --hiveconf hive.root.logger=INFO,console diff --git a/docker/spark/Dockerfile b/docker/spark/Dockerfile index 0fde928df..b27083846 100644 --- a/docker/spark/Dockerfile +++ b/docker/spark/Dockerfile @@ -9,17 +9,16 @@ RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-$ && rm -f /spark/jars/hadoop*2.7* \ && cd / -ARG HADOOP_VERSION=2.9.2 +ARG HADOOP_VERSION=3.2.1 RUN wget -q https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \ && tar -xzf hadoop-${HADOOP_VERSION}.tar.gz -C /opt/ \ && rm hadoop-${HADOOP_VERSION}.tar.gz RUN ln -s /opt/hadoop-${HADOOP_VERSION}/etc/hadoop /etc/hadoop -RUN cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml RUN mkdir /opt/hadoop-${HADOOP_VERSION}/logs -ENV HADOOP_PREFIX=/opt/hadoop-${HADOOP_VERSION} +ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION} ENV HADOOP_CONF_DIR=/etc/hadoop ENV HIVE_HOME=/opt/hive @@ -29,7 +28,7 @@ RUN wget -q https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive- && mv apache-hive-$HIVE_VERSION-bin $HIVE_HOME \ && rm apache-hive-$HIVE_VERSION-bin.tar.gz -ENV HUDI_VERSION=0.5.1-incubating +ENV HUDI_VERSION=0.5.2-incubating RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hive-bundle/$HUDI_VERSION/hudi-hive-bundle-$HUDI_VERSION.jar \ && mv hudi-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hadoop-mr-bundle/$HUDI_VERSION/hudi-hadoop-mr-bundle-$HUDI_VERSION.jar \ @@ -42,9 +41,11 @@ RUN apt-get update \ && rm get-pip.py \ && rm -rf /var/lib/apt/lists/* -ARG AWS_SDK_VERSION=1.11.699 -ARG HADOOP_AWS_VERSION=3.0.3 +ARG AWS_SDK_VERSION=1.11.769 +ARG AWS_DYNAMO_VERSION=1.11.375 +ARG HADOOP_AWS_VERSION=3.2.1 ARG HTTPCLIENT_VERSION=4.5.11 +ARG SPARK_HADOOP_CLOUD_VERSION=2.4.0.7.0.3.0-79 RUN wget -q https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar -P /spark/jars/ RUN wget -q https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar -P /spark/jars/ @@ -52,7 +53,10 @@ RUN wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar -P /spark/jars/ RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar -P /spark/jars/ RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar -P /spark/jars/ +RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/${AWS_DYNAMO_VERSION}/aws-java-sdk-dynamodb-${AWS_DYNAMO_VERSION}.jar -P /spark/jars/ +RUN wget -q https://repository.cloudera.com/artifactory/libs-release-local/org/apache/spark/spark-hadoop-cloud_2.11/${SPARK_HADOOP_CLOUD_VERSION}/spark-hadoop-cloud_2.11-${SPARK_HADOOP_CLOUD_VERSION}.jar -P /spark/jars/ RUN rm -f /spark/jars/httpclient-*.jar && wget -q 
https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/${HTTPCLIENT_VERSION}/httpclient-${HTTPCLIENT_VERSION}.jar -P /spark/jars
+RUN rm -f /spark/jars/guava-*.jar
 ADD log4j.json.properties /spark/conf/
 ADD spark-defaults.conf /spark/conf/
@@ -60,7 +64,8 @@ ADD spark-env.sh /spark/conf/
 ENV PYTHONHASHSEED 1
 ENV SPARK_HOME /spark
-ENV PATH="${HADOOP_PREFIX}/bin:/spark/bin:${PATH}"
+ENV PATH="${HADOOP_HOME}/bin:/spark/bin:${PATH}"
 ADD scripts /scripts
+ADD hive-shim-loader/target/*.jar /spark/pre-jars/
 ENTRYPOINT ["/scripts/entrypoint-master.sh"]
\ No newline at end of file
diff --git a/docker/spark/hive-shim-loader/pom.xml b/docker/spark/hive-shim-loader/pom.xml
new file mode 100644
index 000000000..28a9e2841
--- /dev/null
+++ b/docker/spark/hive-shim-loader/pom.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.yotpo</groupId>
+    <artifactId>hive-shim-loader</artifactId>
+    <packaging>jar</packaging>
+    <version>1.2.1-spark2-hadoop3</version>
+    <name>Hive Shim Loader</name>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.spark-project.hive</groupId>
+            <artifactId>hive-shims</artifactId>
+            <version>1.2.1.spark2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-core</artifactId>
+            <version>1.2.1</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.1</version>
+                <configuration>
+                    <source>1.7</source>
+                    <target>1.7</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git a/docker/spark/hive-shim-loader/src/main/java/org/apache/hadoop/hive/shims/ShimLoader.java b/docker/spark/hive-shim-loader/src/main/java/org/apache/hadoop/hive/shims/ShimLoader.java
new file mode 100644
index 000000000..13540a7d0
--- /dev/null
+++ b/docker/spark/hive-shim-loader/src/main/java/org/apache/hadoop/hive/shims/ShimLoader.java
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.shims;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge;
+import org.apache.hadoop.util.VersionInfo;
+import org.apache.log4j.AppenderSkeleton;
+
+/**
+ * ShimLoader.
+ *
+ */
+public abstract class ShimLoader {
+  public static String HADOOP20SVERSIONNAME = "0.20S";
+  public static String HADOOP23VERSIONNAME = "0.23";
+
+  private static HadoopShims hadoopShims;
+  private static JettyShims jettyShims;
+  private static AppenderSkeleton eventCounter;
+  private static HadoopThriftAuthBridge hadoopThriftAuthBridge;
+  private static SchedulerShim schedulerShim;
+
+  /**
+   * The names of the classes for shimming Hadoop for each major version.
+   */
+  private static final HashMap<String, String> HADOOP_SHIM_CLASSES =
+      new HashMap<String, String>();
+
+  static {
+    HADOOP_SHIM_CLASSES.put(HADOOP20SVERSIONNAME, "org.apache.hadoop.hive.shims.Hadoop20SShims");
+    HADOOP_SHIM_CLASSES.put(HADOOP23VERSIONNAME, "org.apache.hadoop.hive.shims.Hadoop23Shims");
+  }
+
+  /**
+   * The names of the classes for shimming Jetty for each major version of
+   * Hadoop.
+   */
+  private static final HashMap<String, String> JETTY_SHIM_CLASSES =
+      new HashMap<String, String>();
+
+  static {
+    JETTY_SHIM_CLASSES.put(HADOOP20SVERSIONNAME, "org.apache.hadoop.hive.shims.Jetty20SShims");
+    JETTY_SHIM_CLASSES.put(HADOOP23VERSIONNAME, "org.apache.hadoop.hive.shims.Jetty23Shims");
+  }
+
+  /**
+   * The names of the classes for shimming Hadoop's event counter
+   */
+  private static final HashMap<String, String> EVENT_COUNTER_SHIM_CLASSES =
+      new HashMap<String, String>();
+
+  static {
+    EVENT_COUNTER_SHIM_CLASSES.put(HADOOP20SVERSIONNAME, "org.apache.hadoop.log.metrics" +
+        ".EventCounter");
+    EVENT_COUNTER_SHIM_CLASSES.put(HADOOP23VERSIONNAME, "org.apache.hadoop.log.metrics" +
+        ".EventCounter");
+  }
+
+  /**
+   * The names of the classes for shimming {@link HadoopThriftAuthBridge}
+   */
+  private static final HashMap<String, String> HADOOP_THRIFT_AUTH_BRIDGE_CLASSES =
+      new HashMap<String, String>();
+
+  static {
+    HADOOP_THRIFT_AUTH_BRIDGE_CLASSES.put(HADOOP20SVERSIONNAME,
+        "org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge");
+    HADOOP_THRIFT_AUTH_BRIDGE_CLASSES.put(HADOOP23VERSIONNAME,
+        "org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge23");
+  }
+
+
+  private static final String SCHEDULER_SHIM_CLASSE =
+      "org.apache.hadoop.hive.schshim.FairSchedulerShim";
+
+  /**
+   * Factory method to get an instance of HadoopShims based on the
+   * version of Hadoop on the classpath.
+   */
+  public static synchronized HadoopShims getHadoopShims() {
+    if (hadoopShims == null) {
+      hadoopShims = loadShims(HADOOP_SHIM_CLASSES, HadoopShims.class);
+    }
+    return hadoopShims;
+  }
+
+  /**
+   * Factory method to get an instance of JettyShims based on the version
+   * of Hadoop on the classpath.
+   */
+  public static synchronized JettyShims getJettyShims() {
+    if (jettyShims == null) {
+      jettyShims = loadShims(JETTY_SHIM_CLASSES, JettyShims.class);
+    }
+    return jettyShims;
+  }
+
+  public static synchronized AppenderSkeleton getEventCounter() {
+    if (eventCounter == null) {
+      eventCounter = loadShims(EVENT_COUNTER_SHIM_CLASSES, AppenderSkeleton.class);
+    }
+    return eventCounter;
+  }
+
+  public static synchronized HadoopThriftAuthBridge getHadoopThriftAuthBridge() {
+    if (hadoopThriftAuthBridge == null) {
+      hadoopThriftAuthBridge = loadShims(HADOOP_THRIFT_AUTH_BRIDGE_CLASSES,
+          HadoopThriftAuthBridge.class);
+    }
+    return hadoopThriftAuthBridge;
+  }
+
+  public static synchronized SchedulerShim getSchedulerShims() {
+    if (schedulerShim == null) {
+      schedulerShim = createShim(SCHEDULER_SHIM_CLASSE, SchedulerShim.class);
+    }
+    return schedulerShim;
+  }
+
+  private static <T> T loadShims(Map<String, String> classMap, Class<T> xface) {
+    String vers = getMajorVersion();
+    String className = classMap.get(vers);
+    return createShim(className, xface);
+  }
+
+  private static <T> T createShim(String className, Class<T> xface) {
+    try {
+      Class<?> clazz = Class.forName(className);
+      return xface.cast(clazz.newInstance());
+    } catch (Exception e) {
+      throw new RuntimeException("Could not load shims in class " + className, e);
+    }
+  }
+
+  /**
+   * Return the "major" version of Hadoop currently on the classpath.
+   * Releases in the 1.x and 2.x series are mapped to the appropriate
+   * 0.x release series, e.g. 1.x is mapped to "0.20S" and 2.x
+   * is mapped to "0.23".
+ */ + public static String getMajorVersion() { + String vers = VersionInfo.getVersion(); + + String[] parts = vers.split("\\."); + if (parts.length < 2) { + throw new RuntimeException("Illegal Hadoop Version: " + vers + + " (expected A.B.* format)"); + } + + switch (Integer.parseInt(parts[0])) { + case 1: + return HADOOP20SVERSIONNAME; + case 2: + return HADOOP23VERSIONNAME; + case 3: + return HADOOP23VERSIONNAME; + default: + throw new IllegalArgumentException("Unrecognized Hadoop major version number: " + vers); + } + } + + private ShimLoader() { + // prevent instantiation + } +} \ No newline at end of file diff --git a/docker/spark/scripts/entrypoint-submit.sh b/docker/spark/scripts/entrypoint-submit.sh index c7417f8cc..2dabd5c9c 100755 --- a/docker/spark/scripts/entrypoint-submit.sh +++ b/docker/spark/scripts/entrypoint-submit.sh @@ -8,7 +8,6 @@ MAX_RETRIES=${MAX_RETRIES:=300} MIN_WORKERS=${MIN_WORKERS:=1} SPARK_UI_PORT=${SPARK_UI_PORT:=4040} POST_SCRIPT=${POST_SCRIPT:=/scripts/finish-submit.sh} -USE_BUILTIN_HIVE_METASTORE=${USE_BUILTIN_HIVE_METASTORE:=true} # Atlas /scripts/add-atlas-integration.sh @@ -43,15 +42,12 @@ spark.ui.port $SPARK_UI_PORT if [[ ! -z ${HIVE_METASTORE_URI} ]]; then echo -e " -spark.sql.catalogImplementation=hive -spark.hadoop.hive.metastore.uris=thrift://$HIVE_METASTORE_URI -" >> /spark/conf/spark-defaults.conf -fi - -if [[ "${USE_BUILTIN_HIVE_METASTORE}" == false ]]; then -echo -e " spark.sql.hive.metastore.version=$HIVE_VERSION spark.sql.hive.metastore.jars=/opt/hive/lib/* +spark.sql.catalogImplementation=hive +spark.hadoop.hive.metastore.uris=thrift://$HIVE_METASTORE_URI +spark.hadoop.hive.metastore.schema.verification=false +spark.hadoop.hive.metastore.schema.verification.record.version=false " >> /spark/conf/spark-defaults.conf fi diff --git a/docker/spark/spark-defaults.conf b/docker/spark/spark-defaults.conf index d51076262..55d9dd030 100644 --- a/docker/spark/spark-defaults.conf +++ b/docker/spark/spark-defaults.conf @@ -8,10 +8,16 @@ spark.executor.logs.rolling.strategy=size spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem spark.hadoop.fs.s3a.fast.upload=true spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem -spark.hadoop.fs.s3a.multiobjectdelete.enable=false -spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 +spark.hadoop.fs.s3a.committer.name=partitioned +spark.hadoop.fs.s3a.committer.staging.conflict-mode=replace +spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a=org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory +spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3=org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory +spark.sql.sources.commitProtocolClass=org.apache.spark.internal.io.cloud.PathOutputCommitProtocol +spark.sql.parquet.output.committer.class=org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter spark.port.maxRetries=0 spark.rdd.compress=true spark.serializer=org.apache.spark.serializer.KryoSerializer spark.sql.hive.convertMetastoreParquet=false -spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation=true \ No newline at end of file +spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation=true +spark.driver.extraClassPath=/spark/pre-jars/* +spark.executor.extraClassPath=/spark/pre-jars/* \ No newline at end of file diff --git a/e2e/cdc/docker-compose.yml b/e2e/cdc/docker-compose.yml index 548aeb0bf..83e28726e 100644 --- a/e2e/cdc/docker-compose.yml +++ b/e2e/cdc/docker-compose.yml @@ -110,9 +110,8 @@ services: spark-submit: image: metorikku/metorikku 
environment: - - SUBMIT_COMMAND=spark-submit --repositories http://packages.confluent.io/maven/ --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar,https://repo1.maven.org/maven2/za/co/absa/abris_2.11/3.1.1/abris_2.11-3.1.1.jar --packages org.apache.spark:spark-avro_2.11:2.4.5,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,io.confluent:kafka-schema-registry-client:5.3.0,io.confluent:kafka-avro-serializer:5.3.0,org.apache.kafka:kafka_2.11:2.2.0 --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/kafka/kafka_example_cdc.yaml + - SUBMIT_COMMAND=spark-submit --repositories http://packages.confluent.io/maven/ --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar,https://repo1.maven.org/maven2/za/co/absa/abris_2.11/3.1.1/abris_2.11-3.1.1.jar --packages org.apache.spark:spark-avro_2.11:2.4.5,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,io.confluent:kafka-schema-registry-client:5.3.0,io.confluent:kafka-avro-serializer:5.3.0,org.apache.kafka:kafka_2.11:2.2.0 --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/kafka/kafka_example_cdc.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false entrypoint: - /scripts/entrypoint-submit.sh volumes: @@ -144,9 +143,8 @@ services: hive-tester: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings /test_metrics/hive_test.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings /test_metrics/hive_test.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ - ./test_metrics:/test_metrics diff --git a/e2e/elasticsearch/docker-compose.yml b/e2e/elasticsearch/docker-compose.yml index 1928fe6fd..f8410e120 100644 --- a/e2e/elasticsearch/docker-compose.yml +++ b/e2e/elasticsearch/docker-compose.yml @@ -4,7 +4,7 @@ services: spark-submit: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.elasticsearch:elasticsearch-hadoop:6.6.1 --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/elasticsearch/movies.yaml + - SUBMIT_COMMAND=spark-submit --packages commons-httpclient:commons-httpclient:3.0.1,org.elasticsearch:elasticsearch-hadoop:6.6.1 --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/elasticsearch/movies.yaml entrypoint: - /scripts/entrypoint-submit.sh depends_on: diff --git a/e2e/hive/docker-compose.yml b/e2e/hive/docker-compose.yml index 863978c9b..1ac90eede 100644 --- a/e2e/hive/docker-compose.yml +++ b/e2e/hive/docker-compose.yml @@ -5,7 +5,6 @@ services: environment: - SUBMIT_COMMAND=spark-submit --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hive/movies.yaml - HIVE_METASTORE_URI=hive:9083 - - 
USE_BUILTIN_HIVE_METASTORE=false entrypoint: - /scripts/entrypoint-submit.sh volumes: @@ -19,7 +18,6 @@ services: environment: - SUBMIT_COMMAND=spark-submit --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hive/movies_test.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ - ./warehouse:/warehouse diff --git a/e2e/hive1/docker-compose.yml b/e2e/hive1/docker-compose.yml deleted file mode 100644 index caea9e9bb..000000000 --- a/e2e/hive1/docker-compose.yml +++ /dev/null @@ -1,58 +0,0 @@ -version: '3' -services: - spark-submit: - image: metorikku/metorikku - environment: - - SUBMIT_COMMAND=spark-submit --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hive/movies.yaml - - HIVE_METASTORE_URI=hive:9083 - entrypoint: - - /scripts/entrypoint-submit.sh - volumes: - - ./output/:/examples/output/ - depends_on: - - spark-master - - spark-worker - - hive - hive-tester: - image: metorikku/metorikku - environment: - - SUBMIT_COMMAND=spark-submit --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hive/movies_test.yaml - - HIVE_METASTORE_URI=hive:9083 - volumes: - - ./output/:/examples/output/ - - ./warehouse:/warehouse - entrypoint: - - /scripts/entrypoint-submit.sh - depends_on: - - spark-master - - spark-worker - spark-master: - image: metorikku/metorikku - entrypoint: - - /scripts/entrypoint-master.sh - logging: - driver: none - spark-worker: - image: metorikku/metorikku - entrypoint: - - /scripts/entrypoint-worker.sh - volumes: - - ./output/:/examples/output/ - - ./warehouse:/warehouse - logging: - driver: none - hive: - image: metorikku/hive:1 - environment: - - CONNECTION_URL=jdbc:mysql://hive-db:3306/hive?useSSL=false - - CONNECTION_USER_NAME=root - - CONNECTION_PASSWORD=pass - - WAREHOUSE_DIR=file:///warehouse - - WAIT_HOSTS=hive-db:3306 - depends_on: - - hive-db - hive-db: - image: mysql:5.7.25 - environment: - - MYSQL_ROOT_PASSWORD=pass - - MYSQL_DATABASE=hive diff --git a/e2e/hive1/test.sh b/e2e/hive1/test.sh deleted file mode 100755 index 536f9c52c..000000000 --- a/e2e/hive1/test.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -set -e - -docker-compose up -d hive-db -docker-compose up --exit-code-from spark-submit spark-submit -docker-compose up --exit-code-from hive-tester hive-tester -exit_code=$(docker ps -aq -f label=com.docker.compose.project=hive | xargs -I{} docker inspect {} --format='{{.State.ExitCode}}' | paste -sd+ - | bc) -docker-compose down -exit $exit_code diff --git a/e2e/hudi/docker-compose.yml b/e2e/hudi/docker-compose.yml index 96a7d5cf1..b87b37398 100644 --- a/e2e/hudi/docker-compose.yml +++ b/e2e/hudi/docker-compose.yml @@ -3,9 +3,8 @@ services: spark-submit: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/movies.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/movies.yaml - 
HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ entrypoint: @@ -16,9 +15,8 @@ services: hive-tester: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/movies_test.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/movies_test.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ entrypoint: @@ -29,9 +27,8 @@ services: spark-submit-manual-hive-sync: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_config.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_config.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ entrypoint: @@ -42,9 +39,8 @@ services: hive-tester-manual-hive-sync: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_test.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_test.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ entrypoint: @@ -55,9 +51,8 @@ services: spark-submit-manual-hive-sync-non-partition: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_no_partitions_config.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_no_partitions_config.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false 
volumes: - ./output/:/examples/output/ entrypoint: @@ -68,9 +63,8 @@ services: hive-tester-manual-hive-sync-no-partition: image: metorikku/metorikku environment: - - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.1-incubating/hudi-spark-bundle_2.11-0.5.1-incubating.jar --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_no_partitions_test.yaml + - SUBMIT_COMMAND=spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.5 --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.2-incubating/hudi-spark-bundle_2.11-0.5.2-incubating.jar --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_no_partitions_test.yaml - HIVE_METASTORE_URI=hive:9083 - - USE_BUILTIN_HIVE_METASTORE=false volumes: - ./output/:/examples/output/ entrypoint: diff --git a/scripts/docker-without-cache.sh b/scripts/docker-without-cache.sh index 1a16eea4a..d63d01a2e 100755 --- a/scripts/docker-without-cache.sh +++ b/scripts/docker-without-cache.sh @@ -2,6 +2,5 @@ set -e docker build -t metorikku/spark --build-arg SPARK_VERSION=$SPARK_VERSION --build-arg HADOOP_VERSION=$HADOOP_VERSION -f docker/spark/Dockerfile docker/spark -docker build -t metorikku/hive:1 --build-arg HIVE_VERSION=$HIVE1_VERSION --build-arg HUDI_HIVE1_VERSION=$HUDI_HIVE1_VERSION -f docker/hive1/Dockerfile docker/hive1 docker build -t metorikku/hive --build-arg HIVE_VERSION=$HIVE_VERSION --build-arg HUDI_VERSION=$HUDI_VERSION -f docker/hive/Dockerfile docker/hive docker build -t metorikku/metorikku -f docker/metorikku/Dockerfile . diff --git a/scripts/docker.sh b/scripts/docker.sh index 3cae0e94c..10ba32a9c 100755 --- a/scripts/docker.sh +++ b/scripts/docker.sh @@ -7,10 +7,9 @@ docker pull metorikku/spark:latest docker pull metorikku/metorikku:latest docker pull $(grep -ioP '(?<=^from)\s+\S+' docker/hive/Dockerfile) docker pull metorikku/hive:latest -docker pull $(grep -ioP '(?<=^from)\s+\S+' docker/hive1/Dockerfile) -docker pull metorikku/hive:1 + +(cd docker/spark/hive-shim-loader && mvn install) docker build -t metorikku/spark --cache-from metorikku/spark:latest --build-arg SPARK_VERSION=$SPARK_VERSION --build-arg HADOOP_VERSION=$HADOOP_VERSION -f docker/spark/Dockerfile docker/spark -docker build -t metorikku/hive:1 --cache-from metorikku/hive:1 --build-arg HIVE_VERSION=$HIVE1_VERSION --build-arg HUDI_HIVE1_VERSION=$HUDI_HIVE1_VERSION -f docker/hive1/Dockerfile docker/hive1 docker build -t metorikku/hive --cache-from metorikku/hive --build-arg HIVE_VERSION=$HIVE_VERSION --build-arg HUDI_VERSION=$HUDI_VERSION -f docker/hive/Dockerfile docker/hive docker build -t metorikku/metorikku -f docker/metorikku/Dockerfile . 
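The docker.sh change above builds the hive-shim-loader jar before the Spark image, so that the Dockerfile's `ADD hive-shim-loader/target/*.jar /spark/pre-jars/` step has something to copy and spark-defaults.conf can prepend `/spark/pre-jars/*` to the driver and executor classpath. A minimal local sketch of that flow, assuming the SPARK_VERSION and HADOOP_VERSION values from .travis.yml; the final `ls` check is only an illustration:

```bash
# Sketch only: build the shim-loader jar, build the Spark image the way scripts/docker.sh does,
# then list /spark/pre-jars/ to confirm the jar was baked into the image.
set -e
(cd docker/spark/hive-shim-loader && mvn install)   # produces target/hive-shim-loader-*.jar
docker build -t metorikku/spark \
  --build-arg SPARK_VERSION=2.4.5 \
  --build-arg HADOOP_VERSION=3.2.1 \
  -f docker/spark/Dockerfile docker/spark
docker run --rm --entrypoint ls metorikku/spark /spark/pre-jars/
```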
diff --git a/scripts/docker_publish.sh b/scripts/docker_publish.sh index 575306142..30fafc478 100755 --- a/scripts/docker_publish.sh +++ b/scripts/docker_publish.sh @@ -5,9 +5,7 @@ echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin docker tag metorikku/metorikku metorikku/metorikku:"$TRAVIS_TAG"_spark_"$SPARK_VERSION" docker tag metorikku/spark metorikku/spark:$SPARK_VERSION docker tag metorikku/hive metorikku/hive:$HIVE_VERSION -docker tag metorikku/hive:1 metorikku/hive:$HIVE1_VERSION docker push metorikku/spark docker push metorikku/metorikku docker push metorikku/hive -docker push metorikku/hive:1 diff --git a/scripts/docker_publish_dev.sh b/scripts/docker_publish_dev.sh new file mode 100755 index 000000000..f08edf5e6 --- /dev/null +++ b/scripts/docker_publish_dev.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +tag=$(([ ! "${TRAVIS_PULL_REQUEST}" = "false" ] && echo "${TRAVIS_PULL_REQUEST}") || echo "${TRAVIS_BRANCH}") + +if [[ -z $DOCKER_PASSWORD ]]; then + echo "Skipping pushing dev docker since no docker password was defined" +else + echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin + docker tag metorikku/metorikku metorikku/dev:metorikku_$tag + docker tag metorikku/spark metorikku/dev:spark_$tag + docker tag metorikku/hive metorikku/dev:hive_$tag + + docker push metorikku/dev:metorikku_$tag + docker push metorikku/dev:spark_$tag + docker push metorikku/dev:hive_$tag +fi + diff --git a/scripts/travis_build.sh b/scripts/travis_build.sh index 9ac1fd05d..350876057 100755 --- a/scripts/travis_build.sh +++ b/scripts/travis_build.sh @@ -37,12 +37,6 @@ travis_fold start "elasticsearch" travis_time_finish travis_fold end "elasticsearch" -travis_fold start "hive1" - travis_time_start - (cd e2e/hive1 && ./test.sh) - travis_time_finish -travis_fold end "hive1" - travis_fold start "hive" travis_time_start (cd e2e/hive && ./test.sh) @@ -61,3 +55,8 @@ travis_fold start "cdc" travis_time_finish travis_fold end "cdc" +travis_fold start "docker_publish_dev" + travis_time_start + ./scripts/docker_publish_dev.sh + travis_time_finish +travis_fold end "docker_publish_dev"
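The new scripts/docker_publish_dev.sh derives its tag from TRAVIS_PULL_REQUEST when the build is a pull request and from TRAVIS_BRANCH otherwise, then pushes metorikku/dev images under that tag. As an illustration only, with a hypothetical PR number 1234, the published dev images could be pulled like this:

```bash
# Illustration only: 1234 is a hypothetical PR number; non-PR builds use the branch name instead.
docker pull metorikku/dev:metorikku_1234
docker pull metorikku/dev:spark_1234
docker pull metorikku/dev:hive_1234
```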