From 5938d40747692dcee8aeaa6210bddd3424be1cbb Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Thu, 18 Aug 2022 11:54:14 -0400 Subject: [PATCH] Bump Spark, Hadoop, and Linux AMI defaults (#355) * bump default Spark, Hadoop, and Linux AMI * bump Ubuntu build env --- .github/workflows/flintrock.yaml | 4 ++-- .github/workflows/infra.yaml | 2 +- README.md | 10 +++++----- flintrock/config.yaml.template | 6 +++--- flintrock/flintrock.py | 10 +++++++++- tests/conftest.py | 4 ++-- tests/test_core.py | 4 ++-- tests/test_flintrock.py | 4 ++-- 8 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.github/workflows/flintrock.yaml b/.github/workflows/flintrock.yaml index da7ea179..33d19239 100644 --- a/.github/workflows/flintrock.yaml +++ b/.github/workflows/flintrock.yaml @@ -14,7 +14,7 @@ jobs: strategy: matrix: os: - - ubuntu-18.04 + - ubuntu-20.04 - macos-11 python-version: - "3.7" @@ -38,7 +38,7 @@ jobs: name: Flintrock Standalone - ${{ matrix.os }} path: dist/Flintrock-*-standalone-*.zip - uses: actions/upload-artifact@v3 - if: ${{ matrix.os == 'ubuntu-18.04' && matrix.python-version == '3.9' }} + if: ${{ matrix.os == 'ubuntu-20.04' && matrix.python-version == '3.9' }} with: name: Flintrock Wheel path: dist/Flintrock-*.whl diff --git a/.github/workflows/infra.yaml b/.github/workflows/infra.yaml index ec39ab51..19688ab3 100644 --- a/.github/workflows/infra.yaml +++ b/.github/workflows/infra.yaml @@ -10,7 +10,7 @@ on: jobs: terraform-lint: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Check Terraform Formatting diff --git a/README.md b/README.md index 712f6810..623dbb3c 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Here's a quick way to launch a cluster on EC2, assuming you already have an [AWS ```sh flintrock launch test-cluster \ --num-slaves 1 \ - --spark-version 3.2.1 \ + --spark-version 3.3.0 \ --ec2-key-name key_name \ --ec2-identity-file /path/to/key.pem \ --ec2-ami ami-0aeeebd8d2ab47354 \ @@ -87,12 +87,12 @@ these steps: better performance. 3. Make sure Flintrock is configured to use Hadoop/HDFS 2.7+. Earlier versions of Hadoop do not have solid implementations of `s3a://`. - Flintrock's default is Hadoop 3.3.2, so you don't need to do anything + Flintrock's default is Hadoop 3.3.4, so you don't need to do anything here if you're using a vanilla configuration. 4. Call Spark with the hadoop-aws package to enable `s3a://`. For example: ```sh - spark-submit --packages org.apache.hadoop:hadoop-aws:3.3.2 my-app.py - pyspark --packages org.apache.hadoop:hadoop-aws:3.3.2 + spark-submit --packages org.apache.hadoop:hadoop-aws:3.3.4 my-app.py + pyspark --packages org.apache.hadoop:hadoop-aws:3.3.4 ``` If you have issues using the package, consult the [hadoop-aws troubleshooting guide](http://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html) @@ -252,7 +252,7 @@ provider: ec2 services: spark: - version: 3.2.1 + version: 3.3.0 launch: num-slaves: 1 diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template index 5bfb291a..7db03c76 100644 --- a/flintrock/config.yaml.template +++ b/flintrock/config.yaml.template @@ -1,6 +1,6 @@ services: spark: - version: 3.2.1 + version: 3.3.0 # git-commit: latest # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350 # git-repository: # optional; defaults to https://github.com/apache/spark # optional; defaults to download from a dynamically selected Apache mirror @@ -12,7 +12,7 @@ services: # download-source: "s3://some-bucket/spark/{v}/" # executor-instances: 1 hdfs: - version: 3.3.2 + version: 3.3.4 # optional; defaults to download from a dynamically selected Apache mirror # - can be http, https, or s3 URL # - must contain a {v} template corresponding to the version @@ -30,7 +30,7 @@ providers: instance-type: m5.large region: us-east-1 # availability-zone: - ami: ami-0a3c14e1ddbe7f23c # Amazon Linux 2, us-east-1 + ami: ami-0cabc39acf991f4f1 # Amazon Linux 2, us-east-1 user: ec2-user # ami: ami-61bbf104 # CentOS 7, us-east-1 # user: centos diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index ebe957e1..9c0fe67e 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -192,6 +192,14 @@ def build_spark_download_url(ctx, param, value): spark_version = ctx.params['spark_version'] hadoop_version = ctx.params['hdfs_version'] hadoop_build_version = spark_hadoop_build_version(hadoop_version) + + # Starting in Spark 3.3.0, the build artifact naming scheme changed a bit. + # Instead of 'hadoop3.2', for example, that part now reads 'hadoop3'. + if spark_version: + spark_version_tuple = tuple(map(int, spark_version.split('.'))) + if spark_version_tuple >= (3, 3, 0): + hadoop_build_version = hadoop_build_version.split('.')[0] + if value.endswith('.gz') or value.endswith('.tgz'): logger.warning( "Spark download source appears to point to a file, not a directory. " @@ -279,7 +287,7 @@ def cli(cli_context, config, provider, debug): @click.option('--num-slaves', type=click.IntRange(min=1), required=True) @click.option('--java-version', type=click.IntRange(min=8), default=11) @click.option('--install-hdfs/--no-install-hdfs', default=False) -@click.option('--hdfs-version', default='3.3.2') +@click.option('--hdfs-version', default='3.3.4') @click.option('--hdfs-download-source', help=( "URL to download Hadoop from. If an S3 URL, Flintrock will use the " diff --git a/tests/conftest.py b/tests/conftest.py index 47e9e2e0..a105728c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,8 +10,8 @@ # External import pytest -HADOOP_VERSION = '3.3.2' -SPARK_VERSION = '3.2.1' +HADOOP_VERSION = '3.3.4' +SPARK_VERSION = '3.3.0' SPARK_GIT_COMMIT = 'de351e30a90dd988b133b3d00fa6218bfcaba8b8' # 3.1.2 JAVA_VERSION = '11' diff --git a/tests/test_core.py b/tests/test_core.py index 78d5f7aa..5379c62b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -16,8 +16,8 @@ @pytest.mark.parametrize( 'spark_version', [ (''), - ('3.2.1'), - ('0626b11147133b67b26a04b4819f61a33dd958d3'), + ('3.3.0'), + ('a28880f3b9c63d86368bcd6cbbaa6a9af7075409'), ]) def test_templates(dummy_cluster, spark_version): template_dir = os.path.join(FLINTROCK_ROOT_DIR, 'flintrock', 'templates') diff --git a/tests/test_flintrock.py b/tests/test_flintrock.py index b242af20..ff6b7c10 100644 --- a/tests/test_flintrock.py +++ b/tests/test_flintrock.py @@ -157,8 +157,8 @@ def test_get_latest_commit(): raises=Error, ) def test_validate_valid_download_source(): - validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=hadoop/common/hadoop-3.3.2/hadoop-3.3.2.tar.gz") - validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz") + validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz") + validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz") def test_validate_invalid_download_source():