From 5938d40747692dcee8aeaa6210bddd3424be1cbb Mon Sep 17 00:00:00 2001
From: Nicholas Chammas <nicholas.chammas@gmail.com>
Date: Thu, 18 Aug 2022 11:54:14 -0400
Subject: [PATCH] Bump Spark, Hadoop, and Linux AMI defaults (#355)

* bump default Spark, Hadoop, and Linux AMI

* bump Ubuntu build env
---
 .github/workflows/flintrock.yaml |  4 ++--
 .github/workflows/infra.yaml     |  2 +-
 README.md                        | 10 +++++-----
 flintrock/config.yaml.template   |  6 +++---
 flintrock/flintrock.py           | 10 +++++++++-
 tests/conftest.py                |  4 ++--
 tests/test_core.py               |  4 ++--
 tests/test_flintrock.py          |  4 ++--
 8 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/flintrock.yaml b/.github/workflows/flintrock.yaml
index da7ea179..33d19239 100644
--- a/.github/workflows/flintrock.yaml
+++ b/.github/workflows/flintrock.yaml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os:
-          - ubuntu-18.04
+          - ubuntu-20.04
           - macos-11
         python-version:
           - "3.7"
@@ -38,7 +38,7 @@ jobs:
           name: Flintrock Standalone - ${{ matrix.os }}
           path: dist/Flintrock-*-standalone-*.zip
       - uses: actions/upload-artifact@v3
-        if: ${{ matrix.os == 'ubuntu-18.04' && matrix.python-version == '3.9' }}
+        if: ${{ matrix.os == 'ubuntu-20.04' && matrix.python-version == '3.9' }}
         with:
           name: Flintrock Wheel
           path: dist/Flintrock-*.whl
diff --git a/.github/workflows/infra.yaml b/.github/workflows/infra.yaml
index ec39ab51..19688ab3 100644
--- a/.github/workflows/infra.yaml
+++ b/.github/workflows/infra.yaml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   terraform-lint:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     steps:
     - uses: actions/checkout@v2
     - name: Check Terraform Formatting
diff --git a/README.md b/README.md
index 712f6810..623dbb3c 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Here's a quick way to launch a cluster on EC2, assuming you already have an [AWS
 ```sh
 flintrock launch test-cluster \
     --num-slaves 1 \
-    --spark-version 3.2.1 \
+    --spark-version 3.3.0 \
     --ec2-key-name key_name \
     --ec2-identity-file /path/to/key.pem \
     --ec2-ami ami-0aeeebd8d2ab47354 \
@@ -87,12 +87,12 @@ these steps:
    better performance.
 3. Make sure Flintrock is configured to use Hadoop/HDFS 2.7+. Earlier
    versions of Hadoop do not have solid implementations of `s3a://`.
-   Flintrock's default is Hadoop 3.3.2, so you don't need to do anything
+   Flintrock's default is Hadoop 3.3.4, so you don't need to do anything
    here if you're using a vanilla configuration.
 4. Call Spark with the hadoop-aws package to enable `s3a://`. For example:
    ```sh
-   spark-submit --packages org.apache.hadoop:hadoop-aws:3.3.2 my-app.py
-   pyspark --packages org.apache.hadoop:hadoop-aws:3.3.2
+   spark-submit --packages org.apache.hadoop:hadoop-aws:3.3.4 my-app.py
+   pyspark --packages org.apache.hadoop:hadoop-aws:3.3.4
    ```
    If you have issues using the package, consult the [hadoop-aws troubleshooting
    guide](http://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html)
@@ -252,7 +252,7 @@ provider: ec2
 
 services:
   spark:
-    version: 3.2.1
+    version: 3.3.0
 
 launch:
   num-slaves: 1
diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template
index 5bfb291a..7db03c76 100644
--- a/flintrock/config.yaml.template
+++ b/flintrock/config.yaml.template
@@ -1,6 +1,6 @@
 services:
   spark:
-    version: 3.2.1
+    version: 3.3.0
     # git-commit: latest  # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
     # git-repository:  # optional; defaults to https://github.com/apache/spark
     # optional; defaults to download from a dynamically selected Apache mirror
@@ -12,7 +12,7 @@ services:
     # download-source: "s3://some-bucket/spark/{v}/"
     # executor-instances: 1
   hdfs:
-    version: 3.3.2
+    version: 3.3.4
     # optional; defaults to download from a dynamically selected Apache mirror
     #   - can be http, https, or s3 URL
     #   - must contain a {v} template corresponding to the version
@@ -30,7 +30,7 @@ providers:
     instance-type: m5.large
     region: us-east-1
     # availability-zone: <name>
-    ami: ami-0a3c14e1ddbe7f23c  # Amazon Linux 2, us-east-1
+    ami: ami-0cabc39acf991f4f1  # Amazon Linux 2, us-east-1
     user: ec2-user
     # ami: ami-61bbf104  # CentOS 7, us-east-1
     # user: centos
diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py
index ebe957e1..9c0fe67e 100644
--- a/flintrock/flintrock.py
+++ b/flintrock/flintrock.py
@@ -192,6 +192,14 @@ def build_spark_download_url(ctx, param, value):
     spark_version = ctx.params['spark_version']
     hadoop_version = ctx.params['hdfs_version']
     hadoop_build_version = spark_hadoop_build_version(hadoop_version)
+
+    # Starting in Spark 3.3.0, release artifact names include only the Hadoop
+    # major version: e.g. 'hadoop3' where earlier releases used 'hadoop3.2'.
+    if spark_version:
+        spark_version_tuple = tuple(map(int, spark_version.split('.')))
+        if spark_version_tuple >= (3, 3, 0):
+            hadoop_build_version = hadoop_build_version.split('.')[0]
+
     if value.endswith('.gz') or value.endswith('.tgz'):
         logger.warning(
             "Spark download source appears to point to a file, not a directory. "
@@ -279,7 +287,7 @@ def cli(cli_context, config, provider, debug):
 @click.option('--num-slaves', type=click.IntRange(min=1), required=True)
 @click.option('--java-version', type=click.IntRange(min=8), default=11)
 @click.option('--install-hdfs/--no-install-hdfs', default=False)
-@click.option('--hdfs-version', default='3.3.2')
+@click.option('--hdfs-version', default='3.3.4')
 @click.option('--hdfs-download-source',
               help=(
                   "URL to download Hadoop from. If an S3 URL, Flintrock will use the "
diff --git a/tests/conftest.py b/tests/conftest.py
index 47e9e2e0..a105728c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,8 +10,8 @@
 # External
 import pytest
 
-HADOOP_VERSION = '3.3.2'
-SPARK_VERSION = '3.2.1'
+HADOOP_VERSION = '3.3.4'
+SPARK_VERSION = '3.3.0'
 SPARK_GIT_COMMIT = 'de351e30a90dd988b133b3d00fa6218bfcaba8b8'  # 3.1.2
 JAVA_VERSION = '11'
 
diff --git a/tests/test_core.py b/tests/test_core.py
index 78d5f7aa..5379c62b 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -16,8 +16,8 @@
 @pytest.mark.parametrize(
     'spark_version', [
         (''),
-        ('3.2.1'),
-        ('0626b11147133b67b26a04b4819f61a33dd958d3'),
+        ('3.3.0'),
+        ('a28880f3b9c63d86368bcd6cbbaa6a9af7075409'),
     ])
 def test_templates(dummy_cluster, spark_version):
     template_dir = os.path.join(FLINTROCK_ROOT_DIR, 'flintrock', 'templates')
diff --git a/tests/test_flintrock.py b/tests/test_flintrock.py
index b242af20..ff6b7c10 100644
--- a/tests/test_flintrock.py
+++ b/tests/test_flintrock.py
@@ -157,8 +157,8 @@ def test_get_latest_commit():
     raises=Error,
 )
 def test_validate_valid_download_source():
-    validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=hadoop/common/hadoop-3.3.2/hadoop-3.3.2.tar.gz")
-    validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz")
+    validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz")
+    validate_download_source("https://www.apache.org/dyn/closer.lua?action=download&filename=spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz")
 
 
 def test_validate_invalid_download_source():