From 93512a6be8c9d5446ad5a496ba7ccb3b5406d93b Mon Sep 17 00:00:00 2001
From: Kenneth Hoste
Date: Thu, 16 Nov 2023 12:11:19 +0100
Subject: [PATCH] {devel}[foss/2023a] Spark v3.5.0, Arrow v14.0.1, RapidJSON v1.1.0-20230928

---
Reviewer notes (free-form text after the three-dash line; not part of the
commit message):

This patch only adds files; it creates three new easyconfigs:

  * Arrow-14.0.1-gfbf-2023a.eb
      - CMakeMake build started from the 'cpp' subdirectory of the
        apache-arrow source tarball.
      - The PyArrow bindings are installed afterwards through a single
        postinstallcmds shell command ("python -m pip install --prefix
        %(installdir)s --no-build-isolation ." run in the 'python'
        subdirectory), with dataset and Parquet support switched on via
        PYARROW_WITH_DATASET=1 / PYARROW_WITH_PARQUET=1.
      - The sanity check looks for libarrow (static and shared),
        libarrow_python, and imports pyarrow, pyarrow.dataset and
        pyarrow.parquet.
      - NOTE(review): the sed on pyproject.toml removes the exact
        "numpy==<version>" pin; presumably so that the numpy provided by
        SciPy-bundle is accepted - confirm.

  * RapidJSON-1.1.0-20230928-GCCcore-12.3.0.eb
      - Built from commit f9d5341 rather than a tagged release (see the
        comment in the easyconfig); the downloaded '<commit>.tar.gz' is
        stored as 'v%(version)s.tar.gz'.
      - '-march=native' is stripped from CMakeLists.txt before configuring.

  * Spark-3.5.0-foss-2023a.eb
      - Tarball install of the 'bin-hadoop3' archive, with py4j 0.10.9.7
        installed as a PythonPackage extension.
      - The module sets SPARK_HOME to the installation directory and adds
        'python' and the site-packages directory to PYTHONPATH.
      - NOTE(review): the first source URL contains 'dist//' (double
        slash); this looks like a stray character - confirm whether it is
        intentional.

Dependency chain introduced here: Spark 3.5.0 depends on Arrow 14.0.1,
which in turn depends on RapidJSON 1.1.0-20230928.

 .../a/Arrow/Arrow-14.0.1-gfbf-2023a.eb        | 74 +++++++++++++++++++
 ...RapidJSON-1.1.0-20230928-GCCcore-12.3.0.eb | 31 ++++++++
 .../s/Spark/Spark-3.5.0-foss-2023a.eb         | 52 +++++++++++++
 3 files changed, 157 insertions(+)
 create mode 100644 easybuild/easyconfigs/a/Arrow/Arrow-14.0.1-gfbf-2023a.eb
 create mode 100644 easybuild/easyconfigs/r/RapidJSON/RapidJSON-1.1.0-20230928-GCCcore-12.3.0.eb
 create mode 100644 easybuild/easyconfigs/s/Spark/Spark-3.5.0-foss-2023a.eb

diff --git a/easybuild/easyconfigs/a/Arrow/Arrow-14.0.1-gfbf-2023a.eb b/easybuild/easyconfigs/a/Arrow/Arrow-14.0.1-gfbf-2023a.eb
new file mode 100644
index 00000000000..40327535b8b
--- /dev/null
+++ b/easybuild/easyconfigs/a/Arrow/Arrow-14.0.1-gfbf-2023a.eb
@@ -0,0 +1,74 @@
+easyblock = 'CMakeMake'
+
+name = 'Arrow'
+version = '14.0.1'
+
+homepage = 'https://arrow.apache.org'
+description = """Apache Arrow (incl. PyArrow Python bindings), a cross-language development platform
+ for in-memory data."""
+
+toolchain = {'name': 'gfbf', 'version': '2023a'}
+
+source_urls = ['https://archive.apache.org/dist/%(namelower)s/%(namelower)s-%(version)s']
+sources = ['apache-arrow-%(version)s.tar.gz']
+checksums = ['5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e']
+
+builddependencies = [
+    ('CMake', '3.26.3'),
+    ('Autotools', '20220317'),
+    ('flex', '2.6.4'),
+    ('Bison', '3.8.2'),
+    ('pkgconf', '1.9.5'),
+]
+
+# Arrow strongly prefers included jemalloc, so not including it as a dependency
+dependencies = [
+    ('Python', '3.11.3'),
+    ('SciPy-bundle', '2023.07'),  # for numpy
+    ('Boost', '1.82.0'),
+    ('lz4', '1.9.4'),
+    ('zlib', '1.2.13'),
+    ('bzip2', '1.0.8'),
+    ('zstd', '1.5.5'),
+    ('snappy', '1.1.10'),
+    ('RapidJSON', '1.1.0-20230928'),
+    ('RE2', '2023-08-01'),
+    ('utf8proc', '2.8.0'),
+]
+
+start_dir = 'cpp'
+
+# see https://arrow.apache.org/docs/developers/python.html
+configopts = "-DARROW_DATASET=on -DARROW_PYTHON=on -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=ON "
+configopts += "-DCMAKE_INSTALL_LIBDIR=lib -DPython3_ROOT_DIR=$EBROOTPYTHON "
+configopts += "-DARROW_WITH_ZLIB=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_ZSTD=ON -DARROW_WITH_LZ4=ON "
+configopts += "-DZSTD_ROOT=$EBROOTZSTD "
+
+# also install Python bindings
+local_install_pyarrow_cmds = "export PKG_CONFIG_PATH=%(installdir)s/lib/pkgconfig:$PKG_CONFIG_PATH && "
+local_install_pyarrow_cmds += "export Arrow_DIR=%(installdir)s && export ArrowDataset_DIR=%(installdir)s && "
+local_install_pyarrow_cmds += "export ArrowAcero_DIR=%(installdir)s && export Parquet_DIR=%(installdir)s && "
+local_install_pyarrow_cmds += "export PYTHONPATH=%(installdir)s/lib/python%(pyshortver)s/site-packages:$PYTHONPATH && "
+local_install_pyarrow_cmds += "cd %(builddir)s/*arrow-%(version)s/python && export XDG_CACHE_HOME=$TMPDIR && "
+local_install_pyarrow_cmds += "sed -i 's/numpy==[0-9.]*/numpy/g' pyproject.toml && "
+local_install_pyarrow_cmds += "Python3_ROOT_DIR=$EBROOTPYTHON "
+local_install_pyarrow_cmds += "PYARROW_CMAKE_OPTIONS='-DZSTD_LIB=$EBROOTZSTD/lib/libzstd.%s ' " % SHLIB_EXT
+local_install_pyarrow_cmds += "PYARROW_WITH_DATASET=1 PYARROW_WITH_PARQUET=1 "
+local_install_pyarrow_cmds += "python -m pip install --prefix %(installdir)s --no-build-isolation ."
+postinstallcmds = [local_install_pyarrow_cmds]
+
+modextrapaths = {'PYTHONPATH': 'lib/python%(pyshortver)s/site-packages'}
+
+sanity_check_paths = {
+    'files': ['lib/libarrow.a', 'lib/libarrow.%s' % SHLIB_EXT,
+              'lib/python%%(pyshortver)s/site-packages/pyarrow/libarrow_python.%s' % SHLIB_EXT],
+    'dirs': ['include/arrow', 'lib/cmake/Arrow', 'lib/pkgconfig', 'lib/python%(pyshortver)s/site-packages'],
+}
+
+sanity_check_commands = [
+    "python -c 'import pyarrow'",
+    "python -c 'import pyarrow.dataset'",
+    "python -c 'import pyarrow.parquet'",
+]
+
+moduleclass = 'data'
diff --git a/easybuild/easyconfigs/r/RapidJSON/RapidJSON-1.1.0-20230928-GCCcore-12.3.0.eb b/easybuild/easyconfigs/r/RapidJSON/RapidJSON-1.1.0-20230928-GCCcore-12.3.0.eb
new file mode 100644
index 00000000000..92555adb82a
--- /dev/null
+++ b/easybuild/easyconfigs/r/RapidJSON/RapidJSON-1.1.0-20230928-GCCcore-12.3.0.eb
@@ -0,0 +1,31 @@
+easyblock = 'CMakeMake'
+
+name = 'RapidJSON'
+# no new release since Aug'16 so using latest commit;
+# see also https://github.com/Tencent/rapidjson/issues/2202
+version = '1.1.0-20230928'
+local_commit = 'f9d5341'
+
+homepage = 'https://rapidjson.org'
+description = "A fast JSON parser/generator for C++ with both SAX/DOM style API"
+
+toolchain = {'name': 'GCCcore', 'version': '12.3.0'}
+
+source_urls = ['https://github.com/Tencent/%(namelower)s/archive/']
+sources = [{'download_filename': '%s.tar.gz' % local_commit, 'filename': 'v%(version)s.tar.gz'}]
+checksums = ['2b521dba5c22eaae6e6e7d4d304cb317e2cf8c687c70046b02792c02f78c127e']
+
+builddependencies = [
+    ('binutils', '2.40'),
+    ('CMake', '3.26.3'),
+]
+
+# strip out hardcoded use of -march=native, EasyBuild should be in control of this
+preconfigopts = "sed -i 's/-march=native//g' ../rapidjson-*/CMakeLists.txt && "
+
+sanity_check_paths = {
+    'files': ['lib/pkgconfig/%(name)s.pc'],
+    'dirs': ['include/%(namelower)s', 'lib/cmake', 'share'],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/s/Spark/Spark-3.5.0-foss-2023a.eb b/easybuild/easyconfigs/s/Spark/Spark-3.5.0-foss-2023a.eb
new file mode 100644
index 00000000000..21fc8a3448e
--- /dev/null
+++ b/easybuild/easyconfigs/s/Spark/Spark-3.5.0-foss-2023a.eb
@@ -0,0 +1,52 @@
+# Author: Denis Krišťák (INUITS)
+
+easyblock = 'Tarball'
+
+name = 'Spark'
+version = '3.5.0'
+homepage = 'https://spark.apache.org'
+description = """Spark is Hadoop MapReduce done in memory"""
+
+toolchain = {'name': 'foss', 'version': '2023a'}
+
+source_urls = [
+    'https://archive.apache.org/dist//%(namelower)s/%(namelower)s-%(version)s/',
+    'https://downloads.apache.org/%(namelower)s/%(namelower)s-%(version)s/'
+]
+sources = ['%(namelower)s-%(version)s-bin-hadoop3.tgz']
+checksums = ['ac064ec620129b4b9630e180e25ac2b71a8ccd4c6912bf5b5631943b742f0379']
+
+dependencies = [
+    ('Python', '3.11.3'),
+    ('Java', '11', '', SYSTEM),
+    ('Arrow', '14.0.1'),
+]
+
+exts_defaultclass = 'PythonPackage'
+exts_default_options = {
+    'source_urls': [PYPI_SOURCE],
+    'download_dep_fail': True,
+    'use_pip': True,
+}
+
+exts_list = [
+    ('py4j', '0.10.9.7', {
+        'checksums': ['0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb'],
+    }),
+]
+
+sanity_check_paths = {
+    'files': ['bin/pyspark', 'bin/spark-shell'],
+    'dirs': ['python']
+}
+
+sanity_check_commands = [
+    "pyspark -h",
+    "python -c 'import pyspark'",
+]
+
+modextrapaths = {'PYTHONPATH': ['python', 'lib/python%(pyshortver)s/site-packages']}
+
+modextravars = {'SPARK_HOME': '%(installdir)s'}
+
+moduleclass = 'devel'