Skip to content

Commit

Permalink
Merge pull request easybuilders#19243 from boegel/20231116121118_new_…
Browse files Browse the repository at this point in the history
…pr_Spark350

{devel}[foss/2023a] Spark v3.5.0, Arrow v14.0.1, RapidJSON v1.1.0-20230928
  • Loading branch information
smoors authored Nov 17, 2023
2 parents 42930b7 + 93512a6 commit 5c7aa75
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 0 deletions.
74 changes: 74 additions & 0 deletions easybuild/easyconfigs/a/Arrow/Arrow-14.0.1-gfbf-2023a.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# EasyBuild easyconfig for Apache Arrow (C++ libraries built with CMake from the 'cpp' subdirectory)
easyblock = 'CMakeMake'

name = 'Arrow'
version = '14.0.1'

homepage = 'https://arrow.apache.org'
description = """Apache Arrow (incl. PyArrow Python bindings), a cross-language development platform
for in-memory data."""

toolchain = {'name': 'gfbf', 'version': '2023a'}

# release tarball (apache-arrow-<version>.tar.gz) is fetched from the Apache archive
source_urls = ['https://archive.apache.org/dist/%(namelower)s/%(namelower)s-%(version)s']
sources = ['apache-arrow-%(version)s.tar.gz']
checksums = ['5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e']

# tools that are only needed while building
builddependencies = [
    ('CMake', '3.26.3'),
    ('Autotools', '20220317'),
    ('flex', '2.6.4'),
    ('Bison', '3.8.2'),
    ('pkgconf', '1.9.5'),
]

# Arrow strongly prefers included jemalloc, so not including it as a dependency
dependencies = [
    ('Python', '3.11.3'),
    ('SciPy-bundle', '2023.07'),  # for numpy
    ('Boost', '1.82.0'),
    ('lz4', '1.9.4'),
    ('zlib', '1.2.13'),
    ('bzip2', '1.0.8'),
    ('zstd', '1.5.5'),
    ('snappy', '1.1.10'),
    ('RapidJSON', '1.1.0-20230928'),
    ('RE2', '2023-08-01'),
    ('utf8proc', '2.8.0'),
]

# the CMake project lives in the 'cpp' subdirectory of the unpacked sources
start_dir = 'cpp'

# see https://arrow.apache.org/docs/developers/python.html
# (one CMake flag per entry; order and spelling are kept exactly as passed to CMake)
local_cmake_flags = [
    '-DARROW_DATASET=on',
    '-DARROW_PYTHON=on',
    '-DARROW_PARQUET=ON',
    '-DARROW_WITH_SNAPPY=ON',
    '-DCMAKE_INSTALL_LIBDIR=lib',
    '-DPython3_ROOT_DIR=$EBROOTPYTHON',
    '-DARROW_WITH_ZLIB=ON',
    '-DARROW_WITH_BZ2=ON',
    '-DARROW_WITH_ZSTD=ON',
    '-DARROW_WITH_LZ4=ON',
    '-DZSTD_ROOT=$EBROOTZSTD',
]
# single-space separated, with a trailing space so further options can be appended safely
configopts = ' '.join(local_cmake_flags) + ' '

# also install Python bindings
# The post-install command is a single shell line: a series of preparation steps chained
# with '&&', followed by the pip invocation that builds PyArrow from the 'python' subdirectory.
local_pyarrow_prep_steps = [
    # point pkg-config and the <Component>_DIR variables at the installation prefix
    "export PKG_CONFIG_PATH=%(installdir)s/lib/pkgconfig:$PKG_CONFIG_PATH",
    "export Arrow_DIR=%(installdir)s",
    "export ArrowDataset_DIR=%(installdir)s",
    "export ArrowAcero_DIR=%(installdir)s",
    "export Parquet_DIR=%(installdir)s",
    "export PYTHONPATH=%(installdir)s/lib/python%(pyshortver)s/site-packages:$PYTHONPATH",
    "cd %(builddir)s/*arrow-%(version)s/python",
    # redirect the XDG cache directory to $TMPDIR
    "export XDG_CACHE_HOME=$TMPDIR",
    # replace the exact 'numpy==<version>' pin in pyproject.toml by a plain 'numpy' requirement
    "sed -i 's/numpy==[0-9.]*/numpy/g' pyproject.toml",
]

# environment assignments that prefix the pip command, followed by the pip command itself
# NOTE(review): PYARROW_CMAKE_OPTIONS is wrapped in single quotes, so the shell will not expand
# $EBROOTZSTD inside it; kept exactly as-is here - confirm whether that is intended.
local_pyarrow_pip_cmd = ' '.join([
    "Python3_ROOT_DIR=$EBROOTPYTHON",
    "PYARROW_CMAKE_OPTIONS='-DZSTD_LIB=$EBROOTZSTD/lib/libzstd." + SHLIB_EXT + " '",
    "PYARROW_WITH_DATASET=1",
    "PYARROW_WITH_PARQUET=1",
    "python -m pip install --prefix %(installdir)s --no-build-isolation .",
])

local_install_pyarrow_cmds = ' && '.join(local_pyarrow_prep_steps + [local_pyarrow_pip_cmd])
postinstallcmds = [local_install_pyarrow_cmds]

# location (relative to the installation prefix) where the Python bindings get installed
local_site_packages = 'lib/python%(pyshortver)s/site-packages'

# add that location to $PYTHONPATH in the generated module
modextrapaths = {'PYTHONPATH': local_site_packages}

# verify the static + shared Arrow library, the PyArrow shared library, and key directories
sanity_check_paths = {
    'files': [
        'lib/libarrow.a',
        'lib/libarrow.' + SHLIB_EXT,
        local_site_packages + '/pyarrow/libarrow_python.' + SHLIB_EXT,
    ],
    'dirs': [
        'include/arrow',
        'lib/cmake/Arrow',
        'lib/pkgconfig',
        local_site_packages,
    ],
}

# verify that pyarrow and the dataset/parquet submodules can be imported
sanity_check_commands = [
    "python -c 'import %s'" % local_pymod
    for local_pymod in ('pyarrow', 'pyarrow.dataset', 'pyarrow.parquet')
]

moduleclass = 'data'
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# EasyBuild easyconfig for RapidJSON, built with CMake from a specific upstream commit
easyblock = 'CMakeMake'

name = 'RapidJSON'
# no new release since Aug'16 so using latest commit;
# see also https://github.com/Tencent/rapidjson/issues/2202
version = '1.1.0-20230928'
local_commit = 'f9d5341'

homepage = 'https://rapidjson.org'
description = "A fast JSON parser/generator for C++ with both SAX/DOM style API"

toolchain = {'name': 'GCCcore', 'version': '12.3.0'}

# the archive is downloaded under the commit ID, but stored locally under a versioned filename
source_urls = ['https://github.com/Tencent/%(namelower)s/archive/']
sources = [{
    'download_filename': local_commit + '.tar.gz',
    'filename': 'v%(version)s.tar.gz',
}]
checksums = ['2b521dba5c22eaae6e6e7d4d304cb317e2cf8c687c70046b02792c02f78c127e']

builddependencies = [
    ('binutils', '2.40'),
    ('CMake', '3.26.3'),
]

# strip out hardcoded use of -march=native, EasyBuild should be in control of this
# (the trailing '&&' chains this sed command in front of the configure command)
preconfigopts = "sed -i 's/-march=native//g' ../rapidjson-*/CMakeLists.txt && "

# check for the pkg-config file plus the headers, CMake and share directories
sanity_check_paths = {
    'files': ['lib/pkgconfig/%(name)s.pc'],
    'dirs': [
        'include/%(namelower)s',
        'lib/cmake',
        'share',
    ],
}

moduleclass = 'lib'
52 changes: 52 additions & 0 deletions easybuild/easyconfigs/s/Spark/Spark-3.5.0-foss-2023a.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Author: Denis Krišťák (INUITS)

# EasyBuild easyconfig for Spark: unpacks the '-bin-hadoop3' tarball as-is (Tarball easyblock)
easyblock = 'Tarball'

name = 'Spark'
version = '3.5.0'
homepage = 'https://spark.apache.org'
description = """Spark is Hadoop MapReduce done in memory"""

toolchain = {'name': 'foss', 'version': '2023a'}

# two candidate download locations: the Apache archive and the Apache downloads site;
# the archive URL uses a single '/' after 'dist' (a stray double slash was removed)
source_urls = [
    'https://archive.apache.org/dist/%(namelower)s/%(namelower)s-%(version)s/',
    'https://downloads.apache.org/%(namelower)s/%(namelower)s-%(version)s/',
]
sources = ['%(namelower)s-%(version)s-bin-hadoop3.tgz']
checksums = ['ac064ec620129b4b9630e180e25ac2b71a8ccd4c6912bf5b5631943b742f0379']

# runtime dependencies; the Java entry carries an empty version suffix and the SYSTEM constant
dependencies = [
    ('Python', '3.11.3'),
    ('Java', '11', '', SYSTEM),
    ('Arrow', '14.0.1'),
]

# extensions are installed as Python packages, with these options applied to each of them
exts_defaultclass = 'PythonPackage'
exts_default_options = dict(
    source_urls=[PYPI_SOURCE],
    download_dep_fail=True,
    use_pip=True,
)

local_py4j_checksum = '0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb'

exts_list = [
    ('py4j', '0.10.9.7', {'checksums': [local_py4j_checksum]}),
]

# check that the launcher scripts and the bundled 'python' directory are present
sanity_check_paths = {
    'files': ['bin/%s' % local_launcher for local_launcher in ('pyspark', 'spark-shell')],
    'dirs': ['python'],
}

# check that the pyspark launcher runs and that the pyspark module can be imported
sanity_check_commands = [
    "pyspark -h",
    "python -c 'import pyspark'",
]

# both the bundled 'python' directory and the site-packages directory go on $PYTHONPATH
modextrapaths = {
    'PYTHONPATH': [
        'python',
        'lib/python%(pyshortver)s/site-packages',
    ],
}

# $SPARK_HOME is set to the installation prefix in the generated module
modextravars = {'SPARK_HOME': '%(installdir)s'}

moduleclass = 'devel'

0 comments on commit 5c7aa75

Please sign in to comment.