diff --git a/datafusion/README.md b/datafusion/README.md index b9c5b5d38..cc9157468 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -20,7 +20,7 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe 1. `cd ClickBench/datafusion` 1. `vi benchmark.sh` and modify following line to target Datafusion version ``` - git checkout 40.0.0 + git checkout 43.0.0 ``` 1. `bash benchmark.sh` diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index 38e1bbb03..759f6590c 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -14,7 +14,7 @@ sudo yum install gcc -y # Install DataFusion main branch git clone https://github.com/apache/arrow-datafusion.git cd arrow-datafusion/datafusion-cli -git checkout 40.0.0 +git checkout 43.0.0 CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release export PATH="`pwd`/target/release:$PATH" cd ../.. diff --git a/datafusion/create_partitioned.sql b/datafusion/create_partitioned.sql index 17a8b5b4d..3b3330c1b 100644 --- a/datafusion/create_partitioned.sql +++ b/datafusion/create_partitioned.sql @@ -1,3 +1,4 @@ CREATE EXTERNAL TABLE hits STORED AS PARQUET -LOCATION 'partitioned'; +LOCATION 'partitioned' +OPTIONS ('binary_as_string' 'true'); diff --git a/datafusion/create_single.sql b/datafusion/create_single.sql index eedd4c038..19e623b07 100644 --- a/datafusion/create_single.sql +++ b/datafusion/create_single.sql @@ -1,3 +1,4 @@ CREATE EXTERNAL TABLE hits STORED AS PARQUET -LOCATION 'hits.parquet'; +LOCATION 'hits.parquet' +OPTIONS ('binary_as_string' 'true'); diff --git a/datafusion/results/partitioned.json b/datafusion/results/partitioned.json index 52661cb82..05484f901 100644 --- a/datafusion/results/partitioned.json +++ b/datafusion/results/partitioned.json @@ -1,9 +1,9 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2024-07-27", + "date": "2024-11-15", "machine": "c6a.4xlarge, 500gb gp2", "cluster_size": 1, - "comment": "v40.0.0 (4cae813)", + "comment": "v43.0.0 (88f58bf)", "tags": ["Rust", "column-oriented", "embedded", "stateless"], @@ -11,48 +11,48 @@ "data_size": 14779976446, "result": [ -[0.043, 0.018, 0.016], -[0.087, 0.031, 0.028], -[0.173, 0.072, 0.073], -[0.356, 0.075, 0.081], -[1.201, 0.784, 0.796], -[0.960, 0.831, 0.837], -[0.057, 0.026, 0.026], -[0.062, 0.029, 0.031], -[1.408, 1.314, 1.315], -[1.302, 1.025, 1.038], -[0.483, 0.280, 0.269], -[0.705, 0.306, 0.296], -[1.137, 0.931, 0.939], -[3.183, 2.245, 2.252], -[1.499, 1.415, 1.429], -[1.011, 0.901, 0.897], -[3.230, 2.670, 2.655], -[3.136, 2.560, 2.539], -[6.849, 5.608, 5.827], -[0.299, 0.075, 0.068], -[10.086, 1.544, 1.617], -[11.238, 1.821, 1.835], -[21.957, 4.104, 4.132], -[55.510, 10.615, 10.548], -[2.678, 0.503, 0.500], -[0.765, 0.412, 0.413], -[2.649, 0.574, 0.559], -[9.652, 2.177, 2.203], -[8.528, 5.051, 5.019], -[0.499, 0.421, 0.439], -[2.389, 1.018, 1.028], -[6.060, 1.520, 1.513], -[8.820, 8.081, 7.826], -[10.604, 4.851, 5.088], -[10.567, 4.971, 4.880], -[1.737, 1.659, 1.649], -[0.363, 0.247, 0.231], -[0.156, 0.093, 0.092], -[0.198, 0.125, 0.124], -[0.902, 0.701, 0.683], -[0.144, 0.042, 0.041], -[0.130, 0.037, 0.040], -[0.131, 0.055, 0.050] +[0.051, 0.019, 0.019], +[0.091, 0.035, 0.035], +[0.189, 0.085, 0.088], +[0.383, 0.081, 0.077], +[1.071, 0.811, 0.803], +[0.944, 0.801, 0.805], +[0.078, 0.030, 0.030], +[0.103, 0.037, 0.037], +[1.313, 1.205, 1.201], +[1.357, 1.034, 1.025], +[0.511, 0.255, 0.253], +[0.634, 0.295, 0.301], +[1.016, 0.856, 0.879], +[2.615, 1.421, 1.374], +[1.131, 0.931, 0.918], +[1.051, 0.952, 0.958], +[2.672, 2.031, 2.066], +[2.592, 1.879, 1.887], +[5.549, 4.226, 4.335], +[0.254, 0.078, 0.075], +[9.967, 1.098, 1.092], +[11.248, 1.329, 1.327], +[21.868, 2.820, 2.818], +[55.458, 10.286, 10.609], +[2.678, 0.488, 0.486], +[0.802, 0.352, 0.354], +[2.672, 0.507, 0.498], +[9.614, 1.513, 1.507], +[8.368, 3.394, 3.521], +[0.535, 0.418, 0.439], +[2.362, 0.854, 0.861], +[5.957, 0.910, 0.914], +[4.780, 3.806, 3.871], +[10.168, 3.654, 3.586], +[10.090, 3.645, 3.546], +[1.775, 1.644, 1.660], +[0.364, 0.199, 0.183], +[0.183, 0.078, 0.075], +[0.290, 0.128, 0.123], +[0.619, 0.376, 0.376], +[0.148, 0.053, 0.044], +[0.142, 0.042, 0.042], +[0.155, 0.065, 0.053] ] } diff --git a/datafusion/results/single.json b/datafusion/results/single.json index 8bb98a74a..5112839cb 100644 --- a/datafusion/results/single.json +++ b/datafusion/results/single.json @@ -1,9 +1,9 @@ { "system": "DataFusion (Parquet, single)", - "date": "2024-07-27", + "date": "2024-11-15", "machine": "c6a.4xlarge, 500gb gp2", "cluster_size": 1, - "comment": "v40.0.0 (4cae813)", + "comment": "v43.0.0 (88f58bf)", "tags": ["Rust", "column-oriented", "embedded", "stateless"], @@ -11,48 +11,48 @@ "data_size": 14779976446, "result": [ -[0.076, 0.051, 0.055], -[0.113, 0.066, 0.066], -[0.196, 0.115, 0.105], -[0.340, 0.114, 0.115], -[1.074, 0.862, 0.858], -[0.995, 0.874, 0.909], -[0.088, 0.076, 0.065], -[0.102, 0.078, 0.068], -[1.442, 1.349, 1.368], -[1.260, 1.083, 1.064], -[0.451, 0.306, 0.304], -[0.597, 0.337, 0.335], -[1.088, 0.986, 0.974], -[3.085, 2.261, 2.268], -[1.522, 1.428, 1.429], -[1.068, 0.957, 0.960], -[3.217, 2.702, 2.754], -[3.149, 2.621, 2.564], -[6.978, 5.679, 5.865], -[0.338, 0.107, 0.113], -[9.885, 1.466, 1.474], -[11.225, 1.794, 1.791], -[22.035, 3.906, 3.912], -[55.923, 10.899, 10.975], -[2.560, 0.579, 0.575], -[0.754, 0.509, 0.506], -[2.517, 0.674, 0.651], -[9.574, 2.220, 2.216], -[9.070, 4.926, 4.940], -[0.536, 0.473, 0.481], -[2.288, 1.090, 1.101], -[5.823, 1.543, 1.528], -[8.637, 8.328, 7.848], -[10.477, 4.972, 5.022], -[10.435, 4.910, 5.020], -[1.827, 1.685, 1.724], -[0.389, 0.275, 0.270], -[0.201, 0.175, 0.160], -[0.230, 0.173, 0.172], -[0.887, 0.749, 0.755], -[0.172, 0.085, 0.076], -[0.165, 0.075, 0.073], -[0.160, 0.090, 0.100] +[0.093, 0.055, 0.056], +[0.138, 0.070, 0.070], +[0.206, 0.120, 0.117], +[0.346, 0.118, 0.114], +[0.979, 0.867, 0.871], +[1.030, 0.902, 0.904], +[0.125, 0.064, 0.077], +[0.143, 0.083, 0.078], +[1.304, 1.169, 1.240], +[1.533, 1.104, 1.100], +[0.475, 0.272, 0.278], +[0.562, 0.309, 0.315], +[1.165, 0.931, 0.965], +[2.643, 1.402, 1.490], +[1.143, 0.997, 0.983], +[1.106, 0.991, 0.993], +[2.727, 2.161, 2.098], +[2.578, 1.954, 1.947], +[5.530, 4.311, 4.253], +[0.319, 0.105, 0.107], +[9.732, 1.155, 1.149], +[11.337, 1.468, 1.407], +[22.055, 3.678, 3.663], +[55.942, 10.017, 10.014], +[2.561, 0.557, 0.577], +[0.809, 0.510, 0.519], +[2.579, 0.634, 0.620], +[9.630, 1.618, 1.655], +[8.645, 3.565, 3.699], +[0.584, 0.493, 0.485], +[2.285, 0.978, 0.991], +[5.690, 1.046, 1.006], +[4.468, 3.833, 3.885], +[10.123, 3.663, 3.654], +[10.114, 3.672, 3.685], +[1.743, 1.597, 1.659], +[0.389, 0.242, 0.230], +[0.266, 0.155, 0.170], +[0.369, 0.161, 0.180], +[0.659, 0.446, 0.416], +[0.190, 0.084, 0.085], +[0.177, 0.078, 0.079], +[0.164, 0.103, 0.088] ] } diff --git a/datafusion/run.sh b/datafusion/run.sh index a6cc85626..3bda3f86f 100755 --- a/datafusion/run.sh +++ b/datafusion/run.sh @@ -31,7 +31,7 @@ cat queries.sql | while read query; do # 2. each query contains a "Query took xxx seconds", we just grep these 2 lines # 3. use sed to take the second line # 4. use awk to take the number we want - RES=`datafusion-cli -f $CREATE_SQL_FILE /tmp/query.sql 2>&1 | grep "Elapsed" |sed -n 2p | awk '{ print $2 }' + RES=`datafusion-cli -f $CREATE_SQL_FILE /tmp/query.sql 2>&1 | grep "Elapsed" |sed -n 2p | awk '{ print $2 }'` [[ $RES != "" ]] && \ echo -n "$RES" || \ echo -n "null"