From fc7c8f641747aa6e4ad696acfe14d6916cf7e4c6 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 26 Dec 2024 17:19:50 +0000 Subject: [PATCH 1/4] test(arrow-ord): remove duplicate logic and make it easier to add tests --- arrow-ord/src/sort.rs | 610 +++++++++++++++++++++--------------------- 1 file changed, 307 insertions(+), 303 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 60fc4a918525..7c22e7704c27 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -801,19 +801,154 @@ mod tests { use half::f16; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; + use std::ops::Deref; + + mod build_arrays_helper { + use crate::sort::tests::get_same_lists_length; + use arrow_array::{ + ArrayRef, ArrowPrimitiveType, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, + FixedSizeListArray, GenericBinaryArray, GenericListArray, LargeStringArray, + OffsetSizeTrait, PrimitiveArray, StringArray, StringViewArray, + }; + use arrow_buffer::i256; + use std::sync::Arc; + + pub fn decimal128_array(data: Vec>) -> Vec { + vec![Arc::new( + data.into_iter() + .collect::() + .with_precision_and_scale(23, 6) + .unwrap(), + )] + } - fn create_decimal128_array(data: Vec>) -> Decimal128Array { - data.into_iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap() + pub fn decimal256_array(data: Vec>) -> Vec { + vec![Arc::new( + data.into_iter() + .collect::() + .with_precision_and_scale(53, 6) + .unwrap(), + )] + } + + pub fn primitive_arrays(data: Vec>) -> Vec + where + T: ArrowPrimitiveType, + PrimitiveArray: From>>, + { + vec![Arc::new(PrimitiveArray::::from(data)) as ArrayRef] + } + + pub fn string_arrays(data: Vec>) -> Vec { + vec![ + Arc::new(StringArray::from(data.clone())) as ArrayRef, + Arc::new(LargeStringArray::from(data.clone())) as ArrayRef, + Arc::new(StringViewArray::from(data)) as ArrayRef, + ] + } + + pub fn binary_arrays(data: Vec>>) -> Vec { + // Generic size binary array + fn generic_binary_array( + data: &[Option>], + ) -> Arc> { + Arc::new(GenericBinaryArray::::from_opt_vec( + data.iter() + .map(|binary| binary.as_ref().map(Vec::as_slice)) + .collect(), + )) + } + + let mut arrays = vec![ + generic_binary_array::(&data) as ArrayRef, + generic_binary_array::(&data) as ArrayRef, + ]; + + if let Some(first_length) = get_same_lists_length(&data) { + arrays.push(Arc::new( + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + data.iter().cloned(), + first_length as i32, + ) + .unwrap(), + )); + } + + arrays + } + + pub fn primitive_list_arrays(data: Vec>>>) -> Vec + where + T: ArrowPrimitiveType, + PrimitiveArray: From>>, + { + let mut arrays = vec![ + Arc::new(GenericListArray::::from_iter_primitive::( + data.clone(), + )) as ArrayRef, + Arc::new(GenericListArray::::from_iter_primitive::( + data.clone(), + )), + ]; + + if let Some(first_length) = get_same_lists_length(&data) { + arrays.push(Arc::new( + FixedSizeListArray::from_iter_primitive::(data, first_length as i32), + )); + } + + arrays + } } - fn create_decimal256_array(data: Vec>) -> Decimal256Array { - data.into_iter() - .collect::() - .with_precision_and_scale(53, 6) - .unwrap() + /// Return Some(length) if all lists have the same length, None otherwise + fn get_same_lists_length(data: &[Option>]) -> Option { + let first_length = data + .iter() + .find_map(|x| x.as_ref().map(|x| x.len())) + .unwrap_or(0); + let first_non_match_length = data + .iter() + .map(|x| x.as_ref().map(|x| x.len()).unwrap_or(first_length)) + .any(|x| x != first_length); + + if first_non_match_length { + None + } else { + Some(first_length) + } + } + + /// Test sorting arrays + /// + /// the `into_lists_fn` function is used to convert the data into the list variants wanted to be tested, (e.g. ListArray, LargeListArray, FixedSizeListArray or any other list variant) + /// The ordering of the returned vector must be consistent as it will be called with the data and the expected data + fn test_sort_arrays< + ListItemType: Clone, + IntoListFn: Fn(Vec>) -> Vec, + >( + into_lists_fn: IntoListFn, + data: Vec>, + options: Option, + limit: Option, + expected_data: Vec>, + ) { + let input = into_lists_fn(data); + let expected = into_lists_fn(expected_data); + + assert_eq!( + input.len(), + expected.len(), + "The input and expected data must have the same number of array variants" + ); + + for (input, expected) in input.iter().zip(expected.iter()) { + let sorted = match limit { + Some(_) => sort_limit(input, options, limit).unwrap(), + _ => sort(input, options).unwrap(), + }; + assert_eq!(sorted.deref(), expected.deref()); + } } fn test_sort_to_indices_decimal128_array( @@ -822,9 +957,9 @@ mod tests { limit: Option, expected_data: Vec, ) { - let output = create_decimal128_array(data); + let output = build_arrays_helper::decimal128_array(data)[0].clone(); let expected = UInt32Array::from(expected_data); - let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&output, options, limit).unwrap(); assert_eq!(output, expected) } @@ -834,42 +969,12 @@ mod tests { limit: Option, expected_data: Vec, ) { - let output = create_decimal256_array(data); + let output = build_arrays_helper::decimal256_array(data)[0].clone(); let expected = UInt32Array::from(expected_data); - let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&output, options, limit).unwrap(); assert_eq!(output, expected) } - fn test_sort_decimal128_array( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec>, - ) { - let output = create_decimal128_array(data); - let expected = Arc::new(create_decimal128_array(expected_data)) as ArrayRef; - let output = match limit { - Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected) - } - - fn test_sort_decimal256_array( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec>, - ) { - let output = create_decimal256_array(data); - let expected = Arc::new(create_decimal256_array(expected_data)) as ArrayRef; - let output = match limit { - Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected) - } - fn test_sort_to_indices_boolean_arrays( data: Vec>, options: Option, @@ -897,24 +1002,6 @@ mod tests { assert_eq!(output, expected) } - fn test_sort_primitive_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec>, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let output = PrimitiveArray::::from(data); - let expected = Arc::new(PrimitiveArray::::from(expected_data)) as ArrayRef; - let output = match limit { - Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected) - } - fn test_sort_to_indices_string_arrays( data: Vec>, options: Option, @@ -927,38 +1014,6 @@ mod tests { assert_eq!(output, expected) } - /// Tests both Utf8 and LargeUtf8 - fn test_sort_string_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec>, - ) { - let output = StringArray::from(data.clone()); - let expected = Arc::new(StringArray::from(expected_data.clone())) as ArrayRef; - let output = match limit { - Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected); - - let output = LargeStringArray::from(data.clone()); - let expected = Arc::new(LargeStringArray::from(expected_data.clone())) as ArrayRef; - let output = match limit { - Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected); - - let output = StringViewArray::from(data); - let expected = Arc::new(StringViewArray::from(expected_data)) as ArrayRef; - let output = match limit { - Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected); - } - fn test_sort_string_dict_arrays( data: Vec>, options: Option, @@ -1048,59 +1103,6 @@ mod tests { assert_eq!(sorted_values, expected) } - fn test_sort_list_arrays( - data: Vec>>>, - options: Option, - limit: Option, - expected_data: Vec>>>, - fixed_length: Option, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - // for FixedSizedList - if let Some(length) = fixed_length { - let input = Arc::new(FixedSizeListArray::from_iter_primitive::( - data.clone(), - length, - )); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - _ => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( - expected_data.clone(), - length, - )) as ArrayRef; - - assert_eq!(&sorted, &expected); - } - - // for List - let input = Arc::new(ListArray::from_iter_primitive::(data.clone())); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - _ => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = Arc::new(ListArray::from_iter_primitive::( - expected_data.clone(), - )) as ArrayRef; - - assert_eq!(&sorted, &expected); - - // for LargeList - let input = Arc::new(LargeListArray::from_iter_primitive::(data)); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - _ => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = Arc::new(LargeListArray::from_iter_primitive::( - expected_data, - )) as ArrayRef; - - assert_eq!(&sorted, &expected); - } - fn test_lex_sort_arrays( input: Vec, expected_output: Vec, @@ -1121,64 +1123,6 @@ mod tests { .collect() } - fn test_sort_binary_arrays( - data: Vec>>, - options: Option, - limit: Option, - expected_data: Vec>>, - fixed_length: Option, - ) { - // Fixed size binary array - if let Some(length) = fixed_length { - let input = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.iter().cloned(), length) - .unwrap(), - ); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - None => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter_with_size( - expected_data.iter().cloned(), - length, - ) - .unwrap(), - ) as ArrayRef; - - assert_eq!(&sorted, &expected); - } - - // Generic size binary array - fn make_generic_binary_array( - data: &[Option>], - ) -> Arc> { - Arc::new(GenericBinaryArray::::from_opt_vec( - data.iter() - .map(|binary| binary.as_ref().map(Vec::as_slice)) - .collect(), - )) - } - - // BinaryArray - let input = make_generic_binary_array::(&data); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - None => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = make_generic_binary_array::(&expected_data) as ArrayRef; - assert_eq!(&sorted, &expected); - - // LargeBinaryArray - let input = make_generic_binary_array::(&data); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - None => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = make_generic_binary_array::(&expected_data) as ArrayRef; - assert_eq!(&sorted, &expected); - } - #[test] fn test_sort_to_indices_primitives() { test_sort_to_indices_primitive_arrays::( @@ -1759,14 +1703,16 @@ mod tests { #[test] fn test_sort_decimal128() { // decimal default - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], None, None, vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)], ); // decimal descending - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1776,7 +1722,8 @@ mod tests { vec![Some(5), Some(4), Some(3), Some(2), Some(1), None, None], ); // decimal null_first and descending - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1786,7 +1733,8 @@ mod tests { vec![None, None, Some(5), Some(4), Some(3), Some(2), Some(1)], ); // decimal null_first - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: false, @@ -1796,14 +1744,16 @@ mod tests { vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)], ); // limit - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], None, Some(3), vec![None, None, Some(1)], ); // limit descending - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1813,7 +1763,8 @@ mod tests { vec![Some(5), Some(4), Some(3)], ); // limit descending null_first - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1823,7 +1774,8 @@ mod tests { vec![None, None, Some(5)], ); // limit null_first - test_sort_decimal128_array( + test_sort_arrays( + build_arrays_helper::decimal128_array, vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: false, @@ -1846,7 +1798,8 @@ mod tests { None, ]; // decimal default - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), None, None, @@ -1856,7 +1809,8 @@ mod tests { .collect(), ); // decimal descending - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), Some(SortOptions { descending: true, @@ -1869,7 +1823,8 @@ mod tests { .collect(), ); // decimal null_first and descending - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), Some(SortOptions { descending: true, @@ -1882,7 +1837,8 @@ mod tests { .collect(), ); // decimal null_first - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), Some(SortOptions { descending: false, @@ -1895,7 +1851,8 @@ mod tests { .collect(), ); // limit - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), None, Some(3), @@ -1905,7 +1862,8 @@ mod tests { .collect(), ); // limit descending - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), Some(SortOptions { descending: true, @@ -1918,7 +1876,8 @@ mod tests { .collect(), ); // limit descending null_first - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data.clone(), Some(SortOptions { descending: true, @@ -1931,7 +1890,8 @@ mod tests { .collect(), ); // limit null_first - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, data, Some(SortOptions { descending: false, @@ -1947,7 +1907,8 @@ mod tests { #[test] fn test_sort_decimal256_max_min() { - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, vec![ None, Some(i256::MIN), @@ -1971,7 +1932,8 @@ mod tests { ], ); - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, vec![ None, Some(i256::MIN), @@ -1995,7 +1957,8 @@ mod tests { ], ); - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, vec![ None, Some(i256::MIN), @@ -2012,7 +1975,8 @@ mod tests { vec![None, None, Some(i256::MIN), Some(i256::from_i128(-1))], ); - test_sort_decimal256_array( + test_sort_arrays( + build_arrays_helper::decimal256_array, vec![ None, Some(i256::MIN), @@ -2033,25 +1997,29 @@ mod tests { #[test] fn test_sort_primitives() { // default case - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(3), Some(5), Some(2), Some(3), None], None, None, vec![None, None, Some(2), Some(3), Some(3), Some(5)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(3), Some(5), Some(2), Some(3), None], None, None, vec![None, None, Some(2), Some(3), Some(3), Some(5)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(3), Some(5), Some(2), Some(3), None], None, None, vec![None, None, Some(2), Some(3), Some(3), Some(5)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(3), Some(5), Some(2), Some(3), None], None, None, @@ -2059,7 +2027,8 @@ mod tests { ); // descending - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2068,7 +2037,8 @@ mod tests { None, vec![Some(2), Some(0), Some(0), Some(-1), None, None], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2077,7 +2047,8 @@ mod tests { None, vec![Some(2), Some(0), Some(0), Some(-1), None, None], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2086,7 +2057,8 @@ mod tests { None, vec![Some(2), Some(0), Some(0), Some(-1), None, None], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2097,7 +2069,8 @@ mod tests { ); // descending, nulls first - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2106,7 +2079,8 @@ mod tests { None, vec![None, None, Some(2), Some(0), Some(0), Some(-1)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2115,7 +2089,8 @@ mod tests { None, vec![None, None, Some(2), Some(0), Some(0), Some(-1)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2124,7 +2099,8 @@ mod tests { None, vec![None, None, Some(2), Some(0), Some(0), Some(-1)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2134,7 +2110,8 @@ mod tests { vec![None, None, Some(2), Some(0), Some(0), Some(-1)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: true, @@ -2144,7 +2121,8 @@ mod tests { vec![None, None, Some(2)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![ None, Some(f16::from_f32(0.0)), @@ -2168,7 +2146,8 @@ mod tests { ], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], Some(SortOptions { descending: true, @@ -2177,7 +2156,8 @@ mod tests { None, vec![None, None, Some(2.0), Some(0.0), Some(0.0), Some(-1.0)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(f64::NAN), None], Some(SortOptions { descending: true, @@ -2186,7 +2166,8 @@ mod tests { None, vec![None, None, Some(f64::NAN), Some(2.0), Some(0.0), Some(-1.0)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], Some(SortOptions { descending: true, @@ -2197,7 +2178,8 @@ mod tests { ); // int8 nulls first - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: false, @@ -2206,7 +2188,8 @@ mod tests { None, vec![None, None, Some(-1), Some(0), Some(0), Some(2)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: false, @@ -2215,7 +2198,8 @@ mod tests { None, vec![None, None, Some(-1), Some(0), Some(0), Some(2)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: false, @@ -2224,7 +2208,8 @@ mod tests { None, vec![None, None, Some(-1), Some(0), Some(0), Some(2)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0), Some(2), Some(-1), Some(0), None], Some(SortOptions { descending: false, @@ -2233,7 +2218,8 @@ mod tests { None, vec![None, None, Some(-1), Some(0), Some(0), Some(2)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![ None, Some(f16::from_f32(0.0)), @@ -2256,7 +2242,8 @@ mod tests { Some(f16::from_f32(2.0)), ], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], Some(SortOptions { descending: false, @@ -2265,7 +2252,8 @@ mod tests { None, vec![None, None, Some(-1.0), Some(0.0), Some(0.0), Some(2.0)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(f64::NAN), None], Some(SortOptions { descending: false, @@ -2274,7 +2262,8 @@ mod tests { None, vec![None, None, Some(-1.0), Some(0.0), Some(2.0), Some(f64::NAN)], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], Some(SortOptions { descending: false, @@ -2285,7 +2274,8 @@ mod tests { ); // limit - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], Some(SortOptions { descending: false, @@ -2296,7 +2286,8 @@ mod tests { ); // limit with actual value - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(2.0), Some(4.0), Some(3.0), Some(1.0)], Some(SortOptions { descending: false, @@ -2307,7 +2298,8 @@ mod tests { ); // valid values less than limit with extra nulls - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(2.0), None, None, Some(1.0)], Some(SortOptions { descending: false, @@ -2317,7 +2309,8 @@ mod tests { vec![Some(1.0), Some(2.0), None], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(2.0), None, None, Some(1.0)], Some(SortOptions { descending: false, @@ -2328,7 +2321,8 @@ mod tests { ); // more nulls than limit - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(2.0), None, None, None], Some(SortOptions { descending: false, @@ -2338,7 +2332,8 @@ mod tests { vec![None, None], ); - test_sort_primitive_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_arrays::, vec![Some(2.0), None, None, None], Some(SortOptions { descending: false, @@ -2478,7 +2473,8 @@ mod tests { #[test] fn test_sort_strings() { - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ None, Some("bad"), @@ -2503,7 +2499,8 @@ mod tests { ], ); - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ None, Some("bad"), @@ -2531,7 +2528,8 @@ mod tests { ], ); - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ None, Some("bad"), @@ -2559,7 +2557,8 @@ mod tests { ], ); - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ None, Some("bad"), @@ -2587,7 +2586,8 @@ mod tests { ], ); - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ None, Some("bad"), @@ -2607,7 +2607,8 @@ mod tests { ); // valid values less than limit with extra nulls - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ Some("def long string longer than 12"), None, @@ -2622,7 +2623,8 @@ mod tests { vec![Some("abc"), Some("def long string longer than 12"), None], ); - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![ Some("def long string longer than 12"), None, @@ -2638,7 +2640,8 @@ mod tests { ); // more nulls than limit - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![Some("def long string longer than 12"), None, None, None], Some(SortOptions { descending: false, @@ -2648,7 +2651,8 @@ mod tests { vec![None, None], ); - test_sort_string_arrays( + test_sort_arrays( + build_arrays_helper::string_arrays, vec![Some("def long string longer than 12"), None, None, None], Some(SortOptions { descending: false, @@ -2928,7 +2932,8 @@ mod tests { #[test] fn test_sort_list() { - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ Some(vec![Some(1)]), Some(vec![Some(4)]), @@ -2946,10 +2951,10 @@ mod tests { Some(vec![Some(3)]), Some(vec![Some(4)]), ], - Some(1), ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(0.0))]), Some(vec![ @@ -2997,10 +3002,10 @@ mod tests { Some(f16::from_f32(1.0)), ]), ], - None, ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ Some(vec![Some(1.0), Some(0.0)]), Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), @@ -3020,10 +3025,10 @@ mod tests { Some(vec![Some(3.0), Some(3.0), Some(3.0), Some(3.0)]), Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), ], - None, ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ Some(vec![Some(1.0), Some(0.0)]), Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), @@ -3043,10 +3048,10 @@ mod tests { Some(vec![Some(3.0), Some(3.0), Some(3.0), Some(3.0)]), Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), ], - None, ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ Some(vec![Some(1), Some(0)]), Some(vec![Some(4), Some(3), Some(2), Some(1)]), @@ -3066,10 +3071,10 @@ mod tests { Some(vec![Some(3), Some(3), Some(3), Some(3)]), Some(vec![Some(4), Some(3), Some(2), Some(1)]), ], - None, ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ None, Some(vec![Some(4), None, Some(2)]), @@ -3089,10 +3094,10 @@ mod tests { None, None, ], - Some(3), ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![ Some(vec![Some(1), Some(0)]), Some(vec![Some(4), Some(3), Some(2), Some(1)]), @@ -3106,11 +3111,11 @@ mod tests { }), Some(2), vec![Some(vec![Some(1), Some(0)]), Some(vec![Some(1), Some(1)])], - None, ); // valid values less than limit with extra nulls - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![Some(vec![Some(1)]), None, None, Some(vec![Some(2)])], Some(SortOptions { descending: false, @@ -3118,10 +3123,10 @@ mod tests { }), Some(3), vec![Some(vec![Some(1)]), Some(vec![Some(2)]), None], - None, ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![Some(vec![Some(1)]), None, None, Some(vec![Some(2)])], Some(SortOptions { descending: false, @@ -3129,11 +3134,11 @@ mod tests { }), Some(3), vec![None, None, Some(vec![Some(1)])], - None, ); // more nulls than limit - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![Some(vec![Some(1)]), None, None, None], Some(SortOptions { descending: false, @@ -3141,10 +3146,10 @@ mod tests { }), Some(2), vec![None, None], - None, ); - test_sort_list_arrays::( + test_sort_arrays( + build_arrays_helper::primitive_list_arrays::, vec![Some(vec![Some(1)]), None, None, None], Some(SortOptions { descending: false, @@ -3152,13 +3157,13 @@ mod tests { }), Some(2), vec![Some(vec![Some(1)]), None], - None, ); } #[test] fn test_sort_binary() { - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(vec![0, 0, 0]), Some(vec![0, 0, 5]), @@ -3178,11 +3183,11 @@ mod tests { Some(vec![0, 0, 5]), Some(vec![0, 0, 7]), ], - Some(3), ); // with nulls - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(vec![0, 0, 0]), None, @@ -3204,10 +3209,10 @@ mod tests { None, None, ], - Some(3), ); - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(vec![3, 5, 7]), None, @@ -3229,11 +3234,11 @@ mod tests { None, None, ], - Some(3), ); // descending - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(vec![0, 0, 0]), None, @@ -3255,11 +3260,11 @@ mod tests { None, None, ], - Some(3), ); // nulls first - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(vec![0, 0, 0]), None, @@ -3281,11 +3286,11 @@ mod tests { Some(vec![0, 0, 3]), Some(vec![0, 0, 7]), ], - Some(3), ); // limit - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(vec![0, 0, 0]), None, @@ -3300,11 +3305,11 @@ mod tests { }), Some(4), vec![None, None, Some(vec![0, 0, 0]), Some(vec![0, 0, 1])], - Some(3), ); // var length - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(b"Hello".to_vec()), None, @@ -3326,11 +3331,11 @@ mod tests { None, None, ], - None, ); // limit - test_sort_binary_arrays( + test_sort_arrays( + build_arrays_helper::binary_arrays, vec![ Some(b"Hello".to_vec()), None, @@ -3350,7 +3355,6 @@ mod tests { Some(b"Apache".to_vec()), Some(b"Arrow-rs".to_vec()), ], - None, ); } From 7d97c61e87a40b069facfd68d9833c4c57458759 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 26 Dec 2024 18:03:53 +0000 Subject: [PATCH 2/4] fix failing test that cannot infer the fixed size length --- arrow-ord/src/sort.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 7c22e7704c27..f2ec336bb9fa 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -899,6 +899,28 @@ mod tests { arrays } + + // This function is needed when the input data have items, but the expected data only have None, and it would cause different number of vector to return (the input will return 3 and the expected will return 2) + pub fn primitive_list_arrays_with_fixed( + data: Vec>>>, + ) -> Vec + where + T: ArrowPrimitiveType, + PrimitiveArray: From>>, + { + vec![ + Arc::new(GenericListArray::::from_iter_primitive::( + data.clone(), + )) as ArrayRef, + Arc::new(GenericListArray::::from_iter_primitive::( + data.clone(), + )), + Arc::new(FixedSizeListArray::from_iter_primitive::( + data, + FIXED_SIZE_LIST_ARRAY, + )), + ] + } } /// Return Some(length) if all lists have the same length, None otherwise @@ -3138,7 +3160,7 @@ mod tests { // more nulls than limit test_sort_arrays( - build_arrays_helper::primitive_list_arrays::, + build_arrays_helper::primitive_list_arrays_with_fixed::<1, Int32Type>, vec![Some(vec![Some(1)]), None, None, None], Some(SortOptions { descending: false, From 3f1f7a79b779242a3314f619a9ae55ebc4f5ca50 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 26 Dec 2024 18:38:38 +0000 Subject: [PATCH 3/4] fix failed test --- arrow-ord/src/sort.rs | 82 ++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index f2ec336bb9fa..e2ac52998db0 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -877,50 +877,47 @@ mod tests { arrays } + pub fn primitive_generic_list_array( + data: &[Option>>], + ) -> ArrayRef + where + OffsetSize: OffsetSizeTrait, + T: ArrowPrimitiveType, + PrimitiveArray: From>>, + { + Arc::new(GenericListArray::::from_iter_primitive::(data.to_vec())) + } + + pub fn primitive_fixed_list_array( + data: &[Option>>], + length: i32, + ) -> ArrayRef + where + T: ArrowPrimitiveType, + PrimitiveArray: From>>, + { + Arc::new(FixedSizeListArray::from_iter_primitive::( + data.to_vec(), + length, + )) + } + pub fn primitive_list_arrays(data: Vec>>>) -> Vec where T: ArrowPrimitiveType, PrimitiveArray: From>>, { let mut arrays = vec![ - Arc::new(GenericListArray::::from_iter_primitive::( - data.clone(), - )) as ArrayRef, - Arc::new(GenericListArray::::from_iter_primitive::( - data.clone(), - )), + primitive_generic_list_array::(&data), + primitive_generic_list_array::(&data), ]; if let Some(first_length) = get_same_lists_length(&data) { - arrays.push(Arc::new( - FixedSizeListArray::from_iter_primitive::(data, first_length as i32), - )); + arrays.push(primitive_fixed_list_array::(&data, first_length as i32)); } arrays } - - // This function is needed when the input data have items, but the expected data only have None, and it would cause different number of vector to return (the input will return 3 and the expected will return 2) - pub fn primitive_list_arrays_with_fixed( - data: Vec>>>, - ) -> Vec - where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - vec![ - Arc::new(GenericListArray::::from_iter_primitive::( - data.clone(), - )) as ArrayRef, - Arc::new(GenericListArray::::from_iter_primitive::( - data.clone(), - )), - Arc::new(FixedSizeListArray::from_iter_primitive::( - data, - FIXED_SIZE_LIST_ARRAY, - )), - ] - } } /// Return Some(length) if all lists have the same length, None otherwise @@ -956,7 +953,20 @@ mod tests { expected_data: Vec>, ) { let input = into_lists_fn(data); - let expected = into_lists_fn(expected_data); + let mut expected = into_lists_fn(expected_data); + + // Filter out mismatched data types + // This can happen for getting some type of Fixed array in the expected (due to limit or something), but not in the input + if input.len() != expected.len() { + let all_input_specific_class = input + .iter() + .map(|array| array.data_type()) + .collect::>(); + expected = expected + .into_iter() + .filter(|array| all_input_specific_class.contains(&array.data_type())) + .collect::>() + } assert_eq!( input.len(), @@ -3160,7 +3170,13 @@ mod tests { // more nulls than limit test_sort_arrays( - build_arrays_helper::primitive_list_arrays_with_fixed::<1, Int32Type>, + |arrays| { + vec![ + build_arrays_helper::primitive_generic_list_array::(&arrays), + build_arrays_helper::primitive_generic_list_array::(&arrays), + build_arrays_helper::primitive_fixed_list_array::(&arrays, 1), + ] + }, vec![Some(vec![Some(1)]), None, None, None], Some(SortOptions { descending: false, From 253c99abe5a770406948753effd6b67f9bb59198 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:38:33 +0200 Subject: [PATCH 4/4] apply code review changed --- arrow-ord/src/sort.rs | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index e2ac52998db0..a46875844968 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -806,9 +806,10 @@ mod tests { mod build_arrays_helper { use crate::sort::tests::get_same_lists_length; use arrow_array::{ - ArrayRef, ArrowPrimitiveType, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, - FixedSizeListArray, GenericBinaryArray, GenericListArray, LargeStringArray, - OffsetSizeTrait, PrimitiveArray, StringArray, StringViewArray, + ArrayRef, ArrowPrimitiveType, BinaryArray, Decimal128Array, Decimal256Array, + FixedSizeBinaryArray, FixedSizeListArray, GenericListArray, + LargeBinaryArray, LargeStringArray, OffsetSizeTrait, PrimitiveArray, StringArray, + StringViewArray, }; use arrow_buffer::i256; use std::sync::Arc; @@ -848,20 +849,14 @@ mod tests { } pub fn binary_arrays(data: Vec>>) -> Vec { - // Generic size binary array - fn generic_binary_array( - data: &[Option>], - ) -> Arc> { - Arc::new(GenericBinaryArray::::from_opt_vec( - data.iter() - .map(|binary| binary.as_ref().map(Vec::as_slice)) - .collect(), - )) - } + let binary_data = data + .iter() + .map(|binary| binary.as_ref().map(Vec::as_slice)) + .collect::>(); let mut arrays = vec![ - generic_binary_array::(&data) as ArrayRef, - generic_binary_array::(&data) as ArrayRef, + Arc::new(BinaryArray::from_opt_vec(binary_data.clone())) as ArrayRef, + Arc::new(LargeBinaryArray::from_opt_vec(binary_data)) as ArrayRef, ]; if let Some(first_length) = get_same_lists_length(&data) { @@ -883,7 +878,6 @@ mod tests { where OffsetSize: OffsetSizeTrait, T: ArrowPrimitiveType, - PrimitiveArray: From>>, { Arc::new(GenericListArray::::from_iter_primitive::(data.to_vec())) } @@ -894,7 +888,6 @@ mod tests { ) -> ArrayRef where T: ArrowPrimitiveType, - PrimitiveArray: From>>, { Arc::new(FixedSizeListArray::from_iter_primitive::( data.to_vec(), @@ -905,7 +898,6 @@ mod tests { pub fn primitive_list_arrays(data: Vec>>>) -> Vec where T: ArrowPrimitiveType, - PrimitiveArray: From>>, { let mut arrays = vec![ primitive_generic_list_array::(&data),