From 9d985c150e466ace6a8d0a749b1412c01f5d8222 Mon Sep 17 00:00:00 2001 From: JasonLi-cn Date: Sat, 30 Mar 2024 22:04:12 +0800 Subject: [PATCH] pass cargo clippy --- .../physical-expr/src/string_expressions.rs | 391 +----------------- 1 file changed, 5 insertions(+), 386 deletions(-) diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 7e39ed8d6cfb..38d1ae58f5b3 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -32,7 +32,6 @@ use arrow::{ datatypes::DataType, }; use arrow_buffer::{MutableBuffer, NullBuffer}; -use uuid::Uuid; use datafusion_common::Result; use datafusion_common::{ @@ -41,127 +40,6 @@ use datafusion_common::{ }; use datafusion_expr::ColumnarValue; -/// applies a unary expression to `args[0]` that is expected to be downcastable to -/// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset) -/// # Errors -/// This function errors when: -/// * the number of arguments is not 1 -/// * the first argument is not castable to a `GenericStringArray` -pub(crate) fn unary_string_function<'a, T, O, F, R>( - args: &[&'a dyn Array], - op: F, - name: &str, -) -> Result> -where - R: AsRef, - O: OffsetSizeTrait, - T: OffsetSizeTrait, - F: Fn(&'a str) -> R, -{ - if args.len() != 1 { - return exec_err!( - "{:?} args were supplied but {} takes exactly one argument", - args.len(), - name - ); - } - - let string_array = as_generic_string_array::(args[0])?; - - // first map is the iterator, second is for the `Option<_>` - Ok(string_array.iter().map(|string| string.map(&op)).collect()) -} - -fn handle<'a, F, R>(args: &'a [ColumnarValue], op: F, name: &str) -> Result -where - R: AsRef, - F: Fn(&'a str) -> R, -{ - match &args[0] { - ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8 => { - Ok(ColumnarValue::Array(Arc::new(unary_string_function::< - i32, - i32, - _, - _, - >( - &[a.as_ref()], op, name - )?))) - } - DataType::LargeUtf8 => { - Ok(ColumnarValue::Array(Arc::new(unary_string_function::< - i64, - i64, - _, - _, - >( - &[a.as_ref()], op, name - )?))) - } - other => exec_err!("Unsupported data type {other:?} for function {name}"), - }, - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(a) => { - let result = a.as_ref().map(|x| (op)(x).as_ref().to_string()); - Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result))) - } - ScalarValue::LargeUtf8(a) => { - let result = a.as_ref().map(|x| (op)(x).as_ref().to_string()); - Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result))) - } - other => exec_err!("Unsupported data type {other:?} for function {name}"), - }, - } -} - -/// Returns the numeric code of the first character of the argument. -/// ascii('x') = 120 -pub fn ascii(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| { - string.map(|string: &str| { - let mut chars = string.chars(); - chars.next().map_or(0, |v| v as i32) - }) - }) - .collect::(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character. -/// chr(65) = 'A' -pub fn chr(args: &[ArrayRef]) -> Result { - let integer_array = as_int64_array(&args[0])?; - - // first map is the iterator, second is for the `Option<_>` - let result = integer_array - .iter() - .map(|integer: Option| { - integer - .map(|integer| { - if integer == 0 { - exec_err!("null character not permitted.") - } else { - match core::char::from_u32(integer as u32) { - Some(integer) => Ok(integer.to_string()), - None => { - exec_err!("requested character too large for encoding.") - } - } - } - }) - .transpose() - }) - .collect::>()?; - - Ok(Arc::new(result) as ArrayRef) -} - enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), @@ -181,7 +59,7 @@ impl<'a> ColumnarValueRef<'a> { fn nulls(&self) -> Option { match &self { Self::Scalar(_) | Self::NonNullableArray(_) => None, - Self::NullableArray(array) => array.nulls().map(|b| b.clone()), + Self::NullableArray(array) => array.nulls().cloned(), } } } @@ -332,7 +210,7 @@ pub fn concat_ws(args: &[ColumnarValue]) -> Result { let mut result = String::new(); let iter = &mut args[1..].iter(); - while let Some(arg) = iter.next() { + for arg in iter.by_ref() { match arg { ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => { result.push_str(s); @@ -343,7 +221,7 @@ pub fn concat_ws(args: &[ColumnarValue]) -> Result { } } - while let Some(arg) = iter.next() { + for arg in iter.by_ref() { match arg { ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => { result.push_str(sep); @@ -413,14 +291,14 @@ pub fn concat_ws(args: &[ColumnarValue]) -> Result { } let mut iter = columns.iter(); - while let Some(column) = iter.next() { + for column in iter.by_ref() { if column.is_valid(i) { builder.write::(column, i); break; } } - while let Some(column) = iter.next() { + for column in iter { if column.is_valid(i) { builder.write::(&sep, i); builder.write::(column, i); @@ -529,196 +407,8 @@ pub fn ends_with(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } -/// Converts the number to its equivalent hexadecimal representation. -/// to_hex(2147483647) = '7fffffff' -pub fn to_hex(args: &[ArrayRef]) -> Result -where - T::Native: OffsetSizeTrait, -{ - let integer_array = as_primitive_array::(&args[0])?; - - let result = integer_array - .iter() - .map(|integer| { - if let Some(value) = integer { - if let Some(value_usize) = value.to_usize() { - Ok(Some(format!("{value_usize:x}"))) - } else if let Some(value_isize) = value.to_isize() { - Ok(Some(format!("{value_isize:x}"))) - } else { - exec_err!("Unsupported data type {integer:?} for function to_hex") - } - } else { - Ok(None) - } - }) - .collect::>>()?; - - Ok(Arc::new(result) as ArrayRef) -} - -/// Converts the string to all upper case. -/// upper('tom') = 'TOM' -pub fn upper(args: &[ColumnarValue]) -> Result { - handle(args, |string| string.to_uppercase(), "upper") -} - -/// Prints random (v4) uuid values per row -/// uuid() = 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11' -pub fn uuid(args: &[ColumnarValue]) -> Result { - let len: usize = match &args[0] { - ColumnarValue::Array(array) => array.len(), - _ => return exec_err!("Expect uuid function to take no param"), - }; - - let values = iter::repeat_with(|| Uuid::new_v4().to_string()).take(len); - let array = GenericStringArray::::from_iter_values(values); - Ok(ColumnarValue::Array(Arc::new(array))) -} - -/// OVERLAY(string1 PLACING string2 FROM integer FOR integer2) -/// Replaces a substring of string1 with string2 starting at the integer bit -/// pgsql overlay('Txxxxas' placing 'hom' from 2 for 4) → Thomas -/// overlay('Txxxxas' placing 'hom' from 2) -> Thomxas, without for option, str2's len is instead -pub fn overlay(args: &[ArrayRef]) -> Result { - match args.len() { - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let characters_array = as_generic_string_array::(&args[1])?; - let pos_num = as_int64_array(&args[2])?; - - let result = string_array - .iter() - .zip(characters_array.iter()) - .zip(pos_num.iter()) - .map(|((string, characters), start_pos)| { - match (string, characters, start_pos) { - (Some(string), Some(characters), Some(start_pos)) => { - let string_len = string.chars().count(); - let characters_len = characters.chars().count(); - let replace_len = characters_len as i64; - let mut res = - String::with_capacity(string_len.max(characters_len)); - - //as sql replace index start from 1 while string index start from 0 - if start_pos > 1 && start_pos - 1 < string_len as i64 { - let start = (start_pos - 1) as usize; - res.push_str(&string[..start]); - } - res.push_str(characters); - // if start + replace_len - 1 >= string_length, just to string end - if start_pos + replace_len - 1 < string_len as i64 { - let end = (start_pos + replace_len - 1) as usize; - res.push_str(&string[end..]); - } - Ok(Some(res)) - } - _ => Ok(None), - } - }) - .collect::>>()?; - Ok(Arc::new(result) as ArrayRef) - } - 4 => { - let string_array = as_generic_string_array::(&args[0])?; - let characters_array = as_generic_string_array::(&args[1])?; - let pos_num = as_int64_array(&args[2])?; - let len_num = as_int64_array(&args[3])?; - - let result = string_array - .iter() - .zip(characters_array.iter()) - .zip(pos_num.iter()) - .zip(len_num.iter()) - .map(|(((string, characters), start_pos), len)| { - match (string, characters, start_pos, len) { - (Some(string), Some(characters), Some(start_pos), Some(len)) => { - let string_len = string.chars().count(); - let characters_len = characters.chars().count(); - let replace_len = len.min(string_len as i64); - let mut res = - String::with_capacity(string_len.max(characters_len)); - - //as sql replace index start from 1 while string index start from 0 - if start_pos > 1 && start_pos - 1 < string_len as i64 { - let start = (start_pos - 1) as usize; - res.push_str(&string[..start]); - } - res.push_str(characters); - // if start + replace_len - 1 >= string_length, just to string end - if start_pos + replace_len - 1 < string_len as i64 { - let end = (start_pos + replace_len - 1) as usize; - res.push_str(&string[end..]); - } - Ok(Some(res)) - } - _ => Ok(None), - } - }) - .collect::>>()?; - Ok(Arc::new(result) as ArrayRef) - } - other => { - exec_err!("overlay was called with {other} arguments. It requires 3 or 4.") - } - } -} - -///Returns the Levenshtein distance between the two given strings. -/// LEVENSHTEIN('kitten', 'sitting') = 3 -pub fn levenshtein(args: &[ArrayRef]) -> Result { - if args.len() != 2 { - return exec_err!( - "levenshtein function requires two arguments, got {}", - args.len() - ); - } - let str1_array = as_generic_string_array::(&args[0])?; - let str2_array = as_generic_string_array::(&args[1])?; - match args[0].data_type() { - DataType::Utf8 => { - let result = str1_array - .iter() - .zip(str2_array.iter()) - .map(|(string1, string2)| match (string1, string2) { - (Some(string1), Some(string2)) => { - Some(datafusion_strsim::levenshtein(string1, string2) as i32) - } - _ => None, - }) - .collect::(); - Ok(Arc::new(result) as ArrayRef) - } - DataType::LargeUtf8 => { - let result = str1_array - .iter() - .zip(str2_array.iter()) - .map(|(string1, string2)| match (string1, string2) { - (Some(string1), Some(string2)) => { - Some(datafusion_strsim::levenshtein(string1, string2) as i64) - } - _ => None, - }) - .collect::(); - Ok(Arc::new(result) as ArrayRef) - } - other => { - exec_err!( - "levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." - ) - } - } -} - #[cfg(test)] mod tests { - use arrow::{array::Int32Array, datatypes::Int32Type}; - use arrow_array::Int64Array; - - use datafusion_common::cast::as_int32_array; - - use crate::string_expressions; - use super::*; #[test] @@ -796,75 +486,4 @@ mod tests { Ok(()) } - - #[test] - // Test to_hex function for zero - fn to_hex_zero() -> Result<()> { - let array = vec![0].into_iter().collect::(); - let array_ref = Arc::new(array); - let hex_value_arc = string_expressions::to_hex::(&[array_ref])?; - let hex_value = as_string_array(&hex_value_arc)?; - let expected = StringArray::from(vec![Some("0")]); - assert_eq!(&expected, hex_value); - - Ok(()) - } - - #[test] - // Test to_hex function for positive number - fn to_hex_positive_number() -> Result<()> { - let array = vec![100].into_iter().collect::(); - let array_ref = Arc::new(array); - let hex_value_arc = string_expressions::to_hex::(&[array_ref])?; - let hex_value = as_string_array(&hex_value_arc)?; - let expected = StringArray::from(vec![Some("64")]); - assert_eq!(&expected, hex_value); - - Ok(()) - } - - #[test] - // Test to_hex function for negative number - fn to_hex_negative_number() -> Result<()> { - let array = vec![-1].into_iter().collect::(); - let array_ref = Arc::new(array); - let hex_value_arc = string_expressions::to_hex::(&[array_ref])?; - let hex_value = as_string_array(&hex_value_arc)?; - let expected = StringArray::from(vec![Some("ffffffffffffffff")]); - assert_eq!(&expected, hex_value); - - Ok(()) - } - - #[test] - fn to_overlay() -> Result<()> { - let string = - Arc::new(StringArray::from(vec!["123", "abcdefg", "xyz", "Txxxxas"])); - let replace_string = - Arc::new(StringArray::from(vec!["abc", "qwertyasdfg", "ijk", "hom"])); - let start = Arc::new(Int64Array::from(vec![4, 1, 1, 2])); // start - let end = Arc::new(Int64Array::from(vec![5, 7, 2, 4])); // replace len - - let res = overlay::(&[string, replace_string, start, end]).unwrap(); - let result = as_generic_string_array::(&res).unwrap(); - let expected = StringArray::from(vec!["abc", "qwertyasdfg", "ijkz", "Thomas"]); - assert_eq!(&expected, result); - - Ok(()) - } - - #[test] - fn to_levenshtein() -> Result<()> { - let string1_array = - Arc::new(StringArray::from(vec!["123", "abc", "xyz", "kitten"])); - let string2_array = - Arc::new(StringArray::from(vec!["321", "def", "zyx", "sitting"])); - let res = levenshtein::(&[string1_array, string2_array]).unwrap(); - let result = - as_int32_array(&res).expect("failed to initialized function levenshtein"); - let expected = Int32Array::from(vec![2, 3, 2, 3]); - assert_eq!(&expected, result); - - Ok(()) - } }