Skip to content

Commit

Permalink
optimize trim function
Browse files Browse the repository at this point in the history
  • Loading branch information
JasonLi-cn committed Apr 9, 2024
1 parent eb05741 commit c523c0c
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 6 deletions.
5 changes: 5 additions & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,8 @@ required-features = ["datetime_expressions"]
harness = false
name = "substr_index"
required-features = ["unicode_expressions"]

[[bench]]
harness = false
name = "ltrim"
required-features = ["string_expressions"]
50 changes: 50 additions & 0 deletions datafusion/functions/benches/ltrim.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;

use arrow::array::{ArrayRef, StringArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_common::ScalarValue;
use datafusion_expr::ColumnarValue;
use datafusion_functions::string;
use std::sync::Arc;

/// Builds the two-argument input passed to the trim function under benchmark.
///
/// * First argument: a `StringArray` of `size` identical rows, each row being
///   the literal `datafusion` with `characters` both prepended and appended.
/// * Second argument: a `Utf8` scalar holding `characters` itself.
fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
    // Format the row once; every array slot is a copy of this same text.
    let row = format!("{}datafusion{}", characters, characters);
    let rows = std::iter::repeat(row.as_str()).take(size);
    let strings: ArrayRef = Arc::new(StringArray::from_iter_values(rows));
    let trim_set = ScalarValue::Utf8(Some(characters.to_owned()));

    vec![
        ColumnarValue::Array(strings),
        ColumnarValue::Scalar(trim_set),
    ]
}

/// Registers one `ltrim` benchmark for every combination of trim string and
/// row count.
///
/// Each benchmark is named `ltrim <characters>: <size>` and times a single
/// `invoke` call on arguments produced by `create_args`.
fn criterion_benchmark(c: &mut Criterion) {
    let ltrim = string::ltrim();
    let trim_sets = ["\"", "Header:"];
    let row_counts = [1024, 4096, 8192];

    for characters in trim_sets {
        for size in row_counts {
            // Arguments are built outside the timed closure so that only the
            // `invoke` call is measured.
            let args = create_args(size, characters);
            let bench_name = format!("ltrim {}: {}", characters, size);
            c.bench_function(&bench_name, |b| {
                b.iter(|| black_box(ltrim.invoke(&args)))
            });
        }
    }
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
11 changes: 9 additions & 2 deletions datafusion/functions/src/string/btrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
use datafusion_physical_expr::functions::Hint;

use crate::string::common::*;
use crate::utils::{make_scalar_function, utf8_to_str_type};
Expand Down Expand Up @@ -72,8 +73,14 @@ impl ScalarUDFImpl for BTrimFunc {

/// Runs `btrim` over `args`, picking the offset width from the data type of
/// the first argument: `i32` for `Utf8`, `i64` for `LargeUtf8`. Any other
/// data type falls through to `exec_err!`, whose message names that type.
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
// NOTE(review): each type appears twice below, once with `vec![]` and once
// with `vec![Hint::Pad, Hint::AcceptsSingular]`. This looks like a diff shown
// without +/- markers (removed lines first, then added lines) -- confirm
// against the commit. Read literally as Rust, the earlier arm for a pattern
// is the one that matches.
DataType::Utf8 => make_scalar_function(btrim::<i32>, vec![])(args),
DataType::LargeUtf8 => make_scalar_function(btrim::<i64>, vec![])(args),
// Presumably one hint per argument: `Pad` for the string input and
// `AcceptsSingular` for the trim-characters argument -- TODO confirm against
// the `Hint` documentation.
DataType::Utf8 => make_scalar_function(
btrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
btrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function btrim"),
}
}
Expand Down
9 changes: 9 additions & 0 deletions datafusion/functions/src/string/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,15 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
2 => {
let characters_array = as_generic_string_array::<T>(&args[1])?;

if characters_array.len() == 1 {
let characters = characters_array.value(0);
let result = string_array
.iter()
.map(|item| item.map(|string| func(string, characters)))
.collect::<GenericStringArray<T>>();
return Ok(Arc::new(result) as ArrayRef);
}

let result = string_array
.iter()
.zip(characters_array.iter())
Expand Down
11 changes: 9 additions & 2 deletions datafusion/functions/src/string/ltrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
use datafusion_physical_expr::functions::Hint;

use crate::string::common::*;
use crate::utils::{make_scalar_function, utf8_to_str_type};
Expand Down Expand Up @@ -70,8 +71,14 @@ impl ScalarUDFImpl for LtrimFunc {

/// Runs `ltrim` over `args`, picking the offset width from the data type of
/// the first argument: `i32` for `Utf8`, `i64` for `LargeUtf8`. Any other
/// data type falls through to `exec_err!`, whose message names that type.
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
// NOTE(review): each type appears twice below, once with `vec![]` and once
// with `vec![Hint::Pad, Hint::AcceptsSingular]`. This looks like a diff shown
// without +/- markers (removed lines first, then added lines) -- confirm
// against the commit. Read literally as Rust, the earlier arm for a pattern
// is the one that matches.
DataType::Utf8 => make_scalar_function(ltrim::<i32>, vec![])(args),
DataType::LargeUtf8 => make_scalar_function(ltrim::<i64>, vec![])(args),
// Presumably one hint per argument: `Pad` for the string input and
// `AcceptsSingular` for the trim-characters argument -- TODO confirm against
// the `Hint` documentation.
DataType::Utf8 => make_scalar_function(
ltrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
ltrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function ltrim"),
}
}
Expand Down
11 changes: 9 additions & 2 deletions datafusion/functions/src/string/rtrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
use datafusion_physical_expr::functions::Hint;

use crate::string::common::*;
use crate::utils::{make_scalar_function, utf8_to_str_type};
Expand Down Expand Up @@ -70,8 +71,14 @@ impl ScalarUDFImpl for RtrimFunc {

/// Runs `rtrim` over `args`, picking the offset width from the data type of
/// the first argument: `i32` for `Utf8`, `i64` for `LargeUtf8`. Any other
/// data type falls through to `exec_err!`, whose message names that type.
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
// NOTE(review): each type appears twice below, once with `vec![]` and once
// with `vec![Hint::Pad, Hint::AcceptsSingular]`. This looks like a diff shown
// without +/- markers (removed lines first, then added lines) -- confirm
// against the commit. Read literally as Rust, the earlier arm for a pattern
// is the one that matches.
DataType::Utf8 => make_scalar_function(rtrim::<i32>, vec![])(args),
DataType::LargeUtf8 => make_scalar_function(rtrim::<i64>, vec![])(args),
// Presumably one hint per argument: `Pad` for the string input and
// `AcceptsSingular` for the trim-characters argument -- TODO confirm against
// the `Hint` documentation.
DataType::Utf8 => make_scalar_function(
rtrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
rtrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function rtrim"),
}
}
Expand Down

0 comments on commit c523c0c

Please sign in to comment.