Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix: encode should work with non-UTF-8 binaries #14087

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions datafusion/functions/src/encoding/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,20 @@ impl ScalarUDFImpl for EncodeFunc {
}

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
Ok(arg_types[0].to_owned())
use DataType::*;

Ok(match arg_types[0] {
Utf8 => Utf8,
LargeUtf8 => LargeUtf8,
Binary => Utf8,
LargeBinary => LargeUtf8,
Null => Null,
_ => {
return plan_err!(
"The encode function can only accept Utf8 or Binary or Null."
);
}
})
}

fn invoke_batch(
Expand All @@ -112,12 +125,12 @@ impl ScalarUDFImpl for EncodeFunc {
}

match arg_types[0] {
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
DataType::Utf8 | DataType::Utf8View | DataType::Null => {
Ok(vec![DataType::Utf8; 2])
}
DataType::LargeUtf8 | DataType::LargeBinary => {
Ok(vec![DataType::LargeUtf8, DataType::Utf8])
}
DataType::LargeUtf8 => Ok(vec![DataType::LargeUtf8, DataType::Utf8]),
DataType::Binary => Ok(vec![DataType::Binary, DataType::Utf8]),
DataType::LargeBinary => Ok(vec![DataType::LargeBinary, DataType::Utf8]),
_ => plan_err!(
"1st argument should be Utf8 or Binary or Null, got {:?}",
arg_types[0]
Expand Down
24 changes: 21 additions & 3 deletions datafusion/sqllogictest/test_files/encoding.slt
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ CREATE TABLE test(
hex_field TEXT
) as VALUES
(0, 'abc', encode('abc', 'base64'), encode('abc', 'hex')),
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
(2, NULL, NULL, NULL)
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
(2, NULL, NULL, NULL),
(3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex'))
;

# errors
Expand All @@ -43,34 +44,51 @@ select decode(hex_field, 'non_encoding') from test;
query error
select to_hex(hex_field) from test;

query error
select arrow_cast(decode(X'8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), 'Utf8');

# Arrays tests
query T
SELECT encode(bin_field, 'hex') FROM test ORDER BY num;
----
616263
717765717765
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num;
----
abc
qweqwe
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num;
----
abc
qweqwe
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
select to_hex(num) from test ORDER BY num;
----
0
1
2
3

query T
select encode(bin_field, 'base64') FROM test WHERE num = 3;
----
j1DT9g6uNw3b+FyGIZxVEIo1AWU

query B
select decode(encode(bin_field, 'base64'), 'base64') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3;
----
true

# test for Utf8View support for encode
statement ok
Expand Down Expand Up @@ -101,4 +119,4 @@ FROM test_utf8view;
Andrew QW5kcmV3 416e64726577 X WA 58
Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67
Raphael UmFwaGFlbA 5261706861656c R Ug 52
NULL NULL NULL R Ug 52
NULL NULL NULL R Ug 52
Loading