Skip to content

Commit

Permalink
fix: encode should work with non-UTF-8 binaries (#14087)
Browse files (browse the repository at this point in the history)
* fix: encode function should work with both strings and binary data

closes #14055

* chore: address comments, add test
  • Loading branch information
mesejo authored Jan 16, 2025
1 parent 50c7977 commit 5fa8b3b
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 7 deletions.
24 changes: 19 additions & 5 deletions datafusion/functions/src/encoding/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,21 @@ impl ScalarUDFImpl for EncodeFunc {
}

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
Ok(arg_types[0].to_owned())
use DataType::*;

Ok(match arg_types[0] {
Utf8 => Utf8,
LargeUtf8 => LargeUtf8,
Utf8View => Utf8,
Binary => Utf8,
LargeBinary => LargeUtf8,
Null => Null,
_ => {
return plan_err!(
"The encode function can only accept Utf8 or Binary or Null."
);
}
})
}

fn invoke_batch(
Expand All @@ -110,12 +124,12 @@ impl ScalarUDFImpl for EncodeFunc {
}

match arg_types[0] {
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
DataType::Utf8 | DataType::Utf8View | DataType::Null => {
Ok(vec![DataType::Utf8; 2])
}
DataType::LargeUtf8 | DataType::LargeBinary => {
Ok(vec![DataType::LargeUtf8, DataType::Utf8])
}
DataType::LargeUtf8 => Ok(vec![DataType::LargeUtf8, DataType::Utf8]),
DataType::Binary => Ok(vec![DataType::Binary, DataType::Utf8]),
DataType::LargeBinary => Ok(vec![DataType::LargeBinary, DataType::Utf8]),
_ => plan_err!(
"1st argument should be Utf8 or Binary or Null, got {:?}",
arg_types[0]
Expand Down
28 changes: 26 additions & 2 deletions datafusion/sqllogictest/test_files/encoding.slt
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ CREATE TABLE test(
hex_field TEXT
) as VALUES
(0, 'abc', encode('abc', 'base64'), encode('abc', 'hex')),
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
(2, NULL, NULL, NULL)
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
(2, NULL, NULL, NULL),
(3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex'))
;

# errors
Expand All @@ -43,34 +44,51 @@ select decode(hex_field, 'non_encoding') from test;
query error
select to_hex(hex_field) from test;

query error
select arrow_cast(decode(X'8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), 'Utf8');

# Arrays tests
query T
SELECT encode(bin_field, 'hex') FROM test ORDER BY num;
----
616263
717765717765
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num;
----
abc
qweqwe
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num;
----
abc
qweqwe
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
select to_hex(num) from test ORDER BY num;
----
0
1
2
3

query T
select encode(bin_field, 'base64') FROM test WHERE num = 3;
----
j1DT9g6uNw3b+FyGIZxVEIo1AWU

query B
select decode(encode(bin_field, 'base64'), 'base64') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3;
----
true

# test for Utf8View support for encode
statement ok
Expand Down Expand Up @@ -102,3 +120,9 @@ Andrew QW5kcmV3 416e64726577 X WA 58
Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67
Raphael UmFwaGFlbA 5261706861656c R Ug 52
NULL NULL NULL R Ug 52

# test for hex digest
query T
select encode(digest('hello', 'sha256'), 'hex');
----
2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824

0 comments on commit 5fa8b3b

Please sign in to comment.