From aa862364667a2d30f273a9415ae1f0cd65bae038 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Sat, 11 Jan 2025 14:53:11 +0100 Subject: [PATCH 1/2] fix: encode function should work with strings and binary closes #14055 --- datafusion/functions/src/encoding/inner.rs | 23 ++++++++++++++---- .../sqllogictest/test_files/encoding.slt | 24 ++++++++++++++++--- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 42d2ff98c39d..3d90ff15cbaa 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -87,7 +87,20 @@ impl ScalarUDFImpl for EncodeFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - Ok(arg_types[0].to_owned()) + use DataType::*; + + Ok(match arg_types[0] { + Utf8 => Utf8, + LargeUtf8 => LargeUtf8, + Binary => Utf8, + LargeBinary => LargeUtf8, + Null => Null, + _ => { + return plan_err!( + "The encode function can only accept Utf8 or Binary or Null." + ); + } + }) } fn invoke_batch( @@ -112,12 +125,12 @@ impl ScalarUDFImpl for EncodeFunc { } match arg_types[0] { - DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => { + DataType::Utf8 | DataType::Utf8View | DataType::Null => { Ok(vec![DataType::Utf8; 2]) } - DataType::LargeUtf8 | DataType::LargeBinary => { - Ok(vec![DataType::LargeUtf8, DataType::Utf8]) - } + DataType::LargeUtf8 => Ok(vec![DataType::LargeUtf8, DataType::Utf8]), + DataType::Binary => Ok(vec![DataType::Binary, DataType::Utf8]), + DataType::LargeBinary => Ok(vec![DataType::LargeBinary, DataType::Utf8]), _ => plan_err!( "1st argument should be Utf8 or Binary or Null, got {:?}", arg_types[0] diff --git a/datafusion/sqllogictest/test_files/encoding.slt b/datafusion/sqllogictest/test_files/encoding.slt index 24efb33f7896..00956bfea95d 100644 --- a/datafusion/sqllogictest/test_files/encoding.slt +++ b/datafusion/sqllogictest/test_files/encoding.slt @@ -23,8 +23,9 @@ CREATE TABLE test( hex_field TEXT ) as VALUES (0, 'abc', encode('abc', 'base64'), encode('abc', 'hex')), - (1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')), - (2, NULL, NULL, NULL) + (1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')), + (2, NULL, NULL, NULL), + (3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex')) ; # errors @@ -43,6 +44,9 @@ select decode(hex_field, 'non_encoding') from test; query error select to_hex(hex_field) from test; +query error +select arrow_cast(decode(X'8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), 'Utf8'); + # Arrays tests query T SELECT encode(bin_field, 'hex') FROM test ORDER BY num; @@ -50,6 +54,7 @@ SELECT encode(bin_field, 'hex') FROM test ORDER BY num; 616263 717765717765 NULL +8f50d3f60eae370ddbf85c86219c55108a350165 query T SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num; @@ -57,6 +62,7 @@ SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num abc qweqwe NULL +8f50d3f60eae370ddbf85c86219c55108a350165 query T SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num; @@ -64,6 +70,7 @@ SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num; abc qweqwe NULL +8f50d3f60eae370ddbf85c86219c55108a350165 query T select to_hex(num) from test ORDER BY num; @@ -71,6 +78,17 @@ select to_hex(num) from test ORDER BY num; 0 1 2 +3 + +query T +select encode(bin_field, 'base64') FROM test WHERE num = 3; +---- +j1DT9g6uNw3b+FyGIZxVEIo1AWU + +query B +select decode(encode(bin_field, 'base64'), 'base64') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3; +---- +true # test for Utf8View support for encode statement ok @@ -101,4 +119,4 @@ FROM test_utf8view; Andrew QW5kcmV3 416e64726577 X WA 58 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Raphael UmFwaGFlbA 5261706861656c R Ug 52 -NULL NULL NULL R Ug 52 +NULL NULL NULL R Ug 52 \ No newline at end of file From ab9f538b321fd62abb50385a9526c5972d3ea716 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Wed, 15 Jan 2025 09:54:13 +0100 Subject: [PATCH 2/2] chore: address comments, add test --- datafusion/functions/src/encoding/inner.rs | 1 + datafusion/sqllogictest/test_files/encoding.slt | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 3d90ff15cbaa..79524629b5ef 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -92,6 +92,7 @@ impl ScalarUDFImpl for EncodeFunc { Ok(match arg_types[0] { Utf8 => Utf8, LargeUtf8 => LargeUtf8, + Utf8View => Utf8, Binary => Utf8, LargeBinary => LargeUtf8, Null => Null, diff --git a/datafusion/sqllogictest/test_files/encoding.slt b/datafusion/sqllogictest/test_files/encoding.slt index 00956bfea95d..be1c5aa40583 100644 --- a/datafusion/sqllogictest/test_files/encoding.slt +++ b/datafusion/sqllogictest/test_files/encoding.slt @@ -119,4 +119,10 @@ FROM test_utf8view; Andrew QW5kcmV3 416e64726577 X WA 58 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Raphael UmFwaGFlbA 5261706861656c R Ug 52 -NULL NULL NULL R Ug 52 \ No newline at end of file +NULL NULL NULL R Ug 52 + +# test for hex digest +query T +select encode(digest('hello', 'sha256'), 'hex'); +---- +2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824 \ No newline at end of file