-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support vectorized append and compare for multi group by #12996
Changes from 1 commit
be6a67d
2cdf05d
04ea2d2
a83c2ea
3df75ac
13c9489
5fd63e8
d4b5820
04f35bb
d215937
2af6ff5
473914a
14f8881
ebbeb5a
dad79c0
1a7c2eb
d79b813
6edc646
150248f
37d68e6
ebd9db9
71c45ce
2f272f2
731723c
a77f516
1830c1a
b6f2d00
6375d93
7979f74
86dcb11
197656b
cc96beb
2c1ec19
fa6343c
41ac655
4f8924e
c9b147a
236b0bc
15aaab1
a0aa7b7
c2088f7
7acfef0
7875d50
41f5f04
7efce58
5cbe3fa
8b23ff3
df81f8f
81f99a8
b7a2443
4b45708
d1b879a
14841db
fd9a71a
8cd581d
75aa1dc
406acb4
7a1ed90
e8c0aaa
2d982a1
e4bd579
14fffb8
d479cc2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,9 @@ pub trait GroupColumn: Send + Sync { | |
fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool; | ||
/// Appends the row at `row` in `array` to this builder | ||
fn append_val(&mut self, array: &ArrayRef, row: usize); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe as a follow on we can consider removing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am a bit worried about if we merge them, some extra There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A good thing to benchmark (as a follow on PR) perhaps |
||
|
||
fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize); | ||
|
||
/// Returns the number of rows stored in this builder | ||
fn len(&self) -> usize; | ||
/// Returns the number of bytes used by this [`GroupColumn`] | ||
|
@@ -113,6 +116,7 @@ impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn | |
self.group_values[lhs_row] == array.as_primitive::<T>().value(rhs_row) | ||
} | ||
|
||
|
||
fn append_val(&mut self, array: &ArrayRef, row: usize) { | ||
// Perf: skip null check if input can't have nulls | ||
if NULLABLE { | ||
|
@@ -128,6 +132,15 @@ impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn | |
} | ||
} | ||
|
||
fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { | ||
if NULLABLE { | ||
self.nulls.append(false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be optimized to append nulls for entire batch instead of per value There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I plan to refactor the interface for supporting input a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (i.e. remove it here and call it in such a way we use https://docs.rs/arrow/latest/arrow/array/struct.BooleanBufferBuilder.html#method.append_n There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I add the 🤔 I guess, it is likely due the new introduced branch of
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To eliminate this extra branch, I think we need to refactor the I am trying it. |
||
self.group_values.push(array.as_primitive::<T>().value(row)); | ||
} else { | ||
self.group_values.push(array.as_primitive::<T>().value(row)); | ||
} | ||
} | ||
|
||
fn len(&self) -> usize { | ||
self.group_values.len() | ||
} | ||
|
@@ -218,6 +231,17 @@ where | |
} | ||
} | ||
|
||
fn append_non_nullable_val_inner<B>(&mut self, array: &ArrayRef, row: usize) | ||
where | ||
B: ByteArrayType, | ||
{ | ||
let arr = array.as_bytes::<B>(); | ||
self.nulls.append(false); | ||
let value: &[u8] = arr.value(row).as_ref(); | ||
self.buffer.append_slice(value); | ||
self.offsets.push(O::usize_as(self.buffer.len())); | ||
} | ||
|
||
fn equal_to_inner<B>(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool | ||
where | ||
B: ByteArrayType, | ||
|
@@ -287,6 +311,27 @@ where | |
}; | ||
} | ||
|
||
fn append_non_nullable_val(&mut self, column: &ArrayRef, row: usize) { | ||
// Sanity array type | ||
match self.output_type { | ||
OutputType::Binary => { | ||
debug_assert!(matches!( | ||
column.data_type(), | ||
DataType::Binary | DataType::LargeBinary | ||
)); | ||
self.append_non_nullable_val_inner::<GenericBinaryType<O>>(column, row) | ||
} | ||
OutputType::Utf8 => { | ||
debug_assert!(matches!( | ||
column.data_type(), | ||
DataType::Utf8 | DataType::LargeUtf8 | ||
)); | ||
self.append_non_nullable_val_inner::<GenericStringType<O>>(column, row) | ||
} | ||
_ => unreachable!("View types should use `ArrowBytesViewMap`"), | ||
}; | ||
} | ||
|
||
fn len(&self) -> usize { | ||
self.offsets.len() - 1 | ||
} | ||
|
@@ -382,7 +427,7 @@ where | |
} | ||
_ => unreachable!("View types should use `ArrowBytesViewMap`"), | ||
} | ||
} | ||
} | ||
} | ||
|
||
/// An implementation of [`GroupColumn`] for binary view and utf8 view types. | ||
|
@@ -482,6 +527,35 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> { | |
self.views.push(view); | ||
} | ||
|
||
fn append_val_non_nullable_inner(&mut self, array: &ArrayRef, row: usize) | ||
where | ||
B: ByteViewType, | ||
{ | ||
let arr = array.as_byte_view::<B>(); | ||
|
||
// Not null row case | ||
self.nulls.append(false); | ||
let value: &[u8] = arr.value(row).as_ref(); | ||
|
||
let value_len = value.len(); | ||
let view = if value_len <= 12 { | ||
make_view(value, 0, 0) | ||
} else { | ||
// Ensure big enough block to hold the value firstly | ||
self.ensure_in_progress_big_enough(value_len); | ||
|
||
// Append value | ||
let buffer_index = self.completed.len(); | ||
let offset = self.in_progress.len(); | ||
self.in_progress.extend_from_slice(value); | ||
|
||
make_view(value, buffer_index as u32, offset as u32) | ||
}; | ||
|
||
// Append view | ||
self.views.push(view); | ||
} | ||
|
||
fn ensure_in_progress_big_enough(&mut self, value_len: usize) { | ||
debug_assert!(value_len > 12); | ||
let require_cap = self.in_progress.len() + value_len; | ||
|
@@ -776,6 +850,10 @@ impl<B: ByteViewType> GroupColumn for ByteViewGroupValueBuilder<B> { | |
fn append_val(&mut self, array: &ArrayRef, row: usize) { | ||
self.append_val_inner(array, row) | ||
} | ||
|
||
fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { | ||
self.append_val_non_nullable_inner(array, row); | ||
} | ||
|
||
fn len(&self) -> usize { | ||
self.views.len() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This
with_capacity
can probably be improved (as a follow on PR) to avoid some smaller allocations