-
-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add branch_id
to distinguish between reusable branches and pipeline breakers
#883
base: main
Are you sure you want to change the base?
Changes from 15 commits
fae5c6e
d598734
2345cd4
d88270d
948cd83
045bbef
93e0d28
7ddda99
8c2d977
7184bcf
5ac9394
fb2aa9f
061de6f
e486590
d28f906
366415a
ee523ea
369c142
5ee43dd
7379a01
68e048c
f79155a
9fcc246
391d8f6
4326a25
1b6b090
c9e0384
cc120ee
9257b72
12432da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,7 @@ | |
from tlz import merge_sorted, partition, unique | ||
|
||
from dask_expr import _core as core | ||
from dask_expr._core import BranchId | ||
from dask_expr._util import ( | ||
_calc_maybe_new_divisions, | ||
_convert_to_list, | ||
|
@@ -476,9 +477,9 @@ def _args(self) -> list: | |
if self._keyword_only: | ||
args = [ | ||
self.operand(p) for p in self._parameters if p not in self._keyword_only | ||
] + self.operands[len(self._parameters) :] | ||
] + self.argument_operands[len(self._parameters) :] | ||
return args | ||
return self.operands | ||
return self.argument_operands | ||
|
||
def _broadcast_dep(self, dep: Expr): | ||
# Checks if a dependency should be broadcasted to | ||
|
@@ -562,7 +563,7 @@ def _broadcast_dep(self, dep: Expr): | |
|
||
@property | ||
def args(self): | ||
return [self.frame] + self.operands[len(self._parameters) :] | ||
return [self.frame] + self.argument_operands[len(self._parameters) :] | ||
|
||
@functools.cached_property | ||
def _meta(self): | ||
|
@@ -658,7 +659,7 @@ def __str__(self): | |
|
||
@functools.cached_property | ||
def args(self): | ||
return self.operands[len(self._parameters) :] | ||
return self.argument_operands[len(self._parameters) :] | ||
|
||
@functools.cached_property | ||
def _dfs(self): | ||
|
@@ -725,7 +726,7 @@ def _meta(self): | |
meta = self.operand("meta") | ||
args = [self.frame._meta] + [ | ||
arg._meta if isinstance(arg, Expr) else arg | ||
for arg in self.operands[len(self._parameters) :] | ||
for arg in self.argument_operands[len(self._parameters) :] | ||
] | ||
return _get_meta_map_partitions( | ||
args, | ||
|
@@ -737,7 +738,7 @@ def _meta(self): | |
) | ||
|
||
def _divisions(self): | ||
args = [self.frame] + self.operands[len(self._parameters) :] | ||
args = [self.frame] + self.argument_operands[len(self._parameters) :] | ||
return calc_divisions_for_align(*args) | ||
|
||
def _lower(self): | ||
|
@@ -792,15 +793,15 @@ def args(self): | |
return ( | ||
[self.frame] | ||
+ [self.func, self.before, self.after] | ||
+ self.operands[len(self._parameters) :] | ||
+ self.argument_operands[len(self._parameters) :] | ||
) | ||
|
||
@functools.cached_property | ||
def _meta(self): | ||
meta = self.operand("meta") | ||
args = [self.frame._meta] + [ | ||
arg._meta if isinstance(arg, Expr) else arg | ||
for arg in self.operands[len(self._parameters) :] | ||
for arg in self.argument_operands[len(self._parameters) :] | ||
] | ||
return _get_meta_map_partitions( | ||
args, | ||
|
@@ -1094,7 +1095,11 @@ class Sample(Blockwise): | |
|
||
@functools.cached_property | ||
def _meta(self): | ||
args = [self.operands[0]._meta] + [self.operands[1][0]] + self.operands[2:] | ||
args = ( | ||
[self.operands[0]._meta] | ||
+ [self.operands[1][0]] | ||
+ self.argument_operands[2:] | ||
) | ||
return self.operation(*args) | ||
|
||
def _task(self, index: int): | ||
|
@@ -1696,11 +1701,11 @@ class Assign(Elemwise): | |
|
||
@functools.cached_property | ||
def keys(self): | ||
return self.operands[1::2] | ||
return self.argument_operands[1::2] | ||
|
||
@functools.cached_property | ||
def vals(self): | ||
return self.operands[2::2] | ||
return self.argument_operands[2::2] | ||
|
||
@functools.cached_property | ||
def _meta(self): | ||
|
@@ -1725,7 +1730,7 @@ def _simplify_down(self): | |
if self._check_for_previously_created_column(self.frame): | ||
# don't squash if we are using a column that was previously created | ||
return | ||
return Assign(*self.frame.operands, *self.operands[1:]) | ||
return Assign(*self.frame.argument_operands, *self.operands[1:]) | ||
|
||
def _check_for_previously_created_column(self, child): | ||
input_columns = [] | ||
|
@@ -1753,7 +1758,7 @@ def _simplify_up(self, parent, dependents): | |
if k in columns: | ||
new_args.extend([k, v]) | ||
else: | ||
new_args = self.operands[1:] | ||
new_args = self.argument_operands[1:] | ||
|
||
columns = [col for col in self.frame.columns if col in cols] | ||
return type(parent)( | ||
|
@@ -1778,12 +1783,12 @@ class CaseWhen(Elemwise): | |
|
||
@functools.cached_property | ||
def caselist(self): | ||
c = self.operands[1:] | ||
c = self.argument_operands[1:] | ||
return [(c[i], c[i + 1]) for i in range(0, len(c), 2)] | ||
|
||
@functools.cached_property | ||
def _meta(self): | ||
c = self.operands[1:] | ||
c = self.argument_operands[1:] | ||
caselist = [ | ||
( | ||
meta_nonempty(c[i]._meta) if isinstance(c[i], Expr) else c[i], | ||
|
@@ -2714,9 +2719,11 @@ class _DelayedExpr(Expr): | |
# TODO | ||
_parameters = ["obj"] | ||
|
||
def __init__(self, obj): | ||
def __init__(self, obj, _branch_id=None): | ||
self.obj = obj | ||
self.operands = [obj] | ||
if _branch_id is None: | ||
_branch_id = BranchId(0) | ||
self.operands = [obj, _branch_id] | ||
|
||
def __str__(self): | ||
return f"{type(self).__name__}({str(self.obj)})" | ||
|
@@ -2744,7 +2751,9 @@ def normalize_expression(expr): | |
return expr._name | ||
|
||
|
||
def optimize(expr: Expr, fuse: bool = True) -> Expr: | ||
def optimize( | ||
expr: Expr, fuse: bool = True, common_subplan_elimination: bool = True | ||
) -> Expr: | ||
"""High level query optimization | ||
|
||
This leverages three optimization passes: | ||
|
@@ -2758,24 +2767,37 @@ def optimize(expr: Expr, fuse: bool = True) -> Expr: | |
Input expression to optimize | ||
fuse: | ||
whether or not to turn on blockwise fusion | ||
common_subplan_elimination : bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I'm not mistaken, our default is actually a very radical form of "common subplan elimination". I think this feature is actually quite the opposite and rather something like "replicate common subplan"? I guess other engines are doing it the other way round, aren't they? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I took the name from here: https://docs.pola.rs/py-polars/html/reference/lazyframe/api/polars.LazyFrame.explain.html
And should have read the actual documentation, yes you are correct this should be the other way round There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is also https://en.wikipedia.org/wiki/Common_subexpression_elimination with a simple example =) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Naming is hard. @hendrikmakait any thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry that I'm late to the party. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Regarding Common Subexpression Elimination vs Common Subplan Elimination: I think Common Subexpression Elimination is more common for data systems and compilers/IR-based systems. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interestingly. I'm not entirely sure what the difference is, but from what I understand That is, |
||
whether we want to reuse common subplans that are found in the graph and | ||
are used in self-joins or similar which require all data be held in memory | ||
at some point. Only set this to false if your dataset fits into memory. | ||
|
||
See Also | ||
-------- | ||
simplify | ||
optimize_blockwise_fusion | ||
""" | ||
result = expr | ||
while True: | ||
if common_subplan_elimination: | ||
out = result.rewrite("reuse", cache={}) | ||
else: | ||
out = result | ||
out = out.simplify() | ||
if out._name == result._name or not common_subplan_elimination: | ||
break | ||
result = out | ||
|
||
# Simplify | ||
result = expr.simplify() | ||
result = out | ||
|
||
# Manipulate Expression to make it more efficient | ||
result = result.rewrite(kind="tune") | ||
result = result.rewrite(kind="tune", cache={}) | ||
|
||
# Lower | ||
result = result.lower_completely() | ||
|
||
# Cull | ||
result = result.rewrite(kind="cull") | ||
result = result.rewrite(kind="cull", cache={}) | ||
|
||
# Final graph-specific optimizations | ||
if fuse: | ||
|
@@ -3307,7 +3329,7 @@ def __str__(self): | |
|
||
@functools.cached_property | ||
def args(self): | ||
return self.operands[len(self._parameters) :] | ||
return self.argument_operands[len(self._parameters) :] | ||
|
||
@functools.cached_property | ||
def _dfs(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about just storing this as an attribute? Why does this need to be an operand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having it in the operands makes sure that it is used for
_name
, which is what we mostly care about. If that is overridden and forgotten, then the whole thing won't work properlyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I get that. I'm just concerned this is messing with other things since
operands
are everywhere. If it works I'm fine.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think to "avoid" the kind of confusion I'm thinking about you introduced argument_opearands but this may also be confusing to developers