From 467f636459356cb2ec81f9d03d51c8158e719e2b Mon Sep 17 00:00:00 2001 From: thorfour Date: Tue, 10 Oct 2023 11:55:07 -0500 Subject: [PATCH 1/3] optimize arrow project If the projection matches the entire arrow records simply return the whole record --- pqarrow/arrow.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pqarrow/arrow.go b/pqarrow/arrow.go index 4f3973ad8..036f1ee5b 100644 --- a/pqarrow/arrow.go +++ b/pqarrow/arrow.go @@ -1206,5 +1206,11 @@ func Project(r arrow.Record, projections []logicalplan.Expr) arrow.Record { } } + // If the projection matches the entire record, return the record as is. + if len(cols) == r.Schema().NumFields() { + r.Retain() // NOTE: we're creating another reference to this record, so retain it + return r + } + return array.NewRecord(arrow.NewSchema(fields, nil), cols, r.NumRows()) } From e3c7b0e32134d7506af3566eb5c4aca41c2525d1 Mon Sep 17 00:00:00 2001 From: thorfour Date: Tue, 10 Oct 2023 12:30:41 -0500 Subject: [PATCH 2/3] optimize: prehashed column matching Instead of using regexp just use the dyncol and potentially a new NotExpr wrapper --- query/logicalplan/expr.go | 28 ++++++++++++++++++++++++++++ query/logicalplan/optimize.go | 8 +++----- query/logicalplan/optimize_test.go | 4 ++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/query/logicalplan/expr.go b/query/logicalplan/expr.go index b7d5e4762..b16cb9d92 100644 --- a/query/logicalplan/expr.go +++ b/query/logicalplan/expr.go @@ -772,3 +772,31 @@ func (a *AllExpr) MatchColumn(columnName string) bool { return true } func (a *AllExpr) MatchPath(path string) bool { return true } func (a *AllExpr) Computed() bool { return false } func (a *AllExpr) Clone() Expr { return &AllExpr{} } + +type NotExpr struct { + Expr Expr +} + +func Not(expr Expr) *NotExpr { + return &NotExpr{ + Expr: expr, + } +} + +func (n *NotExpr) DataType(*parquet.Schema) (arrow.DataType, error) { return nil, nil } +func (n *NotExpr) Accept(visitor Visitor) bool { + continu := visitor.PreVisit(n) + if !continu { + return false + } + + return visitor.PostVisit(n) +} +func (n *NotExpr) Name() string { return "not " + n.Name() } +func (n *NotExpr) ColumnsUsedExprs() []Expr { + return []Expr{&NotExpr{Expr: n.Expr}} +} +func (n *NotExpr) MatchColumn(columnName string) bool { return !n.Expr.MatchColumn(columnName) } +func (n *NotExpr) MatchPath(path string) bool { return !n.Expr.MatchPath(path) } +func (n *NotExpr) Computed() bool { return false } +func (n *NotExpr) Clone() Expr { return &NotExpr{Expr: n.Expr} } diff --git a/query/logicalplan/optimize.go b/query/logicalplan/optimize.go index 8d3e2ed50..d7fcfa4c0 100644 --- a/query/logicalplan/optimize.go +++ b/query/logicalplan/optimize.go @@ -1,12 +1,10 @@ package logicalplan import ( - "regexp" - "golang.org/x/exp/slices" ) -var hashedMatch = regexp.MustCompile("^hashed.") +var hashedMatch = "hashed" type Optimizer interface { Optimize(plan *LogicalPlan) *LogicalPlan @@ -17,7 +15,7 @@ func DefaultOptimizers() []Optimizer { &AverageAggregationPushDown{}, &PhysicalProjectionPushDown{ defaultProjections: []Expr{ - RegExpNotColumnMatch(hashedMatch), + Not(DynCol(hashedMatch)), }, }, &FilterPushDown{}, @@ -126,7 +124,7 @@ func (p *PhysicalProjectionPushDown) optimize(plan *LogicalPlan, columnsUsedExpr columnsUsedExprs = append(columnsUsedExprs, expr.ColumnsUsedExprs()...) } p.defaultProjections = []Expr{} - columnsUsedExprs = append(columnsUsedExprs, RegExpColumnMatch(hashedMatch)) + columnsUsedExprs = append(columnsUsedExprs, DynCol(hashedMatch)) } if plan.Input != nil { diff --git a/query/logicalplan/optimize_test.go b/query/logicalplan/optimize_test.go index 3835eb48e..b8937cfe1 100644 --- a/query/logicalplan/optimize_test.go +++ b/query/logicalplan/optimize_test.go @@ -35,7 +35,7 @@ func TestOptimizePhysicalProjectionPushDown(t *testing.T) { &Column{ColumnName: "stacktrace"}, &Column{ColumnName: "stacktrace"}, &Column{ColumnName: "value"}, - RegExpColumnMatch(hashedMatch), + DynCol(hashedMatch), &Column{ColumnName: "labels.test"}, }, }, @@ -205,7 +205,7 @@ func TestAllOptimizers(t *testing.T) { &Column{ColumnName: "stacktrace"}, &Column{ColumnName: "stacktrace"}, &Column{ColumnName: "value"}, - RegExpColumnMatch(hashedMatch), + DynCol(hashedMatch), &Column{ColumnName: "labels.test"}, }, Filter: &BinaryExpr{ From 19d06b1a58bed25ccd3b3f096fe18ca0e94894fc Mon Sep 17 00:00:00 2001 From: thorfour Date: Tue, 10 Oct 2023 12:35:24 -0500 Subject: [PATCH 3/3] whoops --- query/logicalplan/expr.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query/logicalplan/expr.go b/query/logicalplan/expr.go index b16cb9d92..d8c24fdc4 100644 --- a/query/logicalplan/expr.go +++ b/query/logicalplan/expr.go @@ -792,7 +792,7 @@ func (n *NotExpr) Accept(visitor Visitor) bool { return visitor.PostVisit(n) } -func (n *NotExpr) Name() string { return "not " + n.Name() } +func (n *NotExpr) Name() string { return "not " + n.Expr.Name() } func (n *NotExpr) ColumnsUsedExprs() []Expr { return []Expr{&NotExpr{Expr: n.Expr}} }