Skip to content

Commit

Permalink
perf: Optimize JSONHas calls when referencing property groups colum…
Browse files Browse the repository at this point in the history
…ns (#24897)
  • Loading branch information
tkaemming authored Sep 11, 2024
1 parent 959b17f commit 0977ddc
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 23 deletions.
1 change: 0 additions & 1 deletion mypy-baseline.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,6 @@ posthog/hogql/printer.py:0: error: Argument 1 to "lookup_field_by_name" has inco
posthog/hogql/printer.py:0: error: Item "TableType" of "TableType | TableAliasType" has no attribute "alias" [union-attr]
posthog/hogql/printer.py:0: error: "FieldOrTable" has no attribute "name" [attr-defined]
posthog/hogql/printer.py:0: error: "FieldOrTable" has no attribute "name" [attr-defined]
posthog/hogql/printer.py:0: error: Argument 2 to "_get_materialized_column" of "_Printer" has incompatible type "str | int"; expected "str" [arg-type]
posthog/hogql/printer.py:0: error: Argument 1 to "_print_identifier" of "_Printer" has incompatible type "str | None"; expected "str" [arg-type]
posthog/user_permissions.py:0: error: Incompatible return value type (got "int", expected "Level | None") [return-value]
posthog/user_permissions.py:0: error: Incompatible return value type (got "int", expected "Level | None") [return-value]
Expand Down
60 changes: 38 additions & 22 deletions posthog/hogql/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None:
else:
assert constant_expr is not None # appease mypy - if we got this far, we should have a constant

property_source = self.__get_materialized_property_source(property_type)
property_source = self.__get_materialized_property_source_for_property_type(property_type)
if not isinstance(property_source, PrintableMaterializedPropertyGroupItem):
return None

Expand Down Expand Up @@ -681,7 +681,7 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None:
if left_type is None or len(left_type.chain) > 1:
return None

property_source = self.__get_materialized_property_source(left_type)
property_source = self.__get_materialized_property_source_for_property_type(left_type)
if not isinstance(property_source, PrintableMaterializedPropertyGroupItem):
return None

Expand Down Expand Up @@ -943,21 +943,31 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None:
expr_type = expr_type.type
return expr_type

if node.name in ("isNull", "isNotNull"):
assert len(node.args) == 1, "expected unary call"
arg_type = resolve_field_type(node.args[0])
# TODO: can probably optimize chained operations, but will need more thought
if isinstance(arg_type, ast.PropertyType) and len(arg_type.chain) == 1:
property_source = self.__get_materialized_property_source(arg_type)
if not isinstance(property_source, PrintableMaterializedPropertyGroupItem):
match node:
case ast.Call(name="isNull" | "isNotNull" as function_name, args=[field]):
# TODO: can probably optimize chained operations, but will need more thought
field_type = resolve_field_type(field)
if isinstance(field_type, ast.PropertyType) and len(field_type.chain) == 1:
property_source = self.__get_materialized_property_source_for_property_type(field_type)
if not isinstance(property_source, PrintableMaterializedPropertyGroupItem):
return None

match function_name:
case "isNull":
return f"not({property_source.has_expr})"
case "isNotNull":
return property_source.has_expr
case _:
raise ValueError(f"unexpected node name: {function_name}")
case ast.Call(name="JSONHas", args=[field, ast.Constant(value=property_name)]):
# TODO: can probably optimize chained operations here as well
field_type = resolve_field_type(field)
if not isinstance(field_type, ast.FieldType):
return None

if node.name == "isNull":
return f"not({property_source.has_expr})"
elif node.name == "isNotNull":
property_source = self.__get_materialized_property_source(field_type, str(property_name))
if isinstance(property_source, PrintableMaterializedPropertyGroupItem):
return property_source.has_expr
else:
raise ValueError("unexpected node name")

return None # nothing to optimize

Expand Down Expand Up @@ -1242,17 +1252,24 @@ def visit_field_type(self, type: ast.FieldType):

return field_sql

def __get_materialized_property_source(
def __get_materialized_property_source_for_property_type(
self, type: ast.PropertyType
) -> PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem | None:
"""
Find a materialized property for the first part of the property chain.
Find a materialized property for the provided property type.
"""
return self.__get_materialized_property_source(type.field_type, str(type.chain[0]))

def __get_materialized_property_source(
self, field_type: ast.FieldType, property_name: str
) -> PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem | None:
"""
Find a materialized property for the provided field type and property name.
"""
# TODO: It likely makes sense to make this independent of whether or not property groups are used.
if self.context.modifiers.materializationMode == "disabled":
return None

field_type = type.field_type
field = field_type.resolve_database_field(self.context)

# check for a materialised column
Expand All @@ -1269,7 +1286,7 @@ def __get_materialized_property_source(
raise QueryError(f"Can't resolve field {field_type.name} on table {table_name}")
field_name = cast(Union[Literal["properties"], Literal["person_properties"]], field.name)

materialized_column = self._get_materialized_column(table_name, type.chain[0], field_name)
materialized_column = self._get_materialized_column(table_name, property_name, field_name)
if materialized_column:
return PrintableMaterializedColumn(
self.visit(field_type.table_type),
Expand All @@ -1279,7 +1296,6 @@ def __get_materialized_property_source(
PropertyGroupsMode.ENABLED,
PropertyGroupsMode.OPTIMIZED,
):
property_name = str(type.chain[0])
# For now, we're assuming that properties are in either no groups or one group, so just using the
# first group returned is fine. If we start putting properties in multiple groups, this should be
# revisited to find the optimal set (i.e. smallest set) of groups to read from.
Expand All @@ -1298,9 +1314,9 @@ def __get_materialized_property_source(
):
# :KLUDGE: Legacy person properties handling. Only used within non-HogQL queries, such as insights.
if self.context.modifiers.personsOnEventsMode != PersonsOnEventsMode.DISABLED:
materialized_column = self._get_materialized_column("events", str(type.chain[0]), "person_properties")
materialized_column = self._get_materialized_column("events", property_name, "person_properties")
else:
materialized_column = self._get_materialized_column("person", str(type.chain[0]), "properties")
materialized_column = self._get_materialized_column("person", property_name, "properties")
if materialized_column:
return PrintableMaterializedColumn(None, self._print_identifier(materialized_column))

Expand All @@ -1310,7 +1326,7 @@ def visit_property_type(self, type: ast.PropertyType):
if type.joined_subquery is not None and type.joined_subquery_field_name is not None:
return f"{self._print_identifier(type.joined_subquery.alias)}.{self._print_identifier(type.joined_subquery_field_name)}"

materialized_property_source = self.__get_materialized_property_source(type)
materialized_property_source = self.__get_materialized_property_source_for_property_type(type)
if materialized_property_source is not None:
if isinstance(materialized_property_source, PrintableMaterializedColumn):
# TODO: rematerialize all columns to properly support empty strings and "null" string values.
Expand Down
11 changes: 11 additions & 0 deletions posthog/hogql/test/test_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,17 @@ def test_property_groups_optimized_null_comparisons(self) -> None:
{"hogql_val_0": "key"},
)

def test_property_groups_optimized_has(self) -> None:
self._test_property_group_comparison(
"JSONHas(properties, 'key')",
"has(events.properties_group_custom, %(hogql_val_0)s)",
{"hogql_val_0": "key"},
expected_skip_indexes_used={"properties_group_custom_keys_bf"},
)

# TODO: Chained operations/path traversal could be optimized further, but is left alone for now.
self._test_property_group_comparison("JSONHas(properties, 'foo', 'bar')", None)

def test_property_groups_optimized_in_comparisons(self) -> None:
# The IN operator works much like equality when the right hand side of the expression is all constants. Like
# equality, it also needs to handle the empty string special case.
Expand Down

0 comments on commit 0977ddc

Please sign in to comment.