From 0977ddc4e1abbeecf9acc8f75b67240e2a10e577 Mon Sep 17 00:00:00 2001 From: ted kaemming <65315+tkaemming@users.noreply.github.com> Date: Wed, 11 Sep 2024 10:39:28 -0700 Subject: [PATCH] perf: Optimize `JSONHas` calls when referencing property groups columns (#24897) --- mypy-baseline.txt | 1 - posthog/hogql/printer.py | 60 +++++++++++++++++++----------- posthog/hogql/test/test_printer.py | 11 ++++++ 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index 149978d16ec83..27ce3e7bd8286 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -255,7 +255,6 @@ posthog/hogql/printer.py:0: error: Argument 1 to "lookup_field_by_name" has inco posthog/hogql/printer.py:0: error: Item "TableType" of "TableType | TableAliasType" has no attribute "alias" [union-attr] posthog/hogql/printer.py:0: error: "FieldOrTable" has no attribute "name" [attr-defined] posthog/hogql/printer.py:0: error: "FieldOrTable" has no attribute "name" [attr-defined] -posthog/hogql/printer.py:0: error: Argument 2 to "_get_materialized_column" of "_Printer" has incompatible type "str | int"; expected "str" [arg-type] posthog/hogql/printer.py:0: error: Argument 1 to "_print_identifier" of "_Printer" has incompatible type "str | None"; expected "str" [arg-type] posthog/user_permissions.py:0: error: Incompatible return value type (got "int", expected "Level | None") [return-value] posthog/user_permissions.py:0: error: Incompatible return value type (got "int", expected "Level | None") [return-value] diff --git a/posthog/hogql/printer.py b/posthog/hogql/printer.py index 1f481d071ce1c..2402d786e54cc 100644 --- a/posthog/hogql/printer.py +++ b/posthog/hogql/printer.py @@ -645,7 +645,7 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None: else: assert constant_expr is not None # appease mypy - if we got this far, we should have a constant - property_source = self.__get_materialized_property_source(property_type) + property_source = self.__get_materialized_property_source_for_property_type(property_type) if not isinstance(property_source, PrintableMaterializedPropertyGroupItem): return None @@ -681,7 +681,7 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None: if left_type is None or len(left_type.chain) > 1: return None - property_source = self.__get_materialized_property_source(left_type) + property_source = self.__get_materialized_property_source_for_property_type(left_type) if not isinstance(property_source, PrintableMaterializedPropertyGroupItem): return None @@ -943,21 +943,31 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None: expr_type = expr_type.type return expr_type - if node.name in ("isNull", "isNotNull"): - assert len(node.args) == 1, "expected unary call" - arg_type = resolve_field_type(node.args[0]) - # TODO: can probably optimize chained operations, but will need more thought - if isinstance(arg_type, ast.PropertyType) and len(arg_type.chain) == 1: - property_source = self.__get_materialized_property_source(arg_type) - if not isinstance(property_source, PrintableMaterializedPropertyGroupItem): + match node: + case ast.Call(name="isNull" | "isNotNull" as function_name, args=[field]): + # TODO: can probably optimize chained operations, but will need more thought + field_type = resolve_field_type(field) + if isinstance(field_type, ast.PropertyType) and len(field_type.chain) == 1: + property_source = self.__get_materialized_property_source_for_property_type(field_type) + if not isinstance(property_source, PrintableMaterializedPropertyGroupItem): + return None + + match function_name: + case "isNull": + return f"not({property_source.has_expr})" + case "isNotNull": + return property_source.has_expr + case _: + raise ValueError(f"unexpected node name: {function_name}") + case ast.Call(name="JSONHas", args=[field, ast.Constant(value=property_name)]): + # TODO: can probably optimize chained operations here as well + field_type = resolve_field_type(field) + if not isinstance(field_type, ast.FieldType): return None - if node.name == "isNull": - return f"not({property_source.has_expr})" - elif node.name == "isNotNull": + property_source = self.__get_materialized_property_source(field_type, str(property_name)) + if isinstance(property_source, PrintableMaterializedPropertyGroupItem): return property_source.has_expr - else: - raise ValueError("unexpected node name") return None # nothing to optimize @@ -1242,17 +1252,24 @@ def visit_field_type(self, type: ast.FieldType): return field_sql - def __get_materialized_property_source( + def __get_materialized_property_source_for_property_type( self, type: ast.PropertyType ) -> PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem | None: """ - Find a materialized property for the first part of the property chain. + Find a materialized property for the provided property type. + """ + return self.__get_materialized_property_source(type.field_type, str(type.chain[0])) + + def __get_materialized_property_source( + self, field_type: ast.FieldType, property_name: str + ) -> PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem | None: + """ + Find a materialized property for the provided field type and property name. """ # TODO: It likely makes sense to make this independent of whether or not property groups are used. if self.context.modifiers.materializationMode == "disabled": return None - field_type = type.field_type field = field_type.resolve_database_field(self.context) # check for a materialised column @@ -1269,7 +1286,7 @@ def __get_materialized_property_source( raise QueryError(f"Can't resolve field {field_type.name} on table {table_name}") field_name = cast(Union[Literal["properties"], Literal["person_properties"]], field.name) - materialized_column = self._get_materialized_column(table_name, type.chain[0], field_name) + materialized_column = self._get_materialized_column(table_name, property_name, field_name) if materialized_column: return PrintableMaterializedColumn( self.visit(field_type.table_type), @@ -1279,7 +1296,6 @@ def __get_materialized_property_source( PropertyGroupsMode.ENABLED, PropertyGroupsMode.OPTIMIZED, ): - property_name = str(type.chain[0]) # For now, we're assuming that properties are in either no groups or one group, so just using the # first group returned is fine. If we start putting properties in multiple groups, this should be # revisited to find the optimal set (i.e. smallest set) of groups to read from. @@ -1298,9 +1314,9 @@ def __get_materialized_property_source( ): # :KLUDGE: Legacy person properties handling. Only used within non-HogQL queries, such as insights. if self.context.modifiers.personsOnEventsMode != PersonsOnEventsMode.DISABLED: - materialized_column = self._get_materialized_column("events", str(type.chain[0]), "person_properties") + materialized_column = self._get_materialized_column("events", property_name, "person_properties") else: - materialized_column = self._get_materialized_column("person", str(type.chain[0]), "properties") + materialized_column = self._get_materialized_column("person", property_name, "properties") if materialized_column: return PrintableMaterializedColumn(None, self._print_identifier(materialized_column)) @@ -1310,7 +1326,7 @@ def visit_property_type(self, type: ast.PropertyType): if type.joined_subquery is not None and type.joined_subquery_field_name is not None: return f"{self._print_identifier(type.joined_subquery.alias)}.{self._print_identifier(type.joined_subquery_field_name)}" - materialized_property_source = self.__get_materialized_property_source(type) + materialized_property_source = self.__get_materialized_property_source_for_property_type(type) if materialized_property_source is not None: if isinstance(materialized_property_source, PrintableMaterializedColumn): # TODO: rematerialize all columns to properly support empty strings and "null" string values. diff --git a/posthog/hogql/test/test_printer.py b/posthog/hogql/test/test_printer.py index ee6bc9d084e3d..b7ed4e7437062 100644 --- a/posthog/hogql/test/test_printer.py +++ b/posthog/hogql/test/test_printer.py @@ -515,6 +515,17 @@ def test_property_groups_optimized_null_comparisons(self) -> None: {"hogql_val_0": "key"}, ) + def test_property_groups_optimized_has(self) -> None: + self._test_property_group_comparison( + "JSONHas(properties, 'key')", + "has(events.properties_group_custom, %(hogql_val_0)s)", + {"hogql_val_0": "key"}, + expected_skip_indexes_used={"properties_group_custom_keys_bf"}, + ) + + # TODO: Chained operations/path traversal could be optimized further, but is left alone for now. + self._test_property_group_comparison("JSONHas(properties, 'foo', 'bar')", None) + def test_property_groups_optimized_in_comparisons(self) -> None: # The IN operator works much like equality when the right hand side of the expression is all constants. Like # equality, it also needs to handle the empty string special case.