feat(funnels): use hogql for legacy insight funnels api (#27420)
thmsobrmlr authored Jan 15, 2025
1 parent 9bbe988 commit 80b579b
Showing 9 changed files with 2,003 additions and 1,310 deletions.
388 changes: 202 additions & 186 deletions ee/clickhouse/views/test/funnel/__snapshots__/test_clickhouse_funnel.ambr


@@ -2,80 +2,89 @@
 # name: ClickhouseTestUnorderedFunnelGroups.test_unordered_funnel_with_groups
   '''
   /* user_id:0 request:_snapshot_ */
-  SELECT countIf(steps = 1) step_1,
-         countIf(steps = 2) step_2,
-         avg(step_1_average_conversion_time_inner) step_1_average_conversion_time,
-         median(step_1_median_conversion_time_inner) step_1_median_conversion_time
+  SELECT countIf(ifNull(equals(steps, 1), 0)) AS step_1,
+         countIf(ifNull(equals(steps, 2), 0)) AS step_2,
+         avg(step_1_average_conversion_time_inner) AS step_1_average_conversion_time,
+         median(step_1_median_conversion_time_inner) AS step_1_median_conversion_time
   FROM
-    (SELECT aggregation_target,
-            steps,
-            avg(step_1_conversion_time) step_1_average_conversion_time_inner,
-            median(step_1_conversion_time) step_1_median_conversion_time_inner
+    (SELECT aggregation_target AS aggregation_target,
+            steps AS steps,
+            avg(step_1_conversion_time) AS step_1_average_conversion_time_inner,
+            median(step_1_conversion_time) AS step_1_median_conversion_time_inner
     FROM
-      (SELECT aggregation_target,
-              steps,
-              max(steps) over (PARTITION BY aggregation_target) as max_steps,
-              step_1_conversion_time
+      (SELECT aggregation_target AS aggregation_target,
+              steps AS steps,
+              max(steps) OVER (PARTITION BY aggregation_target) AS max_steps,
+              step_1_conversion_time AS step_1_conversion_time
       FROM
-        (SELECT *,
-                arraySort([latest_0,latest_1]) as event_times,
-                arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 1, 0), 1]) AS steps ,
-                arraySort([latest_0,latest_1]) as conversion_times,
-                if(isNotNull(conversion_times[2])
-                   AND conversion_times[2] <= conversion_times[1] + INTERVAL 14 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time
+        (SELECT aggregation_target AS aggregation_target,
+                timestamp AS timestamp,
+                step_0 AS step_0,
+                latest_0 AS latest_0,
+                step_1 AS step_1,
+                latest_1 AS latest_1,
+                arraySort([latest_0, latest_1]) AS event_times,
+                arraySum([if(and(ifNull(less(latest_0, latest_1), 0), ifNull(lessOrEquals(latest_1, plus(toTimeZone(latest_0, 'UTC'), toIntervalDay(14))), 0)), 1, 0), 1]) AS steps,
+                arraySort([latest_0, latest_1]) AS conversion_times,
+                if(and(isNotNull(conversion_times[2]), ifNull(lessOrEquals(conversion_times[2], plus(toTimeZone(conversion_times[1], 'UTC'), toIntervalDay(14))), 0)), dateDiff('second', conversion_times[1], conversion_times[2]), NULL) AS step_1_conversion_time
         FROM
-          (SELECT aggregation_target, timestamp, step_0,
-                  latest_0,
-                  step_1,
-                  min(latest_1) over (PARTITION by aggregation_target
-                                      ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1
+          (SELECT aggregation_target AS aggregation_target,
+                  timestamp AS timestamp,
+                  step_0 AS step_0,
+                  latest_0 AS latest_0,
+                  step_1 AS step_1,
+                  min(latest_1) OVER (PARTITION BY aggregation_target
+                                      ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) AS latest_1
           FROM
-            (SELECT e.timestamp as timestamp,
-                    e."$group_0" as aggregation_target,
-                    if(event = 'user signed up', 1, 0) as step_0,
-                    if(step_0 = 1, timestamp, null) as latest_0,
-                    if(event = 'paid', 1, 0) as step_1,
-                    if(step_1 = 1, timestamp, null) as latest_1
-             FROM events e
-             WHERE team_id = 99999
-               AND event IN ['paid', 'user signed up']
-               AND toTimeZone(timestamp, 'UTC') >= toDateTime('2020-01-01 00:00:00', 'UTC')
-               AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-14 23:59:59', 'UTC')
-               AND (NOT has([''], "$group_0"))
-               AND (step_0 = 1
-                    OR step_1 = 1) ))
-          WHERE step_0 = 1
-          UNION ALL SELECT *,
-                           arraySort([latest_0,latest_1]) as event_times,
-                           arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 1, 0), 1]) AS steps ,
-                           arraySort([latest_0,latest_1]) as conversion_times,
-                           if(isNotNull(conversion_times[2])
-                              AND conversion_times[2] <= conversion_times[1] + INTERVAL 14 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time
+            (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp,
+                    e.`$group_0` AS aggregation_target,
+                    if(equals(e.event, 'user signed up'), 1, 0) AS step_0,
+                    if(ifNull(equals(step_0, 1), 0), timestamp, NULL) AS latest_0,
+                    if(equals(e.event, 'paid'), 1, 0) AS step_1,
+                    if(ifNull(equals(step_1, 1), 0), timestamp, NULL) AS latest_1
+             FROM events AS e
+             WHERE and(equals(e.team_id, 99999), and(and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-01 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-14 23:59:59.999999', 6, 'UTC'))), in(e.event, tuple('paid', 'user signed up')), ifNull(notEquals(nullIf(nullIf(e.`$group_0`, ''), 'null'), ''), 1)), or(ifNull(equals(step_0, 1), 0), ifNull(equals(step_1, 1), 0)))))
+          WHERE ifNull(equals(step_0, 1), 0)
+          UNION ALL SELECT aggregation_target AS aggregation_target,
+                           timestamp AS timestamp,
+                           step_0 AS step_0,
+                           latest_0 AS latest_0,
+                           step_1 AS step_1,
+                           latest_1 AS latest_1,
+                           arraySort([latest_0, latest_1]) AS event_times,
+                           arraySum([if(and(ifNull(less(latest_0, latest_1), 0), ifNull(lessOrEquals(latest_1, plus(toTimeZone(latest_0, 'UTC'), toIntervalDay(14))), 0)), 1, 0), 1]) AS steps,
+                           arraySort([latest_0, latest_1]) AS conversion_times,
+                           if(and(isNotNull(conversion_times[2]), ifNull(lessOrEquals(conversion_times[2], plus(toTimeZone(conversion_times[1], 'UTC'), toIntervalDay(14))), 0)), dateDiff('second', conversion_times[1], conversion_times[2]), NULL) AS step_1_conversion_time
          FROM
-            (SELECT aggregation_target, timestamp, step_0,
-                    latest_0,
-                    step_1,
-                    min(latest_1) over (PARTITION by aggregation_target
-                                        ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1
+            (SELECT aggregation_target AS aggregation_target,
+                    timestamp AS timestamp,
+                    step_0 AS step_0,
+                    latest_0 AS latest_0,
+                    step_1 AS step_1,
+                    min(latest_1) OVER (PARTITION BY aggregation_target
+                                        ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) AS latest_1
            FROM
-              (SELECT e.timestamp as timestamp,
-                      e."$group_0" as aggregation_target,
-                      if(event = 'paid', 1, 0) as step_0,
-                      if(step_0 = 1, timestamp, null) as latest_0,
-                      if(event = 'user signed up', 1, 0) as step_1,
-                      if(step_1 = 1, timestamp, null) as latest_1
-               FROM events e
-               WHERE team_id = 99999
-                 AND event IN ['paid', 'user signed up']
-                 AND toTimeZone(timestamp, 'UTC') >= toDateTime('2020-01-01 00:00:00', 'UTC')
-                 AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-14 23:59:59', 'UTC')
-                 AND (NOT has([''], "$group_0"))
-                 AND (step_0 = 1
-                      OR step_1 = 1) ))
-            WHERE step_0 = 1 ))
+              (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp,
+                      e.`$group_0` AS aggregation_target,
+                      if(equals(e.event, 'paid'), 1, 0) AS step_0,
+                      if(ifNull(equals(step_0, 1), 0), timestamp, NULL) AS latest_0,
+                      if(equals(e.event, 'user signed up'), 1, 0) AS step_1,
+                      if(ifNull(equals(step_1, 1), 0), timestamp, NULL) AS latest_1
+               FROM events AS e
+               WHERE and(equals(e.team_id, 99999), and(and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-01 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-14 23:59:59.999999', 6, 'UTC'))), in(e.event, tuple('paid', 'user signed up')), ifNull(notEquals(nullIf(nullIf(e.`$group_0`, ''), 'null'), ''), 1)), or(ifNull(equals(step_0, 1), 0), ifNull(equals(step_1, 1), 0)))))
+            WHERE ifNull(equals(step_0, 1), 0)))
       GROUP BY aggregation_target,
                steps
-      HAVING steps = max(max_steps))
+      HAVING ifNull(equals(steps, max(max_steps)), isNull(steps)
+             and isNull(max(max_steps))))
+  LIMIT 100 SETTINGS readonly=2,
+                     max_execution_time=60,
+                     allow_experimental_object_type=1,
+                     format_csv_allow_double_quotes=0,
+                     max_ast_elements=4000000,
+                     max_expanded_ast_elements=4000000,
+                     max_bytes_before_external_group_by=23622320128,
+                     allow_experimental_analyzer=1
   '''
 # ---
 # name: ClickhouseTestUnorderedFunnelGroups.test_unordered_funnel_with_groups.1
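The snapshot churn above is mechanical rather than behavioral: the HogQL printer aliases every projected column explicitly (steps AS steps), uses toDateTime64 date bounds, and wraps comparisons so that a NULL operand compares as false instead of propagating. A rough Python analogue of the ifNull(equals(steps, 1), 0) pattern — an illustration only, not code from this commit:

from typing import Optional

def null_safe_equals(a: Optional[int], b: Optional[int]) -> bool:
    # In ClickHouse SQL, `a = b` evaluates to NULL when either operand is NULL;
    # HogQL emits ifNull(equals(a, b), 0) so that NULL is coerced to 0 (false)
    # rather than silently dropping rows from countIf() and WHERE filters.
    if a is None or b is None:
        return False
    return a == b

assert null_safe_equals(1, 1) is True
assert null_safe_equals(None, 1) is False  # NULL comparison coerced to false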
25 changes: 2 additions & 23 deletions ee/clickhouse/views/test/funnel/test_clickhouse_funnel.py
@@ -5,7 +5,6 @@
 from ee.clickhouse.views.test.funnel.util import (
     EventPattern,
     FunnelRequest,
-    get_funnel_actors_ok,
     get_funnel_ok,
 )
 from posthog.constants import INSIGHT_FUNNELS
@@ -103,10 +102,6 @@ def test_funnel_aggregation_with_groups(self):
         assert result["paid"]["count"] == 1
         assert result["paid"]["average_conversion_time"] == 86400
 
-        actors = get_funnel_actors_ok(self.client, result["user signed up"]["converted_people_url"])
-        actor_ids = [str(val["id"]) for val in actors]
-        assert actor_ids == ["org:5", "org:6"]
-
     @snapshot_clickhouse_queries
     def test_funnel_group_aggregation_with_groups_entity_filtering(self):
         self._create_groups()
@@ -160,10 +155,6 @@ def test_funnel_group_aggregation_with_groups_entity_filtering(self):
         assert result["paid"]["count"] == 1
         assert result["paid"]["average_conversion_time"] == 86400
 
-        actors = get_funnel_actors_ok(self.client, result["user signed up"]["converted_people_url"])
-        actor_ids = [str(val["id"]) for val in actors]
-        assert actor_ids == ["org:5"]
-
     @snapshot_clickhouse_queries
     def test_funnel_with_groups_entity_filtering(self):
         self._create_groups()
@@ -194,7 +185,7 @@ def test_funnel_with_groups_entity_filtering(self):
                 },
             ]
         }
-        created_people = journeys_for(events_by_person, self.team)
+        journeys_for(events_by_person, self.team)
 
         params = FunnelRequest(
             events=json.dumps(
@@ -219,11 +210,6 @@ def test_funnel_with_groups_entity_filtering(self):
         assert result["paid"]["count"] == 1
         assert result["paid"]["average_conversion_time"] == 86400
 
-        actors = get_funnel_actors_ok(self.client, result["user signed up"]["converted_people_url"])
-        actor_ids = [str(val["id"]) for val in actors]
-
-        assert actor_ids == sorted([str(created_people["user_1"].uuid)])
-
     @snapshot_clickhouse_queries
     def test_funnel_with_groups_global_filtering(self):
         self._create_groups()
@@ -256,7 +242,7 @@ def test_funnel_with_groups_global_filtering(self):
                 },
             ],
         }
-        created_people = journeys_for(events_by_person, self.team)
+        journeys_for(events_by_person, self.team)
 
         params = FunnelRequest(
             events=json.dumps(
@@ -284,10 +270,3 @@
 
         assert result["user signed up"]["count"] == 1
         assert result["paid"]["count"] == 0
-
-        actors = get_funnel_actors_ok(self.client, result["user signed up"]["converted_people_url"])
-        actor_ids = [str(val["id"]) for val in actors]
-
-        assert actor_ids == sorted([str(created_people["user_1"].uuid)])
-
-    # TODO: move all tests
@@ -5,7 +5,6 @@
 from ee.clickhouse.views.test.funnel.util import (
     EventPattern,
     FunnelRequest,
-    get_funnel_actors_ok,
     get_funnel_ok,
 )
 from posthog.constants import INSIGHT_FUNNELS
@@ -100,6 +99,3 @@ def test_unordered_funnel_with_groups(self):
         assert result["Completed 1 step"]["count"] == 2
         assert result["Completed 2 steps"]["count"] == 1
         assert result["Completed 2 steps"]["average_conversion_time"] == 86400
-
-        actors = get_funnel_actors_ok(self.client, result["Completed 1 step"]["converted_people_url"])
-        assert len(actors) == 2
7 changes: 0 additions & 7 deletions ee/clickhouse/views/test/funnel/util.py
@@ -59,13 +59,6 @@ def get_funnel_ok(client: Client, team_id: int, request: FunnelRequest) -> dict[
     return final
 
 
-def get_funnel_actors_ok(client: Client, url: str):
-    response = client.get(url)
-
-    assert response.status_code == 200, response.content
-    return response.json()["results"][0]["people"]
-
-
 def get_funnel_correlation(client: Client, team_id: int, request: FunnelCorrelationRequest):
     return client.get(
         f"/api/projects/{team_id}/insights/funnel/correlation",
24 changes: 23 additions & 1 deletion posthog/api/insight.py
@@ -4,6 +4,7 @@
 from typing import Any, Optional, Union, cast
 
 import posthoganalytics
+from pydantic import BaseModel
 import structlog
 from django.db import transaction
 from django.db.models import Count, Prefetch, QuerySet
@@ -1097,10 +1098,17 @@ def funnel(self, request: request.Request, *args: Any, **kwargs: Any) -> Respons
         timings = HogQLTimings()
         try:
             with timings.measure("calculate"):
-                funnel = self.calculate_funnel(request)
+                query_method = get_query_method(request=request, team=self.team)
+                if query_method == "hogql":
+                    funnel = self.calculate_funnel_hogql(request)
+                else:
+                    funnel = self.calculate_funnel(request)
+
         except ExposedHogQLError as e:
             raise ValidationError(str(e))
 
+        if isinstance(funnel["result"], BaseModel):
+            funnel["result"] = funnel["result"].model_dump()
         funnel["result"] = protect_old_clients_from_multi_property_default(request.data, funnel["result"])
         funnel["timings"] = [val.model_dump() for val in timings.to_list()]
 
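The dispatch above keeps the endpoint's contract stable: get_query_method chooses the backend per request and team, and because the HogQL query runner returns pydantic models where the legacy path produced plain dicts, the result is model_dump()-ed before the existing post-processing runs. A hypothetical pytest-style check of that contract (client and team_id fixtures assumed from the Django test harness; not code from this commit):

import json

def test_funnel_endpoint_contract(client, team_id):
    # Hypothetical sketch: the legacy funnels endpoint keeps its request and
    # response shape while the calculation now runs through HogQL.
    params = {
        "insight": "FUNNELS",
        "events": json.dumps(
            [
                {"id": "user signed up", "type": "events", "order": 0},
                {"id": "paid", "type": "events", "order": 1},
            ]
        ),
    }
    response = client.post(f"/api/projects/{team_id}/insights/funnel/", params)
    assert response.status_code == 200
    assert "result" in response.json()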
@@ -1128,6 +1136,20 @@ def calculate_funnel(self, request: request.Request) -> dict[str, Any]:
             "timezone": team.timezone,
         }
 
+    @cached_by_filters
+    def calculate_funnel_hogql(self, request: request.Request) -> dict[str, Any]:
+        team = self.team
+        filter = Filter(request=request, team=team)
+        filter = filter.shallow_clone(overrides={"insight": "FUNNELS"})
+        query = filter_to_query(filter.to_dict())
+        query_runner = get_query_runner(query, team, limit_context=None)
+
+        # we use the legacy caching mechanism (@cached_by_filters decorator), no need to cache in the query runner
+        result = query_runner.run(execution_mode=ExecutionMode.CALCULATE_BLOCKING_ALWAYS)
+        assert isinstance(result, schema.CachedFunnelsQueryResponse)
+
+        return {"result": result.results, "timezone": team.timezone}
+
     # ******************************************
     # /projects/:id/insights/retention
     # params:
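calculate_funnel_hogql converts at the boundary only: the legacy Filter is serialized to a dict, filter_to_query maps that dict onto a pydantic FunnelsQuery, and get_query_runner executes it, with caching still handled by the legacy @cached_by_filters decorator. A minimal sketch of the conversion step, assuming filter_to_query's import path from PostHog's legacy-compatibility package and an illustrative filter dict:

# Assumed import path for the converter that calculate_funnel_hogql calls above.
from posthog.hogql_queries.legacy_compatibility.filter_to_query import filter_to_query

legacy_filter = {
    "insight": "FUNNELS",
    "events": [
        {"id": "user signed up", "type": "events", "order": 0},
        {"id": "paid", "type": "events", "order": 1},
    ],
    "funnel_window_days": 14,
}

# Maps the legacy filter shape onto a pydantic query model (a FunnelsQuery here),
# which a query runner can then execute.
query = filter_to_query(legacy_filter)
print(type(query).__name__)  # expected: FunnelsQuery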