Skip to content

Commit

Permalink
feat(web-analytics): Improve session v2 double-event count workaround (#24402)
Browse files Browse the repository at this point in the history

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
robbie-c and github-actions[bot] authored Aug 19, 2024
1 parent 15d9a07 commit 3a0d758
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 13 deletions.
18 changes: 18 additions & 0 deletions posthog/clickhouse/migrations/0077_sessions_v2_faster_bounce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions
from posthog.models.raw_sessions.migrations import (
WRITABLE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL,
DISTRIBUTED_RAW_SESSIONS_ADD_EVENT_COUNT_SESSION_REPLAY_EVENTS_TABLE_SQL,
BASE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL,
)
from posthog.models.raw_sessions.sql import DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL, RAW_SESSIONS_TABLE_MV_SQL

# Migration steps. Each builder is called to render its SQL and the result is
# wrapped by run_sql_with_exceptions; the tuple order is the execution order.
operations = [
    run_sql_with_exceptions(build_sql())
    for build_sql in (
        # 1. drop the materialized view, so the target tables stop receiving rows
        DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL,
        # 2. with the view gone, add the new column to each target table
        WRITABLE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL,
        DISTRIBUTED_RAW_SESSIONS_ADD_EVENT_COUNT_SESSION_REPLAY_EVENTS_TABLE_SQL,
        BASE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL,
        # 3. recreate the materialized view
        RAW_SESSIONS_TABLE_MV_SQL,
    )
]
9 changes: 9 additions & 0 deletions posthog/clickhouse/test/__snapshots__/test_schema.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -1758,6 +1758,8 @@
autocapture_uniq AggregateFunction(uniq, Nullable(UUID)),
screen_count SimpleAggregateFunction(sum, Int64),
screen_uniq AggregateFunction(uniq, Nullable(UUID)),
-- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these
page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)),

-- replay
maybe_has_session_replay SimpleAggregateFunction(max, Bool) -- will be written False to by the events table mv and True to by the replay table mv
Expand Down Expand Up @@ -1831,6 +1833,7 @@
uniqState(if(event='$autocapture', uuid, NULL)) as autocapture_uniq,
sumIf(1, event='$screen') as screen_count,
uniqState(if(event='$screen', uuid, NULL)) as screen_uniq,
uniqUpToState(1)(if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to,

-- replay
false as maybe_has_session_replay
Expand Down Expand Up @@ -2419,6 +2422,8 @@
autocapture_uniq AggregateFunction(uniq, Nullable(UUID)),
screen_count SimpleAggregateFunction(sum, Int64),
screen_uniq AggregateFunction(uniq, Nullable(UUID)),
-- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these
page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)),

-- replay
maybe_has_session_replay SimpleAggregateFunction(max, Bool) -- will be written False to by the events table mv and True to by the replay table mv
Expand Down Expand Up @@ -2718,6 +2723,8 @@
autocapture_uniq AggregateFunction(uniq, Nullable(UUID)),
screen_count SimpleAggregateFunction(sum, Int64),
screen_uniq AggregateFunction(uniq, Nullable(UUID)),
-- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these
page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)),

-- replay
maybe_has_session_replay SimpleAggregateFunction(max, Bool) -- will be written False to by the events table mv and True to by the replay table mv
Expand Down Expand Up @@ -3526,6 +3533,8 @@
autocapture_uniq AggregateFunction(uniq, Nullable(UUID)),
screen_count SimpleAggregateFunction(sum, Int64),
screen_uniq AggregateFunction(uniq, Nullable(UUID)),
-- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these
page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)),

-- replay
maybe_has_session_replay SimpleAggregateFunction(max, Bool) -- will be written False to by the events table mv and True to by the replay table mv
Expand Down
2 changes: 2 additions & 0 deletions posthog/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from infi.clickhouse_orm import Database

from posthog.client import sync_execute
from posthog.models.raw_sessions.sql import TRUNCATE_RAW_SESSIONS_TABLE_SQL
from posthog.test.base import PostHogTestCase, run_clickhouse_statement_in_parallel


Expand Down Expand Up @@ -89,6 +90,7 @@ def reset_clickhouse_tables():
TRUNCATE_PERFORMANCE_EVENTS_TABLE_SQL,
TRUNCATE_CHANNEL_DEFINITION_TABLE_SQL,
TRUNCATE_SESSIONS_TABLE_SQL(),
TRUNCATE_RAW_SESSIONS_TABLE_SQL(),
TRUNCATE_HEATMAPS_TABLE_SQL(),
]

Expand Down
31 changes: 31 additions & 0 deletions posthog/models/raw_sessions/migrations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from django.conf import settings

from posthog.models.raw_sessions.sql import RAW_SESSIONS_DATA_TABLE, TABLE_BASE_NAME

# ALTER TABLE template that adds the `page_screen_autocapture_uniq_up_to`
# aggregate-state column. The `{table_name}` and `{cluster}` placeholders are
# filled in with str.format by the per-table builders defined in this module.
# `IF NOT EXISTS` makes the statement safe to re-run against a table that
# already has the column.
# uniqUpTo(1) counts distinct values exactly up to 1 and reports 2 for anything
# above that, which is enough to tell "fewer than 2" from "2 or more".
ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL = """
ALTER TABLE {table_name} on CLUSTER '{cluster}'
ADD COLUMN IF NOT EXISTS
page_screen_autocapture_uniq_up_to
AggregateFunction(uniqUpTo(1), Nullable(UUID))
"""

def BASE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL() -> str:
    """Render the ADD COLUMN statement for the table named by ``TABLE_BASE_NAME``.

    The cluster name is read from Django settings at call time, not at import.
    """
    return ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL.format(
        cluster=settings.CLICKHOUSE_CLUSTER,
        table_name=TABLE_BASE_NAME,
    )

def WRITABLE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL() -> str:
    """Render the ADD COLUMN statement for the ``writable_raw_sessions`` table.

    The cluster name is read from Django settings at call time, not at import.
    """
    return ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL.format(
        cluster=settings.CLICKHOUSE_CLUSTER,
        table_name="writable_raw_sessions",
    )

# NOTE(review): the identifier mentions "EVENT_COUNT_SESSION_REPLAY_EVENTS", but
# the statement built here adds the page_screen_autocapture_uniq_up_to column,
# like its siblings. The name looks copy-pasted from another migration; it is
# kept unchanged because other modules import it under this name.
def DISTRIBUTED_RAW_SESSIONS_ADD_EVENT_COUNT_SESSION_REPLAY_EVENTS_TABLE_SQL() -> str:
    """Render the ADD COLUMN statement for the table returned by ``RAW_SESSIONS_DATA_TABLE()``.

    The cluster name is read from Django settings at call time, not at import.
    """
    return ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL.format(
        cluster=settings.CLICKHOUSE_CLUSTER,
        table_name=RAW_SESSIONS_DATA_TABLE(),
    )
6 changes: 5 additions & 1 deletion posthog/models/raw_sessions/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
lambda: f"DROP TABLE IF EXISTS {RAW_SESSIONS_DATA_TABLE()} ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
)
DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL = (
lambda: f"DROP MATERIALISED VIEW IF EXISTS {TABLE_BASE_NAME}_mv ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
lambda: f"DROP TABLE IF EXISTS {TABLE_BASE_NAME}_mv ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
)
DROP_RAW_SESSION_VIEW_SQL = (
lambda: f"DROP VIEW IF EXISTS {TABLE_BASE_NAME}_v ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
Expand Down Expand Up @@ -94,6 +94,8 @@
autocapture_uniq AggregateFunction(uniq, Nullable(UUID)),
screen_count SimpleAggregateFunction(sum, Int64),
screen_uniq AggregateFunction(uniq, Nullable(UUID)),
-- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these
page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)),
-- replay
maybe_has_session_replay SimpleAggregateFunction(max, Bool) -- will be written False to by the events table mv and True to by the replay table mv
Expand Down Expand Up @@ -212,6 +214,7 @@ def source_int_column(column_name: str) -> str:
-- the uniq columns must filter on the same $-prefixed event names as their matching count columns
initializeAggregation('uniqState', if(event='$autocapture', uuid, NULL)) as autocapture_uniq,
if(event='$screen', 1, 0) as screen_count,
initializeAggregation('uniqState', if(event='$screen', uuid, NULL)) as screen_uniq,
initializeAggregation('uniqUpToState(1)', if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to,
-- replay
false as maybe_has_session_replay
Expand Down Expand Up @@ -318,6 +321,7 @@ def source_int_column(column_name: str) -> str:
uniqState(if(event='$autocapture', uuid, NULL)) as autocapture_uniq,
sumIf(1, event='$screen') as screen_count,
uniqState(if(event='$screen', uuid, NULL)) as screen_uniq,
uniqUpToState(1)(if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to,
-- replay
false as maybe_has_session_replay
Expand Down
2 changes: 1 addition & 1 deletion posthog/models/sessions/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
lambda: f"DROP TABLE IF EXISTS {SESSIONS_DATA_TABLE()} ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
)
DROP_SESSION_MATERIALIZED_VIEW_SQL = (
lambda: f"DROP MATERIALISED VIEW IF EXISTS {TABLE_BASE_NAME}_mv ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
lambda: f"DROP TABLE IF EXISTS {TABLE_BASE_NAME}_mv ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"
)
DROP_SESSION_VIEW_SQL = lambda: f"DROP VIEW IF EXISTS {TABLE_BASE_NAME}_v ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'"

Expand Down
39 changes: 28 additions & 11 deletions posthog/test/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,14 @@ class ClickhouseDestroyTablesMixin(BaseTest):

def setUp(self):
super().setUp()
run_clickhouse_statement_in_parallel(
[
DROP_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_SESSION_VIEW_SQL(),
DROP_RAW_SESSION_VIEW_SQL(),
]
)
run_clickhouse_statement_in_parallel(
[
DROP_DISTRIBUTED_EVENTS_TABLE_SQL,
Expand All @@ -980,10 +988,6 @@ def setUp(self):
DROP_CHANNEL_DEFINITION_DICTIONARY_SQL,
DROP_SESSION_TABLE_SQL(),
DROP_RAW_SESSION_TABLE_SQL(),
DROP_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_SESSION_VIEW_SQL(),
DROP_RAW_SESSION_VIEW_SQL(),
]
)
run_clickhouse_statement_in_parallel(
Expand All @@ -1003,19 +1007,31 @@ def setUp(self):
DISTRIBUTED_EVENTS_TABLE_SQL(),
DISTRIBUTED_SESSION_RECORDING_EVENTS_TABLE_SQL(),
DISTRIBUTED_SESSION_REPLAY_EVENTS_TABLE_SQL(),
DISTRIBUTED_SESSIONS_TABLE_SQL(),
DISTRIBUTED_RAW_SESSIONS_TABLE_SQL(),
]
)
run_clickhouse_statement_in_parallel(
[
CHANNEL_DEFINITION_DATA_SQL(),
SESSIONS_TABLE_MV_SQL(),
RAW_SESSIONS_TABLE_MV_SQL(),
SESSIONS_VIEW_SQL(),
RAW_SESSIONS_VIEW_SQL(),
DISTRIBUTED_SESSIONS_TABLE_SQL(),
DISTRIBUTED_RAW_SESSIONS_TABLE_SQL(),
]
)

def tearDown(self):
super().tearDown()

run_clickhouse_statement_in_parallel(
[
DROP_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_SESSION_VIEW_SQL(),
DROP_RAW_SESSION_VIEW_SQL(),
]
)
run_clickhouse_statement_in_parallel(
[
DROP_DISTRIBUTED_EVENTS_TABLE_SQL,
Expand All @@ -1029,13 +1045,8 @@ def tearDown(self):
DROP_CHANNEL_DEFINITION_DICTIONARY_SQL,
DROP_SESSION_TABLE_SQL(),
DROP_RAW_SESSION_TABLE_SQL(),
DROP_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_RAW_SESSION_MATERIALIZED_VIEW_SQL(),
DROP_SESSION_VIEW_SQL(),
DROP_RAW_SESSION_VIEW_SQL(),
]
)

run_clickhouse_statement_in_parallel(
[
EVENTS_TABLE_SQL(),
Expand All @@ -1055,6 +1066,12 @@ def tearDown(self):
DISTRIBUTED_SESSION_REPLAY_EVENTS_TABLE_SQL(),
DISTRIBUTED_SESSIONS_TABLE_SQL(),
DISTRIBUTED_RAW_SESSIONS_TABLE_SQL(),
]
)
run_clickhouse_statement_in_parallel(
[
SESSIONS_TABLE_MV_SQL(),
RAW_SESSIONS_TABLE_MV_SQL(),
SESSIONS_VIEW_SQL(),
RAW_SESSIONS_VIEW_SQL(),
CHANNEL_DEFINITION_DATA_SQL(),
Expand Down

0 comments on commit 3a0d758

Please sign in to comment.