From 9f62d7d4c69baace366db5d85c398223a12e1ac0 Mon Sep 17 00:00:00 2001
From: ilya
Date: Mon, 11 Dec 2023 11:31:21 +0000
Subject: [PATCH] Update large tables statistics (#2146)

# Description

After https://github.com/cowprotocol/services/pull/2135 was deployed in staging, the row count metrics for large tables were not updated for hours. The statistics come from the `pg_class` table, which the database updates on its own schedule according to internal thresholds. To guarantee that the statistics for large tables are refreshed, a new background task triggers `ANALYZE {large_table_name}` once an hour, which takes around 15s in staging. That is still significantly faster than the original `SELECT COUNT(*)` query (150-600s).

# Changes

A new background task runs every hour and executes `ANALYZE` for each large table.

## How to test

Log observation
---
 crates/autopilot/src/database.rs | 37 ++++++++++++++++++++++++++++++--
 crates/autopilot/src/run.rs      |  5 +----
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/crates/autopilot/src/database.rs b/crates/autopilot/src/database.rs
index f641263cb5..54642ea9c8 100644
--- a/crates/autopilot/src/database.rs
+++ b/crates/autopilot/src/database.rs
@@ -12,8 +12,9 @@ mod quotes;
 pub mod recent_settlements;
 
 use {
-    sqlx::{PgConnection, PgPool},
+    sqlx::{Executor, PgConnection, PgPool},
     std::time::Duration,
+    tracing::Instrument,
 };
 
 #[derive(Debug, Clone)]
@@ -50,6 +51,15 @@ impl Postgres {
 
         Ok(())
     }
+
+    pub async fn update_large_tables_stats(&self) -> sqlx::Result<()> {
+        for &table in database::LARGE_TABLES {
+            let mut ex = self.0.acquire().await?;
+            analyze_table(&mut ex, table).await?;
+        }
+
+        Ok(())
+    }
 }
 
 async fn count_rows_in_table(ex: &mut PgConnection, table: &str) -> sqlx::Result<i64> {
@@ -62,6 +72,11 @@ async fn estimate_rows_in_table(ex: &mut PgConnection, table: &str) -> sqlx::Res
     sqlx::query_scalar(&query).fetch_one(ex).await
 }
 
+async fn analyze_table(ex: &mut PgConnection, table: &str) -> sqlx::Result<()> {
+    let query = format!("ANALYZE {table};");
+    ex.execute(sqlx::query(&query)).await.map(|_| ())
+}
+
 async fn count_unused_app_data(ex: &mut PgConnection) -> sqlx::Result<i64> {
     let query = r#"
         SELECT
@@ -99,7 +114,16 @@ impl Metrics {
     }
 }
 
-pub async fn database_metrics(db: Postgres) -> ! {
+pub fn run_database_metrics_work(db: Postgres) {
+    let span = tracing::info_span!("database_metrics");
+    // Spawn the task for updating large table statistics
+    tokio::spawn(update_large_tables_stats(db.clone()).instrument(span.clone()));
+
+    // Spawn the task for database metrics
+    tokio::task::spawn(database_metrics(db).instrument(span));
+}
+
+async fn database_metrics(db: Postgres) -> ! {
     loop {
         if let Err(err) = db.update_database_metrics().await {
             tracing::error!(?err, "failed to update table rows metric");
@@ -108,6 +132,15 @@
     }
 }
 
+async fn update_large_tables_stats(db: Postgres) -> ! {
+    loop {
+        if let Err(err) = db.update_large_tables_stats().await {
+            tracing::error!(?err, "failed to update large tables stats");
+        }
+        tokio::time::sleep(Duration::from_secs(60 * 60)).await;
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/autopilot/src/run.rs b/crates/autopilot/src/run.rs
index 5a452f8218..8c294d24b2 100644
--- a/crates/autopilot/src/run.rs
+++ b/crates/autopilot/src/run.rs
@@ -97,10 +97,7 @@ pub async fn run(args: Arguments) {
     assert!(args.shadow.is_none(), "cannot run in shadow mode");
 
     let db = Postgres::new(args.db_url.as_str()).await.unwrap();
-    tokio::task::spawn(
-        crate::database::database_metrics(db.clone())
-            .instrument(tracing::info_span!("database_metrics")),
-    );
+    crate::database::run_database_metrics_work(db.clone());
 
     let http_factory = HttpClientFactory::new(&args.http_client);
     let web3 = shared::ethrpc::web3(
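
Note on testing (not part of the patch): besides watching the logs, the effect of the hourly `ANALYZE` can be checked against the database directly. Below is a minimal sketch, assuming a reachable `DATABASE_URL`, `tokio`/`sqlx` as used elsewhere in the workspace, and `order_events` standing in as a hypothetical entry of `database::LARGE_TABLES`. It reads the `pg_class` row estimate that the large-table metrics are based on and the `last_analyze` timestamp from `pg_stat_user_tables`, which should advance roughly once per hour after this change.

```rust
// Verification sketch only; not part of this patch.
use sqlx::PgPool;

#[tokio::main]
async fn main() -> sqlx::Result<()> {
    // Assumes DATABASE_URL points at the staging database.
    let url = std::env::var("DATABASE_URL").expect("DATABASE_URL must be set");
    let pool = PgPool::connect(&url).await?;

    // Planner row estimate kept in pg_class (reltuples); the large-table row
    // count metric is derived from this statistic rather than COUNT(*).
    let estimate: i64 = sqlx::query_scalar(
        "SELECT reltuples::bigint FROM pg_class WHERE relname = 'order_events'",
    )
    .fetch_one(&pool)
    .await?;

    // Time of the last manual ANALYZE, cast to text to avoid extra sqlx
    // features. After this change it should move forward roughly every hour.
    let last_analyze: Option<String> = sqlx::query_scalar(
        "SELECT last_analyze::text FROM pg_stat_user_tables WHERE relname = 'order_events'",
    )
    .fetch_one(&pool)
    .await?;

    println!("order_events estimate: {estimate}, last_analyze: {last_analyze:?}");
    Ok(())
}
```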