From 5328d088701b3b3706bec529e3f3a1f8db0c3668 Mon Sep 17 00:00:00 2001
From: Martin Beckmann
Date: Wed, 25 Oct 2023 12:06:43 +0200
Subject: [PATCH] Don't reset node during test (#2012)

# Description
Ever since we replaced `hardhat` with `anvil` we have been getting flaky
tests. The issue seems to be related to deadlocks inside `anvil`. It's
possible that these deadlocks are caused by running the same node instance
for all tests and resetting it over and over again.

# Changes
Drop `anvil` from the `docker-compose` file and instead spawn a fresh `anvil`
instance for every `e2e` test, as sketched below.
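Conceptually, each test now owns the full lifecycle of its node. A minimal
sketch of that flow (assuming the new `Node` type is reachable as
`e2e::nodes::Node`; the real plumbing lives in `run` in
`crates/e2e/src/setup/mod.rs`, see the diff below):

```rust
use e2e::nodes::Node;

#[tokio::test]
#[ignore]
async fn example_test() {
    // Spawn a dedicated `anvil` process on port 8545 just for this test.
    let mut node = Node::new().await;

    // ... run assertions against the fresh chain at http://127.0.0.1:8545 ...

    // Kill the process explicitly; `Drop` only sends SIGKILL without waiting
    // for the process to actually terminate.
    node.kill().await;
}
```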
## How to test
Rerun CI a couple of times and :pray: that we don't see flaky errors?
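Note for local runs: the tests now expect to spawn `anvil` themselves (CI
installs it via `foundry-rs/foundry-toolchain`), and the new `ANVIL_COMMAND`
environment variable can swap in a custom spawn command. A simplified sketch
of how the override is resolved, mirroring the new `spawn_process` below (the
wrapper script path is purely hypothetical):

```rust
#[tokio::main]
async fn main() {
    // e.g. ANVIL_COMMAND=./scripts/anvil-docker.sh cargo test -p e2e local_node -- --ignored
    let command = std::env::var("ANVIL_COMMAND").unwrap_or_else(|_| "anvil".to_string());
    let mut node = tokio::process::Command::new(command)
        .args(["--port", "8545"])
        .stdout(std::process::Stdio::piped())
        .spawn()
        .expect("`anvil` (or the `ANVIL_COMMAND` override) must be spawnable");

    // ... wait for anvil to print "Listening on ..." before using it ...

    node.kill().await.unwrap();
}
```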
---
 .github/workflows/pull-request.yaml |   3 +-
 crates/e2e/src/nodes/forked_node.rs |  40 +---------
 crates/e2e/src/nodes/local_node.rs  |  30 --------
 crates/e2e/src/nodes/mod.rs         | 109 +++++++++++++++++++++++++++-
 crates/e2e/src/setup/mod.rs         |  52 +++++++------
 crates/observe/src/panic_hook.rs    |  13 ++++
 docker-compose.yaml                 |   9 ---
 7 files changed, 153 insertions(+), 103 deletions(-)

diff --git a/.github/workflows/pull-request.yaml b/.github/workflows/pull-request.yaml
index 639e49e13d..39e88fbcf5 100644
--- a/.github/workflows/pull-request.yaml
+++ b/.github/workflows/pull-request.yaml
@@ -66,6 +66,7 @@ jobs:
       CARGO_PROFILE_TEST_DEBUG: 0
     steps:
       - uses: actions/checkout@v3
+      - uses: foundry-rs/foundry-toolchain@v1
       - uses: Swatinem/rust-cache@v2
       # Start the build process in the background. The following cargo test command will automatically
       # wait for the build process to be done before proceeding.
@@ -73,7 +74,7 @@
       - uses: yu-ichiro/spin-up-docker-compose-action@v1
         with:
           file: docker-compose.yaml
-          up-opts: -d db migrations chain
+          up-opts: -d db migrations
       - run: cargo test -p e2e local_node -- --ignored --test-threads 1 --nocapture
 
   test-driver:
diff --git a/crates/e2e/src/nodes/forked_node.rs b/crates/e2e/src/nodes/forked_node.rs
index 1092abb24f..1da85c7406 100644
--- a/crates/e2e/src/nodes/forked_node.rs
+++ b/crates/e2e/src/nodes/forked_node.rs
@@ -1,49 +1,11 @@
 use {
-    super::TestNode,
     ethcontract::H160,
-    reqwest::{IntoUrl, Url},
+    reqwest::Url,
     serde_json::json,
     std::fmt::Debug,
     web3::{api::Namespace, helpers::CallFuture, Transport},
 };
 
-pub struct Forker<T> {
-    forked_node_api: ForkedNodeApi<T>,
-    fork_url: Url,
-}
-
-impl<T: Transport> Forker<T> {
-    pub async fn new(web3: &web3::Web3<T>, solver_address: H160, fork_url: impl IntoUrl) -> Self {
-        let fork_url = fork_url.into_url().expect("Invalid fork URL");
-
-        let forked_node_api = web3.api::<ForkedNodeApi<_>>();
-        forked_node_api
-            .fork(&fork_url)
-            .await
-            .expect("Test network must support anvil_reset");
-
-        forked_node_api
-            .impersonate(&solver_address)
-            .await
-            .expect("Test network must support anvil_impersonateAccount");
-
-        Self {
-            forked_node_api,
-            fork_url,
-        }
-    }
-}
-
-#[async_trait::async_trait(?Send)]
-impl<T: Transport> TestNode for Forker<T> {
-    async fn reset(&self) {
-        self.forked_node_api
-            .fork(&self.fork_url)
-            .await
-            .expect("Test network must support anvil_reset");
-    }
-}
-
 #[derive(Debug, Clone)]
 pub struct ForkedNodeApi<T> {
     transport: T,
diff --git a/crates/e2e/src/nodes/local_node.rs b/crates/e2e/src/nodes/local_node.rs
index 3d7b4a51ce..99231d0f88 100644
--- a/crates/e2e/src/nodes/local_node.rs
+++ b/crates/e2e/src/nodes/local_node.rs
@@ -1,40 +1,10 @@
 use {
-    super::TestNode,
     chrono::{DateTime, Utc},
     ethcontract::{H160, U256},
     std::fmt::Debug,
     web3::{api::Namespace, helpers::CallFuture, Transport},
 };
 
-pub struct Resetter<T> {
-    test_node_api: TestNodeApi<T>,
-    snapshot_id: U256,
-}
-
-impl<T: Transport> Resetter<T> {
-    pub async fn new(web3: &web3::Web3<T>) -> Self {
-        let test_node_api = web3.api::<TestNodeApi<_>>();
-        let snapshot_id = test_node_api
-            .snapshot()
-            .await
-            .expect("Test network must support evm_snapshot");
-        Self {
-            test_node_api,
-            snapshot_id,
-        }
-    }
-}
-
-#[async_trait::async_trait(?Send)]
-impl<T: Transport> TestNode for Resetter<T> {
-    async fn reset(&self) {
-        self.test_node_api
-            .revert(&self.snapshot_id)
-            .await
-            .expect("Test network must support evm_revert");
-    }
-}
-
 #[derive(Debug, Clone)]
 pub struct TestNodeApi<T> {
     transport: T,
diff --git a/crates/e2e/src/nodes/mod.rs b/crates/e2e/src/nodes/mod.rs
index e51603a574..52c124e00e 100644
--- a/crates/e2e/src/nodes/mod.rs
+++ b/crates/e2e/src/nodes/mod.rs
@@ -1,9 +1,112 @@
 pub mod forked_node;
 pub mod local_node;
 
+/// The default node URL that should be used for e2e tests.
 pub const NODE_HOST: &str = "http://127.0.0.1:8545";
 
-#[async_trait::async_trait(?Send)]
-pub trait TestNode {
-    async fn reset(&self);
+/// A blockchain node for development purposes. Dropping this type will
+/// terminate the node.
+pub struct Node {
+    process: Option<tokio::process::Child>,
+}
+
+impl Node {
+    /// Spawns a new node that is forked from the given URL.
+    pub async fn forked(fork: impl reqwest::IntoUrl) -> Self {
+        Self::spawn_process(&["--port", "8545", "--fork-url", fork.as_str()]).await
+    }
+
+    /// Spawns a new local test network with some default parameters.
+    pub async fn new() -> Self {
+        Self::spawn_process(&[
+            "--port",
+            "8545",
+            "--gas-price",
+            "1",
+            "--gas-limit",
+            "10000000",
+            "--base-fee",
+            "0",
+            "--balance",
+            "1000000",
+            "--chain-id",
+            "1",
+            "--timestamp",
+            "1577836800",
+        ])
+        .await
+    }
+
+    /// Spawns a new node instance using the given list of arguments.
+    async fn spawn_process(args: &[&str]) -> Self {
+        use tokio::io::AsyncBufReadExt as _;
+
+        // Allow using custom logic to spawn `anvil` by setting `ANVIL_COMMAND`,
+        // e.g. a command that spins up a docker container.
+        let command = std::env::var("ANVIL_COMMAND").unwrap_or("anvil".to_string());
+
+        let mut process = tokio::process::Command::new(command)
+            .args(args)
+            .stdout(std::process::Stdio::piped())
+            .spawn()
+            .unwrap();
+
+        let stdout = process.stdout.take().unwrap();
+        let (sender, receiver) = tokio::sync::oneshot::channel::<String>();
+
+        tokio::task::spawn(async move {
+            let mut sender = Some(sender);
+            const NEEDLE: &str = "Listening on ";
+            let mut reader = tokio::io::BufReader::new(stdout).lines();
+            while let Some(line) = reader.next_line().await.unwrap() {
+                tracing::trace!(line);
+                if let Some(addr) = line.strip_prefix(NEEDLE) {
+                    match sender.take() {
+                        Some(sender) => sender.send(format!("http://{addr}")).unwrap(),
+                        None => tracing::error!(addr, "detected multiple anvil endpoints"),
+                    }
+                }
+            }
+        });
+
+        let _url = tokio::time::timeout(tokio::time::Duration::from_secs(1), receiver)
+            .await
+            .expect("finding anvil URL timed out")
+            .unwrap();
+        Self {
+            process: Some(process),
+        }
+    }
+
+    /// The most reliable way to kill the process. If you get the chance to
+    /// manually clean up the [`Node`], do it, because the [`Drop::drop`]
+    /// implementation cannot be as reliable due to missing async support.
+    pub async fn kill(&mut self) {
+        let mut process = match self.process.take() {
+            Some(node) => node,
+            // Somebody already called `Node::kill()`.
+            None => return,
+        };
+
+        if let Err(err) = process.kill().await {
+            tracing::error!(?err, "failed to kill node process");
+        }
+    }
+}
+
+impl Drop for Node {
+    fn drop(&mut self) {
+        let mut process = match self.process.take() {
+            Some(process) => process,
+            // Somebody already called `Node::kill()`.
+            None => return,
+        };
+
+        // This only sends SIGKILL to the process but does not wait for the
+        // process to actually terminate. Since `anvil` is fairly well-behaved,
+        // that should be good enough in most cases.
+        if let Err(err) = process.start_kill() {
+            tracing::error!(?err, "failed to kill node process");
+        }
+    }
 }
diff --git a/crates/e2e/src/setup/mod.rs b/crates/e2e/src/setup/mod.rs
index d6a8ba84ea..4b6d203d70 100644
--- a/crates/e2e/src/setup/mod.rs
+++ b/crates/e2e/src/setup/mod.rs
@@ -5,7 +5,7 @@ pub mod onchain_components;
 mod services;
 
 use {
-    crate::nodes::{forked_node::Forker, local_node::Resetter, TestNode, NODE_HOST},
+    crate::nodes::{Node, NODE_HOST},
     anyhow::{anyhow, Result},
     ethcontract::{futures::FutureExt, H160},
     shared::ethrpc::{create_test_transport, Web3},
@@ -14,7 +14,7 @@ use {
         io::Write,
         iter::empty,
         panic::{self, AssertUnwindSafe},
-        sync::Mutex,
+        sync::{Arc, Mutex},
         time::Duration,
     },
     tempfile::TempPath,
@@ -93,47 +93,43 @@ where
     F: FnOnce(Web3) -> Fut,
     Fut: Future<Output = ()>,
 {
-    run(f, empty::<&str>(), None, None).await
+    run(f, empty::<&str>(), None).await
 }
 
 pub async fn run_test_with_extra_filters<F, Fut, T>(
     f: F,
     extra_filters: impl IntoIterator<Item = T>,
 ) where
     F: FnOnce(Web3) -> Fut,
     Fut: Future<Output = ()>,
     T: AsRef<str>,
 {
-    run(f, extra_filters, None, None).await
+    run(f, extra_filters, None).await
 }
 
 pub async fn run_forked_test<F, Fut>(f: F, solver_address: H160, fork_url: String)
 where
     F: FnOnce(Web3) -> Fut,
     Fut: Future<Output = ()>,
 {
-    run(f, empty::<&str>(), Some(solver_address), Some(fork_url)).await
+    run(f, empty::<&str>(), Some((solver_address, fork_url))).await
 }
 
 pub async fn run_forked_test_with_extra_filters<F, Fut, T>(
     f: F,
     solver_address: H160,
     fork_url: String,
     extra_filters: impl IntoIterator<Item = T>,
 ) where
     F: FnOnce(Web3) -> Fut,
     Fut: Future<Output = ()>,
     T: AsRef<str>,
 {
-    run(f, extra_filters, Some(solver_address), Some(fork_url)).await
+    run(f, extra_filters, Some((solver_address, fork_url))).await
 }
 
-async fn run<F, Fut, T>(
-    f: F,
-    filters: impl IntoIterator<Item = T>,
-    solver_address: Option<H160>,
-    fork_url: Option<String>,
-) where
+async fn run<F, Fut, T>(f: F, filters: impl IntoIterator<Item = T>, fork: Option<(H160, String)>)
+where
     F: FnOnce(Web3) -> Fut,
     Fut: Future<Output = ()>,
     T: AsRef<str>,
@@ -148,25 +144,39 @@ async fn run(
     // it but rather in the locked state.
     let _lock = NODE_MUTEX.lock();
 
+    let node = match &fork {
+        Some((_, fork)) => Node::forked(fork).await,
+        None => Node::new().await,
+    };
+
+    let node = Arc::new(Mutex::new(Some(node)));
+    let node_panic_handle = node.clone();
+    observe::panic_hook::prepend_panic_handler(Box::new(move |_| {
+        // Drop the node in the panic handler because `.catch_unwind()` does
+        // not catch all kinds of panics.
+        let _ = node_panic_handle.lock().unwrap().take();
+    }));
+
     let http = create_test_transport(NODE_HOST);
     let web3 = Web3::new(http);
-
-    let test_node: Box<dyn TestNode> =
-        if let (Some(fork_url), Some(solver_address)) = (fork_url, solver_address) {
-            Box::new(Forker::new(&web3, solver_address, fork_url).await)
-        } else {
-            Box::new(Resetter::new(&web3).await)
-        };
+    if let Some((solver, _)) = &fork {
+        Web3::api::<crate::nodes::forked_node::ForkedNodeApi<_>>(&web3)
+            .impersonate(solver)
+            .await
+            .unwrap();
+    }
 
     services::clear_database().await;
-
     // Hack: the closure may actually be unwind unsafe; moreover, `catch_unwind`
     // does not catch some types of panics. In these cases, the state of the node
     // is not restored. This is not considered an issue since this function
     // is supposed to be used in a test environment.
     let result = AssertUnwindSafe(f(web3.clone())).catch_unwind().await;
 
-    test_node.reset().await;
+    let node = node.lock().unwrap().take();
+    if let Some(mut node) = node {
+        node.kill().await;
+    }
     services::clear_database().await;
 
     if let Err(err) = result {
diff --git a/crates/observe/src/panic_hook.rs b/crates/observe/src/panic_hook.rs
index 417e6879e5..4291dfa64d 100644
--- a/crates/observe/src/panic_hook.rs
+++ b/crates/observe/src/panic_hook.rs
@@ -17,6 +17,19 @@ pub fn install() {
     std::panic::set_hook(Box::new(new_hook));
 }
 
+/// Installs a panic handler that executes [`handler`] in addition to whatever
+/// panic handler was already set up.
+/// This can be useful to make absolutely sure that some resources, like
+/// running processes, get cleaned up on a panic.
+pub fn prepend_panic_handler(handler: Box<dyn Fn(&std::panic::PanicInfo) + Send + Sync>) {
+    let previous_hook = std::panic::take_hook();
+    let new_hook = move |info: &std::panic::PanicInfo| {
+        handler(info);
+        previous_hook(info);
+    };
+    std::panic::set_hook(Box::new(new_hook));
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/docker-compose.yaml b/docker-compose.yaml
index a170086b00..cf432a7dff 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -33,14 +33,5 @@ services:
         source: ./database/sql/
         target: /flyway/sql
 
-  chain:
-    image: ghcr.io/foundry-rs/foundry:latest
-    restart: always
-    entrypoint: /usr/local/bin/anvil --gas-price 1 --gas-limit 10000000 --base-fee 0 --balance 1000000 --chain-id 1 --timestamp 1577836800
-    environment:
-      - ANVIL_IP_ADDR=0.0.0.0
-    ports:
-      - 8545:8545
-
 volumes:
   postgres:
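For reference, the new `observe` helper composes with whatever hook was
already installed; a minimal usage sketch (the cleanup action here is
illustrative, not part of this patch):

```rust
fn main() {
    observe::panic_hook::prepend_panic_handler(Box::new(|_info| {
        // Runs before the previously installed hook, so cleanup happens even
        // if that hook never returns control.
        eprintln!("cleaning up spawned processes");
    }));

    // Both the prepended handler and the original hook run here.
    panic!("boom");
}
```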