From 622c34343821c5d27fc2a0a4557b7f312f75b4b2 Mon Sep 17 00:00:00 2001 From: Jerome Gravel-Niquet Date: Thu, 19 Oct 2023 10:11:49 -0400 Subject: [PATCH] Parallel synchronization (#78) - Sync protocol is now driven by the client. This allows syncing in parallel with multiple nodes and optimizing throughput. - Adds OpenTelemetry instrumentation (optional) - Many more metrics - Uses a patches quinn that does not pre-allocate streams, saving hundreds of MBs of memory - Fixes deadlocks when accessing the in-memory bookkeeping - Batches "cleared" versions inserts to get at actual "current" versions faster - Reduced max foca packet size. TODO: This should be configurable in the future. - Fix connection pool locking madness --- .github/workflows/ci.yml | 23 +- .github/workflows/release.yml | 11 +- CHANGELOG.md | 1 + Cargo.lock | 279 ++++++- Cargo.toml | 9 + book.toml | 2 +- crates/corro-admin/src/lib.rs | 45 +- crates/corro-agent/Cargo.toml | 3 + crates/corro-agent/src/agent.rs | 720 ++++++++--------- crates/corro-agent/src/api/peer.rs | 940 +++++++++++++++++------ crates/corro-agent/src/api/public/mod.rs | 4 +- crates/corro-agent/src/broadcast/mod.rs | 69 +- crates/corro-agent/src/transport.rs | 512 ++++++++++-- crates/corro-types/Cargo.toml | 1 + crates/corro-types/src/agent.rs | 271 +++++-- crates/corro-types/src/broadcast.rs | 16 +- crates/corro-types/src/config.rs | 41 +- crates/corro-types/src/members.rs | 12 +- crates/corro-types/src/sync.rs | 322 +++++++- crates/corrosion/Cargo.toml | 5 + crates/corrosion/src/admin.rs | 1 - crates/corrosion/src/command/agent.rs | 140 +++- crates/corrosion/src/main.rs | 94 ++- crates/spawn/src/lib.rs | 16 +- doc/mdbook-admonish.css | 162 ++-- 25 files changed, 2736 insertions(+), 963 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f2cca48..e75d1e28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,7 @@ on: name: CI env: - RUSTFLAGS: -D warnings + RUSTFLAGS: -D warnings --cfg tokio_unstable CARGO_TERM_COLOR: always jobs: @@ -24,24 +24,23 @@ jobs: runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v3 - - - uses: rui314/setup-mold@v1 + env: + SCCACHE_GHA_ENABLED: "true" + RUSTC_WRAPPER: "sccache" - - name: Install Rust stable - uses: dtolnay/rust-toolchain@stable - with: - target: ${{ matrix.target }} + steps: + - uses: actions/checkout@v4 + - name: Install Rust specified toolchain + run: rustup show + - name: Install cargo-nextest uses: taiki-e/install-action@v2 with: tool: cargo-nextest - - uses: Swatinem/rust-cache@v2 - with: - cache-on-failure: true + - name: Run sccache-cache + uses: mozilla-actions/sccache-action@v0.0.3 - name: Test with latest nextest release run: cargo nextest run --profile ci --workspace --target ${{ matrix.target }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ae6ea128..ebde2d7b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,7 +9,7 @@ on: - v[0-9]+.* env: - RUSTFLAGS: -D warnings + RUSTFLAGS: -D warnings --cfg tokio_unstable CARGO_TERM_COLOR: always jobs: @@ -34,6 +34,9 @@ jobs: - target: aarch64-unknown-linux-gnu os: ubuntu-latest runs-on: ${{ matrix.os }} + env: + SCCACHE_GHA_ENABLED: "true" + RUSTC_WRAPPER: "sccache" steps: - uses: actions/checkout@v3 @@ -44,10 +47,8 @@ jobs: target: ${{ matrix.target }} if: startsWith(matrix.os, 'ubuntu') - - uses: Swatinem/rust-cache@v2 - with: - cache-on-failure: true - key: ${{ matrix.target }} + - name: Run sccache-cache + uses: mozilla-actions/sccache-action@v0.0.3 - uses: taiki-e/upload-rust-binary-action@v1 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e6f62ef..cfa84f0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Unreleased +- Parallel synchronization w/ many deadlock and bug fixes ([#78](../../pull/78)) - Upgraded to cr-sqlite 0.16.0 (unreleased) ([#75](../../pull/75)) - Rewrite compaction logic to be more correct and efficient ([#74](../../pull/74)) - `corrosion consul sync` will now bundle services and checks in a single transaction (changeset) ([#73](../../pull/73)) diff --git a/Cargo.lock b/Cargo.lock index 8fb509e9..57fe977d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -720,9 +720,11 @@ dependencies = [ "hyper", "itertools", "metrics", + "opentelemetry", "parking_lot", "quinn", "quinn-plaintext", + "quinn-proto", "quoted-string", "rand", "rangemap", @@ -743,6 +745,7 @@ dependencies = [ "tower", "tower-http", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "tripwire", "trust-dns-resolver", @@ -863,6 +866,7 @@ dependencies = [ "itertools", "metrics", "once_cell", + "opentelemetry", "parking_lot", "rand", "rangemap", @@ -912,6 +916,9 @@ dependencies = [ "notify", "notify-debouncer-mini", "once_cell", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", "rusqlite", "seahash", "serde", @@ -923,10 +930,12 @@ dependencies = [ "tikv-jemallocator", "time", "tokio", + "tokio-metrics", "tokio-serde", "tokio-util", "tracing", "tracing-filter", + "tracing-opentelemetry", "tracing-subscriber", "tripwire", "uuid", @@ -1715,6 +1724,18 @@ dependencies = [ "tokio-rustls", ] +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + [[package]] name = "iana-time-zone" version = "0.1.56" @@ -1890,9 +1911,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -2341,6 +2362,104 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" +dependencies = [ + "async-trait", + "futures-core", + "http", + "opentelemetry-proto", + "opentelemetry-semantic-conventions", + "opentelemetry_api", + "opentelemetry_sdk", + "prost", + "thiserror", + "tokio", + "tonic", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", + "prost", + "tonic", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" +dependencies = [ + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", + "urlencoding", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" +dependencies = [ + "async-trait", + "crossbeam-channel", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "ordered-float", + "percent-encoding", + "rand", + "regex", + "serde_json", + "thiserror", + "tokio", + "tokio-stream", +] + +[[package]] +name = "ordered-float" +version = "3.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +dependencies = [ + "num-traits", +] + [[package]] name = "overload" version = "0.1.1" @@ -2459,9 +2578,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" [[package]] name = "pin-utils" @@ -2564,6 +2683,29 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "quanta" version = "0.11.0" @@ -2617,9 +2759,8 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13f81c9a9d574310b8351f8666f5a93ac3b0069c45c28ad52c10291389a7cf9" +version = "0.10.5" +source = "git+https://github.com/jeromegn/quinn?rev=108f25a6#108f25a6d45ce0c41acf2d87f8d0b2d35fedfbaa" dependencies = [ "bytes", "rand", @@ -3540,11 +3681,10 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.29.1" +version = "1.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da" +checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" dependencies = [ - "autocfg", "backtrace", "bytes", "libc", @@ -3553,11 +3693,21 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.4.9", + "socket2 0.5.3", "tokio-macros", "windows-sys 0.48.0", ] +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-macros" version = "2.1.0" @@ -3569,6 +3719,18 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "tokio-metrics" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b2fc67d5dec41db679b9b052eb572269616926040b7831e32c8a152df77b84" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", + "tokio-stream", +] + [[package]] name = "tokio-rustls" version = "0.24.0" @@ -3641,6 +3803,34 @@ dependencies = [ "serde", ] +[[package]] +name = "tonic" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" +dependencies = [ + "async-trait", + "axum", + "base64 0.21.0", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tower" version = "0.4.13" @@ -3649,8 +3839,11 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", + "indexmap", "pin-project", "pin-project-lite", + "rand", + "slab", "tokio", "tokio-util", "tower-layer", @@ -3693,11 +3886,10 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "ee2ef2af84856a50c1d430afce2fdded0a4ec7eda868db86409b4543df0797f9" dependencies = [ - "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -3706,20 +3898,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.23" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.28", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", @@ -3755,6 +3947,21 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" +dependencies = [ + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -3767,9 +3974,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" +checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", "nu-ansi-term", @@ -3959,6 +4166,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf-8" version = "0.7.6" @@ -4036,9 +4249,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -4046,24 +4259,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.28", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4071,22 +4284,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.28", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "web-sys" diff --git a/Cargo.toml b/Cargo.toml index 6e44fc59..108166af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,9 +36,13 @@ itertools = { version = "0.10.5" } metrics = "0.21.0" metrics-exporter-prometheus = "0.12.0" once_cell = "1.17.1" +opentelemetry = { version = "0.20.0", features = ["rt-tokio"] } +opentelemetry-otlp = { version = "0.13.0" } +opentelemetry-semantic-conventions = { version = "0.12.0" } parking_lot = { version = "0.12.1" } pin-project-lite = "0.2.9" quinn = "0.10.2" +quinn-proto = "0.10.5" quinn-plaintext = "0.1.0" quoted-string = "0.6.1" rand = { version = "0.8.5", features = ["small_rng"] } @@ -60,6 +64,7 @@ tempfile = "3.5.0" thiserror = "1.0.40" time = { version = "0.3.15", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full"] } +tokio-metrics = "0.3.0" tokio-serde = { version = "0.8", features = ["json"] } tokio-stream = { version = "0.1.12", features = ["sync"] } tokio-util = { version = "0.7.7", features = ["io", "codec", "net"] } @@ -67,6 +72,7 @@ tower = { version = "0.4.13", features = ["limit", "load-shed", "buffer"] } tower-http = { version = "0.4.0", features = ["trace", "auth"] } tracing = "0.1.37" tracing-filter = { version = "0.1.0-alpha.2", features = ["smallvec"] } +tracing-opentelemetry = { version = "0.21.0", default-features = false, features = ["tracing-log"]} tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] } trust-dns-resolver = "0.22.0" uhlc = { version = "0.6.3", features = ["defmt"] } @@ -74,6 +80,9 @@ uuid = { version = "1.3.1", features = ["v4", "serde"] } webpki = { version = "0.22.0", features = ["std"] } http = { version = "0.2.9" } +[patch.crates-io] +quinn-proto = { git = "https://github.com/jeromegn/quinn", rev = "108f25a6" } + [profile.release] debug = 1 incremental = true diff --git a/book.toml b/book.toml index 40c34233..40fc647c 100644 --- a/book.toml +++ b/book.toml @@ -7,7 +7,7 @@ title = "Corrosion" [preprocessor.admonish] command = "mdbook-admonish" -assets_version = "2.0.2" # do not edit: managed by `mdbook-admonish install` +assets_version = "3.0.0" # do not edit: managed by `mdbook-admonish install` [output.html] additional-css = ["./doc/mdbook-admonish.css"] diff --git a/crates/corro-admin/src/lib.rs b/crates/corro-admin/src/lib.rs index d05ddd7d..bbb9df1e 100644 --- a/crates/corro-admin/src/lib.rs +++ b/crates/corro-admin/src/lib.rs @@ -3,6 +3,7 @@ use std::{fmt::Display, time::Duration}; use camino::Utf8PathBuf; use corro_types::{ agent::{Agent, LockKind, LockMeta, LockState}, + broadcast::{FocaCmd, FocaInput}, sqlite::SqlitePoolError, sync::generate_sync, }; @@ -10,7 +11,10 @@ use futures::{SinkExt, TryStreamExt}; use serde::{Deserialize, Serialize}; use spawn::spawn_counted; use time::OffsetDateTime; -use tokio::net::{UnixListener, UnixStream}; +use tokio::{ + net::{UnixListener, UnixStream}, + sync::mpsc, +}; use tokio_serde::{formats::Json, Framed}; use tokio_util::codec::LengthDelimitedCodec; use tracing::{debug, error, info, warn}; @@ -80,6 +84,7 @@ pub enum Command { Ping, Sync(SyncCommand), Locks { top: usize }, + Cluster(ClusterCommand), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -87,6 +92,11 @@ pub enum SyncCommand { Generate, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ClusterCommand { + MembershipStates, +} + #[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum LogLevel { @@ -171,17 +181,12 @@ async fn handle_conn( Ok(json) => send(&mut stream, Response::Json(json)).await, Err(e) => send_error(&mut stream, e).await, } + send_success(&mut stream).await; } Command::Locks { top } => { info_log(&mut stream, "gathering top locks").await; - let registry = { - agent - .bookie() - .read("admin:registry") - .await - .registry() - .clone() - }; + let bookie = agent.bookie(); + let registry = bookie.registry(); let topn: Vec = { registry @@ -198,6 +203,28 @@ async fn handle_conn( Ok(json) => send(&mut stream, Response::Json(json)).await, Err(e) => send_error(&mut stream, e).await, } + send_success(&mut stream).await; + } + Command::Cluster(ClusterCommand::MembershipStates) => { + info_log(&mut stream, "gathering membership state").await; + + let (tx, mut rx) = mpsc::channel(1024); + if let Err(e) = agent + .tx_foca() + .send(FocaInput::Cmd(FocaCmd::MembershipStates(tx))) + .await + { + send_error(&mut stream, e).await; + continue; + } + + while let Some(member) = rx.recv().await { + match serde_json::to_value(&member) { + Ok(json) => send(&mut stream, Response::Json(json)).await, + Err(e) => send_error(&mut stream, e).await, + } + } + send_success(&mut stream).await; } }, Ok(None) => { diff --git a/crates/corro-agent/Cargo.toml b/crates/corro-agent/Cargo.toml index 4e76a60d..1e6e2561 100644 --- a/crates/corro-agent/Cargo.toml +++ b/crates/corro-agent/Cargo.toml @@ -20,8 +20,10 @@ hex = { workspace = true } hyper = { workspace = true } itertools = { workspace = true } metrics = { workspace = true } +opentelemetry = { workspace = true } parking_lot = { workspace = true } quinn = { workspace = true } +quinn-proto = { workspace = true } quinn-plaintext = { workspace = true } quoted-string = { workspace = true } rand = { workspace = true } @@ -44,6 +46,7 @@ tokio-util = { workspace = true } tower = { workspace = true } tower-http = { workspace = true } tracing = { workspace = true } +tracing-opentelemetry = { workspace = true } tracing-subscriber = { workspace = true } tripwire = { path = "../tripwire" } trust-dns-resolver = { workspace = true } diff --git a/crates/corro-agent/src/agent.rs b/crates/corro-agent/src/agent.rs index c77efd66..5042e328 100644 --- a/crates/corro-agent/src/agent.rs +++ b/crates/corro-agent/src/agent.rs @@ -1,4 +1,5 @@ use std::{ + cmp, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, convert::Infallible, net::SocketAddr, @@ -9,7 +10,7 @@ use std::{ use crate::{ api::{ - peer::{bidirectional_sync, gossip_client_endpoint, gossip_server_endpoint, SyncError}, + peer::{gossip_server_endpoint, parallel_sync, serve_sync, SyncError}, public::{ api_v1_db_schema, api_v1_queries, api_v1_transactions, pubsub::{ @@ -19,13 +20,16 @@ use crate::{ }, }, broadcast::runtime_loop, - transport::{ConnectError, Transport}, + transport::{Transport, TransportError}, }; use arc_swap::ArcSwap; use corro_types::{ actor::{Actor, ActorId}, - agent::{Agent, AgentConfig, BookedVersions, Bookie, ChangeError, KnownDbVersion, SplitPool}, + agent::{ + Agent, AgentConfig, BookedVersions, Bookie, ChangeError, KnownDbVersion, PartialVersion, + SplitPool, + }, broadcast::{ BiPayload, BiPayloadV1, BroadcastInput, BroadcastV1, ChangeSource, ChangeV1, Changeset, ChangesetParts, FocaInput, Timestamp, UniPayload, UniPayloadV1, @@ -46,7 +50,7 @@ use axum::{ routing::{get, post}, BoxError, Extension, Router, TypedHeader, }; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use foca::{Member, Notification}; use futures::{FutureExt, StreamExt}; use hyper::{server::conn::AddrIncoming, StatusCode}; @@ -59,17 +63,16 @@ use rusqlite::{named_params, params, Connection, OptionalExtension, Transaction} use spawn::spawn_counted; use speedy::Readable; use tokio::{ - io::AsyncReadExt, net::TcpListener, sync::mpsc::{channel, Receiver, Sender}, task::block_in_place, - time::{sleep, timeout}, + time::{error::Elapsed, sleep, timeout}, }; use tokio_stream::{wrappers::ReceiverStream, StreamExt as TokioStreamExt}; -use tokio_util::codec::{Decoder, FramedRead, LengthDelimitedCodec}; +use tokio_util::codec::{FramedRead, LengthDelimitedCodec}; use tower::{limit::ConcurrencyLimitLayer, load_shed::LoadShedLayer}; use tower_http::trace::TraceLayer; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, debug_span, error, info, trace, warn, Instrument}; use tripwire::{Outcome, PreemptibleFutureExt, TimeoutFutureExt, Tripwire}; use trust_dns_resolver::{ error::ResolveErrorKind, @@ -90,6 +93,7 @@ pub struct AgentOptions { pub rx_apply: Receiver<(ActorId, i64)>, pub rx_empty: Receiver<(ActorId, RangeInclusive)>, pub rx_changes: Receiver<(ChangeV1, ChangeSource)>, + pub rx_foca: Receiver, pub rtt_rx: Receiver<(SocketAddr, Duration)>, pub tripwire: Tripwire, } @@ -214,7 +218,7 @@ pub async fn setup(conf: Config, tripwire: Tripwire) -> eyre::Result<(Agent, Age let mut conn = pool.write_priority().await?; let tx = conn.transaction()?; - clear_buffered_meta(&tx, actor_id, version)?; + clear_buffered_meta(&tx, actor_id, version..=version)?; tx.commit()?; continue; } @@ -236,11 +240,9 @@ pub async fn setup(conf: Config, tripwire: Tripwire) -> eyre::Result<(Agent, Age let gossip_server_endpoint = gossip_server_endpoint(&conf.gossip).await?; let gossip_addr = gossip_server_endpoint.local_addr()?; - let gossip_client_endpoint = gossip_client_endpoint(&conf.gossip).await?; - let (rtt_tx, rtt_rx) = channel(128); - let transport = Transport::new(gossip_client_endpoint, rtt_tx); + let transport = Transport::new(&conf.gossip, rtt_tx).await?; let api_listener = TcpListener::bind(conf.api.bind_addr).await?; let api_addr = api_listener.local_addr()?; @@ -254,7 +256,8 @@ pub async fn setup(conf: Config, tripwire: Tripwire) -> eyre::Result<(Agent, Age let (tx_bcast, rx_bcast) = channel(10240); let (tx_empty, rx_empty) = channel(10240); - let (tx_changes, rx_changes) = channel(10240); + let (tx_changes, rx_changes) = channel(5192); + let (tx_foca, rx_foca) = channel(10240); let opts = AgentOptions { actor_id, @@ -265,6 +268,7 @@ pub async fn setup(conf: Config, tripwire: Tripwire) -> eyre::Result<(Agent, Age rx_apply, rx_empty, rx_changes, + rx_foca, rtt_rx, tripwire: tripwire.clone(), }; @@ -282,6 +286,7 @@ pub async fn setup(conf: Config, tripwire: Tripwire) -> eyre::Result<(Agent, Age tx_apply, tx_empty, tx_changes, + tx_foca, schema: RwLock::new(schema), tripwire, }); @@ -308,6 +313,7 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { rx_apply, rx_empty, rx_changes, + rx_foca, rtt_rx, } = opts; @@ -371,7 +377,6 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { let gossip_addr = gossip_server_endpoint.local_addr()?; - let (foca_tx, foca_rx) = channel(10240); let (member_events_tx, member_events_rx) = tokio::sync::broadcast::channel::(512); runtime_loop( @@ -382,7 +387,7 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { ), agent.clone(), transport.clone(), - foca_rx, + rx_foca, rx_bcast, member_events_rx.resubscribe(), to_send_tx, @@ -410,13 +415,12 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { // async message decoder task tokio::spawn({ - let bookie = agent.bookie().clone(); - let self_actor_id = agent.actor_id(); + let agent = agent.clone(); async move { while let Some(payload) = process_uni_rx.recv().await { match payload { UniPayload::V1(UniPayloadV1::Broadcast(bcast)) => { - handle_change(bcast, self_actor_id, &bookie, &bcast_msg_tx).await + handle_change(&agent, bcast, &bcast_msg_tx).await } } } @@ -428,7 +432,6 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { spawn_counted({ let agent = agent.clone(); let mut tripwire = tripwire.clone(); - let foca_tx = foca_tx.clone(); async move { loop { let connecting = match gossip_server_endpoint @@ -444,7 +447,6 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { let process_uni_tx = process_uni_tx.clone(); let agent = agent.clone(); let tripwire = tripwire.clone(); - let foca_tx = foca_tx.clone(); tokio::spawn(async move { let remote_addr = connecting.remote_address(); // let local_ip = connecting.local_ip().unwrap(); @@ -465,7 +467,7 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { tokio::spawn({ let conn = conn.clone(); let mut tripwire = tripwire.clone(); - let foca_tx = foca_tx.clone(); + let foca_tx = agent.tx_foca().clone(); async move { loop { let b = tokio::select! { @@ -498,7 +500,7 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { let mut tripwire = tripwire.clone(); async move { loop { - let mut rx = tokio::select! { + let rx = tokio::select! { rx_res = conn.accept_uni() => match rx_res { Ok(rx) => rx, Err(e) => { @@ -522,64 +524,37 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { tokio::spawn({ let process_uni_tx = process_uni_tx.clone(); async move { - let mut codec = LengthDelimitedCodec::new(); - let mut buf = BytesMut::new(); - - let stream_ended = loop { - loop { - match codec.decode(&mut buf) { - Ok(Some(b)) => { - // TODO: checksum? - let b = b.freeze(); - - match UniPayload::read_from_buffer(&b) { - Ok(payload) => { - trace!( - "parsed a payload: {payload:?}" - ); - - if let Err(e) = process_uni_tx - .send(payload) - .await - { - error!("could not send UniPayload for processing: {e}"); - // this means we won't be able to process more... - return; - } + let mut framed = + FramedRead::new(rx, LengthDelimitedCodec::new()); + + loop { + match StreamExt::next(&mut framed).await { + Some(Ok(b)) => { + counter!("corro.peer.stream.bytes.recv.total", b.len() as u64, "type" => "uni"); + match UniPayload::read_from_buffer(&b) { + Ok(payload) => { + trace!("parsed a payload: {payload:?}"); + + if let Err(e) = + process_uni_tx.send(payload).await + { + error!("could not send UniPayload for processing: {e}"); + // this means we won't be able to process more... + return; } - Err(e) => { - error!( + } + Err(e) => { + error!( "could not decode UniPayload: {e}" ); - continue; - } + continue; } } - Ok(None) => break, - Err(e) => { - error!("decode error: {e}"); - } } - } - - match rx.read_buf(&mut buf).await { - Ok(0) => { - break true; - } - Ok(n) => { - counter!("corro.peer.stream.bytes.recv.total", n as u64, "type" => "uni"); - trace!("read {n} bytes"); + Some(Err(e)) => { + error!("decode error: {e}"); } - Err(e) => { - error!("error reading bytes into buffer: {e}"); - break true; - } - } - }; - - if !stream_ended { - if let Err(e) = rx.stop(0u32.into()) { - warn!("error stopping recved uni stream: {e}"); + None => break, } } } @@ -605,7 +580,7 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { } }; - increment_counter!("corro.peer.streams.accept.total", "type" => "bi"); + increment_counter!("corro.peer.stream.accept.total", "type" => "bi"); debug!( "accepted a bidirectional stream from {}", @@ -639,24 +614,20 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { Ok(payload) => { match payload { BiPayload::V1( - BiPayloadV1::SyncState(state), + BiPayloadV1::SyncStart { + actor_id, + trace_ctx, + }, ) => { + trace!("framed read buffer len: {}", framed.read_buffer().len()); // println!("got sync state: {state:?}"); - if let Err(e) = - bidirectional_sync( - &agent, - generate_sync( - agent.bookie(), - agent.actor_id(), - ) - .await, - Some(state), - framed.into_inner(), - tx, - ) - .await + if let Err(e) = serve_sync( + &agent, actor_id, + trace_ctx, framed, tx, + ) + .await { - warn!("could not complete bidirectional sync: {e}"); + warn!("could not complete receiving sync: {e}"); } break; } @@ -698,7 +669,6 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { tokio::spawn({ let agent = agent.clone(); - let foca_tx = foca_tx.clone(); async move { let mut boff = backoff::Backoff::new(10) .timeout_range(Duration::from_secs(5), Duration::from_secs(120)) @@ -719,7 +689,10 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { Ok(addrs) => { for addr in addrs.iter() { debug!("Bootstrapping w/ {addr}"); - if let Err(e) = foca_tx.send(FocaInput::Announce((*addr).into())).await + if let Err(e) = agent + .tx_foca() + .send(FocaInput::Announce((*addr).into())) + .await { error!("could not send foca Announce message: {e}"); } else { @@ -800,7 +773,11 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { } } - foca_tx.send(FocaInput::ApplyMany(foca_states)).await.ok(); + agent + .tx_foca() + .send(FocaInput::ApplyMany(foca_states)) + .await + .ok(); } let api = Router::new() @@ -918,19 +895,16 @@ pub async fn run(agent: Agent, opts: AgentOptions) -> eyre::Result<()> { let mut db_cleanup_interval = tokio::time::interval(Duration::from_secs(60 * 15)); - tokio::spawn(handle_gossip_to_send(transport, to_send_rx)); + tokio::spawn(handle_gossip_to_send(transport.clone(), to_send_rx)); tokio::spawn(handle_notifications( agent.clone(), notifications_rx, - foca_tx.clone(), member_events_tx, )); - tokio::spawn(metrics_loop(agent.clone())); + tokio::spawn(metrics_loop(agent.clone(), transport)); tokio::spawn(handle_broadcasts(agent.clone(), bcast_rx)); - // tokio::spawn - loop { tokio::select! { biased; @@ -1064,64 +1038,27 @@ async fn clear_overwritten_versions(agent: Agent) { .for_actor(actor_id) }; - let mut conn = match pool.write_low().await { - Ok(conn) => conn, - Err(e) => { - error!("could not acquire low priority write connection: {e}"); - continue; - } - }; - let mut bookedw = booked .write(format!("clearing:{}", actor_id.as_simple())) .await; - let res = block_in_place(|| { - let tx = conn.transaction()?; - - let mut new_copy = bookedw.clone(); - - for (db_v, v) in to_clear - .iter() - .filter(|(_db_v, v)| bookedw.contains_current(v)) - { - deleted += tx - .prepare_cached("DELETE FROM __corro_bookkeeping WHERE db_version = ?")? - .execute([db_v])?; - new_copy.insert(*v..=*v, KnownDbVersion::Cleared); + for (_db_v, v) in to_clear.iter() { + if bookedw.contains_current(v) { + bookedw.insert(*v, KnownDbVersion::Cleared); + deleted += 1; } + } - for (range, known) in to_clear - .iter() - .filter_map(|(_, v)| new_copy.get_key_value(v)) - .dedup() - { - match known { - KnownDbVersion::Cleared => { - inserted += store_empty_changeset( - &tx, - actor_id, - *range.start()..=*range.end(), - )?; - } - known => { - warn!(%actor_id, "unexpected known db version when attempting to clear: {known:?}"); - } - } + for range in to_clear + .iter() + .filter_map(|(_, v)| bookedw.cleared.get(v)) + .dedup() + { + if let Err(e) = agent.tx_empty().try_send((actor_id, range.clone())) { + error!("could not schedule version to be cleared: {e}"); + } else { + inserted += 1; } - - tx.commit()?; - - debug!("compacted in-db version state for actor {actor_id}, deleted: {deleted}, inserted: {inserted}"); - - **bookedw = BookedVersions(new_copy); - debug!("compacted in-memory cache by clearing db versions for actor {actor_id}, new total: {}", bookedw.len()); - - Ok::<_, rusqlite::Error>(()) - }); - - if let Err(e) = res { - error!(%actor_id, "could not compact bookkeeping: {e}"); } } @@ -1133,27 +1070,21 @@ async fn clear_overwritten_versions(agent: Agent) { } } -// const CHECKSUM_SEEDS: [u64; 4] = [ -// 0x16f11fe89b0d677c, -// 0xb480a793d8e6c86c, -// 0x6fe2e5aaf078ebc9, -// 0x14f994a4c5259381, -// ]; - -async fn metrics_loop(agent: Agent) { +async fn metrics_loop(agent: Agent, transport: Transport) { let mut metrics_interval = tokio::time::interval(Duration::from_secs(10)); loop { metrics_interval.tick().await; - block_in_place(|| collect_metrics(&agent)); + block_in_place(|| collect_metrics(&agent, &transport)); } } // const MAX_COUNT_TO_HASH: i64 = 500_000; -fn collect_metrics(agent: &Agent) { +fn collect_metrics(agent: &Agent, transport: &Transport) { agent.pool().emit_metrics(); + transport.emit_metrics(); let schema = agent.schema().read(); @@ -1203,60 +1134,42 @@ fn collect_metrics(agent: &Agent) { error!("could not query count for buffered changes: {e}"); } } - - // for name in low_count_tables { - // if let Some(table) = schema.tables.get(name) { - // let pks = table.pk.iter().cloned().collect::>().join(","); - // match conn - // .prepare_cached(&format!("SELECT * FROM {name} ORDER BY {pks}")) - // .and_then(|mut prepped| { - // let col_count = prepped.column_count(); - // prepped.query(()).and_then(|mut rows| { - // let mut hasher = seahash::SeaHasher::with_seeds( - // CHECKSUM_SEEDS[0], - // CHECKSUM_SEEDS[1], - // CHECKSUM_SEEDS[2], - // CHECKSUM_SEEDS[3], - // ); - // while let Ok(Some(row)) = rows.next() { - // for idx in 0..col_count { - // let v: SqliteValue = row.get(idx)?; - // v.hash(&mut hasher); - // } - // } - // Ok(hasher.finish()) - // }) - // }) { - // Ok(hash) => { - // gauge!("corro.db.table.checksum", hash as f64, "table" => name.clone()); - // } - // Err(e) => { - // error!("could not query clock table values for hashing {table}: {e}"); - // } - // } - // } - // } } -pub async fn handle_change( - bcast: BroadcastV1, - self_actor_id: ActorId, - bookie: &Bookie, - bcast_msg_tx: &Sender, -) { +pub async fn handle_change(agent: &Agent, bcast: BroadcastV1, bcast_msg_tx: &Sender) { match bcast { BroadcastV1::Change(change) => { + let diff = if let Some(ts) = change.ts() { + if let Ok(id) = change.actor_id.try_into() { + Some( + agent + .clock() + .new_timestamp() + .get_diff_duration(&uhlc::Timestamp::new(ts.0, id)), + ) + } else { + None + } + } else { + None + }; + increment_counter!("corro.broadcast.recv.count", "kind" => "change"); trace!("handling {} changes", change.len()); - if bookie - .write(format!( - "handle_change(for_actor):{}", - change.actor_id.as_simple() - )) - .await - .for_actor(change.actor_id) + let booked = { + agent + .bookie() + .write(format!( + "handle_change(for_actor):{}", + change.actor_id.as_simple() + )) + .await + .for_actor(change.actor_id) + }; + + if booked .read(format!( "handle_change(contains?):{}", change.actor_id.as_simple() @@ -1268,9 +1181,14 @@ pub async fn handle_change( return; } - if change.actor_id == self_actor_id { + if change.actor_id == agent.actor_id() { return; } + + if let Some(diff) = diff { + histogram!("corro.broadcast.recv.lag.seconds", diff.as_secs_f64()); + } + if let Err(e) = bcast_msg_tx.send(BroadcastV1::Change(change)).await { error!("could not send change message through broadcast channel: {e}"); } @@ -1278,19 +1196,7 @@ pub async fn handle_change( } } -// fn chunk_range( -// range: RangeInclusive, -// chunk_size: i64, -// ) -> impl Iterator> { -// range -// .clone() -// .step_by(chunk_size as usize) -// .map(move |block_start| { -// let block_end = (block_start + chunk_size).min(*range.end()); -// block_start..=block_end -// }) -// } - +#[tracing::instrument(skip_all)] fn find_cleared_db_versions(tx: &Transaction) -> rusqlite::Result> { let tables = tx .prepare_cached( @@ -1313,12 +1219,19 @@ fn find_cleared_db_versions(tx: &Transaction) -> rusqlite::Result> .join(" UNION ") ); - let cleared_db_version = tx + let start = Instant::now(); + let cleared_db_versions: BTreeSet = tx .prepare_cached(&to_clear_query)? .query_map([], |row| row.get(0))? .collect::>()?; - Ok(cleared_db_version) + info!( + "aggregated {} db versions to clear in {:?}", + cleared_db_versions.len(), + start.elapsed() + ); + + Ok(cleared_db_versions) } async fn handle_gossip_to_send(transport: Transport, mut to_send_rx: Receiver<(Actor, Bytes)>) { @@ -1327,18 +1240,19 @@ async fn handle_gossip_to_send(transport: Transport, mut to_send_rx: Receiver<(A trace!("got gossip to send to {actor:?}"); let addr = actor.addr(); + let actor_id = actor.id(); let transport = transport.clone(); + let len = data.len(); spawn_counted(async move { - let len = data.len(); if let Err(e) = transport.send_datagram(addr, data).await { error!("could not write datagram {addr}: {e}"); return; } - increment_counter!("corro.peer.datagram.sent.total", "actor_id" => actor.id().to_string()); + increment_counter!("corro.peer.datagram.sent.total", "actor_id" => actor_id.to_string()); counter!("corro.peer.datagram.bytes.sent.total", len as u64); - }); + }.instrument(debug_span!("send_swim_payload", %addr, %actor_id, buf_size = len))); } } @@ -1363,14 +1277,13 @@ async fn handle_broadcasts(agent: Agent, mut bcast_rx: Receiver) { async fn handle_notifications( agent: Agent, mut notification_rx: Receiver>, - foca_tx: Sender, member_events: tokio::sync::broadcast::Sender, ) { while let Some(notification) = notification_rx.recv().await { trace!("handle notification"); match notification { Notification::MemberUp(actor) => { - let added = { agent.members().write().add_member(&actor) }; + let (added, same) = { agent.members().write().add_member(&actor) }; trace!("Member Up {actor:?} (added: {added})"); if added { debug!("Member Up {actor:?}"); @@ -1379,13 +1292,27 @@ async fn handle_notifications( // notify of new cluster size let members_len = { agent.members().read().states.len() as u32 }; if let Ok(size) = members_len.try_into() { - if let Err(e) = foca_tx.send(FocaInput::ClusterSize(size)).await { + if let Err(e) = agent.tx_foca().send(FocaInput::ClusterSize(size)).await { error!("could not send new foca cluster size: {e}"); } } member_events.send(MemberEvent::Up(actor.clone())).ok(); + } else if !same { + // had a older timestamp! + if let Err(e) = agent + .tx_foca() + .send(FocaInput::ApplyMany(vec![foca::Member::new( + actor.clone(), + foca::Incarnation::default(), + foca::State::Down, + )])) + .await + { + warn!(?actor, "could not manually declare actor as down! {e}"); + } } + increment_counter!("corro.swim.notification", "type" => "memberup"); } Notification::MemberDown(actor) => { let removed = { agent.members().write().remove_member(&actor) }; @@ -1397,25 +1324,30 @@ async fn handle_notifications( // notify of new cluster size let member_len = { agent.members().read().states.len() as u32 }; if let Ok(size) = member_len.try_into() { - if let Err(e) = foca_tx.send(FocaInput::ClusterSize(size)).await { + if let Err(e) = agent.tx_foca().send(FocaInput::ClusterSize(size)).await { error!("could not send new foca cluster size: {e}"); } } member_events.send(MemberEvent::Down(actor.clone())).ok(); } + increment_counter!("corro.swim.notification", "type" => "memberdown"); } Notification::Active => { info!("Current node is considered ACTIVE"); + increment_counter!("corro.swim.notification", "type" => "active"); } Notification::Idle => { warn!("Current node is considered IDLE"); + increment_counter!("corro.swim.notification", "type" => "idle"); } // this happens when we leave the cluster Notification::Defunct => { debug!("Current node is considered DEFUNCT"); + increment_counter!("corro.swim.notification", "type" => "defunct"); } Notification::Rejoin(id) => { info!("Rejoined the cluster with id: {id:?}"); + increment_counter!("corro.swim.notification", "type" => "rejoin"); } } } @@ -1600,44 +1532,56 @@ fn store_empty_changeset( actor_id: ActorId, versions: RangeInclusive, ) -> Result { + // first, delete "current" versions, they're now gone! + let deleted = tx.prepare_cached("DELETE FROM __corro_bookkeeping WHERE actor_id = ? AND start_version >= ? AND start_version <= ? AND end_version IS NULL")?.execute(params![actor_id, versions.start(), versions.end()])?; + + if deleted > 0 { + debug!("deleted {deleted} still-live versions from database's bookkeeping"); + } + + // insert cleared versions let inserted = tx .prepare_cached( " - INSERT INTO __corro_bookkeeping (actor_id, start_version, end_version, db_version, last_seq, ts) - VALUES (?, ?, ?, NULL, NULL, NULL) - ON CONFLICT (actor_id, start_version) DO UPDATE SET - end_version = excluded.end_version, - db_version = NULL, - last_seq = NULL, - ts = NULL - WHERE end_version < excluded.end_version; - ", + INSERT INTO __corro_bookkeeping (actor_id, start_version, end_version, db_version, last_seq, ts) + VALUES (?, ?, ?, NULL, NULL, NULL) + ON CONFLICT (actor_id, start_version) DO UPDATE SET + end_version = excluded.end_version, + db_version = NULL, + last_seq = NULL, + ts = NULL + WHERE end_version < excluded.end_version; + ", )? .execute(params![actor_id, versions.start(), versions.end()])?; - for version in versions { - clear_buffered_meta(tx, actor_id, version)?; - } + // remove any buffered data or seqs bookkeeping for these versions + clear_buffered_meta(tx, actor_id, versions)?; Ok(inserted) } -fn clear_buffered_meta(tx: &Transaction, actor_id: ActorId, version: i64) -> rusqlite::Result<()> { +fn clear_buffered_meta( + tx: &Transaction, + actor_id: ActorId, + versions: RangeInclusive, +) -> rusqlite::Result<()> { // remove all buffered changes for cleanup purposes let count = tx - .prepare_cached("DELETE FROM __corro_buffered_changes WHERE site_id = ? AND version = ?")? - .execute(params![actor_id, version])?; - debug!(%actor_id, version, "deleted {count} buffered changes"); + .prepare_cached("DELETE FROM __corro_buffered_changes WHERE site_id = ? AND version >= ? AND version <= ?")? + .execute(params![actor_id, versions.start(), versions.end()])?; + debug!(%actor_id, ?versions, "deleted {count} buffered changes"); // delete all bookkept sequences for this version let count = tx - .prepare_cached("DELETE FROM __corro_seq_bookkeeping WHERE site_id = ? AND version = ?")? - .execute(params![actor_id, version])?; - debug!(%actor_id, version, "deleted {count} sequences in bookkeeping"); + .prepare_cached("DELETE FROM __corro_seq_bookkeeping WHERE site_id = ? AND version >= ? AND version <= ?")? + .execute(params![actor_id, versions.start(), versions.end()])?; + debug!(%actor_id, ?versions, "deleted {count} sequences in bookkeeping"); Ok(()) } +#[tracing::instrument(skip(agent), err)] async fn process_fully_buffered_changes( agent: &Agent, actor_id: ActorId, @@ -1667,8 +1611,8 @@ async fn process_fully_buffered_changes( let inserted = block_in_place(|| { let (last_seq, ts) = { - match bookedw.get(&version) { - Some(KnownDbVersion::Partial { seqs, last_seq, ts }) => { + match bookedw.partials.get(&version) { + Some(PartialVersion { seqs, last_seq, ts }) => { if seqs.gaps(&(0..=*last_seq)).count() != 0 { error!(%actor_id, version, "found sequence gaps: {:?}, aborting!", seqs.gaps(&(0..=*last_seq)).collect::>()); // TODO: return an error here @@ -1676,12 +1620,8 @@ async fn process_fully_buffered_changes( } (*last_seq, *ts) } - Some(_) => { - warn!(%actor_id, %version, "already processed buffered changes, returning"); - return Ok(false); - } None => { - warn!(%actor_id, %version, "version not found in cache,returning"); + warn!(%actor_id, %version, "version not found in cache, returning"); return Ok(false); } } @@ -1714,7 +1654,7 @@ async fn process_fully_buffered_changes( info!(%actor_id, version, "no buffered rows, skipped insertion into crsql_changes"); } - clear_buffered_meta(&tx, actor_id, version)?; + clear_buffered_meta(&tx, actor_id, version..=version)?; let rows_impacted: i64 = tx .prepare_cached("SELECT crsql_rows_impacted()")? @@ -1790,11 +1730,12 @@ async fn process_fully_buffered_changes( Ok(inserted) } +#[tracing::instrument(skip(agent, changes), err)] pub async fn process_multiple_changes( agent: &Agent, changes: Vec<(ChangeV1, ChangeSource)>, ) -> Result<(), ChangeError> { - info!(self_actor_id = %agent.actor_id(), "processing multiple changes, len: {}", changes.len()); + info!(self_actor_id = %agent.actor_id(), "processing multiple changes, len: {}", changes.iter().map(|(change, _)| cmp::max(change.len(), 1)).sum::()); let bookie = agent.bookie(); @@ -1831,6 +1772,7 @@ pub async fn process_multiple_changes( let mut conn = agent.pool().write_normal().await?; block_in_place(|| { + let start = Instant::now(); let tx = conn.transaction()?; let mut knowns: BTreeMap> = BTreeMap::new(); @@ -1889,10 +1831,8 @@ pub async fn process_multiple_changes( // optimizing this, insert later! let (known, changeset) = if change.is_complete() && change.is_empty() { - if let Err(e) = agent - .tx_empty() - .blocking_send((actor_id, change.versions())) - { + // we never want to block here + if let Err(e) = agent.tx_empty().try_send((actor_id, change.versions())) { error!("could not send empty changed versions into channel: {e}"); } ( @@ -1971,6 +1911,8 @@ pub async fn process_multiple_changes( tx.commit()?; + info!("committed {count} changes in {:?}", start.elapsed()); + for (actor_id, knowns) in knowns { let booked = { bookie @@ -1981,7 +1923,7 @@ pub async fn process_multiple_changes( .for_actor(actor_id) }; let mut booked_write = booked.blocking_write(format!( - "process_multiple_changes(booked writer):{}", + "process_multiple_changes(booked writer, post commit):{}", actor_id.as_simple() )); @@ -2029,6 +1971,7 @@ pub async fn process_multiple_changes( Ok(()) } +#[tracing::instrument(skip(tx, parts), err)] fn process_incomplete_version( tx: &Transaction, actor_id: ActorId, @@ -2119,6 +2062,7 @@ fn process_incomplete_version( }) } +#[tracing::instrument(skip(tx, last_db_version, parts), err)] fn process_complete_version( tx: &Transaction, actor_id: ActorId, @@ -2218,7 +2162,7 @@ fn process_complete_version( }; // in case we got both buffered data and a complete set of changes - clear_buffered_meta(tx, actor_id, version)?; + clear_buffered_meta(tx, actor_id, version..=version)?; for (table_name, count) in changes_per_table { counter!("corro.changes.committed", count, "table" => table_name.to_string(), "source" => "remote"); @@ -2227,6 +2171,7 @@ fn process_complete_version( Ok::<_, rusqlite::Error>((known_version, new_changeset)) } +#[tracing::instrument(skip(tx, last_db_version, change), err)] fn process_single_version( tx: &Transaction, last_db_version: Option, @@ -2306,7 +2251,7 @@ pub enum SyncClientError { #[error("service unavailable right now")] Unavailable, #[error(transparent)] - Connect(#[from] ConnectError), + Connect(#[from] TransportError), #[error("request timed out")] RequestTimedOut, #[error(transparent)] @@ -2345,21 +2290,25 @@ pub enum SyncRecvError { #[error("expected sync clock message, received something else")] ExpectedClockMessage, #[error("timed out waiting for sync message")] - TimedOut, + TimedOut(#[from] Elapsed), #[error("changes channel is closed")] ChangesChannelClosed, + #[error("requests channel is closed")] + RequestsChannelClosed, } +#[tracing::instrument(skip_all, err, level = "debug")] async fn handle_sync(agent: &Agent, transport: &Transport) -> Result<(), SyncClientError> { let sync_state = generate_sync(agent.bookie(), agent.actor_id()).await; + for (actor_id, needed) in sync_state.need.iter() { - gauge!("corro.sync.client.needed", needed.len() as f64, "actor_id" => actor_id.0.to_string()); + gauge!("corro.sync.client.needed", needed.len() as f64, "actor_id" => actor_id.to_string()); } for (actor_id, version) in sync_state.heads.iter() { gauge!("corro.sync.client.head", *version as f64, "actor_id" => actor_id.to_string()); } - let (actor_id, addr) = { + let chosen: Vec<(ActorId, SocketAddr)> = { let candidates = { let members = agent.members().read(); @@ -2367,8 +2316,8 @@ async fn handle_sync(agent: &Agent, transport: &Transport) -> Result<(), SyncCli .states .iter() .filter(|(id, _state)| **id != agent.actor_id()) - .map(|(id, state)| (*id, state.addr)) - .collect::>() + .map(|(id, state)| (*id, state.ring.unwrap_or(255), state.addr)) + .collect::>() }; if candidates.is_empty() { @@ -2377,55 +2326,46 @@ async fn handle_sync(agent: &Agent, transport: &Transport) -> Result<(), SyncCli debug!("found {} candidates to synchronize with", candidates.len()); + let desired_count = cmp::max(cmp::min(candidates.len() / 100, 10), 3); + let mut rng = StdRng::from_entropy(); - let mut choices = candidates.into_iter().choose_multiple(&mut rng, 2); + let mut choices = candidates + .into_iter() + .choose_multiple(&mut rng, desired_count * 2); choices.sort_by(|a, b| { + // most missing actors first sync_state .need_len_for_actor(&b.0) .cmp(&sync_state.need_len_for_actor(&a.0)) + // if equal, look at proximity (via `ring`) + .then_with(|| a.1.cmp(&b.1)) }); - if let Some(chosen) = choices.get(0).cloned() { - chosen - } else { - return Err(SyncClientError::NoGoodCandidate); - } + choices.truncate(desired_count); + choices + .into_iter() + .map(|(actor_id, _, addr)| (actor_id, addr)) + .collect() }; - debug!( - actor_id = %agent.actor_id(), "syncing with: {}, need len: {}", - actor_id, - sync_state.need_len(), - ); - - debug!(actor = %agent.actor_id(), "sync message: {sync_state:?}"); - - increment_counter!("corro.sync.client.member", "id" => actor_id.0.to_string(), "addr" => addr.to_string()); - - histogram!( - "corro.sync.client.request.operations.need.count", - sync_state.need.len() as f64 - ); - - let (tx, rx) = transport - .open_bi(addr) - .await - .map_err(crate::transport::ConnectError::from)?; - - increment_counter!("corro.sync.attempts.count", "id" => actor_id.0.to_string(), "addr" => addr.to_string()); - - // FIXME: check if it's ok to sync (don't overload host) + if chosen.is_empty() { + return Err(SyncClientError::NoGoodCandidate); + } let start = Instant::now(); - let n = bidirectional_sync(agent, sync_state, None, rx, tx).await?; + let n = parallel_sync(agent, transport, chosen.clone(), sync_state).await?; let elapsed = start.elapsed(); if n > 0 { info!( "synced {n} changes w/ {} in {}s @ {} changes/s", - actor_id, + chosen + .into_iter() + .map(|(actor_id, _)| actor_id.to_string()) + .collect::>() + .join(", "), elapsed.as_secs_f64(), n as f64 / elapsed.as_secs_f64() ); @@ -2448,7 +2388,8 @@ async fn handle_changes( loop { tokio::select! { Some((change, src)) = rx_changes.recv() => { - count += std::cmp::max(change.len(), 1); + counter!("corro.agent.changes.recv", std::cmp::max(change.len(), 1) as u64); // count empties... + count += change.len(); // don't count empties buf.push((change, src)); if count < MIN_CHANGES_CHUNK { continue; @@ -2481,7 +2422,9 @@ async fn handle_changes( // drain! while let Ok((change, src)) = rx_changes.try_recv() { - count += std::cmp::max(change.len(), 1); + let changes_count = std::cmp::max(change.len(), 1); + counter!("corro.agent.changes.recv", changes_count as u64); + count += changes_count; buf.push((change, src)); if count >= MIN_CHANGES_CHUNK { // drain and process current changes! @@ -2502,11 +2445,60 @@ async fn handle_changes( const CHECK_EMPTIES_TO_INSERT_AFTER: Duration = Duration::from_secs(120); +async fn write_empties_loop( + agent: Agent, + mut rx_empty: Receiver<(ActorId, RangeInclusive)>, + mut tripwire: Tripwire, +) { + let mut empties: BTreeMap> = BTreeMap::new(); + + let next_empties_check = tokio::time::sleep(CHECK_EMPTIES_TO_INSERT_AFTER); + tokio::pin!(next_empties_check); + + loop { + tokio::select! { + maybe_empty = rx_empty.recv() => match maybe_empty { + Some((actor_id, versions)) => { + empties.entry(actor_id).or_default().insert(versions); + continue; + }, + None => { + debug!("empties queue is done"); + break; + } + }, + _ = &mut next_empties_check => { + next_empties_check.as_mut().reset(tokio::time::Instant::now() + CHECK_EMPTIES_TO_INSERT_AFTER); + if empties.is_empty() { + continue; + } + }, + _ = &mut tripwire => break + } + + if let Err(e) = process_completed_empties(&agent, &mut empties).await { + error!("could not process empties: {e}"); + } + } + info!("Draining empty versions to process..."); + // drain empties channel + while let Ok((actor_id, versions)) = rx_empty.try_recv() { + empties.entry(actor_id).or_default().insert(versions); + } + + if !empties.is_empty() { + info!("inserting last unprocessed empties before shut down"); + if let Err(e) = process_completed_empties(&agent, &mut empties).await { + error!("could not process empties: {e}"); + } + } +} + async fn sync_loop( agent: Agent, transport: Transport, mut rx_apply: Receiver<(ActorId, i64)>, - mut rx_empty: Receiver<(ActorId, RangeInclusive)>, + rx_empty: Receiver<(ActorId, RangeInclusive)>, mut tripwire: Tripwire, ) { let mut sync_backoff = backoff::Backoff::new(0) @@ -2515,61 +2507,12 @@ async fn sync_loop( let next_sync_at = tokio::time::sleep(sync_backoff.next().unwrap()); tokio::pin!(next_sync_at); - spawn_counted({ - let mut tripwire = tripwire.clone(); - let agent = agent.clone(); - async move { - let mut inserted_empties = 0; - let mut empties: BTreeMap>> = BTreeMap::new(); - - let next_empties_check = tokio::time::sleep(CHECK_EMPTIES_TO_INSERT_AFTER); - tokio::pin!(next_empties_check); - - loop { - tokio::select! { - maybe_empty = rx_empty.recv() => match maybe_empty { - Some((actor_id, versions)) => { - empties.entry(actor_id).or_default().push(versions); - inserted_empties += 1; - - if inserted_empties < 1000 { - continue; - } - }, - None => { - debug!("empties queue is done"); - break; - } - }, - _ = &mut next_empties_check => { - next_empties_check.as_mut().reset(tokio::time::Instant::now() + CHECK_EMPTIES_TO_INSERT_AFTER); - if empties.is_empty() { - continue; - } - }, - _ = &mut tripwire => break - } - - inserted_empties = 0; - - if let Err(e) = process_completed_empties(&agent, &mut empties).await { - error!("could not process empties: {e}"); - } - } - info!("Draining empty versions to process..."); - // drain empties channel - while let Ok((actor_id, versions)) = rx_empty.try_recv() { - empties.entry(actor_id).or_default().push(versions); - } - - if !empties.is_empty() { - info!("inserting last unprocessed empties before shut down"); - if let Err(e) = process_completed_empties(&agent, &mut empties).await { - error!("could not process empties: {e}"); - } - } - } - }); + // TODO: move this elsewhere, doesn't have be spawned here... + spawn_counted(write_empties_loop( + agent.clone(), + rx_empty, + tripwire.clone(), + )); loop { enum Branch { @@ -2608,7 +2551,10 @@ async fn sync_loop( break; } tripwire::Outcome::Completed(res) => { - if res.is_err() { + if let Err(e) = res { + if !matches!(e, SyncClientError::NoGoodCandidate) { + error!("could not sync: {e}"); + } // keep syncing until we successfully sync continue; } @@ -2636,50 +2582,44 @@ async fn sync_loop( } } +#[tracing::instrument(skip_all, err)] async fn process_completed_empties( agent: &Agent, - empties: &mut BTreeMap>>, + empties: &mut BTreeMap>, ) -> eyre::Result<()> { info!( "processing empty versions (count: {})", - empties.values().map(Vec::len).sum::() + empties.values().map(RangeInclusiveSet::len).sum::() ); - let mut conn = agent.pool().write_normal().await?; - block_in_place(|| { - let tx = conn.transaction()?; + let mut inserted = 0; - let mut inserted = 0; - while let Some((actor_id, empties)) = empties.pop_first() { - let booked = { - agent - .bookie() - .blocking_write(format!( - "process_completed_empties(for_actor_blocking):{}", - actor_id.as_simple() - )) - .for_actor(actor_id) - }; - let bookedw = booked.blocking_write(format!( - "process_completed_empties(booked writer):{}", - actor_id.as_simple() - )); + let start = Instant::now(); + while let Some((actor_id, empties)) = empties.pop_first() { + let v = empties.into_iter().collect::>(); - for (range, _) in empties - .iter() - .filter_map(|range| bookedw.get_key_value(range.start())) - .dedup() - { - inserted += store_empty_changeset(&tx, actor_id, range.clone())?; - } - } + for ranges in v.chunks(50) { + let mut conn = agent.pool().write_low().await?; + block_in_place(|| { + let tx = conn.transaction()?; - tx.commit()?; + for range in ranges { + inserted += store_empty_changeset(&tx, actor_id, range.clone())?; + } - info!("upserted {inserted} empty versions"); + tx.commit()?; - Ok(()) - }) + Ok::<_, eyre::Report>(()) + })?; + } + } + + info!( + "upserted {inserted} empty version ranges in {:?}", + start.elapsed() + ); + + Ok(()) } pub fn migrate(conn: &mut Connection) -> rusqlite::Result<()> { @@ -3158,7 +3098,7 @@ pub mod tests { let start = Instant::now(); - let mut interval = tokio::time::interval(Duration::from_secs(3)); + let mut interval = tokio::time::interval(Duration::from_secs(1)); interval.set_missed_tick_behavior(MissedTickBehavior::Delay); loop { interval.tick().await; diff --git a/crates/corro-agent/src/api/peer.rs b/crates/corro-agent/src/api/peer.rs index 17375b6f..cbd6be7b 100644 --- a/crates/corro-agent/src/api/peer.rs +++ b/crates/corro-agent/src/api/peer.rs @@ -1,31 +1,45 @@ use std::cmp; -use std::collections::{BTreeSet, HashMap}; +use std::collections::HashMap; use std::net::SocketAddr; use std::ops::RangeInclusive; use std::sync::Arc; use std::time::{Duration, Instant}; -use bytes::{Buf, BufMut, BytesMut}; +use bytes::{BufMut, BytesMut}; use compact_str::format_compact; -use corro_types::agent::{Agent, KnownDbVersion, SplitPool}; -use corro_types::broadcast::{ChangeSource, ChangeV1, Changeset, Timestamp}; +use corro_types::agent::{Agent, KnownDbVersion, KnownVersion, PartialVersion, SplitPool}; +use corro_types::broadcast::{ + BiPayload, BiPayloadV1, ChangeSource, ChangeV1, Changeset, Timestamp, +}; use corro_types::change::{row_to_change, Change}; use corro_types::config::{GossipConfig, TlsClientConfig}; -use corro_types::sync::{SyncMessage, SyncMessageEncodeError, SyncMessageV1, SyncStateV1}; -use futures::{Stream, TryFutureExt}; -use metrics::counter; +use corro_types::sync::{ + generate_sync, SyncMessage, SyncMessageEncodeError, SyncMessageV1, SyncNeedV1, SyncRejectionV1, + SyncRequestV1, SyncStateV1, SyncTraceContextV1, +}; +use futures::stream::FuturesUnordered; +use futures::{Future, Stream, TryFutureExt, TryStreamExt}; +use itertools::Itertools; +use metrics::{counter, increment_counter}; use quinn::{RecvStream, SendStream}; +use rand::seq::SliceRandom; +use rangemap::RangeInclusiveSet; use rusqlite::{params, Connection}; use speedy::Writable; -use tokio::io::{AsyncWrite, AsyncWriteExt}; -use tokio::sync::mpsc::{channel, Sender}; +use tokio::io::AsyncWriteExt; +use tokio::sync::mpsc::{self, unbounded_channel, Sender}; use tokio::task::block_in_place; -use tokio_stream::StreamExt; +use tokio::time::timeout; +use tokio_stream::wrappers::{ReceiverStream, UnboundedReceiverStream}; +use tokio_stream::StreamExt as TokioStreamExt; +// use tokio_stream::StreamExt as TokioStreamExt; use tokio_util::codec::{Encoder, FramedRead, LengthDelimitedCodec}; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing_opentelemetry::OpenTelemetrySpanExt; use crate::agent::SyncRecvError; use crate::api::public::ChunkedChanges; +use crate::transport::{Transport, TransportError}; use corro_types::{ actor::ActorId, @@ -38,6 +52,10 @@ pub enum SyncError { Send(#[from] SyncSendError), #[error(transparent)] Recv(#[from] SyncRecvError), + #[error(transparent)] + Rejection(#[from] SyncRejectionV1), + #[error(transparent)] + Transport(#[from] TransportError), } #[derive(Debug, thiserror::Error)] @@ -45,6 +63,9 @@ pub enum SyncSendError { #[error(transparent)] Io(#[from] std::io::Error), + #[error(transparent)] + Write(#[from] quinn::WriteError), + #[error(transparent)] Rusqlite(#[from] rusqlite::Error), @@ -90,7 +111,7 @@ fn build_quinn_transport_config(config: &GossipConfig) -> quinn::TransportConfig transport_config.max_concurrent_bidi_streams(32u32.into()); // max concurrent unidirectional streams - transport_config.max_concurrent_uni_streams(512u32.into()); + transport_config.max_concurrent_uni_streams(256u32.into()); if let Some(max_mtu) = config.max_mtu { info!("Setting maximum MTU for QUIC at {max_mtu}"); @@ -319,79 +340,11 @@ impl rustls::client::ServerCertVerifier for SkipServerVerification { } } -const MAX_CHANGES_BYTES_PER_MESSAGE: usize = 64 * 1024; -const MIN_CHANGES_BYTES_PER_MESSAGE: usize = 2 * 1024; +const MAX_CHANGES_BYTES_PER_MESSAGE: usize = 8 * 1024; +const MIN_CHANGES_BYTES_PER_MESSAGE: usize = 1024; const ADAPT_CHUNK_SIZE_THRESHOLD: Duration = Duration::from_millis(500); -async fn process_range( - booked: &Booked, - pool: &SplitPool, - range: &RangeInclusive, - actor_id: ActorId, - is_local: bool, - sender: &Sender, -) -> eyre::Result<()> { - let (start, end) = (range.start(), range.end()); - trace!("processing range {start}..={end} for {}", actor_id); - - let overlapping: Vec<(_, KnownDbVersion)> = { - booked - .read(format!( - "process_range(overlapping):{}", - actor_id.as_simple() - )) - .await - .overlapping(range) - .map(|(k, v)| (k.clone(), v.clone())) - .collect() - }; - - for (versions, known_version) in overlapping { - debug!("got overlapping range {versions:?} in {range:?}"); - - let mut processed = BTreeSet::new(); - // optimization, cleared versions can't be revived... sending a single batch! - if let KnownDbVersion::Cleared = &known_version { - sender - .send(SyncMessage::V1(SyncMessageV1::Changeset(ChangeV1 { - actor_id, - changeset: Changeset::Empty { versions }, - }))) - .await?; - continue; - } - - for version in versions { - let known = { - booked - .read(format!("process_range[{version}]:{}", actor_id.as_simple())) - .await - .get(&version) - .cloned() - }; - if let Some(known_version) = known { - process_version( - pool, - actor_id, - is_local, - version, - known_version, - booked, - vec![], - sender, - ) - .await?; - processed.insert(version); - } - } - - debug!("processed versions {processed:?}"); - } - - Ok(()) -} - #[allow(clippy::too_many_arguments)] fn handle_known_version( conn: &mut Connection, @@ -496,7 +449,7 @@ fn handle_known_version( )); let maybe_current_version = match bw.get(&version) { Some(known) => match known { - KnownDbVersion::Partial { seqs, .. } => { + KnownVersion::Partial(PartialVersion { seqs, .. }) => { if seqs != &partial_seqs { info!(%actor_id, version, "different partial sequences, updating! range_needed: {range_needed:?}"); partial_seqs = seqs.clone(); @@ -508,8 +461,8 @@ fn handle_known_version( } None } - known @ KnownDbVersion::Current { .. } => Some(known.clone()), - KnownDbVersion::Cleared => { + known @ KnownVersion::Current(_) => Some(known.into()), + KnownVersion::Cleared => { debug!(%actor_id, version, "in-memory bookkeeping has been cleared, aborting."); break; } @@ -706,95 +659,180 @@ async fn process_sync( local_actor_id: ActorId, pool: SplitPool, bookie: Bookie, - sync_state: SyncStateV1, sender: Sender, + recv: mpsc::Receiver, ) -> eyre::Result<()> { - let booked_actors: HashMap = { - bookie - .read("process_sync") - .await - .iter() - .map(|(k, v)| (*k, v.clone())) - .collect() - }; + let chunked_reqs = ReceiverStream::new(recv).chunks_timeout(10, Duration::from_millis(500)); + tokio::pin!(chunked_reqs); - for (actor_id, booked) in booked_actors { - if actor_id == sync_state.actor_id { - trace!("skipping itself!"); - // don't send the requester's data - continue; - } - // trace!(actor_id = %local_actor_id, "processing sync for {actor_id}, last: {:?}", booked.last().await); + let (job_tx, job_rx) = unbounded_channel(); - let is_local = actor_id == local_actor_id; + let mut buf = + futures::StreamExt::buffer_unordered( + UnboundedReceiverStream::< + std::pin::Pin> + Send>>, + >::new(job_rx), + 6, + ); + + let mut current_haves = vec![]; + let mut partial_needs = vec![]; - // 1. process needed versions - if let Some(needed) = sync_state.need.get(&actor_id) { - for range in needed { - process_range(&booked, &pool, range, actor_id, is_local, &sender).await?; + loop { + let reqs = tokio::select! { + maybe_reqs = chunked_reqs.next() => match maybe_reqs { + Some(reqs) => reqs, + None => break, + }, + Some(res) = buf.next() => { + res?; + continue; + }, + else => { + break; } - } + }; - // 2. process partial needs - if let Some(partially_needed) = sync_state.partial_need.get(&actor_id) { - for (version, seqs_needed) in partially_needed.iter() { - let known = { - booked - .read(format!("process_sync(partials)[{version}]")) - .await - .get(version) - .cloned() - }; - if let Some(known) = known { - process_version( - &pool, - actor_id, - is_local, - *version, - known, - &booked, - seqs_needed.clone(), - &sender, - ) - .await?; + let agg = reqs + .into_iter() + .flatten() + .group_by(|req| req.0) + .into_iter() + .map(|(actor_id, reqs)| (actor_id, reqs.flat_map(|(_, reqs)| reqs).collect())) + .collect::)>>(); + + for (actor_id, needs) in agg { + let booked = match { bookie.read("process_sync").await.get(&actor_id).cloned() } { + Some(booked) => booked, + None => continue, + }; + + let is_local = actor_id == local_actor_id; + + let mut cleared: RangeInclusiveSet = RangeInclusiveSet::new(); + + { + let read = booked.read("process_need(full)").await; + + for need in needs { + match need { + SyncNeedV1::Full { versions } => { + for version in versions { + match read.get(&version) { + Some(KnownVersion::Cleared) => { + cleared.insert(version..=version); + } + Some(known) => { + current_haves.push((version, KnownDbVersion::from(known))); + } + None => continue, + } + } + } + SyncNeedV1::Partial { version, seqs } => match read.get(&version) { + Some(KnownVersion::Cleared) => { + cleared.insert(version..=version); + } + Some(known) => { + partial_needs.push((version, KnownDbVersion::from(known), seqs)); + } + None => continue, + }, + } } } - } - // 3. process newer-than-heads - let their_last_version = sync_state.heads.get(&actor_id).copied().unwrap_or(0); - let our_last_version = booked - .read(format!( - "process_sync(our_last_version):{}", - actor_id.as_simple() - )) - .await - .last() - .unwrap_or(0); + for versions in cleared { + let sender = sender.clone(); + if job_tx + .send(Box::pin(async move { + sender + .send(SyncMessage::V1(SyncMessageV1::Changeset(ChangeV1 { + actor_id, + changeset: Changeset::Empty { versions }, + }))) + .await + .map_err(eyre::Report::from) + })) + .is_err() + { + eyre::bail!("could not send into job channel"); + } + } - trace!(actor_id = %local_actor_id, "their last version: {their_last_version} vs ours: {our_last_version}"); + for (version, known_version) in current_haves.drain(..) { + let pool = pool.clone(); + let booked = booked.clone(); + let sender = sender.clone(); + if job_tx + .send(Box::pin(async move { + process_version( + &pool, + actor_id, + is_local, + version, + known_version, + &booked, + vec![], + &sender, + ) + .await + })) + .is_err() + { + eyre::bail!("could not send into job channel"); + } + } - if their_last_version >= our_last_version { - // nothing to teach the other node! - continue; + for (version, known_version, seqs_needed) in partial_needs.drain(..) { + let pool = pool.clone(); + let booked = booked.clone(); + let sender = sender.clone(); + if job_tx + .send(Box::pin(async move { + process_version( + &pool, + actor_id, + is_local, + version, + known_version, + &booked, + seqs_needed, + &sender, + ) + .await + })) + .is_err() + { + eyre::bail!("could not send into job channel"); + } + } } - - process_range( - &booked, - &pool, - &((their_last_version + 1)..=our_last_version), - actor_id, - is_local, - &sender, - ) - .await?; } + debug!("done w/ sync server loop"); + + drop(job_tx); + + buf.try_collect().await?; debug!("done processing sync state"); Ok(()) } +fn chunk_range( + range: RangeInclusive, + chunk_size: i64, +) -> impl Iterator> { + range + .clone() + .step_by(chunk_size as usize) + .map(move |block_start| { + let block_end = (block_start + chunk_size).min(*range.end()); + block_start..=block_end + }) +} + fn encode_sync_msg( codec: &mut LengthDelimitedCodec, encode_buf: &mut BytesMut, @@ -804,44 +842,67 @@ fn encode_sync_msg( msg.write_to_stream(encode_buf.writer()) .map_err(SyncMessageEncodeError::from)?; + let data = encode_buf.split().freeze(); + trace!("encoded sync message, len: {}", data.len()); + codec.encode(data, send_buf)?; + Ok(()) +} + +async fn encode_write_bipayload_msg( + codec: &mut LengthDelimitedCodec, + encode_buf: &mut BytesMut, + send_buf: &mut BytesMut, + msg: BiPayload, + write: &mut SendStream, +) -> Result<(), SyncSendError> { + encode_bipayload_msg(codec, encode_buf, send_buf, msg)?; + + write_buf(send_buf, write).await +} + +fn encode_bipayload_msg( + codec: &mut LengthDelimitedCodec, + encode_buf: &mut BytesMut, + send_buf: &mut BytesMut, + msg: BiPayload, +) -> Result<(), SyncSendError> { + msg.write_to_stream(encode_buf.writer()) + .map_err(SyncMessageEncodeError::from)?; + codec.encode(encode_buf.split().freeze(), send_buf)?; Ok(()) } -async fn encode_write_sync_msg( +async fn encode_write_sync_msg( codec: &mut LengthDelimitedCodec, encode_buf: &mut BytesMut, send_buf: &mut BytesMut, msg: SyncMessage, - write: &mut W, + write: &mut SendStream, ) -> Result<(), SyncSendError> { encode_sync_msg(codec, encode_buf, send_buf, msg)?; - write_sync_buf(send_buf, write).await + write_buf(send_buf, write).await } -async fn write_sync_buf( - send_buf: &mut BytesMut, - write: &mut W, -) -> Result<(), SyncSendError> { - while send_buf.has_remaining() { - let n = write.write_buf(send_buf).await?; - if n == 0 { - break; - } - counter!("corro.sync.chunk.sent.bytes", n as u64); - } +#[tracing::instrument(skip_all, fields(buf_size = send_buf.len()), err)] +async fn write_buf(send_buf: &mut BytesMut, write: &mut SendStream) -> Result<(), SyncSendError> { + let len = send_buf.len(); + write.write_chunk(send_buf.split().freeze()).await?; + counter!("corro.sync.chunk.sent.bytes", len as u64); Ok(()) } +#[tracing::instrument(skip(read), fields(buf_size = tracing::field::Empty), err)] pub async fn read_sync_msg> + Unpin>( read: &mut R, ) -> Result, SyncRecvError> { - match tokio::time::timeout(Duration::from_secs(5), read.next()).await { - Ok(Some(buf_res)) => match buf_res { + match read.next().await { + Some(buf_res) => match buf_res { Ok(mut buf) => { counter!("corro.sync.chunk.recv.bytes", buf.len() as u64); + tracing::Span::current().record("buf_size", buf.len()); match SyncMessage::from_buf(&mut buf) { Ok(msg) => Ok(Some(msg)), Err(e) => Err(SyncRecvError::from(e)), @@ -849,47 +910,430 @@ pub async fn read_sync_msg> + Unpin>( } Err(e) => Err(SyncRecvError::from(e)), }, - Ok(None) => Ok(None), - Err(_e) => Err(SyncRecvError::TimedOut), + None => Ok(None), } } -pub async fn bidirectional_sync( +#[tracing::instrument(skip_all, err)] +pub async fn parallel_sync( agent: &Agent, + transport: &Transport, + members: Vec<(ActorId, SocketAddr)>, our_sync_state: SyncStateV1, - their_sync_state: Option, - read: RecvStream, - mut write: SendStream, ) -> Result { - let (tx, mut rx) = channel::(256); + trace!( + self_actor_id = %agent.actor_id(), + "parallel syncing w/ {}", + members + .iter() + .map(|(actor_id, _)| actor_id.to_string()) + .collect::>() + .join(", ") + ); + + let mut trace_ctx = SyncTraceContextV1::default(); + opentelemetry::global::get_text_map_propagator(|prop| { + prop.inject_context(&tracing::Span::current().context(), &mut trace_ctx) + }); + + let results = FuturesUnordered::from_iter(members.iter().map(|(actor_id, addr)| { + let trace_ctx = trace_ctx.clone(); + async { + ( + *actor_id, + *addr, + async { + let mut codec = LengthDelimitedCodec::new(); + let mut send_buf = BytesMut::new(); + let mut encode_buf = BytesMut::new(); + + let actor_id = *actor_id; + let (mut tx, rx) = transport.open_bi(*addr).await?; + let mut read = FramedRead::new(rx, LengthDelimitedCodec::new()); + + encode_write_bipayload_msg( + &mut codec, + &mut encode_buf, + &mut send_buf, + BiPayload::V1(BiPayloadV1::SyncStart {actor_id: agent.actor_id(), trace_ctx}), + &mut tx, + ).instrument(info_span!("write_sync_start")) + .await?; + + trace!(%actor_id, self_actor_id = %agent.actor_id(), "sent start payload"); + + encode_write_sync_msg( + &mut codec, + &mut encode_buf, + &mut send_buf, + SyncMessage::V1(SyncMessageV1::Clock(agent.clock().new_timestamp().into())), + &mut tx, + ).instrument(info_span!("write_sync_clock")) + .await?; + + trace!(%actor_id, self_actor_id = %agent.actor_id(), "sent clock payload"); + tx.flush().instrument(info_span!("quic_flush")).await.map_err(SyncSendError::from)?; + + trace!(%actor_id, self_actor_id = %agent.actor_id(), "flushed sync payloads"); + + let their_sync_state = match timeout(Duration::from_secs(2), read_sync_msg(&mut read)).instrument(info_span!("read_sync_state")).await.map_err(SyncRecvError::from)?? { + Some(SyncMessage::V1(SyncMessageV1::State(state))) => state, + Some(SyncMessage::V1(SyncMessageV1::Rejection(rejection))) => { + return Err(rejection.into()) + } + Some(_) => return Err(SyncRecvError::ExpectedSyncState.into()), + None => return Err(SyncRecvError::UnexpectedEndOfStream.into()), + }; + trace!(%actor_id, self_actor_id = %agent.actor_id(), "read state payload: {their_sync_state:?}"); + + match timeout(Duration::from_secs(2), read_sync_msg(&mut read)).instrument(info_span!("read_sync_clock")).await.map_err(SyncRecvError::from)?? { + Some(SyncMessage::V1(SyncMessageV1::Clock(ts))) => match actor_id.try_into() { + Ok(id) => { + if let Err(e) = agent + .clock() + .update_with_timestamp(&uhlc::Timestamp::new(ts.to_ntp64(), id)) + { + warn!("could not update clock from actor {actor_id}: {e}"); + } + } + Err(e) => { + error!("could not convert ActorId to uhlc ID: {e}"); + } + }, + Some(_) => return Err(SyncRecvError::ExpectedClockMessage.into()), + None => return Err(SyncRecvError::UnexpectedEndOfStream.into()), + } + trace!(%actor_id, self_actor_id = %agent.actor_id(), "read clock payload"); + + increment_counter!("corro.sync.client.member", "id" => actor_id.to_string(), "addr" => addr.to_string()); + + let needs = our_sync_state.compute_available_needs(&their_sync_state); - let mut read = FramedRead::new(read, LengthDelimitedCodec::new()); + trace!(%actor_id, self_actor_id = %agent.actor_id(), "computed needs"); + Ok::<_, SyncError>((needs, tx, read)) + }.await + ) + }.instrument(info_span!("sync_client_handshake", %actor_id, %addr)) + })) + .collect::)>>() + .await; + + debug!("collected member needs and such!"); + + let syncers = results.into_iter().fold(Ok(vec![]), |agg, (actor_id, addr, res)| { + match res { + Ok((needs, tx, read)) => { + let mut v = agg.unwrap_or_default(); + v.push((actor_id, addr, needs, tx, read)); + Ok(v) + }, + Err(e) => { + increment_counter!("corro.sync.client.handshake.errors", "actor_id" => actor_id.to_string(), "addr" => addr.to_string(), "error" => e.to_string()); + match agg { + Ok(v) if !v.is_empty() => Ok(v), + _ => Err(e) + } + } + } + })?; + + let len = syncers.len(); + + let (readers, mut servers) = { + let mut rng = rand::thread_rng(); + syncers.into_iter().fold( + (Vec::with_capacity(len), Vec::with_capacity(len)), + |(mut readers, mut servers), (actor_id, addr, needs, tx, read)| { + if needs.is_empty() { + trace!(%actor_id, "no needs!"); + return (readers, servers); + } + readers.push((actor_id, read)); + + trace!(%actor_id, "needs: {needs:?}"); + + info!(%actor_id, %addr, "needs len: {}", needs.values().map(|needs| needs.iter().map(|need| match need { + SyncNeedV1::Full {versions} => (versions.end() - versions.start()) as usize + 1, + SyncNeedV1::Partial {..} => 0, + }).sum::()).sum::()); + + servers.push(( + actor_id, + addr, + needs + .into_iter() + .flat_map(|(actor_id, needs)| { + let mut needs: Vec<_> = needs + .into_iter() + .flat_map(|need| match need { + // chunk the versions, sometimes it's 0..=1000000 and that's far too big for a chunk! + SyncNeedV1::Full { versions } => chunk_range(versions, 10) + .map(|versions| SyncNeedV1::Full { versions }) + .collect(), + + need => vec![need], + }) + .collect(); + + // NOTE: IMPORTANT! shuffle the vec so we don't keep looping over the same later on + needs.shuffle(&mut rng); + + needs + .into_iter() + .map(|need| (actor_id, need)) + .collect::>() + }) + .collect::>(), + tx, + )); + + (readers, servers) + }, + ) + }; + + if readers.is_empty() && servers.is_empty() { + return Ok(0); + } + + tokio::spawn(async move { + // reusable buffers and constructs + let mut codec = LengthDelimitedCodec::new(); + let mut send_buf = BytesMut::new(); + let mut encode_buf = BytesMut::new(); + + // already requested full versions + let mut req_full: HashMap> = HashMap::new(); + + // already requested partial version sequences + let mut req_partials: HashMap<(ActorId, i64), RangeInclusiveSet> = HashMap::new(); + + let start = Instant::now(); + + loop { + if servers.is_empty() { + break; + } + let mut next_servers = Vec::with_capacity(servers.len()); + 'servers: for (server_actor_id, addr, mut needs, mut tx) in servers { + if needs.is_empty() { + continue; + } + for (actor_id, need) in needs.drain(0..cmp::min(10, needs.len())) { + let actual_needs = match need { + SyncNeedV1::Full { versions } => { + let range = req_full.entry(actor_id).or_default(); + + let mut new_versions = + RangeInclusiveSet::from_iter([versions.clone()].into_iter()); + + // check if we've already requested + for overlap in range.overlapping(&versions) { + new_versions.remove(overlap.clone()); + } + + if new_versions.is_empty() { + continue; + } + + new_versions + .into_iter() + .map(|versions| { + range.remove(versions.clone()); + SyncNeedV1::Full { versions } + }) + .collect() + } + SyncNeedV1::Partial { version, seqs } => { + let range = req_partials.entry((actor_id, version)).or_default(); + let mut new_seqs = + RangeInclusiveSet::from_iter(seqs.clone().into_iter()); + + for seqs in seqs { + for overlap in range.overlapping(&seqs) { + new_seqs.remove(overlap.clone()); + } + } + + if new_seqs.is_empty() { + continue; + } + + vec![SyncNeedV1::Partial { + version, + seqs: new_seqs + .into_iter() + .map(|seqs| { + range.remove(seqs.clone()); + seqs + }) + .collect(), + }] + } + }; + + if actual_needs.is_empty() { + warn!(%server_actor_id, %actor_id, %addr, "nothing to send!"); + continue; + } + + let req_len = actual_needs.len(); + + if let Err(e) = encode_sync_msg( + &mut codec, + &mut encode_buf, + &mut send_buf, + SyncMessage::V1(SyncMessageV1::Request(vec![(actor_id, actual_needs)])), + ) { + error!(%server_actor_id, %actor_id, %addr, "could not encode sync request: {e} (elapsed: {:?})", start.elapsed()); + continue 'servers; + } + + counter!("corro.sync.client.req.sent", req_len as u64, "actor_id" => server_actor_id.to_string()); + } + + if !send_buf.is_empty() { + if let Err(e) = write_buf(&mut send_buf, &mut tx).await { + error!(%server_actor_id, %addr, "could not write sync requests: {e} (elapsed: {:?})", start.elapsed()); + continue; + } + } + + if needs.is_empty() { + if let Err(e) = tx.finish().instrument(info_span!("quic_finish")).await { + warn!("could not finish stream while sending sync requests: {e}"); + } + info!(%server_actor_id, %addr, "done trying to sync w/ actor after {:?}", start.elapsed()); + continue; + } + + next_servers.push((server_actor_id, addr, needs, tx)); + } + servers = next_servers; + } + }.instrument(info_span!("send_sync_requests"))); + + // now handle receiving changesets! + + let counts = FuturesUnordered::from_iter(readers.into_iter().map(|(actor_id, mut read)| { + let tx_changes = agent.tx_changes().clone(); + async move { + let mut count = 0; + + loop { + match read_sync_msg(&mut read).await { + Ok(None) => { + break; + } + Err(e) => { + error!(%actor_id, "sync recv error: {e}"); + break; + } + Ok(Some(msg)) => match msg { + SyncMessage::V1(SyncMessageV1::Changeset(change)) => { + let changes_len = cmp::max(change.len(), 1); + // tracing::Span::current().record("changes_len", changes_len); + count += changes_len; + counter!("corro.sync.changes.recv", changes_len as u64, "actor_id" => actor_id.to_string()); + tx_changes + .send((change, ChangeSource::Sync)) + .await + .map_err(|_| SyncRecvError::ChangesChannelClosed)?; + } + SyncMessage::V1(SyncMessageV1::Request(_)) => { + warn!("received sync request message unexpectedly, ignoring"); + continue; + } + SyncMessage::V1(SyncMessageV1::State(_)) => { + warn!("received sync state message unexpectedly, ignoring"); + continue; + } + SyncMessage::V1(SyncMessageV1::Clock(_)) => { + warn!("received sync clock message unexpectedly, ignoring"); + continue; + } + SyncMessage::V1(SyncMessageV1::Rejection(rejection)) => { + return Err(rejection.into()) + } + }, + } + } + + info!(%actor_id, %count, "done reading sync messages"); + + Ok(count) + }.instrument(info_span!("read_sync_requests_responses", %actor_id)) + })) + .collect::>>() + .await; + + for res in counts.iter() { + if let Err(e) = res { + error!("could not properly recv from peer: {e}"); + } + } + + Ok(counts.into_iter().flatten().sum::()) +} + +#[tracing::instrument(skip(agent, their_actor_id, read, write), fields(actor_id = %their_actor_id), err)] +pub async fn serve_sync( + agent: &Agent, + their_actor_id: ActorId, + trace_ctx: SyncTraceContextV1, + mut read: FramedRead, + mut write: SendStream, +) -> Result { + let context = + opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&trace_ctx)); + tracing::Span::current().set_parent(context); + + info!(actor_id = %their_actor_id, self_actor_id = %agent.actor_id(), "received sync request"); let mut codec = LengthDelimitedCodec::new(); let mut send_buf = BytesMut::new(); let mut encode_buf = BytesMut::new(); + // read the clock + match read_sync_msg(&mut read) + .instrument(info_span!("read_peer_clock")) + .await? + { + Some(SyncMessage::V1(SyncMessageV1::Clock(ts))) => match their_actor_id.try_into() { + Ok(id) => { + if let Err(e) = agent + .clock() + .update_with_timestamp(&uhlc::Timestamp::new(ts.to_ntp64(), id)) + { + warn!("could not update clock from actor {their_actor_id}: {e}"); + } + } + Err(e) => { + error!("could not convert ActorId to uhlc ID: {e}"); + } + }, + Some(_) => return Err(SyncRecvError::ExpectedClockMessage.into()), + None => return Err(SyncRecvError::UnexpectedEndOfStream.into()), + } + + trace!(actor_id = %their_actor_id, self_actor_id = %agent.actor_id(), "read clock"); + + let sync_state = generate_sync(agent.bookie(), agent.actor_id()).await; + + // first, send the current sync state encode_write_sync_msg( &mut codec, &mut encode_buf, &mut send_buf, - SyncMessage::V1(SyncMessageV1::State(our_sync_state)), + SyncMessage::V1(SyncMessageV1::State(sync_state)), &mut write, ) + .instrument(info_span!("write_sync_state")) .await?; - write.flush().await.map_err(SyncSendError::from)?; - - let their_sync_state = match their_sync_state { - Some(state) => state, - None => match read_sync_msg(&mut read).await? { - Some(SyncMessage::V1(SyncMessageV1::State(state))) => state, - Some(_) => return Err(SyncRecvError::ExpectedSyncState.into()), - None => return Err(SyncRecvError::UnexpectedEndOfStream.into()), - }, - }; - let their_actor_id = their_sync_state.actor_id; + trace!(actor_id = %their_actor_id, self_actor_id = %agent.actor_id(), "sent sync state"); + // then the current clock's timestamp encode_write_sync_msg( &mut codec, &mut encode_buf, @@ -897,48 +1341,58 @@ pub async fn bidirectional_sync( SyncMessage::V1(SyncMessageV1::Clock(agent.clock().new_timestamp().into())), &mut write, ) + .instrument(info_span!("write_sync_clock")) .await?; - write.flush().await.map_err(SyncSendError::from)?; + trace!(actor_id = %their_actor_id, self_actor_id = %agent.actor_id(), "sent clock"); - match read_sync_msg(&mut read).await? { - Some(SyncMessage::V1(SyncMessageV1::Clock(ts))) => match their_actor_id.try_into() { - Ok(id) => { - if let Err(e) = agent - .clock() - .update_with_timestamp(&uhlc::Timestamp::new(ts.to_ntp64(), id)) - { - warn!("could not update clock from actor {their_actor_id}: {e}"); - } - } - Err(e) => { - error!("could not convert ActorId to uhlc ID: {e}"); - } - }, - Some(_) => return Err(SyncRecvError::ExpectedClockMessage.into()), - None => return Err(SyncRecvError::UnexpectedEndOfStream.into()), - } + // ensure we flush here so the data gets there fast. clock needs to be fresh! + write + .flush() + .instrument(info_span!("quic_flush")) + .await + .map_err(SyncSendError::from)?; + trace!(actor_id = %their_actor_id, self_actor_id = %agent.actor_id(), "flushed sync messages"); + + let (tx_need, rx_need) = mpsc::channel(1024); + let (tx, mut rx) = mpsc::channel::(256); tokio::spawn( process_sync( agent.actor_id(), agent.pool().clone(), agent.bookie().clone(), - their_sync_state, tx, + rx_need, ) + .instrument(info_span!("process_sync")) .inspect_err(|e| error!("could not process sync request: {e}")), ); - let tx_changes = agent.tx_changes().clone(); - - let (_sent_count, recv_count) = tokio::try_join!( + let (send_res, recv_res) = tokio::join!( async move { let mut count = 0; let mut check_buf = tokio::time::interval(Duration::from_secs(1)); + let mut stopped = false; + loop { tokio::select! { + biased; + + stopped_res = write.stopped() => { + match stopped_res { + Ok(code) => { + info!(actor_id = %their_actor_id, "send stream was stopped by peer, code: {code}"); + }, + Err(e) => { + warn!(actor_id = %their_actor_id, "error waiting for stop from stream: {e}"); + } + } + stopped = true; + break; + }, + maybe_msg = rx.recv() => match maybe_msg { Some(msg) => { if let SyncMessage::V1(SyncMessageV1::Changeset(change)) = &msg { @@ -947,27 +1401,30 @@ pub async fn bidirectional_sync( encode_sync_msg(&mut codec, &mut encode_buf, &mut send_buf, msg)?; if send_buf.len() >= 16 * 1024 { - write_sync_buf(&mut send_buf, &mut write).await?; + write_buf(&mut send_buf, &mut write).await?; } }, None => { break; } }, + _ = check_buf.tick() => { if !send_buf.is_empty() { - write_sync_buf(&mut send_buf, &mut write).await?; + write_buf(&mut send_buf, &mut write).await?; } } } } - if !send_buf.is_empty() { - write_sync_buf(&mut send_buf, &mut write).await?; - } + if !stopped { + if !send_buf.is_empty() { + write_buf(&mut send_buf, &mut write).await?; + } - if let Err(e) = write.finish().await { - warn!("could not properly finish QUIC send stream: {e}"); + if let Err(e) = write.finish().await { + warn!("could not properly finish QUIC send stream: {e}"); + } } debug!(actor_id = %agent.actor_id(), "done writing sync messages (count: {count})"); @@ -975,7 +1432,7 @@ pub async fn bidirectional_sync( counter!("corro.sync.changes.sent", count as u64, "actor_id" => their_actor_id.to_string()); Ok::<_, SyncError>(count) - }, + }.instrument(info_span!("process_versions_to_send")), async move { let mut count = 0; @@ -989,34 +1446,51 @@ pub async fn bidirectional_sync( break; } Ok(Some(msg)) => match msg { - SyncMessage::V1(SyncMessageV1::Changeset(change)) => { - count += change.len(); - tx_changes - .send((change, ChangeSource::Sync)) + SyncMessage::V1(SyncMessageV1::Request(req)) => { + trace!(actor_id = %their_actor_id, self_actor_id = %agent.actor_id(), "read req: {req:?}"); + count += req + .iter() + .map(|(_, needs)| { + needs.iter().map(|need| need.count()).sum::() + }) + .sum::(); + tx_need + .send(req) .await - .map_err(|_| SyncRecvError::ChangesChannelClosed)?; + .map_err(|_| SyncRecvError::RequestsChannelClosed)?; + } + SyncMessage::V1(SyncMessageV1::Changeset(_)) => { + warn!(actor_id = %their_actor_id, "received sync changeset message unexpectedly, ignoring"); + continue; } SyncMessage::V1(SyncMessageV1::State(_)) => { - warn!("received sync state message more than once, ignoring"); + warn!(actor_id = %their_actor_id, "received sync state message unexpectedly, ignoring"); continue; } SyncMessage::V1(SyncMessageV1::Clock(_)) => { - warn!("received sync clock message more than once, ignoring"); + warn!(actor_id = %their_actor_id, "received sync clock message more than once, ignoring"); continue; } + SyncMessage::V1(SyncMessageV1::Rejection(rejection)) => { + return Err(rejection.into()) + } }, } } debug!(actor_id = %agent.actor_id(), "done reading sync messages"); - counter!("corro.sync.changes.recv", count as u64, "actor_id" => their_actor_id.to_string()); + counter!("corro.sync.requests.recv", count as u64, "actor_id" => their_actor_id.to_string()); Ok(count) - } - )?; + }.instrument(info_span!("process_version_requests")) + ); + + if let Err(e) = send_res { + error!(actor_id = %their_actor_id, "could not complete serving sync due to a send side error: {e}"); + } - Ok(recv_count) + recv_res } #[cfg(test)] @@ -1149,8 +1623,8 @@ mod tests { { let read = booked.read("test").await; - assert_eq!(read.get(&1).unwrap().clone(), known1); - assert_eq!(read.get(&2).unwrap().clone(), known2); + assert_eq!(KnownDbVersion::from(read.get(&1).unwrap()), known1); + assert_eq!(KnownDbVersion::from(read.get(&2).unwrap()), known2); } { diff --git a/crates/corro-agent/src/api/public/mod.rs b/crates/corro-agent/src/api/public/mod.rs index 81fa64b2..f4e1df1a 100644 --- a/crates/corro-agent/src/api/public/mod.rs +++ b/crates/corro-agent/src/api/public/mod.rs @@ -197,7 +197,7 @@ where return Ok((ret, start.elapsed())); } - let last_version = book_writer.last().unwrap_or(0); + let last_version = book_writer.last().unwrap_or_default(); trace!("last_version: {last_version}"); let version = last_version + 1; trace!("version: {version}"); @@ -306,6 +306,7 @@ where }) } +#[tracing::instrument(skip_all, err)] fn execute_statement(tx: &Transaction, stmt: &Statement) -> rusqlite::Result { let mut prepped = match &stmt { Statement::Simple(q) => tx.prepare(q), @@ -326,6 +327,7 @@ fn execute_statement(tx: &Transaction, stmt: &Statement) -> rusqlite::Result, diff --git a/crates/corro-agent/src/broadcast/mod.rs b/crates/corro-agent/src/broadcast/mod.rs index 83dddfa5..1170aaa7 100644 --- a/crates/corro-agent/src/broadcast/mod.rs +++ b/crates/corro-agent/src/broadcast/mod.rs @@ -33,7 +33,7 @@ use tripwire::Tripwire; use corro_types::{ actor::Actor, agent::Agent, - broadcast::{BroadcastInput, DispatchRuntime, FocaInput, UniPayload, UniPayloadV1}, + broadcast::{BroadcastInput, DispatchRuntime, FocaCmd, FocaInput, UniPayload, UniPayloadV1}, members::MemberEvent, }; @@ -348,6 +348,16 @@ pub fn runtime_loop( error!("foca apply_many error: {e}"); } } + FocaInput::Cmd(cmd) => match cmd { + FocaCmd::MembershipStates(sender) => { + for member in foca.iter_membership_state() { + if let Err(e) = sender.send(member.clone()).await { + error!("could not send back foca membership: {e}"); + break; + } + } + } + }, }, Branch::MemberEvents(evts) => { trace!("handling Branch::MemberEvents"); @@ -472,6 +482,10 @@ pub fn runtime_loop( trace!("handling Branch::Metrics"); { gauge!("corro.gossip.members", foca.num_members() as f64); + gauge!( + "corro.gossip.member.states", + foca.iter_membership_state().count() as f64 + ); gauge!( "corro.gossip.updates_backlog", foca.updates_backlog() as f64 @@ -614,7 +628,11 @@ pub fn runtime_loop( let members = agent.members().read(); for addr in members.ring0() { // this spawns, so we won't be holding onto the read lock for long - transmit_broadcast(payload.clone(), transport.clone(), addr); + tokio::spawn(transmit_broadcast( + payload.clone(), + transport.clone(), + addr, + )); } if local_bcast_buf.len() >= BROADCAST_CUTOFF { @@ -684,7 +702,11 @@ pub fn runtime_loop( for addr in broadcast_to { debug!(actor = %actor_id, "broadcasting {} bytes to: {addr} (send count: {})", pending.payload.len(), pending.send_count); - transmit_broadcast(pending.payload.clone(), transport.clone(), addr); + tokio::spawn(transmit_broadcast( + pending.payload.clone(), + transport.clone(), + addr, + )); } pending.send_count = pending.send_count.wrapping_add(1); @@ -714,7 +736,7 @@ fn make_foca_config(cluster_size: NonZeroU32) -> foca::Config { // max payload size for udp datagrams, use a safe value here... // TODO: calculate from smallest max datagram size for all QUIC conns - config.max_packet_size = 1200.try_into().unwrap(); + config.max_packet_size = 1178.try_into().unwrap(); config } @@ -744,33 +766,20 @@ impl PendingBroadcast { } } -fn transmit_broadcast(payload: Bytes, transport: Transport, addr: SocketAddr) { - tokio::spawn(async move { - trace!("singly broadcasting to {addr}"); - let mut stream = match transport.open_uni(addr).await { - Ok(s) => s, - Err(e) => { - error!("could not open unidirectional stream to {addr}: {e}"); - return; - } - }; +#[tracing::instrument(skip(payload, transport), fields(buf_size = payload.len()), level = "debug")] +async fn transmit_broadcast(payload: Bytes, transport: Transport, addr: SocketAddr) { + trace!("singly broadcasting to {addr}"); - match tokio::time::timeout(Duration::from_secs(5), stream.write_all(&payload)).await { - Err(_e) => { - warn!("timed out writing broadcast to uni stream"); - return; - } - Ok(Err(e)) => { - error!("could not write to uni stream to {addr}: {e}"); - return; - } - Ok(Ok(_)) => { - counter!("corro.peer.stream.bytes.sent.total", payload.len() as u64, "type" => "uni"); - } + let len = payload.len(); + match tokio::time::timeout(Duration::from_secs(5), transport.send_uni(addr, payload)).await { + Err(_e) => { + warn!("timed out writing broadcast to uni stream"); } - - if let Err(e) = stream.finish().await { - debug!("could not finish broadcast uni stream to {addr}: {e}"); + Ok(Err(e)) => { + error!("could not write to uni stream to {addr}: {e}"); } - }); + Ok(Ok(_)) => { + counter!("corro.peer.stream.bytes.sent.total", len as u64, "type" => "uni"); + } + } } diff --git a/crates/corro-agent/src/transport.rs b/crates/corro-agent/src/transport.rs index e5958e68..c0d0120f 100644 --- a/crates/corro-agent/src/transport.rs +++ b/crates/corro-agent/src/transport.rs @@ -1,51 +1,72 @@ use std::{ collections::HashMap, + hash::{Hash, Hasher}, net::SocketAddr, sync::Arc, time::{Duration, Instant}, }; use bytes::Bytes; -use metrics::{histogram, increment_counter}; +use corro_types::config::GossipConfig; +use metrics::{gauge, histogram, increment_counter}; use quinn::{ ApplicationClose, Connection, ConnectionError, Endpoint, RecvStream, SendDatagramError, - SendStream, + SendStream, WriteError, }; -use tokio::sync::{mpsc, RwLock}; -use tracing::{debug, warn}; +use quinn_proto::ConnectionStats; +use tokio::{ + sync::{mpsc, Mutex, RwLock}, + time::error::Elapsed, +}; +use tracing::{debug, info_span, warn, Instrument}; + +use crate::api::peer::gossip_client_endpoint; #[derive(Debug, Clone)] pub struct Transport(Arc); #[derive(Debug)] struct TransportInner { - endpoint: Endpoint, - conns: RwLock>, + endpoints: Vec, + conns: RwLock>>>>, rtt_tx: mpsc::Sender<(SocketAddr, Duration)>, } #[derive(Debug, thiserror::Error)] -pub enum ConnectError { +pub enum TransportError { #[error(transparent)] Connect(#[from] quinn::ConnectError), #[error(transparent)] Connection(#[from] quinn::ConnectionError), #[error(transparent)] Datagram(#[from] SendDatagramError), + #[error(transparent)] + SendStreamWrite(#[from] WriteError), + #[error(transparent)] + TimedOut(#[from] Elapsed), } impl Transport { - pub fn new(endpoint: Endpoint, rtt_tx: mpsc::Sender<(SocketAddr, Duration)>) -> Self { - Self(Arc::new(TransportInner { - endpoint, + pub async fn new( + config: &GossipConfig, + rtt_tx: mpsc::Sender<(SocketAddr, Duration)>, + ) -> eyre::Result { + let mut endpoints = vec![]; + for _ in 0..4 { + endpoints.push(gossip_client_endpoint(config).await?); + } + Ok(Self(Arc::new(TransportInner { + endpoints, conns: Default::default(), rtt_tx, - })) + }))) } - pub async fn send_datagram(&self, addr: SocketAddr, data: Bytes) -> Result<(), ConnectError> { + #[tracing::instrument(skip(self, data), fields(buf_size = data.len()), level = "debug", err)] + pub async fn send_datagram(&self, addr: SocketAddr, data: Bytes) -> Result<(), TransportError> { let conn = self.connect(addr).await?; debug!("connected to {addr}"); + match conn.send_datagram(data.clone()) { Ok(send) => { debug!("sent datagram to {addr}"); @@ -56,6 +77,9 @@ impl Transport { } Err(e) => { increment_counter!("corro.transport.send_datagram.errors", "addr" => addr.to_string(), "error" => e.to_string()); + if matches!(e, SendDatagramError::TooLarge) { + warn!(%addr, "attempted to send a larger-than-PMTU datagram. len: {}, pmtu: {:?}", data.len(), conn.max_datagram_size()); + } return Err(e.into()); } } @@ -65,28 +89,48 @@ impl Transport { Ok(conn.send_datagram(data)?) } - pub async fn open_uni(&self, addr: SocketAddr) -> Result { + #[tracing::instrument(skip(self, data), fields(buf_size = data.len()), level = "debug", err)] + pub async fn send_uni(&self, addr: SocketAddr, data: Bytes) -> Result<(), TransportError> { let conn = self.connect(addr).await?; - match conn.open_uni().await { - Ok(send) => return Ok(send), + + let mut stream = match conn + .open_uni() + .instrument(info_span!("quic_open_uni")) + .await + { + Ok(stream) => stream, Err(e @ ConnectionError::VersionMismatch) => { return Err(e.into()); } Err(e) => { debug!("retryable error attempting to open unidirectional stream: {e}"); + let conn = self.connect(addr).await?; + conn.open_uni() + .instrument(info_span!("quic_open_uni")) + .await? } - } + }; - let conn = self.connect(addr).await?; - Ok(conn.open_uni().await?) + stream + .write_chunk(data) + .instrument(info_span!("quic_write_chunk")) + .await?; + + stream + .finish() + .instrument(info_span!("quic_finish")) + .await?; + + Ok(()) } + #[tracing::instrument(skip(self), err)] pub async fn open_bi( &self, addr: SocketAddr, - ) -> Result<(SendStream, RecvStream), ConnectError> { + ) -> Result<(SendStream, RecvStream), TransportError> { let conn = self.connect(addr).await?; - match conn.open_bi().await { + match conn.open_bi().instrument(info_span!("quic_open_bi")).await { Ok(send_recv) => return Ok(send_recv), Err(e @ ConnectionError::VersionMismatch) => { return Err(e.into()); @@ -98,52 +142,416 @@ impl Transport { // retry, it should reconnect! let conn = self.connect(addr).await?; - Ok(conn.open_bi().await?) + Ok(conn + .open_bi() + .instrument(info_span!("quic_open_bi")) + .await?) } - async fn connect(&self, addr: SocketAddr) -> Result { - let server_name = addr.ip().to_string(); + async fn measured_connect( + &self, + addr: SocketAddr, + server_name: String, + ) -> Result { + let start = Instant::now(); + + let mut hasher = seahash::SeaHasher::new(); + addr.hash(&mut hasher); + let endpoint_idx = (hasher.finish() % self.0.endpoints.len() as u64) as usize; + + async { + match tokio::time::timeout(Duration::from_secs(5), self + .0 + .endpoints[endpoint_idx] + .connect(addr, &server_name)?) + .await + { + Ok(Ok(conn)) => { + histogram!( + "corro.transport.connect.time.seconds", + start.elapsed().as_secs_f64() + ); + tracing::Span::current().record("rtt", conn.rtt().as_secs_f64()); + Ok(conn) + }, + Ok(Err(e)) => { + increment_counter!("corro.transport.connect.errors", "addr" => server_name, "error" => e.to_string()); + Err(e.into()) + } + Err(e) => { + increment_counter!("corro.transport.connect.errors", "addr" => server_name, "error" => "timed out"); + Err(e.into()) + } + } + }.instrument(info_span!("quic_connect", %addr, rtt = tracing::field::Empty)).await + } + // this shouldn't block for long... + async fn get_lock(&self, addr: SocketAddr) -> Arc>> { { let r = self.0.conns.read().await; - if let Some(conn) = r.get(&addr).cloned() { - if test_conn(&conn) { - if let Err(e) = self.0.rtt_tx.try_send((addr, conn.rtt())) { - debug!("could not send RTT for connection through sender: {e}"); - } - return Ok(conn); - } + if let Some(lock) = r.get(&addr) { + return lock.clone(); } } - let conn = { - let mut w = self.0.conns.write().await; - if let Some(conn) = w.get(&addr).cloned() { - if test_conn(&conn) { - return Ok(conn); + let mut w = self.0.conns.write().await; + w.entry(addr).or_default().clone() + } + + #[tracing::instrument(skip(self), fields(tid = ?std::thread::current().id()), level = "debug", err)] + async fn connect(&self, addr: SocketAddr) -> Result { + let conn_lock = self.get_lock(addr).await; + + let mut lock = conn_lock.lock().await; + + if let Some(conn) = lock.as_ref() { + if test_conn(conn) { + if let Err(e) = self.0.rtt_tx.try_send((addr, conn.rtt())) { + debug!("could not send RTT for connection through sender: {e}"); } + return Ok(conn.clone()); } + } - let start = Instant::now(); - let conn = match self.0.endpoint.connect(addr, server_name.as_str())?.await { - Ok(conn) => { - histogram!("corro.transport.connect.time.seconds", start.elapsed().as_secs_f64(), "addr" => server_name); - conn - } - Err(e) => { - increment_counter!("corro.transport.connect.errors", "addr" => server_name, "error" => e.to_string()); - return Err(e.into()); - } - }; + // clear it, if there was one it didn't pass the test. + *lock = None; - if let Err(e) = self.0.rtt_tx.try_send((addr, conn.rtt())) { - debug!("could not send RTT for connection through sender: {e}"); - } - w.insert(addr, conn.clone()); - conn + let conn = self.measured_connect(addr, addr.ip().to_string()).await?; + *lock = Some(conn.clone()); + Ok(conn) + } + + pub fn emit_metrics(&self) { + let conns = { + let read = self.0.conns.blocking_read(); + read.iter() + .filter_map(|(addr, conn)| { + conn.blocking_lock() + .as_ref() + .map(|conn| (*addr, conn.stats())) + }) + .collect::>() }; - Ok(conn) + gauge!("corro.transport.connections", conns.len() as f64); + + // make aggregate stats for all connections... so as to not overload a metrics server + let stats = conns + .iter() + .fold(ConnectionStats::default(), |mut acc, (addr, stats)| { + gauge!("corro.transport.path.cwnd", stats.path.cwnd as f64, "addr" => addr.to_string()); + gauge!("corro.transport.path.congestion_events", stats.path.congestion_events as f64, "addr" => addr.to_string()); + gauge!("corro.transport.path.black_holes_detected", stats.path.black_holes_detected as f64, "addr" => addr.to_string()); + + acc.path.lost_packets += stats.path.lost_packets; + acc.path.lost_bytes += stats.path.lost_bytes; + acc.path.sent_packets += stats.path.sent_packets; + acc.path.sent_plpmtud_probes += stats.path.sent_plpmtud_probes; + acc.path.lost_plpmtud_probes += stats.path.lost_plpmtud_probes; + + acc.frame_rx.acks += stats.frame_rx.acks; + acc.frame_rx.crypto += stats.frame_rx.crypto; + acc.frame_rx.connection_close += stats.frame_rx.connection_close; + acc.frame_rx.data_blocked += stats.frame_rx.data_blocked; + acc.frame_rx.datagram += stats.frame_rx.datagram; + acc.frame_rx.handshake_done += stats.frame_rx.handshake_done; + acc.frame_rx.max_data += stats.frame_rx.max_data; + acc.frame_rx.max_stream_data += stats.frame_rx.max_stream_data; + acc.frame_rx.max_streams_bidi += stats.frame_rx.max_streams_bidi; + acc.frame_rx.max_streams_uni += stats.frame_rx.max_streams_uni; + acc.frame_rx.new_connection_id += stats.frame_rx.new_connection_id; + acc.frame_rx.new_token += stats.frame_rx.new_token; + acc.frame_rx.path_challenge += stats.frame_rx.path_challenge; + acc.frame_rx.path_response += stats.frame_rx.path_response; + acc.frame_rx.ping += stats.frame_rx.ping; + acc.frame_rx.reset_stream += stats.frame_rx.reset_stream; + acc.frame_rx.retire_connection_id += stats.frame_rx.retire_connection_id; + acc.frame_rx.stream_data_blocked += stats.frame_rx.stream_data_blocked; + acc.frame_rx.streams_blocked_bidi += stats.frame_rx.streams_blocked_bidi; + acc.frame_rx.streams_blocked_uni += stats.frame_rx.streams_blocked_uni; + acc.frame_rx.stop_sending += stats.frame_rx.stop_sending; + acc.frame_rx.stream += stats.frame_rx.stream; + + acc.frame_tx.acks += stats.frame_tx.acks; + acc.frame_tx.crypto += stats.frame_tx.crypto; + acc.frame_tx.connection_close += stats.frame_tx.connection_close; + acc.frame_tx.data_blocked += stats.frame_tx.data_blocked; + acc.frame_tx.datagram += stats.frame_tx.datagram; + acc.frame_tx.handshake_done += stats.frame_tx.handshake_done; + acc.frame_tx.max_data += stats.frame_tx.max_data; + acc.frame_tx.max_stream_data += stats.frame_tx.max_stream_data; + acc.frame_tx.max_streams_bidi += stats.frame_tx.max_streams_bidi; + acc.frame_tx.max_streams_uni += stats.frame_tx.max_streams_uni; + acc.frame_tx.new_connection_id += stats.frame_tx.new_connection_id; + acc.frame_tx.new_token += stats.frame_tx.new_token; + acc.frame_tx.path_challenge += stats.frame_tx.path_challenge; + acc.frame_tx.path_response += stats.frame_tx.path_response; + acc.frame_tx.ping += stats.frame_tx.ping; + acc.frame_tx.reset_stream += stats.frame_tx.reset_stream; + acc.frame_tx.retire_connection_id += stats.frame_tx.retire_connection_id; + acc.frame_tx.stream_data_blocked += stats.frame_tx.stream_data_blocked; + acc.frame_tx.streams_blocked_bidi += stats.frame_tx.streams_blocked_bidi; + acc.frame_tx.streams_blocked_uni += stats.frame_tx.streams_blocked_uni; + acc.frame_tx.stop_sending += stats.frame_tx.stop_sending; + acc.frame_tx.stream += stats.frame_tx.stream; + + acc.udp_rx.bytes += stats.udp_rx.bytes; + acc.udp_rx.datagrams += stats.udp_rx.datagrams; + acc.udp_rx.transmits += stats.udp_rx.transmits; + + acc.udp_tx.bytes += stats.udp_tx.bytes; + acc.udp_tx.datagrams += stats.udp_tx.datagrams; + acc.udp_tx.transmits += stats.udp_tx.transmits; + + acc + }); + gauge!( + "corro.transport.path.lost_packets", + stats.path.lost_packets as f64 + ); + gauge!( + "corro.transport.path.lost_bytes", + stats.path.lost_bytes as f64 + ); + gauge!( + "corro.transport.path.sent_packets", + stats.path.sent_packets as f64 + ); + gauge!( + "corro.transport.path.sent_plpmtud_probes", + stats.path.sent_plpmtud_probes as f64 + ); + gauge!( + "corro.transport.path.lost_plpmtud_probes", + stats.path.lost_plpmtud_probes as f64 + ); + + gauge!("corro.transport.frame_rx", stats.frame_rx.acks as f64, "type" => "acks"); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.crypto as f64, + "type" => "crypto" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.connection_close as f64, + "type" => "connection_close" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.data_blocked as f64, + "type" => "data_blocked" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.datagram as f64, + "type" => "datagram" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.handshake_done as f64, + "type" => "handshake_done" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.max_data as f64, + "type" => "max_data" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.max_stream_data as f64, + "type" => "max_stream_data" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.max_streams_bidi as f64, + "type" => "max_streams_bidi" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.max_streams_uni as f64, + "type" => "max_streams_uni" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.new_connection_id as f64, + "type" => "new_connection_id" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.new_token as f64, + "type" => "new_token" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.path_challenge as f64, + "type" => "path_challenge" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.path_response as f64, + "type" => "path_response" + ); + gauge!("corro.transport.frame_rx", stats.frame_rx.ping as f64, "type" => "ping"); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.reset_stream as f64, + "type" => "reset_stream" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.retire_connection_id as f64, + "type" => "retire_connection_id" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.stream_data_blocked as f64, + "type" => "stream_data_blocked" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.streams_blocked_bidi as f64, + "type" => "streams_blocked_bidi" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.streams_blocked_uni as f64, + "type" => "streams_blocked_uni" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.stop_sending as f64, + "type" => "stop_sending" + ); + gauge!( + "corro.transport.frame_rx", + stats.frame_rx.stream as f64, + "type" => "stream" + ); + + gauge!("corro.transport.frame_tx", stats.frame_tx.acks as f64, "type" => "acks"); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.crypto as f64, + "type" => "crypto" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.connection_close as f64, + "type" => "connection_close" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.data_blocked as f64, + "type" => "data_blocked" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.datagram as f64, + "type" => "datagram" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.handshake_done as f64, + "type" => "handshake_done" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.max_data as f64, + "type" => "max_data" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.max_stream_data as f64, + "type" => "max_stream_data" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.max_streams_bidi as f64, + "type" => "max_streams_bidi" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.max_streams_uni as f64, + "type" => "max_streams_uni" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.new_connection_id as f64, + "type" => "new_connection_id" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.new_token as f64, + "type" => "new_token" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.path_challenge as f64, + "type" => "path_challenge" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.path_response as f64, + "type" => "path_response" + ); + gauge!("corro.transport.frame_tx", stats.frame_tx.ping as f64, "type" => "ping"); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.reset_stream as f64, + "type" => "reset_stream" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.retire_connection_id as f64, + "type" => "retire_connection_id" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.stream_data_blocked as f64, + "type" => "stream_data_blocked" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.streams_blocked_bidi as f64, + "type" => "streams_blocked_bidi" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.streams_blocked_uni as f64, + "type" => "streams_blocked_uni" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.stop_sending as f64, + "type" => "stop_sending" + ); + gauge!( + "corro.transport.frame_tx", + stats.frame_tx.stream as f64, + "type" => "stream" + ); + + gauge!("corro.transport.udp_rx.bytes", stats.udp_rx.bytes as f64); + gauge!( + "corro.transport.udp_rx.datagrams", + stats.udp_rx.datagrams as f64 + ); + gauge!( + "corro.transport.udp_rx.transmits", + stats.udp_rx.transmits as f64 + ); + + gauge!("corro.transport.udp_tx.bytes", stats.udp_tx.bytes as f64); + gauge!( + "corro.transport.udp_tx.datagrams", + stats.udp_tx.datagrams as f64 + ); + gauge!( + "corro.transport.udp_tx.transmits", + stats.udp_tx.transmits as f64 + ); } } diff --git a/crates/corro-types/Cargo.toml b/crates/corro-types/Cargo.toml index 3a059c76..cd9bec9a 100644 --- a/crates/corro-types/Cargo.toml +++ b/crates/corro-types/Cargo.toml @@ -24,6 +24,7 @@ indexmap = { workspace = true } itertools = { workspace = true } metrics = { workspace = true } once_cell = { workspace = true } +opentelemetry = { workspace = true } parking_lot = { workspace = true } rand = { workspace = true } rangemap = { workspace = true } diff --git a/crates/corro-types/src/agent.rs b/crates/corro-types/src/agent.rs index 45e987f7..2e7ca705 100644 --- a/crates/corro-types/src/agent.rs +++ b/crates/corro-types/src/agent.rs @@ -17,14 +17,14 @@ use compact_str::CompactString; use indexmap::IndexMap; use metrics::{gauge, histogram}; use parking_lot::RwLock; -use rangemap::{RangeInclusiveMap, RangeInclusiveSet}; +use rangemap::RangeInclusiveSet; use rusqlite::{Connection, InterruptHandle}; use serde::{Deserialize, Serialize}; use tokio::{ runtime::Handle, sync::{ mpsc::{channel, Sender}, - oneshot, + oneshot, Semaphore, }, }; use tokio::{ @@ -35,12 +35,12 @@ use tokio::{ task::block_in_place, }; use tokio_util::sync::{CancellationToken, DropGuard}; -use tracing::{debug, error, info}; +use tracing::{debug, error, info, Instrument}; use tripwire::Tripwire; use crate::{ actor::ActorId, - broadcast::{BroadcastInput, ChangeSource, ChangeV1, Timestamp}, + broadcast::{BroadcastInput, ChangeSource, ChangeV1, FocaInput, Timestamp}, config::Config, pubsub::MatcherHandle, schema::NormalizedSchema, @@ -68,6 +68,7 @@ pub struct AgentConfig { pub tx_apply: Sender<(ActorId, i64)>, pub tx_empty: Sender<(ActorId, RangeInclusive)>, pub tx_changes: Sender<(ChangeV1, ChangeSource)>, + pub tx_foca: Sender, pub schema: RwLock, pub tripwire: Tripwire, @@ -87,7 +88,14 @@ pub struct AgentInner { tx_apply: Sender<(ActorId, i64)>, tx_empty: Sender<(ActorId, RangeInclusive)>, tx_changes: Sender<(ChangeV1, ChangeSource)>, + tx_foca: Sender, schema: RwLock, + limits: Limits, +} + +#[derive(Debug, Clone)] +pub struct Limits { + pub sync: Arc, } impl Agent { @@ -106,7 +114,11 @@ impl Agent { tx_apply: config.tx_apply, tx_empty: config.tx_empty, tx_changes: config.tx_changes, + tx_foca: config.tx_foca, schema: config.schema, + limits: Limits { + sync: Arc::new(Semaphore::new(3)), + }, })) } @@ -150,6 +162,10 @@ impl Agent { &self.0.tx_empty } + pub fn tx_foca(&self) -> &Sender { + &self.0.tx_foca + } + pub fn bookie(&self) -> &Bookie { &self.0.bookie } @@ -177,6 +193,10 @@ impl Agent { pub fn set_config(&self, new_conf: Config) { self.0.config.store(Arc::new(new_conf)) } + + pub fn limits(&self) -> &Limits { + &self.0.limits + } } #[derive(Debug, Clone)] @@ -332,14 +352,17 @@ impl SplitPool { } // get a read-only connection + #[tracing::instrument(skip(self), level = "debug")] pub async fn read(&self) -> Result, SqlitePoolError> { self.0.read.get().await } + #[tracing::instrument(skip(self), level = "debug")] pub fn read_blocking(&self) -> Result, SqlitePoolError> { Handle::current().block_on(self.0.read.get()) } + #[tracing::instrument(skip(self), level = "debug")] pub async fn dedicated(&self) -> rusqlite::Result { block_in_place(|| { let mut conn = rusqlite::Connection::open(&self.0.path)?; @@ -349,16 +372,19 @@ impl SplitPool { } // get a high priority write connection (e.g. client input) + #[tracing::instrument(skip(self))] pub async fn write_priority(&self) -> Result { self.write_inner(&self.0.priority_tx, "priority").await } // get a normal priority write connection (e.g. sync process) + #[tracing::instrument(skip(self))] pub async fn write_normal(&self) -> Result { self.write_inner(&self.0.normal_tx, "normal").await } // get a low priority write connection (e.g. background tasks) + #[tracing::instrument(skip(self))] pub async fn write_low(&self) -> Result { self.write_inner(&self.0.low_tx, "low").await } @@ -375,12 +401,15 @@ impl SplitPool { histogram!("corro.sqlite.pool.queue.seconds", start.elapsed().as_secs_f64(), "queue" => queue); let conn = self.0.write.get().await?; - tokio::spawn(timeout_wait( - token.clone(), - conn.get_interrupt_handle(), - Duration::from_secs(30), - queue, - )); + tokio::spawn( + timeout_wait( + token.clone(), + conn.get_interrupt_handle(), + Duration::from_secs(30), + queue, + ) + .in_current_span(), + ); Ok(WriteConn { conn, @@ -484,6 +513,7 @@ impl CountedTokioRwLock { } } + #[tracing::instrument(skip(self, label), level = "debug")] pub async fn write>( &self, label: C, @@ -491,6 +521,7 @@ impl CountedTokioRwLock { self.registry.acquire_write(label, &self.lock).await } + #[tracing::instrument(skip(self, label), level = "debug")] pub fn blocking_write>( &self, label: C, @@ -498,6 +529,7 @@ impl CountedTokioRwLock { self.registry.acquire_blocking_write(label, &self.lock) } + #[tracing::instrument(skip(self, label), level = "debug")] pub fn blocking_write_owned>( &self, label: C, @@ -506,12 +538,25 @@ impl CountedTokioRwLock { .acquire_blocking_write_owned(label, self.lock.clone()) } + #[tracing::instrument(skip(self, label), level = "debug")] + pub fn blocking_read>( + &self, + label: C, + ) -> CountedTokioRwLockReadGuard<'_, T> { + self.registry.acquire_blocking_read(label, &self.lock) + } + + #[tracing::instrument(skip(self, label), level = "debug")] pub async fn read>( &self, label: C, ) -> CountedTokioRwLockReadGuard<'_, T> { self.registry.acquire_read(label, &self.lock).await } + + pub fn registry(&self) -> &LockRegistry { + &self.registry + } } pub struct CountedTokioRwLockWriteGuard<'a, T> { @@ -694,6 +739,30 @@ impl LockRegistry { CountedTokioRwLockReadGuard { lock: w, _tracker } } + fn acquire_blocking_read<'a, T, C: Into>( + &self, + label: C, + lock: &'a TokioRwLock, + ) -> CountedTokioRwLockReadGuard<'a, T> { + let id = self.gen_id(); + self.insert_lock( + id, + LockMeta { + label: label.into(), + kind: LockKind::Read, + state: LockState::Acquiring, + started_at: Instant::now(), + }, + ); + let _tracker = LockTracker { + id, + registry: self.clone(), + }; + let w = lock.blocking_read(); + self.set_lock_state(&id, LockState::Locked); + CountedTokioRwLockReadGuard { lock: w, _tracker } + } + fn set_lock_state(&self, id: &LockId, state: LockState) { if let Some(meta) = self.map.write().get_mut(id) { meta.state = state @@ -734,23 +803,102 @@ impl Drop for LockTracker { } } -#[derive(Default)] -pub struct BookedVersions(pub RangeInclusiveMap); +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct CurrentVersion { + // cr-sqlite db version + pub db_version: i64, + // actual last sequence originally produced + pub last_seq: i64, + // timestamp when the change was produced by the source + pub ts: Timestamp, +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct PartialVersion { + // range of sequences recorded + pub seqs: RangeInclusiveSet, + // actual last sequence originally produced + pub last_seq: i64, + // timestamp when the change was produced by the source + pub ts: Timestamp, +} + +impl From for KnownDbVersion { + fn from(PartialVersion { seqs, last_seq, ts }: PartialVersion) -> Self { + KnownDbVersion::Partial { seqs, last_seq, ts } + } +} + +#[derive(Debug)] +pub enum KnownVersion<'a> { + Cleared, + Current(&'a CurrentVersion), + Partial(&'a PartialVersion), +} + +impl<'a> KnownVersion<'a> { + pub fn is_cleared(&self) -> bool { + matches!(self, KnownVersion::Cleared) + } +} + +impl<'a> From> for KnownDbVersion { + fn from(value: KnownVersion<'a>) -> Self { + match value { + KnownVersion::Cleared => KnownDbVersion::Cleared, + KnownVersion::Current(CurrentVersion { + db_version, + last_seq, + ts, + }) => KnownDbVersion::Current { + db_version: *db_version, + last_seq: *last_seq, + ts: *ts, + }, + KnownVersion::Partial(PartialVersion { seqs, last_seq, ts }) => { + KnownDbVersion::Partial { + seqs: seqs.clone(), + last_seq: *last_seq, + ts: *ts, + } + } + } + } +} + +#[derive(Default, Clone)] +pub struct BookedVersions { + pub cleared: RangeInclusiveSet, + pub current: BTreeMap, + pub partials: BTreeMap, +} impl BookedVersions { + pub fn contains_version(&self, version: &i64) -> bool { + self.cleared.contains(version) + || self.current.contains_key(version) + || self.partials.contains_key(version) + } + + pub fn get(&self, version: &i64) -> Option { + self.cleared + .get(version) + .map(|_| KnownVersion::Cleared) + .or_else(|| self.current.get(version).map(KnownVersion::Current)) + .or_else(|| self.partials.get(version).map(KnownVersion::Partial)) + } + pub fn contains(&self, version: i64, seqs: Option<&RangeInclusive>) -> bool { - match seqs { - Some(check_seqs) => match self.0.get(&version) { - Some(known) => match known { - KnownDbVersion::Partial { seqs, .. } => { - check_seqs.clone().all(|seq| seqs.contains(&seq)) + self.contains_version(&version) + && seqs + .map(|check_seqs| match self.get(&version) { + Some(KnownVersion::Cleared) | Some(KnownVersion::Current(_)) => true, + Some(KnownVersion::Partial(partial)) => { + check_seqs.clone().all(|seq| partial.seqs.contains(&seq)) } - KnownDbVersion::Current { .. } | KnownDbVersion::Cleared => true, - }, - None => false, - }, - None => self.0.contains_key(&version), - } + None => false, + }) + .unwrap_or(true) } pub fn contains_all( @@ -762,24 +910,26 @@ impl BookedVersions { } pub fn contains_current(&self, version: &i64) -> bool { - matches!(self.0.get(version), Some(KnownDbVersion::Current { .. })) + self.current.contains_key(version) } pub fn current_versions(&self) -> BTreeMap { - self.0 + self.current .iter() - .filter_map(|(range, known)| { - if let KnownDbVersion::Current { db_version, .. } = known { - Some((*db_version, *range.start())) - } else { - None - } - }) + .map(|(version, current)| (current.db_version, *version)) .collect() } pub fn last(&self) -> Option { - self.0.iter().map(|(k, _v)| *k.end()).max() + std::cmp::max( + // TODO: we probably don't need to traverse all of that... + // maybe use `skip` based on the len + self.cleared.iter().map(|k| *k.end()).max(), + std::cmp::max( + self.current.last_key_value().map(|(k, _)| *k), + self.partials.last_key_value().map(|(k, _)| *k), + ), + ) } pub fn insert(&mut self, version: i64, known_version: KnownDbVersion) { @@ -787,15 +937,43 @@ impl BookedVersions { } pub fn insert_many(&mut self, versions: RangeInclusive, known_version: KnownDbVersion) { - self.0.insert(versions, known_version); + match known_version { + KnownDbVersion::Partial { seqs, last_seq, ts } => { + self.partials + .insert(*versions.start(), PartialVersion { seqs, last_seq, ts }); + } + KnownDbVersion::Current { + db_version, + last_seq, + ts, + } => { + let version = *versions.start(); + self.partials.remove(&version); + self.current.insert( + version, + CurrentVersion { + db_version, + last_seq, + ts, + }, + ); + } + KnownDbVersion::Cleared => { + for version in versions.clone() { + self.partials.remove(&version); + self.current.remove(&version); + } + self.cleared.insert(versions); + } + } } -} -impl Deref for BookedVersions { - type Target = RangeInclusiveMap; + pub fn all_versions(&self) -> RangeInclusiveSet { + let mut versions = self.cleared.clone(); + versions.extend(self.current.keys().map(|key| *key..=*key)); + versions.extend(self.partials.keys().map(|key| *key..=*key)); - fn deref(&self) -> &Self::Target { - &self.0 + versions } } @@ -830,6 +1008,13 @@ impl Booked { self.0.blocking_write(label) } + pub fn blocking_read>( + &self, + label: L, + ) -> CountedTokioRwLockReadGuard<'_, BookedVersions> { + self.0.blocking_read(label) + } + pub fn blocking_write_owned>( &self, label: L, @@ -856,10 +1041,6 @@ impl BookieInner { }) .clone() } - - pub fn registry(&self) -> &LockRegistry { - &self.registry - } } impl Deref for BookieInner { @@ -908,4 +1089,8 @@ impl Bookie { ) -> CountedTokioRwLockWriteGuard { self.0.blocking_write(label) } + + pub fn registry(&self) -> &LockRegistry { + self.0.registry() + } } diff --git a/crates/corro-types/src/broadcast.rs b/crates/corro-types/src/broadcast.rs index 83191e07..730c4c16 100644 --- a/crates/corro-types/src/broadcast.rs +++ b/crates/corro-types/src/broadcast.rs @@ -16,13 +16,13 @@ use rusqlite::{ use serde::{Deserialize, Serialize}; use speedy::{Context, Readable, Reader, Writable, Writer}; use time::OffsetDateTime; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{self, Sender}; use tracing::{error, trace}; use uhlc::{ParseNTP64Error, NTP64}; use crate::{ actor::{Actor, ActorId}, - sync::SyncStateV1, + sync::SyncTraceContextV1, }; #[derive(Debug, Clone, Readable, Writable)] @@ -42,7 +42,11 @@ pub enum BiPayload { #[derive(Debug, Clone, Readable, Writable)] pub enum BiPayloadV1 { - SyncState(SyncStateV1), + SyncStart { + actor_id: ActorId, + #[speedy(default_on_eof)] + trace_ctx: SyncTraceContextV1, + }, } #[derive(Debug)] @@ -51,6 +55,12 @@ pub enum FocaInput { Data(Bytes), ClusterSize(NonZeroU32), ApplyMany(Vec>), + Cmd(FocaCmd), +} + +#[derive(Debug)] +pub enum FocaCmd { + MembershipStates(mpsc::Sender>), } #[derive(Debug, Clone, Readable, Writable)] diff --git a/crates/corro-types/src/config.rs b/crates/corro-types/src/config.rs index ee1db705..ef56afd6 100644 --- a/crates/corro-types/src/config.rs +++ b/crates/corro-types/src/config.rs @@ -16,7 +16,7 @@ pub struct Config { pub admin: AdminConfig, #[serde(default)] - pub telemetry: Option, + pub telemetry: TelemetryConfig, #[serde(default)] pub log: LogConfig, @@ -24,13 +24,24 @@ pub struct Config { pub consul: Option, } +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct TelemetryConfig { + pub prometheus: Option, + pub open_telemetry: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PrometheusConfig { + #[serde(alias = "addr")] + pub bind_addr: SocketAddr, +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] -pub enum TelemetryConfig { - Prometheus { - #[serde(alias = "addr")] - bind_addr: SocketAddr, - }, +pub enum OtelConfig { + FromEnv, + Exporter { endpoint: String }, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -178,7 +189,7 @@ pub struct ConfigBuilder { gossip_addr: Option, api_addr: Option, admin_path: Option, - metrics_addr: Option, + prometheus_addr: Option, bootstrap: Option>, log: Option, schema_paths: Vec, @@ -203,8 +214,8 @@ impl ConfigBuilder { self } - pub fn metrics_addr(mut self, addr: SocketAddr) -> Self { - self.metrics_addr = Some(addr); + pub fn prometheus_addr(mut self, addr: SocketAddr) -> Self { + self.prometheus_addr = Some(addr); self } @@ -245,6 +256,14 @@ impl ConfigBuilder { pub fn build(self) -> Result { let db_path = self.db_path.ok_or(ConfigBuilderError::DbPathRequired)?; + + let telemetry = TelemetryConfig { + prometheus: self + .prometheus_addr + .map(|bind_addr| PrometheusConfig { bind_addr }), + open_telemetry: None, + }; + Ok(Config { db: DbConfig { path: db_path, @@ -269,9 +288,7 @@ impl ConfigBuilder { admin: AdminConfig { uds_path: self.admin_path.unwrap_or_else(default_admin_path), }, - telemetry: self - .metrics_addr - .map(|bind_addr| TelemetryConfig::Prometheus { bind_addr }), + telemetry, log: self.log.unwrap_or_default(), consul: self.consul, diff --git a/crates/corro-types/src/members.rs b/crates/corro-types/src/members.rs index 908070f9..ae925b71 100644 --- a/crates/corro-types/src/members.rs +++ b/crates/corro-types/src/members.rs @@ -49,7 +49,7 @@ impl Members { // A result of `true` means that the effective list of // cluster member addresses has changed - pub fn add_member(&mut self, actor: &Actor) -> bool { + pub fn add_member(&mut self, actor: &Actor) -> (bool, bool) { let actor_id = actor.id(); let member = self .states @@ -58,12 +58,14 @@ impl Members { trace!("member: {member:?}"); - if actor.ts().to_time() < member.ts.to_time() { + if actor.ts().to_duration() < member.ts.to_duration() { debug!("older timestamp, ignoring"); - return false; + return (false, false); } - let newer = actor.ts() > member.ts; + // sometimes, this can be equal + let newer = actor.ts().to_duration() > member.ts.to_duration(); + let same = actor.ts().to_duration() == member.ts.to_duration(); if newer { member.addr = actor.addr(); @@ -73,7 +75,7 @@ impl Members { self.recalculate_rings(actor.addr()); } - newer + (newer, same) } // A result of `true` means that the effective list of diff --git a/crates/corro-types/src/sync.rs b/crates/corro-types/src/sync.rs index de1dafb1..2a191378 100644 --- a/crates/corro-types/src/sync.rs +++ b/crates/corro-types/src/sync.rs @@ -1,13 +1,16 @@ -use std::{collections::HashMap, io, ops::RangeInclusive}; +use std::{cmp, collections::HashMap, io, ops::RangeInclusive}; use bytes::BytesMut; +use opentelemetry::propagation::{Extractor, Injector}; +use rangemap::RangeInclusiveSet; use serde::{Deserialize, Serialize}; use speedy::{Readable, Writable}; use tokio_util::codec::{Decoder, LengthDelimitedCodec}; +use tracing::warn; use crate::{ actor::ActorId, - agent::{Booked, Bookie, KnownDbVersion}, + agent::{Booked, Bookie}, broadcast::{ChangeV1, Timestamp}, }; @@ -21,6 +24,53 @@ pub enum SyncMessageV1 { State(SyncStateV1), Changeset(ChangeV1), Clock(Timestamp), + Rejection(SyncRejectionV1), + Request(SyncRequestV1), +} + +#[derive(Debug, Default, Clone, PartialEq, Readable, Writable)] +pub struct SyncTraceContextV1 { + pub traceparent: Option, + pub tracestate: Option, +} + +impl Injector for SyncTraceContextV1 { + fn set(&mut self, key: &str, value: String) { + match key { + "traceparent" if !value.is_empty() => self.traceparent = Some(value), + "tracestate" if !value.is_empty() => self.tracestate = Some(value), + _ => {} + } + } +} + +impl Extractor for SyncTraceContextV1 { + fn get(&self, key: &str) -> Option<&str> { + match key { + "traceparent" => self.traceparent.as_deref(), + "tracestate" => self.tracestate.as_deref(), + _ => None, + } + } + + fn keys(&self) -> Vec<&str> { + let mut v = Vec::with_capacity(2); + if self.traceparent.is_some() { + v.push("traceparent"); + } + if self.tracestate.is_some() { + v.push("tracestate"); + } + v + } +} + +pub type SyncRequestV1 = Vec<(ActorId, Vec)>; + +#[derive(Debug, thiserror::Error, Clone, PartialEq, Readable, Writable)] +pub enum SyncRejectionV1 { + #[error("max concurrency reached")] + MaxConcurrencyReached, } #[derive(Debug, Default, Clone, PartialEq, Readable, Writable, Serialize, Deserialize)] @@ -66,6 +116,149 @@ impl SyncStateV1 { .map(|partials| partials.len() as i64) .unwrap_or(0) } + + pub fn compute_available_needs( + &self, + other: &SyncStateV1, + ) -> HashMap> { + let mut needs: HashMap> = HashMap::new(); + + for (actor_id, head) in other.heads.iter() { + if *actor_id == self.actor_id { + continue; + } + if *head == 0 { + warn!(actor_id = %other.actor_id, "sent a 0 head version for actor id {}", actor_id); + continue; + } + let other_haves = { + let mut haves = RangeInclusiveSet::from_iter([(1..=*head)].into_iter()); + + // remove needs + if let Some(other_need) = other.need.get(actor_id) { + for need in other_need.iter() { + // create gaps + haves.remove(need.clone()); + } + } + + // remove partials + if let Some(other_partials) = other.partial_need.get(actor_id) { + for (v, _) in other_partials.iter() { + haves.remove(*v..=*v); + } + } + + // we are left with all the versions they fully have! + + haves + }; + + if let Some(our_need) = self.need.get(actor_id) { + for range in our_need.iter() { + for overlap in other_haves.overlapping(range) { + let start = cmp::max(range.start(), overlap.start()); + let end = cmp::min(range.end(), overlap.end()); + needs.entry(*actor_id).or_default().push(SyncNeedV1::Full { + versions: *start..=*end, + }) + } + } + } + + if let Some(our_partials) = self.partial_need.get(actor_id) { + for (v, seqs) in our_partials.iter() { + if other_haves.contains(v) { + needs + .entry(*actor_id) + .or_default() + .push(SyncNeedV1::Partial { + version: *v, + seqs: seqs.clone(), + }); + } else if let Some(other_seqs) = other + .partial_need + .get(actor_id) + .and_then(|versions| versions.get(v)) + { + let max_other_seq = other_seqs.iter().map(|range| *range.end()).max(); + let max_our_seq = seqs.iter().map(|range| *range.end()).max(); + + let end_seq = cmp::max(max_other_seq, max_our_seq); + + if let Some(end) = end_seq { + let mut other_seqs_haves = RangeInclusiveSet::from_iter([0..=end]); + + for seqs in other_seqs.iter() { + other_seqs_haves.remove(seqs.clone()); + } + + let seqs = seqs + .iter() + .flat_map(|range| { + other_seqs_haves + .overlapping(range) + .map(|overlap| { + let start = cmp::max(range.start(), overlap.start()); + let end = cmp::min(range.end(), overlap.end()); + *start..=*end + }) + .collect::>>() + }) + .collect::>>(); + + if !seqs.is_empty() { + needs + .entry(*actor_id) + .or_default() + .push(SyncNeedV1::Partial { version: *v, seqs }); + } + } + } + } + } + + let missing = match self.heads.get(actor_id) { + Some(our_head) => { + if head > our_head { + Some((*our_head + 1)..=*head) + } else { + None + } + } + None => Some(1..=*head), + }; + + if let Some(missing) = missing { + needs + .entry(*actor_id) + .or_default() + .push(SyncNeedV1::Full { versions: missing }); + } + } + + needs + } +} + +#[derive(Debug, Clone, PartialEq, Readable, Writable)] +pub enum SyncNeedV1 { + Full { + versions: RangeInclusive, + }, + Partial { + version: i64, + seqs: Vec>, + }, +} + +impl SyncNeedV1 { + pub fn count(&self) -> usize { + match self { + SyncNeedV1::Full { versions } => (versions.end() - versions.start()) as usize + 1, + SyncNeedV1::Partial { .. } => 1, + } + } } impl From for SyncMessage { @@ -75,6 +268,7 @@ impl From for SyncMessage { } // generates a `SyncMessage` to tell another node what versions we're missing +#[tracing::instrument(skip_all)] pub async fn generate_sync(bookie: &Bookie, actor_id: ActorId) -> SyncStateV1 { let mut state = SyncStateV1 { actor_id, @@ -91,45 +285,28 @@ pub async fn generate_sync(bookie: &Bookie, actor_id: ActorId) -> SyncStateV1 { }; for (actor_id, booked) in actors { - let last_version = match { - booked - .read(format!("generate_sync(last):{}", actor_id.as_simple())) - .await - .last() - } { - Some(v) => v, + let bookedr = booked + .read(format!("generate_sync:{}", actor_id.as_simple())) + .await; + + let last_version = match { bookedr.last() } { None => continue, + Some(v) => v, }; - let need: Vec<_> = { - booked - .read(format!("generate_sync(need):{}", actor_id.as_simple())) - .await - .gaps(&(1..=last_version)) - .collect() - }; + let need: Vec<_> = { bookedr.all_versions().gaps(&(1..=last_version)).collect() }; if !need.is_empty() { state.need.insert(actor_id, need); } { - let read = booked - .read(format!("generate_sync(partials):{}", actor_id.as_simple())) - .await; - for (range, known) in read.iter() { - if let KnownDbVersion::Partial { seqs, last_seq, .. } = known { - if seqs.gaps(&(0..=*last_seq)).count() == 0 { - // soon to be processed, but we got it all - continue; - } - - state - .partial_need - .entry(actor_id) - .or_default() - .insert(*range.start(), seqs.gaps(&(0..=*last_seq)).collect()); - } + for (v, partial) in bookedr.partials.iter() { + state + .partial_need + .entry(actor_id) + .or_default() + .insert(*v, partial.seqs.gaps(&(0..=partial.last_seq)).collect()); } } @@ -183,3 +360,84 @@ impl SyncMessage { }) } } + +#[cfg(test)] +mod tests { + use uuid::Uuid; + + use super::*; + + #[test] + fn test_compute_available_needs() { + let actor1 = ActorId(Uuid::new_v4()); + + let mut our_state = SyncStateV1::default(); + our_state.heads.insert(actor1, 10); + + let mut other_state = SyncStateV1::default(); + other_state.heads.insert(actor1, 13); + + assert_eq!( + our_state.compute_available_needs(&other_state), + [(actor1, vec![SyncNeedV1::Full { versions: 11..=13 }])].into() + ); + + our_state.need.entry(actor1).or_default().push(2..=5); + our_state.need.entry(actor1).or_default().push(7..=7); + + assert_eq!( + our_state.compute_available_needs(&other_state), + [( + actor1, + vec![ + SyncNeedV1::Full { versions: 2..=5 }, + SyncNeedV1::Full { versions: 7..=7 }, + SyncNeedV1::Full { versions: 11..=13 } + ] + )] + .into() + ); + + our_state + .partial_need + .insert(actor1, [(9i64, vec![100..=120, 130..=132])].into()); + + assert_eq!( + our_state.compute_available_needs(&other_state), + [( + actor1, + vec![ + SyncNeedV1::Full { versions: 2..=5 }, + SyncNeedV1::Full { versions: 7..=7 }, + SyncNeedV1::Partial { + version: 9, + seqs: vec![100..=120, 130..=132] + }, + SyncNeedV1::Full { versions: 11..=13 } + ] + )] + .into() + ); + + other_state + .partial_need + .insert(actor1, [(9i64, vec![100..=110, 130..=130])].into()); + + assert_eq!( + our_state.compute_available_needs(&other_state), + [( + actor1, + vec![ + SyncNeedV1::Full { versions: 2..=5 }, + SyncNeedV1::Full { versions: 7..=7 }, + SyncNeedV1::Partial { + version: 9, + seqs: vec![111..=120, 131..=132] + }, + SyncNeedV1::Full { versions: 11..=13 } + ] + )] + .into() + ); + } +} diff --git a/crates/corrosion/Cargo.toml b/crates/corrosion/Cargo.toml index 335b034a..28ff4d0e 100644 --- a/crates/corrosion/Cargo.toml +++ b/crates/corrosion/Cargo.toml @@ -26,6 +26,9 @@ metrics-exporter-prometheus = { workspace = true } notify = { version = "6.0.1", default-features = false, features = ["macos_kqueue"] } notify-debouncer-mini = { version = "0.3.0", default-features = false } once_cell = { workspace = true } +opentelemetry = { workspace = true } +opentelemetry-otlp = { workspace = true } +opentelemetry-semantic-conventions = { workspace = true } rusqlite = { workspace = true } seahash = { workspace = true } serde = { workspace = true } @@ -37,10 +40,12 @@ tempfile = { workspace = true } tikv-jemallocator = "0.5" time = { workspace = true } tokio = { workspace = true } +tokio-metrics = { workspace = true } tokio-serde = { workspace = true } tokio-util = { workspace = true } tracing = { workspace = true } tracing-filter = { workspace = true } +tracing-opentelemetry = { workspace = true } tracing-subscriber = { workspace = true } tripwire = { path = "../tripwire" } uuid = { workspace = true } diff --git a/crates/corrosion/src/admin.rs b/crates/corrosion/src/admin.rs index be7f8e90..1cab86c5 100644 --- a/crates/corrosion/src/admin.rs +++ b/crates/corrosion/src/admin.rs @@ -57,7 +57,6 @@ impl AdminConn { } Response::Json(json) => { println!("{}", serde_json::to_string_pretty(&json).unwrap()); - break; } }, } diff --git a/crates/corrosion/src/command/agent.rs b/crates/corrosion/src/command/agent.rs index 4fb1618c..af5f3a38 100644 --- a/crates/corrosion/src/command/agent.rs +++ b/crates/corrosion/src/command/agent.rs @@ -1,11 +1,12 @@ -use std::net::SocketAddr; +use std::{net::SocketAddr, time::Duration}; use camino::Utf8PathBuf; use corro_admin::AdminConfig; -use corro_types::config::{Config, TelemetryConfig}; +use corro_types::config::{Config, PrometheusConfig}; use metrics::gauge; use metrics_exporter_prometheus::PrometheusBuilder; use spawn::wait_for_all_pending_handles; +use tokio_metrics::RuntimeMonitor; use tracing::{error, info}; use crate::VERSION; @@ -13,10 +14,12 @@ use crate::VERSION; pub async fn run(config: Config, config_path: &Utf8PathBuf) -> eyre::Result<()> { info!("Starting Corrosion Agent v{VERSION}"); - if let Some(TelemetryConfig::Prometheus { bind_addr }) = config.telemetry { + if let Some(PrometheusConfig { bind_addr }) = config.telemetry.prometheus { setup_prometheus(bind_addr).expect("could not setup prometheus"); let info = crate::version(); gauge!("corro.build.info", 1.0, "version" => info.crate_info.version.to_string(), "ts" => info.timestamp.to_string(), "rustc_version" => info.compiler.version.to_string()); + + start_tokio_runtime_reporter(); } let (tripwire, tripwire_worker) = tripwire::Tripwire::new_signals(); @@ -67,6 +70,9 @@ fn setup_prometheus(addr: SocketAddr) -> eyre::Result<()> { 0.100, // 100ms 0.200, // 200ms 1.0, // 1s + 2.0, // 2s + 3.0, // 3s + 4.0, // 4s 5.0, // 5s 10.0, // 10s :screaming: 30.0, 60.0, @@ -74,3 +80,131 @@ fn setup_prometheus(addr: SocketAddr) -> eyre::Result<()> { .install()?; Ok(()) } + +fn start_tokio_runtime_reporter() { + let handle = tokio::runtime::Handle::current(); + + { + let runtime_monitor = RuntimeMonitor::new(&handle); + tokio::spawn(async move { + for metrics in runtime_monitor.intervals() { + gauge!("corro.tokio.workers_count", metrics.workers_count as f64); + gauge!( + "corro.tokio.total_park_count", + metrics.total_park_count as f64 + ); + gauge!("corro.tokio.max_park_count", metrics.max_park_count as f64); + gauge!("corro.tokio.min_park_count", metrics.min_park_count as f64); + gauge!( + "corro.tokio.total_noop_count", + metrics.total_noop_count as f64 + ); + gauge!("corro.tokio.max_noop_count", metrics.max_noop_count as f64); + gauge!("corro.tokio.min_noop_count", metrics.min_noop_count as f64); + gauge!( + "corro.tokio.total_steal_count", + metrics.total_steal_count as f64 + ); + gauge!( + "corro.tokio.max_steal_count", + metrics.max_steal_count as f64 + ); + gauge!( + "corro.tokio.min_steal_count", + metrics.min_steal_count as f64 + ); + gauge!( + "corro.tokio.total_steal_operations", + metrics.total_steal_operations as f64 + ); + gauge!( + "corro.tokio.max_steal_operations", + metrics.max_steal_operations as f64 + ); + gauge!( + "corro.tokio.min_steal_operations", + metrics.min_steal_operations as f64 + ); + gauge!( + "corro.tokio.num_remote_schedules", + metrics.num_remote_schedules as f64 + ); + gauge!( + "corro.tokio.total_local_schedule_count", + metrics.total_local_schedule_count as f64 + ); + gauge!( + "corro.tokio.max_local_schedule_count", + metrics.max_local_schedule_count as f64 + ); + gauge!( + "corro.tokio.min_local_schedule_count", + metrics.min_local_schedule_count as f64 + ); + gauge!( + "corro.tokio.total_overflow_count", + metrics.total_overflow_count as f64 + ); + gauge!( + "corro.tokio.max_overflow_count", + metrics.max_overflow_count as f64 + ); + gauge!( + "corro.tokio.min_overflow_count", + metrics.min_overflow_count as f64 + ); + gauge!( + "corro.tokio.total_polls_count", + metrics.total_polls_count as f64 + ); + gauge!( + "corro.tokio.max_polls_count", + metrics.max_polls_count as f64 + ); + gauge!( + "corro.tokio.min_polls_count", + metrics.min_polls_count as f64 + ); + gauge!( + "corro.tokio.total_busy_seconds", + metrics.total_busy_duration.as_secs_f64() + ); + gauge!( + "corro.tokio.max_busy_seconds", + metrics.max_busy_duration.as_secs_f64() + ); + gauge!( + "corro.tokio.min_busy_seconds", + metrics.min_busy_duration.as_secs_f64() + ); + gauge!( + "corro.tokio.injection_queue_depth", + metrics.injection_queue_depth as f64 + ); + gauge!( + "corro.tokio.total_local_queue_depth", + metrics.total_local_queue_depth as f64 + ); + gauge!( + "corro.tokio.max_local_queue_depth", + metrics.max_local_queue_depth as f64 + ); + gauge!( + "corro.tokio.min_local_queue_depth", + metrics.min_local_queue_depth as f64 + ); + gauge!( + "corro.tokio.budget_forced_yield_count", + metrics.budget_forced_yield_count as f64 + ); + gauge!( + "corro.tokio.io_driver_ready_count", + metrics.io_driver_ready_count as f64 + ); + + // wait 2s + tokio::time::sleep(Duration::from_secs(2)).await; + } + }); + } +} diff --git a/crates/corrosion/src/main.rs b/crates/corrosion/src/main.rs index 580a2543..c27eb307 100644 --- a/crates/corrosion/src/main.rs +++ b/crates/corrosion/src/main.rs @@ -16,10 +16,20 @@ use corro_api_types::SqliteValue; use corro_client::CorrosionApiClient; use corro_types::{ api::{ExecResult, QueryEvent, Statement}, - config::{default_admin_path, Config, ConfigError, LogFormat}, + config::{default_admin_path, Config, ConfigError, LogFormat, OtelConfig}, }; use futures::StreamExt; use once_cell::sync::OnceCell; +use opentelemetry::{ + global, + sdk::{ + propagation::TraceContextPropagator, + trace::{self, BatchConfig}, + Resource, + }, + KeyValue, +}; +use opentelemetry_otlp::WithExportConfig; use rusqlite::{Connection, OptionalExtension}; use tokio_util::codec::{Decoder, LinesCodec}; use tracing::{debug, error, info, warn}; @@ -51,19 +61,74 @@ fn init_tracing(cli: &Cli) -> Result<(), ConfigError> { eprintln!("While parsing env filters: {diags}, using default"); } + global::set_text_map_propagator(TraceContextPropagator::new()); + // Tracing let (env_filter, _handle) = tracing_subscriber::reload::Layer::new(filter.layer()); let sub = tracing_subscriber::registry::Registry::default().with(env_filter); - match config.log.format { - LogFormat::Plaintext => { - sub.with(tracing_subscriber::fmt::Layer::new().with_ansi(config.log.colors)) + if let Some(otel) = &config.telemetry.open_telemetry { + let otlp_exporter = opentelemetry_otlp::new_exporter().tonic().with_env(); + let otlp_exporter = match otel { + OtelConfig::FromEnv => otlp_exporter, + OtelConfig::Exporter { endpoint } => otlp_exporter.with_endpoint(endpoint), + }; + + let batch_config = BatchConfig::default().with_max_queue_size(10240); + + let trace_config = trace::config().with_resource(Resource::new([ + KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + "corrosion", + ), + KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_VERSION, + VERSION, + ), + KeyValue::new( + opentelemetry_semantic_conventions::resource::HOST_NAME, + hostname::get().unwrap().to_string_lossy().into_owned(), + ), + ])); + + let tracer = opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(otlp_exporter) + .with_trace_config(trace_config) + .with_batch_config(batch_config) + .install_batch(opentelemetry::runtime::Tokio) + .expect("Failed to initialize OpenTelemetry OTLP exporter."); + + let sub = sub.with(tracing_opentelemetry::layer().with_tracer(tracer)); + match config.log.format { + LogFormat::Plaintext => { + sub.with(tracing_subscriber::fmt::Layer::new().with_ansi(config.log.colors)) + .init(); + } + LogFormat::Json => { + sub.with( + tracing_subscriber::fmt::Layer::new() + .json() + .with_span_list(false), + ) .init(); + } } - LogFormat::Json => { - sub.with(tracing_subscriber::fmt::Layer::new().json()) + } else { + match config.log.format { + LogFormat::Plaintext => { + sub.with(tracing_subscriber::fmt::Layer::new().with_ansi(config.log.colors)) + .init(); + } + LogFormat::Json => { + sub.with( + tracing_subscriber::fmt::Layer::new() + .json() + .with_span_list(false), + ) .init(); + } } } } else { @@ -208,6 +273,13 @@ async fn process_cli(cli: Cli) -> eyre::Result<()> { restored.old_len, restored.new_len ); } + Command::Cluster(ClusterCommand::MembershipStates) => { + let mut conn = AdminConn::connect(cli.admin_path()).await?; + conn.send_command(corro_admin::Command::Cluster( + corro_admin::ClusterCommand::MembershipStates, + )) + .await?; + } Command::Consul(cmd) => match cmd { ConsulCommand::Sync => match cli.config()?.consul.as_ref() { Some(consul) => { @@ -443,6 +515,10 @@ enum Command { self_actor_id: bool, }, + /// Cluster interactions + #[command(subcommand)] + Cluster(ClusterCommand), + /// Consul interactions #[command(subcommand)] Consul(ConsulCommand), @@ -490,6 +566,12 @@ enum Command { Tls(TlsCommand), } +#[derive(Subcommand)] +enum ClusterCommand { + /// Dumps the current member states + MembershipStates, +} + #[derive(Subcommand)] enum ConsulCommand { /// Synchronizes the local consul agent with Corrosion diff --git a/crates/spawn/src/lib.rs b/crates/spawn/src/lib.rs index 0b800120..fd075395 100644 --- a/crates/spawn/src/lib.rs +++ b/crates/spawn/src/lib.rs @@ -7,7 +7,7 @@ use std::time::Duration; use futures::Future; use pin_project_lite::pin_project; -use tracing::{info, trace, Instrument}; +use tracing::{info, trace}; /// Global counter for [spawn_counted] and [spawn_counted_w_handle] pub static PENDING_HANDLES: AtomicUsize = AtomicUsize::new(0); @@ -19,24 +19,30 @@ where F: Future + Send + 'static, F::Output: Send, { - tokio::spawn(CountedFut::new(fut, &PENDING_HANDLES).in_current_span()) + tokio::spawn(CountedFut::new(fut, &PENDING_HANDLES)) } /// Spawn `fut` as a [CountedFut] (increments/decrements an [AtomicUsize]) /// on the given [tokio::runtime::Handle]. #[track_caller] -pub fn spawn_counted_w_handle(fut: F, h: &tokio::runtime::Handle) -> tokio::task::JoinHandle +pub fn spawn_counted_w_handle( + fut: F, + h: &tokio::runtime::Handle, +) -> tokio::task::JoinHandle where F: Future + Send + 'static, F::Output: Send, { - h.spawn(CountedFut::new(fut, &PENDING_HANDLES).in_current_span()) + h.spawn(CountedFut::new(fut, &PENDING_HANDLES)) } /// Spawn blocking `fut` as a [CountedFut] (increments/decrements an [AtomicUsize]) /// on the given [tokio::runtime::Handle]. #[track_caller] -pub fn spawn_blocking_counted_w_handle(func: F, h: &tokio::runtime::Handle) -> tokio::task::JoinHandle +pub fn spawn_blocking_counted_w_handle( + func: F, + h: &tokio::runtime::Handle, +) -> tokio::task::JoinHandle where F: FnOnce() -> R + Send + 'static, R: Send + 'static, diff --git a/doc/mdbook-admonish.css b/doc/mdbook-admonish.css index c3e9869e..e0a33655 100644 --- a/doc/mdbook-admonish.css +++ b/doc/mdbook-admonish.css @@ -1,31 +1,18 @@ @charset "UTF-8"; :root { - --md-admonition-icon--note: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--abstract: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--info: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--tip: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--success: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--question: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--warning: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--failure: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--danger: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--bug: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--example: - url("data:image/svg+xml;charset=utf-8,"); - --md-admonition-icon--quote: - url("data:image/svg+xml;charset=utf-8,"); - --md-details-icon: - url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-note: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-abstract: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-info: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-tip: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-success: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-question: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-warning: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-failure: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-danger: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-bug: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-example: url("data:image/svg+xml;charset=utf-8,"); + --md-admonition-icon--admonish-quote: url("data:image/svg+xml;charset=utf-8,"); + --md-details-icon: url("data:image/svg+xml;charset=utf-8,"); } :is(.admonition) { @@ -132,204 +119,204 @@ details[open].admonition > summary.admonition-title::after { transform: rotate(90deg); } -:is(.admonition):is(.note) { +:is(.admonition):is(.admonish-note) { border-color: #448aff; } -:is(.note) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-note) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(68, 138, 255, 0.1); } -:is(.note) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-note) > :is(.admonition-title, summary.admonition-title)::before { background-color: #448aff; - mask-image: var(--md-admonition-icon--note); - -webkit-mask-image: var(--md-admonition-icon--note); + mask-image: var(--md-admonition-icon--admonish-note); + -webkit-mask-image: var(--md-admonition-icon--admonish-note); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.abstract, .summary, .tldr) { +:is(.admonition):is(.admonish-abstract, .admonish-summary, .admonish-tldr) { border-color: #00b0ff; } -:is(.abstract, .summary, .tldr) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-abstract, .admonish-summary, .admonish-tldr) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(0, 176, 255, 0.1); } -:is(.abstract, .summary, .tldr) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-abstract, .admonish-summary, .admonish-tldr) > :is(.admonition-title, summary.admonition-title)::before { background-color: #00b0ff; - mask-image: var(--md-admonition-icon--abstract); - -webkit-mask-image: var(--md-admonition-icon--abstract); + mask-image: var(--md-admonition-icon--admonish-abstract); + -webkit-mask-image: var(--md-admonition-icon--admonish-abstract); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.info, .todo) { +:is(.admonition):is(.admonish-info, .admonish-todo) { border-color: #00b8d4; } -:is(.info, .todo) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-info, .admonish-todo) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(0, 184, 212, 0.1); } -:is(.info, .todo) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-info, .admonish-todo) > :is(.admonition-title, summary.admonition-title)::before { background-color: #00b8d4; - mask-image: var(--md-admonition-icon--info); - -webkit-mask-image: var(--md-admonition-icon--info); + mask-image: var(--md-admonition-icon--admonish-info); + -webkit-mask-image: var(--md-admonition-icon--admonish-info); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.tip, .hint, .important) { +:is(.admonition):is(.admonish-tip, .admonish-hint, .admonish-important) { border-color: #00bfa5; } -:is(.tip, .hint, .important) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-tip, .admonish-hint, .admonish-important) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(0, 191, 165, 0.1); } -:is(.tip, .hint, .important) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-tip, .admonish-hint, .admonish-important) > :is(.admonition-title, summary.admonition-title)::before { background-color: #00bfa5; - mask-image: var(--md-admonition-icon--tip); - -webkit-mask-image: var(--md-admonition-icon--tip); + mask-image: var(--md-admonition-icon--admonish-tip); + -webkit-mask-image: var(--md-admonition-icon--admonish-tip); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.success, .check, .done) { +:is(.admonition):is(.admonish-success, .admonish-check, .admonish-done) { border-color: #00c853; } -:is(.success, .check, .done) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-success, .admonish-check, .admonish-done) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(0, 200, 83, 0.1); } -:is(.success, .check, .done) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-success, .admonish-check, .admonish-done) > :is(.admonition-title, summary.admonition-title)::before { background-color: #00c853; - mask-image: var(--md-admonition-icon--success); - -webkit-mask-image: var(--md-admonition-icon--success); + mask-image: var(--md-admonition-icon--admonish-success); + -webkit-mask-image: var(--md-admonition-icon--admonish-success); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.question, .help, .faq) { +:is(.admonition):is(.admonish-question, .admonish-help, .admonish-faq) { border-color: #64dd17; } -:is(.question, .help, .faq) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-question, .admonish-help, .admonish-faq) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(100, 221, 23, 0.1); } -:is(.question, .help, .faq) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-question, .admonish-help, .admonish-faq) > :is(.admonition-title, summary.admonition-title)::before { background-color: #64dd17; - mask-image: var(--md-admonition-icon--question); - -webkit-mask-image: var(--md-admonition-icon--question); + mask-image: var(--md-admonition-icon--admonish-question); + -webkit-mask-image: var(--md-admonition-icon--admonish-question); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.warning, .caution, .attention) { +:is(.admonition):is(.admonish-warning, .admonish-caution, .admonish-attention) { border-color: #ff9100; } -:is(.warning, .caution, .attention) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-warning, .admonish-caution, .admonish-attention) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(255, 145, 0, 0.1); } -:is(.warning, .caution, .attention) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-warning, .admonish-caution, .admonish-attention) > :is(.admonition-title, summary.admonition-title)::before { background-color: #ff9100; - mask-image: var(--md-admonition-icon--warning); - -webkit-mask-image: var(--md-admonition-icon--warning); + mask-image: var(--md-admonition-icon--admonish-warning); + -webkit-mask-image: var(--md-admonition-icon--admonish-warning); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.failure, .fail, .missing) { +:is(.admonition):is(.admonish-failure, .admonish-fail, .admonish-missing) { border-color: #ff5252; } -:is(.failure, .fail, .missing) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-failure, .admonish-fail, .admonish-missing) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(255, 82, 82, 0.1); } -:is(.failure, .fail, .missing) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-failure, .admonish-fail, .admonish-missing) > :is(.admonition-title, summary.admonition-title)::before { background-color: #ff5252; - mask-image: var(--md-admonition-icon--failure); - -webkit-mask-image: var(--md-admonition-icon--failure); + mask-image: var(--md-admonition-icon--admonish-failure); + -webkit-mask-image: var(--md-admonition-icon--admonish-failure); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.danger, .error) { +:is(.admonition):is(.admonish-danger, .admonish-error) { border-color: #ff1744; } -:is(.danger, .error) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-danger, .admonish-error) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(255, 23, 68, 0.1); } -:is(.danger, .error) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-danger, .admonish-error) > :is(.admonition-title, summary.admonition-title)::before { background-color: #ff1744; - mask-image: var(--md-admonition-icon--danger); - -webkit-mask-image: var(--md-admonition-icon--danger); + mask-image: var(--md-admonition-icon--admonish-danger); + -webkit-mask-image: var(--md-admonition-icon--admonish-danger); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.bug) { +:is(.admonition):is(.admonish-bug) { border-color: #f50057; } -:is(.bug) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-bug) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(245, 0, 87, 0.1); } -:is(.bug) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-bug) > :is(.admonition-title, summary.admonition-title)::before { background-color: #f50057; - mask-image: var(--md-admonition-icon--bug); - -webkit-mask-image: var(--md-admonition-icon--bug); + mask-image: var(--md-admonition-icon--admonish-bug); + -webkit-mask-image: var(--md-admonition-icon--admonish-bug); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.example) { +:is(.admonition):is(.admonish-example) { border-color: #7c4dff; } -:is(.example) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-example) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(124, 77, 255, 0.1); } -:is(.example) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-example) > :is(.admonition-title, summary.admonition-title)::before { background-color: #7c4dff; - mask-image: var(--md-admonition-icon--example); - -webkit-mask-image: var(--md-admonition-icon--example); + mask-image: var(--md-admonition-icon--admonish-example); + -webkit-mask-image: var(--md-admonition-icon--admonish-example); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; -webkit-mask-repeat: no-repeat; } -:is(.admonition):is(.quote, .cite) { +:is(.admonition):is(.admonish-quote, .admonish-cite) { border-color: #9e9e9e; } -:is(.quote, .cite) > :is(.admonition-title, summary.admonition-title) { +:is(.admonish-quote, .admonish-cite) > :is(.admonition-title, summary.admonition-title) { background-color: rgba(158, 158, 158, 0.1); } -:is(.quote, .cite) > :is(.admonition-title, summary.admonition-title)::before { +:is(.admonish-quote, .admonish-cite) > :is(.admonition-title, summary.admonition-title)::before { background-color: #9e9e9e; - mask-image: var(--md-admonition-icon--quote); - -webkit-mask-image: var(--md-admonition-icon--quote); + mask-image: var(--md-admonition-icon--admonish-quote); + -webkit-mask-image: var(--md-admonition-icon--admonish-quote); mask-repeat: no-repeat; -webkit-mask-repeat: no-repeat; mask-size: contain; @@ -340,7 +327,8 @@ details[open].admonition > summary.admonition-title::after { background-color: var(--sidebar-bg); } -.ayu :is(.admonition), .coal :is(.admonition) { +.ayu :is(.admonition), +.coal :is(.admonition) { background-color: var(--theme-hover); }