Skip to content

Commit

Permalink
op-conductor robustness + bootstrap/startup script improvement
Browse files Browse the repository at this point in the history
op-conductor robustness fixes in localnet

Modified the "setup-raft.bash" script so that the op-conductor raft setup can be configured from environment variables.
  • Loading branch information
ClaytonNorthey92 committed Nov 20, 2024
1 parent 1ae8c7f commit 3d6c0e2
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 35 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/localnet-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ jobs:
- name: "run localnet"
run: docker compose -f ./e2e/docker-compose.yml up -d

- name: "kill an op-node after a minute"
run: sleep 60 && docker compose -f ./e2e/docker-compose.yml down op-node
- name: "kill an op-node after 15 seconds, then wait 3 minutes (the healthcheck interval + time for another sequencer to take over)"
run: sleep 15 && docker compose -f ./e2e/docker-compose.yml down op-node && sleep 180

- name: "get localnet stats"
working-directory: ./e2e/monitor
Expand Down
68 changes: 49 additions & 19 deletions e2e/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,10 @@ services:
depends_on:
- "geth-l1"
healthcheck:
test: ["CMD-SHELL", "ls /l2configs/rollup.json"]
timeout: 60s
test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 30303"]
timeout: 1s
retries: 300
interval: 1s
environment:
ADMIN_PRIVATE_KEY: "${ADMIN_PRIVATE_KEY}"
OP_GETH_L1_RPC: "http://geth-l1:8545"
Expand Down Expand Up @@ -240,8 +242,10 @@ services:
op-geth-l2:
condition: "service_healthy"
healthcheck:
test: ["CMD-SHELL", "ls /l2configs/rollup.json"]
timeout: 60s
test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 30303"]
timeout: 1s
retries: 300
interval: 1s
environment:
ADMIN_PRIVATE_KEY: "${ADMIN_PRIVATE_KEY}"
OP_GETH_L1_RPC: "http://geth-l1:8545"
Expand Down Expand Up @@ -273,8 +277,10 @@ services:
op-geth-l2:
condition: "service_healthy"
healthcheck:
test: ["CMD-SHELL", "ls /l2configs/rollup.json"]
timeout: 60s
test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 30303"]
timeout: 1s
retries: 300
interval: 1s
environment:
ADMIN_PRIVATE_KEY: "${ADMIN_PRIVATE_KEY}"
OP_GETH_L1_RPC: "http://geth-l1:8545"
Expand Down Expand Up @@ -310,6 +316,11 @@ services:
condition: "service_started"
op-geth-l2:
condition: "service_healthy"
healthcheck:
test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 9222"]
timeout: 1s
retries: 300
interval: 1s
environment:
OP_NODE_BSS_WS: "http://bssd:8081/v1/ws"
command:
Expand All @@ -329,7 +340,7 @@ services:
- "--l1.trustrpc"
- "--log.level=info"
- "--l1.trustrpc=true"
- "--l1.http-poll-interval=6s"
- "--l1.http-poll-interval=1s"
- "--p2p.no-discovery"
- "--p2p.priv.path=/tmp/op-node-priv-key.txt"
- "--p2p.sequencer.key=${ADMIN_PRIVATE_KEY}"
Expand Down Expand Up @@ -364,6 +375,11 @@ services:
condition: "service_started"
op-geth-l2-2:
condition: "service_healthy"
healthcheck:
test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 9222"]
timeout: 1s
retries: 300
interval: 1s
environment:
OP_NODE_BSS_WS: "http://bssd:8081/v1/ws"
command:
Expand Down Expand Up @@ -415,6 +431,11 @@ services:
condition: "service_started"
op-geth-l2-3:
condition: "service_healthy"
healthcheck:
test: ["CMD-SHELL", "nc -w 1 -vz 0.0.0.0 9222"]
timeout: 1s
retries: 300
interval: 1s
environment:
OP_NODE_BSS_WS: "http://bssd:8081/v1/ws"
command:
Expand Down Expand Up @@ -575,6 +596,8 @@ services:
- "--l2oo-address=${L2OO_ADDRESS}"
- "--private-key=${ADMIN_PRIVATE_KEY}"
- "--l1-eth-rpc=http://geth-l1:8545"
networks:
e2e:

op-proposer-2:
build:
Expand Down Expand Up @@ -634,19 +657,20 @@ services:
- "op-conductor/bin/op-conductor"
- "--consensus.addr=op-conductor"
- "--consensus.port=50050"
- "--raft.server.id=op-conductor-1"
- "--raft.server.id=op-conductor-1:50050"
- "--raft.storage.dir=/tmp/raft"
- "--raft.bootstrap"
- "--node.rpc=http://op-node:8547"
- "--execution.rpc=http://op-geth-l2:8546"
- "--healthcheck.unsafe-interval=10"
- "--healthcheck.unsafe-interval=12"
- "--healthcheck.safe-interval=200"
- "--healthcheck.min-peer-count=1"
- "--healthcheck.interval=60"
- "--healthcheck.interval=120"
- "--rollup.config=/l2configs/rollup.json"
- "--log.format=terminal"
- "--rpc.addr=0.0.0.0"
- "--rpc.port=8547"
- "--paused"
volumes:
- "l2configs:/l2configs"
- "./jwt.txt:/tmp/jwt.txt"
Expand All @@ -655,7 +679,7 @@ services:
e2e:
depends_on:
op-node:
condition: "service_started"
condition: "service_healthy"
op-geth-l2:
condition: "service_healthy"
ports:
Expand All @@ -669,18 +693,19 @@ services:
- "op-conductor/bin/op-conductor"
- "--consensus.addr=op-conductor-2"
- "--consensus.port=50051"
- "--raft.server.id=op-conductor-2"
- "--raft.server.id=op-conductor-2:50051"
- "--raft.storage.dir=/tmp/raft"
- "--node.rpc=http://op-node-2:8547"
- "--execution.rpc=http://op-geth-l2-2:8546"
- "--healthcheck.unsafe-interval=10"
- "--healthcheck.unsafe-interval=12"
- "--healthcheck.safe-interval=200"
- "--healthcheck.min-peer-count=1"
- "--healthcheck.interval=60"
- "--healthcheck.interval=120"
- "--rollup.config=/l2configs/rollup.json"
- "--log.format=terminal"
- "--rpc.addr=0.0.0.0"
- "--rpc.port=8547"
- "--paused"
volumes:
- "l2configs:/l2configs"
- "./jwt.txt:/tmp/jwt.txt"
Expand All @@ -689,7 +714,7 @@ services:
e2e:
depends_on:
op-node-2:
condition: "service_started"
condition: "service_healthy"
op-geth-l2-2:
condition: "service_healthy"
ports:
Expand All @@ -703,18 +728,19 @@ services:
- "op-conductor/bin/op-conductor"
- "--consensus.addr=op-conductor-3"
- "--consensus.port=50052"
- "--raft.server.id=op-conductor-3"
- "--raft.server.id=op-conductor-3:50052"
- "--raft.storage.dir=/tmp/raft"
- "--node.rpc=http://op-node-3:8547"
- "--execution.rpc=http://op-geth-l2-3:8546"
- "--healthcheck.unsafe-interval=10"
- "--healthcheck.unsafe-interval=12"
- "--healthcheck.safe-interval=200"
- "--healthcheck.min-peer-count=1"
- "--healthcheck.interval=60"
- "--healthcheck.interval=120"
- "--rollup.config=/l2configs/rollup.json"
- "--log.format=terminal"
- "--rpc.addr=0.0.0.0"
- "--rpc.port=8547"
- "--paused"
volumes:
- "l2configs:/l2configs"
- "./jwt.txt:/tmp/jwt.txt"
Expand All @@ -723,7 +749,7 @@ services:
e2e:
depends_on:
op-node-3:
condition: "service_started"
condition: "service_healthy"
op-geth-l2-3:
condition: "service_healthy"
ports:
Expand All @@ -735,6 +761,10 @@ services:
context: "."
entrypoint:
- "bash"
environment:
OPNODE_RPCS: 'http://op-node:8547,http://op-node-2:8547,http://op-node-3:8547'
OPCONDUCTOR_RPCS: 'http://op-conductor:8547,http://op-conductor-2:8547,http://op-conductor-3:8547'
OPCONDUCTOR_RAFT_VOTERS: 'op-conductor:50050,op-conductor-2:50051,op-conductor-3:50052'
command:
- "/tmp/setup-raft.bash"
depends_on:
Expand Down
1 change: 0 additions & 1 deletion e2e/entrypointl2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ fi
--authrpc.addr=0.0.0.0 \
--authrpc.port=8551 \
--authrpc.jwtsecret=/tmp/jwt.txt \
--verbosity=5 \
--gpo.maxprice=1 \
--tbc.network=localnet \
--tbc.initheight=1 \
Expand Down
2 changes: 2 additions & 0 deletions e2e/optimism-stack.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,5 @@ RUN forge build
WORKDIR /git/optimism

RUN make devnet-allocs

RUN apt-get install -y netcat-openbsd
48 changes: 35 additions & 13 deletions e2e/setup-raft.bash
Original file line number Diff line number Diff line change
@@ -1,20 +1,42 @@
#! /bin/bash

set -ev
set -evx

curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' http://op-conductor:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' http://op-conductor-2:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' http://op-conductor-3:8547
curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_stopSequencer\",\"params\":[],\"id\":3}" http://op-node:8547
IFS=',' read -ra conductor_rpcs <<< "$OPCONDUCTOR_RPCS"
IFS=',' read -ra conductor_rafts <<< "$OPCONDUCTOR_RAFT_VOTERS"
IFS=',' read -ra opnode_rpcs <<< "$OPNODE_RPCS"
opnode_rpc=

# find the leader: ask each conductor whether it is the leader and remember the op-node rpc at the same index
for i in "${!conductor_rpcs[@]}"; do
is_leader=$(curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_leader","params":[],"id":4}' "${conductor_rpcs[$i]}" | jq '.result')
if [ "$is_leader" = 'true' ]; then
opnode_rpc=${opnode_rpcs[$i]}
fi
done

curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":1}' http://op-node:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_addServerAsVoter","params":["op-conductor-2", "op-conductor-2:50051"],"id":4}' http://op-conductor:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_addServerAsVoter","params":["op-conductor-3", "op-conductor-3:50052"],"id":4}' http://op-conductor:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' http://op-conductor:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' http://op-conductor-2:8547
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' http://op-conductor-3:8547

unsafe_head=$(curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":2}' http://op-node:8547 | jq '.result.unsafe_l2.hash' )
if [ "$opnode_rpc" = '' ]; then
echo "could not find leader, aborting"
exit 1
fi

# pause each conductor so we can modify state
for rpc in "${conductor_rpcs[@]}"; do
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_pause","params":[],"id":4}' $rpc
done

# add each op-conductor raft address (host:port) as a voter, sending every request to the first conductor rpc. this may error when the leader is asked to add itself as a voter, but that's ok, the others should succeed
for raft in "${conductor_rafts[@]}"; do
curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"conductor_addServerAsVoter\",\"params\":[\"$raft\", \"$raft\"],\"id\":4}" ${conductor_rpcs[0]}
done

# resume the conductors
for rpc in "${conductor_rpcs[@]}"; do
curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"conductor_resume","params":[],"id":4}' $rpc
done

# restart the sequencer using the unsafe head from the leader's sync status
unsafe_head=$(curl -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":2}' $opnode_rpc | jq '.result.unsafe_l2.hash' )
echo "unsafe_head=$unsafe_head"
curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_startSequencer\",\"params\":[$unsafe_head],\"id\":3}" http://op-node:8547
curl -X POST -H "Content-Type: application/json" --data "{\"jsonrpc\":\"2.0\",\"method\":\"admin_startSequencer\",\"params\":[$unsafe_head],\"id\":3}" $opnode_rpc

0 comments on commit 3d6c0e2

Please sign in to comment.