Skip to content

Commit

Permalink
Fix intermittent failures after docker run in common tests (#79)
Browse files Browse the repository at this point in the history
When the server containers are started in common tests we wait a while
before checking for node availability using an `ered_client`. If the
Github CI is choked some containers might take longer time to start than
our defined sleep.

Lets replace this fixed time with a common function that wait for a
`connection_up` event from each node instead.
This should give faster initiations during normal times when Github CI
is not choked.
Using a `connect_timeout` will also avoid failures when connecting to
containers during startup.

Additionally in this PR; replace the `ered_SUITE` local function `recv/2`
with the common `?MSG` macro; and a fixup of ered_connection:opt()
to include the recently added `connect_timeout`.

Signed-off-by: Björn Svensson <[email protected]>
  • Loading branch information
bjosv authored Dec 10, 2024
1 parent d7fc282 commit eca7dc9
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 28 deletions.
8 changes: 5 additions & 3 deletions src/ered_connection.erl
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@
%% If commands are queued up in the process message queue this is the max
%% amount of messages that will be received and sent in one call
{batch_size, non_neg_integer()} |
%% Timeout passed to gen_tcp:connect/4 or ssl:connect/4.
{connect_timeout, timeout()} |
%% Options passed to gen_tcp:connect/4.
{tcp_options, [gen_tcp:connect_option()]} |
%% Timeout passed to gen_tcp:connect/4.
%% Timeout passed to gen_tcp:connect/4. DEPRECATED.
{tcp_connect_timeout, timeout()} |
%% Options passed to ssl:connect/3. If this config parameter is present,
%% Options passed to ssl:connect/4. If this config parameter is present,
%% TLS is used.
{tls_options, [ssl:tls_client_option()]} |
%% Timeout passed to ssl:connect/3.
%% Timeout passed to ssl:connect/4. DEPRECATED.
{tls_connect_timeout, timeout()} |
%% Callback for push notifications
{push_cb, push_cb()} |
Expand Down
24 changes: 7 additions & 17 deletions test/ered_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ all() ->

-define(DEFAULT_SERVER_DOCKER_IMAGE, "valkey/valkey:8.0.1").

-define(CLIENT_OPTS, [{connection_opts, [{connect_timeout, 500}]}]).

init_per_suite(_Config) ->
stop_containers(), % just in case there is junk from previous runs
Image = os:getenv("SERVER_DOCKER_IMAGE", ?DEFAULT_SERVER_DOCKER_IMAGE),
Expand All @@ -54,20 +56,15 @@ init_per_suite(_Config) ->
[P, Image, EnableDebugCommand, P])
|| P <- ?PORTS]),

timer:sleep(2000),
lists:foreach(fun(Port) ->
{ok,Pid} = ered_client:start_link("127.0.0.1", Port, []),
{ok, <<"PONG">>} = ered_client:command(Pid, [<<"ping">>]),
ered_client:stop(Pid)
end, ?PORTS),
ered_test_utils:wait_for_all_nodes_available(?PORTS, ?CLIENT_OPTS),

create_cluster(),
wait_for_consistent_cluster(),
[].

init_per_testcase(_Testcase, Config) ->
%% Quick check that cluster is OK; otherwise restart everything.
case catch ered_test_utils:check_consistent_cluster(?PORTS, []) of
case catch ered_test_utils:check_consistent_cluster(?PORTS, ?CLIENT_OPTS) of
ok ->
[];
_ ->
Expand Down Expand Up @@ -99,7 +96,7 @@ wait_for_consistent_cluster() ->
wait_for_consistent_cluster(?PORTS).

wait_for_consistent_cluster(Ports) ->
ered_test_utils:wait_for_consistent_cluster(Ports, []).
ered_test_utils:wait_for_consistent_cluster(Ports, ?CLIENT_OPTS).

end_per_suite(_Config) ->
stop_containers().
Expand Down Expand Up @@ -722,14 +719,14 @@ t_queue_full(_) ->
Loop(N-1)
end(21),

recv({reply, {error, queue_overflow}}, 1000),
?MSG({reply, {error, queue_overflow}}),
[ct:pal("~s\n", [os:cmd("redis-cli -p " ++ integer_to_list(Port) ++ " CLIENT UNPAUSE")]) || Port <- Ports],
?MSG(#{msg_type := queue_full}),
?MSG(#{msg_type := cluster_not_ok, reason := master_queue_full}),

?MSG(#{msg_type := queue_ok}),
?MSG(#{msg_type := cluster_ok}),
[recv({reply, {ok, <<"PONG">>}}, 1000) || _ <- lists:seq(1,20)],
[?MSG({reply, {ok, <<"PONG">>}}) || _ <- lists:seq(1,20)],
no_more_msgs(),
ok.

Expand Down Expand Up @@ -978,13 +975,6 @@ start_cluster() ->
start_cluster(Opts) ->
ered_test_utils:start_cluster(?PORTS, Opts).

recv(Msg, Time) ->
receive
Msg -> Msg
after Time ->
error({timeout, Msg, erlang:process_info(self(), messages)})
end.

no_more_msgs() ->
{messages,Msgs} = erlang:process_info(self(), messages),
case Msgs of
Expand Down
25 changes: 24 additions & 1 deletion test/ered_test_utils.erl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

-export([start_cluster/2,
check_consistent_cluster/2,
wait_for_consistent_cluster/2]).
wait_for_consistent_cluster/2,
wait_for_all_nodes_available/2]).

%% Start a cluster client and wait for cluster_ok.
start_cluster(Ports, Opts) ->
Expand Down Expand Up @@ -72,6 +73,28 @@ wait_for_consistent_cluster(Ports, ClientOpts) ->
end
end(20).

%% Wait for all nodes to be available for communication.
wait_for_all_nodes_available(Ports, ClientOpts) ->
Pids = [fun(Port) ->
{ok, Pid} = ered_client:start_link("127.0.0.1", Port, [{info_pid, self()}] ++ ClientOpts),
Pid
end(P) || P <- Ports],
wait_for_connection_up(Pids),
no_more_msgs().

wait_for_connection_up([]) ->
ok;
wait_for_connection_up(Pids) ->
{_, {Pid, _, _}, _} = ?MSG({connection_status, _, connection_up}, 4000),
{ok, <<"PONG">>} = ered_client:command(Pid, [<<"ping">>]),

%% Stop client and allow optional connect_error events
ered_client:stop(Pid),
?MSG({connection_status, {Pid, _, _}, {connection_down, {client_stopped, _}}}),
?OPTIONAL_MSG({connection_status, {Pid, _, _}, {connection_down, _}}),
?OPTIONAL_MSG({connection_status, {Pid, _, _}, {connection_down, _}}),
wait_for_connection_up(lists:delete(Pid, Pids)).

no_more_msgs() ->
{messages,Msgs} = erlang:process_info(self(), messages),
case Msgs of
Expand Down
10 changes: 3 additions & 7 deletions test/ered_tls_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ groups() ->
{verify, verify_peer},
{server_name_indication, "Server"}]).

-define(CLIENT_OPTS, [{connection_opts, [{tls_options, ?TLS_OPTS}]}]).
-define(CLIENT_OPTS, [{connection_opts, [{tls_options, ?TLS_OPTS},
{connect_timeout, 500}]}]).

init_per_suite(_Config) ->
stop_containers(), % just in case there is junk from previous runs
Expand Down Expand Up @@ -112,12 +113,7 @@ start_containers() ->
[P, Path, Image, EnableDebugCommand, P])
|| P <- ?PORTS]),

timer:sleep(3000),
lists:foreach(fun(Port) ->
{ok,Pid} = ered_client:start_link("127.0.0.1", Port, ?CLIENT_OPTS),
{ok, <<"PONG">>} = ered_client:command(Pid, [<<"ping">>]),
ered_client:stop(Pid)
end, ?PORTS).
ered_test_utils:wait_for_all_nodes_available(?PORTS, ?CLIENT_OPTS).

stop_containers() ->
cmd_log([io_lib:format("docker stop redis-tls-~p; docker rm redis-tls-~p;", [P, P])
Expand Down

0 comments on commit eca7dc9

Please sign in to comment.