From 7d312573bde7019f08f62bb942e4289088567789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chrz=C4=85szcz?= Date: Thu, 11 Dec 2025 12:35:14 +0100 Subject: [PATCH 1/6] Add a log message when dropping a rabbit request The level is NOTICE, because: - It is still normal MongooseIM behaviour (system behaving as designed) and one log per message could flood the logs. - It is more important than INFO, because it is not desired - especially if this condition persists. --- src/event_pusher/mod_event_pusher_rabbit.erl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/event_pusher/mod_event_pusher_rabbit.erl b/src/event_pusher/mod_event_pusher_rabbit.erl index 54afe2228e0..4ba21592c74 100644 --- a/src/event_pusher/mod_event_pusher_rabbit.erl +++ b/src/event_pusher/mod_event_pusher_rabbit.erl @@ -23,6 +23,7 @@ -include_lib("mongooseim/include/mod_event_pusher_events.hrl"). -include_lib("mongooseim/include/mongoose_config_spec.hrl"). +-include_lib("mongooseim/include/mongoose.hrl"). -behaviour(gen_mod). -behaviour(mongoose_module_metrics). @@ -140,7 +141,14 @@ call_rabbit_worker(HostType, Msg) -> -spec cast_rabbit_worker(mongooseim:host_type(), Msg :: term()) -> ok. cast_rabbit_worker(HostType, Msg) -> - mongoose_wpool:cast(rabbit, HostType, ?POOL_TAG, Msg). + try + mongoose_wpool:cast(rabbit, HostType, ?POOL_TAG, Msg) + catch + exit:no_workers -> + ?LOG_NOTICE(#{what => no_event_pusher_rabbit_worker_available, + text => <<"Dropping request because no rabbit worker is available">>, + host_type => HostType, dropped_request => Msg}) + end. -spec exchange_keys() -> [exchange_key()]. exchange_keys() -> From 447715ba45692b1df17fec3fad5655b119915d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chrz=C4=85szcz?= Date: Fri, 5 Dec 2025 18:23:27 +0100 Subject: [PATCH 2/6] Implement reconnection to RabbitMQ in the scope of one worker - Configurable delay and number of attempts - Plain timer:sleep/1 is used just like for RDBMS. 
This should be good enough for short time and prevent subsequent requests from being dropped. - It is recommended to use with max_worker_queue_length. --- src/config/mongoose_config_spec.erl | 12 ++++++- src/mongoose_rabbit_worker.erl | 36 +++++++++++++++---- test/common/config_parser_helper.erl | 8 +++-- test/config_parser_SUITE.erl | 10 ++++++ .../outgoing_pools.toml | 2 ++ 5 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/config/mongoose_config_spec.erl b/src/config/mongoose_config_spec.erl index eef639f1aff..e9e1a4b4449 100644 --- a/src/config/mongoose_config_spec.erl +++ b/src/config/mongoose_config_spec.erl @@ -546,7 +546,8 @@ outgoing_pool_connection(<<"rabbit">>) -> <<"virtual_host">> => #option{type = binary, validate = non_empty}, <<"confirms_enabled">> => #option{type = boolean}, - <<"tls">> => tls([client]) + <<"tls">> => tls([client]), + <<"reconnect">> => rabbit_reconnect() }, include = always, defaults = #{<<"host">> => "localhost", @@ -631,6 +632,15 @@ sql_tls() -> sql_tls_extra() -> #section{items = #{<<"required">> => #option{type = boolean}}}. +%% path: outgoing_pools.rabbit.*.connection.reconnect +rabbit_reconnect() -> + #section{items = #{<<"attempts">> => #option{type = integer, validate = non_negative}, + <<"delay">> => #option{type = integer, validate = non_negative}}, + defaults = #{<<"attempts">> => 0, + <<"delay">> => 2000 % milliseconds + }, + include = always}. + %% TLS options tls(Entities) when is_list(Entities) -> diff --git a/src/mongoose_rabbit_worker.erl b/src/mongoose_rabbit_worker.erl index bae4c45088d..a3e72e2b28c 100644 --- a/src/mongoose_rabbit_worker.erl +++ b/src/mongoose_rabbit_worker.erl @@ -44,7 +44,12 @@ username := binary(), password := binary(), virtual_host := binary(), - confirms_enabled := boolean()}. + confirms_enabled := boolean(), + reconnect := reconnect()}. + +-type reconnect() :: #{attempts := non_neg_integer(), + delay := non_neg_integer() % milliseconds + }. 
-type publish_result() :: boolean() | timeout | {channel_exception, any(), any()}. @@ -171,7 +176,28 @@ maybe_restart_rabbit_connection(#{connection := Conn} = State) -> end. -spec establish_rabbit_connection(state()) -> state(). -establish_rabbit_connection(State) -> +establish_rabbit_connection(State = #{opts := #{reconnect := #{attempts := Attempts}}}) -> + establish_rabbit_connection(State, Attempts). + +-spec establish_rabbit_connection(state(), non_neg_integer()) -> state(). +establish_rabbit_connection(State, RemainingAttempts) -> + case start_amqp_connection(State) of + {ok, NewState} -> + NewState; + {error, Error} when RemainingAttempts > 0 -> + ?LOG_WARNING(#{what => rabbit_connection_failed, reason => Error, worker_state => State, + remaining_attempts => RemainingAttempts}), + #{opts := #{reconnect := #{delay := Delay}}} = State, + timer:sleep(Delay), + establish_rabbit_connection(State, RemainingAttempts - 1); + {error, Error} when RemainingAttempts =:= 0 -> + ErrorInfo = #{what => rabbit_connection_failed, reason => Error, worker_state => State}, + ?LOG_ERROR(ErrorInfo), + exit(ErrorInfo) + end. + +-spec start_amqp_connection(state()) -> {ok, state()} | {error, term()}. 
+start_amqp_connection(State) -> #{opts := Opts, host_type := HostType, pool_tag := PoolTag} = State, case amqp_connection:start(mongoose_amqp:network_params(Opts)) of {ok, Connection} -> @@ -182,14 +208,12 @@ establish_rabbit_connection(State) -> maybe_enable_confirms(Channel, Opts), ?LOG_DEBUG(#{what => rabbit_connection_established, host_type => HostType, pool_tag => PoolTag, opts => Opts}), - State#{connection => Connection, channel => Channel}; + {ok, State#{connection => Connection, channel => Channel}}; {error, Error} -> mongoose_instrument:execute(wpool_rabbit_connections, #{host_type => HostType, pool_tag => PoolTag}, #{failed => 1}), - ?LOG_ERROR(#{what => rabbit_connection_failed, reason => Error, - host_type => HostType, pool_tag => PoolTag, opts => Opts}), - exit("connection to a Rabbit server failed") + {error, Error} end. -spec close_rabbit_connection(Connection :: pid(), Channel :: pid(), diff --git a/test/common/config_parser_helper.erl b/test/common/config_parser_helper.erl index 7c7172f899b..d84dccc44de 100644 --- a/test/common/config_parser_helper.erl +++ b/test/common/config_parser_helper.erl @@ -363,7 +363,8 @@ options("outgoing_pools") -> servers => ["ldap-server.example.com"]}}, #{type => rabbit, scope => host_type, tag => event_pusher, opts => #{workers => 20, max_worker_queue_len => 100}, - conn_opts => #{confirms_enabled => true}}, + conn_opts => #{confirms_enabled => true, + reconnect => #{attempts => 5, delay => 1000}}}, #{type => rdbms, opts => #{workers => 5}, conn_opts => #{query_timeout => 5000, keepalive_interval => 30, @@ -828,7 +829,8 @@ default_pool_conn_opts(rabbit) -> username => <<"guest">>, password => <<"guest">>, virtual_host => <<"/">>, - confirms_enabled => false}; + confirms_enabled => false, + reconnect => default_config([outgoing_pools, rabbit, tag, conn_opts, reconnect])}; default_pool_conn_opts(redis) -> #{host => "127.0.0.1", port => 6379, @@ -1302,6 +1304,8 @@ default_config([outgoing_pools, Type, _Tag, opts]) -> 
default_pool_wpool_opts(Type); default_config([outgoing_pools, Type, _Tag, conn_opts]) -> default_pool_conn_opts(Type); +default_config([outgoing_pools, rabbit, _Tag, conn_opts, reconnect]) -> + #{attempts => 0, delay => 2000}; default_config([outgoing_pools, _Type, _Tag, conn_opts, tls]) -> maps:merge(default_tls(), #{server_name_indication => default_sni()}); default_config([outgoing_pools, _Type, _Tag, conn_opts, tls, server_name_indication]) -> diff --git a/test/config_parser_SUITE.erl b/test/config_parser_SUITE.erl index 056b7ade3de..be8e48cda58 100644 --- a/test/config_parser_SUITE.erl +++ b/test/config_parser_SUITE.erl @@ -145,6 +145,7 @@ groups() -> pool_elastic_connection, pool_rabbit, pool_rabbit_connection, + pool_rabbit_connection_reconnect, pool_rabbit_connection_tls, pool_ldap, pool_ldap_connection, @@ -1112,6 +1113,15 @@ pool_rabbit_connection(_Config) -> ?err(T(#{<<"virtual_host">> => <<>>})), ?err(T(#{<<"confirms_enabled">> => <<"yes">>})). +pool_rabbit_connection_reconnect(_Config) -> + P = [outgoing_pools, 1, conn_opts, reconnect], + T = fun(Opts) -> pool_conn_raw(<<"rabbit">>, #{<<"reconnect">> => Opts}) end, + ?cfg(P, default_config([outgoing_pools, rabbit, default, conn_opts, reconnect]), T(#{})), + ?cfg(P ++ [attempts], 5, T(#{<<"attempts">> => 5})), + ?cfg(P ++ [delay], 0, T(#{<<"delay">> => 0})), + ?err(T(#{<<"attempts">> => -1})), + ?err(T(#{<<"delay">> => <<"infinity">>})). 
+ pool_rabbit_connection_tls(_Config) -> P = [outgoing_pools, 1, conn_opts, tls], T = fun(Opts) -> pool_conn_raw(<<"rabbit">>, #{<<"tls">> => Opts}) end, diff --git a/test/config_parser_SUITE_data/outgoing_pools.toml b/test/config_parser_SUITE_data/outgoing_pools.toml index 4818fa761a8..bcf21043e44 100644 --- a/test/config_parser_SUITE_data/outgoing_pools.toml +++ b/test/config_parser_SUITE_data/outgoing_pools.toml @@ -55,6 +55,8 @@ username = "guest" password = "guest" confirms_enabled = true + reconnect.attempts = 5 + reconnect.delay = 1000 [outgoing_pools.ldap.default] scope = "host_type" From f51ff65bf0268b254e187c41d36c4e5b7f560ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chrz=C4=85szcz?= Date: Mon, 8 Dec 2025 16:11:07 +0100 Subject: [PATCH 3/6] Add early recognition of failed rabbit connections Use a monitor as described in amqp_connection.erl Channel is not monitored separately for simplicity. In case of a socket failure, both the connection and the channel processes die. --- src/mongoose_rabbit_worker.erl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/mongoose_rabbit_worker.erl b/src/mongoose_rabbit_worker.erl index a3e72e2b28c..a8fae56a15c 100644 --- a/src/mongoose_rabbit_worker.erl +++ b/src/mongoose_rabbit_worker.erl @@ -91,6 +91,11 @@ handle_cast({amqp_publish, Method, Payload}, State) -> handle_amqp_publish(Method, Payload, State). -spec handle_info(term(), state()) -> {noreply, state()}. +handle_info({'DOWN', _Ref, process, Connection, _}, State) -> + {noreply, case State of + #{connection := Connection} -> establish_rabbit_connection(State); + #{} -> State % probably already reconnected + end}; handle_info(Req, State) -> ?UNEXPECTED_INFO(Req), {noreply, State}. 
@@ -201,6 +206,7 @@ start_amqp_connection(State) -> #{opts := Opts, host_type := HostType, pool_tag := PoolTag} = State, case amqp_connection:start(mongoose_amqp:network_params(Opts)) of {ok, Connection} -> + monitor(process, Connection), % resulting ref is ignored as there is only one monitor mongoose_instrument:execute(wpool_rabbit_connections, #{host_type => HostType, pool_tag => PoolTag}, #{active => 1, opened => 1}), From 9f2bbea04e239dfe728a65e2bc907668af196dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chrz=C4=85szcz?= Date: Thu, 11 Dec 2025 12:50:16 +0100 Subject: [PATCH 4/6] Enable confirms for each restarted channel Previously, reopening a channel for an existing connection would ignore the confirms_enabled option. --- src/mongoose_rabbit_worker.erl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/mongoose_rabbit_worker.erl b/src/mongoose_rabbit_worker.erl index a8fae56a15c..0972b791230 100644 --- a/src/mongoose_rabbit_worker.erl +++ b/src/mongoose_rabbit_worker.erl @@ -171,11 +171,10 @@ maybe_wait_for_confirms(_, _) -> true. -spec maybe_restart_rabbit_connection(state()) -> state(). -maybe_restart_rabbit_connection(#{connection := Conn} = State) -> - case is_process_alive(Conn) of +maybe_restart_rabbit_connection(#{connection := Connection, opts := Opts} = State) -> + case is_process_alive(Connection) of true -> - {ok, Channel} = amqp_connection:open_channel(Conn), - State#{channel := Channel}; + State#{channel := open_amqp_channel(Connection, Opts)}; false -> establish_rabbit_connection(State) end. 
@@ -210,8 +209,7 @@ start_amqp_connection(State) -> mongoose_instrument:execute(wpool_rabbit_connections, #{host_type => HostType, pool_tag => PoolTag}, #{active => 1, opened => 1}), - {ok, Channel} = amqp_connection:open_channel(Connection), - maybe_enable_confirms(Channel, Opts), + Channel = open_amqp_channel(Connection, Opts), ?LOG_DEBUG(#{what => rabbit_connection_established, host_type => HostType, pool_tag => PoolTag, opts => Opts}), {ok, State#{connection => Connection, channel => Channel}}; @@ -222,6 +220,12 @@ start_amqp_connection(State) -> {error, Error} end. +-spec open_amqp_channel(pid(), opts()) -> pid(). +open_amqp_channel(Connection, Opts) -> + {ok, Channel} = amqp_connection:open_channel(Connection), + maybe_enable_confirms(Channel, Opts), + Channel. + -spec close_rabbit_connection(Connection :: pid(), Channel :: pid(), HostType :: mongooseim:host_type_or_global(), PoolTag :: atom()) -> ok | no_return(). From f3af13c704c04364d9e3ed949722a86168072b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chrz=C4=85szcz?= Date: Fri, 12 Dec 2025 17:33:42 +0100 Subject: [PATCH 5/6] Document rabbit reconnect options --- doc/configuration/outgoing-connections.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/configuration/outgoing-connections.md b/doc/configuration/outgoing-connections.md index 6b7877f3f62..aa7313d5056 100644 --- a/doc/configuration/outgoing-connections.md +++ b/doc/configuration/outgoing-connections.md @@ -349,6 +349,27 @@ Sets the RabbitMQ Virtual Host. The host needs to exist, as it is **not** create Enables/disables one-to-one publishers confirms. +### `outgoing_pools.rabbit.*.connection.reconnect.attempts` +* **Syntax:** non-negative integer +* **Default:** 0 +* **Example:** `reconnect.attempts = 5` + +By default, a failed connection attempt results in an immediate restart of the affected worker. +When this happens, its incoming request queue is lost, and any requests present in the queue are dropped. 
+To avoid this, you can use this option to specify a number of reconnection attempts before the worker is restarted. + +!!! Warning + Using this option might result in a lot of requests being accumulated in the worker queues - especially if `reconnect.delay` multiplied by `reconnect.attempts` is a long time period. + Thus, we recommend using the [`max_worker_queue_len`](#outgoing_poolsmax_worker_queue_len) option as a safety valve in such cases. + +### `outgoing_pools.rabbit.*.connection.reconnect.delay` +* **Syntax:** non-negative integer (milliseconds) +* **Default:** 2000 +* **Example:** `reconnect.delay = 5000` + +Delay (in milliseconds) between consecutive reconnection attempts. +This option is effective only if the value of `reconnect.attempts` is positive. + --- To enable TLS, you need to include the [TLS section](#tls-options) in the connection options. From 9f014fff2d8ac9b95e8cfffdab3e6d1e31c81837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chrz=C4=85szcz?= Date: Thu, 11 Dec 2025 16:23:21 +0100 Subject: [PATCH 6/6] Add tests for handling rabbit connection failures --- .../tests/mod_event_pusher_rabbit_SUITE.erl | 121 +++++++++++++++++- .../tests/mod_event_pusher_rabbit_utils.erl | 40 ++++++ 2 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 big_tests/tests/mod_event_pusher_rabbit_utils.erl diff --git a/big_tests/tests/mod_event_pusher_rabbit_SUITE.erl b/big_tests/tests/mod_event_pusher_rabbit_SUITE.erl index 215a887ab75..513ba28b518 100644 --- a/big_tests/tests/mod_event_pusher_rabbit_SUITE.erl +++ b/big_tests/tests/mod_event_pusher_rabbit_SUITE.erl @@ -34,6 +34,8 @@ -define(RABBIT_HTTP_ENDPOINT, "http://127.0.0.1:15672"). +-define(UTILS_MODULE, mod_event_pusher_rabbit_utils). + -type rabbit_binding() :: {Queue :: binary(), Exchange :: binary(), RoutingKey :: binary()}. 
@@ -55,7 +57,8 @@ all() -> {group, chat_message_publish}, {group, group_chat_message_publish}, {group, instrumentation}, - {group, filter_and_metadata} + {group, filter_and_metadata}, + {group, single_worker} ]. groups() -> @@ -68,7 +71,8 @@ groups() -> {chat_message_publish, [], chat_message_publish_tests()}, {group_chat_message_publish, [], group_chat_message_publish_tests()}, {instrumentation, [], instrumentation_tests()}, - {filter_and_metadata, [], filter_and_metadata_tests()}]. + {filter_and_metadata, [], filter_and_metadata_tests()}, + {single_worker, [], single_worker_tests()}]. pool_startup_tests() -> [rabbit_pool_starts_with_default_config]. @@ -108,6 +112,12 @@ filter_and_metadata_tests() -> [messages_published_events_are_not_executed, presence_messages_are_properly_formatted_with_metadata]. +single_worker_tests() -> + [connection_is_restarted_on_error, + connection_is_restarted_with_retries, + connection_is_restarted_with_retries_and_queue_limit, + worker_is_restarted_after_failed_retries]. + suite() -> escalus:suite(). @@ -126,12 +136,15 @@ init_per_suite(Config) -> {ok, _} = application:ensure_all_started(amqp_client), muc_helper:load_muc(), mongoose_helper:inject_module(mod_event_pusher_filter), + mongoose_helper:inject_module(?UTILS_MODULE), + rpc(mim(), ?UTILS_MODULE, start, []), escalus:init_per_suite(Config); false -> {skip, "RabbitMQ server is not available on default port."} end. end_per_suite(Config) -> + rpc(mim(), ?UTILS_MODULE, stop, []), escalus_fresh:clean(), muc_helper:unload_muc(), escalus:end_per_suite(Config), @@ -579,6 +592,86 @@ messages_published_events_are_executed(Config) -> end, #{expected_count => 2}) % for sender and receiver end). 
+connection_is_restarted_on_error(Config) -> + escalus:story( + Config, [{bob, 1}], + fun(Bob) -> + %% GIVEN intermittent rabbit connection failure + BobJID = client_lower_short_jid(Bob), + listen_to_presence_events_from_rabbit([BobJID], Config), + {ok, Worker} = get_rabbit_worker(), + simulate_rabbit_connection_error(), + + %% WHEN user sends presence + send_presence_stanzas([Bob], 1), + + %% THEN event is delivered because the worker kept its queue + ?assertReceivedMatch({#'basic.deliver'{routing_key = BobJID}, + #amqp_msg{}}, timer:seconds(5)), + ?assertEqual({ok, Worker}, get_rabbit_worker()) + end). + +connection_is_restarted_with_retries(Config) -> + escalus:story( + Config, [{bob, 1}], + fun(Bob) -> + %% GIVEN an intermittent rabbit connection failure lasting for 2 connect attempts + BobJID = client_lower_short_jid(Bob), + listen_to_presence_events_from_rabbit([BobJID], Config), + {ok, Worker} = get_rabbit_worker(), + simulate_rabbit_connection_error(2), + + %% WHEN user sends presence + send_presence_stanzas([Bob], 1), + + %% THEN event is delivered because the worker kept its queue + ?assertReceivedMatch({#'basic.deliver'{routing_key = BobJID}, + #amqp_msg{}}, timer:seconds(5)), + ?assertEqual({ok, Worker}, get_rabbit_worker()) + end). 
+ +connection_is_restarted_with_retries_and_queue_limit(Config) -> + escalus:story( + Config, [{bob, 1}], + fun(Bob) -> + %% GIVEN an intermittent rabbit connection failure lasting for 2 connect attempts + BobJID = client_lower_short_jid(Bob), + listen_to_presence_events_from_rabbit([BobJID], Config), + {ok, Worker} = get_rabbit_worker(), + simulate_rabbit_connection_error(2), + + %% WHEN user sends 2 presences + send_presence_stanzas([Bob], 2), + + %% THEN: - first event is delivered because the worker kept its queue + %% - second event is dropped because max_worker_queue_len is 1 + DecodedMessage = get_decoded_message_from_rabbit(BobJID), + ?assertMatch(#{<<"present">> := false}, DecodedMessage), + assert_no_message_from_rabbit([BobJID]), + ?assertEqual({ok, Worker}, get_rabbit_worker()) + end). + +worker_is_restarted_after_failed_retries(Config) -> + escalus:story( + Config, [{bob, 1}], + fun(Bob) -> + %% GIVEN an intermittent rabbit connection failure lasting for 3 connect attempts + BobJID = client_lower_short_jid(Bob), + listen_to_presence_events_from_rabbit([BobJID], Config), + {ok, Worker} = get_rabbit_worker(), + simulate_rabbit_connection_error(3), + + %% WHEN user sends presence + send_presence_stanzas([Bob], 1), + + %% THEN event is dropped because worker is restarted (reconnect.attempts was 2) + assert_no_message_from_rabbit([BobJID]), + wait_for_new_rabbit_worker(Worker), + send_presence_stanzas([Bob], 1), + ?assertReceivedMatch({#'basic.deliver'{routing_key = BobJID}, #amqp_msg{}}, + timer:seconds(5)) + end). + %%-------------------------------------------------------------------- %% Test helpers %%-------------------------------------------------------------------- @@ -797,6 +890,21 @@ assert_no_message_from_rabbit(RoutingKeys) -> 500 -> ok % To save time, this timeout is shorter than in the positive test end. +simulate_rabbit_connection_error() -> + rpc(mim(), ?UTILS_MODULE, ?FUNCTION_NAME, [domain(), 5671, 0]). 
+ +simulate_rabbit_connection_error(Count) -> + rpc(mim(), ?UTILS_MODULE, ?FUNCTION_NAME, [domain(), 5671, Count]). + +wait_for_new_rabbit_worker(OldWorker) -> + {ok, {ok, NewWorker}} = + wait_helper:wait_until(fun get_rabbit_worker/0, true, + #{validator => fun({ok, Worker}) -> Worker =/= OldWorker end}), + NewWorker. + +get_rabbit_worker() -> + rpc(mim(), mongoose_wpool, get_worker, [rabbit, domain(), event_pusher]). + %%-------------------------------------------------------------------- %% Utils %%-------------------------------------------------------------------- @@ -809,13 +917,20 @@ start_rabbit_tls_wpool(Host, GroupName) -> BasicConnOpts = #{tls => tls_config(), port => 5671, virtual_host => ?VHOST}, ConnOpts = maps:merge(BasicConnOpts, extra_conn_opts(GroupName)), ensure_vhost(?VHOST), - start_rabbit_wpool(Host, BasicOpts#{conn_opts => ConnOpts}). + start_rabbit_wpool(Host, maps:merge(BasicOpts#{conn_opts => ConnOpts}, extra_opts(GroupName))). extra_conn_opts(presence_status_publish_with_confirms) -> #{confirms_enabled => true}; +extra_conn_opts(single_worker) -> + #{reconnect => #{attempts => 2, delay => 1000}}; % Note: in case of flaky tests, increase delay extra_conn_opts(_GroupName) -> #{}. +extra_opts(single_worker) -> + #{opts => #{workers => 1, max_worker_queue_len => 1}}; +extra_opts(_) -> + #{}. + tls_config() -> #{certfile => "priv/ssl/fake_cert.pem", keyfile => "priv/ssl/fake_key.pem", diff --git a/big_tests/tests/mod_event_pusher_rabbit_utils.erl b/big_tests/tests/mod_event_pusher_rabbit_utils.erl new file mode 100644 index 00000000000..d2719c1b475 --- /dev/null +++ b/big_tests/tests/mod_event_pusher_rabbit_utils.erl @@ -0,0 +1,40 @@ +-module(mod_event_pusher_rabbit_utils). + +-moduledoc """ +Utilities for mod_event_pusher_rabbit_SUITE. +This module is injected into the mongooseim node. +""". + +-export([start/0, stop/0, simulate_rabbit_connection_error/3]). + +start() -> + meck:new(gen_tcp, [unstick, no_link, passthrough]). 
+ +stop() -> + meck:unload(gen_tcp). + +simulate_rabbit_connection_error(HostType, Port, ReconnectFailures) -> + {ok, Worker} = mongoose_wpool:get_worker(rabbit, HostType, event_pusher), + State = sys:get_state(Worker), + c:c(wpool_process, [{d, 'TEST'}]), % export get_state/1 + #{connection := Connection} = wpool_process:get_state(State), + simulate_tcp_connect_errors(Port, ReconnectFailures), + MonitorRef = monitor(process, Connection), + Connection ! {socket_error, simulated}, + receive {'DOWN', MonitorRef, process, Connection, _} -> ok end. + +simulate_tcp_connect_errors(_Port, 0) -> + ok; +simulate_tcp_connect_errors(Port, Count) -> + persistent_term:put({tcp_connect_errors, Port}, Count), + meck:expect(gen_tcp, connect, fun tcp_connect/4). + +tcp_connect(Address, Port, Opts, Timeout) -> + case persistent_term:get({tcp_connect_errors, Port}, 0) of + 0 -> + persistent_term:erase({tcp_connect_errors, Port}), + meck:passthrough([Address, Port, Opts, Timeout]); + N when N > 0 -> + persistent_term:put({tcp_connect_errors, Port}, N - 1), + {error, simulated_reconnect_error} + end.