diff --git a/include/rabbit_cli.hrl b/include/rabbit_cli.hrl index 7f5db6053b07..a0d1ecfdd519 100644 --- a/include/rabbit_cli.hrl +++ b/include/rabbit_cli.hrl @@ -34,7 +34,7 @@ -define(NODE_DEF(Node), {?NODE_OPT, {option, Node}}). -define(QUIET_DEF, {?QUIET_OPT, flag}). -define(VHOST_DEF, {?VHOST_OPT, {option, "/"}}). --define(TIMEOUT_DEF, {?TIMEOUT_OPT, {option, "infinity"}}). +-define(TIMEOUT_DEF, {?TIMEOUT_OPT, {option, use_default}}). -define(VERBOSE_DEF, {?VERBOSE_OPT, flag}). -define(MINIMAL_DEF, {?MINIMAL_OPT, flag}). diff --git a/scripts/rabbitmq-env b/scripts/rabbitmq-env index 62cff0b248b5..6a7b2f91f037 100755 --- a/scripts/rabbitmq-env +++ b/scripts/rabbitmq-env @@ -62,8 +62,11 @@ RABBITMQ_HOME="$(rmq_realpath "${RABBITMQ_SCRIPTS_DIR}/..")" ## Set defaults . ${RABBITMQ_SCRIPTS_DIR}/rabbitmq-defaults +DEFAULT_SCHEDULER_BIND_TYPE="db" +[ "x" = "x$RABBITMQ_SCHEDULER_BIND_TYPE" ] && RABBITMQ_SCHEDULER_BIND_TYPE=${DEFAULT_SCHEDULER_BIND_TYPE} + ## Common defaults -SERVER_ERL_ARGS="+P 1048576" +SERVER_ERL_ARGS="+P 1048576 +stbt $RABBITMQ_SCHEDULER_BIND_TYPE " # We save the current value of $RABBITMQ_PID_FILE in case it was set by # an init script. If $CONF_ENV_FILE overrides it again, we must ignore diff --git a/scripts/rabbitmq-env.bat b/scripts/rabbitmq-env.bat index 4c5691bedb1b..a4a5c9dc19bb 100644 --- a/scripts/rabbitmq-env.bat +++ b/scripts/rabbitmq-env.bat @@ -30,10 +30,14 @@ REM ## Set defaults REM . ${SCRIPT_DIR}/rabbitmq-defaults call "%SCRIPT_DIR%\rabbitmq-defaults.bat" -REM These common defaults aren't referenced in the batch scripts -REM ## Common defaults -REM SERVER_ERL_ARGS="+P 1048576" -REM +set DEFAULT_SCHEDULER_BIND_TYPE=db + +REM [ "x" = "x$RABBITMQ_SCHEDULER_BIND_TYPE" ] && RABBITMQ_SCHEDULER_BIND_TYPE=${DEFAULT_SCHEDULER_BIND_TYPE} +REM set the default scheduling bind type +if "!RABBITMQ_SCHEDULER_BIND_TYPE!"=="" ( + set RABBITMQ_SCHEDULER_BIND_TYPE=!DEFAULT_SCHEDULER_BIND_TYPE! +) + REM # warn about old rabbitmq.conf file, if no new one REM if [ -f /etc/rabbitmq/rabbitmq.conf ] && \ REM [ ! -f ${CONF_ENV_FILE} ] ; then @@ -41,9 +45,8 @@ REM echo -n "WARNING: ignoring /etc/rabbitmq/rabbitmq.conf -- " REM echo "location has moved to ${CONF_ENV_FILE}" REM fi -REM ERL_ARGS aren't referenced in the batch scripts REM Common defaults -REM set SERVER_ERL_ARGS=+P 1048576 +set SERVER_ERL_ARGS=+P 1048576 +stbt !RABBITMQ_SCHEDULER_BIND_TYPE! REM ## Get configuration variables from the configure environment file REM [ -f ${CONF_ENV_FILE} ] && . ${CONF_ENV_FILE} || true @@ -151,7 +154,9 @@ if "!RABBITMQ_DIST_PORT!"=="" ( ) REM [ "x" = "x$RABBITMQ_SERVER_ERL_ARGS" ] && RABBITMQ_SERVER_ERL_ARGS=${SERVER_ERL_ARGS} -REM No Windows equivalent +if "!RABBITMQ_SERVER_ERL_ARGS!"=="" ( + set RABBITMQ_SERVER_ERL_ARGS=!SERVER_ERL_ARGS! +) REM [ "x" = "x$RABBITMQ_CONFIG_FILE" ] && RABBITMQ_CONFIG_FILE=${CONFIG_FILE} diff --git a/scripts/rabbitmq-server.bat b/scripts/rabbitmq-server.bat index 20a0bd882312..a15f24e586f5 100644 --- a/scripts/rabbitmq-server.bat +++ b/scripts/rabbitmq-server.bat @@ -156,9 +156,8 @@ if "!ENV_OK!"=="false" ( !RABBITMQ_NAME_TYPE! !RABBITMQ_NODENAME! ^ +W w ^ +A "!RABBITMQ_IO_THREAD_POOL_SIZE!" ^ -+P 1048576 ^ -!RABBITMQ_LISTEN_ARG! ^ !RABBITMQ_SERVER_ERL_ARGS! ^ +!RABBITMQ_LISTEN_ARG! ^ -kernel inet_default_connect_options "[{nodelay, true}]" ^ !RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS! ^ -sasl errlog_type error ^ diff --git a/scripts/rabbitmq-service.bat b/scripts/rabbitmq-service.bat index 2fb34ddb2881..c9e404db463a 100644 --- a/scripts/rabbitmq-service.bat +++ b/scripts/rabbitmq-service.bat @@ -232,9 +232,8 @@ set ERLANG_SERVICE_ARGUMENTS= ^ !RABBITMQ_CONFIG_ARG! ^ +W w ^ +A "!RABBITMQ_IO_THREAD_POOL_SIZE!" ^ -+P 1048576 ^ -!RABBITMQ_LISTEN_ARG! ^ !RABBITMQ_SERVER_ERL_ARGS! ^ +!RABBITMQ_LISTEN_ARG! ^ -kernel inet_default_connect_options "[{nodelay,true}]" ^ !RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS! ^ -sasl errlog_type error ^ diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl index b9b21352ae57..7f410ac7523b 100644 --- a/src/rabbit_control_main.erl +++ b/src/rabbit_control_main.erl @@ -114,13 +114,15 @@ [stop, stop_app, start_app, wait, reset, force_reset, rotate_logs, join_cluster, change_cluster_node_type, update_cluster_nodes, forget_cluster_node, rename_cluster_node, cluster_status, status, - environment, eval, force_boot, help, node_health_check, hipe_compile]). + environment, eval, force_boot, help, hipe_compile]). +%% [Command | {Command, DefaultTimeoutInMilliSeconds}] -define(COMMANDS_WITH_TIMEOUT, [list_user_permissions, list_policies, list_queues, list_exchanges, list_bindings, list_connections, list_channels, list_consumers, list_vhosts, list_parameters, - purge_queue]). + purge_queue, + {node_health_check, 70000}]). %%---------------------------------------------------------------------------- @@ -152,7 +154,7 @@ start() -> end end, try - T = case get_timeout(Opts) of + T = case get_timeout(Command, Opts) of {ok, Timeout} -> Timeout; {error, _} -> @@ -187,8 +189,23 @@ print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg) -> end, io:nl(). -get_timeout(Opts) -> - parse_timeout(proplists:get_value(?TIMEOUT_OPT, Opts, ?RPC_TIMEOUT)). +get_timeout(Command, Opts) -> + Default = case proplists:lookup(Command, ?COMMANDS_WITH_TIMEOUT) of + none -> + infinity; + {Command, true} -> + ?RPC_TIMEOUT; + {Command, D} -> + D + end, + Result = case proplists:get_value(?TIMEOUT_OPT, Opts, Default) of + use_default -> + parse_timeout(Default); + Value -> + parse_timeout(Value) + end, + Result. + parse_number(N) when is_list(N) -> try list_to_integer(N) of @@ -234,11 +251,11 @@ do_action(Command, Node, Args, Opts, Inform, Timeout) -> false -> case ensure_app_running(Node) of ok -> - case lists:member(Command, ?COMMANDS_WITH_TIMEOUT) of - true -> + case proplists:lookup(Command, ?COMMANDS_WITH_TIMEOUT) of + {Command, _} -> announce_timeout(Timeout, Inform), action(Command, Node, Args, Opts, Inform, Timeout); - false -> + none -> action(Command, Node, Args, Opts, Inform) end; E -> E @@ -559,17 +576,6 @@ action(eval, Node, [Expr], _Opts, _Inform) -> action(help, _Node, _Args, _Opts, _Inform) -> io:format("~s", [rabbit_ctl_usage:usage()]); -action(node_health_check, Node, _Args, _Opts, Inform) -> - Inform("Checking health of node ~p", [Node]), - try - rabbit_health_check:node(Node), - io:format("Health check passed~n") - catch - {node_is_ko, ErrorMsg, ErrorCode} -> - io:format("Heath check failed:~n~s~n", [ErrorMsg]), - halt(ErrorCode) - end; - action(Command, Node, Args, Opts, Inform) -> %% For backward compatibility, run commands accepting a timeout with %% the default timeout. @@ -685,7 +691,17 @@ action(list_consumers, Node, _Args, Opts, Inform, Timeout) -> Nodes = nodes_in_cluster(Node, Timeout), call_emitter(Node, {rabbit_amqqueue, emit_consumers_all, [Nodes, VHostArg]}, rabbit_amqqueue:consumer_info_keys(), - [{timeout, Timeout}, {chunks, length(Nodes)}]). + [{timeout, Timeout}, {chunks, length(Nodes)}]); + +action(node_health_check, Node, _Args, _Opts, Inform, Timeout) -> + Inform("Checking health of node ~p", [Node]), + case rabbit_health_check:node(Node, Timeout) of + ok -> + io:format("Health check passed~n"), + ok; + Other -> + Other + end. format_parse_error({_Line, Mod, Err}) -> lists:flatten(Mod:format_error(Err)). diff --git a/test/health_check_SUITE.erl b/test/health_check_SUITE.erl new file mode 100644 index 000000000000..4d8f56e9d305 --- /dev/null +++ b/test/health_check_SUITE.erl @@ -0,0 +1,167 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License +%% at http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and +%% limitations under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is GoPivotal, Inc. +%% Copyright (c) 2016 Pivotal Software, Inc. All rights reserved. +%% +-module(health_check_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("amqp_client/include/amqp_client.hrl"). + +-export([all/0 + ,groups/0 + ,init_per_suite/1 + ,end_per_suite/1 + ,init_per_testcase/2 + ,end_per_testcase/2 + ]). + +-export([ignores_remote_dead_channel/1 + ,detects_local_dead_channel/1 + ,ignores_remote_dead_queue/1 + ,detects_local_dead_queue/1 + ,ignores_remote_alarms/1 + ,detects_local_alarm/1 + ,honors_timeout_argument/1 + ]). + +all() -> + [{group, all_cases}]. + +groups() -> + [{all_cases, [], + [ignores_remote_dead_queue + ,detects_local_dead_queue + ,ignores_remote_dead_channel + ,detects_local_dead_channel + ,ignores_remote_alarms + ,detects_local_alarm + ,honors_timeout_argument + ]}]. + +init_per_suite(Config) -> + rabbit_ct_helpers:log_environment(), + rabbit_ct_helpers:run_setup_steps(Config). + +end_per_suite(Config) -> + rabbit_ct_helpers:run_teardown_steps(Config). + +init_per_testcase(Testcase, Config0) -> + rabbit_ct_helpers:testcase_started(Config0, Testcase), + Config1 = rabbit_ct_helpers:set_config( + Config0, [{rmq_nodes_count, 2}, + {rmq_nodes_clustered, true}]), + rabbit_ct_helpers:run_steps(Config1, + rabbit_ct_broker_helpers:setup_steps() ++ + rabbit_ct_client_helpers:setup_steps()). + +end_per_testcase(Testcase, Config0) -> + Config1 = case rabbit_ct_helpers:get_config(Config0, save_config) of + undefined -> Config0; + C -> C + end, + Config2 = rabbit_ct_helpers:run_steps(Config1, + rabbit_ct_client_helpers:teardown_steps() ++ + rabbit_ct_broker_helpers:teardown_steps()), + rabbit_ct_helpers:testcase_finished(Config2, Testcase). + +%%---------------------------------------------------------------------------- +%% Test cases +%%---------------------------------------------------------------------------- +ignores_remote_dead_channel(Config) -> + [A, B] = open_channel_and_declare_queue_everywhere(Config), + CPid = suspend_single_channel(Config, B), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + resume_sys_process(Config, B, CPid), + ok. + +detects_local_dead_channel(Config) -> + [A|_] = open_channel_and_declare_queue_everywhere(Config), + CPid = suspend_single_channel(Config, A), + {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "operation node_health_check.*timed out"), + resume_sys_process(Config, A, CPid), + ok. + +ignores_remote_dead_queue(Config) -> + [A, B] = open_channel_and_declare_queue_everywhere(Config), + QPid = suspend_single_queue(Config, B), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + resume_sys_process(Config, B, QPid), + ok. + +detects_local_dead_queue(Config) -> + [A|_] = open_channel_and_declare_queue_everywhere(Config), + QPid = suspend_single_queue(Config, A), + {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "operation node_health_check.*timed out"), + resume_sys_process(Config, A, QPid), + ok. + +ignores_remote_alarms(Config) -> + [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + rabbit_ct_broker_helpers:rabbitmqctl(Config, B, + ["set_vm_memory_high_watermark", "0.000000001"]), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + ok. + +detects_local_alarm(Config) -> + [A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + rabbit_ct_broker_helpers:rabbitmqctl(Config, A, + ["set_vm_memory_high_watermark", "0.000000001"]), + {error, 70, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "resource alarm.*in effect"), + ok. + +honors_timeout_argument(Config) -> + [A|_] = open_channel_and_declare_queue_everywhere(Config), + QPid = suspend_single_queue(Config, A), + + case timer:tc(rabbit_ct_broker_helpers, rabbitmqctl, [Config, A, ["-t", "5", "node_health_check"]]) of + {TimeSpent, {error, 75, _}} -> + if TimeSpent < 5000000 -> exit({too_fast, TimeSpent}); + TimeSpent > 7000000 -> exit({too_slow, TimeSpent}); %% +2 seconds for rabbitmqctl overhead + true -> ok + end; + {_, Unexpected} -> + exit({unexpected, Unexpected}) + end, + resume_sys_process(Config, A, QPid), + ok. + +%%---------------------------------------------------------------------------- +%% Helpers +%%---------------------------------------------------------------------------- +open_channel_and_declare_queue_everywhere(Config) -> + Nodes = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + lists:foreach(fun(Node) -> + Ch = rabbit_ct_client_helpers:open_channel(Config, Node), + #'queue.declare_ok'{} = amqp_channel:call(Ch, #'queue.declare'{}) + end, + Nodes), + Nodes. + +suspend_single_queue(Config, Node) -> + [QPid|_] = [rabbit_amqqueue:pid_of(Q) || Q <- rabbit_ct_broker_helpers:rpc(Config, Node, rabbit_amqqueue, list, []), + Node == node(rabbit_amqqueue:pid_of(Q))], + rabbit_ct_broker_helpers:rpc(Config, Node, sys, suspend, [QPid]), + QPid. + +suspend_single_channel(Config, Node) -> + [CPid|_] = [Pid || Pid <- rabbit_ct_broker_helpers:rpc(Config, Node, rabbit_channel, list_local, []), + Node == node(Pid)], + rabbit_ct_broker_helpers:rpc(Config, Node, sys, suspend, [CPid]), + CPid. + +resume_sys_process(Config, Node, Pid) -> + rabbit_ct_broker_helpers:rpc(Config, Node, sys, resume, [Pid]).