我经常遇到这样的问题:Kubernetes 中的某个 RabbitMQ 节点会突然耗尽所在主机节点上的全部内存,随后被 OOM Killer 杀死。重启后一切恢复正常,但这种情况每周可能会重复一两次,显然不是预期的行为。我找不到任何原因,有人能帮我解决这个问题吗?
我的环境:
- Kubernetes 集群 1.25
- RabbitMQ 通过 Helm chart 安装,使用的 Docker 镜像为 docker.io/bitnami/rabbitmq:3.10.7-debian-11-r4
- RabbitMQ 集群由 3 个节点组成,使用仲裁队列(quorum queue,约 180 个队列)
- 消费者是 .NET 应用程序
RabbitMQ 是使用以下配置文件(conf)部署的:配置
RabbitMQ 节点关闭后,我收到了以下日志:
2024-03-14T15:45:30.587934978Z stdout F 2024-03-14 15:45:30.587524+00:00 [info] <0.1277.0> vm_memory_high_watermark set. Memory used:3418648576 allowed:3221225472
2024-03-14T15:45:30.588245587Z stdout F 2024-03-14 15:45:30.587780+00:00 [warning] <0.1275.0> memory resource limit alarm set on node '[email protected]'.
2024-03-14T15:45:30.588266609Z stdout F 2024-03-14 15:45:30.587780+00:00 [warning] <0.1275.0> **********************************************************
2024-03-14T15:45:30.588274577Z stdout F 2024-03-14 15:45:30.587780+00:00 [warning] <0.1275.0> *** Publishers will be blocked until this alarm clears ***
2024-03-14T15:45:30.588279298Z stdout F 2024-03-14 15:45:30.587780+00:00 [warning] <0.1275.0> **********************************************************
2024-03-14T15:45:36.245444636Z stdout F 2024-03-14 15:45:08.706447+00:00 [error] <0.2064.0> ** State machine '%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5' terminating
2024-03-14T15:45:36.245493345Z stdout F 2024-03-14 15:45:08.706447+00:00 [error] <0.2064.0> ** Last event = {cast,
{append_entries_rpc,80,
{'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
'[email protected]'},
2600142,2600142,79,[]}}
2024-03-14T15:45:36.245527246Z stdout F 2024-03-14 15:45:08.706447+00:00 [error] <0.2064.0> ** When server state = [{id,
{'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
'[email protected]'}},
{opt,terminate},
{raft_state,follower},
{leader_last_seen,undefined},
{num_pending_commands,0},
{num_delayed_commands,0},
{num_pending_applied_notifications,0},
{election_timeout_set,true},
{ra_server_state,
#{aux =>
{aux_v2,
'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
{0,true},
{inactive,-576460717736502,1,1.0},
{aux_gc,0},
undefined,undefined},
cluster =>
#{{'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
'[email protected]'} =>
#{commit_index_sent => 2600142,
match_index => 2600142,
next_index => 2600144,query_index => 0,
status => normal},
{'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
'[email protected]'} =>
#{commit_index_sent => 0,match_index => 0,
next_index => 1,query_index => 0,
status => normal},
{'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
'[email protected]'} =>
#{commit_index_sent => 2600142,
match_index => 2600142,
next_index => 2600144,query_index => 0,
status => normal}},
commit_index => 2600142,
counter =>
{write_concurrency,
#Ref<0.839822614.4015652865.199022>},
current_term => 80,
effective_machine_module => rabbit_fifo,
effective_machine_version => 2,
id =>
{'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
'[email protected]'},
last_applied => 2600142,leader_id => undefined,
log =>
#{cache_size => 1,first_index => 2600143,
last_index => 2600143,
last_written_index_term => {2600142,79},
num_segments => 0,open_segments => 0,
snapshot_index => 2600142,type => ra_log},
log_id =>
"queue '092f56e3-1277-4bcc-b5de-479d59e8a0a5' in vhost '/'",
machine =>
#{checkout_message_bytes => 0,
config =>
#{consumer_strategy => competing,
dead_lettering_enabled => false,
delivery_limit => undefined,
expires => undefined,max_bytes => undefined,
max_length => undefined,
msg_ttl => undefined,
name =>
'%2F_092f56e3-1277-4bcc-b5de-479d59e8a0a5',
release_cursor_interval => {2048,2048},
resource =>
{resource,<<"/">>,queue,
<<"092f56e3-1277-4bcc-b5de-479d59e8a0a5">>}},
discard_checkout_message_bytes => 0,
discard_message_bytes => 0,
enqueue_message_bytes => 0,
in_memory_message_bytes => 0,
num_checked_out => 0,num_consumers => 10,
num_discard_checked_out => 0,
num_discarded => 0,num_enqueuers => 4,
num_in_memory_ready_messages => 0,
num_messages => 0,num_ready_messages => 0,},
machine_version => 2,
uid => <<"2F_092IRAXC8BNGV2Y">>,
voted_for => undefined}}]
2024-03-14T15:45:36.246097117Z stdout F 2024-03-14 15:45:08.706447+00:00 [error] <0.2064.0> ** Reason for termination = error:{badmatch,
{not_found,
{ra_log,
{cfg,<<"2F_092IRAXC8BNGV2Y">>,
"queue '092f56e3-1277-4bcc-b5de-479d59e8a0a5' in vhost '/'",
"/bitnami/rabbitmq/mnesia/[email protected]/quorum/[email protected]/2F_092IRAXC8BNGV2Y",
8192,ra_log_snapshot,20,ra_log_wal,
ra_log_segment_writer,
{write_concurrency,
#Ref<0.839822614.4015652865.199022>},
#{closed_mem_tbls =>
ra_log_closed_mem_tables,
directory => ra_directory,
directory_rev => ra_directory_reverse,
log_ets => ra_log_ets,
wal => ra_log_wal,
2600143,2600143,79,
{2600142,79},
{ra_snapshot,<<"2F_092IRAXC8BNGV2Y">>,
ra_log_snapshot,
"/bitnami/rabbitmq/mnesia/[email protected]/quorum/[email protected]/2F_092IRAXC8BNGV2Y/snapshots",
undefined,undefined,
{2600142,79}},
#{2600143 =>
{2600143,79,
{'$usr',
#{ts => 1710431108471},
{enqueue,<0.8668.937>,9014,
{basic_message,
{resource,<<"/">>,exchange,<<>>},
[<<"092f56e3-1277-4bcc-b5de-479d59e8a0a5">>],
{content,60,none,
<<152,0,16,97,112,112,108,105,99,
97,116,105,111,110,47,106,115,
111,110,2,0>>,
rabbit_framing_amqp_0_9_1,
[<<191,209,140,209,143,32,208,190,
208,178,209,129,209,143,208,
178,208,176,208,189,208,184,
49,48,46,49,49,51,46,50,52,46,
.................
2024-03-14T15:46:04.335079169Z stderr F Killed