我正在尝试以原生复制模式(native replication mode)部署启用了 watchdog 的 pgpool,以管理 2 个 Active-Standby 节点。
当我在两台服务器上启动 pgpool2 时,它们各自声称自己是唯一存活的节点并成为主节点。
我使用的生命检查方法(wd_lifecheck_method)是 heartbeat,并将每台主机的心跳目的地设置为另一台主机。我已确认可以使用 nc 在主机之间发送 UDP 消息。我是否遗漏或误解了什么?
另外,我不明白看门狗端口实际上做什么,如果心跳端口发送和接收健康检查并且 pcp 有自己的管理端口,那么看门狗 TCP 套接字会做什么?
以下是第一台服务器的 pgpool.conf;第二台服务器的配置与之相同,只是主机名中的 01/02 互换。为简洁起见,配置已略作删减,去掉的主要是注释和无关内容(日志、缓存)。
#------------------------------------------------------------------------------
# CONNECTIONS
#------------------------------------------------------------------------------
# - pgpool Connection Settings -
# Client listener: every address ('*') on port 6432, the same port number
# that other_pgpool_port0 gives for the peer node.
listen_addresses = '*'
port = 6432
socket_dir = '/var/run/postgresql'
listen_backlog_multiplier = 2
serialize_accept = off
# PCP (management) listener: every address on port 9898, using the same
# socket directory as socket_dir above.
pcp_listen_addresses = '*'
pcp_port = 9898
pcp_socket_dir = '/var/run/postgresql'
# - Backend Connection Settings -
# Two PostgreSQL backends, numbered 0 and 1, configured symmetrically: same
# port, weight, data directory and flag; only the hostname differs.
# Backend 0
backend_hostname0 = 'pgsql01.example.com'
backend_port0 = 5432
backend_weight0 = 1
backend_data_directory0 = '/var/lib/pgsql/data'
backend_flag0 = 'ALLOW_TO_FAILOVER'
# Backend 1
backend_hostname1 = 'pgsql02.example.com'
backend_port1 = 5432
backend_weight1 = 1
backend_data_directory1 = '/var/lib/pgsql/data'
backend_flag1 = 'ALLOW_TO_FAILOVER'
# - Authentication -
# pool_hba is enabled. pool_passwd is a bare file name rather than an
# absolute path (presumably resolved relative to the directory holding
# pgpool.conf -- confirm against the pgpool-II documentation).
enable_pool_hba = on
pool_passwd = 'pool_passwd'
# NOTE(review): unit is presumably seconds -- confirm.
authentication_timeout = 60
# - SSL Connections -
# SSL is switched off in this setup.
ssl = off
#------------------------------------------------------------------------------
# POOLS
#------------------------------------------------------------------------------
# - Concurrent session and pool size -
# 32 x 4: presumably up to num_init_children * max_pool = 128 cached
# connections per backend -- confirm against the backend's max_connections.
num_init_children = 32
max_pool = 4
# - Life time -
# Only child_life_time is non-zero; the other three limits are 0
# (presumably meaning "no limit" -- confirm in the pgpool-II documentation).
child_life_time = 300
child_max_connections = 0
connection_life_time = 0
client_idle_limit = 0
#------------------------------------------------------------------------------
# CONNECTION POOLING
#------------------------------------------------------------------------------
# Connection caching is on. reset_query_list holds two ';'-separated
# statements (presumably issued on a pooled connection when a client
# session ends -- confirm).
connection_cache = on
reset_query_list = 'ABORT; DISCARD ALL'
#------------------------------------------------------------------------------
# REPLICATION MODE
#------------------------------------------------------------------------------
# pgpool's own (native) replication is enabled here, while master_slave_mode
# further down in this file is off.
replication_mode = on
replicate_select = off
insert_lock = on
lobj_lock_table = ''
# - Degenerate handling -
# Both mismatch reactions are off in this configuration.
replication_stop_on_mismatch = off
failover_if_affected_tuples_mismatch = off
#------------------------------------------------------------------------------
# LOAD BALANCING MODE
#------------------------------------------------------------------------------
# Load balancing is enabled alongside replication_mode = on.
load_balance_mode = on
ignore_leading_white_space = on
white_function_list = ''
# Comma-separated function names; each name needs to appear only once
# (the list previously repeated nextval and setval).
black_function_list = 'nextval,setval'
database_redirect_preference_list = ''
app_name_redirect_preference_list = ''
allow_sql_comments = on
#------------------------------------------------------------------------------
# MASTER/SLAVE MODE
#------------------------------------------------------------------------------
# Master/slave mode is off (replication_mode is on instead), so the sub-mode
# and streaming-check settings below are presumably not in effect -- confirm.
master_slave_mode = off
master_slave_sub_mode = 'stream'
# - Streaming -
# sr_check_period is 0 (presumably disables the streaming replication
# check -- confirm).
sr_check_period = 0
sr_check_user = 'nobody'
sr_check_password = ''
sr_check_database = 'postgres'
delay_threshold = 0
# - Special commands -
# No follow-master script is configured.
follow_master_command = ''
#------------------------------------------------------------------------------
# HEALTH CHECK GLOBAL PARAMETERS
#------------------------------------------------------------------------------
# NOTE(review): health_check_period is 0, which presumably disables periodic
# backend health checks -- confirm this is intended while use_watchdog = on.
health_check_period = 0
health_check_timeout = 20
health_check_user = 'nobody'
health_check_password = ''
health_check_database = ''
health_check_max_retries = 0
health_check_retry_delay = 1
# NOTE(review): unit is presumably milliseconds (i.e. 10 s) -- confirm.
connect_timeout = 10000
#------------------------------------------------------------------------------
# FAILOVER AND FAILBACK
#------------------------------------------------------------------------------
# Both commands are empty: no failover or failback script is configured.
failover_command = ''
failback_command = ''
fail_over_on_backend_error = on
# NOTE(review): unit is presumably seconds -- confirm.
search_primary_node_timeout = 300
#------------------------------------------------------------------------------
# ONLINE RECOVERY
#------------------------------------------------------------------------------
# Both stage commands are empty: no online-recovery scripts are configured.
recovery_user = 'nobody'
recovery_password = ''
recovery_1st_stage_command = ''
recovery_2nd_stage_command = ''
# NOTE(review): unit is presumably seconds -- confirm.
recovery_timeout = 90
client_idle_limit_in_recovery = 0
#------------------------------------------------------------------------------
# WATCHDOG
#------------------------------------------------------------------------------
# - Enabling -
use_watchdog = on
# Comma-separated host names; ping_path is the directory expected to hold
# the ping binary (presumably used to probe these hosts -- confirm).
trusted_servers = 'pgsql-test.example.com,proxy.example.com,proxy01.example.com,proxy02.example.com'
ping_path = '/bin'
# - Watchdog communication Settings -
# This node's own watchdog endpoint. The peer's other_wd_port0 has to name
# this wd_port (9000), not the heartbeat port (wd_heartbeat_port, 9694).
wd_hostname = 'pgsql01.example.com'
wd_port = 9000
wd_priority = 2
# NOTE(review): shared secret kept in plain text; presumably must be
# identical on both nodes -- confirm.
wd_authkey = 'Hunter1'
wd_ipc_socket_dir = '/tmp'
# - Virtual IP control Setting -
# The up/down commands add and remove the delegate IP ($_IP_$) with a /23
# prefix on interface ens192; the up command also labels it ens192:0.
delegate_IP = '10.10.10.92'
if_cmd_path = '/sbin'
if_up_cmd = 'ip addr add $_IP_$/23 dev ens192 label ens192:0'
if_down_cmd = 'ip addr del $_IP_$/23 dev ens192'
arping_path = '/usr/bin'
arping_cmd = 'arping -U $_IP_$ -w 1'
# - Behavior on escalation Setting -
# No escalation or de-escalation script is configured.
clear_memqcache_on_escalation = on
wd_escalation_command = ''
wd_de_escalation_command = ''
# - Watchdog consensus settings for failover -
failover_when_quorum_exists = on
failover_require_consensus = on
allow_multiple_failover_requests_from_node = off
# -- common --
wd_monitoring_interfaces_list = '' # Comma-separated list of interface names to monitor.
# Lifecheck uses the heartbeat method, so the "query mode" settings further
# down are presumably not in effect -- confirm.
wd_lifecheck_method = 'heartbeat'
wd_interval = 5
# -- heartbeat mode --
# Heartbeats go to pgsql02.example.com on port 9694, the same port number
# this node uses for wd_heartbeat_port.
wd_heartbeat_port = 9694
wd_heartbeat_keepalive = 2
wd_heartbeat_deadtime = 30
heartbeat_destination0 = 'pgsql02.example.com'
heartbeat_destination_port0 = 9694
heartbeat_device0 = ''
# -- query mode --
wd_life_point = 3
wd_lifecheck_query = 'SELECT 1'
wd_lifecheck_dbname = 'template1'
wd_lifecheck_user = 'nobody'
wd_lifecheck_password = ''
# - Other pgpool Connection Settings -
# The peer pgpool node as seen from this host.
# NOTE(review): short host name here, while heartbeat_destination0 uses the
# FQDN pgsql02.example.com -- confirm the short name resolves.
other_pgpool_hostname0 = 'pgsql02'
other_pgpool_port0 = 6432
# Must be the peer's watchdog port (its wd_port, 9000), NOT its heartbeat
# port (9694). With 9694 here, each node started up claiming to be the only
# live node and became master.
other_wd_port0 = 9000
答案1
我从 pgpool-general 邮件列表得到了答案:在我的配置中,other_wd_port0 应该是 9000(即对端的 wd_port),而不是 9694(心跳端口)。