- 我已经设置了由 3 个节点 + 1 个延迟隐藏节点 + 仲裁器组成的 mongod 副本集。
- 我已经设置了 DNS:主内部 DNS 和辅助内部 DNS(BIND)服务器,这样我就可以通过普通 FQDN 名称而不是 IP 地址来引用节点。
- 当主 DNS 发生故障时,由辅助 DNS 来处理请求。
问题:
当我模拟主 DNS 关闭时,副本集就被完全破坏了:主节点看不到其他节点,并在 5-10 秒后降级为 SECONDARY(从节点)。
当主 DNS 关闭时,我的主节点(mongodb-cluster-shard-01-rA.site-aws.com)显示的内容如下:
siteRS0:SECONDARY> rs.status()
{
"set" : "siteRS0",
"date" : ISODate("2014-08-10T03:16:22Z"),
"myState" : 2,
"members" : [
{
"_id" : 0,
"name" : "mongodb-cluster-shard-01-rA.site-aws.com:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 1913839,
"optime" : Timestamp(1407628608, 1),
"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
"self" : true
},
{
"_id" : 1,
"name" : "mongodb-cluster-shard-01-rB.site-aws.com:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(1407628608, 1),
"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
"lastHeartbeat" : ISODate("2014-08-10T03:16:08Z"),
"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:52Z"),
"pingMs" : 0,
"syncingTo" : "mongodb-cluster-shard-01-rA.site-aws.com:27017"
},
{
"_id" : 2,
"name" : "mongodb-cluster-shard-01-arbiter.site-aws.com:30000",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"lastHeartbeat" : ISODate("2014-08-10T03:16:19Z"),
"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:45Z"),
"pingMs" : 0
},
{
"_id" : 3,
"name" : "mongodb-cluster-shard-01-rC.site-aws.com:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(1407628608, 1),
"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
"lastHeartbeat" : ISODate("2014-08-10T03:16:16Z"),
"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:52Z"),
"pingMs" : 0,
"syncingTo" : "mongodb-cluster-shard-01-rA.site-aws.com:27017"
},
{
"_id" : 4,
"name" : "mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(1407628608, 1),
"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
"lastHeartbeat" : ISODate("2014-08-10T03:16:00Z"),
"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:49Z"),
"pingMs" : 0,
"syncingTo" : "mongodb-cluster-shard-01-rA.site-aws.com:27017"
}
],
"ok" : 1
}
查看日志时,我会看到很多 getaddrinfo 失败消息:
[root@mongodb-cluster-shard-01-rA ec2-user]# tail /mongo/log/mongod.log
2014-08-10T02:35:13.044+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-arbiter.site-aws.com") failed: Name or service not known
2014-08-10T02:35:13.469+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:13.469+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rC.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rC.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:13.968+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com") failed: Name or service not known
2014-08-10T02:35:13.968+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:17.059+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rB.site-aws.com") failed: Name or service not known
2014-08-10T02:35:17.059+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rB.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rB.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:18.476+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:18.669+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rC.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rC.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:18.976+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com") failed: Name or service not known
[root@mongodb-cluster-shard-01-rA ec2-user]# tail /mongo/log/mongod.log
2014-08-10T02:35:17.059+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rB.site-aws.com") failed: Name or service not known
2014-08-10T02:35:17.059+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rB.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rB.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:18.476+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:18.669+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rC.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rC.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:18.976+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com") failed: Name or service not known
2014-08-10T02:35:20.051+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-arbiter.site-aws.com") failed: Name or service not known
2014-08-10T02:35:20.051+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-arbiter.site-aws.com:30000: couldn't connect to server mongodb-cluster-shard-01-arbiter.site-aws.com:30000 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:23.677+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:24.066+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rB.site-aws.com") failed: Name or service not known
2014-08-10T02:35:24.066+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rB.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rB.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
[root@mongodb-cluster-shard-01-rA ec2-user]#
但是 nslookup 可以正确解析 FQDN 到 IP:
[root@mongodb-cluster-shard-01-rA ec2-user]# nslookup mongodb-cluster-shard-01-rC.site-aws.com
Server: 10.233.147.18 (this is secondary dns)
Address: 10.233.147.18#53
Name: mongodb-cluster-shard-01-rC.site-aws.com
Address: 10.220.153.211
重新启动主 DNS(.119)后,很快又会通过主 DNS 进行解析:
[root@mongodb-cluster-shard-01-rA ec2-user]# nslookup mongodb-cluster-shard-01-rC.site-aws.com
Server: 10.35.147.119
Address: 10.35.147.119#53
一旦主 DNS 启动并运行,一切就会恢复正常:我的副本节点重新成为主节点(PRIMARY),一切正常。那么我错过了什么或做错了什么?
我的 mongo 实例有以下 /etc/resolv.conf 文件:
[root@mongodb-cluster-shard-01-rA log]# cat /etc/resolv.conf
; generated by /sbin/dhclient-script
search us-west-2.compute.internal site.com
nameserver 10.35.147.119
nameserver 10.233.147.18
nameserver 172.16.0.23
nameserver 172.16.0.23
主 DNS /etc/named.conf:
options {
#listen-on port 53 { 127.0.0.1; 10.224.3.36};
listen-on-v6 port 53 { ::1; };
directory "/var/named";
dump-file "/var/named/data/cache_dump.db";
statistics-file "/var/named/data/named_stats.txt";
memstatistics-file "/var/named/data/named_mem_stats.txt";
allow-query { any; };
recursion no;
dnssec-enable yes;
dnssec-validation yes;
dnssec-lookaside auto;
/* Path to ISC DLV key */
bindkeys-file "/etc/named.iscdlv.key";
managed-keys-directory "/var/named/dynamic";
notify yes;
also-notify { 10.233.147.18; };
};
logging {
channel default_debug {
file "data/named.run";
severity dynamic;
};
};
zone "site-aws.com" IN {
type master;
file "site-aws.com.zone";
allow-update { none; };
allow-query { any; };
allow-transfer {10.233.147.18; };
};
include "/etc/named.rfc1912.zones";
include "/etc/named.root.key";
“site-aws.com.zone”定义:
$TTL 86400
@ IN SOA ns1.site-aws.com. root.site-aws.com. (
2013042203 ;Serial
300 ;Refresh
1800 ;Retry
604800 ;Expire
86400 ;Minimum TTL
)
; Specify our two nameservers
IN NS ns1.site-aws.com.
; IN NS ns2.site-aws.com.
; Resolve nameserver hostnames to IP, replace with your two droplet IP addresses.
ns1 IN A 10.224.3.36
;ns2 IN A 2.2.2.2
; Define hostname -> IP pairs which you wish to resolve
devops IN A 10.35.147.119
mongodb-cluster-shard-01-rA IN A 10.230.9.223
mongodb-cluster-shard-01-rB IN A 10.17.6.57
mongodb-cluster-shard-01-rC IN A 10.220.153.211
mongodb-cluster-shard-01-arbiter IN A 10.251.112.114
mongodb-cluster-shard-01-rA-backup-hidden IN A 10.230.20.83
mongodb-cluster-backup IN A 10.230.20.83
prod-redis-cluster-01-rA IN A 10.226.207.86
ns1 IN A 10.35.147.119
ns2
IN A 10.233.147.18
辅助 DNS /etc/named.conf:
options {
#listen-on port 53 { 127.0.0.1; 10.224.3.36};
listen-on-v6 port 53 { ::1; };
directory "/var/named";
dump-file "/var/named/data/cache_dump.db";
statistics-file "/var/named/data/named_stats.txt";
memstatistics-file "/var/named/data/named_mem_stats.txt";
allow-query { any; };
recursion no;
dnssec-enable yes;
dnssec-validation yes;
dnssec-lookaside auto;
/* Path to ISC DLV key */
bindkeys-file "/etc/named.iscdlv.key";
managed-keys-directory "/var/named/dynamic";
};
logging {
channel default_debug {
file "data/named.run";
severity dynamic;
};
};
zone "site-aws.com" IN {
type slave;
file "site-aws.com.zone";
allow-query { any; };
allow-transfer {10.35.147.119; }; ## NS1 is allowed for zone transfer when necessary ##
masters {10.35.147.119; }; ## the master NS1 is defined ##
};
include "/etc/named.rfc1912.zones";
include "/etc/named.root.key";
辅助 DNS 已同步 site-aws.com.zone - 文件存在。
那么问题是:为什么 MongoDB 副本集会有这样的表现?如何确保在主 DNS 发生故障时,副本集(以及通过 FQDN 引用内部节点的所有其他节点)仍可正常运行?
答案1
问题出在 glibc 上:它会缓存 /etc/resolv.conf 的数据。我通过安装 nscd 解决了这个问题:
yum install nscd; chkconfig nscd on; /etc/init.d/nscd start
之后问题就消失了。几个相关主题:
- https://stackoverflow.com/questions/125466/using-glibc-why-does-my-gethostbyname-fail-after-i-dhcp-has-changed-the-dns-ser
- https://jira.mongodb.org/browse/SERVER-7587
- https://jira.mongodb.org/browse/SERVER-12099
希望这对将来的某人有所帮助。