MongoDB Primary 在大约 2 周后持续崩溃

MongoDB Primary 在大约 2 周后持续崩溃

我们在 Azure 中的 3 个虚拟机上运行 mongodb 集群。我自己设置了集群,我们已经使用了大约一年。几个月前,我们的主服务器上出现了 mongodb 崩溃。幸运的是,辅助服务器接管了,我们的平台没有受到任何损害。问题是,从那时起,主服务器大约每两周就会崩溃一次。

我们在 Grafana 中为这些服务器配置了自定义监控。崩溃前不久,“执行 I/O 所花费的时间”这一指标波动很大,峰值明显高于正常水平。不久之后,服务器崩溃并出现以下错误:

2017-08-27T23:29:31.537+0000 F -        [NetworkInterfaceASIO-BGSync-0] std::exception::what(): Resource temporarily unavailable
Actual exception type: std::system_error

 0x1556b32 0x1555e42 0x1ce5506 0x1ce5551 0x14e0a35 0x14e1258 0x12d996e 0x12da16e 0x12da8a8 0x12cd06c 0x12c2a38 0x12c3f9a 0x12c4cd9 0x12c0f85 0x128c858 0x129ee53 0x129f48d 0x1576824 0x1576a21 0x12b75fb 0x1d00200 0x7fa0454f56ba 0x7fa04522b3dd
----- BEGIN BACKTRACE -----
{"backtrace":[{"b":"400000","o":"1156B32","s":"_ZN5mongo15printStackTraceERSo"},{"b":"400000","o":"1155E42"},{"b":"400000","o":"18E5506","s":"_ZN10__cxxabiv111__terminateEPFvvE"},{"b":"400000","o":"18E5551"},{"b":"400000","o":"10E0A35","s":"_ZN5mongo10ThreadPool25_startWorkerThread_inlockEv"},{"b":"400000","o":"10E1258","s":"_ZN5mongo10ThreadPool8scheduleESt8functionIFvvEE"},{"b":"400000","o":"ED996E","s":"_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESD_St11unique_lockISt5mutexE"},{"b":"400000","o":"EDA16E","s":"_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESt11unique_lockISt5mutexE"},{"b":"400000","o":"EDA8A8"},{"b":"400000","o":"ECD06C","s":"_ZN5mongo8executor20NetworkInterfaceASIO7AsyncOp6finishERKNS_10StatusWithINS0_21RemoteCommandResponseEEE"},{"b":"400000","o":"EC2A38","s":"_ZN5mongo8executor20NetworkInterfaceASIO18_completeOperationEPNS1_7AsyncOpERKNS_10StatusWithINS0_21RemoteCommandResponseEEE"},{"b":"400000","o":"EC3F9A","s":"_ZN5mongo8executor20NetworkInterfaceASIO20_completedOpCallbackEPNS1_7AsyncOpE"},{"b":"400000","o":"EC4CD9"},{"b":"400000","o":"EC0F85"},{"b":"400000","o":"E8C858","s":"_ZN4asio6detail14strand_service8dispatchINS0_7binder2IRSt8functionIFvSt10error_codemEES5_mEEEEvRPNS1_11strand_implERT_"},{"b":"400000","o":"E9EE53","s":"_ZN4asio6detail14strand_service8dispatchINS0_17rewrapped_handlerINS0_7binder2INS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS8_EEEENS_17mutable_buffers_1ENS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEESI_mEESK_EEEEvRPNS1_11strand_implERT_"},{"b":"400000","o":"E9F48D","s":"_ZN4asio6detail23reactive_socket_recv_opINS_17mutable_buffers_1ENS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21s
tream_socket_serviceIS6_EEEES2_NS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEEE11do_completeEPvPNS0_19scheduler_operationERKSF_m"},{"b":"400000","o":"1176824","s":"_ZN4asio6detail9scheduler10do_run_oneERNS0_11scoped_lockINS0_11posix_mutexEEERNS0_21scheduler_thread_infoERKSt10error_code"},{"b":"400000","o":"1176A21","s":"_ZN4asio6detail9scheduler3runERSt10error_code"},{"b":"400000","o":"EB75FB"},{"b":"400000","o":"1900200"},{"b":"7FA0454EE000","o":"76BA"},{"b":"7FA045124000","o":"1073DD","s":"clone"}],"processInfo":{ "mongodbVersion" : "3.2.16", "gitVersion" : "056bf45128114e44c5358c7a8776fb582363e094", "compiledModules" : [], "uname" : { "sysname" : "Linux", "release" : "4.4.0-92-generic", "version" : "#115-Ubuntu SMP Thu Aug 10 09:04:33 UTC 2017", "machine" : "x86_64" }, "somap" : [ { "elfType" : 2, "b" : "400000", "buildId" : "B4C77D1B42936B23E28A2739927CB25274DB2D96" }, { "b" : "7FFE41517000", "elfType" : 3, "buildId" : "F23E2C79BCC8E97B12E1BB62A1BF196F8423FC40" }, { "b" : "7FA04647A000", "path" : "/lib/x86_64-linux-gnu/libssl.so.1.0.0", "elfType" : 3, "buildId" : "675F454AD6FD0B6CA2E41127C7B98079DA37F7B6" }, { "b" : "7FA046036000", "path" : "/lib/x86_64-linux-gnu/libcrypto.so.1.0.0", "elfType" : 3, "buildId" : "2DA08A7E5BF610030DD33B70DB951399626B7496" }, { "b" : "7FA045E2E000", "path" : "/lib/x86_64-linux-gnu/librt.so.1", "elfType" : 3, "buildId" : "F951C1E0765FCAE48F82CAFE35D1ADD36D6C9AF9" }, { "b" : "7FA045C2A000", "path" : "/lib/x86_64-linux-gnu/libdl.so.2", "elfType" : 3, "buildId" : "0FC788F0861846257B5F1773FBD438E95DFC1032" }, { "b" : "7FA045921000", "path" : "/lib/x86_64-linux-gnu/libm.so.6", "elfType" : 3, "buildId" : "FF7A33D389E756CA381A8189291A968EA5E1F4F8" }, { "b" : "7FA04570B000", "path" : "/lib/x86_64-linux-gnu/libgcc_s.so.1", "elfType" : 3, "buildId" : "68220AE2C65D65C1B6AAA12FA6765A6EC2F5F434" }, { "b" : "7FA0454EE000", "path" : 
"/lib/x86_64-linux-gnu/libpthread.so.0", "elfType" : 3, "buildId" : "27F189EF8DB8C3734C6A678E6EF3CB0B206D58B2" }, { "b" : "7FA045124000", "path" : "/lib/x86_64-linux-gnu/libc.so.6", "elfType" : 3, "buildId" : "088A6E00A1814622219F346B41E775B8DD46C518" }, { "b" : "7FA0466E3000", "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3, "buildId" : "9157F205547F0EB588E2AB1F2F120B74253A43EA" } ] }}
 mongod(_ZN5mongo15printStackTraceERSo+0x32) [0x1556b32]
 mongod(+0x1155E42) [0x1555e42]
 mongod(_ZN10__cxxabiv111__terminateEPFvvE+0x6) [0x1ce5506]
 mongod(+0x18E5551) [0x1ce5551]
 mongod(_ZN5mongo10ThreadPool25_startWorkerThread_inlockEv+0x965) [0x14e0a35]
 mongod(_ZN5mongo10ThreadPool8scheduleESt8functionIFvvEE+0x358) [0x14e1258]
 mongod(_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESD_St11unique_lockISt5mutexE+0x1FE) [0x12d996e]
 mongod(_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESt11unique_lockISt5mutexE+0x3E) [0x12da16e]
 mongod(+0xEDA8A8) [0x12da8a8]
 mongod(_ZN5mongo8executor20NetworkInterfaceASIO7AsyncOp6finishERKNS_10StatusWithINS0_21RemoteCommandResponseEEE+0x18C) [0x12cd06c]
 mongod(_ZN5mongo8executor20NetworkInterfaceASIO18_completeOperationEPNS1_7AsyncOpERKNS_10StatusWithINS0_21RemoteCommandResponseEEE+0x378) [0x12c2a38]
 mongod(_ZN5mongo8executor20NetworkInterfaceASIO20_completedOpCallbackEPNS1_7AsyncOpE+0x6A) [0x12c3f9a]
 mongod(+0xEC4CD9) [0x12c4cd9]
 mongod(+0xEC0F85) [0x12c0f85]
 mongod(_ZN4asio6detail14strand_service8dispatchINS0_7binder2IRSt8functionIFvSt10error_codemEES5_mEEEEvRPNS1_11strand_implERT_+0x88) [0x128c858]
 mongod(_ZN4asio6detail14strand_service8dispatchINS0_17rewrapped_handlerINS0_7binder2INS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS8_EEEENS_17mutable_buffers_1ENS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEESI_mEESK_EEEEvRPNS1_11strand_implERT_+0x6D3) [0x129ee53]
 mongod(_ZN4asio6detail23reactive_socket_recv_opINS_17mutable_buffers_1ENS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS6_EEEES2_NS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEEE11do_completeEPvPNS0_19scheduler_operationERKSF_m+0x1ED) [0x129f48d]
 mongod(_ZN4asio6detail9scheduler10do_run_oneERNS0_11scoped_lockINS0_11posix_mutexEEERNS0_21scheduler_thread_infoERKSt10error_code+0x304) [0x1576824]
 mongod(_ZN4asio6detail9scheduler3runERSt10error_code+0xC1) [0x1576a21]
 mongod(+0xEB75FB) [0x12b75fb]
 mongod(+0x1900200) [0x1d00200]
 libpthread.so.0(+0x76BA) [0x7fa0454f56ba]
 libc.so.6(clone+0x6D) [0x7fa04522b3dd]
-----  END BACKTRACE  -----
2017-08-27T23:29:31.487+0000 I COMMAND  [conn11335] command admin.$cmd command: isMaster { ismaster: true, client: { driver: { name: "nodejs", version: "2.2.24" }, os: { type: "Linux", name: "linux", architecture: "x64", version: "4.4.0-71-generic" }, platform: "Node.js v4.8.3, LE, mongodb-core: 2.1.8" } } keyUpdates:0 writeConflicts:0 numYields:0 reslen:342 locks:{} protocol:op_query 23099ms

我不知道该从哪里开始调试这个错误。奇怪的是,尽管 2 个辅助服务器具有完全相同的配置,但它们从未遇到停机。

如有需要,以下是我的 mongo 配置:

storage:
  dbPath: /data
  journal:
    enabled: true

systemLog:
  destination: file
  logAppend: true
  path: /var/log/mongodb/mongod.log

net:
  port: 27017
  bindIp: 0.0.0.0

security:
   keyFile: /etc/mongo-keyfile

replication:
   replSetName: rs0
   oplogSizeMB: 150

每台服务器都在 Azure 上运行 Ubuntu 16.04,并安装了 mongo 3.2 的最新版本(目前为 3.2.16)。

答案1

我们的环境中也发生了完全相同的情况。我们发现,崩溃发生时恰好有另一个每日定时作业启动并耗尽了系统内存,这偶尔会导致 MongoDB 崩溃,并出现相同的错误和堆栈跟踪。

我们在 Ubuntu 14.04 LTS 上运行由三台服务器组成的 MongoDB 3.4.9 副本集。

相关内容