当检查返回 0 时，Nagios SNMP 不会发出警报

2024-6-2 • tag-icon

我正在尝试为水冷设备设置检查：当流量降为 0 时，我希望触发警报。这是我的配置：

#/etc/nagios/custom/commands.cfg
## Chillwater
# Command chillwatersnmp1: runs check_snmp against $HOSTNAME$ using SNMP v2c
# (community "techlook") for the OID ending in .266.
# The -r regex requires a non-zero leading digit, any further digits, a dot,
# and six digits; it has no ^/$ anchors.
# NOTE(review): a reading whose integer part is 0 (e.g. 0.500000) cannot match
# this regex -- confirm that treating such readings as non-OK is intended.
define command {
    command_name        chillwatersnmp1
    command_line        $USER1$/check_snmp -H $HOSTNAME$ -P 2c -C techlook -o 1.3.6.1.4.1.3815.1.2.2.1.1.2.1.1.2.266  -r "[1-9][0-9]*\.[0-9]{6}"
}

# Command chillwatersnmp2: same check_snmp invocation as chillwatersnmp1
# (SNMP v2c, community "techlook", same -r regex) but for the OID ending
# in .279.
# NOTE(review): the regex cannot match readings whose integer part is 0
# (e.g. 0.500000) -- confirm that is intended.
define command {
    command_name        chillwatersnmp2
    command_line        $USER1$/check_snmp -H $HOSTNAME$ -P 2c -C techlook -o 1.3.6.1.4.1.3815.1.2.2.1.1.2.1.1.2.279  -r "[1-9][0-9]*\.[0-9]{6}"
}

# Command chillwatersnmp3: check_snmp over SNMP v2c (community "techlook")
# for the OID ending in .279, combining an exact-string option (-s 0) with a
# critical threshold range (-c0:120).
# NOTE(review): per the check_snmp documentation -s treats an exact match as
# the OK case, so a reading of "0" would presumably be reported as OK rather
# than CRITICAL -- verify against the plugin's --help output; the later
# z_* commands add --invert-search for this reason.
define command {
    command_name        chillwatersnmp3
    command_line        $USER1$/check_snmp -H $HOSTNAME$ -P 2c -C techlook -o 1.3.6.1.4.1.3815.1.2.2.1.1.2.1.1.2.279 -s 0 -c0:120
}
#/etc/nagios/custom/hosts/chillwater.cfg
# Host group "chillwater": contains the two hosts cc-bb-mr.company.com and
# cc-bb-north.company.com.
define hostgroup {
    hostgroup_name      chillwater
    alias               chillwater
    members             cc-bb-mr.company.com,cc-bb-north.company.com
}

#/etc/nagios/custom/services/chillwater.cfg
# Service group "chillwater": declared without explicit members; services
# join it through their own "servicegroups chillwater" directive.
define servicegroup {
        servicegroup_name       chillwater
        alias                   chillwater
}

# Service "Bridge-6-heat-flow" on cc-bb-mr.company.com: inherits from the
# basic-service template and runs the chillwatersnmp1 command.
# NOTE(review): contact group "chillwatergroup" is not defined in the visible
# configuration -- confirm it exists, has member contacts, and that those
# contacts accept CRITICAL service notifications.
define service {
    use                   basic-service
    host_name             cc-bb-mr.company.com
    service_description   Bridge-6-heat-flow
    servicegroups         chillwater
    check_command         chillwatersnmp1
    contact_groups        chillwatergroup
}

# Service "Bridge-1-heat-flow" on cc-bb-mr.company.com: inherits from the
# basic-service template and runs the chillwatersnmp2 command.
# NOTE(review): contact group "chillwatergroup" is not defined in the visible
# configuration -- confirm it exists, has member contacts, and that those
# contacts accept CRITICAL service notifications.
define service {
    use                   basic-service
    host_name             cc-bb-mr.company.com
    service_description   Bridge-1-heat-flow
    servicegroups         chillwater
    check_command         chillwatersnmp2
    contact_groups        chillwatergroup
}

# Service "HPC-cool-heat-flow" on cc-bb-north.company.com: inherits from the
# basic-service template and runs the chillwatersnmp3 command.
# NOTE(review): contact group "chillwatergroup" is not defined in the visible
# configuration -- confirm it exists, has member contacts, and that those
# contacts accept CRITICAL service notifications.
define service {
    use                   basic-service
    host_name             cc-bb-north.company.com
    service_description   HPC-cool-heat-flow
    servicegroups         chillwater
    check_command         chillwatersnmp3
    contact_groups        chillwatergroup
}

#/etc/nagios/templates.cfg
# Base service template "generic-service" (register 0, so it is never
# instantiated as a real service). Other templates and services pull in these
# defaults with "use generic-service".
define service{
        name                            generic-service         ; The 'name' of this service template
        active_checks_enabled           1                       ; Active service checks are enabled
        passive_checks_enabled          1                       ; Passive service checks are enabled/accepted
        parallelize_check               1                       ; Active service checks should be parallelized (disabling this can lead to major performance problems)
        obsess_over_service             1                       ; We should obsess over this service (if necessary)
        check_freshness                 0                       ; Default is to NOT check service 'freshness'
        notifications_enabled           1                       ; Service notifications are enabled
        event_handler_enabled           1                       ; Service event handler is enabled
        flap_detection_enabled          1                       ; Flap detection is enabled
        flap_detection_options          o,c                     ; Flap detection considers only the OK and CRITICAL states

        process_perf_data               1                       ; Process performance data
        retain_status_information       1                       ; Retain status information across program restarts
        retain_nonstatus_information    1                       ; Retain non-status information across program restarts
        is_volatile                     0                       ; The service is not volatile
        check_period                    24x7                    ; The service can be checked at any time of the day
        max_check_attempts              3                       ; Re-check the service up to 3 times in order to determine its final (hard) state
        check_interval           10                     ; Check the service every 10 minutes under normal conditions
        retry_interval            2                     ; Re-check the service every two minutes until a hard state can be determined
        contact_groups                  admins                  ; Notifications get sent out to everyone in the 'admins' group
        notification_options            c,r                     ; Send notifications only for critical and recovery events (no warning/unknown)
        notification_interval           60                      ; Re-notify about service problems every hour
        notification_period             24x7                    ; Notifications can be sent out at any time
         register                        0                      ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
        }
# Service template "basic-service": inherits from generic-service and is
# itself a template (register 0).
# Values that differ from generic-service: max_check_attempts (2 instead of 3),
# retry_interval (3 instead of 2) and notification_interval (10 instead of 60).
# The other directives listed here repeat the values already set in
# generic-service.
define service{
        use                             generic-service
        name                            basic-service
        is_volatile                     0
        check_period                    24x7
        max_check_attempts              2
        check_interval           10
        retry_interval            3
        notification_options            c,r
        notification_interval           10
        notification_period             24x7
        register                        0
        }

我在这里遗漏了什么？我已经等了十分钟（notification_interval），但始终没有收到警报，Web 控制台中也什么都没有出现。

编辑：使用下面这些命令，我可以让 Nagios 识别出严重（CRITICAL）状态：

    # NOTE(review): the opening "define command {" line for this definition is
    # missing from the pasted text; the closing brace below has no visible
    # opener.
    # Command z_chillwatersnmp1: check_snmp for the OID ending in .266 with
    # -s 0 and --invert-search. No -P option is given here, unlike the
    # chillwatersnmp* commands.
    command_name        z_chillwatersnmp1
    command_line        $USER1$/check_snmp -H $HOSTNAME$ -C techlook -o 1.3.6.1.4.1.3815.1.2.2.1.1.2.1.1.2.266 -s 0 --invert-search
}

# Command z_chillwatersnmp2: check_snmp for the OID ending in .279 with
# -s 0 and --invert-search, so that a reading of exactly "0" is the alerting
# case (per the check_snmp documentation for --invert-search).
# No -P option is given here, unlike the chillwatersnmp* commands.
define command {
    command_name        z_chillwatersnmp2
    command_line        $USER1$/check_snmp -H $HOSTNAME$ -C techlook -o 1.3.6.1.4.1.3815.1.2.2.1.1.2.1.1.2.279 -s 0 --invert-search
}

# Command z_chillwatersnmp3: check_snmp for the OID ending in .279 with
# -s 0 and --invert-search, so that a reading of exactly "0" is the alerting
# case (per the check_snmp documentation for --invert-search).
# The command line is identical to z_chillwatersnmp2; only the command name
# differs. No -P option is given here, unlike the chillwatersnmp* commands.
define command {
    command_name        z_chillwatersnmp3
    command_line        $USER1$/check_snmp -H $HOSTNAME$ -C techlook -o 1.3.6.1.4.1.3815.1.2.2.1.1.2.1.1.2.279 -s 0 --invert-search
}

上面那个问题已经解决了。现在的问题是：当这些服务进入严重（CRITICAL）状态时，为什么没有发送通知？

相关内容