到目前为止，我们一直在使用 Prometheus，效果相当不错。然而，我们最近注意到，我们设置的一组警报被打上了错误的标签：警报显示的环境与实际环境不符。这种情况只出现在配置了多个目标的两个环境中，所以我想知道这是语法问题（但不知为何其余部分都正常？）还是 blackbox-exporter 的（使用）问题。
具体来说，列出了多个目标的环境会把第二个及之后的目标标记为列表中的下一个环境（ENV-B1 的 .2 和 .3 显示为 ENV-B2，而 ENV-B2 的 .2 和 .3 显示为 ENV-B3）。其余的标签都是正确的。
以下是我的配置（其中的各种信息已被替换）：
# prometheus.yml
# Prometheus server configuration: loads rule/alert files and defines one
# scrape job that probes HTTP endpoints through the /probe metrics path.
rule_files:
  - /etc/config/rules
  - /etc/config/alerts

scrape_configs:
  # Note: Prometheus lists these in a different order, it's not clear why.
  - job_name: "a client (direct)"
    metrics_path: /probe
    params:
      module: [https_health]
    static_configs:
      # Each entry below is one target group: `labels` is a sibling of
      # `targets` (same indentation), so it is attached to every target
      # listed in that same entry.
      # ACME
      - targets:
          - "http://1.1.1.1:8089/isAlive"
        labels:
          environment: S1
          project: ACME
          service: web-api
          importance: moderate
      - targets:
          - "http://2.2.2.2:8089/isAlive"
        labels:
          environment: S2
          project: ACME
          service: web-api
          importance: moderate
      # B environments run multiple instances; Check all of them.
      - targets:
          - "http://3.3.3.1:8089/isAlive"
          - "http://3.3.3.2:8089/isAlive"
          - "http://3.3.3.3:8089/isAlive"
        labels:
          environment: B1
          project: ACME
          service: web-api
          importance: high
      # B environments run multiple instances; Check all of them.
      - targets:
          - "http://4.4.4.1:8089/isAlive"
          - "http://4.4.4.2:8089/isAlive"
          - "http://4.4.4.3:8089/isAlive"
        labels:
          environment: B2
          project: ACME
          service: web-api
          importance: high
      # B environments run multiple instances . . . except B3.
      - targets:
          - "http://5.5.5.5:9089/isAlive"
        labels:
          environment: B3
          project: ACME
          service: web-api
          importance: high
      # (more)
    relabel_configs:
      # Pass the configured target URL as the `target` query parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed URL visible as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape to this fixed address instead of the target.
      # NOTE(review): presumably the blackbox-exporter endpoint; confirm.
      - target_label: __address__
        replacement: "10.10.10.10:32222"
  # (etc)
# alertmanager.yml
# Alertmanager configuration excerpt: one Slack receiver for the ACME dev team.
# NOTE(review): `global` carries no settings in this excerpt and no `route`
# section is shown; confirm against the full file.
global:
receivers:
  - name: slack-devteam-acme
    slack_configs:
      - send_resolved: true
        api_url: "...url..."
        username: "Prometheus Monitoring"
        # Title: status emoji, the group's common `summary` annotation, then
        # "failing" or "resolved" depending on the notification status.
        title: '{{if eq .Status "firing"}}:fire:{{else}}:successful:{{end}} {{.CommonAnnotations.summary}} {{if eq .Status "firing"}}failing{{else}}resolved{{end}}'
        title_link: "http://10.10.10.10:32600/alerts"
        # Body: the common description followed by one line per alert listing
        # that alert's own instance / environment / project / service labels.
        text: "{{.CommonAnnotations.description}}\nInstance(s):{{range .Alerts}}\n{{.Labels.instance}} {{.Labels.environment}} {{.Labels.project}} {{.Labels.service}}{{ end }}"
# alerts.yml
# Alerting rules for probe results. Rule groups must live under the top-level
# `groups` key in the Prometheus 2.x rule-file format.
groups:
  - name: blackbox-export-basic
    rules:
      # Fires when a probe has reported failure (probe_success < 1)
      # continuously for 5 minutes.
      - alert: ProbeFailure
        expr: probe_success < 1
        for: 5m
        # labels:
        #   severity: page
        annotations:
          summary: "{{$labels.job}} probe"
          description: "{{$labels.job}} failed check for 5+ minutes."
          # description: "{{$labels.instance}} ({{$labels.job}}) failed check for 10+ minutes."
# blackbox-exporter.config.yml
# Probe modules. Each module uses the HTTP prober and fails the probe when the
# response body does not match the listed regular expression.
modules:
  # Fails unless the response body contains "buildNumber".
  http_build_number:
    prober: http
    http:
      fail_if_body_not_matches_regexp:
        - "buildNumber"
  # Fails unless the response body contains "Success".
  https_health:
    prober: http
    http:
      fail_if_body_not_matches_regexp:
        - "Success"
  # (etc)