3.1 配置文件结构概述
基本结构
Alertmanager 的配置文件采用 YAML 格式,主要包含以下几个部分:
# Basic layout of alertmanager.yml: the top-level keys, each left empty here
global:
# Global defaults
templates:
# Template file paths
route:
# Routing tree
receivers:
# Receiver definitions
inhibit_rules:
# Inhibition rules
time_intervals:
# Named time intervals
mute_time_intervals:
# Mute time intervals
# NOTE(review): upstream docs list this key as deprecated in favour of
# time_intervals — confirm for the Alertmanager version in use
配置文件示例
# Complete configuration file example
global:
  # SMTP defaults inherited by every email_configs entry
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: 'app-password'
  smtp_require_tls: true
  # Global resolve timeout
  resolve_timeout: 5m
  # Default HTTP client settings
  http_config:
    proxy_url: 'http://proxy.example.com:8080'
  # Slack global settings
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  # PagerDuty global settings
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

# Template files
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-team'
      group_wait: 5s
      repeat_interval: 30m
    - match:
        team: database
      receiver: 'dba-team'
      group_by: ['alertname', 'instance']
    - match_re:
        service: '^(web|api).*'
      receiver: 'web-team'
      continue: true

# Receivers
receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@example.com'
        # email_configs has no `subject`/`body` keys: the subject is a header
        # and the plain-text body is `text`. The default HTML part is still
        # sent unless `html` is overridden.
        headers:
          Subject: '[ALERT] {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
          {{ end }}
  - name: 'critical-team'
    email_configs:
      - to: 'critical@example.com'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
    slack_configs:
      - channel: '#critical-alerts'
        title: 'Critical Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
    pagerduty_configs:
      - routing_key: 'YOUR_PAGERDUTY_INTEGRATION_KEY'
        description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
  - name: 'dba-team'
    email_configs:
      - to: 'dba@example.com'
        headers:
          Subject: '[DB] {{ .GroupLabels.alertname }}'
    webhook_configs:
      - url: 'http://dba-webhook.example.com/alerts'
        send_resolved: true
  - name: 'web-team'
    slack_configs:
      - channel: '#web-alerts'
        username: 'alertmanager'
        icon_emoji: ':warning:'
        title: 'Web Service Alert'
        text: |
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Service:* {{ .Labels.service }}
          *Instance:* {{ .Labels.instance }}
          *Severity:* {{ .Labels.severity }}
          {{ end }}

# Inhibition rules
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
  - source_match:
      alertname: 'NodeDown'
    target_match_re:
      alertname: '^(NodeCPU|NodeMemory|NodeDisk).*'
    equal: ['instance']

# Named time intervals (referenced from routes by name)
time_intervals:
  - name: 'business-hours'
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '17:00'
        weekdays: ['monday:friday']
  - name: 'weekends'
    time_intervals:
      - times:
          # end_time is exclusive; '24:00' covers the whole day without
          # leaving the final minute uncovered
          - start_time: '00:00'
            end_time: '24:00'
        weekdays: ['saturday', 'sunday']

# Mute time intervals (legacy top-level key; same schema as time_intervals)
mute_time_intervals:
  - name: 'maintenance-window'
    time_intervals:
      - times:
          - start_time: '02:00'
            end_time: '04:00'
        weekdays: ['sunday']
3.2 全局配置(Global)
SMTP 配置
global:
  # --- SMTP server ---
  # Host and port of the SMTP relay
  smtp_smarthost: 'smtp.gmail.com:587'
  # Default From address
  smtp_from: 'alerts@example.com'
  # User name for SMTP authentication
  smtp_auth_username: 'alerts@example.com'
  # Password, used for the LOGIN and PLAIN mechanisms
  smtp_auth_password: 'app-password'
  # Shared secret, used for the CRAM-MD5 mechanism
  smtp_auth_secret: 'smtp-secret'
  # Identity, used for the PLAIN mechanism
  smtp_auth_identity: 'alerts@example.com'
  # Require STARTTLS before sending
  smtp_require_tls: true
  # Host name announced in the SMTP HELO/EHLO greeting
  smtp_hello: 'alertmanager.example.com'
HTTP 配置
global:
  # Default HTTP client settings for all HTTP-based receivers
  http_config:
    # Outbound proxy
    proxy_url: 'http://proxy.example.com:8080'
    tls_config:
      # CA bundle used to verify the server certificate
      ca_file: '/etc/ssl/certs/ca.pem'
      # Client certificate and key for mutual TLS
      cert_file: '/etc/ssl/certs/client.pem'
      key_file: '/etc/ssl/private/client.key'
      # Expected server name (SNI / certificate check)
      server_name: 'alertmanager.example.com'
      # Keep certificate verification enabled
      insecure_skip_verify: false
    # Authentication: configure exactly ONE mechanism. Alertmanager refuses to
    # load a config that combines basic_auth, bearer token and oauth2.
    basic_auth:
      username: 'user'
      password: 'pass'
    # Alternative 1: bearer token (inline or from a file, not both)
    # bearer_token: 'bearer-token'
    # bearer_token_file: '/etc/alertmanager/token'
    # Alternative 2: OAuth2 client credentials
    # oauth2:
    #   client_id: 'client-id'
    #   client_secret: 'client-secret'
    #   token_url: 'https://oauth.example.com/token'
    #   scopes: ['scope1', 'scope2']
第三方服务配置
global:
  # Slack: set either the URL inline or the file holding it, never both
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  # slack_api_url_file: '/etc/alertmanager/slack_url'

  # PagerDuty
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

  # OpsGenie: set either the key inline or the key file, never both.
  # The *_file form keeps the secret out of the config file.
  opsgenie_api_url: 'https://api.opsgenie.com/'
  opsgenie_api_key: 'your-opsgenie-api-key'
  # opsgenie_api_key_file: '/etc/alertmanager/opsgenie_key'

  # VictorOps
  victorops_api_url: 'https://alert.victorops.com/integrations/generic/20131114/alert/'
  victorops_api_key: 'your-victorops-api-key'

  # WeChat Work
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_secret: 'your-wechat-secret'
  wechat_api_corp_id: 'your-corp-id'
其他全局配置
global:
  # Time after which an alert is treated as resolved when no update arrives
  resolve_timeout: 5m

  # The two settings below are NOT valid Alertmanager global options; leaving
  # them uncommented makes the config fail to load with an unknown-field error.
  #
  # Time zone: there is no global setting. Set it per time interval instead,
  # via the interval's `location` field (newer releases — verify support in
  # the version you run).
  # timezone: 'Asia/Shanghai'
  #
  # External labels belong in Prometheus (prometheus.yml -> global ->
  # external_labels); they reach Alertmanager as ordinary alert labels.
  # external_labels:
  #   cluster: 'production'
  #   region: 'us-west-1'
  #   environment: 'prod'
3.3 路由配置(Route)
基础路由配置
route:
  # Labels used to group alerts into one notification
  group_by:
    - 'alertname'
    - 'cluster'
    - 'service'

  # Timing
  # How long to wait for more alerts of the same group before the first send
  group_wait: 10s
  # Interval between notifications about changes within the same group
  group_interval: 10s
  # Interval before an unchanged notification is sent again
  repeat_interval: 1h

  # Receiver used when no child route matches
  receiver: 'default'

  # Whether matching continues with later sibling routes
  continue: false

  # Child routes
  routes:
    - match:
        severity: critical
      receiver: 'critical-team'
      group_wait: 5s
      repeat_interval: 30m
高级路由配置
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # Critical alerts, with nested per-team routes
    - match:
        severity: critical
      receiver: 'critical-team'
      group_wait: 5s
      group_interval: 5s
      repeat_interval: 15m
      routes:
        - match:
            team: database
          receiver: 'dba-critical'
          group_by: ['alertname', 'instance']
        - match:
            team: infrastructure
          receiver: 'infra-critical'

    # Regular-expression match
    - match_re:
        service: '^(web|api|frontend).*'
      receiver: 'web-team'
      group_by: ['alertname', 'service']
      # Keep evaluating the sibling routes below after this one matches
      continue: true

    # Several conditions at once (all matchers must hold)
    - matchers:
        - alertname = "HighCPUUsage"
        - severity =~ "warning|critical"
        - instance !~ "test.*"
      receiver: 'cpu-alerts'

    # Time-window routing
    - match:
        team: operations
      receiver: 'ops-team'
      active_time_intervals:
        - 'business-hours'
      mute_time_intervals:
        - 'maintenance-window'

    # Per-team routing.
    # Route matchers and receiver names are static strings; Go templates are
    # NOT expanded here, so "{{ .GroupLabels.team }}-alerts" would be looked up
    # literally and rejected as an undefined receiver. List one route per team.
    - matchers:
        - team = "database"
      receiver: 'database-alerts'
    - matchers:
        - team = "infrastructure"
      receiver: 'infrastructure-alerts'
路由匹配器详解
# Exact (equality) match on label values
match:
  severity: critical
  alertname: HighCPUUsage

# Regular-expression match on label values
match_re:
  instance: '^prod-.*'
  service: '(web|api)'

# Matcher list (the recommended form); one operator per entry:
#   =   equal
#   =~  matches regex
#   !~  does not match regex
#   !=  not equal
matchers:
  - alertname = "HighCPUUsage"
  - severity =~ "warning|critical"
  - instance !~ "test.*"
  - team != "development"
3.4 接收器配置(Receivers)
邮件接收器
receivers:
  - name: 'email-team'
    email_configs:
      - to: 'team@example.com'
        # Overrides global smtp_from
        from: 'alerts@example.com'

        # Mail headers. email_configs has no `subject` key: the subject is
        # set through the Subject header.
        headers:
          Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
          # High priority
          X-Priority: '1'
          X-Mailer: 'Alertmanager'

        # Plain-text part (the field is `text`, not `body`)
        text: |
          {{ if eq .Status "firing" }}
          🚨 告警触发
          {{ else }}
          ✅ 告警恢复
          {{ end }}
          告警组: {{ .GroupLabels.alertname }}
          集群: {{ .GroupLabels.cluster }}
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          实例: {{ .Labels.instance }}
          严重程度: {{ .Labels.severity }}
          开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ if eq .Status "resolved" }}结束时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}
          {{ end }}

        # HTML part
        html: |
          <h2>{{ if eq .Status "firing" }}🚨 告警触发{{ else }}✅ 告警恢复{{ end }}</h2>
          <table border="1">
          <tr><th>告警名称</th><th>实例</th><th>严重程度</th><th>状态</th></tr>
          {{ range .Alerts }}
          <tr>
          <td>{{ .Labels.alertname }}</td>
          <td>{{ .Labels.instance }}</td>
          <td>{{ .Labels.severity }}</td>
          <td>{{ .Status }}</td>
          </tr>
          {{ end }}
          </table>

        # Per-receiver SMTP overrides
        smarthost: 'smtp.company.com:587'
        auth_username: 'alerts@company.com'
        auth_password: 'company-password'
        require_tls: true
# Why the end-time line tests the alert status: .EndsAt is a time value
# (a struct), so `{{ if .EndsAt }}` is always true and would print an end
# time for alerts that are still firing.
Slack 接收器
receivers:
  - name: 'slack-team'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#alerts'
        username: 'Alertmanager'
        icon_emoji: ':warning:'
        icon_url: 'https://example.com/alertmanager-icon.png'
        title: '{{ if eq .Status "firing" }}🚨{{ else }}✅{{ end }} {{ .GroupLabels.alertname }}'
        title_link: 'http://alertmanager.example.com'
        text: |
          {{ if eq .Status "firing" }}
          *状态:* 告警触发
          {{ else }}
          *状态:* 告警恢复
          {{ end }}
          *集群:* {{ .GroupLabels.cluster }}
          *服务:* {{ .GroupLabels.service }}
          {{ range .Alerts }}
          *告警:* {{ .Annotations.summary }}
          *描述:* {{ .Annotations.description }}
          *实例:* {{ .Labels.instance }}
          *严重程度:* {{ .Labels.severity }}
          {{ end }}

        # Attachment colour.
        # Kept on one line: a `|` block scalar would render with newlines and
        # padding around the value, which is not a valid colour name.
        # .CommonLabels is used because .GroupLabels only contains the labels
        # listed in the route's group_by.
        color: '{{ if eq .Status "firing" }}{{ if eq .CommonLabels.severity "critical" }}danger{{ else if eq .CommonLabels.severity "warning" }}warning{{ else }}good{{ end }}{{ else }}good{{ end }}'

        # Attachment fields
        fields:
          - title: '告警数量'
            value: '{{ len .Alerts }}'
            short: true
          - title: '集群'
            value: '{{ .GroupLabels.cluster }}'
            short: true

        # Action buttons
        actions:
          - type: 'button'
            text: '查看详情'
            url: 'http://prometheus.example.com/alerts'
          - type: 'button'
            text: '静默告警'
            url: 'http://alertmanager.example.com/#/silences/new'

        # Also notify when alerts resolve
        send_resolved: true

        # Per-receiver HTTP client settings
        http_config:
          proxy_url: 'http://proxy.example.com:8080'
Webhook 接收器
receivers:
  - name: 'webhook-team'
    webhook_configs:
      - url: 'http://webhook.example.com/alerts'
        send_resolved: true

        # HTTP client settings (authentication and TLS). The webhook is always
        # sent as a JSON POST; the HTTP method is not configurable.
        http_config:
          # Configure exactly ONE authentication mechanism; combining
          # basic_auth with a bearer token is rejected at load time.
          basic_auth:
            username: 'webhook-user'
            password: 'webhook-pass'
          # bearer_token: 'webhook-token'
          tls_config:
            # Keep certificate verification on; only disable it for throwaway
            # test endpoints.
            insecure_skip_verify: false

        # NOTE(review): `http_headers` is not a webhook_config field. Recent
        # releases accept custom headers inside http_config instead — confirm
        # the exact syntax for your version before enabling.
        # http_headers:
        #   'Content-Type': 'application/json'
        #   'X-Custom-Header': 'alertmanager'

        # Maximum number of alerts per webhook message (0 = no limit)
        max_alerts: 10
PagerDuty 接收器
receivers:
  - name: 'pagerduty-team'
    pagerduty_configs:
      - routing_key: 'YOUR_PAGERDUTY_INTEGRATION_KEY'
        description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

        # Event details
        details:
          cluster: '{{ .GroupLabels.cluster }}'
          service: '{{ .GroupLabels.service }}'
          alert_count: '{{ len .Alerts }}'

        # Severity mapping.
        # Kept on one line: PagerDuty expects an exact severity value, and a
        # `|` block scalar would render it with surrounding newlines.
        # .CommonLabels is used because .GroupLabels only contains the labels
        # listed in the route's group_by.
        severity: '{{ if eq .CommonLabels.severity "critical" }}critical{{ else if eq .CommonLabels.severity "warning" }}warning{{ else }}info{{ end }}'

        # Client information shown in PagerDuty
        client: 'Alertmanager'
        client_url: 'http://alertmanager.example.com'

        # Custom links
        links:
          - href: 'http://prometheus.example.com/alerts'
            text: 'Prometheus Alerts'
          - href: 'http://grafana.example.com/dashboard'
            text: 'Grafana Dashboard'

        # Images
        images:
          - src: 'http://example.com/chart.png'
            alt: 'Alert Chart'
微信接收器
receivers:
  - name: 'wechat-team'
    wechat_configs:
      - corp_id: 'your-corp-id'
        api_secret: 'your-api-secret'
        # Recipients: '@all', or specific users such as 'user1|user2'
        to_user: '@all'
        # Department ID
        to_party: '1'
        # Tag
        to_tag: 'tag1'
        agent_id: '1000001'
        title: '{{ .GroupLabels.alertname }}'
        message: |
          {{ if eq .Status "firing" }}
          告警触发
          {{ else }}
          告警恢复
          {{ end }}
          集群: {{ .GroupLabels.cluster }}
          服务: {{ .GroupLabels.service }}
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          实例: {{ .Labels.instance }}
          {{ end }}
        # Message type; upstream docs list 'text' and 'markdown' as supported
        message_type: 'text'
3.5 抑制规则(Inhibit Rules)
基础抑制规则
inhibit_rules:
  # A critical alert mutes warnings that share alertname and instance
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal:
      - 'alertname'
      - 'instance'

  # A node that is down mutes the per-node resource alerts on that instance
  - source_match:
      alertname: 'NodeDown'
    target_match_re:
      alertname: '^(NodeCPU|NodeMemory|NodeDisk|NodeNetwork).*'
    equal:
      - 'instance'

  # A service that is down mutes its response/error alerts
  - source_match:
      alertname: 'ServiceDown'
    target_match_re:
      alertname: '^(ServiceResponse|ServiceError).*'
    equal:
      - 'service'
      - 'instance'
高级抑制规则
inhibit_rules:
  # Matcher-list syntax.
  # Matchers are static: Go templates are not expanded in them, so a matcher
  # such as datacenter = "{{ .Labels.datacenter }}" would compare against that
  # literal text. "Same datacenter" is expressed by `equal` below.
  - source_matchers:
      - alertname = "DatacenterDown"
    target_matchers:
      - alertname =~ "(Node|Service|Database).*"
    equal: ['datacenter']

  # Maintenance mode mutes everything on the same instance
  - source_match:
      alertname: 'MaintenanceMode'
    target_match_re:
      alertname: '.*'
    equal: ['instance']

  # Cluster-level critical alerts mute node-scoped alerts in that cluster
  - source_match:
      severity: 'critical'
      scope: 'cluster'
    target_match:
      scope: 'node'
    equal: ['cluster']

  # Scheduled maintenance mutes every other alert in the same window.
  # Alertmanager uses Go's RE2 regex engine, which has no lookahead, so
  # '^((?!ScheduledMaintenance).)*$' fails to compile; the != matcher
  # expresses "anything except ScheduledMaintenance" directly.
  - source_matchers:
      - alertname = "ScheduledMaintenance"
    target_matchers:
      - alertname != "ScheduledMaintenance"
    equal: ['maintenance_window']
3.6 时间间隔配置
时间间隔定义
# Times are evaluated in UTC unless an interval sets `location` (supported in
# newer releases — verify for your version).
# end_time is exclusive; '24:00' is allowed and means end of day.
time_intervals:
  # Business hours
  - name: 'business-hours'
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '17:00'
        weekdays: ['monday:friday']
        days_of_month: ['1:31']
        months: ['1:12']
        # A `years` range makes the interval stop matching once the range has
        # passed; add it only for intervals that are meant to expire.
        # years: ['2024:2025']

  # Weekends
  - name: 'weekends'
    time_intervals:
      - times:
          - start_time: '00:00'
            end_time: '24:00'
        weekdays: ['saturday', 'sunday']

  # Public holidays
  - name: 'holidays'
    time_intervals:
      # New Year's Day
      - times:
          - start_time: '00:00'
            end_time: '24:00'
        days_of_month: ['1']
        months: ['1']
      # National Day
      - times:
          - start_time: '00:00'
            end_time: '24:00'
        days_of_month: ['1']
        months: ['10']

  # Maintenance windows
  - name: 'maintenance-window'
    time_intervals:
      - times:
          - start_time: '02:00'
            end_time: '04:00'
        weekdays: ['sunday']
      # First day of every month
      - times:
          - start_time: '01:00'
            end_time: '03:00'
        days_of_month: ['1']

  # Night hours: a range cannot wrap past midnight, so it is split in two
  - name: 'night-hours'
    time_intervals:
      - times:
          - start_time: '22:00'
            end_time: '24:00'
      - times:
          - start_time: '00:00'
            end_time: '06:00'
在路由中使用时间间隔
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # Critical alerts during business hours
    - match:
        severity: critical
      receiver: 'oncall-team'
      active_time_intervals:
        - 'business-hours'
      group_wait: 5s
      repeat_interval: 15m
      # Time intervals only decide whether a route NOTIFIES; they do not
      # affect matching. Without `continue`, this route would swallow every
      # critical alert and the off-hours route below would never be reached.
      continue: true

    # Critical alerts outside business hours
    - match:
        severity: critical
      receiver: 'emergency-team'
      active_time_intervals:
        - 'night-hours'
        - 'weekends'
        - 'holidays'
      group_wait: 2s
      repeat_interval: 10m

    # Infrastructure alerts, silenced during the maintenance window
    - match:
        team: infrastructure
      receiver: 'infra-team'
      mute_time_intervals:
        - 'maintenance-window'
3.7 模板配置
模板文件路径
# Files (glob patterns allowed) from which notification templates are loaded
templates:
  - "/etc/alertmanager/templates/*.tmpl"
  - "/etc/alertmanager/templates/custom.tmpl"
  - "/opt/alertmanager/templates/company.tmpl"
自定义模板示例
{{/* /etc/alertmanager/templates/custom.tmpl */}}
{{/*
  Notes for maintainers:
  - Alertmanager templates have no `default` function (that is a Sprig/Helm
    helper); the built-in `or` supplies the fallback value instead.
  - .EndsAt is a time value, so `if .EndsAt` is always true; the end time is
    printed only for alerts whose status is "resolved".
  - Single-line values (subject, title) are defined on one line so they render
    without leading or trailing newlines.
*/}}

{{/* Email subject */}}
{{ define "email.subject" }}[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} ({{ len .Alerts }}){{ end }}

{{/* Email body */}}
{{ define "email.body" }}
{{ if eq .Status "firing" }}
🚨 告警触发
{{ else }}
✅ 告警恢复
{{ end }}
告警组: {{ .GroupLabels.alertname }}
集群: {{ or .GroupLabels.cluster "未知" }}
环境: {{ or .GroupLabels.environment "未知" }}
告警数量: {{ len .Alerts }}
{{ range .Alerts }}
=====================================
告警名称: {{ .Labels.alertname }}
实例: {{ .Labels.instance }}
严重程度: {{ .Labels.severity }}
摘要: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ if eq .Status "resolved" }}结束时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}
标签:
{{ range .Labels.SortedPairs }}- {{ .Name }}: {{ .Value }}
{{ end }}
注解:
{{ range .Annotations.SortedPairs }}- {{ .Name }}: {{ .Value }}
{{ end }}
{{ end }}
查看详情: {{ .ExternalURL }}
{{ end }}

{{/* Slack message */}}
{{ define "slack.title" }}{{ if eq .Status "firing" }}🚨{{ else }}✅{{ end }} {{ .GroupLabels.alertname }}{{ end }}

{{ define "slack.text" }}
{{ if eq .Status "firing" }}
*状态:* 告警触发 ({{ len .Alerts }}个)
{{ else }}
*状态:* 告警恢复
{{ end }}
*集群:* {{ or .GroupLabels.cluster "未知" }}
*环境:* {{ or .GroupLabels.environment "未知" }}
{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*实例:* {{ .Labels.instance }}
*严重程度:* {{ .Labels.severity }}
{{ if .Annotations.runbook_url }}*处理手册:* {{ .Annotations.runbook_url }}{{ end }}
{{ end }}
{{ end }}

{{/* WeChat message */}}
{{ define "wechat.message" }}
{{ if eq .Status "firing" }}
【告警触发】
{{ else }}
【告警恢复】
{{ end }}
告警组: {{ .GroupLabels.alertname }}
集群: {{ or .GroupLabels.cluster "未知" }}
环境: {{ or .GroupLabels.environment "未知" }}
数量: {{ len .Alerts }}个
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
实例: {{ .Labels.instance }}
级别: {{ .Labels.severity }}
时间: {{ .StartsAt.Format "01-02 15:04" }}
{{ end }}
{{ end }}
在接收器中使用模板
receivers:
  - name: 'email-with-template'
    email_configs:
      - to: 'team@example.com'
        # email_configs has no `subject`/`body` keys: the subject is set via
        # the Subject header and the plain-text body via `text`.
        headers:
          Subject: '{{ template "email.subject" . }}'
        text: '{{ template "email.body" . }}'
  - name: 'slack-with-template'
    slack_configs:
      - channel: '#alerts'
        title: '{{ template "slack.title" . }}'
        text: '{{ template "slack.text" . }}'
  - name: 'wechat-with-template'
    wechat_configs:
      - to_user: '@all'
        message: '{{ template "wechat.message" . }}'
3.8 配置验证和测试
配置文件验证
# Validate the configuration file.
# `check-config` is a top-level amtool command; there is no `config check`.
amtool check-config alertmanager.yml

# Show the configuration loaded by a RUNNING Alertmanager.
# `config show` talks to the server and has no --config.file flag.
amtool config show --alertmanager.url=http://localhost:9093

# Print the routing tree defined in a local file
amtool config routes show --config.file=alertmanager.yml

# Show which receiver(s) a given label set would be routed to
amtool config routes test \
  --config.file=alertmanager.yml \
  alertname=TestAlert severity=critical team=web
配置热重载
# Reload by sending SIGHUP to the Alertmanager process
kill -s HUP $(pgrep alertmanager)

# Reload through the HTTP endpoint
curl --request POST http://localhost:9093/-/reload

# Reload inside a Docker container
docker kill --signal=HUP alertmanager

# Reload inside a Kubernetes pod
kubectl exec --namespace monitoring alertmanager-0 -- kill -s HUP 1
配置测试脚本
#!/bin/bash
# test-config.sh
#
# Smoke test for an Alertmanager setup:
#   1. validates the config file with amtool (exits 1 on failure),
#   2. prints the receiver chosen for a few sample label sets,
#   3. checks that the HTTP API answers,
#   4. posts one test alert.
# Requires amtool and curl on PATH.

CONFIG_FILE="alertmanager.yml"
ALERTMANAGER_URL="http://localhost:9093"

echo "=== 配置文件验证 ==="
# `check-config` is a top-level amtool command; `amtool config check` does not exist.
if amtool check-config "$CONFIG_FILE"; then
    echo "✅ 配置文件语法正确"
else
    echo "❌ 配置文件语法错误"
    exit 1
fi

# printf is used for headings: bash's echo prints "\n" literally without -e.
printf '\n=== 路由测试 ===\n'

# Sample label sets, one space-separated string per case
test_cases=(
    "alertname=HighCPUUsage severity=warning team=web"
    "alertname=DatabaseDown severity=critical team=database"
    "alertname=NodeDown severity=critical"
    "alertname=ServiceError severity=warning service=api"
)

for test_case in "${test_cases[@]}"; do
    echo "测试: $test_case"
    # Split into one argument per label explicitly instead of relying on
    # unquoted expansion (which would also glob-expand the values).
    read -r -a label_args <<< "$test_case"
    result=$(amtool config routes test --config.file="$CONFIG_FILE" "${label_args[@]}")
    echo "结果: $result"
    echo "---"
done

printf '\n=== API 连接测试 ===\n'
# API v1 has been removed from current Alertmanager releases; use v2.
# -f makes curl fail on HTTP error codes instead of returning 0.
if curl -sf "$ALERTMANAGER_URL/api/v2/status" > /dev/null; then
    echo "✅ Alertmanager API 可访问"
else
    echo "❌ Alertmanager API 不可访问"
fi

printf '\n=== 发送测试告警 ===\n'
# RFC 3339 UTC timestamp; avoids the GNU-only %3N (milliseconds) format.
starts_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
test_alert='[
  {
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning",
      "instance": "test-server:9100",
      "team": "test"
    },
    "annotations": {
      "summary": "This is a test alert",
      "description": "This alert is for configuration testing"
    },
    "startsAt": "'"$starts_at"'"
  }
]'

if curl -sf -XPOST "$ALERTMANAGER_URL/api/v2/alerts" \
    -H "Content-Type: application/json" \
    -d "$test_alert" > /dev/null 2>&1; then
    echo "✅ 测试告警发送成功"
else
    echo "❌ 测试告警发送失败"
fi

printf '\n=== 配置测试完成 ===\n'
3.9 常见配置模式
多环境配置
# Production environment configuration
global:
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'alerts-prod@company.com'
  # `external_labels` is a Prometheus setting, not an Alertmanager one; having
  # it here makes Alertmanager reject the file. Define the labels in the
  # production prometheus.yml (global -> external_labels) so every alert
  # carries them:
  #   external_labels:
  #     environment: 'production'
  #     cluster: 'prod-cluster'

route:
  group_by: ['alertname', 'cluster']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 30m
  receiver: 'prod-default'
  routes:
    - match:
        severity: critical
      receiver: 'prod-critical'
      group_wait: 2s
      repeat_interval: 15m

receivers:
  - name: 'prod-default'
    email_configs:
      - to: 'prod-alerts@company.com'
    slack_configs:
      - channel: '#prod-alerts'
    pagerduty_configs:
      - routing_key: 'prod-pagerduty-key'
  - name: 'prod-critical'
    email_configs:
      - to: 'prod-oncall@company.com'
    slack_configs:
      - channel: '#prod-critical'
    pagerduty_configs:
      - routing_key: 'prod-critical-key'
团队分离配置
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # Infrastructure team
    - match:
        team: infrastructure
      receiver: 'infra-team'
      group_by: ['alertname', 'instance']
      routes:
        - match:
            severity: critical
          receiver: 'infra-oncall'

    # Database team
    - match:
        team: database
      receiver: 'dba-team'
      group_by: ['alertname', 'database']
      routes:
        - match:
            severity: critical
          receiver: 'dba-oncall'

    # Application teams
    - match_re:
        team: '^(web|api|mobile)$'
      receiver: 'app-team'
      group_by: ['alertname', 'service']
      # Keep evaluating later routes (e.g. security) after this one matches
      continue: true

    # Security team
    - match:
        category: security
      receiver: 'security-team'
      group_wait: 0s
      repeat_interval: 5m

# Every receiver named in the routing tree must be defined here; a single
# undefined name makes Alertmanager reject the whole file.
receivers:
  # Fallback receiver of the root route (placeholder address)
  - name: 'default'
    email_configs:
      - to: 'alerts@company.com'
  - name: 'infra-team'
    email_configs:
      - to: 'infra@company.com'
    slack_configs:
      - channel: '#infra-alerts'
  - name: 'infra-oncall'
    pagerduty_configs:
      - routing_key: 'infra-oncall-key'
    slack_configs:
      - channel: '#infra-critical'
  - name: 'dba-team'
    email_configs:
      - to: 'dba@company.com'
    slack_configs:
      - channel: '#db-alerts'
  # On-call receiver for critical database alerts (placeholder values)
  - name: 'dba-oncall'
    pagerduty_configs:
      - routing_key: 'dba-oncall-key'
    slack_configs:
      - channel: '#db-critical'
  # Application teams (placeholder values)
  - name: 'app-team'
    email_configs:
      - to: 'app@company.com'
    slack_configs:
      - channel: '#app-alerts'
  - name: 'security-team'
    email_configs:
      - to: 'security@company.com'
    slack_configs:
      - channel: '#security-alerts'
    webhook_configs:
      - url: 'http://siem.company.com/alerts'
本章小结
本章详细介绍了 Alertmanager 配置文件的各个组成部分:
核心配置组件
- 全局配置:SMTP、HTTP、第三方服务的全局设置
- 路由配置:告警分发逻辑和匹配规则
- 接收器配置:各种通知渠道的详细配置
- 抑制规则:告警抑制逻辑和条件
- 时间间隔:时间窗口和静默时间配置
- 模板系统:自定义消息格式和内容
配置最佳实践
- 结构化设计:合理组织配置文件结构
- 模板化管理:使用模板提高配置复用性
- 环境分离:不同环境使用不同配置
- 团队分离:按团队职责分配告警
- 验证测试:定期验证和测试配置
- 版本控制:使用 Git 管理配置变更
下一步学习
在下一章中,我们将深入学习告警路由与分组的高级用法,包括:
- 复杂路由规则设计
- 动态路由配置
- 告警分组策略
- 路由性能优化
下一章: 告警路由与分组