3.1 配置文件结构概述

基本结构

Alertmanager 的配置文件采用 YAML 格式,主要包含以下几个部分:

# Top-level layout of alertmanager.yml. Every key below is only a placeholder
# here; the later sections of this chapter fill each one in.
global:
  # Global settings

templates:
  # Template file paths

route:
  # Routing configuration

receivers:
  # Receiver configuration

inhibit_rules:
  # Inhibition rules

time_intervals:
  # Time interval definitions

mute_time_intervals:
  # Mute time intervals
  # NOTE(review): upstream documentation appears to mark this top-level key as
  # deprecated in favour of time_intervals - confirm for your version.

配置文件示例

# Complete configuration example
global:
  # SMTP settings
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: 'app-password'
  smtp_require_tls: true

  # Global resolve timeout
  resolve_timeout: 5m

  # HTTP client settings
  http_config:
    proxy_url: 'http://proxy.example.com:8080'

  # Slack global settings
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

  # PagerDuty global settings
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

# Template files
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

  routes:
  - match:
      severity: critical
    receiver: 'critical-team'
    group_wait: 5s
    repeat_interval: 30m

  - match:
      team: database
    receiver: 'dba-team'
    group_by: ['alertname', 'instance']

  - match_re:
      service: '^(web|api).*'
    receiver: 'web-team'
    continue: true

# Receivers
receivers:
- name: 'default'
  email_configs:
  - to: 'admin@example.com'
    # email_configs has no `subject` or `body` keys (unknown keys make the
    # config fail to load): the subject is a header, the plain-text body is
    # `text`.
    headers:
      Subject: '[ALERT] {{ .GroupLabels.alertname }}'
    text: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
      {{ end }}

- name: 'critical-team'
  email_configs:
  - to: 'critical@example.com'
    headers:
      Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
  slack_configs:
  - channel: '#critical-alerts'
    title: 'Critical Alert'
    text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
  pagerduty_configs:
  - routing_key: 'YOUR_PAGERDUTY_INTEGRATION_KEY'
    description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'dba-team'
  email_configs:
  - to: 'dba@example.com'
    headers:
      Subject: '[DB] {{ .GroupLabels.alertname }}'
  webhook_configs:
  - url: 'http://dba-webhook.example.com/alerts'
    send_resolved: true

- name: 'web-team'
  slack_configs:
  - channel: '#web-alerts'
    username: 'alertmanager'
    icon_emoji: ':warning:'
    title: 'Web Service Alert'
    text: |
      {{ range .Alerts }}
      *Alert:* {{ .Annotations.summary }}
      *Service:* {{ .Labels.service }}
      *Instance:* {{ .Labels.instance }}
      *Severity:* {{ .Labels.severity }}
      {{ end }}

# Inhibition rules
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'instance']

- source_match:
    alertname: 'NodeDown'
  target_match_re:
    alertname: '^(NodeCPU|NodeMemory|NodeDisk).*'
  equal: ['instance']

# Time interval definitions
time_intervals:
- name: 'business-hours'
  time_intervals:
  - times:
    - start_time: '09:00'
      end_time: '17:00'
    weekdays: ['monday:friday']

- name: 'weekends'
  time_intervals:
  - times:
    - start_time: '00:00'
      # end_time is exclusive; 24:00 covers the whole day, whereas 23:59
      # would leave the last minute outside the interval.
      end_time: '24:00'
    weekdays: ['saturday', 'sunday']

# Mute time intervals
mute_time_intervals:
- name: 'maintenance-window'
  time_intervals:
  - times:
    - start_time: '02:00'
      end_time: '04:00'
    weekdays: ['sunday']

3.2 全局配置(Global)

SMTP 配置

global:
  # --- SMTP server settings ---
  # SMTP server address and port
  smtp_smarthost: 'smtp.gmail.com:587'
  # Sender address
  smtp_from: 'alerts@example.com'
  # SMTP authentication user name
  smtp_auth_username: 'alerts@example.com'
  # SMTP authentication password
  smtp_auth_password: 'app-password'
  # SMTP authentication secret (the original note says: use either this or
  # the password).
  # NOTE(review): upstream docs describe this as the CRAM-MD5 secret - confirm.
  smtp_auth_secret: 'smtp-secret'
  # SMTP authentication identity
  smtp_auth_identity: 'alerts@example.com'
  # Whether TLS is required
  smtp_require_tls: true
  # Host name announced to the SMTP server in the greeting
  smtp_hello: 'alertmanager.example.com'

HTTP 配置

global:
  # HTTP client settings
  http_config:
    # Proxy server
    proxy_url: 'http://proxy.example.com:8080'
    tls_config:
      # CA certificate file
      ca_file: '/etc/ssl/certs/ca.pem'
      # Client certificate
      cert_file: '/etc/ssl/certs/client.pem'
      # Client private key
      key_file: '/etc/ssl/private/client.key'
      # Server name used for certificate verification
      server_name: 'alertmanager.example.com'
      # Whether to skip certificate verification
      insecure_skip_verify: false

    # Authentication: configure exactly ONE mechanism. Enabling basic_auth,
    # a bearer token and oauth2 together is rejected when the configuration
    # is loaded, so the alternatives are shown commented out.
    basic_auth:
      username: 'user'
      password: 'pass'

    # Alternative: bearer token (inline OR from a file, never both)
    # bearer_token: 'bearer-token'
    # bearer_token_file: '/etc/alertmanager/token'

    # Alternative: OAuth 2.0 client credentials
    # oauth2:
    #   client_id: 'client-id'
    #   client_secret: 'client-secret'
    #   token_url: 'https://oauth.example.com/token'
    #   scopes: ['scope1', 'scope2']

第三方服务配置

global:
  # Slack global settings.
  # slack_api_url and slack_api_url_file are mutually exclusive; setting both
  # is rejected when the configuration is loaded.
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  # slack_api_url_file: '/etc/alertmanager/slack_url'

  # PagerDuty global settings
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

  # OpsGenie global settings.
  # opsgenie_api_key and opsgenie_api_key_file are mutually exclusive as well.
  opsgenie_api_url: 'https://api.opsgenie.com/'
  opsgenie_api_key: 'your-opsgenie-api-key'
  # opsgenie_api_key_file: '/etc/alertmanager/opsgenie_key'

  # VictorOps global settings
  victorops_api_url: 'https://alert.victorops.com/integrations/generic/20131114/alert/'
  victorops_api_key: 'your-victorops-api-key'

  # WeChat global settings
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_secret: 'your-wechat-secret'
  wechat_api_corp_id: 'your-corp-id'

其他全局配置

global:
  # Resolve timeout
  resolve_timeout: 5m

  # `timezone` and `external_labels` are NOT Alertmanager global options.
  # Unknown keys make the configuration fail to load, so they are omitted:
  #   - A time zone is set per time interval through its `location` field
  #     (for example location: 'Asia/Shanghai').
  #   - `external_labels` belongs to the Prometheus server's own global
  #     section; labels such as cluster / region / environment have to be
  #     attached to alerts on the Prometheus side.

3.3 路由配置(Route)

基础路由配置

route:
  # Labels used to group alerts
  group_by: ['alertname', 'cluster', 'service']

  # Timing
  # How long to wait for more alerts of a new group before the first send
  group_wait: 10s
  # How long to wait before sending an update for an existing group
  group_interval: 10s
  # How long to wait before re-sending an unchanged notification
  repeat_interval: 1h

  # Default receiver
  receiver: 'default'

  # Child routes
  routes:
  - match:
      severity: critical
    receiver: 'critical-team'
    group_wait: 5s
    repeat_interval: 30m
    # Whether to keep evaluating the following sibling routes after this one
    # matches. `continue` only has meaning on child routes; the root route
    # must not enable it, so the option is shown here instead.
    continue: false

高级路由配置

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

  routes:
  # Critical alerts, fanned out to nested per-team routes
  - match:
      severity: critical
    receiver: 'critical-team'
    group_wait: 5s
    group_interval: 5s
    repeat_interval: 15m
    routes:
    - match:
        team: database
      receiver: 'dba-critical'
      group_by: ['alertname', 'instance']
    - match:
        team: infrastructure
      receiver: 'infra-critical'

  # Regular-expression match
  - match_re:
      service: '^(web|api|frontend).*'
    receiver: 'web-team'
    group_by: ['alertname', 'service']
    # Keep evaluating the sibling routes below after this one matches
    continue: true

  # Several conditions combined (all of them must hold)
  - matchers:
    - alertname = "HighCPUUsage"
    - severity =~ "warning|critical"
    - instance !~ "test.*"
    receiver: 'cpu-alerts'

  # Time-window routing
  - match:
      team: operations
    receiver: 'ops-team'
    active_time_intervals:
    - 'business-hours'
    mute_time_intervals:
    - 'maintenance-window'

  # Per-team routing.
  # Matcher values and receiver names are static strings: they are not
  # rendered as templates, so '{{ .GroupLabels.team }}-alerts' would be looked
  # up literally and fail as an undefined receiver. Add one explicit route
  # (and one receiver) per team instead; `payments` is an example team name.
  - match:
      team: payments
    receiver: 'payments-alerts'

路由匹配器详解

# Exact (equality) matching on label values
match:
  severity: critical
  alertname: HighCPUUsage

# Regular-expression matching on label values
match_re:
  instance: '^prod-.*'
  service: '(web|api)'

# New-style matchers (recommended)
matchers:
# equal
- alertname = "HighCPUUsage"
# regex match
- severity =~ "warning|critical"
# regex does not match
- instance !~ "test.*"
# not equal
- team != "development"

3.4 接收器配置(Receivers)

邮件接收器

receivers:
- name: 'email-team'
  email_configs:
  - to: 'team@example.com'
    # Overrides the global smtp_from
    from: 'alerts@example.com'

    # Plain-text body. email_configs has no `subject` or `body` keys (unknown
    # keys make the config fail to load): the body goes in `text`, the
    # subject in `headers` below.
    text: |
      {{ if eq .Status "firing" }}
      🚨 告警触发
      {{ else }}
      ✅ 告警恢复
      {{ end }}

      告警组: {{ .GroupLabels.alertname }}
      集群: {{ .GroupLabels.cluster }}

      {{ range .Alerts }}
      告警: {{ .Annotations.summary }}
      描述: {{ .Annotations.description }}
      实例: {{ .Labels.instance }}
      严重程度: {{ .Labels.severity }}
      开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
      {{ if eq .Status "resolved" }}结束时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}
      {{ end }}

    # HTML body
    html: |
      <h2>{{ if eq .Status "firing" }}🚨 告警触发{{ else }}✅ 告警恢复{{ end }}</h2>
      <table border="1">
        <tr><th>告警名称</th><th>实例</th><th>严重程度</th><th>状态</th></tr>
        {{ range .Alerts }}
        <tr>
          <td>{{ .Labels.alertname }}</td>
          <td>{{ .Labels.instance }}</td>
          <td>{{ .Labels.severity }}</td>
          <td>{{ .Status }}</td>
        </tr>
        {{ end }}
      </table>

    # Mail headers, including the subject line
    headers:
      Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
      # High priority
      X-Priority: '1'
      X-Mailer: 'Alertmanager'

    # Per-receiver SMTP overrides
    smarthost: 'smtp.company.com:587'
    auth_username: 'alerts@company.com'
    auth_password: 'company-password'
    require_tls: true

# Note on the end-time line above: .EndsAt is a time value, and a template
# `if` on it is always true, so the per-alert status is tested instead.

Slack 接收器

receivers:
- name: 'slack-team'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#alerts'
    username: 'Alertmanager'
    icon_emoji: ':warning:'
    icon_url: 'https://example.com/alertmanager-icon.png'

    title: '{{ if eq .Status "firing" }}🚨{{ else }}✅{{ end }} {{ .GroupLabels.alertname }}'
    title_link: 'http://alertmanager.example.com'

    text: |
      {{ if eq .Status "firing" }}
      *状态:* 告警触发
      {{ else }}
      *状态:* 告警恢复
      {{ end }}
      *集群:* {{ .GroupLabels.cluster }}
      *服务:* {{ .GroupLabels.service }}

      {{ range .Alerts }}
      *告警:* {{ .Annotations.summary }}
      *描述:* {{ .Annotations.description }}
      *实例:* {{ .Labels.instance }}
      *严重程度:* {{ .Labels.severity }}
      {{ end }}

    # Attachment colour.
    # The `{{-` / `-}}` markers trim all surrounding whitespace so the value
    # renders as exactly `danger`, `warning` or `good`; without them the
    # rendered string would contain indentation and newlines.
    # .CommonLabels is used because .GroupLabels only carries `severity` when
    # the route groups by it.
    color: |-
      {{- if eq .Status "firing" -}}
        {{- if eq .CommonLabels.severity "critical" -}}danger
        {{- else if eq .CommonLabels.severity "warning" -}}warning
        {{- else -}}good
        {{- end -}}
      {{- else -}}good
      {{- end -}}

    # Attachment fields
    fields:
    - title: '告警数量'
      value: '{{ len .Alerts }}'
      short: true
    - title: '集群'
      value: '{{ .GroupLabels.cluster }}'
      short: true

    # Action buttons
    actions:
    - type: 'button'
      text: '查看详情'
      url: 'http://prometheus.example.com/alerts'
    - type: 'button'
      text: '静默告警'
      url: 'http://alertmanager.example.com/#/silences/new'

    # Also notify when alerts are resolved
    send_resolved: true

    # HTTP client settings
    http_config:
      proxy_url: 'http://proxy.example.com:8080'

Webhook 接收器

receivers:
- name: 'webhook-team'
  webhook_configs:
  - url: 'http://webhook.example.com/alerts'
    send_resolved: true

    # HTTP client settings (authentication and TLS; the request method is not
    # configurable here).
    http_config:
      # Configure only ONE authentication mechanism: basic_auth together with
      # a bearer token is rejected when the configuration is loaded.
      basic_auth:
        username: 'webhook-user'
        password: 'webhook-pass'
      # Alternative to basic_auth:
      # bearer_token: 'webhook-token'
      tls_config:
        # Disables certificate verification - avoid outside of testing.
        insecure_skip_verify: true

    # `http_headers` is not a webhook_configs key, so it is omitted here; an
    # unknown key makes the configuration fail to load.
    # NOTE(review): newer releases appear to accept custom headers under
    # http_config.http_headers - confirm against the version you run.

    # Maximum number of alerts per webhook message
    max_alerts: 10

PagerDuty 接收器

receivers:
- name: 'pagerduty-team'
  pagerduty_configs:
  - routing_key: 'YOUR_PAGERDUTY_INTEGRATION_KEY'
    description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

    # Event details
    details:
      cluster: '{{ .GroupLabels.cluster }}'
      service: '{{ .GroupLabels.service }}'
      alert_count: '{{ len .Alerts }}'

    # Severity mapping.
    # The `{{-` / `-}}` markers trim surrounding whitespace so the rendered
    # value is exactly `critical`, `warning` or `info`. .CommonLabels is used
    # because .GroupLabels only carries `severity` when the route groups by it.
    severity: |-
      {{- if eq .CommonLabels.severity "critical" -}}critical
      {{- else if eq .CommonLabels.severity "warning" -}}warning
      {{- else -}}info
      {{- end -}}

    # Client information
    client: 'Alertmanager'
    client_url: 'http://alertmanager.example.com'

    # Custom links
    links:
    - href: 'http://prometheus.example.com/alerts'
      text: 'Prometheus Alerts'
    - href: 'http://grafana.example.com/dashboard'
      text: 'Grafana Dashboard'

    # Images
    images:
    - src: 'http://example.com/chart.png'
      alt: 'Alert Chart'

微信接收器

receivers:
- name: 'wechat-team'
  wechat_configs:
  - corp_id: 'your-corp-id'
    api_secret: 'your-api-secret'
    # '@all', or specific users such as 'user1|user2'
    to_user: '@all'
    # Department ID
    to_party: '1'
    # Tag
    to_tag: 'tag1'
    agent_id: '1000001'

    # wechat_configs has no `title` key (an unknown key makes the config fail
    # to load), so the alert name is the first line of the message instead.
    message: |
      {{ .GroupLabels.alertname }}
      {{ if eq .Status "firing" }}
      告警触发
      {{ else }}
      告警恢复
      {{ end }}

      集群: {{ .GroupLabels.cluster }}
      服务: {{ .GroupLabels.service }}

      {{ range .Alerts }}
      告警: {{ .Annotations.summary }}
      描述: {{ .Annotations.description }}
      实例: {{ .Labels.instance }}
      {{ end }}

    # Message type: 'text' or 'markdown'
    message_type: 'text'

3.5 抑制规则(Inhibit Rules)

基础抑制规则

inhibit_rules:
# A critical alert silences the matching warning alert
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal:
  - 'alertname'
  - 'instance'

# A node-down alert silences the per-node resource alerts
- source_match:
    alertname: 'NodeDown'
  target_match_re:
    alertname: '^(NodeCPU|NodeMemory|NodeDisk|NodeNetwork).*'
  equal:
  - 'instance'

# A service-down alert silences the per-service alerts
- source_match:
    alertname: 'ServiceDown'
  target_match_re:
    alertname: '^(ServiceResponse|ServiceError).*'
  equal:
  - 'service'
  - 'instance'

高级抑制规则

inhibit_rules:
# New-style matchers.
# Matcher values are literal strings, not templates; requiring source and
# target to share the same datacenter is what `equal` expresses.
- source_matchers:
  - alertname = "DatacenterDown"
  target_matchers:
  - alertname =~ "(Node|Service|Database).*"
  equal: ['datacenter']

# Maintenance mode
- source_match:
    alertname: 'MaintenanceMode'
  target_match_re:
    alertname: '.*'
  equal: ['instance']

# Cluster-level inhibition
- source_match:
    severity: 'critical'
    scope: 'cluster'
  target_match:
    scope: 'node'
  equal: ['cluster']

# Scheduled maintenance silences everything except the maintenance alert
# itself. Alertmanager regexes use Go RE2 syntax, which has no lookahead
# ((?!...)), so the exclusion is written as a negative matcher.
- source_matchers:
  - alertname = "ScheduledMaintenance"
  target_matchers:
  - alertname != "ScheduledMaintenance"
  equal: ['maintenance_window']

3.6 时间间隔配置

时间间隔定义

# end_time is exclusive, so a range that should run to the end of the day ends
# at '24:00'; '23:59' would leave the last minute outside the interval.
# NOTE(review): times are presumably evaluated in UTC unless an interval sets
# `location` (e.g. location: 'Asia/Shanghai') - confirm for your version.
time_intervals:
# Business hours
- name: 'business-hours'
  time_intervals:
  - times:
    - start_time: '09:00'
      end_time: '17:00'
    weekdays: ['monday:friday']
    days_of_month: ['1:31']
    months: ['1:12']
    # This interval stops matching once the listed years have passed.
    years: ['2024:2025']

# Weekends
- name: 'weekends'
  time_intervals:
  - times:
    - start_time: '00:00'
      end_time: '24:00'
    weekdays: ['saturday', 'sunday']

# Public holidays
- name: 'holidays'
  time_intervals:
  # New Year's Day (1 January)
  - times:
    - start_time: '00:00'
      end_time: '24:00'
    days_of_month: ['1']
    months: ['1']
  # National Day (1 October)
  - times:
    - start_time: '00:00'
      end_time: '24:00'
    days_of_month: ['1']
    months: ['10']

# Maintenance windows
- name: 'maintenance-window'
  time_intervals:
  - times:
    - start_time: '02:00'
      end_time: '04:00'
    weekdays: ['sunday']
  # First day of every month
  - times:
    - start_time: '01:00'
      end_time: '03:00'
    days_of_month: ['1']

# Night hours (a range cannot cross midnight, hence two entries)
- name: 'night-hours'
  time_intervals:
  - times:
    - start_time: '22:00'
      end_time: '24:00'
  - times:
    - start_time: '00:00'
      end_time: '06:00'

在路由中使用时间间隔

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

  routes:
  # Critical alerts during business hours
  - match:
      severity: critical
    receiver: 'oncall-team'
    active_time_intervals:
    - 'business-hours'
    group_wait: 5s
    repeat_interval: 15m
    # Required: a route still matches while it is outside its active time
    # intervals (it just sends nothing) and, without `continue`, matching
    # would stop here, so the off-hours route below would never be reached.
    continue: true

  # Critical alerts outside business hours
  - match:
      severity: critical
    receiver: 'emergency-team'
    active_time_intervals:
    - 'night-hours'
    - 'weekends'
    - 'holidays'
    group_wait: 2s
    repeat_interval: 10m

  # Muted during the maintenance window
  - match:
      team: infrastructure
    receiver: 'infra-team'
    mute_time_intervals:
    - 'maintenance-window'

3.7 模板配置

模板文件路径

# Files that notification templates are loaded from. Each entry is a path;
# the first one uses a `*` wildcard.
templates:
  - '/etc/alertmanager/templates/*.tmpl'
  # Already covered by the wildcard entry above; listed to show an explicit
  # single-file path.
  - '/etc/alertmanager/templates/custom.tmpl'
  - '/opt/alertmanager/templates/company.tmpl'

自定义模板示例

{{/* /etc/alertmanager/templates/custom.tmpl */}}

{{/*
  Fallback values use the built-in `or`, which yields its first non-empty
  argument. Alertmanager does not register a `default` function, so using it
  makes the template fail to parse.
*/}}

{{/* Email subject. Trim markers keep the result on a single line. */}}
{{ define "email.subject" -}}
[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} ({{ len .Alerts }})
{{- end }}

{{/* Email body */}}
{{ define "email.body" }}
{{ if eq .Status "firing" }}
🚨 告警触发
{{ else }}
✅ 告警恢复
{{ end }}

告警组: {{ .GroupLabels.alertname }}
集群: {{ or .GroupLabels.cluster "未知" }}
环境: {{ or .GroupLabels.environment "未知" }}
告警数量: {{ len .Alerts }}

{{ range .Alerts }}
=====================================
告警名称: {{ .Labels.alertname }}
实例: {{ .Labels.instance }}
严重程度: {{ .Labels.severity }}
摘要: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{/* .EndsAt is a time value and is always truthy in `if`; test the status. */}}
{{ if eq .Status "resolved" }}结束时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}

标签:
{{ range .Labels.SortedPairs }}- {{ .Name }}: {{ .Value }}
{{ end }}

注解:
{{ range .Annotations.SortedPairs }}- {{ .Name }}: {{ .Value }}
{{ end }}
{{ end }}

查看详情: {{ .ExternalURL }}
{{ end }}

{{/* Slack title. Trim markers keep the result on a single line. */}}
{{ define "slack.title" -}}
{{ if eq .Status "firing" }}🚨{{ else }}✅{{ end }} {{ .GroupLabels.alertname }}
{{- end }}

{{/* Slack message text */}}
{{ define "slack.text" }}
{{ if eq .Status "firing" }}
*状态:* 告警触发 ({{ len .Alerts }}个)
{{ else }}
*状态:* 告警恢复
{{ end }}
*集群:* {{ or .GroupLabels.cluster "未知" }}
*环境:* {{ or .GroupLabels.environment "未知" }}

{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*实例:* {{ .Labels.instance }}
*严重程度:* {{ .Labels.severity }}
{{ if .Annotations.runbook_url }}*处理手册:* {{ .Annotations.runbook_url }}{{ end }}
{{ end }}
{{ end }}

{{/* WeChat message */}}
{{ define "wechat.message" }}
{{ if eq .Status "firing" }}
【告警触发】
{{ else }}
【告警恢复】
{{ end }}

告警组: {{ .GroupLabels.alertname }}
集群: {{ or .GroupLabels.cluster "未知" }}
环境: {{ or .GroupLabels.environment "未知" }}
数量: {{ len .Alerts }}个

{{ range .Alerts }}
告警: {{ .Annotations.summary }}
实例: {{ .Labels.instance }}
级别: {{ .Labels.severity }}
时间: {{ .StartsAt.Format "01-02 15:04" }}
{{ end }}
{{ end }}

在接收器中使用模板

receivers:
- name: 'email-with-template'
  email_configs:
  - to: 'team@example.com'
    # email_configs has no `subject` or `body` keys (unknown keys make the
    # config fail to load): the subject is a header, the plain-text body is
    # `text`.
    headers:
      Subject: '{{ template "email.subject" . }}'
    text: '{{ template "email.body" . }}'

- name: 'slack-with-template'
  slack_configs:
  - channel: '#alerts'
    title: '{{ template "slack.title" . }}'
    text: '{{ template "slack.text" . }}'

- name: 'wechat-with-template'
  wechat_configs:
  - to_user: '@all'
    message: '{{ template "wechat.message" . }}'

3.8 配置验证和测试

配置文件验证

# Validate the syntax of a configuration file
amtool config check alertmanager.yml

# Show the configuration loaded by a running Alertmanager.
# `config show` queries the server; it does not read a local file, so it
# takes --alertmanager.url rather than --config.file.
amtool config show --alertmanager.url=http://localhost:9093

# Show the routing tree of a local configuration file
amtool config routes show --config.file=alertmanager.yml

# Test which receiver a given label set is routed to
amtool config routes test \
  --config.file=alertmanager.yml \
  alertname=TestAlert severity=critical team=web

配置热重载

# Reload by sending SIGHUP to the process
kill -s HUP $(pgrep alertmanager)

# Or reload through the HTTP endpoint
curl --request POST http://localhost:9093/-/reload

# Reload inside a Docker container
docker kill --signal=HUP alertmanager

# Reload inside a Kubernetes pod
kubectl exec --namespace monitoring alertmanager-0 -- kill -s HUP 1

配置测试脚本

#!/bin/bash
# test-config.sh
#
# Smoke test for an Alertmanager setup:
#   1. validates CONFIG_FILE with amtool (exits 1 on failure),
#   2. prints the receiver chosen for a few sample label sets,
#   3. checks that the HTTP API at ALERTMANAGER_URL answers,
#   4. posts one test alert.

CONFIG_FILE="alertmanager.yml"
ALERTMANAGER_URL="http://localhost:9093"

echo "=== 配置文件验证 ==="
if amtool config check "$CONFIG_FILE"; then
    echo "✅ 配置文件语法正确"
else
    echo "❌ 配置文件语法错误"
    exit 1
fi

# printf is used for headings with a leading blank line: bash's echo prints
# "\n" literally unless -e is given.
printf '\n=== 路由测试 ===\n'
# Sample label sets covering different alert types
test_cases=(
    "alertname=HighCPUUsage severity=warning team=web"
    "alertname=DatabaseDown severity=critical team=database"
    "alertname=NodeDown severity=critical"
    "alertname=ServiceError severity=warning service=api"
)

for test_case in "${test_cases[@]}"; do
    echo "测试: $test_case"
    # Split the case into separate label=value arguments for amtool.
    read -r -a labels <<< "$test_case"
    result=$(amtool config routes test --config.file="$CONFIG_FILE" "${labels[@]}")
    echo "结果: $result"
    echo "---"
done

printf '\n=== API 连接测试 ===\n'
# -f makes curl fail on HTTP error statuses; the v2 API is used because the
# v1 endpoints have been removed from current Alertmanager releases.
if curl -sf "$ALERTMANAGER_URL/api/v2/status" > /dev/null; then
    echo "✅ Alertmanager API 可访问"
else
    echo "❌ Alertmanager API 不可访问"
fi

printf '\n=== 发送测试告警 ===\n'
# Second-resolution UTC timestamp; the %N (nanoseconds) format is a GNU
# extension and is avoided for portability.
starts_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
test_alert='[
  {
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning",
      "instance": "test-server:9100",
      "team": "test"
    },
    "annotations": {
      "summary": "This is a test alert",
      "description": "This alert is for configuration testing"
    },
    "startsAt": "'"$starts_at"'"
  }
]'

if curl -sf -XPOST "$ALERTMANAGER_URL/api/v2/alerts" \
   -H "Content-Type: application/json" \
   -d "$test_alert" > /dev/null 2>&1; then
    echo "✅ 测试告警发送成功"
else
    echo "❌ 测试告警发送失败"
fi

printf '\n=== 配置测试完成 ===\n'

3.9 常见配置模式

多环境配置

# Production environment configuration
global:
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'alerts-prod@company.com'
  # The slack_configs below set no api_url of their own, so a global Slack
  # URL is needed for this file to load (placeholder value).
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  # `external_labels` is a Prometheus setting, not an Alertmanager global
  # option, and an unknown key makes the config fail to load. Attach
  # environment / cluster labels (here: production / prod-cluster) to the
  # alerts on the Prometheus side instead.

route:
  group_by: ['alertname', 'cluster']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 30m
  receiver: 'prod-default'

  routes:
  - match:
      severity: critical
    receiver: 'prod-critical'
    group_wait: 2s
    repeat_interval: 15m

receivers:
- name: 'prod-default'
  email_configs:
  - to: 'prod-alerts@company.com'
  slack_configs:
  - channel: '#prod-alerts'
  pagerduty_configs:
  - routing_key: 'prod-pagerduty-key'

- name: 'prod-critical'
  email_configs:
  - to: 'prod-oncall@company.com'
  slack_configs:
  - channel: '#prod-critical'
  pagerduty_configs:
  - routing_key: 'prod-critical-key'

团队分离配置

# NOTE(review): this snippet has no `global` section; it assumes SMTP and
# Slack settings (smtp_*, slack_api_url) are defined elsewhere - confirm.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

  routes:
  # Infrastructure team
  - match:
      team: infrastructure
    receiver: 'infra-team'
    group_by: ['alertname', 'instance']
    routes:
    - match:
        severity: critical
      receiver: 'infra-oncall'

  # Database team
  - match:
      team: database
    receiver: 'dba-team'
    group_by: ['alertname', 'database']
    routes:
    - match:
        severity: critical
      receiver: 'dba-oncall'

  # Application teams
  - match_re:
      team: '^(web|api|mobile)$'
    receiver: 'app-team'
    group_by: ['alertname', 'service']
    continue: true

  # Security team
  - match:
      category: security
    receiver: 'security-team'
    group_wait: 0s
    repeat_interval: 5m

# Every receiver named in the routing tree must be defined here; a route that
# points at an undefined receiver makes the configuration fail to load.
# The 'default', 'dba-oncall' and 'app-team' entries use placeholder targets.
receivers:
- name: 'default'
  email_configs:
  - to: 'alerts@company.com'

- name: 'infra-team'
  email_configs:
  - to: 'infra@company.com'
  slack_configs:
  - channel: '#infra-alerts'

- name: 'infra-oncall'
  pagerduty_configs:
  - routing_key: 'infra-oncall-key'
  slack_configs:
  - channel: '#infra-critical'

- name: 'dba-team'
  email_configs:
  - to: 'dba@company.com'
  slack_configs:
  - channel: '#db-alerts'

- name: 'dba-oncall'
  pagerduty_configs:
  - routing_key: 'dba-oncall-key'
  slack_configs:
  - channel: '#db-critical'

- name: 'app-team'
  email_configs:
  - to: 'app@company.com'
  slack_configs:
  - channel: '#app-alerts'

- name: 'security-team'
  email_configs:
  - to: 'security@company.com'
  slack_configs:
  - channel: '#security-alerts'
  webhook_configs:
  - url: 'http://siem.company.com/alerts'

本章小结

本章详细介绍了 Alertmanager 配置文件的各个组成部分:

核心配置组件

  1. 全局配置:SMTP、HTTP、第三方服务的全局设置
  2. 路由配置:告警分发逻辑和匹配规则
  3. 接收器配置:各种通知渠道的详细配置
  4. 抑制规则:告警抑制逻辑和条件
  5. 时间间隔:时间窗口和静默时间配置
  6. 模板系统:自定义消息格式和内容

配置最佳实践

  1. 结构化设计:合理组织配置文件结构
  2. 模板化管理:使用模板提高配置复用性
  3. 环境分离:不同环境使用不同配置
  4. 团队分离:按团队职责分配告警
  5. 验证测试:定期验证和测试配置
  6. 版本控制:使用 Git 管理配置变更

下一步学习

在下一章中,我们将深入学习告警路由与分组的高级用法,包括:

- 复杂路由规则设计
- 动态路由配置
- 告警分组策略
- 路由性能优化


下一章: 告警路由与分组