概述
告警是监控系统的核心功能之一,Prometheus通过Alertmanager组件提供了强大的告警管理能力。本章将深入介绍如何配置和管理Prometheus告警系统,包括告警规则编写、Alertmanager配置、通知渠道设置和告警策略管理。
学习目标
通过本章学习,你将掌握:
- 告警规则编写:学会编写有效的告警规则
- Alertmanager配置:掌握告警管理器的配置方法
- 通知渠道设置:配置多种通知方式(邮件、Slack、钉钉等)
- 告警策略管理:实现告警分组、抑制和静默
- 告警最佳实践:避免告警疲劳,提高告警质量
告警规则基础
1. 告警规则结构
from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Optional
import yaml
class AlertSeverity(Enum):
"""告警严重级别"""
CRITICAL = "critical"
WARNING = "warning"
INFO = "info"
class AlertState(Enum):
"""告警状态"""
PENDING = "pending"
FIRING = "firing"
RESOLVED = "resolved"
@dataclass
class AlertRule:
"""告警规则数据结构"""
name: str
expr: str
duration: str
severity: AlertSeverity
summary: str
description: str
labels: Dict[str, str]
annotations: Dict[str, str]
class AlertRuleManager:
"""告警规则管理器"""
def __init__(self):
self.rules = []
self.rule_groups = {}
def create_basic_rules(self) -> str:
"""创建基础告警规则"""
return """
# 基础告警规则组
groups:
- name: basic_alerts
rules:
# 实例宕机告警
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: critical
team: infrastructure
annotations:
summary: "实例 {{ $labels.instance }} 已宕机"
description: "实例 {{ $labels.instance }} 在过去1分钟内无法访问"
runbook_url: "https://wiki.example.com/runbooks/instance-down"
# 高CPU使用率告警
- alert: HighCPUUsage
expr: |
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "实例 {{ $labels.instance }} CPU使用率过高"
description: "实例 {{ $labels.instance }} CPU使用率为 {{ $value }}%,持续5分钟"
runbook_url: "https://wiki.example.com/runbooks/high-cpu"
# 高内存使用率告警
- alert: HighMemoryUsage
expr: |
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "实例 {{ $labels.instance }} 内存使用率过高"
description: "实例 {{ $labels.instance }} 内存使用率为 {{ $value }}%,持续5分钟"
runbook_url: "https://wiki.example.com/runbooks/high-memory"
# 磁盘空间不足告警
- alert: DiskSpaceLow
expr: |
(node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
for: 1m
labels:
severity: critical
team: infrastructure
annotations:
summary: "实例 {{ $labels.instance }} 磁盘空间不足"
description: "实例 {{ $labels.instance }} 挂载点 {{ $labels.mountpoint }} 磁盘使用率为 {{ $value }}%"
runbook_url: "https://wiki.example.com/runbooks/disk-space"
# 磁盘IO等待时间过长
- alert: HighDiskIOWait
expr: |
rate(node_cpu_seconds_total{mode="iowait"}[5m]) * 100 > 20
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "实例 {{ $labels.instance }} 磁盘IO等待时间过长"
description: "实例 {{ $labels.instance }} IO等待时间为 {{ $value }}%,持续5分钟"
runbook_url: "https://wiki.example.com/runbooks/high-iowait"
"""
def create_application_rules(self) -> str:
"""创建应用程序告警规则"""
return """
# 应用程序告警规则组
groups:
- name: application_alerts
rules:
# HTTP错误率过高
- alert: HighHTTPErrorRate
expr: |
sum(rate(http_requests_total{status_code=~"[45].."}[5m])) by (job, instance)
/
sum(rate(http_requests_total[5m])) by (job, instance) * 100 > 5
for: 2m
labels:
severity: warning
team: backend
annotations:
summary: "服务 {{ $labels.job }} 错误率过高"
description: "服务 {{ $labels.job }} 在实例 {{ $labels.instance }} 上的错误率为 {{ $value }}%"
runbook_url: "https://wiki.example.com/runbooks/high-error-rate"
# HTTP响应时间过长
- alert: HighHTTPLatency
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (job, instance, le)
) > 0.5
for: 3m
labels:
severity: warning
team: backend
annotations:
summary: "服务 {{ $labels.job }} 响应时间过长"
description: "服务 {{ $labels.job }} 在实例 {{ $labels.instance }} 上的P95响应时间为 {{ $value }}秒"
runbook_url: "https://wiki.example.com/runbooks/high-latency"
# 服务请求量异常下降
- alert: LowRequestRate
expr: |
sum(rate(http_requests_total[5m])) by (job) < 10
for: 5m
labels:
severity: warning
team: backend
annotations:
summary: "服务 {{ $labels.job }} 请求量异常下降"
description: "服务 {{ $labels.job }} 的请求量为 {{ $value }} req/s,可能存在问题"
runbook_url: "https://wiki.example.com/runbooks/low-request-rate"
# 数据库连接池耗尽
- alert: DatabaseConnectionPoolExhausted
expr: |
mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 90
for: 1m
labels:
severity: critical
team: database
annotations:
summary: "数据库 {{ $labels.instance }} 连接池即将耗尽"
description: "数据库 {{ $labels.instance }} 连接使用率为 {{ $value }}%"
runbook_url: "https://wiki.example.com/runbooks/db-connections"
# 队列积压过多
- alert: HighQueueBacklog
expr: |
queue_size > 1000
for: 2m
labels:
severity: warning
team: backend
annotations:
summary: "队列 {{ $labels.queue_name }} 积压过多"
description: "队列 {{ $labels.queue_name }} 当前积压 {{ $value }} 个任务"
runbook_url: "https://wiki.example.com/runbooks/queue-backlog"
"""
def create_business_rules(self) -> str:
"""创建业务指标告警规则"""
return """
# 业务指标告警规则组
groups:
- name: business_alerts
rules:
# 订单量异常下降
- alert: LowOrderRate
expr: |
sum(rate(orders_total[10m])) < 5
for: 5m
labels:
severity: warning
team: business
annotations:
summary: "订单量异常下降"
description: "过去10分钟订单量为 {{ $value }} 单/分钟,低于正常水平"
runbook_url: "https://wiki.example.com/runbooks/low-orders"
# 支付失败率过高
- alert: HighPaymentFailureRate
expr: |
sum(rate(payment_failures_total[5m]))
/
sum(rate(payment_attempts_total[5m])) * 100 > 2
for: 3m
labels:
severity: critical
team: payment
annotations:
summary: "支付失败率过高"
description: "支付失败率为 {{ $value }}%,超过阈值"
runbook_url: "https://wiki.example.com/runbooks/payment-failures"
# 用户注册量异常
- alert: AbnormalUserRegistration
expr: |
abs(sum(rate(user_registrations_total[1h])) - sum(rate(user_registrations_total[1h] offset 24h))) > 50
for: 10m
labels:
severity: info
team: growth
annotations:
summary: "用户注册量异常"
description: "当前小时注册量与昨日同期相比差异为 {{ $value }} 人"
runbook_url: "https://wiki.example.com/runbooks/registration-anomaly"
# 库存不足告警
- alert: LowInventory
expr: |
inventory_quantity < 10
for: 0s
labels:
severity: warning
team: inventory
annotations:
summary: "商品 {{ $labels.product_id }} 库存不足"
description: "商品 {{ $labels.product_id }} 当前库存为 {{ $value }} 件"
runbook_url: "https://wiki.example.com/runbooks/low-inventory"
"""
def create_security_rules(self) -> str:
"""创建安全相关告警规则"""
return """
# 安全告警规则组
groups:
- name: security_alerts
rules:
# 异常登录尝试
- alert: HighFailedLoginAttempts
expr: |
sum(rate(login_failures_total[5m])) by (source_ip) > 10
for: 1m
labels:
severity: warning
team: security
annotations:
summary: "IP {{ $labels.source_ip }} 异常登录尝试"
description: "IP {{ $labels.source_ip }} 在5分钟内失败登录 {{ $value }} 次"
runbook_url: "https://wiki.example.com/runbooks/failed-logins"
# 可疑API调用
- alert: SuspiciousAPIUsage
expr: |
sum(rate(api_requests_total[1m])) by (api_key, endpoint) > 100
for: 2m
labels:
severity: warning
team: security
annotations:
summary: "API密钥 {{ $labels.api_key }} 使用异常"
description: "API密钥 {{ $labels.api_key }} 对端点 {{ $labels.endpoint }} 的调用频率为 {{ $value }} req/min"
runbook_url: "https://wiki.example.com/runbooks/api-abuse"
# SSL证书即将过期
- alert: SSLCertificateExpiringSoon
expr: |
(ssl_certificate_expiry_timestamp - time()) / 86400 < 30
for: 1h
labels:
severity: warning
team: infrastructure
annotations:
summary: "SSL证书即将过期"
description: "域名 {{ $labels.domain }} 的SSL证书将在 {{ $value }} 天后过期"
runbook_url: "https://wiki.example.com/runbooks/ssl-expiry"
# 异常文件访问
- alert: UnauthorizedFileAccess
expr: |
sum(rate(file_access_denied_total[5m])) by (user, file_path) > 5
for: 1m
labels:
severity: critical
team: security
annotations:
summary: "用户 {{ $labels.user }} 异常文件访问"
description: "用户 {{ $labels.user }} 尝试访问 {{ $labels.file_path }},被拒绝 {{ $value }} 次"
runbook_url: "https://wiki.example.com/runbooks/file-access"
"""
def generate_rule_validation_script(self) -> str:
"""生成规则验证脚本"""
return """
#!/bin/bash
# Prometheus告警规则验证脚本
set -e
RULES_DIR="/etc/prometheus/rules"
PROMETHEUS_URL="http://localhost:9090"
TEMP_DIR="/tmp/prometheus-rules-test"
echo "开始验证Prometheus告警规则..."
# 创建临时目录
mkdir -p $TEMP_DIR
# 验证规则文件语法
echo "1. 验证规则文件语法"
for rule_file in $RULES_DIR/*.yml; do
if [ -f "$rule_file" ]; then
echo "验证文件: $rule_file"
promtool check rules "$rule_file"
if [ $? -eq 0 ]; then
echo "✓ $rule_file 语法正确"
else
echo "✗ $rule_file 语法错误"
exit 1
fi
fi
done
# 验证Prometheus连接(只需检查一次,无需按规则文件循环)
echo -e "\n2. 验证Prometheus连接"
if promtool query instant $PROMETHEUS_URL 'up' > /dev/null 2>&1; then
    echo "✓ Prometheus连接正常"
else
    echo "✗ 无法连接到Prometheus"
    exit 1
fi
# 测试规则表达式
echo -e "\n3. 测试规则表达式"
cat > $TEMP_DIR/test_queries.txt << 'EOF'
up == 0
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
sum(rate(http_requests_total{status_code=~"[45].."}[5m])) by (job, instance) / sum(rate(http_requests_total[5m])) by (job, instance) * 100 > 5
EOF
while IFS= read -r query; do
echo "测试查询: $query"
promtool query instant $PROMETHEUS_URL "$query" > /dev/null 2>&1
if [ $? -eq 0 ]; then
echo "✓ 查询执行成功"
else
echo "⚠ 查询可能有问题(可能是因为没有相关指标)"
fi
done < $TEMP_DIR/test_queries.txt
# 清理临时文件
rm -rf $TEMP_DIR
echo "\n告警规则验证完成!"
"""
def generate_rule_best_practices(self) -> List[str]:
"""生成告警规则最佳实践"""
return [
"✓ 使用有意义的告警名称和描述",
"✓ 设置合适的for持续时间避免误报",
"✓ 包含runbook_url指向处理文档",
"✓ 使用标准的severity标签",
"✓ 在annotations中提供足够的上下文信息",
"✓ 避免过于复杂的PromQL表达式",
"✓ 使用记录规则简化复杂查询",
"✓ 定期审查和更新告警阈值",
"✓ 为不同团队设置不同的标签",
"✓ 测试告警规则的有效性",
"✓ 避免告警风暴和级联告警",
"✓ 使用模板变量提高可读性",
"✓ 考虑业务影响设置优先级",
"✓ 实施告警规则版本控制",
"✓ 监控告警规则的性能影响"
]
# 使用示例
rule_manager = AlertRuleManager()
# 生成各类告警规则
basic_rules = rule_manager.create_basic_rules()
app_rules = rule_manager.create_application_rules()
business_rules = rule_manager.create_business_rules()
security_rules = rule_manager.create_security_rules()
print("告警规则已生成")
print(f"\n基础规则示例:")
print(basic_rules[:300] + "...")
print(f"\n应用规则示例:")
print(app_rules[:300] + "...")
# 生成验证脚本
validation_script = rule_manager.generate_rule_validation_script()
print(f"\n验证脚本长度: {len(validation_script)} 字符")
# 获取最佳实践
best_practices = rule_manager.generate_rule_best_practices()
print(f"\n最佳实践数量: {len(best_practices)}")
for practice in best_practices[:5]:
print(practice)
print("...")
Alertmanager配置
1. 基础配置
class AlertmanagerConfig:
"""Alertmanager配置管理器"""
def __init__(self):
self.config = {}
self.routes = []
self.receivers = []
self.inhibit_rules = []
def generate_basic_config(self) -> str:
"""生成基础Alertmanager配置"""
return """
# Alertmanager基础配置
global:
# SMTP配置
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'your-password'
smtp_require_tls: true
# Slack配置
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
# 全局标签
external_labels:
cluster: 'production'
environment: 'prod'
# 模板文件
templates:
- '/etc/alertmanager/templates/*.tmpl'
# 路由配置
route:
# 默认接收器
receiver: 'default'
# 分组配置
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
# 子路由
routes:
# 关键告警立即发送
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 0s
group_interval: 5m
repeat_interval: 30m
# 基础设施告警
- match:
team: infrastructure
receiver: 'infrastructure-team'
group_by: ['alertname', 'instance']
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
# 应用程序告警
- match:
team: backend
receiver: 'backend-team'
group_by: ['alertname', 'service']
group_wait: 1m
group_interval: 10m
repeat_interval: 4h
# 安全告警
- match:
team: security
receiver: 'security-team'
group_wait: 0s
group_interval: 1m
repeat_interval: 15m
# 业务告警
- match:
team: business
receiver: 'business-team'
group_by: ['alertname']
group_wait: 5m
group_interval: 30m
repeat_interval: 12h
# 接收器配置
receivers:
# 默认接收器
- name: 'default'
email_configs:
- to: 'admin@example.com'
subject: '[ALERT] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
标签: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
# 关键告警接收器
- name: 'critical-alerts'
email_configs:
- to: 'oncall@example.com'
subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
body: |
🚨 关键告警 🚨
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
严重级别: {{ .Labels.severity }}
影响服务: {{ .Labels.service | default "未知" }}
处理手册: {{ .Annotations.runbook_url | default "无" }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
slack_configs:
- channel: '#critical-alerts'
title: '🚨 关键告警'
text: |
{{ range .Alerts }}
*告警*: {{ .Annotations.summary }}
*描述*: {{ .Annotations.description }}
*标签*: {{ range .Labels.SortedPairs }}`{{ .Name }}`={{ .Value }} {{ end }}
{{ end }}
send_resolved: true
# 基础设施团队接收器
- name: 'infrastructure-team'
email_configs:
- to: 'infra-team@example.com'
subject: '[INFRA] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#infrastructure'
title: '🔧 基础设施告警'
send_resolved: true
# 后端团队接收器
- name: 'backend-team'
email_configs:
- to: 'backend-team@example.com'
subject: '[BACKEND] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#backend-alerts'
title: '💻 后端服务告警'
send_resolved: true
# 安全团队接收器
- name: 'security-team'
email_configs:
- to: 'security-team@example.com'
subject: '[SECURITY] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#security-alerts'
title: '🔒 安全告警'
send_resolved: true
# 业务团队接收器
- name: 'business-team'
email_configs:
- to: 'business-team@example.com'
subject: '[BUSINESS] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#business-metrics'
title: '📊 业务指标告警'
send_resolved: true
# 抑制规则
inhibit_rules:
# 实例宕机时抑制其他告警
- source_match:
alertname: 'InstanceDown'
target_match_re:
alertname: '(HighCPUUsage|HighMemoryUsage|DiskSpaceLow)'
equal: ['instance']
# 关键告警抑制警告告警
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
# 服务不可用时抑制性能告警
- source_match:
alertname: 'ServiceUnavailable'
target_match_re:
alertname: '(HighLatency|HighErrorRate)'
equal: ['service']
"""
def generate_advanced_config(self) -> str:
"""生成高级Alertmanager配置"""
return """
# Alertmanager高级配置
global:
# 解析超时
resolve_timeout: 5m
# HTTP配置
http_config:
proxy_url: 'http://proxy.example.com:8080'
tls_config:
insecure_skip_verify: false
# SMTP配置(支持多个SMTP服务器)
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password_file: '/etc/alertmanager/smtp_password'
smtp_require_tls: true
smtp_hello: 'alertmanager.example.com'
# 企业微信配置
wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
wechat_api_secret: 'your-wechat-secret'
wechat_api_corp_id: 'your-corp-id'
# PagerDuty配置
pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
# OpsGenie配置
opsgenie_api_url: 'https://api.opsgenie.com/'
opsgenie_api_key_file: '/etc/alertmanager/opsgenie_key'
# 模板配置
templates:
- '/etc/alertmanager/templates/*.tmpl'
- '/etc/alertmanager/custom-templates/*.tmpl'
# 高级路由配置
route:
receiver: 'default'
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
routes:
# 时间敏感的路由(工作时间)
- match:
severity: critical
receiver: 'critical-business-hours'
active_time_intervals:
- 'business-hours'
group_wait: 0s
repeat_interval: 15m
# 非工作时间的关键告警
- match:
severity: critical
receiver: 'critical-after-hours'
active_time_intervals:
- 'after-hours'
group_wait: 0s
repeat_interval: 30m
# 基于标签的复杂路由
- match_re:
service: '(api|web|database)'
environment: 'production'
receiver: 'production-services'
routes:
# API服务特殊处理
- match:
service: api
receiver: 'api-team'
group_by: ['alertname', 'endpoint']
# 数据库告警
- match:
service: database
receiver: 'dba-team'
group_by: ['alertname', 'database']
# 测试环境告警(降低优先级)
- match:
environment: 'staging'
receiver: 'staging-alerts'
group_wait: 5m
group_interval: 30m
repeat_interval: 24h
# 开发环境告警(仅记录)
- match:
environment: 'development'
receiver: 'dev-alerts'
group_wait: 10m
repeat_interval: 8760h  # 设得足够长,近似不再重复发送
# 时间间隔定义
time_intervals:
- name: 'business-hours'
time_intervals:
- times:
- start_time: '09:00'
end_time: '18:00'
weekdays: ['monday:friday']
location: 'Asia/Shanghai'
- name: 'after-hours'
time_intervals:
- times:
- start_time: '18:00'
end_time: '09:00'
weekdays: ['monday:friday']
location: 'Asia/Shanghai'
- weekdays: ['saturday', 'sunday']
location: 'Asia/Shanghai'
- name: 'maintenance-window'
time_intervals:
- times:
- start_time: '02:00'
end_time: '04:00'
weekdays: ['sunday']
location: 'Asia/Shanghai'
# 高级接收器配置
receivers:
- name: 'default'
webhook_configs:
- url: 'http://alertmanager-webhook:8080/webhook'
send_resolved: true
http_config:
bearer_token_file: '/etc/alertmanager/webhook_token'
# 工作时间关键告警
- name: 'critical-business-hours'
email_configs:
- to: 'oncall@example.com'
subject: '[URGENT] {{ .GroupLabels.alertname }}'
headers:
Priority: 'high'
X-Priority: '1'
slack_configs:
- api_url_file: '/etc/alertmanager/slack_webhook'
channel: '#critical-alerts'
title: '🚨 紧急告警 - 工作时间'
text: |
<!channel> 紧急告警需要立即处理
{{ range .Alerts }}
*告警*: {{ .Annotations.summary }}
*描述*: {{ .Annotations.description }}
*处理手册*: {{ .Annotations.runbook_url }}
{{ end }}
pagerduty_configs:
- routing_key_file: '/etc/alertmanager/pagerduty_key'
description: '{{ .GroupLabels.alertname }}'
severity: 'critical'
details:
summary: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
source: '{{ .GroupLabels.instance }}'
# 非工作时间关键告警
- name: 'critical-after-hours'
email_configs:
- to: 'oncall-night@example.com'
subject: '[CRITICAL-NIGHT] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#critical-alerts'
title: '🌙 夜间关键告警'
wechat_configs:
- corp_id: 'your-corp-id'
to_user: '@all'
agent_id: 'your-agent-id'
api_secret_file: '/etc/alertmanager/wechat_secret'
message: |
【夜间紧急告警】
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
时间: {{ .StartsAt.Format "15:04:05" }}
{{ end }}
# API团队接收器
- name: 'api-team'
slack_configs:
- channel: '#api-team'
title: '🔌 API服务告警'
fields:
- title: '服务'
value: '{{ .GroupLabels.service }}'
short: true
- title: '端点'
value: '{{ .GroupLabels.endpoint }}'
short: true
webhook_configs:
- url: 'http://api-monitoring:8080/alerts'
http_config:
basic_auth:
username: 'alertmanager'
password_file: '/etc/alertmanager/api_webhook_password'
# DBA团队接收器
- name: 'dba-team'
email_configs:
- to: 'dba-team@example.com'
subject: '[DATABASE] {{ .GroupLabels.alertname }}'
body: |
数据库告警详情:
{{ range .Alerts }}
数据库: {{ .Labels.database }}
实例: {{ .Labels.instance }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
查询语句: {{ .Labels.query | default "无" }}
表名: {{ .Labels.table | default "无" }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ if .EndsAt }}结束时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}
{{ end }}
# 高级抑制规则
inhibit_rules:
# 维护窗口期间抑制所有告警
- source_match:
alertname: 'MaintenanceMode'
target_match_re:
alertname: '.*'
equal: ['cluster']
# 网络分区时抑制实例告警
- source_match:
alertname: 'NetworkPartition'
target_match_re:
alertname: '(InstanceDown|HighLatency)'
equal: ['datacenter']
# 负载均衡器故障时抑制后端告警
- source_match:
alertname: 'LoadBalancerDown'
target_match_re:
alertname: '(ServiceUnavailable|HighErrorRate)'
equal: ['service']
# 存储故障时抑制相关告警
- source_match:
alertname: 'StorageFailure'
target_match_re:
alertname: '(DiskSpaceLow|HighDiskIOWait|DatabaseConnectionIssues)'
equal: ['storage_cluster']
"""
# 使用示例
alertmanager_config = AlertmanagerConfig()
# 生成配置
basic_config = alertmanager_config.generate_basic_config()
advanced_config = alertmanager_config.generate_advanced_config()
print("Alertmanager配置已生成")
print(f"\n基础配置长度: {len(basic_config)} 字符")
print(f"高级配置长度: {len(advanced_config)} 字符")
print(f"\n基础配置示例:")
print(basic_config[:400] + "...")
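在把配置交给 Alertmanager 之前,可以先在本地解析一遍,确认路由树的层级和接收器是否符合预期。下面是一个基于 PyYAML 的示意脚本(`summarize_routes`、输出文件名均为本文假设,正式校验请使用 amtool check-config):

```python
import yaml
from pathlib import Path

def summarize_routes(config_text: str) -> None:
    """解析 Alertmanager 配置,逐层打印路由对应的接收器,便于核对分发逻辑"""
    config = yaml.safe_load(config_text)

    def walk(route: dict, depth: int = 0) -> None:
        matcher = route.get("match") or route.get("match_re") or {}
        print("  " * depth + f"-> {route.get('receiver', '(继承上级)')} {matcher}")
        for child in route.get("routes", []):
            walk(child, depth + 1)

    walk(config["route"])

# 将基础配置写到本地文件并打印路由树(文件名仅为示例)
Path("alertmanager-basic.yml").write_text(basic_config, encoding="utf-8")
summarize_routes(basic_config)
```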
通知渠道配置
NotificationChannelManager 类
from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
class ChannelType(Enum):
EMAIL = "email"
SLACK = "slack"
WEBHOOK = "webhook"
PAGERDUTY = "pagerduty"
WECHAT = "wechat"
DINGTALK = "dingtalk"
TELEGRAM = "telegram"
DISCORD = "discord"
@dataclass
class NotificationChannel:
name: str
type: ChannelType
config: Dict[str, Any]
enabled: bool = True
class NotificationChannelManager:
def __init__(self):
self.channels = {}
def create_email_config(self, smtp_server: str, port: int = 587,
username: str = "", password: str = "",
from_addr: str = "", to_addrs: List[str] = None) -> Dict[str, Any]:
"""创建邮件通知配置"""
return {
"global": {
"smtp_smarthost": f"{smtp_server}:{port}",
"smtp_from": from_addr,
"smtp_auth_username": username,
"smtp_auth_password": password,
"smtp_require_tls": True
},
"receivers": [{
"name": "email-notifications",
"email_configs": [{
"to": ", ".join(to_addrs or []),
"subject": "[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}",
"body": """
告警详情:
{{ range .Alerts }}
告警名称: {{ .Annotations.summary }}
告警描述: {{ .Annotations.description }}
告警级别: {{ .Labels.severity }}
实例: {{ .Labels.instance }}
开始时间: {{ .StartsAt }}
{{ end }}
"""
}]
}]
}
def create_slack_config(self, webhook_url: str, channel: str = "#alerts",
username: str = "Prometheus") -> Dict[str, Any]:
"""创建 Slack 通知配置"""
return {
"receivers": [{
"name": "slack-notifications",
"slack_configs": [{
"api_url": webhook_url,
"channel": channel,
"username": username,
"title": "{{ .GroupLabels.alertname }}",
"text": """
{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*描述:* {{ .Annotations.description }}
*级别:* {{ .Labels.severity }}
*实例:* {{ .Labels.instance }}
*时间:* {{ .StartsAt }}
{{ end }}
""",
"color": "{{ if eq .Status \"firing\" }}danger{{ else }}good{{ end }}"
}]
}]
}
def create_webhook_config(self, url: str, headers: Dict[str, str] = None) -> Dict[str, Any]:
"""创建 Webhook 通知配置"""
config = {
"receivers": [{
"name": "webhook-notifications",
"webhook_configs": [{
"url": url,
"send_resolved": True,
"http_config": {
"basic_auth": {
"username": "prometheus",
"password": "secret"
}
}
}]
}]
}
if headers:
config["receivers"][0]["webhook_configs"][0]["http_config"]["headers"] = headers
return config
def create_telegram_config(self, bot_token: str, chat_id: str) -> Dict[str, Any]:
"""创建 Telegram 通知配置"""
return {
"receivers": [{
"name": "telegram-notifications",
"telegram_configs": [{
"bot_token": bot_token,
"chat_id": chat_id,
"message": """
🚨 *Prometheus 告警*
{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*描述:* {{ .Annotations.description }}
*级别:* {{ .Labels.severity }}
*实例:* {{ .Labels.instance }}
*时间:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
""",
"parse_mode": "Markdown"
}]
}]
}
def create_discord_config(self, webhook_url: str) -> Dict[str, Any]:
"""创建 Discord 通知配置"""
return {
"receivers": [{
"name": "discord-notifications",
"discord_configs": [{
"webhook_url": webhook_url,
"title": "Prometheus 告警",
"message": """
{{ range .Alerts }}
**告警:** {{ .Annotations.summary }}
**描述:** {{ .Annotations.description }}
**级别:** {{ .Labels.severity }}
**实例:** {{ .Labels.instance }}
**时间:** {{ .StartsAt }}
{{ end }}
"""
}]
}]
}
    def create_pagerduty_config(self, routing_key: str, service_key: str = "") -> Dict[str, Any]:
        """创建 PagerDuty 通知配置"""
        pagerduty_config: Dict[str, Any] = {
            # routing_key 对应 Events API v2;service_key 仅用于旧版集成,二者取其一
            "routing_key": routing_key,
            "description": "{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}",
            "severity": "{{ .CommonLabels.severity }}",
            "details": {
                "instance": "{{ .CommonLabels.instance }}",
                "job": "{{ .CommonLabels.job }}",
                "description": "{{ .CommonAnnotations.description }}"
            }
        }
        if service_key:
            pagerduty_config["service_key"] = service_key
        return {
            "receivers": [{
                "name": "pagerduty-notifications",
                "pagerduty_configs": [pagerduty_config]
            }]
        }
    def create_wechat_config(self, corp_id: str, agent_id: str, api_secret: str,
                             to_user: str = "@all") -> Dict[str, Any]:
        """创建企业微信通知配置"""
        return {
            "global": {
                "wechat_api_url": "https://qyapi.weixin.qq.com/cgi-bin/",
                "wechat_api_corp_id": corp_id
            },
            "receivers": [{
                "name": "wechat-notifications",
                "wechat_configs": [{
                    "agent_id": agent_id,
                    "api_secret": api_secret,
                    "to_user": to_user,
                    "message": """
告警通知
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
级别: {{ .Labels.severity }}
实例: {{ .Labels.instance }}
时间: {{ .StartsAt }}
{{ end }}
"""
                }]
            }]
        }
    def create_dingtalk_config(self, webhook_url: str, secret: str = "") -> Dict[str, Any]:
        """创建钉钉通知配置
        Alertmanager 没有原生的钉钉接收器,这里通过 webhook 指向
        prometheus-webhook-dingtalk 等桥接服务,消息标题与正文由桥接服务渲染。
        """
        config = {
            "receivers": [{
                "name": "dingtalk-notifications",
                "webhook_configs": [{
                    "url": webhook_url,
                    "send_resolved": True
                }]
            }]
        }
        if secret:
            # 加签密钥通常配置在桥接服务侧,这里仅保留请求头示例
            config["receivers"][0]["webhook_configs"][0]["http_config"] = {
                "headers": {
                    "Content-Type": "application/json"
                }
            }
        return config
def generate_multi_channel_config(self) -> str:
"""生成多渠道通知配置示例"""
return """
# 多渠道通知配置示例
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'password'
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
wechat_api_corp_id: 'your-corp-id'
route:
receiver: 'default'
group_by: ['alertname']
routes:
# 关键告警 - 多渠道通知
- match:
severity: critical
receiver: 'critical-multi-channel'
group_wait: 0s
repeat_interval: 15m
# 警告告警 - Slack + 邮件
- match:
severity: warning
receiver: 'warning-notifications'
group_wait: 5m
repeat_interval: 2h
# 信息告警 - 仅Slack
- match:
severity: info
receiver: 'info-notifications'
group_wait: 10m
repeat_interval: 12h
receivers:
- name: 'default'
email_configs:
- to: 'admin@example.com'
subject: '[ALERT] {{ .GroupLabels.alertname }}'
# 关键告警多渠道
- name: 'critical-multi-channel'
email_configs:
- to: 'oncall@example.com,manager@example.com'
subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
body: |
🚨 关键告警 🚨
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
实例: {{ .Labels.instance }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
处理手册: {{ .Annotations.runbook_url }}
{{ end }}
slack_configs:
- channel: '#critical-alerts'
title: '🚨 关键告警'
text: |
<!channel> 关键告警需要立即处理
{{ range .Alerts }}
*告警*: {{ .Annotations.summary }}
*描述*: {{ .Annotations.description }}
*实例*: {{ .Labels.instance }}
*处理手册*: {{ .Annotations.runbook_url }}
{{ end }}
send_resolved: true
wechat_configs:
- agent_id: 'your-agent-id'
api_secret: 'your-api-secret'
to_user: '@all'
message: |
【紧急告警】
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
实例: {{ .Labels.instance }}
时间: {{ .StartsAt.Format "15:04" }}
{{ end }}
pagerduty_configs:
- routing_key: 'your-pagerduty-key'
description: '{{ .GroupLabels.alertname }}'
severity: 'critical'
# 警告告警
- name: 'warning-notifications'
email_configs:
- to: 'team@example.com'
subject: '[WARNING] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#alerts'
title: '⚠️ 警告告警'
send_resolved: true
# 信息告警
- name: 'info-notifications'
slack_configs:
- channel: '#monitoring'
title: 'ℹ️ 信息告警'
send_resolved: true
"""
# 使用示例
channel_manager = NotificationChannelManager()
# 创建各种通知渠道配置
email_config = channel_manager.create_email_config(
smtp_server="smtp.example.com",
from_addr="alerts@example.com",
to_addrs=["admin@example.com", "team@example.com"]
)
slack_config = channel_manager.create_slack_config(
webhook_url="https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
channel="#alerts"
)
wechat_config = channel_manager.create_wechat_config(
corp_id="your-corp-id",
agent_id="your-agent-id",
api_secret="your-api-secret"
)
print("通知渠道配置已生成")
print(f"邮件配置: {len(str(email_config))} 字符")
print(f"Slack配置: {len(str(slack_config))} 字符")
print(f"企业微信配置: {len(str(wechat_config))} 字符")
告警模板系统
AlertTemplateManager 类
class AlertTemplateManager:
"""告警模板管理器"""
def __init__(self):
self.templates = {}
def create_email_templates(self) -> Dict[str, str]:
"""创建邮件模板"""
return {
"critical.tmpl": """
{{ define "email.critical.subject" }}
[🚨 CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.instance }}
{{ end }}
{{ define "email.critical.body" }}
<!DOCTYPE html>
<html>
<head>
<style>
body { font-family: Arial, sans-serif; }
.critical { background-color: #ff4444; color: white; padding: 10px; }
.alert-info { background-color: #f5f5f5; padding: 15px; margin: 10px 0; }
.label { font-weight: bold; }
</style>
</head>
<body>
<div class="critical">
<h2>🚨 关键告警通知</h2>
</div>
{{ range .Alerts }}
<div class="alert-info">
<h3>{{ .Annotations.summary }}</h3>
<p><span class="label">描述:</span> {{ .Annotations.description }}</p>
<p><span class="label">实例:</span> {{ .Labels.instance }}</p>
<p><span class="label">服务:</span> {{ .Labels.job }}</p>
<p><span class="label">严重级别:</span> {{ .Labels.severity }}</p>
<p><span class="label">开始时间:</span> {{ .StartsAt.Format "2006-01-02 15:04:05" }}</p>
{{ if .Annotations.runbook_url }}
<p><span class="label">处理手册:</span> <a href="{{ .Annotations.runbook_url }}">点击查看</a></p>
{{ end }}
{{ if .Annotations.dashboard_url }}
<p><span class="label">监控面板:</span> <a href="{{ .Annotations.dashboard_url }}">点击查看</a></p>
{{ end }}
</div>
{{ end }}
<p><small>此邮件由 Prometheus Alertmanager 自动发送</small></p>
</body>
</html>
{{ end }}
""",
"warning.tmpl": """
{{ define "email.warning.subject" }}
[⚠️ WARNING] {{ .GroupLabels.alertname }}
{{ end }}
{{ define "email.warning.body" }}
警告告警通知
{{ range .Alerts }}
告警名称: {{ .Annotations.summary }}
告警描述: {{ .Annotations.description }}
实例: {{ .Labels.instance }}
服务: {{ .Labels.job }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ if .Annotations.runbook_url }}
处理手册: {{ .Annotations.runbook_url }}
{{ end }}
{{ end }}
--
此邮件由 Prometheus Alertmanager 自动发送
{{ end }}
"""
}
def create_slack_templates(self) -> Dict[str, str]:
"""创建 Slack 模板"""
return {
"slack.tmpl": """
{{ define "slack.title" }}
{{ if eq .Status "firing" }}
{{ if eq .CommonLabels.severity "critical" }}🚨{{ else if eq .CommonLabels.severity "warning" }}⚠️{{ else }}ℹ️{{ end }}
{{ else }}✅{{ end }}
{{ .GroupLabels.alertname }}
{{ end }}
{{ define "slack.text" }}
{{ if eq .Status "firing" }}
*状态:* 🔥 触发中
{{ else }}
*状态:* ✅ 已恢复
{{ end }}
{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*描述:* {{ .Annotations.description }}
*实例:* {{ .Labels.instance }}
*服务:* {{ .Labels.job }}
*级别:* {{ .Labels.severity }}
*时间:* {{ if eq $.Status "firing" }}{{ .StartsAt.Format "15:04:05" }}{{ else }}{{ .EndsAt.Format "15:04:05" }}{{ end }}
{{ if .Annotations.runbook_url }}
*处理手册:* <{{ .Annotations.runbook_url }}|点击查看>
{{ end }}
{{ if .Annotations.dashboard_url }}
*监控面板:* <{{ .Annotations.dashboard_url }}|点击查看>
{{ end }}
{{ end }}
{{ end }}
{{ define "slack.color" }}
{{ if eq .Status "firing" }}
{{ if eq .CommonLabels.severity "critical" }}danger{{ else if eq .CommonLabels.severity "warning" }}warning{{ else }}good{{ end }}
{{ else }}good{{ end }}
{{ end }}
"""
}
def create_wechat_templates(self) -> Dict[str, str]:
"""创建企业微信模板"""
return {
"wechat.tmpl": """
{{ define "wechat.message" }}
{{ if eq .Status "firing" }}【告警通知】{{ else }}【恢复通知】{{ end }}
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
实例: {{ .Labels.instance }}
级别: {{ .Labels.severity }}
时间: {{ if eq $.Status "firing" }}{{ .StartsAt.Format "15:04" }}{{ else }}{{ .EndsAt.Format "15:04" }}{{ end }}
{{ if .Annotations.runbook_url }}
手册: {{ .Annotations.runbook_url }}
{{ end }}
{{ end }}
{{ end }}
"""
}
def create_webhook_templates(self) -> Dict[str, str]:
"""创建 Webhook 模板"""
return {
"webhook.tmpl": """
{{ define "webhook.payload" }}
{
"status": "{{ .Status }}",
"alerts": [
{{ range $index, $alert := .Alerts }}
{{ if $index }},{{ end }}
{
"status": "{{ $alert.Status }}",
"labels": {
{{ range $key, $value := $alert.Labels }}
"{{ $key }}": "{{ $value }}",
{{ end }}
"alertname": "{{ $alert.Labels.alertname }}"
},
"annotations": {
{{ range $key, $value := $alert.Annotations }}
"{{ $key }}": "{{ $value }}",
{{ end }}
"summary": "{{ $alert.Annotations.summary }}"
},
"startsAt": "{{ $alert.StartsAt }}",
"endsAt": "{{ $alert.EndsAt }}",
"generatorURL": "{{ $alert.GeneratorURL }}"
}
{{ end }}
],
"groupLabels": {
{{ range $key, $value := .GroupLabels }}
"{{ $key }}": "{{ $value }}",
{{ end }}
"alertname": "{{ .GroupLabels.alertname }}"
},
"commonLabels": {
{{ range $key, $value := .CommonLabels }}
"{{ $key }}": "{{ $value }}",
{{ end }}
"alertname": "{{ .CommonLabels.alertname }}"
},
"commonAnnotations": {
{{ range $key, $value := .CommonAnnotations }}
"{{ $key }}": "{{ $value }}",
{{ end }}
"summary": "{{ .CommonAnnotations.summary }}"
},
"externalURL": "{{ .ExternalURL }}",
"version": "4",
"groupKey": "{{ .GroupKey }}",
"truncatedAlerts": {{ .TruncatedAlerts }}
}
{{ end }}
"""
}
def generate_template_config(self) -> str:
"""生成模板配置文件"""
return """
# Alertmanager 模板配置
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
# 模板文件路径
templates:
- '/etc/alertmanager/templates/*.tmpl'
route:
receiver: 'default'
group_by: ['alertname']
routes:
- match:
severity: critical
receiver: 'critical-with-template'
- match:
severity: warning
receiver: 'warning-with-template'
receivers:
- name: 'default'
email_configs:
- to: 'admin@example.com'
subject: '{{ template "email.warning.subject" . }}'
body: '{{ template "email.warning.body" . }}'
- name: 'critical-with-template'
email_configs:
- to: 'oncall@example.com'
subject: '{{ template "email.critical.subject" . }}'
body: '{{ template "email.critical.body" . }}'
headers:
Content-Type: 'text/html; charset=UTF-8'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#critical-alerts'
title: '{{ template "slack.title" . }}'
text: '{{ template "slack.text" . }}'
color: '{{ template "slack.color" . }}'
send_resolved: true
- name: 'warning-with-template'
email_configs:
- to: 'team@example.com'
subject: '{{ template "email.warning.subject" . }}'
body: '{{ template "email.warning.body" . }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#alerts'
title: '{{ template "slack.title" . }}'
text: '{{ template "slack.text" . }}'
color: '{{ template "slack.color" . }}'
send_resolved: true
"""
# 使用示例
template_manager = AlertTemplateManager()
# 生成各种模板
email_templates = template_manager.create_email_templates()
slack_templates = template_manager.create_slack_templates()
wechat_templates = template_manager.create_wechat_templates()
webhook_templates = template_manager.create_webhook_templates()
print("告警模板已生成")
print(f"邮件模板数量: {len(email_templates)}")
print(f"Slack模板数量: {len(slack_templates)}")
print(f"企业微信模板数量: {len(wechat_templates)}")
print(f"Webhook模板数量: {len(webhook_templates)}")
# 生成模板配置
template_config = template_manager.generate_template_config()
print(f"\n模板配置长度: {len(template_config)} 字符")
高级告警功能
AlertAdvancedManager 类
class AlertAdvancedManager:
"""高级告警功能管理器"""
def __init__(self):
self.silences = []
self.maintenance_windows = []
self.escalation_policies = []
def create_silence_config(self) -> str:
"""创建静默配置"""
return """
# 告警静默管理
# 1. 通过 amtool 命令行工具创建静默
# 静默特定实例的所有告警(维护期间)
amtool silence add instance="server-01.example.com" --duration="2h" --comment="服务器维护"
# 静默特定告警类型
amtool silence add alertname="HighCPUUsage" --duration="1h" --comment="已知问题,正在处理"
# 静默特定服务的告警
amtool silence add job="web-server" severity="warning" --duration="30m" --comment="部署期间"
# 2. 通过 API 创建静默
curl -X POST http://alertmanager:9093/api/v1/silences \
-H "Content-Type: application/json" \
-d '{
"matchers": [
{
"name": "alertname",
"value": "InstanceDown",
"isRegex": false
},
{
"name": "instance",
"value": "server-.*",
"isRegex": true
}
],
"startsAt": "2024-01-01T10:00:00Z",
"endsAt": "2024-01-01T12:00:00Z",
"createdBy": "admin@example.com",
"comment": "计划维护窗口"
}'
# 3. 查看当前静默
amtool silence query
# 4. 删除静默
amtool silence expire <silence-id>
"""
def create_inhibition_rules(self) -> str:
"""创建抑制规则配置"""
return """
# 高级抑制规则配置
inhibit_rules:
# 1. 实例宕机时抑制所有相关告警
- source_match:
alertname: 'InstanceDown'
target_match_re:
alertname: '(HighCPUUsage|HighMemoryUsage|DiskSpaceLow|HighDiskIOWait|ServiceUnavailable)'
equal: ['instance']
# 2. 网络分区时抑制连接相关告警
- source_match:
alertname: 'NetworkPartition'
target_match_re:
alertname: '(DatabaseConnectionFailed|APITimeout|ServiceUnavailable)'
equal: ['datacenter', 'zone']
# 3. 负载均衡器故障时抑制后端服务告警
- source_match:
alertname: 'LoadBalancerDown'
target_match_re:
alertname: '(HighLatency|HighErrorRate|ServiceUnavailable)'
equal: ['service', 'environment']
# 4. 存储集群故障时抑制相关告警
- source_match:
alertname: 'StorageClusterDown'
target_match_re:
alertname: '(DatabaseSlow|DiskSpaceLow|BackupFailed)'
equal: ['storage_cluster']
# 5. 关键告警抑制同类警告告警
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance', 'service']
# 6. 维护模式时抑制所有告警
- source_match:
alertname: 'MaintenanceMode'
target_match_re:
alertname: '.*'
equal: ['cluster', 'environment']
# 7. 上游服务故障时抑制下游告警
- source_match:
alertname: 'UpstreamServiceDown'
service: 'auth-service'
target_match_re:
alertname: '(AuthenticationFailed|UnauthorizedAccess)'
equal: ['environment']
# 8. DNS故障时抑制域名解析相关告警
- source_match:
alertname: 'DNSResolutionFailed'
target_match_re:
alertname: '(ServiceDiscoveryFailed|ExternalAPITimeout)'
equal: ['dns_zone']
"""
def create_escalation_policy(self) -> str:
"""创建告警升级策略"""
return """
# 告警升级策略配置
route:
receiver: 'default'
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
routes:
# 关键告警升级策略
- match:
severity: critical
receiver: 'critical-level-1'
group_wait: 0s
repeat_interval: 15m
routes:
# 15分钟后升级到二级
- match:
severity: critical
receiver: 'critical-level-2'
group_wait: 15m
repeat_interval: 30m
routes:
# 45分钟后升级到三级
- match:
severity: critical
receiver: 'critical-level-3'
group_wait: 30m
repeat_interval: 1h
# 业务告警升级策略
- match:
team: business
severity: warning
receiver: 'business-level-1'
group_wait: 5m
repeat_interval: 2h
routes:
# 2小时后升级
- match:
team: business
severity: warning
receiver: 'business-level-2'
group_wait: 2h
repeat_interval: 4h
# 安全告警立即升级
- match:
team: security
receiver: 'security-immediate'
group_wait: 0s
repeat_interval: 5m
receivers:
# 关键告警一级响应
- name: 'critical-level-1'
email_configs:
- to: 'oncall-primary@example.com'
subject: '[CRITICAL-L1] {{ .GroupLabels.alertname }}'
slack_configs:
- channel: '#critical-alerts'
title: '🚨 关键告警 - 一级响应'
# 关键告警二级响应
- name: 'critical-level-2'
email_configs:
- to: 'oncall-primary@example.com,oncall-secondary@example.com'
subject: '[CRITICAL-L2] {{ .GroupLabels.alertname }} - 升级'
slack_configs:
- channel: '#critical-alerts'
title: '🚨🚨 关键告警 - 二级响应(升级)'
pagerduty_configs:
- routing_key: 'level-2-routing-key'
severity: 'critical'
# 关键告警三级响应
- name: 'critical-level-3'
email_configs:
- to: 'oncall-primary@example.com,oncall-secondary@example.com,manager@example.com'
subject: '[CRITICAL-L3] {{ .GroupLabels.alertname }} - 最高级别'
slack_configs:
- channel: '#critical-alerts'
title: '🚨🚨🚨 关键告警 - 三级响应(最高级别)'
text: '<!channel> 关键告警已升级到最高级别,需要立即处理'
pagerduty_configs:
- routing_key: 'level-3-routing-key'
severity: 'critical'
wechat_configs:
- agent_id: 'emergency-agent'
to_user: '@all'
message: '【紧急告警-最高级别】需要立即处理'
"""
def create_maintenance_window_config(self) -> str:
"""创建维护窗口配置"""
return """
# 维护窗口配置
time_intervals:
# 定期维护窗口
- name: 'weekly-maintenance'
time_intervals:
- times:
- start_time: '02:00'
end_time: '04:00'
weekdays: ['sunday']
location: 'Asia/Shanghai'
# 月度维护窗口
- name: 'monthly-maintenance'
time_intervals:
- times:
- start_time: '01:00'
end_time: '05:00'
weekdays: ['saturday']
days_of_month: ['1']
location: 'Asia/Shanghai'
# 紧急维护窗口
- name: 'emergency-maintenance'
time_intervals:
- times:
- start_time: '00:00'
end_time: '23:59'
weekdays: ['monday:sunday']
location: 'Asia/Shanghai'
# 工作时间
- name: 'business-hours'
time_intervals:
- times:
- start_time: '09:00'
end_time: '18:00'
weekdays: ['monday:friday']
location: 'Asia/Shanghai'
# 非工作时间
- name: 'after-hours'
time_intervals:
- times:
- start_time: '18:00'
end_time: '09:00'
weekdays: ['monday:friday']
location: 'Asia/Shanghai'
- weekdays: ['saturday', 'sunday']
location: 'Asia/Shanghai'
# 在路由中使用维护窗口
route:
receiver: 'default'
routes:
# 维护窗口期间的告警处理
- match:
maintenance: 'true'
receiver: 'maintenance-alerts'
active_time_intervals:
- 'weekly-maintenance'
- 'monthly-maintenance'
group_wait: 10m
repeat_interval: 8760h  # 维护期间设得足够长,近似不重复发送
# 工作时间的关键告警
- match:
severity: critical
receiver: 'critical-business-hours'
active_time_intervals:
- 'business-hours'
group_wait: 0s
repeat_interval: 15m
# 非工作时间的关键告警
- match:
severity: critical
receiver: 'critical-after-hours'
active_time_intervals:
- 'after-hours'
group_wait: 0s
repeat_interval: 30m
receivers:
- name: 'maintenance-alerts'
email_configs:
- to: 'maintenance-team@example.com'
subject: '[MAINTENANCE] {{ .GroupLabels.alertname }}'
body: |
维护期间告警通知
{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
注意: 此告警发生在维护窗口期间,可能与维护活动相关。
"""
def create_alert_routing_strategies(self) -> str:
"""创建告警路由策略"""
return """
# 高级告警路由策略
route:
receiver: 'default'
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
routes:
# 1. 基于环境的路由
- match:
environment: 'production'
receiver: 'production-alerts'
group_wait: 0s
routes:
# 生产环境关键告警
- match:
severity: critical
receiver: 'production-critical'
group_wait: 0s
repeat_interval: 15m
# 生产环境警告告警
- match:
severity: warning
receiver: 'production-warning'
group_wait: 5m
repeat_interval: 1h
# 2. 基于服务类型的路由
- match_re:
service: '(api|web|gateway)'
receiver: 'frontend-team'
routes:
# API服务特殊处理
- match:
service: api
receiver: 'api-team'
group_by: ['alertname', 'endpoint']
# Web服务特殊处理
- match:
service: web
receiver: 'web-team'
group_by: ['alertname', 'instance']
# 3. 基于数据中心的路由
- match:
datacenter: 'dc1'
receiver: 'dc1-team'
routes:
# DC1的数据库告警
- match:
service: database
receiver: 'dc1-dba-team'
- match:
datacenter: 'dc2'
receiver: 'dc2-team'
routes:
# DC2的数据库告警
- match:
service: database
receiver: 'dc2-dba-team'
# 4. 基于业务线的路由
- match:
business_unit: 'ecommerce'
receiver: 'ecommerce-team'
routes:
# 电商支付告警
- match:
component: payment
receiver: 'payment-team'
group_wait: 0s
repeat_interval: 10m
# 电商订单告警
- match:
component: order
receiver: 'order-team'
group_wait: 2m
repeat_interval: 30m
# 5. 基于告警频率的路由
- match:
alert_frequency: 'high'
receiver: 'high-frequency-alerts'
group_wait: 30s
group_interval: 5m
repeat_interval: 30m
# 6. 基于客户影响的路由
- match:
customer_impact: 'high'
receiver: 'customer-impact-team'
group_wait: 0s
repeat_interval: 10m
routes:
# VIP客户影响
- match:
customer_tier: 'vip'
receiver: 'vip-customer-team'
group_wait: 0s
repeat_interval: 5m
# 7. 测试和开发环境路由
- match:
environment: 'staging'
receiver: 'staging-alerts'
group_wait: 10m
repeat_interval: 4h
- match:
environment: 'development'
receiver: 'dev-alerts'
group_wait: 30m
repeat_interval: 8760h  # 开发环境设得足够长,近似不重复发送
"""
# 使用示例
advanced_manager = AlertAdvancedManager()
# 生成各种高级配置
silence_config = advanced_manager.create_silence_config()
inhibition_rules = advanced_manager.create_inhibition_rules()
escalation_policy = advanced_manager.create_escalation_policy()
maintenance_config = advanced_manager.create_maintenance_window_config()
routing_strategies = advanced_manager.create_alert_routing_strategies()
print("高级告警功能配置已生成")
print(f"静默配置长度: {len(silence_config)} 字符")
print(f"抑制规则长度: {len(inhibition_rules)} 字符")
print(f"升级策略长度: {len(escalation_policy)} 字符")
print(f"维护窗口配置长度: {len(maintenance_config)} 字符")
print(f"路由策略长度: {len(routing_strategies)} 字符")
告警最佳实践
AlertBestPractices 类
class AlertBestPractices:
"""告警最佳实践管理器"""
def __init__(self):
self.practices = []
self.anti_patterns = []
self.guidelines = []
def get_alert_design_principles(self) -> List[str]:
"""获取告警设计原则"""
return [
"🎯 告警应该是可操作的 - 每个告警都应该有明确的处理步骤",
"📊 告警应该基于症状而非原因 - 关注用户体验而非技术细节",
"⚡ 告警应该及时 - 在问题影响用户之前发出告警",
"🔍 告警应该具有足够的上下文 - 提供诊断和解决问题所需的信息",
"📈 告警阈值应该基于历史数据和业务需求",
"🔄 告警应该有明确的恢复条件",
"📝 告警应该包含处理文档链接",
"🎚️ 告警应该有适当的严重级别分类",
"🚫 避免告警疲劳 - 减少噪音和误报",
"🔗 告警应该与监控面板关联"
]
def get_alert_severity_guidelines(self) -> Dict[str, Dict[str, str]]:
"""获取告警严重级别指南"""
return {
"critical": {
"定义": "需要立即响应的告警,通常影响服务可用性或数据完整性",
"响应时间": "立即(5分钟内)",
"示例": "服务完全不可用、数据丢失、安全漏洞",
"通知方式": "电话、短信、邮件、即时消息",
"升级策略": "15分钟内无响应则升级"
},
"warning": {
"定义": "需要关注但不需要立即响应的告警",
"响应时间": "1小时内",
"示例": "性能下降、资源使用率高、非关键功能异常",
"通知方式": "邮件、即时消息",
"升级策略": "4小时内无响应则升级"
},
"info": {
"定义": "信息性告警,用于记录和趋势分析",
"响应时间": "下个工作日",
"示例": "部署完成、配置变更、趋势异常",
"通知方式": "邮件、日志",
"升级策略": "不升级"
}
}
def get_alert_naming_conventions(self) -> Dict[str, List[str]]:
"""获取告警命名规范"""
return {
"好的命名": [
"HighCPUUsage - 清晰描述问题",
"DatabaseConnectionPoolExhausted - 具体且可操作",
"APIResponseTimeHigh - 基于用户体验",
"DiskSpaceLow - 简洁明了",
"SSLCertificateExpiringSoon - 预防性告警"
],
"避免的命名": [
"Alert1 - 无意义的名称",
"SomethingWrong - 过于模糊",
"CPUAlert - 不够具体",
"Error - 过于通用",
"Problem - 没有描述性"
],
"命名规则": [
"使用驼峰命名法(CamelCase)",
"以问题类型开头(High, Low, Failed, Down等)",
"包含受影响的组件或服务",
"避免使用缩写和技术术语",
"保持名称简洁但具有描述性"
]
}
def get_alert_threshold_guidelines(self) -> str:
"""获取告警阈值设置指南"""
return """
# 告警阈值设置指南
## 1. 基于历史数据设置阈值
### CPU使用率告警
# 分析过去30天的CPU使用率分布(node_cpu_seconds_total 是计数器,不能直接套用 histogram_quantile)
# 设置阈值为P95 + 10%作为警告,P99作为关键
# 假设已有记录规则 instance:cpu_usage:percent 记录各实例的CPU使用率百分比
quantile_over_time(0.95, instance:cpu_usage:percent[30d]) + 10 # Warning 参考值
quantile_over_time(0.99, instance:cpu_usage:percent[30d]) # Critical 参考值
### 内存使用率告警
# 基于应用程序的内存使用模式
# 考虑内存泄漏的检测
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85 # Warning
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.95 # Critical
### 磁盘空间告警
# 基于磁盘增长速率预测
# 预留足够时间进行清理
predict_linear(node_filesystem_free_bytes[6h], 24*3600) < 0 # 预测24小时后磁盘满
## 2. 基于业务影响设置阈值
### API响应时间
# 基于SLA要求设置
# 99%的请求应在500ms内完成
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5
### 错误率告警
# 基于用户体验影响
# 错误率超过1%影响用户体验
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.01
## 3. 动态阈值设置
### 基于时间模式的动态阈值
# 工作时间和非工作时间使用不同阈值
(
(
hour() >= 9 and hour() <= 18 and
sum(rate(http_requests_total[5m])) < 100 # 工作时间阈值
) or (
(hour() < 9 or hour() > 18) and
sum(rate(http_requests_total[5m])) < 20 # 非工作时间阈值
)
)
### 基于历史同期的动态阈值
# 与上周同期比较
abs(
sum(rate(http_requests_total[5m])) -
sum(rate(http_requests_total[5m] offset 7d))
) / sum(rate(http_requests_total[5m] offset 7d)) > 0.3
## 4. 阈值调优建议
### 避免告警疲劳
- 定期审查告警频率和准确性
- 调整过于敏感的阈值
- 合并相关告警
- 使用抑制规则减少噪音
### 阈值测试
- 在测试环境验证阈值设置
- 模拟故障场景测试告警响应
- 收集团队反馈调整阈值
- 建立阈值调整的版本控制
### 文档化阈值决策
- 记录阈值设置的原因
- 定期审查和更新阈值
- 建立阈值变更审批流程
- 监控阈值调整的效果
"""
def get_alert_fatigue_prevention(self) -> List[str]:
"""获取告警疲劳预防策略"""
return [
"🎯 精确的阈值设置 - 避免过于敏感的告警",
"📊 基于影响的告警 - 只对真正影响用户的问题告警",
"🔄 智能分组 - 将相关告警合并为一个通知",
"⏰ 适当的重复间隔 - 避免过于频繁的重复通知",
"🚫 有效的抑制规则 - 在已知问题期间抑制相关告警",
"🔧 自动化修复 - 对可自动修复的问题实施自动化",
"📈 趋势分析 - 使用趋势而非绝对值进行告警",
"🎚️ 分级告警 - 使用不同严重级别避免所有告警都是紧急的",
"📝 定期审查 - 定期审查告警规则的有效性",
"🔍 根因分析 - 解决根本原因而非症状",
"⚡ 快速静默 - 提供快速静默机制处理已知问题",
"📊 告警指标 - 监控告警系统本身的健康状况"
]
def generate_alert_quality_metrics(self) -> str:
"""生成告警质量指标"""
return """
# 告警质量监控指标
## 1. 告警准确性指标
### 误报率(False Positive Rate)
# 计算被静默或快速解决的告警比例
(
sum(increase(alertmanager_silences_total[24h])) +
sum(increase(alertmanager_alerts_resolved_total{resolution_time="<5m"}[24h]))
) / sum(increase(alertmanager_alerts_total[24h])) * 100
### 漏报检测
# 通过SLA违反但无告警的情况检测
# 需要结合业务指标和告警状态
## 2. 告警响应指标
### 平均响应时间(MTTA - Mean Time To Acknowledge)
avg(alertmanager_alert_ack_time_seconds)
### 平均解决时间(MTTR - Mean Time To Resolve)
avg(alertmanager_alert_resolution_time_seconds)
### 告警升级率
sum(increase(alertmanager_alerts_escalated_total[24h])) /
sum(increase(alertmanager_alerts_total[24h])) * 100
## 3. 告警频率指标
### 每日告警数量
sum(increase(alertmanager_alerts_total[24h]))
### 高频告警识别
topk(10,
sum by (alertname) (
increase(alertmanager_alerts_total[24h])
)
)
### 告警风暴检测
# 短时间内大量告警
sum(increase(alertmanager_alerts_total[5m])) > 50
## 4. 告警覆盖率指标
### 服务覆盖率
# 有告警规则的服务比例
count(count by (service) (up)) / count(count by (service) (prometheus_rule_group_rules))
### 关键路径覆盖
# 关键业务流程的告警覆盖情况
## 5. 告警效果指标
### 问题预防率
# 通过告警预防的问题数量
### 用户影响减少
# 通过及时告警减少的用户影响时间
## 6. 告警系统健康指标
### Alertmanager可用性
up{job="alertmanager"}
### 告警规则评估延迟
prometheus_rule_evaluation_duration_seconds
### 通知发送成功率
sum(rate(alertmanager_notifications_total{state="success"}[5m])) /
sum(rate(alertmanager_notifications_total[5m])) * 100
## 7. 团队效率指标
### 告警处理效率
# 每个团队的告警处理速度和质量
### 告警知识积累
# Runbook使用率和更新频率
### 告警培训效果
# 新团队成员的告警处理能力提升
"""
# 使用示例
best_practices = AlertBestPractices()
# 获取各种最佳实践指南
design_principles = best_practices.get_alert_design_principles()
severity_guidelines = best_practices.get_alert_severity_guidelines()
naming_conventions = best_practices.get_alert_naming_conventions()
threshold_guidelines = best_practices.get_alert_threshold_guidelines()
fatigue_prevention = best_practices.get_alert_fatigue_prevention()
quality_metrics = best_practices.generate_alert_quality_metrics()
print("告警最佳实践指南已生成")
print(f"设计原则数量: {len(design_principles)}")
print(f"严重级别指南: {len(severity_guidelines)} 个级别")
print(f"命名规范类别: {len(naming_conventions)} 个")
print(f"疲劳预防策略: {len(fatigue_prevention)} 条")
print(f"质量指标长度: {len(quality_metrics)} 字符")
print("\n设计原则示例:")
for i, principle in enumerate(design_principles[:3], 1):
print(f"{i}. {principle}")
故障排除与监控
AlertTroubleshootingManager 类
class AlertTroubleshootingManager:
"""告警故障排除管理器"""
def __init__(self):
self.common_issues = []
self.diagnostic_queries = []
self.health_checks = []
def get_common_alerting_issues(self) -> Dict[str, Dict[str, str]]:
"""获取常见告警问题及解决方案"""
return {
"告警未触发": {
"症状": "预期的告警没有发出",
"可能原因": "规则语法错误、阈值设置不当、标签匹配问题、数据缺失",
"诊断步骤": "检查规则语法、验证查询结果、确认数据源、检查标签",
"解决方案": "修正规则语法、调整阈值、修复数据收集、更新标签匹配"
},
"告警风暴": {
"症状": "短时间内大量告警触发",
"可能原因": "阈值过于敏感、级联故障、配置错误、数据异常",
"诊断步骤": "分析告警模式、检查系统状态、审查配置变更",
"解决方案": "调整阈值、添加抑制规则、修复根本问题、临时静默"
},
"通知未发送": {
"症状": "告警触发但通知未收到",
"可能原因": "Alertmanager配置错误、网络问题、接收器故障、路由错误",
"诊断步骤": "检查Alertmanager日志、验证网络连接、测试接收器",
"解决方案": "修正配置、修复网络、更新接收器配置、调整路由规则"
},
"告警延迟": {
"症状": "告警触发时间过晚",
"可能原因": "评估间隔过长、查询复杂度高、资源不足、网络延迟",
"诊断步骤": "检查评估时间、分析查询性能、监控资源使用",
"解决方案": "优化查询、增加资源、调整评估间隔、简化规则"
},
"误报告警": {
"症状": "告警频繁触发但实际无问题",
"可能原因": "阈值设置不当、数据噪音、时间窗口不合适",
"诊断步骤": "分析历史数据、检查阈值合理性、评估时间窗口",
"解决方案": "调整阈值、增加平滑处理、优化时间窗口、添加条件"
}
}
def generate_diagnostic_queries(self) -> str:
"""生成诊断查询"""
return """
# 告警系统诊断查询
## 1. Prometheus 健康检查
### Prometheus 服务状态
up{job="prometheus"}
### 规则评估延迟
prometheus_rule_evaluation_duration_seconds
### 规则评估失败
increase(prometheus_rule_evaluation_failures_total[5m])
### 查询执行时间
histogram_quantile(0.95, rate(prometheus_engine_query_duration_seconds_bucket[5m]))
### 存储使用情况
prometheus_tsdb_symbol_table_size_bytes / 1024 / 1024 # MB
prometheus_tsdb_head_series
## 2. Alertmanager 健康检查
### Alertmanager 服务状态
up{job="alertmanager"}
### 告警处理延迟
histogram_quantile(0.95, rate(alertmanager_notification_latency_seconds_bucket[5m]))
### 通知发送状态
rate(alertmanager_notifications_total[5m])
rate(alertmanager_notifications_failed_total[5m])
### 活跃告警数量
alertmanager_alerts
alertmanager_alerts{state="active"}
alertmanager_alerts{state="suppressed"}
### 静默规则数量
alertmanager_silences
alertmanager_silences{state="active"}
## 3. 告警规则诊断
### 规则组评估时间
prometheus_rule_group_last_evaluation_timestamp_seconds
### 规则组评估间隔
prometheus_rule_group_interval_seconds
### 活跃告警规则
ALERTS_FOR_STATE
### 规则评估错误
increase(prometheus_rule_evaluation_failures_total[1h])
## 4. 网络和连接诊断
### Prometheus 到 Alertmanager 连接
prometheus_notifications_alertmanagers_discovered
prometheus_notifications_dropped_total
### 外部服务连接测试
probe_success{job="blackbox"}
probe_duration_seconds{job="blackbox"}
## 5. 资源使用诊断
### CPU 使用率
rate(process_cpu_seconds_total{job="prometheus"}[5m]) * 100
rate(process_cpu_seconds_total{job="alertmanager"}[5m]) * 100
### 内存使用
process_resident_memory_bytes{job="prometheus"} / 1024 / 1024 # MB
process_resident_memory_bytes{job="alertmanager"} / 1024 / 1024 # MB
### 磁盘使用
prometheus_tsdb_wal_size_bytes / 1024 / 1024 # MB
prometheus_tsdb_head_chunks
## 6. 告警质量诊断
### 告警频率分析
topk(10, sum by (alertname) (increase(ALERTS[24h])))
### 告警持续时间分析
histogram_quantile(0.95,
sum by (alertname) (
rate(ALERTS_FOR_STATE[24h])
)
)
### 静默使用分析
topk(10, sum by (alertname) (increase(alertmanager_silences_total[24h])))
## 7. 性能优化诊断
### 慢查询识别
topk(10,
avg by (query) (
rate(prometheus_engine_query_duration_seconds_sum[5m]) /
rate(prometheus_engine_query_duration_seconds_count[5m])
)
)
### 高基数指标识别
topk(10, count by (__name__)({__name__=~".+"}))
### 存储增长率
rate(prometheus_tsdb_symbol_table_size_bytes[1h])
"""
def create_health_monitoring_config(self) -> str:
"""创建健康监控配置"""
return """
# 告警系统健康监控配置
groups:
- name: alerting-system-health
rules:
# Prometheus 健康检查
- alert: PrometheusDown
expr: up{job="prometheus"} == 0
for: 1m
labels:
severity: critical
component: prometheus
annotations:
summary: "Prometheus 服务不可用"
description: "Prometheus 实例 {{ $labels.instance }} 已宕机超过1分钟"
runbook_url: "https://runbooks.example.com/prometheus-down"
- alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_evaluation_duration_seconds > 30
for: 5m
labels:
severity: warning
component: prometheus
annotations:
summary: "Prometheus 规则评估缓慢"
description: "规则组 {{ $labels.rule_group }} 评估时间超过30秒"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
for: 0m
labels:
severity: critical
component: prometheus
annotations:
summary: "Prometheus 规则评估失败"
description: "规则组 {{ $labels.rule_group }} 评估失败"
# Alertmanager 健康检查
- alert: AlertmanagerDown
expr: up{job="alertmanager"} == 0
for: 1m
labels:
severity: critical
component: alertmanager
annotations:
summary: "Alertmanager 服务不可用"
description: "Alertmanager 实例 {{ $labels.instance }} 已宕机超过1分钟"
runbook_url: "https://runbooks.example.com/alertmanager-down"
- alert: AlertmanagerNotificationsFailing
expr: |
(
rate(alertmanager_notifications_failed_total[5m]) /
rate(alertmanager_notifications_total[5m])
) > 0.1
for: 5m
labels:
severity: warning
component: alertmanager
annotations:
summary: "Alertmanager 通知发送失败率高"
description: "通知发送失败率为 {{ $value | humanizePercentage }}"
- alert: AlertmanagerConfigReloadFailed
expr: alertmanager_config_last_reload_successful == 0
for: 0m
labels:
severity: critical
component: alertmanager
annotations:
summary: "Alertmanager 配置重载失败"
description: "Alertmanager 配置重载失败,请检查配置文件"
# 告警质量监控
- alert: HighAlertVolume
expr: sum(increase(ALERTS[1h])) > 100
for: 0m
labels:
severity: warning
component: alerting
annotations:
summary: "告警数量过高"
description: "过去1小时内产生了 {{ $value }} 个告警,可能存在告警风暴"
- alert: AlertStorm
expr: sum(increase(ALERTS[5m])) > 50
for: 0m
labels:
severity: critical
component: alerting
annotations:
summary: "告警风暴检测"
description: "过去5分钟内产生了 {{ $value }} 个告警,疑似告警风暴"
- alert: HighSilenceUsage
expr: |
(
sum(alertmanager_silences{state="active"}) /
sum(alertmanager_alerts)
) > 0.5
for: 10m
labels:
severity: warning
component: alerting
annotations:
summary: "静默使用率过高"
description: "当前有 {{ $value | humanizePercentage }} 的告警被静默"
# 性能监控
- alert: PrometheusHighMemoryUsage
expr: |
(
process_resident_memory_bytes{job="prometheus"} /
node_memory_MemTotal_bytes
) > 0.8
for: 5m
labels:
severity: warning
component: prometheus
annotations:
summary: "Prometheus 内存使用率高"
description: "Prometheus 内存使用率为 {{ $value | humanizePercentage }}"
- alert: PrometheusHighDiskUsage
expr: |
(
prometheus_tsdb_wal_size_bytes +
prometheus_tsdb_head_chunks_bytes
) / 1024 / 1024 / 1024 > 10 # 10GB
for: 5m
labels:
severity: warning
component: prometheus
annotations:
summary: "Prometheus 磁盘使用量高"
description: "Prometheus 存储使用量为 {{ $value | humanize }}GB"
# 连接监控
- alert: PrometheusAlertmanagerConnectionFailed
expr: prometheus_notifications_alertmanagers_discovered == 0
for: 2m
labels:
severity: critical
component: prometheus
annotations:
summary: "Prometheus 无法连接到 Alertmanager"
description: "Prometheus 未发现任何可用的 Alertmanager 实例"
- alert: PrometheusNotificationDropped
expr: increase(prometheus_notifications_dropped_total[5m]) > 0
for: 0m
labels:
severity: warning
component: prometheus
annotations:
summary: "Prometheus 通知被丢弃"
description: "过去5分钟内有 {{ $value }} 个通知被丢弃"
"""
def create_troubleshooting_runbook(self) -> str:
"""创建故障排除手册"""
return """
# 告警系统故障排除手册
## 1. 告警未触发故障排除
### 步骤1: 验证规则语法
```bash
# 使用 promtool 验证规则文件
promtool check rules /path/to/rules.yml
# 检查 Prometheus 配置
promtool check config /path/to/prometheus.yml
```
### 步骤2: 测试查询表达式
```bash
# 在 Prometheus Web UI 中测试查询
# 或使用 API 测试
curl 'http://prometheus:9090/api/v1/query?query=up'
```
### 步骤3: 检查数据可用性
```bash
# 检查目标是否正常采集
curl 'http://prometheus:9090/api/v1/query?query=up{job="your-job"}'
# 检查指标是否存在
curl 'http://prometheus:9090/api/v1/label/__name__/values'
```
### 步骤4: 验证标签匹配
```bash
# 检查标签值
curl 'http://prometheus:9090/api/v1/label/job/values'
# 测试标签选择器
curl 'http://prometheus:9090/api/v1/query?query=up{instance="target:9090"}'
```
## 2. 通知未发送故障排除
### 步骤1: 检查 Alertmanager 状态
```bash
# 检查 Alertmanager 服务状态
curl http://alertmanager:9093/-/healthy
# 查看活跃告警
curl http://alertmanager:9093/api/v1/alerts
```
### 步骤2: 验证路由配置
```bash
# 测试路由匹配
curl -X POST http://alertmanager:9093/api/v1/alerts \
  -H "Content-Type: application/json" \
  -d '[{
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning"
    }
  }]'
```
### 步骤3: 检查接收器配置
```bash
# 查看 Alertmanager 配置
curl http://alertmanager:9093/api/v1/status
# 检查通知历史
curl http://alertmanager:9093/api/v1/alerts/groups
```
### 步骤4: 测试通知渠道
```bash
# 测试邮件配置
echo "Test email" | mail -s "Test" user@example.com
# 测试 Slack webhook
curl -X POST -H 'Content-type: application/json' \
  --data '{"text":"Test message"}' \
  YOUR_SLACK_WEBHOOK_URL
```
## 3. 告警风暴处理
### 步骤1: 快速静默
```bash
# 静默所有告警(紧急情况)
amtool silence add alertname=~".*" --duration="1h" --comment="Emergency silence"
# 静默特定服务
amtool silence add service="problematic-service" --duration="30m"
```
### 步骤2: 分析告警模式
```bash
# 查看告警统计
curl 'http://prometheus:9090/api/v1/query?query=topk(10,sum by (alertname)(ALERTS))'
# 分析告警时间线
curl 'http://prometheus:9090/api/v1/query_range?query=sum(ALERTS)&start=...&end=...&step=60s'
```
### 步骤3: 识别根本原因
```bash
# 检查系统指标
curl 'http://prometheus:9090/api/v1/query?query=up'
curl 'http://prometheus:9090/api/v1/query?query=node_load1'
# 检查最近的配置变更
git log --oneline --since="1 hour ago" -- prometheus/
```
## 4. 性能问题排除
### 步骤1: 识别慢查询
```bash
# 查看查询性能
curl 'http://prometheus:9090/api/v1/query?query=topk(10,prometheus_engine_query_duration_seconds)'
# 分析规则评估时间
curl 'http://prometheus:9090/api/v1/query?query=prometheus_rule_evaluation_duration_seconds'
```
### 步骤2: 优化查询
```bash
# 使用 promtool 分析查询
promtool query instant 'your_complex_query'
# 检查高基数指标
curl 'http://prometheus:9090/api/v1/query?query=topk(10,count by (__name__)({__name__=~".+"}))'
```
### 步骤3: 资源监控
```bash
# 监控 Prometheus 资源使用
curl 'http://prometheus:9090/api/v1/query?query=process_resident_memory_bytes{job="prometheus"}'
curl 'http://prometheus:9090/api/v1/query?query=rate(process_cpu_seconds_total{job="prometheus"}[5m])'
```
## 5. 配置验证工具
### Prometheus 配置验证
```bash
#!/bin/bash
# prometheus-config-check.sh
echo "检查 Prometheus 配置..."
promtool check config /etc/prometheus/prometheus.yml
echo "检查告警规则..."
for file in /etc/prometheus/rules/*.yml; do
    echo "检查 $file"
    promtool check rules "$file"
done
echo "测试配置重载..."
curl -X POST http://localhost:9090/-/reload
```
### Alertmanager 配置验证
```bash
#!/bin/bash
# alertmanager-config-check.sh
echo "检查 Alertmanager 配置..."
amtool check-config /etc/alertmanager/alertmanager.yml
echo "测试路由配置..."
amtool config routes --config.file=/etc/alertmanager/alertmanager.yml
echo "测试配置重载..."
curl -X POST http://localhost:9093/-/reload
```
## 6. 监控脚本
### 告警系统健康检查脚本
```bash
#!/bin/bash
# alert-system-health.sh
PROMETHEUS_URL="http://localhost:9090"
ALERTMANAGER_URL="http://localhost:9093"
echo "=== 告警系统健康检查 ==="
# 检查 Prometheus
echo "检查 Prometheus 状态..."
if curl -s "$PROMETHEUS_URL/-/healthy" > /dev/null; then
    echo "✓ Prometheus 健康"
else
    echo "✗ Prometheus 不健康"
fi
# 检查 Alertmanager
echo "检查 Alertmanager 状态..."
if curl -s "$ALERTMANAGER_URL/-/healthy" > /dev/null; then
    echo "✓ Alertmanager 健康"
else
    echo "✗ Alertmanager 不健康"
fi
# 检查活跃告警
echo "检查活跃告警..."
ALERT_COUNT=$(curl -s "$ALERTMANAGER_URL/api/v1/alerts" | jq '.data | length')
echo "当前活跃告警数量: $ALERT_COUNT"
# 检查静默规则
echo "检查静默规则..."
SILENCE_COUNT=$(curl -s "$ALERTMANAGER_URL/api/v1/silences" | jq '.data | length')
echo "当前静默规则数量: $SILENCE_COUNT"
echo "=== 检查完成 ==="
```
"""
# 使用示例
troubleshooting_manager = AlertTroubleshootingManager()
# 获取故障排除信息
common_issues = troubleshooting_manager.get_common_alerting_issues()
diagnostic_queries = troubleshooting_manager.generate_diagnostic_queries()
health_monitoring = troubleshooting_manager.create_health_monitoring_config()
troubleshooting_runbook = troubleshooting_manager.create_troubleshooting_runbook()
print("告警故障排除指南已生成")
print(f"常见问题数量: {len(common_issues)}")
print(f"诊断查询长度: {len(diagnostic_queries)} 字符")
print(f"健康监控配置长度: {len(health_monitoring)} 字符")
print(f"故障排除手册长度: {len(troubleshooting_runbook)} 字符")
print("\n常见问题示例:")
for issue, details in list(common_issues.items())[:2]:
    print(f"- {issue}: {details['症状']}")
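上面的健康检查脚本是 bash 版本,也可以用 Python 标准库实现同样的探测,便于嵌入已有的运维工具(端点地址为示例值):

```python
import urllib.request

def check_endpoint(name: str, url: str) -> bool:
    """访问健康检查接口,打印并返回是否可用"""
    try:
        with urllib.request.urlopen(url, timeout=5) as response:
            healthy = response.status == 200
    except OSError:
        healthy = False
    print(f"{'✓' if healthy else '✗'} {name}: {url}")
    return healthy

# 对应上文 bash 健康检查脚本的 Python 写法
check_endpoint("Prometheus", "http://localhost:9090/-/healthy")
check_endpoint("Alertmanager", "http://localhost:9093/-/healthy")
```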
总结
通过本章的学习,我们全面掌握了 Prometheus 告警与通知系统的各个方面:
🎯 核心要点
告警规则设计
- 基于症状而非原因的告警策略
- 合理的阈值设置和时间窗口
- 清晰的告警命名和分类
- 完整的告警元数据和文档
Alertmanager 配置
- 灵活的路由和分组策略
- 多样化的通知渠道配置
- 智能的抑制和静默机制
- 高级的升级和时间窗口管理
通知渠道管理
- 邮件、Slack、企业微信等多种渠道
- 个性化的通知模板
- 基于场景的通知策略
- 通知的可靠性和及时性保障
高级功能应用
- 动态路由和智能分发
- 告警升级和自动化处理
- 维护窗口和计划停机管理
- 告警质量监控和优化
🚀 最佳实践
设计原则
- 告警应该是可操作的
- 避免告警疲劳
- 基于业务影响设置优先级
- 持续优化和改进
运维管理
- 定期审查告警规则
- 监控告警系统健康状况
- 建立完善的故障排除流程
- 团队培训和知识分享
技术实现
- 使用版本控制管理配置
- 自动化测试和验证
- 监控和日志记录
- 性能优化和扩展性考虑
📚 下一步学习
深入学习
- Prometheus 查询语言 (PromQL) 高级用法
- 自定义 Exporter 开发
- 告警规则的自动化测试
- 大规模部署的性能优化
实践项目
- 构建完整的监控告警系统
- 集成现有的运维工具链
- 开发自定义通知渠道
- 实现智能告警分析
扩展学习
- Grafana 可视化集成
- 日志监控和分析
- 分布式追踪系统
- 云原生监控解决方案
通过系统学习和实践,你已经具备了构建和管理企业级告警系统的能力。记住,优秀的告警系统不仅仅是技术实现,更需要结合业务理解、团队协作和持续改进的运维文化。
恭喜!你已经完成了 Prometheus 告警与通知系统的学习。 🎉
继续探索 Prometheus 生态系统的其他组件,构建更加完善的监控解决方案!