概述

告警是监控系统的核心功能之一,Prometheus通过Alertmanager组件提供了强大的告警管理能力。本章将深入介绍如何配置和管理Prometheus告警系统,包括告警规则编写、Alertmanager配置、通知渠道设置和告警策略管理。

学习目标

通过本章学习,你将掌握:

  1. 告警规则编写:学会编写有效的告警规则
  2. Alertmanager配置:掌握告警管理器的配置方法
  3. 通知渠道设置:配置多种通知方式(邮件、Slack、钉钉等)
  4. 告警策略管理:实现告警分组、抑制和静默
  5. 告警最佳实践:避免告警疲劳,提高告警质量

告警规则基础

1. 告警规则结构

from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Optional
import yaml

class AlertSeverity(Enum):
    """告警严重级别"""
    CRITICAL = "critical"
    WARNING = "warning"
    INFO = "info"

class AlertState(Enum):
    """告警状态"""
    PENDING = "pending"
    FIRING = "firing"
    RESOLVED = "resolved"

@dataclass
class AlertRule:
    """告警规则数据结构"""
    name: str
    expr: str
    duration: str
    severity: AlertSeverity
    summary: str
    description: str
    labels: Dict[str, str]
    annotations: Dict[str, str]

class AlertRuleManager:
    """告警规则管理器"""
    
    def __init__(self):
        self.rules = []
        self.rule_groups = {}
    
    def create_basic_rules(self) -> str:
        """创建基础告警规则"""
        return """
# 基础告警规则组
groups:
  - name: basic_alerts
    rules:
      # 实例宕机告警
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          team: infrastructure
        annotations:
          summary: "实例 {{ $labels.instance }} 已宕机"
          description: "实例 {{ $labels.instance }} 在过去1分钟内无法访问"
          runbook_url: "https://wiki.example.com/runbooks/instance-down"
      
      # 高CPU使用率告警
      - alert: HighCPUUsage
        expr: |
          100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "实例 {{ $labels.instance }} CPU使用率过高"
          description: "实例 {{ $labels.instance }} CPU使用率为 {{ $value }}%,持续5分钟"
          runbook_url: "https://wiki.example.com/runbooks/high-cpu"
      
      # 高内存使用率告警
      - alert: HighMemoryUsage
        expr: |
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "实例 {{ $labels.instance }} 内存使用率过高"
          description: "实例 {{ $labels.instance }} 内存使用率为 {{ $value }}%,持续5分钟"
          runbook_url: "https://wiki.example.com/runbooks/high-memory"
      
      # 磁盘空间不足告警
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 1m
        labels:
          severity: critical
          team: infrastructure
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘空间不足"
          description: "实例 {{ $labels.instance }} 挂载点 {{ $labels.mountpoint }} 磁盘使用率为 {{ $value }}%"
          runbook_url: "https://wiki.example.com/runbooks/disk-space"
      
      # 磁盘IO等待时间过长
      - alert: HighDiskIOWait
        expr: |
          avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
        for: 5m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘IO等待时间过长"
          description: "实例 {{ $labels.instance }} IO等待时间为 {{ $value }}%,持续5分钟"
          runbook_url: "https://wiki.example.com/runbooks/high-iowait"
"""
    
    def create_application_rules(self) -> str:
        """创建应用程序告警规则"""
        return """
# 应用程序告警规则组
groups:
  - name: application_alerts
    rules:
      # HTTP错误率过高
      - alert: HighHTTPErrorRate
        expr: |
          sum(rate(http_requests_total{status_code=~"[45].."}[5m])) by (job, instance)
          /
          sum(rate(http_requests_total[5m])) by (job, instance) * 100 > 5
        for: 2m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "服务 {{ $labels.job }} 错误率过高"
          description: "服务 {{ $labels.job }} 在实例 {{ $labels.instance }} 上的错误率为 {{ $value }}%"
          runbook_url: "https://wiki.example.com/runbooks/high-error-rate"
      
      # HTTP响应时间过长
      - alert: HighHTTPLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (job, instance, le)
          ) > 0.5
        for: 3m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "服务 {{ $labels.job }} 响应时间过长"
          description: "服务 {{ $labels.job }} 在实例 {{ $labels.instance }} 上的P95响应时间为 {{ $value }}秒"
          runbook_url: "https://wiki.example.com/runbooks/high-latency"
      
      # 服务请求量异常下降
      - alert: LowRequestRate
        expr: |
          sum(rate(http_requests_total[5m])) by (job) < 10
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "服务 {{ $labels.job }} 请求量异常下降"
          description: "服务 {{ $labels.job }} 的请求量为 {{ $value }} req/s,可能存在问题"
          runbook_url: "https://wiki.example.com/runbooks/low-request-rate"
      
      # 数据库连接池耗尽
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 90
        for: 1m
        labels:
          severity: critical
          team: database
        annotations:
          summary: "数据库 {{ $labels.instance }} 连接池即将耗尽"
          description: "数据库 {{ $labels.instance }} 连接使用率为 {{ $value }}%"
          runbook_url: "https://wiki.example.com/runbooks/db-connections"
      
      # 队列积压过多
      - alert: HighQueueBacklog
        expr: |
          queue_size > 1000
        for: 2m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "队列 {{ $labels.queue_name }} 积压过多"
          description: "队列 {{ $labels.queue_name }} 当前积压 {{ $value }} 个任务"
          runbook_url: "https://wiki.example.com/runbooks/queue-backlog"
"""
    
    def create_business_rules(self) -> str:
        """创建业务指标告警规则"""
        return """
# 业务指标告警规则组
groups:
  - name: business_alerts
    rules:
      # 订单量异常下降
      - alert: LowOrderRate
        expr: |
          sum(rate(orders_total[10m])) < 5
        for: 5m
        labels:
          severity: warning
          team: business
        annotations:
          summary: "订单量异常下降"
          description: "过去10分钟订单量为 {{ $value }} 单/分钟,低于正常水平"
          runbook_url: "https://wiki.example.com/runbooks/low-orders"
      
      # 支付失败率过高
      - alert: HighPaymentFailureRate
        expr: |
          sum(rate(payment_failures_total[5m]))
          /
          sum(rate(payment_attempts_total[5m])) * 100 > 2
        for: 3m
        labels:
          severity: critical
          team: payment
        annotations:
          summary: "支付失败率过高"
          description: "支付失败率为 {{ $value }}%,超过阈值"
          runbook_url: "https://wiki.example.com/runbooks/payment-failures"
      
      # 用户注册量异常
      - alert: AbnormalUserRegistration
        expr: |
          abs(sum(increase(user_registrations_total[1h])) - sum(increase(user_registrations_total[1h] offset 24h))) > 50
        for: 10m
        labels:
          severity: info
          team: growth
        annotations:
          summary: "用户注册量异常"
          description: "当前小时注册量与昨日同期相比差异为 {{ $value }} 人"
          runbook_url: "https://wiki.example.com/runbooks/registration-anomaly"
      
      # 库存不足告警
      - alert: LowInventory
        expr: |
          inventory_quantity < 10
        for: 0s
        labels:
          severity: warning
          team: inventory
        annotations:
          summary: "商品 {{ $labels.product_id }} 库存不足"
          description: "商品 {{ $labels.product_id }} 当前库存为 {{ $value }} 件"
          runbook_url: "https://wiki.example.com/runbooks/low-inventory"
"""
    
    def create_security_rules(self) -> str:
        """创建安全相关告警规则"""
        return """
# 安全告警规则组
groups:
  - name: security_alerts
    rules:
      # 异常登录尝试
      - alert: HighFailedLoginAttempts
        expr: |
          sum(rate(login_failures_total[5m])) by (source_ip) > 10
        for: 1m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "IP {{ $labels.source_ip }} 异常登录尝试"
          description: "IP {{ $labels.source_ip }} 在5分钟内失败登录 {{ $value }} 次"
          runbook_url: "https://wiki.example.com/runbooks/failed-logins"
      
      # 可疑API调用
      - alert: SuspiciousAPIUsage
        expr: |
          sum(rate(api_requests_total[1m])) by (api_key, endpoint) > 100
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "API密钥 {{ $labels.api_key }} 使用异常"
          description: "API密钥 {{ $labels.api_key }} 对端点 {{ $labels.endpoint }} 的调用频率为 {{ $value }} req/min"
          runbook_url: "https://wiki.example.com/runbooks/api-abuse"
      
      # SSL证书即将过期
      - alert: SSLCertificateExpiringSoon
        expr: |
          (ssl_certificate_expiry_timestamp - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "SSL证书即将过期"
          description: "域名 {{ $labels.domain }} 的SSL证书将在 {{ $value }} 天后过期"
          runbook_url: "https://wiki.example.com/runbooks/ssl-expiry"
      
      # 异常文件访问
      - alert: UnauthorizedFileAccess
        expr: |
          sum(rate(file_access_denied_total[5m])) by (user, file_path) > 5
        for: 1m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "用户 {{ $labels.user }} 异常文件访问"
          description: "用户 {{ $labels.user }} 尝试访问 {{ $labels.file_path }},被拒绝 {{ $value }} 次"
          runbook_url: "https://wiki.example.com/runbooks/file-access"
"""
    
    def generate_rule_validation_script(self) -> str:
        """生成规则验证脚本"""
        return """
#!/bin/bash
# Prometheus告警规则验证脚本

set -e

RULES_DIR="/etc/prometheus/rules"
PROMETHEUS_URL="http://localhost:9090"
TEMP_DIR="/tmp/prometheus-rules-test"

echo "开始验证Prometheus告警规则..."

# 创建临时目录
mkdir -p "$TEMP_DIR"

# 验证规则文件语法
echo "1. 验证规则文件语法"
for rule_file in "$RULES_DIR"/*.yml; do
    if [ -f "$rule_file" ]; then
        echo "验证文件: $rule_file"
        if promtool check rules "$rule_file"; then
            echo "✓ $rule_file 语法正确"
        else
            echo "✗ $rule_file 语法错误"
            exit 1
        fi
    fi
done

# 验证Prometheus连接
echo -e "\n2. 验证Prometheus连接"
if promtool query instant "$PROMETHEUS_URL" 'up' > /dev/null; then
    echo "✓ Prometheus连接正常"
else
    echo "✗ 无法连接到Prometheus"
    exit 1
fi

# 测试规则表达式
echo "\n3. 测试规则表达式"
cat > $TEMP_DIR/test_queries.txt << 'EOF'
up == 0
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
sum(rate(http_requests_total{status_code=~"[45].."}[5m])) by (job, instance) / sum(rate(http_requests_total[5m])) by (job, instance) * 100 > 5
EOF

while IFS= read -r query; do
    echo "测试查询: $query"
    if promtool query instant "$PROMETHEUS_URL" "$query" > /dev/null 2>&1; then
        echo "✓ 查询执行成功"
    else
        echo "⚠ 查询可能有问题(可能是因为没有相关指标)"
    fi
done < "$TEMP_DIR/test_queries.txt"

# 清理临时文件
rm -rf "$TEMP_DIR"

echo -e "\n告警规则验证完成!"
"""
    
    def generate_rule_best_practices(self) -> List[str]:
        """生成告警规则最佳实践"""
        return [
            "✓ 使用有意义的告警名称和描述",
            "✓ 设置合适的for持续时间避免误报",
            "✓ 包含runbook_url指向处理文档",
            "✓ 使用标准的severity标签",
            "✓ 在annotations中提供足够的上下文信息",
            "✓ 避免过于复杂的PromQL表达式",
            "✓ 使用记录规则简化复杂查询",
            "✓ 定期审查和更新告警阈值",
            "✓ 为不同团队设置不同的标签",
            "✓ 测试告警规则的有效性",
            "✓ 避免告警风暴和级联告警",
            "✓ 使用模板变量提高可读性",
            "✓ 考虑业务影响设置优先级",
            "✓ 实施告警规则版本控制",
            "✓ 监控告警规则的性能影响"
        ]

# 使用示例
rule_manager = AlertRuleManager()

# 生成各类告警规则
basic_rules = rule_manager.create_basic_rules()
app_rules = rule_manager.create_application_rules()
business_rules = rule_manager.create_business_rules()
security_rules = rule_manager.create_security_rules()

print("告警规则已生成")
print(f"\n基础规则示例:")
print(basic_rules[:300] + "...")

print(f"\n应用规则示例:")
print(app_rules[:300] + "...")

# 生成验证脚本
validation_script = rule_manager.generate_rule_validation_script()
print(f"\n验证脚本长度: {len(validation_script)} 字符")

# 获取最佳实践
best_practices = rule_manager.generate_rule_best_practices()
print(f"\n最佳实践数量: {len(best_practices)}")
for practice in best_practices[:5]:
    print(practice)
print("...")

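除了上面脚本中的语法检查,promtool 还支持对告警规则做单元测试(promtool test rules),可以在不接入真实数据的情况下验证规则是否按预期触发。下面是针对 InstanceDown 规则的一个测试文件示意,其中规则文件路径与样本序列均为假设:

# tests/basic_alerts_test.yml
rule_files:
  - ../basic_alerts.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    # 模拟实例从第3个采样点开始失联
    input_series:
      - series: 'up{job="node", instance="server-01:9100"}'
        values: '1 1 0 0 0'
    alert_rule_test:
      - eval_time: 4m
        alertname: InstanceDown
        exp_alerts:
          - exp_labels:
              severity: critical
              team: infrastructure
              job: node
              instance: server-01:9100
            exp_annotations:
              summary: "实例 server-01:9100 已宕机"
              description: "实例 server-01:9100 在过去1分钟内无法访问"
              runbook_url: "https://wiki.example.com/runbooks/instance-down"

运行 promtool test rules tests/basic_alerts_test.yml 即可在 CI 中自动验证规则逻辑。
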
Alertmanager配置

1. 基础配置

class AlertmanagerConfig:
    """Alertmanager配置管理器"""
    
    def __init__(self):
        self.config = {}
        self.routes = []
        self.receivers = []
        self.inhibit_rules = []
    
    def generate_basic_config(self) -> str:
        """生成基础Alertmanager配置"""
        return """
# Alertmanager基础配置
global:
  # SMTP配置
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: 'your-password'
  smtp_require_tls: true
  
  # Slack配置
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  
  # 全局标签
  external_labels:
    cluster: 'production'
    environment: 'prod'

# 模板文件
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# 路由配置
route:
  # 默认接收器
  receiver: 'default'
  
  # 分组配置
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  
  # 子路由
  routes:
    # 关键告警立即发送
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
      group_interval: 5m
      repeat_interval: 30m
    
    # 基础设施告警
    - match:
        team: infrastructure
      receiver: 'infrastructure-team'
      group_by: ['alertname', 'instance']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 2h
    
    # 应用程序告警
    - match:
        team: backend
      receiver: 'backend-team'
      group_by: ['alertname', 'service']
      group_wait: 1m
      group_interval: 10m
      repeat_interval: 4h
    
    # 安全告警
    - match:
        team: security
      receiver: 'security-team'
      group_wait: 0s
      group_interval: 1m
      repeat_interval: 15m
    
    # 业务告警
    - match:
        team: business
      receiver: 'business-team'
      group_by: ['alertname']
      group_wait: 5m
      group_interval: 30m
      repeat_interval: 12h

# 接收器配置
receivers:
  # 默认接收器
  - name: 'default'
    email_configs:
      - to: 'admin@example.com'
        subject: '[ALERT] {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          标签: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
          时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ end }}
  
  # 关键告警接收器
  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@example.com'
        subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
        body: |
          🚨 关键告警 🚨
          
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          严重级别: {{ .Labels.severity }}
          影响服务: {{ if .Labels.service }}{{ .Labels.service }}{{ else }}未知{{ end }}
          处理手册: {{ if .Annotations.runbook_url }}{{ .Annotations.runbook_url }}{{ else }}无{{ end }}
          开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ end }}
    
    slack_configs:
      - channel: '#critical-alerts'
        title: '🚨 关键告警'
        text: |
          {{ range .Alerts }}
          *告警*: {{ .Annotations.summary }}
          *描述*: {{ .Annotations.description }}
          *标签*: {{ range .Labels.SortedPairs }}`{{ .Name }}`={{ .Value }} {{ end }}
          {{ end }}
        send_resolved: true
  
  # 基础设施团队接收器
  - name: 'infrastructure-team'
    email_configs:
      - to: 'infra-team@example.com'
        subject: '[INFRA] {{ .GroupLabels.alertname }}'
    
    slack_configs:
      - channel: '#infrastructure'
        title: '🔧 基础设施告警'
        send_resolved: true
  
  # 后端团队接收器
  - name: 'backend-team'
    email_configs:
      - to: 'backend-team@example.com'
        subject: '[BACKEND] {{ .GroupLabels.alertname }}'
    
    slack_configs:
      - channel: '#backend-alerts'
        title: '💻 后端服务告警'
        send_resolved: true
  
  # 安全团队接收器
  - name: 'security-team'
    email_configs:
      - to: 'security-team@example.com'
        subject: '[SECURITY] {{ .GroupLabels.alertname }}'
    
    slack_configs:
      - channel: '#security-alerts'
        title: '🔒 安全告警'
        send_resolved: true
  
  # 业务团队接收器
  - name: 'business-team'
    email_configs:
      - to: 'business-team@example.com'
        subject: '[BUSINESS] {{ .GroupLabels.alertname }}'
    
    slack_configs:
      - channel: '#business-metrics'
        title: '📊 业务指标告警'
        send_resolved: true

# 抑制规则
inhibit_rules:
  # 实例宕机时抑制其他告警
  - source_match:
      alertname: 'InstanceDown'
    target_match_re:
      alertname: '(HighCPUUsage|HighMemoryUsage|DiskSpaceLow)'
    equal: ['instance']
  
  # 关键告警抑制警告告警
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
  
  # 服务不可用时抑制性能告警
  - source_match:
      alertname: 'ServiceUnavailable'
    target_match_re:
      alertname: '(HighLatency|HighErrorRate)'
    equal: ['service']
"""
    
    def generate_advanced_config(self) -> str:
        """生成高级Alertmanager配置"""
        return """
# Alertmanager高级配置
global:
  # 解析超时
  resolve_timeout: 5m
  
  # HTTP配置
  http_config:
    proxy_url: 'http://proxy.example.com:8080'
    tls_config:
      insecure_skip_verify: false
  
  # SMTP配置(支持多个SMTP服务器)
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password_file: '/etc/alertmanager/smtp_password'
  smtp_require_tls: true
  smtp_hello: 'alertmanager.example.com'
  
  # 企业微信配置
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_secret: 'your-wechat-secret'
  wechat_api_corp_id: 'your-corp-id'
  
  # PagerDuty配置
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
  
  # OpsGenie配置
  opsgenie_api_url: 'https://api.opsgenie.com/'
  opsgenie_api_key_file: '/etc/alertmanager/opsgenie_key'

# 模板配置
templates:
  - '/etc/alertmanager/templates/*.tmpl'
  - '/etc/alertmanager/custom-templates/*.tmpl'

# 高级路由配置
route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  
  routes:
    # 时间敏感的路由(工作时间)
    # active_time_intervals 只控制通知的发送时间,不参与路由匹配,
    # 因此这里用 continue: true 让告警继续匹配下面的非工作时间路由
    - match:
        severity: critical
      receiver: 'critical-business-hours'
      active_time_intervals:
        - 'business-hours'
      continue: true
      group_wait: 0s
      repeat_interval: 15m
    
    # 非工作时间的关键告警
    - match:
        severity: critical
      receiver: 'critical-after-hours'
      active_time_intervals:
        - 'after-hours'
      group_wait: 0s
      repeat_interval: 30m
    
    # 基于标签的复杂路由
    - match_re:
        service: '(api|web|database)'
        environment: 'production'
      receiver: 'production-services'
      routes:
        # API服务特殊处理
        - match:
            service: api
          receiver: 'api-team'
          group_by: ['alertname', 'endpoint']
        
        # 数据库告警
        - match:
            service: database
          receiver: 'dba-team'
          group_by: ['alertname', 'database']
    
    # 测试环境告警(降低优先级)
    - match:
        environment: 'staging'
      receiver: 'staging-alerts'
      group_wait: 5m
      group_interval: 30m
      repeat_interval: 24h
    
    # 开发环境告警(仅记录)
    - match:
        environment: 'development'
      receiver: 'dev-alerts'
      group_wait: 10m
      repeat_interval: 8760h  # 设为足够长的间隔,等效于不重复发送

# 时间间隔定义
time_intervals:
  - name: 'business-hours'
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '18:00'
        weekdays: ['monday:friday']
        location: 'Asia/Shanghai'
  
  - name: 'after-hours'
    time_intervals:
      - times:
          - start_time: '00:00'
            end_time: '09:00'
          - start_time: '18:00'
            end_time: '24:00'
        weekdays: ['monday:friday']
        location: 'Asia/Shanghai'
      - weekdays: ['saturday', 'sunday']
        location: 'Asia/Shanghai'
  
  - name: 'maintenance-window'
    time_intervals:
      - times:
          - start_time: '02:00'
            end_time: '04:00'
        weekdays: ['sunday']
        location: 'Asia/Shanghai'

# 高级接收器配置
receivers:
  - name: 'default'
    webhook_configs:
      - url: 'http://alertmanager-webhook:8080/webhook'
        send_resolved: true
        http_config:
          bearer_token_file: '/etc/alertmanager/webhook_token'
  
  # 工作时间关键告警
  - name: 'critical-business-hours'
    email_configs:
      - to: 'oncall@example.com'
        subject: '[URGENT] {{ .GroupLabels.alertname }}'
        headers:
          Priority: 'high'
          X-Priority: '1'
    
    slack_configs:
      - api_url_file: '/etc/alertmanager/slack_webhook'
        channel: '#critical-alerts'
        title: '🚨 紧急告警 - 工作时间'
        text: |
          <!channel> 紧急告警需要立即处理
          {{ range .Alerts }}
          *告警*: {{ .Annotations.summary }}
          *描述*: {{ .Annotations.description }}
          *处理手册*: {{ .Annotations.runbook_url }}
          {{ end }}
    
    pagerduty_configs:
      - routing_key_file: '/etc/alertmanager/pagerduty_key'
        description: '{{ .GroupLabels.alertname }}'
        severity: 'critical'
        details:
          summary: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
          source: '{{ .GroupLabels.instance }}'
  
  # 非工作时间关键告警
  - name: 'critical-after-hours'
    email_configs:
      - to: 'oncall-night@example.com'
        subject: '[CRITICAL-NIGHT] {{ .GroupLabels.alertname }}'
    
    slack_configs:
      - channel: '#critical-alerts'
        title: '🌙 夜间关键告警'
    
    wechat_configs:
      - corp_id: 'your-corp-id'
        to_user: '@all'
        agent_id: 'your-agent-id'
        api_secret_file: '/etc/alertmanager/wechat_secret'
        message: |
          【夜间紧急告警】
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          时间: {{ .StartsAt.Format "15:04:05" }}
          {{ end }}
  
  # API团队接收器
  - name: 'api-team'
    slack_configs:
      - channel: '#api-team'
        title: '🔌 API服务告警'
        fields:
          - title: '服务'
            value: '{{ .GroupLabels.service }}'
            short: true
          - title: '端点'
            value: '{{ .GroupLabels.endpoint }}'
            short: true
    
    webhook_configs:
      - url: 'http://api-monitoring:8080/alerts'
        http_config:
          basic_auth:
            username: 'alertmanager'
            password_file: '/etc/alertmanager/api_webhook_password'
  
  # DBA团队接收器
  - name: 'dba-team'
    email_configs:
      - to: 'dba-team@example.com'
        subject: '[DATABASE] {{ .GroupLabels.alertname }}'
        body: |
          数据库告警详情:
          
          {{ range .Alerts }}
          数据库: {{ .Labels.database }}
          实例: {{ .Labels.instance }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          
          查询语句: {{ if .Labels.query }}{{ .Labels.query }}{{ else }}无{{ end }}
          表名: {{ if .Labels.table }}{{ .Labels.table }}{{ else }}无{{ end }}
          
          开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ if not .EndsAt.IsZero }}结束时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}{{ end }}
          {{ end }}

# 高级抑制规则
inhibit_rules:
  # 维护窗口期间抑制所有告警
  - source_match:
      alertname: 'MaintenanceMode'
    target_match_re:
      alertname: '.*'
    equal: ['cluster']
  
  # 网络分区时抑制实例告警
  - source_match:
      alertname: 'NetworkPartition'
    target_match_re:
      alertname: '(InstanceDown|HighLatency)'
    equal: ['datacenter']
  
  # 负载均衡器故障时抑制后端告警
  - source_match:
      alertname: 'LoadBalancerDown'
    target_match_re:
      alertname: '(ServiceUnavailable|HighErrorRate)'
    equal: ['service']
  
  # 存储故障时抑制相关告警
  - source_match:
      alertname: 'StorageFailure'
    target_match_re:
      alertname: '(DiskSpaceLow|HighDiskIOWait|DatabaseConnectionIssues)'
    equal: ['storage_cluster']
"""

# 使用示例
alertmanager_config = AlertmanagerConfig()

# 生成配置
basic_config = alertmanager_config.generate_basic_config()
advanced_config = alertmanager_config.generate_advanced_config()

print("Alertmanager配置已生成")
print(f"\n基础配置长度: {len(basic_config)} 字符")
print(f"高级配置长度: {len(advanced_config)} 字符")

print(f"\n基础配置示例:")
print(basic_config[:400] + "...")
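
生成的 Alertmanager 配置在下发前应先做校验。下面是一个最小的校验草图(假设执行环境已安装 amtool):把配置写入临时文件后调用 amtool check-config,由它检查配置语法和能解析到的模板:

import os
import subprocess
import tempfile

def validate_alertmanager_config(config_text: str) -> bool:
    """把配置写入临时文件并调用 amtool check-config 校验(示意实现)"""
    with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as f:
        f.write(config_text)
        config_path = f.name
    try:
        result = subprocess.run(
            ["amtool", "check-config", config_path],
            capture_output=True, text=True,
        )
        # 打印 amtool 的解析结果,便于人工核对 receiver 与模板情况
        print(result.stdout or result.stderr)
        return result.returncode == 0
    finally:
        os.unlink(config_path)

# 用法示意
if validate_alertmanager_config(basic_config):
    print("基础配置校验通过")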

通知渠道配置

NotificationChannelManager 类

from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Optional, Any

class ChannelType(Enum):
    EMAIL = "email"
    SLACK = "slack"
    WEBHOOK = "webhook"
    PAGERDUTY = "pagerduty"
    WECHAT = "wechat"
    DINGTALK = "dingtalk"
    TELEGRAM = "telegram"
    DISCORD = "discord"

@dataclass
class NotificationChannel:
    name: str
    type: ChannelType
    config: Dict[str, Any]
    enabled: bool = True
    
class NotificationChannelManager:
    def __init__(self):
        self.channels = {}
    
    def create_email_config(self, smtp_server: str, port: int = 587, 
                           username: str = "", password: str = "",
                           from_addr: str = "", to_addrs: List[str] = None) -> Dict[str, Any]:
        """创建邮件通知配置"""
        return {
            "global": {
                "smtp_smarthost": f"{smtp_server}:{port}",
                "smtp_from": from_addr,
                "smtp_auth_username": username,
                "smtp_auth_password": password,
                "smtp_require_tls": True
            },
            "receivers": [{
                "name": "email-notifications",
                "email_configs": [{
                    "to": ", ".join(to_addrs or []),
                    "subject": "[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}",
                    "body": """
告警详情:
{{ range .Alerts }}
告警名称: {{ .Annotations.summary }}
告警描述: {{ .Annotations.description }}
告警级别: {{ .Labels.severity }}
实例: {{ .Labels.instance }}
开始时间: {{ .StartsAt }}
{{ end }}
                    """
                }]
            }]
        }
    
    def create_slack_config(self, webhook_url: str, channel: str = "#alerts",
                           username: str = "Prometheus") -> Dict[str, Any]:
        """创建 Slack 通知配置"""
        return {
            "receivers": [{
                "name": "slack-notifications",
                "slack_configs": [{
                    "api_url": webhook_url,
                    "channel": channel,
                    "username": username,
                    "title": "{{ .GroupLabels.alertname }}",
                    "text": """
{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*描述:* {{ .Annotations.description }}
*级别:* {{ .Labels.severity }}
*实例:* {{ .Labels.instance }}
*时间:* {{ .StartsAt }}
{{ end }}
                    """,
                    "color": "{{ if eq .Status \"firing\" }}danger{{ else }}good{{ end }}"
                }]
            }]
        }
    
    def create_webhook_config(self, url: str, headers: Dict[str, str] = None) -> Dict[str, Any]:
        """创建 Webhook 通知配置"""
        config = {
            "receivers": [{
                "name": "webhook-notifications",
                "webhook_configs": [{
                    "url": url,
                    "send_resolved": True,
                    "http_config": {
                        "basic_auth": {
                            "username": "prometheus",
                            "password": "secret"
                        }
                    }
                }]
            }]
        }
        
        if headers:
            config["receivers"][0]["webhook_configs"][0]["http_config"]["headers"] = headers
            
        return config
    
    def create_telegram_config(self, bot_token: str, chat_id: str) -> Dict[str, Any]:
        """创建 Telegram 通知配置"""
        return {
            "receivers": [{
                "name": "telegram-notifications",
                "telegram_configs": [{
                    "bot_token": bot_token,
                    "chat_id": chat_id,
                    "message": """
🚨 *Prometheus 告警*

{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*描述:* {{ .Annotations.description }}
*级别:* {{ .Labels.severity }}
*实例:* {{ .Labels.instance }}
*时间:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
                    """,
                    "parse_mode": "Markdown"
                }]
            }]
        }
    
    def create_discord_config(self, webhook_url: str) -> Dict[str, Any]:
        """创建 Discord 通知配置"""
        return {
            "receivers": [{
                "name": "discord-notifications",
                "discord_configs": [{
                    "webhook_url": webhook_url,
                    "title": "Prometheus 告警",
                    "message": """
{{ range .Alerts }}
**告警:** {{ .Annotations.summary }}
**描述:** {{ .Annotations.description }}
**级别:** {{ .Labels.severity }}
**实例:** {{ .Labels.instance }}
**时间:** {{ .StartsAt }}
{{ end }}
                    """
                }]
            }]
        }
    
    def generate_multi_channel_config(self) -> str:
        """生成多渠道通知配置示例"""
        return """
# 多渠道通知配置示例
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: 'password'
  
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_corp_id: 'your-corp-id'

route:
  receiver: 'default'
  group_by: ['alertname']
  routes:
    # 关键告警 - 多渠道通知
    - match:
        severity: critical
      receiver: 'critical-multi-channel'
      group_wait: 0s
      repeat_interval: 15m
    
    # 警告告警 - Slack + 邮件
    - match:
        severity: warning
      receiver: 'warning-notifications'
      group_wait: 5m
      repeat_interval: 2h
    
    # 信息告警 - 仅Slack
    - match:
        severity: info
      receiver: 'info-notifications'
      group_wait: 10m
      repeat_interval: 12h

receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@example.com'
        subject: '[ALERT] {{ .GroupLabels.alertname }}'
  
  # 关键告警多渠道
  - name: 'critical-multi-channel'
    email_configs:
      - to: 'oncall@example.com,manager@example.com'
        subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
        body: |
          🚨 关键告警 🚨
          
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          实例: {{ .Labels.instance }}
          开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          处理手册: {{ .Annotations.runbook_url }}
          {{ end }}
    
    slack_configs:
      - channel: '#critical-alerts'
        title: '🚨 关键告警'
        text: |
          <!channel> 关键告警需要立即处理
          {{ range .Alerts }}
          *告警*: {{ .Annotations.summary }}
          *描述*: {{ .Annotations.description }}
          *实例*: {{ .Labels.instance }}
          *处理手册*: {{ .Annotations.runbook_url }}
          {{ end }}
        send_resolved: true
    
    wechat_configs:
      - agent_id: 'your-agent-id'
        api_secret: 'your-api-secret'
        to_user: '@all'
        message: |
          【紧急告警】
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          实例: {{ .Labels.instance }}
          时间: {{ .StartsAt.Format "15:04" }}
          {{ end }}
    
    pagerduty_configs:
      - routing_key: 'your-pagerduty-key'
        description: '{{ .GroupLabels.alertname }}'
        severity: 'critical'
  
  # 警告告警
  - name: 'warning-notifications'
    email_configs:
      - to: 'team@example.com'
        subject: '[WARNING] {{ .GroupLabels.alertname }}'
    
    slack_configs:
      - channel: '#alerts'
        title: '⚠️ 警告告警'
        send_resolved: true
  
  # 信息告警
  - name: 'info-notifications'
    slack_configs:
      - channel: '#monitoring'
        title: 'ℹ️ 信息告警'
        send_resolved: true
"""

# 使用示例
channel_manager = NotificationChannelManager()

# 创建各种通知渠道配置
email_config = channel_manager.create_email_config(
    smtp_server="smtp.example.com",
    from_addr="alerts@example.com",
    to_addrs=["admin@example.com", "team@example.com"]
)

slack_config = channel_manager.create_slack_config(
    webhook_url="https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
    channel="#alerts"
)

telegram_config = channel_manager.create_telegram_config(
    bot_token="your-bot-token",
    chat_id="your-chat-id"
)

print("通知渠道配置已生成")
print(f"邮件配置: {len(str(email_config))} 字符")
print(f"Slack配置: {len(str(slack_config))} 字符")
print(f"Telegram配置: {len(str(telegram_config))} 字符")

告警模板系统

AlertTemplateManager 类

class AlertTemplateManager:
    """告警模板管理器"""
    
    def __init__(self):
        self.templates = {}
    
    def create_email_templates(self) -> Dict[str, str]:
        """创建邮件模板"""
        return {
            "critical.tmpl": """
{{ define "email.critical.subject" }}
[🚨 CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.instance }}
{{ end }}

{{ define "email.critical.body" }}
<!DOCTYPE html>
<html>
<head>
    <style>
        body { font-family: Arial, sans-serif; }
        .critical { background-color: #ff4444; color: white; padding: 10px; }
        .alert-info { background-color: #f5f5f5; padding: 15px; margin: 10px 0; }
        .label { font-weight: bold; }
    </style>
</head>
<body>
    <div class="critical">
        <h2>🚨 关键告警通知</h2>
    </div>
    
    {{ range .Alerts }}
    <div class="alert-info">
        <h3>{{ .Annotations.summary }}</h3>
        <p><span class="label">描述:</span> {{ .Annotations.description }}</p>
        <p><span class="label">实例:</span> {{ .Labels.instance }}</p>
        <p><span class="label">服务:</span> {{ .Labels.job }}</p>
        <p><span class="label">严重级别:</span> {{ .Labels.severity }}</p>
        <p><span class="label">开始时间:</span> {{ .StartsAt.Format "2006-01-02 15:04:05" }}</p>
        {{ if .Annotations.runbook_url }}
        <p><span class="label">处理手册:</span> <a href="{{ .Annotations.runbook_url }}">点击查看</a></p>
        {{ end }}
        {{ if .Annotations.dashboard_url }}
        <p><span class="label">监控面板:</span> <a href="{{ .Annotations.dashboard_url }}">点击查看</a></p>
        {{ end }}
    </div>
    {{ end }}
    
    <p><small>此邮件由 Prometheus Alertmanager 自动发送</small></p>
</body>
</html>
{{ end }}
            """,
            
            "warning.tmpl": """
{{ define "email.warning.subject" }}
[⚠️ WARNING] {{ .GroupLabels.alertname }}
{{ end }}

{{ define "email.warning.body" }}
警告告警通知

{{ range .Alerts }}
告警名称: {{ .Annotations.summary }}
告警描述: {{ .Annotations.description }}
实例: {{ .Labels.instance }}
服务: {{ .Labels.job }}
开始时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ if .Annotations.runbook_url }}
处理手册: {{ .Annotations.runbook_url }}
{{ end }}

{{ end }}

--
此邮件由 Prometheus Alertmanager 自动发送
{{ end }}
            """
        }
    
    def create_slack_templates(self) -> Dict[str, str]:
        """创建 Slack 模板"""
        return {
            "slack.tmpl": """
{{ define "slack.title" }}
{{ if eq .Status "firing" }}
{{ if eq .CommonLabels.severity "critical" }}🚨{{ else if eq .CommonLabels.severity "warning" }}⚠️{{ else }}ℹ️{{ end }}
{{ else }}✅{{ end }}
{{ .GroupLabels.alertname }}
{{ end }}

{{ define "slack.text" }}
{{ if eq .Status "firing" }}
*状态:* 🔥 触发中
{{ else }}
*状态:* ✅ 已恢复
{{ end }}

{{ range .Alerts }}
*告警:* {{ .Annotations.summary }}
*描述:* {{ .Annotations.description }}
*实例:* {{ .Labels.instance }}
*服务:* {{ .Labels.job }}
*级别:* {{ .Labels.severity }}
*时间:* {{ if eq $.Status "firing" }}{{ .StartsAt.Format "15:04:05" }}{{ else }}{{ .EndsAt.Format "15:04:05" }}{{ end }}
{{ if .Annotations.runbook_url }}
*处理手册:* <{{ .Annotations.runbook_url }}|点击查看>
{{ end }}
{{ if .Annotations.dashboard_url }}
*监控面板:* <{{ .Annotations.dashboard_url }}|点击查看>
{{ end }}

{{ end }}
{{ end }}

{{ define "slack.color" }}
{{ if eq .Status "firing" }}
{{ if eq .CommonLabels.severity "critical" }}danger{{ else if eq .CommonLabels.severity "warning" }}warning{{ else }}good{{ end }}
{{ else }}good{{ end }}
{{ end }}
            """
        }
    
    def create_wechat_templates(self) -> Dict[str, str]:
        """创建企业微信模板"""
        return {
            "wechat.tmpl": """
{{ define "wechat.message" }}
{{ if eq .Status "firing" }}【告警通知】{{ else }}【恢复通知】{{ end }}

{{ range .Alerts }}
告警: {{ .Annotations.summary }}
描述: {{ .Annotations.description }}
实例: {{ .Labels.instance }}
级别: {{ .Labels.severity }}
时间: {{ if eq $.Status "firing" }}{{ .StartsAt.Format "15:04" }}{{ else }}{{ .EndsAt.Format "15:04" }}{{ end }}
{{ if .Annotations.runbook_url }}
手册: {{ .Annotations.runbook_url }}
{{ end }}

{{ end }}
{{ end }}
            """
        }
    
    def create_webhook_templates(self) -> Dict[str, str]:
        """创建 Webhook 模板"""
        return {
            "webhook.tmpl": """
{{ define "webhook.payload" }}
{
  "status": "{{ .Status }}",
  "alerts": [
    {{ range $index, $alert := .Alerts }}
    {{ if $index }},{{ end }}
    {
      "status": "{{ $alert.Status }}",
      "labels": {
        {{ range $key, $value := $alert.Labels }}
        "{{ $key }}": "{{ $value }}",
        {{ end }}
        "alertname": "{{ $alert.Labels.alertname }}"
      },
      "annotations": {
        {{ range $key, $value := $alert.Annotations }}
        "{{ $key }}": "{{ $value }}",
        {{ end }}
        "summary": "{{ $alert.Annotations.summary }}"
      },
      "startsAt": "{{ $alert.StartsAt }}",
      "endsAt": "{{ $alert.EndsAt }}",
      "generatorURL": "{{ $alert.GeneratorURL }}"
    }
    {{ end }}
  ],
  "groupLabels": {
    {{ range $key, $value := .GroupLabels }}
    "{{ $key }}": "{{ $value }}",
    {{ end }}
    "alertname": "{{ .GroupLabels.alertname }}"
  },
  "commonLabels": {
    {{ range $key, $value := .CommonLabels }}
    "{{ $key }}": "{{ $value }}",
    {{ end }}
    "alertname": "{{ .CommonLabels.alertname }}"
  },
  "commonAnnotations": {
    {{ range $key, $value := .CommonAnnotations }}
    "{{ $key }}": "{{ $value }}",
    {{ end }}
    "summary": "{{ .CommonAnnotations.summary }}"
  },
  "externalURL": "{{ .ExternalURL }}",
  "version": "4",
  "groupKey": "{{ .GroupKey }}",
  "truncatedAlerts": {{ .TruncatedAlerts }}
}
{{ end }}
            """
        }
    
    def generate_template_config(self) -> str:
        """生成模板配置文件"""
        return """
# Alertmanager 模板配置
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'

# 模板文件路径
templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  receiver: 'default'
  group_by: ['alertname']
  routes:
    - match:
        severity: critical
      receiver: 'critical-with-template'
    - match:
        severity: warning
      receiver: 'warning-with-template'

receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@example.com'
        subject: '{{ template "email.warning.subject" . }}'
        body: '{{ template "email.warning.body" . }}'
  
  - name: 'critical-with-template'
    email_configs:
      - to: 'oncall@example.com'
        subject: '{{ template "email.critical.subject" . }}'
        body: '{{ template "email.critical.body" . }}'
        headers:
          Content-Type: 'text/html; charset=UTF-8'
    
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#critical-alerts'
        title: '{{ template "slack.title" . }}'
        text: '{{ template "slack.text" . }}'
        color: '{{ template "slack.color" . }}'
        send_resolved: true
  
  - name: 'warning-with-template'
    email_configs:
      - to: 'team@example.com'
        subject: '{{ template "email.warning.subject" . }}'
        body: '{{ template "email.warning.body" . }}'
    
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#alerts'
        title: '{{ template "slack.title" . }}'
        text: '{{ template "slack.text" . }}'
        color: '{{ template "slack.color" . }}'
        send_resolved: true
"""

# 使用示例
template_manager = AlertTemplateManager()

# 生成各种模板
email_templates = template_manager.create_email_templates()
slack_templates = template_manager.create_slack_templates()
wechat_templates = template_manager.create_wechat_templates()
webhook_templates = template_manager.create_webhook_templates()

print("告警模板已生成")
print(f"邮件模板数量: {len(email_templates)}")
print(f"Slack模板数量: {len(slack_templates)}")
print(f"企业微信模板数量: {len(wechat_templates)}")
print(f"Webhook模板数量: {len(webhook_templates)}")

# 生成模板配置
template_config = template_manager.generate_template_config()
print(f"\n模板配置长度: {len(template_config)} 字符")

高级告警功能

AlertAdvancedManager 类

class AlertAdvancedManager:
    """高级告警功能管理器"""
    
    def __init__(self):
        self.silences = []
        self.maintenance_windows = []
        self.escalation_policies = []
    
    def create_silence_config(self) -> str:
        """创建静默配置"""
        return """
# 告警静默管理

# 1. 通过 amtool 命令行工具创建静默
# 静默特定实例的所有告警(维护期间)
amtool silence add instance="server-01.example.com" --duration="2h" --comment="服务器维护"

# 静默特定告警类型
amtool silence add alertname="HighCPUUsage" --duration="1h" --comment="已知问题,正在处理"

# 静默特定服务的告警
amtool silence add job="web-server" severity="warning" --duration="30m" --comment="部署期间"

# 2. 通过 API 创建静默(v1 API 已废弃,使用 v2)
curl -X POST http://alertmanager:9093/api/v2/silences \
  -H "Content-Type: application/json" \
  -d '{
    "matchers": [
      {
        "name": "alertname",
        "value": "InstanceDown",
        "isRegex": false
      },
      {
        "name": "instance",
        "value": "server-.*",
        "isRegex": true
      }
    ],
    "startsAt": "2024-01-01T10:00:00Z",
    "endsAt": "2024-01-01T12:00:00Z",
    "createdBy": "admin@example.com",
    "comment": "计划维护窗口"
  }'

# 3. 查看当前静默
amtool silence query

# 4. 删除静默
amtool silence expire <silence-id>
"""
    
    def create_inhibition_rules(self) -> str:
        """创建抑制规则配置"""
        return """
# 高级抑制规则配置
inhibit_rules:
  # 1. 实例宕机时抑制所有相关告警
  - source_match:
      alertname: 'InstanceDown'
    target_match_re:
      alertname: '(HighCPUUsage|HighMemoryUsage|DiskSpaceLow|HighDiskIOWait|ServiceUnavailable)'
    equal: ['instance']
  
  # 2. 网络分区时抑制连接相关告警
  - source_match:
      alertname: 'NetworkPartition'
    target_match_re:
      alertname: '(DatabaseConnectionFailed|APITimeout|ServiceUnavailable)'
    equal: ['datacenter', 'zone']
  
  # 3. 负载均衡器故障时抑制后端服务告警
  - source_match:
      alertname: 'LoadBalancerDown'
    target_match_re:
      alertname: '(HighLatency|HighErrorRate|ServiceUnavailable)'
    equal: ['service', 'environment']
  
  # 4. 存储集群故障时抑制相关告警
  - source_match:
      alertname: 'StorageClusterDown'
    target_match_re:
      alertname: '(DatabaseSlow|DiskSpaceLow|BackupFailed)'
    equal: ['storage_cluster']
  
  # 5. 关键告警抑制同类警告告警
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance', 'service']
  
  # 6. 维护模式时抑制所有告警
  - source_match:
      alertname: 'MaintenanceMode'
    target_match_re:
      alertname: '.*'
    equal: ['cluster', 'environment']
  
  # 7. 上游服务故障时抑制下游告警
  - source_match:
      alertname: 'UpstreamServiceDown'
      service: 'auth-service'
    target_match_re:
      alertname: '(AuthenticationFailed|UnauthorizedAccess)'
    equal: ['environment']
  
  # 8. DNS故障时抑制域名解析相关告警
  - source_match:
      alertname: 'DNSResolutionFailed'
    target_match_re:
      alertname: '(ServiceDiscoveryFailed|ExternalAPITimeout)'
    equal: ['dns_zone']
"""
    
    def create_escalation_policy(self) -> str:
        """创建告警升级策略"""
        return """
# 告警升级策略配置
# 注意: Alertmanager 本身不提供基于时间的自动升级,嵌套路由只会命中最深一层匹配的接收器;
# 实际的分级升级通常交由 PagerDuty / OpsGenie 等值班系统完成,下面仅演示按级别拆分接收人的思路。
route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  
  routes:
    # 关键告警升级策略
    - match:
        severity: critical
      receiver: 'critical-level-1'
      group_wait: 0s
      repeat_interval: 15m
      routes:
        # 15分钟后升级到二级
        - match:
            severity: critical
          receiver: 'critical-level-2'
          group_wait: 15m
          repeat_interval: 30m
          routes:
            # 45分钟后升级到三级
            - match:
                severity: critical
              receiver: 'critical-level-3'
              group_wait: 30m
              repeat_interval: 1h
    
    # 业务告警升级策略
    - match:
        team: business
        severity: warning
      receiver: 'business-level-1'
      group_wait: 5m
      repeat_interval: 2h
      routes:
        # 2小时后升级
        - match:
            team: business
            severity: warning
          receiver: 'business-level-2'
          group_wait: 2h
          repeat_interval: 4h
    
    # 安全告警立即升级
    - match:
        team: security
      receiver: 'security-immediate'
      group_wait: 0s
      repeat_interval: 5m

receivers:
  # 关键告警一级响应
  - name: 'critical-level-1'
    email_configs:
      - to: 'oncall-primary@example.com'
        subject: '[CRITICAL-L1] {{ .GroupLabels.alertname }}'
    slack_configs:
      - channel: '#critical-alerts'
        title: '🚨 关键告警 - 一级响应'
  
  # 关键告警二级响应
  - name: 'critical-level-2'
    email_configs:
      - to: 'oncall-primary@example.com,oncall-secondary@example.com'
        subject: '[CRITICAL-L2] {{ .GroupLabels.alertname }} - 升级'
    slack_configs:
      - channel: '#critical-alerts'
        title: '🚨🚨 关键告警 - 二级响应(升级)'
    pagerduty_configs:
      - routing_key: 'level-2-routing-key'
        severity: 'critical'
  
  # 关键告警三级响应
  - name: 'critical-level-3'
    email_configs:
      - to: 'oncall-primary@example.com,oncall-secondary@example.com,manager@example.com'
        subject: '[CRITICAL-L3] {{ .GroupLabels.alertname }} - 最高级别'
    slack_configs:
      - channel: '#critical-alerts'
        title: '🚨🚨🚨 关键告警 - 三级响应(最高级别)'
        text: '<!channel> 关键告警已升级到最高级别,需要立即处理'
    pagerduty_configs:
      - routing_key: 'level-3-routing-key'
        severity: 'critical'
    wechat_configs:
      - agent_id: 'emergency-agent'
        to_user: '@all'
        message: '【紧急告警-最高级别】需要立即处理'
"""
    
    def create_maintenance_window_config(self) -> str:
        """创建维护窗口配置"""
        return """
# 维护窗口配置
time_intervals:
  # 定期维护窗口
  - name: 'weekly-maintenance'
    time_intervals:
      - times:
          - start_time: '02:00'
            end_time: '04:00'
        weekdays: ['sunday']
        location: 'Asia/Shanghai'
  
  # 月度维护窗口(每月第一个周六)
  - name: 'monthly-maintenance'
    time_intervals:
      - times:
          - start_time: '01:00'
            end_time: '05:00'
        weekdays: ['saturday']
        days_of_month: ['1:7']
        location: 'Asia/Shanghai'
  
  # 紧急维护窗口
  - name: 'emergency-maintenance'
    time_intervals:
      - times:
          - start_time: '00:00'
            end_time: '23:59'
        weekdays: ['monday:sunday']
        location: 'Asia/Shanghai'
  
  # 工作时间
  - name: 'business-hours'
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '18:00'
        weekdays: ['monday:friday']
        location: 'Asia/Shanghai'
  
  # 非工作时间
  - name: 'after-hours'
    time_intervals:
      - times:
          - start_time: '00:00'
            end_time: '09:00'
          - start_time: '18:00'
            end_time: '24:00'
        weekdays: ['monday:friday']
        location: 'Asia/Shanghai'
      - weekdays: ['saturday', 'sunday']
        location: 'Asia/Shanghai'

# 在路由中使用维护窗口
route:
  receiver: 'default'
  routes:
    # 维护窗口期间的告警处理
    - match:
        maintenance: 'true'
      receiver: 'maintenance-alerts'
      active_time_intervals:
        - 'weekly-maintenance'
        - 'monthly-maintenance'
      group_wait: 10m
      repeat_interval: 8760h  # 设为足够长的间隔,维护期间等效于不重复发送
    
    # 工作时间的关键告警(continue: true 使下面的非工作时间路由也能参与匹配)
    - match:
        severity: critical
      receiver: 'critical-business-hours'
      active_time_intervals:
        - 'business-hours'
      continue: true
      group_wait: 0s
      repeat_interval: 15m
    
    # 非工作时间的关键告警
    - match:
        severity: critical
      receiver: 'critical-after-hours'
      active_time_intervals:
        - 'after-hours'
      group_wait: 0s
      repeat_interval: 30m

receivers:
  - name: 'maintenance-alerts'
    email_configs:
      - to: 'maintenance-team@example.com'
        subject: '[MAINTENANCE] {{ .GroupLabels.alertname }}'
        body: |
          维护期间告警通知
          
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ end }}
          
          注意: 此告警发生在维护窗口期间,可能与维护活动相关。
"""
    
    def create_alert_routing_strategies(self) -> str:
        """创建告警路由策略"""
        return """
# 高级告警路由策略
route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  
  routes:
    # 1. 基于环境的路由
    - match:
        environment: 'production'
      receiver: 'production-alerts'
      group_wait: 0s
      routes:
        # 生产环境关键告警
        - match:
            severity: critical
          receiver: 'production-critical'
          group_wait: 0s
          repeat_interval: 15m
        
        # 生产环境警告告警
        - match:
            severity: warning
          receiver: 'production-warning'
          group_wait: 5m
          repeat_interval: 1h
    
    # 2. 基于服务类型的路由
    - match_re:
        service: '(api|web|gateway)'
      receiver: 'frontend-team'
      routes:
        # API服务特殊处理
        - match:
            service: api
          receiver: 'api-team'
          group_by: ['alertname', 'endpoint']
        
        # Web服务特殊处理
        - match:
            service: web
          receiver: 'web-team'
          group_by: ['alertname', 'instance']
    
    # 3. 基于数据中心的路由
    - match:
        datacenter: 'dc1'
      receiver: 'dc1-team'
      routes:
        # DC1的数据库告警
        - match:
            service: database
          receiver: 'dc1-dba-team'
    
    - match:
        datacenter: 'dc2'
      receiver: 'dc2-team'
      routes:
        # DC2的数据库告警
        - match:
            service: database
          receiver: 'dc2-dba-team'
    
    # 4. 基于业务线的路由
    - match:
        business_unit: 'ecommerce'
      receiver: 'ecommerce-team'
      routes:
        # 电商支付告警
        - match:
            component: payment
          receiver: 'payment-team'
          group_wait: 0s
          repeat_interval: 10m
        
        # 电商订单告警
        - match:
            component: order
          receiver: 'order-team'
          group_wait: 2m
          repeat_interval: 30m
    
    # 5. 基于告警频率的路由
    - match:
        alert_frequency: 'high'
      receiver: 'high-frequency-alerts'
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 30m
    
    # 6. 基于客户影响的路由
    - match:
        customer_impact: 'high'
      receiver: 'customer-impact-team'
      group_wait: 0s
      repeat_interval: 10m
      routes:
        # VIP客户影响
        - match:
            customer_tier: 'vip'
          receiver: 'vip-customer-team'
          group_wait: 0s
          repeat_interval: 5m
    
    # 7. 测试和开发环境路由
    - match:
        environment: 'staging'
      receiver: 'staging-alerts'
      group_wait: 10m
      repeat_interval: 4h
    
    - match:
        environment: 'development'
      receiver: 'dev-alerts'
      group_wait: 30m
      repeat_interval: 8760h  # 设为足够长的间隔,开发环境等效于不重复发送
"""

# 使用示例
advanced_manager = AlertAdvancedManager()

# 生成各种高级配置
silence_config = advanced_manager.create_silence_config()
inhibition_rules = advanced_manager.create_inhibition_rules()
escalation_policy = advanced_manager.create_escalation_policy()
maintenance_config = advanced_manager.create_maintenance_window_config()
routing_strategies = advanced_manager.create_alert_routing_strategies()

print("高级告警功能配置已生成")
print(f"静默配置长度: {len(silence_config)} 字符")
print(f"抑制规则长度: {len(inhibition_rules)} 字符")
print(f"升级策略长度: {len(escalation_policy)} 字符")
print(f"维护窗口配置长度: {len(maintenance_config)} 字符")
print(f"路由策略长度: {len(routing_strategies)} 字符")

告警最佳实践

AlertBestPractices 类

class AlertBestPractices:
    """告警最佳实践管理器"""
    
    def __init__(self):
        self.practices = []
        self.anti_patterns = []
        self.guidelines = []
    
    def get_alert_design_principles(self) -> List[str]:
        """获取告警设计原则"""
        return [
            "🎯 告警应该是可操作的 - 每个告警都应该有明确的处理步骤",
            "📊 告警应该基于症状而非原因 - 关注用户体验而非技术细节",
            "⚡ 告警应该及时 - 在问题影响用户之前发出告警",
            "🔍 告警应该具有足够的上下文 - 提供诊断和解决问题所需的信息",
            "📈 告警阈值应该基于历史数据和业务需求",
            "🔄 告警应该有明确的恢复条件",
            "📝 告警应该包含处理文档链接",
            "🎚️ 告警应该有适当的严重级别分类",
            "🚫 避免告警疲劳 - 减少噪音和误报",
            "🔗 告警应该与监控面板关联"
        ]
    
    def get_alert_severity_guidelines(self) -> Dict[str, Dict[str, str]]:
        """获取告警严重级别指南"""
        return {
            "critical": {
                "定义": "需要立即响应的告警,通常影响服务可用性或数据完整性",
                "响应时间": "立即(5分钟内)",
                "示例": "服务完全不可用、数据丢失、安全漏洞",
                "通知方式": "电话、短信、邮件、即时消息",
                "升级策略": "15分钟内无响应则升级"
            },
            "warning": {
                "定义": "需要关注但不需要立即响应的告警",
                "响应时间": "1小时内",
                "示例": "性能下降、资源使用率高、非关键功能异常",
                "通知方式": "邮件、即时消息",
                "升级策略": "4小时内无响应则升级"
            },
            "info": {
                "定义": "信息性告警,用于记录和趋势分析",
                "响应时间": "下个工作日",
                "示例": "部署完成、配置变更、趋势异常",
                "通知方式": "邮件、日志",
                "升级策略": "不升级"
            }
        }
    
    def get_alert_naming_conventions(self) -> Dict[str, List[str]]:
        """获取告警命名规范"""
        return {
            "好的命名": [
                "HighCPUUsage - 清晰描述问题",
                "DatabaseConnectionPoolExhausted - 具体且可操作",
                "APIResponseTimeHigh - 基于用户体验",
                "DiskSpaceLow - 简洁明了",
                "SSLCertificateExpiringSoon - 预防性告警"
            ],
            "避免的命名": [
                "Alert1 - 无意义的名称",
                "SomethingWrong - 过于模糊",
                "CPUAlert - 不够具体",
                "Error - 过于通用",
                "Problem - 没有描述性"
            ],
            "命名规则": [
                "使用驼峰命名法(CamelCase)",
                "以问题类型开头(High, Low, Failed, Down等)",
                "包含受影响的组件或服务",
                "避免使用缩写和技术术语",
                "保持名称简洁但具有描述性"
            ]
        }
    
    def get_alert_threshold_guidelines(self) -> str:
        """获取告警阈值设置指南"""
        return """
# 告警阈值设置指南

## 1. 基于历史数据设置阈值

### CPU使用率告警
# 先用记录规则 instance:node_cpu_usage:ratio 固化每实例CPU使用率(见本节末尾的记录规则示例)
# 再分析过去30天的分布: P95 + 10% 作为警告参考,P99 作为关键参考
quantile_over_time(0.95, instance:node_cpu_usage:ratio[30d]) + 0.1  # Warning 参考值
quantile_over_time(0.99, instance:node_cpu_usage:ratio[30d])        # Critical 参考值

### 内存使用率告警
# 基于应用程序的内存使用模式
# 考虑内存泄漏的检测
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85  # Warning
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.95  # Critical

### 磁盘空间告警
# 基于磁盘增长速率预测
# 预留足够时间进行清理
predict_linear(node_filesystem_free_bytes[6h], 24*3600) < 0  # 预测24小时后磁盘满

## 2. 基于业务影响设置阈值

### API响应时间
# 基于SLA要求设置
# 99%的请求应在500ms内完成
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5

### 错误率告警
# 基于用户体验影响
# 错误率超过1%影响用户体验
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.01

## 3. 动态阈值设置

### 基于时间模式的动态阈值
# 工作时间和非工作时间使用不同阈值
(
  (
    hour() >= 9 and hour() <= 18 and 
    sum(rate(http_requests_total[5m])) < 100  # 工作时间阈值
  ) or (
    (hour() < 9 or hour() > 18) and 
    sum(rate(http_requests_total[5m])) < 20   # 非工作时间阈值
  )
)

### 基于历史同期的动态阈值
# 与上周同期比较
abs(
  sum(rate(http_requests_total[5m])) - 
  sum(rate(http_requests_total[5m] offset 7d))
) / sum(rate(http_requests_total[5m] offset 7d)) > 0.3

## 4. 阈值调优建议

### 避免告警疲劳
- 定期审查告警频率和准确性
- 调整过于敏感的阈值
- 合并相关告警
- 使用抑制规则减少噪音

### 阈值测试
- 在测试环境验证阈值设置
- 模拟故障场景测试告警响应
- 收集团队反馈调整阈值
- 建立阈值调整的版本控制

### 文档化阈值决策
- 记录阈值设置的原因
- 定期审查和更新阈值
- 建立阈值变更审批流程
- 监控阈值调整的效果
"""
    
    def get_alert_fatigue_prevention(self) -> List[str]:
        """获取告警疲劳预防策略"""
        return [
            "🎯 精确的阈值设置 - 避免过于敏感的告警",
            "📊 基于影响的告警 - 只对真正影响用户的问题告警",
            "🔄 智能分组 - 将相关告警合并为一个通知",
            "⏰ 适当的重复间隔 - 避免过于频繁的重复通知",
            "🚫 有效的抑制规则 - 在已知问题期间抑制相关告警",
            "🔧 自动化修复 - 对可自动修复的问题实施自动化",
            "📈 趋势分析 - 使用趋势而非绝对值进行告警",
            "🎚️ 分级告警 - 使用不同严重级别避免所有告警都是紧急的",
            "📝 定期审查 - 定期审查告警规则的有效性",
            "🔍 根因分析 - 解决根本原因而非症状",
            "⚡ 快速静默 - 提供快速静默机制处理已知问题",
            "📊 告警指标 - 监控告警系统本身的健康状况"
        ]
    
    def generate_alert_quality_metrics(self) -> str:
        """生成告警质量指标"""
        return """
# 告警质量监控指标
# 注意: 下列部分指标(如 alertmanager_alert_ack_time_seconds、alertmanager_alerts_escalated_total)
# 并非 Prometheus/Alertmanager 原生暴露的指标,需要结合值班/工单系统自行上报,此处仅作示意

## 1. 告警准确性指标

### 误报率(False Positive Rate)
# 计算被静默或快速解决的告警比例
(
  sum(increase(alertmanager_silences_total[24h])) + 
  sum(increase(alertmanager_alerts_resolved_total{resolution_time="<5m"}[24h]))
) / sum(increase(alertmanager_alerts_total[24h])) * 100

### 漏报检测
# 通过SLA违反但无告警的情况检测
# 需要结合业务指标和告警状态

## 2. 告警响应指标

### 平均响应时间(MTTA - Mean Time To Acknowledge)
avg(alertmanager_alert_ack_time_seconds)

### 平均解决时间(MTTR - Mean Time To Resolve)
avg(alertmanager_alert_resolution_time_seconds)

### 告警升级率
sum(increase(alertmanager_alerts_escalated_total[24h])) / 
sum(increase(alertmanager_alerts_total[24h])) * 100

## 3. 告警频率指标

### 每日告警数量
# 统计 Alertmanager 收到的告警通知(包含 Prometheus 的周期性重发)
sum(increase(alertmanager_alerts_received_total{status="firing"}[24h]))

### 高频告警识别
# 在 Prometheus 端按告警名称统计重复激活次数
topk(10,
  sum by (alertname) (
    changes(ALERTS_FOR_STATE[24h])
  )
)

### 告警风暴检测
# 短时间内大量告警
sum(increase(alertmanager_alerts_received_total{status="firing"}[5m])) > 50

## 4. 告警覆盖率指标

### 服务覆盖率
# 有告警规则的服务比例
count(count by (service) (up)) / count(count by (service) (prometheus_rule_group_rules))

### 关键路径覆盖
# 关键业务流程的告警覆盖情况

## 5. 告警效果指标

### 问题预防率
# 通过告警预防的问题数量

### 用户影响减少
# 通过及时告警减少的用户影响时间

## 6. 告警系统健康指标

### Alertmanager可用性
up{job="alertmanager"}

### 告警规则评估延迟
prometheus_rule_evaluation_duration_seconds

### 通知发送成功率
# alertmanager_notifications_total 没有区分成功/失败的 state 标签,成功率 = 1 - 失败率
(
  1 - sum(rate(alertmanager_notifications_failed_total[5m])) /
      sum(rate(alertmanager_notifications_total[5m]))
) * 100

## 7. 团队效率指标

### 告警处理效率
# 每个团队的告警处理速度和质量

### 告警知识积累
# Runbook使用率和更新频率

### 告警培训效果
# 新团队成员的告警处理能力提升
"""

# 使用示例
best_practices = AlertBestPractices()

# 获取各种最佳实践指南
design_principles = best_practices.get_alert_design_principles()
severity_guidelines = best_practices.get_alert_severity_guidelines()
naming_conventions = best_practices.get_alert_naming_conventions()
threshold_guidelines = best_practices.get_alert_threshold_guidelines()
fatigue_prevention = best_practices.get_alert_fatigue_prevention()
quality_metrics = best_practices.generate_alert_quality_metrics()

print("告警最佳实践指南已生成")
print(f"设计原则数量: {len(design_principles)}")
print(f"严重级别指南: {len(severity_guidelines)} 个级别")
print(f"命名规范类别: {len(naming_conventions)} 个")
print(f"疲劳预防策略: {len(fatigue_prevention)} 条")
print(f"质量指标长度: {len(quality_metrics)} 字符")

print("\n设计原则示例:")
for i, principle in enumerate(design_principles[:3], 1):
    print(f"{i}. {principle}")

故障排除与监控

AlertTroubleshootingManager 类

class AlertTroubleshootingManager:
    """告警故障排除管理器"""
    
    def __init__(self):
        self.common_issues = []
        self.diagnostic_queries = []
        self.health_checks = []
    
    def get_common_alerting_issues(self) -> Dict[str, Dict[str, str]]:
        """获取常见告警问题及解决方案"""
        return {
            "告警未触发": {
                "症状": "预期的告警没有发出",
                "可能原因": "规则语法错误、阈值设置不当、标签匹配问题、数据缺失",
                "诊断步骤": "检查规则语法、验证查询结果、确认数据源、检查标签",
                "解决方案": "修正规则语法、调整阈值、修复数据收集、更新标签匹配"
            },
            "告警风暴": {
                "症状": "短时间内大量告警触发",
                "可能原因": "阈值过于敏感、级联故障、配置错误、数据异常",
                "诊断步骤": "分析告警模式、检查系统状态、审查配置变更",
                "解决方案": "调整阈值、添加抑制规则、修复根本问题、临时静默"
            },
            "通知未发送": {
                "症状": "告警触发但通知未收到",
                "可能原因": "Alertmanager配置错误、网络问题、接收器故障、路由错误",
                "诊断步骤": "检查Alertmanager日志、验证网络连接、测试接收器",
                "解决方案": "修正配置、修复网络、更新接收器配置、调整路由规则"
            },
            "告警延迟": {
                "症状": "告警触发时间过晚",
                "可能原因": "评估间隔过长、查询复杂度高、资源不足、网络延迟",
                "诊断步骤": "检查评估时间、分析查询性能、监控资源使用",
                "解决方案": "优化查询、增加资源、调整评估间隔、简化规则"
            },
            "误报告警": {
                "症状": "告警频繁触发但实际无问题",
                "可能原因": "阈值设置不当、数据噪音、时间窗口不合适",
                "诊断步骤": "分析历史数据、检查阈值合理性、评估时间窗口",
                "解决方案": "调整阈值、增加平滑处理、优化时间窗口、添加条件"
            }
        }
    
    def generate_diagnostic_queries(self) -> str:
        """生成诊断查询"""
        return """
# 告警系统诊断查询

## 1. Prometheus 健康检查

### Prometheus 服务状态
up{job="prometheus"}

### 规则评估延迟
prometheus_rule_evaluation_duration_seconds

### 规则评估失败
increase(prometheus_rule_evaluation_failures_total[5m])

### 查询执行时间
# prometheus_engine_query_duration_seconds 是 summary 而非 histogram,直接读取分位数
prometheus_engine_query_duration_seconds{slice="inner_eval", quantile="0.9"}

### 存储使用情况
prometheus_tsdb_symbol_table_size_bytes / 1024 / 1024  # MB
prometheus_tsdb_head_series

## 2. Alertmanager 健康检查

### Alertmanager 服务状态
up{job="alertmanager"}

### 告警处理延迟
# alertmanager_notification_latency_seconds 是 summary,按集成(integration)维度读取分位数
alertmanager_notification_latency_seconds{quantile="0.99"}

### 通知发送状态
rate(alertmanager_notifications_total[5m])
rate(alertmanager_notifications_failed_total[5m])

### 活跃告警数量
alertmanager_alerts
alertmanager_alerts{state="active"}
alertmanager_alerts{state="suppressed"}

### 静默规则数量
alertmanager_silences
alertmanager_silences{state="active"}

## 3. 告警规则诊断

### 规则组评估时间
prometheus_rule_group_last_evaluation_timestamp_seconds

### 规则组评估间隔
prometheus_rule_group_interval_seconds

### 活跃告警规则
ALERTS_FOR_STATE

### 规则评估错误
increase(prometheus_rule_evaluation_failures_total[1h])

## 4. 网络和连接诊断

### Prometheus 到 Alertmanager 连接
prometheus_notifications_alertmanagers_discovered
prometheus_notifications_dropped_total

### 外部服务连接测试
probe_success{job="blackbox"}
probe_duration_seconds{job="blackbox"}

## 5. 资源使用诊断

### CPU 使用率
rate(process_cpu_seconds_total{job="prometheus"}[5m]) * 100
rate(process_cpu_seconds_total{job="alertmanager"}[5m]) * 100

### 内存使用
process_resident_memory_bytes{job="prometheus"} / 1024 / 1024  # MB
process_resident_memory_bytes{job="alertmanager"} / 1024 / 1024  # MB

### 磁盘使用
prometheus_tsdb_wal_storage_size_bytes / 1024 / 1024  # MB
prometheus_tsdb_head_chunks

## 6. 告警质量诊断

### 告警频率分析
# ALERTS 是状态型指标,用 ALERTS_FOR_STATE 的变化次数近似告警的重复激活次数
topk(10, sum by (alertname) (changes(ALERTS_FOR_STATE[24h])))

### 告警持续时间分析
# ALERTS_FOR_STATE 的值是告警进入 pending/firing 状态的时间戳
quantile by (alertname) (0.95, time() - ALERTS_FOR_STATE)

### 静默使用分析
# 当前活跃静默数量;按告警名称细分需要查询 Alertmanager 的 silences API
sum(alertmanager_silences{state="active"})

## 7. 性能优化诊断

### 慢查询识别
# 该 summary 指标没有 query 标签,只能按查询阶段(slice)观察平均耗时;
# 定位具体慢查询需要启用 Prometheus 的 query log
topk(10,
  avg by (slice) (
    rate(prometheus_engine_query_duration_seconds_sum[5m]) /
    rate(prometheus_engine_query_duration_seconds_count[5m])
  )
)

### 高基数指标识别
topk(10, count by (__name__)({__name__=~".+"}))

### 存储增长率
# 该指标为 gauge,使用 deriv 而非 rate 观察增长趋势
deriv(prometheus_tsdb_symbol_table_size_bytes[1h])
"""
    
    def create_health_monitoring_config(self) -> str:
        """创建健康监控配置"""
        return """
# 告警系统健康监控配置

groups:
  - name: alerting-system-health
    rules:
      # Prometheus 健康检查
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 1m
        labels:
          severity: critical
          component: prometheus
        annotations:
          summary: "Prometheus 服务不可用"
          description: "Prometheus 实例 {{ $labels.instance }} 已宕机超过1分钟"
          runbook_url: "https://runbooks.example.com/prometheus-down"
      
      - alert: PrometheusRuleEvaluationSlow
        expr: prometheus_rule_group_last_duration_seconds > 30
        for: 5m
        labels:
          severity: warning
          component: prometheus
        annotations:
          summary: "Prometheus 规则评估缓慢"
          description: "规则组 {{ $labels.rule_group }} 评估时间超过30秒"
      
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
          component: prometheus
        annotations:
          summary: "Prometheus 规则评估失败"
          description: "规则组 {{ $labels.rule_group }} 评估失败"
      
      # Alertmanager 健康检查
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 1m
        labels:
          severity: critical
          component: alertmanager
        annotations:
          summary: "Alertmanager 服务不可用"
          description: "Alertmanager 实例 {{ $labels.instance }} 已宕机超过1分钟"
          runbook_url: "https://runbooks.example.com/alertmanager-down"
      
      - alert: AlertmanagerNotificationsFailing
        expr: |
          (
            rate(alertmanager_notifications_failed_total[5m]) /
            rate(alertmanager_notifications_total[5m])
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          component: alertmanager
        annotations:
          summary: "Alertmanager 通知发送失败率高"
          description: "通知发送失败率为 {{ $value | humanizePercentage }}"
      
      - alert: AlertmanagerConfigReloadFailed
        expr: alertmanager_config_last_reload_successful == 0
        for: 0m
        labels:
          severity: critical
          component: alertmanager
        annotations:
          summary: "Alertmanager 配置重载失败"
          description: "Alertmanager 配置重载失败,请检查配置文件"
      
      # 告警质量监控
      # 注意: alertmanager_alerts_received_total 统计 Alertmanager 收到的告警通知,
      # 包含 Prometheus 的周期性重发,阈值需结合重发间隔调整
      - alert: HighAlertVolume
        expr: sum(increase(alertmanager_alerts_received_total{status="firing"}[1h])) > 100
        for: 0m
        labels:
          severity: warning
          component: alerting
        annotations:
          summary: "告警数量过高"
          description: "过去1小时内产生了 {{ $value }} 个告警,可能存在告警风暴"
      
      - alert: AlertStorm
        expr: sum(increase(alertmanager_alerts_received_total{status="firing"}[5m])) > 50
        for: 0m
        labels:
          severity: critical
          component: alerting
        annotations:
          summary: "告警风暴检测"
          description: "过去5分钟内产生了 {{ $value }} 个告警,疑似告警风暴"
      
      - alert: HighSilenceUsage
        expr: |
          (
            sum(alertmanager_alerts{state="suppressed"}) /
            sum(alertmanager_alerts)
          ) > 0.5
        for: 10m
        labels:
          severity: warning
          component: alerting
        annotations:
          summary: "静默使用率过高"
          description: "当前有 {{ $value | humanizePercentage }} 的告警被静默"
      
      # 性能监控
      - alert: PrometheusHighMemoryUsage
        # 注意: process_resident_memory_bytes 与 node_memory_MemTotal_bytes 的 job/instance 标签通常不同,
        # 直接相除会得到空结果;此处假设已通过 relabel 将两者的 instance 标签对齐(示意写法)
        expr: |
          (
            process_resident_memory_bytes{job="prometheus"}
            / on(instance)
            node_memory_MemTotal_bytes
          ) > 0.8
        for: 5m
        labels:
          severity: warning
          component: prometheus
        annotations:
          summary: "Prometheus 内存使用率高"
          description: "Prometheus 内存使用率为 {{ $value | humanizePercentage }}"
      
      - alert: PrometheusHighDiskUsage
        expr: |
          (
            prometheus_tsdb_storage_blocks_bytes +
            prometheus_tsdb_wal_storage_size_bytes
          ) / 1024 / 1024 / 1024 > 10  # 10GB
        for: 5m
        labels:
          severity: warning
          component: prometheus
        annotations:
          summary: "Prometheus 磁盘使用量高"
          description: "Prometheus 存储使用量为 {{ $value | humanize }}GB"
      
      # 连接监控
      - alert: PrometheusAlertmanagerConnectionFailed
        expr: prometheus_notifications_alertmanagers_discovered == 0
        for: 2m
        labels:
          severity: critical
          component: prometheus
        annotations:
          summary: "Prometheus 无法连接到 Alertmanager"
          description: "Prometheus 未发现任何可用的 Alertmanager 实例"
      
      - alert: PrometheusNotificationDropped
        expr: increase(prometheus_notifications_dropped_total[5m]) > 0
        for: 0m
        labels:
          severity: warning
          component: prometheus
        annotations:
          summary: "Prometheus 通知被丢弃"
          description: "过去5分钟内有 {{ $value }} 个通知被丢弃"
"""
    
    def create_troubleshooting_runbook(self) -> str:
        """创建故障排除手册"""
        return """
# 告警系统故障排除手册

## 1. 告警未触发故障排除

### 步骤1: 验证规则语法
```bash
# 使用 promtool 验证规则文件
promtool check rules /path/to/rules.yml

# 检查 Prometheus 配置
promtool check config /path/to/prometheus.yml
```

### 步骤2: 测试查询表达式
```bash
# 在 Prometheus Web UI 中测试查询
# 或使用 API 测试
curl 'http://prometheus:9090/api/v1/query?query=up'
```

### 步骤3: 检查数据可用性
```bash
# 检查目标是否正常采集
curl 'http://prometheus:9090/api/v1/query?query=up{job="your-job"}'

# 检查指标是否存在
curl 'http://prometheus:9090/api/v1/label/__name__/values'
```

### 步骤4: 验证标签匹配
```bash
# 检查标签值
curl 'http://prometheus:9090/api/v1/label/job/values'

# 测试标签选择器
curl 'http://prometheus:9090/api/v1/query?query=up{instance="target:9090"}'
```

## 2. 通知未发送故障排除

### 步骤1: 检查 Alertmanager 状态
```bash
# 检查 Alertmanager 服务状态
curl http://alertmanager:9093/-/healthy

# 查看活跃告警
curl http://alertmanager:9093/api/v1/alerts
```

### 步骤2: 验证路由配置
```bash
# 测试路由匹配
curl -X POST http://alertmanager:9093/api/v1/alerts \
  -H "Content-Type: application/json" \
  -d '[{
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning"
    }
  }]'
```

### 步骤3: 检查接收器配置
```bash
# 查看 Alertmanager 配置
curl http://alertmanager:9093/api/v1/status

# 检查通知历史
curl http://alertmanager:9093/api/v1/alerts/groups
```

### 步骤4: 测试通知渠道
```bash
# 测试邮件配置
echo "Test email" | mail -s "Test" user@example.com

# 测试 Slack webhook
curl -X POST -H 'Content-type: application/json' \
  --data '{"text":"Test message"}' \
  YOUR_SLACK_WEBHOOK_URL
```

## 3. 告警风暴处理

### 步骤1: 快速静默
```bash
# 静默所有告警(紧急情况)
amtool silence add alertname=~".*" --duration="1h" --comment="Emergency silence"

# 静默特定服务
amtool silence add service="problematic-service" --duration="30m"
```

### 步骤2: 分析告警模式
```bash
# 查看告警统计
curl 'http://prometheus:9090/api/v1/query?query=topk(10,sum by (alertname)(ALERTS))'

# 分析告警时间线
curl 'http://prometheus:9090/api/v1/query_range?query=sum(ALERTS)&start=...&end=...&step=60s'
```

### 步骤3: 识别根本原因
```bash
# 检查系统指标
curl 'http://prometheus:9090/api/v1/query?query=up'
curl 'http://prometheus:9090/api/v1/query?query=node_load1'

# 检查最近的配置变更
git log --oneline --since="1 hour ago" -- prometheus/
```

## 4. 性能问题排除

### 步骤1: 识别慢查询
```bash
# 查看查询性能
curl 'http://prometheus:9090/api/v1/query?query=topk(10,prometheus_engine_query_duration_seconds)'

# 分析规则评估时间
curl 'http://prometheus:9090/api/v1/query?query=prometheus_rule_evaluation_duration_seconds'
```

### 步骤2: 优化查询
```bash
# 使用 promtool 分析查询(需指定 Prometheus 地址)
promtool query instant http://prometheus:9090 'your_complex_query'

# 检查高基数指标
curl 'http://prometheus:9090/api/v1/query?query=topk(10,count by (__name__)({__name__=~".+"}))'
```

### 步骤3: 资源监控
```bash
# 监控 Prometheus 资源使用
curl 'http://prometheus:9090/api/v1/query?query=process_resident_memory_bytes{job="prometheus"}'
curl 'http://prometheus:9090/api/v1/query?query=rate(process_cpu_seconds_total{job="prometheus"}[5m])'
```

## 5. 配置验证工具

### Prometheus 配置验证
```bash
#!/bin/bash
# prometheus-config-check.sh

echo "检查 Prometheus 配置..."
promtool check config /etc/prometheus/prometheus.yml

echo "检查告警规则..."
for file in /etc/prometheus/rules/*.yml; do
    echo "检查 $file"
    promtool check rules "$file"
done

echo "测试配置重载..."
curl -X POST http://localhost:9090/-/reload
```

### Alertmanager 配置验证
```bash
#!/bin/bash
# alertmanager-config-check.sh

echo "检查 Alertmanager 配置..."
amtool check-config /etc/alertmanager/alertmanager.yml

echo "测试路由配置..."
amtool config routes --config.file=/etc/alertmanager/alertmanager.yml

echo "测试配置重载..."
curl -X POST http://localhost:9093/-/reload
```

## 6. 监控脚本

### 告警系统健康检查脚本
```bash
#!/bin/bash
# alert-system-health.sh

PROMETHEUS_URL="http://localhost:9090"
ALERTMANAGER_URL="http://localhost:9093"

echo "=== 告警系统健康检查 ==="

# 检查 Prometheus
echo "检查 Prometheus 状态..."
if curl -s "$PROMETHEUS_URL/-/healthy" > /dev/null; then
    echo "✓ Prometheus 健康"
else
    echo "✗ Prometheus 不健康"
fi

# 检查 Alertmanager
echo "检查 Alertmanager 状态..."
if curl -s "$ALERTMANAGER_URL/-/healthy" > /dev/null; then
    echo "✓ Alertmanager 健康"
else
    echo "✗ Alertmanager 不健康"
fi

# 检查活跃告警
echo "检查活跃告警..."
ALERT_COUNT=$(curl -s "$ALERTMANAGER_URL/api/v1/alerts" | jq '.data | length')
echo "当前活跃告警数量: $ALERT_COUNT"

# 检查静默规则
echo "检查静默规则..."
SILENCE_COUNT=$(curl -s "$ALERTMANAGER_URL/api/v1/silences" | jq '.data | length')
echo "当前静默规则数量: $SILENCE_COUNT"

echo "=== 检查完成 ==="
```
"""

# 使用示例
troubleshooting_manager = AlertTroubleshootingManager()

# 获取故障排除信息
common_issues = troubleshooting_manager.get_common_alerting_issues()
diagnostic_queries = troubleshooting_manager.generate_diagnostic_queries()
health_monitoring = troubleshooting_manager.create_health_monitoring_config()
troubleshooting_runbook = troubleshooting_manager.create_troubleshooting_runbook()

print("告警故障排除指南已生成")
print(f"常见问题数量: {len(common_issues)}")
print(f"诊断查询长度: {len(diagnostic_queries)} 字符")
print(f"健康监控配置长度: {len(health_monitoring)} 字符")
print(f"故障排除手册长度: {len(troubleshooting_runbook)} 字符")

print("\n常见问题示例:")
for issue, details in list(common_issues.items())[:2]:
    print(f"- {issue}: {details['症状']}")
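
除了上面的 shell 脚本,也可以用 Python 做同样的健康检查。下面是一个最小示意:假设 Prometheus 与 Alertmanager 运行在本地默认端口,并且环境中已安装 requests;较新版本的 Alertmanager 建议使用 /api/v2 接口,它直接返回 JSON 数组而不是带 data 字段的对象。

import requests  # 假设环境中已安装 requests

PROMETHEUS_URL = "http://localhost:9090"    # 假设的本地地址
ALERTMANAGER_URL = "http://localhost:9093"  # 假设的本地地址

def is_healthy(base_url: str) -> bool:
    """调用 /-/healthy 端点检查服务健康状态"""
    try:
        return requests.get(f"{base_url}/-/healthy", timeout=3).status_code == 200
    except requests.RequestException:
        return False

print("Prometheus 健康:", is_healthy(PROMETHEUS_URL))
print("Alertmanager 健康:", is_healthy(ALERTMANAGER_URL))

# Alertmanager v2 API 直接返回 JSON 数组
try:
    alerts = requests.get(f"{ALERTMANAGER_URL}/api/v2/alerts", timeout=3).json()
    silences = requests.get(f"{ALERTMANAGER_URL}/api/v2/silences", timeout=3).json()
    print(f"当前活跃告警数量: {len(alerts)}")
    print(f"当前静默规则数量: {len(silences)}")
except requests.RequestException as exc:
    print(f"查询 Alertmanager API 失败: {exc}")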

总结

通过本章的学习,我们全面掌握了 Prometheus 告警与通知系统的各个方面:

🎯 核心要点

  1. 告警规则设计

    • 基于症状而非原因的告警策略
    • 合理的阈值设置和时间窗口
    • 清晰的告警命名和分类
    • 完整的告警元数据和文档
  2. Alertmanager 配置

    • 灵活的路由和分组策略
    • 多样化的通知渠道配置
    • 智能的抑制和静默机制
    • 高级的升级和时间窗口管理
  3. 通知渠道管理

    • 邮件、Slack、企业微信等多种渠道
    • 个性化的通知模板
    • 基于场景的通知策略
    • 通知的可靠性和及时性保障
  4. 高级功能应用

    • 动态路由和智能分发
    • 告警升级和自动化处理
    • 维护窗口和计划停机管理
    • 告警质量监控和优化

🚀 最佳实践

  1. 设计原则

    • 告警应该是可操作的
    • 避免告警疲劳
    • 基于业务影响设置优先级
    • 持续优化和改进
  2. 运维管理

    • 定期审查告警规则
    • 监控告警系统健康状况
    • 建立完善的故障排除流程
    • 团队培训和知识分享
  3. 技术实现

    • 使用版本控制管理配置
    • 自动化测试和验证
    • 监控和日志记录
    • 性能优化和扩展性考虑

📚 下一步学习

  1. 深入学习

    • Prometheus 查询语言 (PromQL) 高级用法
    • 自定义 Exporter 开发
    • 告警规则的自动化测试
    • 大规模部署的性能优化
  2. 实践项目

    • 构建完整的监控告警系统
    • 集成现有的运维工具链
    • 开发自定义通知渠道
    • 实现智能告警分析
  3. 扩展学习

    • Grafana 可视化集成
    • 日志监控和分析
    • 分布式追踪系统
    • 云原生监控解决方案

通过系统学习和实践,你已经具备了构建和管理企业级告警系统的能力。记住,优秀的告警系统不仅仅是技术实现,更需要结合业务理解、团队协作和持续改进的运维文化。


恭喜!你已经完成了 Prometheus 告警与通知系统的学习。 🎉

继续探索 Prometheus 生态系统的其他组件,构建更加完善的监控解决方案!

def create_pagerduty_config(self, routing_key: str, service_key: str = "") -> Dict[str, Any]:
    """创建 PagerDuty 通知配置

    routing_key 对应 PagerDuty Events API v2,service_key 对应 v1,
    在 Alertmanager 的 pagerduty_configs 中二者只能设置其一。
    """
    pagerduty_config = {
        "description": "{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}",
        "severity": "{{ .CommonLabels.severity }}",
        "details": {
            "instance": "{{ .CommonLabels.instance }}",
            "job": "{{ .CommonLabels.job }}",
            "description": "{{ .CommonAnnotations.description }}"
        }
    }
    # routing_key(v2)与 service_key(v1)互斥,按传入参数选择其一
    if service_key:
        pagerduty_config["service_key"] = service_key
    else:
        pagerduty_config["routing_key"] = routing_key
    return {
        "receivers": [{
            "name": "pagerduty-notifications",
            "pagerduty_configs": [pagerduty_config]
        }]
    }
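
下面是一个使用示意:假设该方法挂在前文的通知配置管理类上(这里用 notification_manager 指代,属示意命名),演示如何把返回的接收器配置合并进一份最小的 alertmanager.yml 并序列化为 YAML;routing_key 为占位值。

import yaml

# notification_manager 为承载上述方法的通知配置管理对象(示意命名)
receiver_config = notification_manager.create_pagerduty_config(
    routing_key="YOUR_PAGERDUTY_ROUTING_KEY"  # 占位值,实际使用 PagerDuty 的 Integration Key
)

alertmanager_config = {
    "route": {
        "receiver": "pagerduty-notifications",
        "group_by": ["alertname", "severity"],
    },
    **receiver_config,  # 合并 {"receivers": [...]}
}

print(yaml.safe_dump(alertmanager_config, allow_unicode=True, sort_keys=False))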

def create_wechat_config(self, corp_id: str, agent_id: str, api_secret: str,
                        to_user: str = "@all") -> Dict[str, Any]:
    """创建企业微信通知配置"""
    return {
        "global": {
            "wechat_api_url": "https://qyapi.weixin.qq.com/cgi-bin/",
            "wechat_api_corp_id": corp_id
        },
        "receivers": [{
            "name": "wechat-notifications",
            "wechat_configs": [{
                "agent_id": agent_id,
                "api_secret": api_secret,
                "to_user": to_user,
                "message": """

告警通知 {{ range .Alerts }} 告警: {{ .Annotations.summary }} 描述: {{ .Annotations.description }} 级别: {{ .Labels.severity }} 实例: {{ .Labels.instance }} 时间: {{ .StartsAt }} {{ end }} “”” }] }] }

def create_dingtalk_config(self, webhook_url: str, secret: str = "") -> Dict[str, Any]:
    """创建钉钉通知配置"""
    config = {
        "receivers": [{
            "name": "dingtalk-notifications",
            "webhook_configs": [{
                "url": webhook_url,
                "send_resolved": True,
                "title": "Prometheus 告警通知",
                "message": """

{{ range .Alerts }} 告警名称: {{ .Annotations.summary }} 告警描述: {{ .Annotations.description }} 告警级别: {{ .Labels.severity }} 实例: {{ .Labels.instance }} 开始时间: {{ .StartsAt }} {{ end }} “”” }] }] }

    if secret:
        # 注意: 钉钉机器人的加签(secret)需要在请求时计算 timestamp 和 sign 并附加到 URL,
        # Alertmanager 本身不支持该签名逻辑,通常由钉钉 webhook 代理完成;此处的 http_config 仅作占位示意
        config["receivers"][0]["webhook_configs"][0]["http_config"] = {
            "headers": {
                "Content-Type": "application/json"
            }
        }

    return config
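
在把钉钉 webhook 接入 Alertmanager(或 webhook 代理)之前,可以先单独验证机器人是否可达。下面是一个直接调用钉钉自定义机器人 webhook 的最小测试示意(不经过 Alertmanager;URL 与 secret 均为占位值,依赖 requests 库),加签逻辑按照钉钉机器人的签名规则实现:

import base64
import hashlib
import hmac
import time
import urllib.parse

import requests  # 假设环境中已安装 requests

def send_dingtalk_test_message(webhook_url: str, secret: str = "",
                               text: str = "Prometheus 告警通道测试") -> dict:
    """直接向钉钉自定义机器人发送一条文本测试消息(示意实现)"""
    url = webhook_url
    if secret:
        # 加签模式: sign = urlencode(base64(hmac_sha256(secret, f"{timestamp}\n{secret}")))
        timestamp = str(round(time.time() * 1000))
        string_to_sign = f"{timestamp}\n{secret}".encode("utf-8")
        digest = hmac.new(secret.encode("utf-8"), string_to_sign, hashlib.sha256).digest()
        sign = urllib.parse.quote_plus(base64.b64encode(digest))
        url = f"{webhook_url}&timestamp={timestamp}&sign={sign}"
    payload = {"msgtype": "text", "text": {"content": text}}
    response = requests.post(url, json=payload, timeout=5)
    return response.json()

# 占位 webhook 地址,使用时替换为实际机器人地址
# print(send_dingtalk_test_message(
#     "https://oapi.dingtalk.com/robot/send?access_token=XXXX", secret="SECxxxx"))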