10.1 企业级监控解决方案设计

架构设计概览

flowchart TB
    subgraph "数据采集层"
        A[Node Exporter]
        B[Application Metrics]
        C[Custom Exporters]
        D[Blackbox Exporter]
    end
    
    subgraph "数据存储层"
        E[Prometheus Cluster]
        F[Thanos/Cortex]
        G[Long-term Storage]
    end
    
    subgraph "告警处理层"
        H[Alertmanager Cluster]
        I[Alert Rules]
        J[Routing Logic]
    end
    
    subgraph "通知渠道层"
        K[Email]
        L[Slack]
        M[PagerDuty]
        N[Webhook]
        O[SMS]
    end
    
    subgraph "可视化层"
        P[Grafana]
        Q[Custom Dashboards]
        R[Alert Dashboard]
    end
    
    A --> E
    B --> E
    C --> E
    D --> E
    
    E --> F
    F --> G
    
    E --> I
    I --> H
    H --> J
    
    J --> K
    J --> L
    J --> M
    J --> N
    J --> O
    
    E --> P
    P --> Q
    P --> R
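
上图中"Prometheus Cluster → Alert Rules → Alertmanager Cluster"这条链路对应的是 Prometheus 侧的配置:规则文件负责产生告警,alerting 段负责把告警推送给 Alertmanager 集群。下面是一个最小的 prometheus.yml 片段草图(主机名、指标名与标签取值均为示例假设),其中规则上打的标签(severity、service、category、team 等)正是后文 Alertmanager 路由树进行匹配的依据:

# prometheus.yml(片段,主机名与标签取值为示例假设)
global:
  external_labels:
    cluster: 'production'
    region: 'us-east-1'
    environment: 'prod'

rule_files:
  - /etc/prometheus/rules/*.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager-1:9093
            - alertmanager-2:9093
            - alertmanager-3:9093

# /etc/prometheus/rules/payment.yml(示例规则,指标与阈值为假设)
groups:
- name: payment
  rules:
  - alert: PaymentHighErrorRate
    expr: |
      sum(rate(http_requests_total{service="payment-service",code=~"5.."}[5m]))
        / sum(rate(http_requests_total{service="payment-service"}[5m])) > 0.05
    for: 5m
    labels:
      severity: critical
      service: payment-service
      category: application
      team: backend
    annotations:
      summary: "支付服务 5xx 错误率超过 5%"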

企业级配置示例

# enterprise-alertmanager.yml
global:
  # SMTP 配置
  smtp_smarthost: 'smtp.company.com:587'
  smtp_from: 'monitoring@company.com'
  smtp_auth_username: 'monitoring@company.com'
  smtp_auth_password: 'secure_password'
  smtp_require_tls: true
  
  # HTTP 配置
  http_config:
    proxy_url: 'http://proxy.company.com:8080'
    tls_config:
      insecure_skip_verify: false
  
  # 注意:cluster/region 等部署标识属于 Prometheus 的 global.external_labels
  # (见前文 prometheus.yml 片段),Alertmanager 的 global 段并不支持 external_labels 字段

# 模板配置
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# 路由配置
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default-receiver'
  
  routes:
  # 关键业务系统告警
  - match:
      severity: critical
    receiver: 'critical-alerts'
    group_wait: 10s
    group_interval: 2m
    repeat_interval: 30m
    routes:
    - match:
        service: 'payment-service'
      receiver: 'payment-critical'
    - match:
        service: 'user-service'
      receiver: 'user-critical'
  
  # 基础设施告警
  - match:
      category: infrastructure
    receiver: 'infrastructure-alerts'
    routes:
    - match:
        component: 'database'
      receiver: 'dba-team'
    - match:
        component: 'network'
      receiver: 'network-team'
  
  # 应用程序告警
  - match:
      category: application
    receiver: 'application-alerts'
    routes:
    - match:
        team: 'frontend'
      receiver: 'frontend-team'
    - match:
        team: 'backend'
      receiver: 'backend-team'
  
  # 安全告警
  - match:
      category: security
    receiver: 'security-alerts'
    group_wait: 0s
    group_interval: 1m
    repeat_interval: 15m
  
  # 维护窗口期间的告警
  - match:
      maintenance: 'true'
    receiver: 'maintenance-alerts'
    group_interval: 1h
    repeat_interval: 24h

# 抑制规则
inhibit_rules:
# 节点宕机时抑制该节点(同一 instance)上的其他告警
- source_match:
    alertname: 'NodeDown'
  target_match_re:
    alertname: '.+'
  equal: ['instance']

# 服务不可用时抑制相关的性能告警
- source_match:
    alertname: 'ServiceDown'
  target_match_re:
    alertname: 'HighLatency|HighErrorRate'
  equal: ['service', 'instance']

# 数据库主从切换时抑制连接告警
- source_match:
    alertname: 'DatabaseFailover'
  target_match:
    alertname: 'DatabaseConnectionFailed'
  equal: ['cluster']

# 接收器配置
receivers:
# 默认接收器
- name: 'default-receiver'
  email_configs:
  - to: 'ops-team@company.com'
    subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
    body: |
      {{ template "email.default.html" . }}

# 关键告警接收器
- name: 'critical-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/CRITICAL/ALERTS/WEBHOOK'
    channel: '#critical-alerts'
    title: '🚨 CRITICAL ALERT'
    text: |
      {{ template "slack.critical.text" . }}
    actions:
    - type: button
      text: 'Acknowledge'
      url: '{{ template "slack.ack.url" . }}'
    - type: button
      text: 'Runbook'
      url: '{{ template "slack.runbook.url" . }}'
  
  pagerduty_configs:
  - routing_key: 'your-pagerduty-integration-key'
    description: '{{ template "pagerduty.description" . }}'
    severity: 'critical'
    details:
      cluster: '{{ .GroupLabels.cluster }}'
      service: '{{ .GroupLabels.service }}'
      runbook: '{{ template "pagerduty.runbook.url" . }}'

# 支付服务关键告警
- name: 'payment-critical'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/PAYMENT/TEAM/WEBHOOK'
    channel: '#payment-alerts'
    title: '💳 Payment Service Critical Alert'
    text: |
      {{ template "slack.payment.text" . }}
  
  email_configs:
  - to: 'payment-team@company.com,cto@company.com'
    subject: '[URGENT] Payment Service Alert'
    body: |
      {{ template "email.payment.html" . }}
  
  webhook_configs:
  - url: 'https://api.company.com/alerts/payment'
    http_config:
      bearer_token: 'payment-webhook-token'

# 用户服务关键告警
- name: 'user-critical'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/USER/TEAM/WEBHOOK'
    channel: '#user-service-alerts'
    title: '👤 User Service Critical Alert'
    text: |
      {{ template "slack.user.text" . }}

# 基础设施告警
- name: 'infrastructure-alerts'
  email_configs:
  - to: 'infrastructure@company.com'
    subject: '[INFRA] {{ .GroupLabels.alertname }}'
    body: |
      {{ template "email.infrastructure.html" . }}

# DBA 团队告警
- name: 'dba-team'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/DBA/TEAM/WEBHOOK'
    channel: '#database-alerts'
    title: '🗄️ Database Alert'
    text: |
      {{ template "slack.database.text" . }}
  
  email_configs:
  - to: 'dba@company.com'
    subject: '[DB] {{ .GroupLabels.alertname }}'
    body: |
      {{ template "email.database.html" . }}

# 网络团队告警
- name: 'network-team'
  email_configs:
  - to: 'network@company.com'
    subject: '[NETWORK] {{ .GroupLabels.alertname }}'
    body: |
      {{ template "email.network.html" . }}

# 应用程序告警
- name: 'application-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/APP/TEAM/WEBHOOK'
    channel: '#application-alerts'
    title: '📱 Application Alert'
    text: |
      {{ template "slack.application.text" . }}

# 前端团队告警
- name: 'frontend-team'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/FRONTEND/TEAM/WEBHOOK'
    channel: '#frontend-alerts'
    title: '🎨 Frontend Alert'
    text: |
      {{ template "slack.frontend.text" . }}

# 后端团队告警
- name: 'backend-team'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/BACKEND/TEAM/WEBHOOK'
    channel: '#backend-alerts'
    title: '⚙️ Backend Alert'
    text: |
      {{ template "slack.backend.text" . }}

# 安全告警
- name: 'security-alerts'
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/SECURITY/TEAM/WEBHOOK'
    channel: '#security-alerts'
    title: '🔒 Security Alert'
    text: |
      {{ template "slack.security.text" . }}
  
  email_configs:
  - to: 'security@company.com,ciso@company.com'
    subject: '[SECURITY] {{ .GroupLabels.alertname }}'
    body: |
      {{ template "email.security.html" . }}
  
  webhook_configs:
  - url: 'https://siem.company.com/api/alerts'
    http_config:
      bearer_token: 'security-webhook-token'

# 维护告警
- name: 'maintenance-alerts'
  email_configs:
  - to: 'maintenance@company.com'
    subject: '[MAINTENANCE] {{ .GroupLabels.alertname }}'
    body: |
      {{ template "email.maintenance.html" . }}

# 时间间隔配置
time_intervals:
- name: 'business-hours'
  time_intervals:
  - times:
    - start_time: '09:00'
      end_time: '17:00'
    weekdays: ['monday:friday']
    location: 'America/New_York'

- name: 'weekend'
  time_intervals:
  - times:
    - start_time: '00:00'
      end_time: '23:59'
    weekdays: ['saturday', 'sunday']

- name: 'maintenance-window'
  time_intervals:
  - times:
    - start_time: '02:00'
      end_time: '04:00'
    weekdays: ['sunday']
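
上面的 time_intervals 只是时间段的定义(Alertmanager 0.24 及以上版本支持),需要在路由节点上通过 mute_time_intervals(该时间段内静音)或 active_time_intervals(仅在该时间段内生效)引用才会起作用。下面是一个引用方式的示意片段(沿用上文定义的名称,路由内容为假设):

# 在 route.routes 中引用时间段(示意片段)
  - match:
      category: infrastructure
    receiver: 'infrastructure-alerts'
    mute_time_intervals:
      - maintenance-window        # 维护窗口内不发送通知
  - match:
      severity: info
    receiver: 'default-receiver'
    active_time_intervals:
      - business-hours            # 仅在工作时间发送低优先级通知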

10.2 多环境部署实践

环境配置管理

#!/bin/bash
# multi-env-deployment.sh

# 环境配置
ENVIRONMENTS=("dev" "staging" "prod")
BASE_DIR="/opt/alertmanager"
CONFIG_REPO="https://github.com/company/alertmanager-configs.git"

# 部署函数
deploy_environment() {
    local env="$1"
    local config_dir="$BASE_DIR/$env"
    
    echo "=== 部署 $env 环境 ==="
    
    # 创建目录结构
    mkdir -p "$config_dir"/{config,data,logs}
    
    # 克隆配置仓库
    if [[ ! -d "$config_dir/configs" ]]; then
        git clone "$CONFIG_REPO" "$config_dir/configs"
    else
        cd "$config_dir/configs" && git pull
    fi
    
    # 生成环境特定配置
    generate_config "$env" "$config_dir"
    
    # 部署 Alertmanager
    deploy_alertmanager "$env" "$config_dir"
    
    # 验证部署
    verify_deployment "$env"
}

# 生成配置文件
generate_config() {
    local env="$1"
    local config_dir="$2"
    
    echo "生成 $env 环境配置..."
    
    # 环境特定变量(必须先导出,envsubst 才能完成替换)
    case "$env" in
        "dev")
            export SMTP_HOST="smtp-dev.company.com"
            export SLACK_WEBHOOK="https://hooks.slack.com/dev"
            export LOG_LEVEL="debug"
            export REPLICAS="1"
            ;;
        "staging")
            export SMTP_HOST="smtp-staging.company.com"
            export SLACK_WEBHOOK="https://hooks.slack.com/staging"
            export LOG_LEVEL="info"
            export REPLICAS="2"
            ;;
        "prod")
            export SMTP_HOST="smtp.company.com"
            export SLACK_WEBHOOK="https://hooks.slack.com/prod"
            export LOG_LEVEL="warn"
            export REPLICAS="3"
            ;;
    esac
    
    # 使用模板生成配置(envsubst 会替换上面导出的环境变量)
    envsubst < "$config_dir/configs/templates/alertmanager.yml.template" > "$config_dir/config/alertmanager.yml"
    envsubst < "$config_dir/configs/templates/docker-compose.yml.template" > "$config_dir/docker-compose.yml"
}

# 部署 Alertmanager
deploy_alertmanager() {
    local env="$1"
    local config_dir="$2"
    
    echo "部署 $env 环境 Alertmanager..."
    
    cd "$config_dir"
    
    # 停止现有服务
    docker-compose down
    
    # 启动新服务
    docker-compose up -d
    
    # 等待服务启动
    sleep 30
}

# 验证部署
verify_deployment() {
    local env="$1"
    local port
    
    case "$env" in
        "dev") port="9093" ;;
        "staging") port="9094" ;;
        "prod") port="9095" ;;
    esac
    
    echo "验证 $env 环境部署..."
    
    # 健康检查
    if curl -s "http://localhost:$port/-/healthy" > /dev/null; then
        echo "✓ $env 环境健康检查通过"
    else
        echo "✗ $env 环境健康检查失败"
        return 1
    fi
    
    # 配置验证
    if curl -s "http://localhost:$port/api/v1/status" | jq -e '.data.configYAML' > /dev/null; then
        echo "✓ $env 环境配置验证通过"
    else
        echo "✗ $env 环境配置验证失败"
        return 1
    fi
}

# 主函数
main() {
    case "${1:-all}" in
        "dev"|"staging"|"prod")
            deploy_environment "$1"
            ;;
        "all")
            for env in "${ENVIRONMENTS[@]}"; do
                deploy_environment "$env"
            done
            ;;
        "verify")
            for env in "${ENVIRONMENTS[@]}"; do
                verify_deployment "$env"
            done
            ;;
        *)
            echo "Usage: $0 {dev|staging|prod|all|verify}"
            exit 1
            ;;
    esac
}

main "$@"

Kubernetes 多环境部署

# k8s-multi-env/base/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- alertmanager-deployment.yaml
- alertmanager-service.yaml
- alertmanager-configmap.yaml
- alertmanager-pvc.yaml

commonLabels:
  app: alertmanager
  version: v0.25.0

images:
- name: prom/alertmanager
  newTag: v0.25.0

---
# k8s-multi-env/overlays/dev/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring-dev

resources:
- ../../base

patchesStrategicMerge:
- alertmanager-patch.yaml

configMapGenerator:
- name: alertmanager-config
  files:
  - alertmanager.yml
  behavior: replace

replicas:
- name: alertmanager
  count: 1

---
# k8s-multi-env/overlays/staging/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring-staging

resources:
- ../../base

patchesStrategicMerge:
- alertmanager-patch.yaml

configMapGenerator:
- name: alertmanager-config
  files:
  - alertmanager.yml
  behavior: replace

replicas:
- name: alertmanager
  count: 2

---
# k8s-multi-env/overlays/prod/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring-prod

resources:
- ../../base
- ingress.yaml
- hpa.yaml

patchesStrategicMerge:
- alertmanager-patch.yaml

configMapGenerator:
- name: alertmanager-config
  files:
  - alertmanager.yml
  behavior: replace

replicas:
- name: alertmanager
  count: 3
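
各 overlay 中通过 patchesStrategicMerge 引用的 alertmanager-patch.yaml 未在上文列出,下面是 dev 环境一个可能的补丁草图(资源配额、启动参数均为假设),用于调低资源请求并打开 debug 日志:

# k8s-multi-env/overlays/dev/alertmanager-patch.yaml(假设示例)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
spec:
  template:
    spec:
      containers:
      - name: alertmanager
        args:
        - --config.file=/etc/alertmanager/alertmanager.yml
        - --storage.path=/alertmanager
        - --log.level=debug
        resources:
          requests:
            cpu: 50m
            memory: 128Mi
          limits:
            cpu: 200m
            memory: 256Mi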

10.3 性能调优案例分析

高负载场景优化

#!/usr/bin/env python3
# performance-analysis.py

import requests
import time
from datetime import datetime, timedelta, timezone
from concurrent.futures import ThreadPoolExecutor
import numpy as np

class AlertmanagerPerformanceAnalyzer:
    """Alertmanager 性能分析工具"""
    
    def __init__(self, base_url, prometheus_url):
        self.base_url = base_url
        self.prometheus_url = prometheus_url
        self.metrics = []
    
    def collect_metrics(self, duration_minutes=60):
        """收集性能指标"""
        
        end_time = datetime.now()
        start_time = end_time - timedelta(minutes=duration_minutes)
        
        queries = {
            'http_requests_total': 'rate(alertmanager_http_requests_total[5m])',
            'http_request_duration': 'histogram_quantile(0.95, rate(alertmanager_http_request_duration_seconds_bucket[5m]))',
            'notifications_total': 'rate(alertmanager_notifications_total[5m])',
            'notifications_failed': 'rate(alertmanager_notifications_failed_total[5m])',
            'alerts_active': 'alertmanager_alerts{state="active"}',
            'memory_usage': 'process_resident_memory_bytes{job="alertmanager"}',
            'cpu_usage': 'rate(process_cpu_seconds_total{job="alertmanager"}[5m])'
        }
        
        for metric_name, query in queries.items():
            try:
                response = requests.get(
                    f"{self.prometheus_url}/api/v1/query_range",
                    params={
                        'query': query,
                        'start': start_time.timestamp(),
                        'end': end_time.timestamp(),
                        'step': '60s'
                    }
                )
                
                if response.status_code == 200:
                    data = response.json()
                    self.metrics.append({
                        'name': metric_name,
                        'data': data['data']['result']
                    })
                    print(f"✓ 收集指标: {metric_name}")
                else:
                    print(f"✗ 收集指标失败: {metric_name}")
                    
            except Exception as e:
                print(f"✗ 收集指标异常 {metric_name}: {e}")
    
    def analyze_performance(self):
        """分析性能数据"""
        
        analysis_results = {}
        
        for metric in self.metrics:
            metric_name = metric['name']
            metric_data = metric['data']
            
            if not metric_data:
                continue
            
            # 提取数值
            values = []
            for series in metric_data:
                for timestamp, value in series['values']:
                    try:
                        values.append(float(value))
                    except ValueError:
                        continue
            
            if values:
                analysis_results[metric_name] = {
                    'avg': np.mean(values),
                    'max': np.max(values),
                    'min': np.min(values),
                    'p95': np.percentile(values, 95),
                    'p99': np.percentile(values, 99)
                }
        
        return analysis_results
    
    def generate_recommendations(self, analysis_results):
        """生成优化建议"""
        
        recommendations = []
        
        # HTTP 请求分析
        if 'http_request_duration' in analysis_results:
            duration_p95 = analysis_results['http_request_duration']['p95']
            if duration_p95 > 5.0:  # 5秒
                recommendations.append({
                    'type': 'performance',
                    'severity': 'high',
                    'issue': 'HTTP请求延迟过高',
                    'current_value': f'{duration_p95:.2f}s',
                    'recommendation': '考虑增加实例数量或优化配置文件复杂度'
                })
        
        # 内存使用分析
        if 'memory_usage' in analysis_results:
            memory_max = analysis_results['memory_usage']['max']
            memory_gb = memory_max / (1024**3)
            if memory_gb > 2.0:  # 2GB
                recommendations.append({
                    'type': 'resource',
                    'severity': 'medium',
                    'issue': '内存使用量较高',
                    'current_value': f'{memory_gb:.2f}GB',
                    'recommendation': '检查告警数量和配置复杂度,考虑增加内存限制'
                })
        
        # 通知失败率分析
        if 'notifications_failed' in analysis_results and 'notifications_total' in analysis_results:
            failed_rate = analysis_results['notifications_failed']['avg']
            total_rate = analysis_results['notifications_total']['avg']
            if total_rate > 0:
                failure_percentage = (failed_rate / total_rate) * 100
                if failure_percentage > 5:  # 5%
                    recommendations.append({
                        'type': 'reliability',
                        'severity': 'high',
                        'issue': '通知失败率过高',
                        'current_value': f'{failure_percentage:.2f}%',
                        'recommendation': '检查通知渠道配置和网络连接'
                    })
        
        return recommendations
    
    def generate_report(self):
        """生成性能报告"""
        
        print("\n=== Alertmanager 性能分析报告 ===")
        print(f"分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        
        # 分析数据
        analysis_results = self.analyze_performance()
        
        # 显示指标统计
        print("\n=== 性能指标统计 ===")
        for metric_name, stats in analysis_results.items():
            print(f"\n{metric_name}:")
            print(f"  平均值: {stats['avg']:.4f}")
            print(f"  最大值: {stats['max']:.4f}")
            print(f"  最小值: {stats['min']:.4f}")
            print(f"  95分位: {stats['p95']:.4f}")
            print(f"  99分位: {stats['p99']:.4f}")
        
        # 生成建议
        recommendations = self.generate_recommendations(analysis_results)
        
        if recommendations:
            print("\n=== 优化建议 ===")
            for i, rec in enumerate(recommendations, 1):
                print(f"\n{i}. {rec['issue']} (严重程度: {rec['severity']})")
                print(f"   当前值: {rec['current_value']}")
                print(f"   建议: {rec['recommendation']}")
        else:
            print("\n=== 优化建议 ===")
            print("当前性能表现良好,无需特别优化。")
        
        return analysis_results, recommendations

# 负载测试工具
class LoadTester:
    """Alertmanager 负载测试工具"""
    
    def __init__(self, alertmanager_url):
        self.alertmanager_url = alertmanager_url
        self.results = []
    
    def send_test_alert(self, alert_data):
        """发送测试告警"""
        
        start_time = time.time()
        try:
            response = requests.post(
                f"{self.alertmanager_url}/api/v1/alerts",
                json=alert_data,
                timeout=10
            )
            end_time = time.time()
            
            return {
                'success': response.status_code == 200,
                'duration': end_time - start_time,
                'status_code': response.status_code
            }
        except Exception as e:
            end_time = time.time()
            return {
                'success': False,
                'duration': end_time - start_time,
                'error': str(e)
            }
    
    def run_load_test(self, concurrent_users=10, duration_seconds=60):
        """运行负载测试"""
        
        print(f"开始负载测试: {concurrent_users} 并发用户, {duration_seconds} 秒")
        
        # 测试告警数据
        test_alert = [{
            "labels": {
                "alertname": "LoadTestAlert",
                "instance": f"test-instance-{i}",
                "severity": "warning",
                "job": "load-test"
            },
            "annotations": {
                "summary": "Load test alert",
                "description": "This is a load test alert"
            },
            "startsAt": datetime.now().isoformat() + "Z"
        } for i in range(10)]  # 每次发送10个告警
        
        start_time = time.time()
        
        with ThreadPoolExecutor(max_workers=concurrent_users) as executor:
            while time.time() - start_time < duration_seconds:
                futures = []
                for _ in range(concurrent_users):
                    future = executor.submit(self.send_test_alert, test_alert)
                    futures.append(future)
                
                for future in futures:
                    result = future.result()
                    self.results.append(result)
                
                time.sleep(1)  # 每秒发送一轮
        
        self.analyze_results()
    
    def analyze_results(self):
        """分析测试结果"""
        
        if not self.results:
            print("没有测试结果")
            return
        
        successful_requests = [r for r in self.results if r['success']]
        failed_requests = [r for r in self.results if not r['success']]
        
        durations = [r['duration'] for r in successful_requests]
        
        print("\n=== 负载测试结果 ===")
        print(f"总请求数: {len(self.results)}")
        print(f"成功请求: {len(successful_requests)}")
        print(f"失败请求: {len(failed_requests)}")
        print(f"成功率: {len(successful_requests)/len(self.results)*100:.2f}%")
        
        if durations:
            print(f"平均响应时间: {np.mean(durations):.4f}s")
            print(f"最大响应时间: {np.max(durations):.4f}s")
            print(f"最小响应时间: {np.min(durations):.4f}s")
            print(f"95分位响应时间: {np.percentile(durations, 95):.4f}s")
            print(f"99分位响应时间: {np.percentile(durations, 99):.4f}s")

# 使用示例
def main():
    # 性能分析
    analyzer = AlertmanagerPerformanceAnalyzer(
        base_url="http://localhost:9093",
        prometheus_url="http://localhost:9090"
    )
    
    print("收集性能指标...")
    analyzer.collect_metrics(duration_minutes=60)
    
    print("生成性能报告...")
    analyzer.generate_report()
    
    # 负载测试
    load_tester = LoadTester("http://localhost:9093")
    load_tester.run_load_test(concurrent_users=5, duration_seconds=30)

if __name__ == "__main__":
    main()
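
除了事后分析,也可以把脚本中查询的这些指标固化为 Prometheus 告警规则,对 Alertmanager 自身做持续的"元监控"。下面是一个规则文件草图(阈值沿用脚本中的经验值,属于示例假设):

# alertmanager-meta.rules.yml(阈值为示例假设)
groups:
- name: alertmanager-meta
  rules:
  - alert: AlertmanagerNotificationsFailing
    expr: |
      sum(rate(alertmanager_notifications_failed_total[5m]))
        / sum(rate(alertmanager_notifications_total[5m])) > 0.05
    for: 10m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: "Alertmanager 通知失败率超过 5%"
  - alert: AlertmanagerHighMemory
    expr: process_resident_memory_bytes{job="alertmanager"} > 2 * 1024 * 1024 * 1024
    for: 15m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "Alertmanager 常驻内存超过 2GB"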

10.4 故障处理实战演练

故障模拟与恢复

#!/bin/bash
# disaster-recovery-drill.sh

# 配置变量
ALERTMANAGER_CLUSTER=("alertmanager-1" "alertmanager-2" "alertmanager-3")
BACKUP_DIR="/backup/alertmanager"
LOG_FILE="/var/log/disaster-recovery.log"

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# 故障场景1: 单节点故障
simulate_node_failure() {
    local node="$1"
    log "=== 模拟节点故障: $node ==="
    
    # 停止节点
    log "停止节点 $node"
    docker stop "$node" || systemctl stop "alertmanager@$node"
    
    # 等待故障检测
    sleep 30
    
    # 检查集群状态
    check_cluster_health
    
    # 模拟故障持续时间
    log "故障持续中,等待 2 分钟..."
    sleep 120
    
    # 恢复节点
    log "恢复节点 $node"
    docker start "$node" || systemctl start "alertmanager@$node"
    
    # 等待恢复
    sleep 60
    
    # 验证恢复
    verify_node_recovery "$node"
}

# 故障场景2: 网络分区
simulate_network_partition() {
    log "=== 模拟网络分区 ==="
    
    # 创建网络分区(隔离第一个节点)
    local isolated_node="${ALERTMANAGER_CLUSTER[0]}"
    log "隔离节点: $isolated_node"
    
    # 使用 iptables 模拟网络分区
    for node in "${ALERTMANAGER_CLUSTER[@]:1}"; do
        iptables -A INPUT -s "$node" -j DROP
        iptables -A OUTPUT -d "$node" -j DROP
    done
    
    # 等待分区检测
    sleep 60
    
    # 检查集群状态
    check_cluster_health
    
    # 模拟分区持续时间
    log "网络分区持续中,等待 3 分钟..."
    sleep 180
    
    # 恢复网络(只删除演练时添加的规则,避免清空整条链上的既有规则)
    log "恢复网络连接"
    for node in "${ALERTMANAGER_CLUSTER[@]:1}"; do
        iptables -D INPUT -s "$node" -j DROP
        iptables -D OUTPUT -d "$node" -j DROP
    done
    
    # 等待恢复
    sleep 120
    
    # 验证恢复
    verify_cluster_recovery
}

# 故障场景3: 配置文件损坏
simulate_config_corruption() {
    log "=== 模拟配置文件损坏 ==="
    
    local config_file="/etc/alertmanager/alertmanager.yml"
    local backup_file="$config_file.backup"
    
    # 备份原配置
    cp "$config_file" "$backup_file"
    
    # 损坏配置文件
    log "损坏配置文件"
    echo "invalid yaml content" > "$config_file"
    
    # 重启服务触发错误
    log "重启服务"
    systemctl restart alertmanager
    
    # 等待错误检测
    sleep 30
    
    # 检查服务状态
    if ! systemctl is-active --quiet alertmanager; then
        log "✓ 检测到配置错误,服务未启动"
    else
        log "✗ 配置错误未被检测到"
    fi
    
    # 恢复配置
    log "恢复配置文件"
    cp "$backup_file" "$config_file"
    
    # 重启服务
    systemctl restart alertmanager
    
    # 验证恢复
    sleep 30
    if systemctl is-active --quiet alertmanager; then
        log "✓ 服务恢复正常"
    else
        log "✗ 服务恢复失败"
    fi
}

# 故障场景4: 数据损坏
simulate_data_corruption() {
    log "=== 模拟数据损坏 ==="
    
    local data_dir="/var/lib/alertmanager"
    local backup_dir="$BACKUP_DIR/data-$(date +%Y%m%d-%H%M%S)"
    
    # 备份数据
    log "备份当前数据"
    mkdir -p "$backup_dir"
    cp -r "$data_dir"/* "$backup_dir/"
    
    # 停止服务
    systemctl stop alertmanager
    
    # 损坏数据
    log "损坏数据文件"
    find "$data_dir" -name "*.db" -exec dd if=/dev/zero of={} bs=1024 count=1 \;
    
    # 启动服务
    systemctl start alertmanager
    
    # 检查服务状态
    sleep 30
    check_service_health
    
    # 如果需要,从备份恢复
    if ! check_service_health; then
        log "从备份恢复数据"
        systemctl stop alertmanager
        rm -rf "$data_dir"/*
        cp -r "$backup_dir"/* "$data_dir/"
        systemctl start alertmanager
        
        sleep 30
        verify_service_recovery
    fi
}

# 检查集群健康状态
check_cluster_health() {
    log "检查集群健康状态"
    
    for node in "${ALERTMANAGER_CLUSTER[@]}"; do
        if curl -s "http://$node:9093/-/healthy" > /dev/null; then
            log "✓ 节点 $node 健康"
        else
            log "✗ 节点 $node 不健康"
        fi
    done
    
    # 检查集群状态
    local cluster_status
    cluster_status=$(curl -s "http://${ALERTMANAGER_CLUSTER[0]}:9093/api/v1/status" | jq -r '.data.cluster.status')
    log "集群状态: $cluster_status"
}

# 检查服务健康状态
check_service_health() {
    if curl -s "http://localhost:9093/-/healthy" > /dev/null; then
        log "✓ Alertmanager 服务健康"
        return 0
    else
        log "✗ Alertmanager 服务不健康"
        return 1
    fi
}

# 验证节点恢复
verify_node_recovery() {
    local node="$1"
    
    log "验证节点 $node 恢复状态"
    
    # 检查节点健康
    if curl -s "http://$node:9093/-/healthy" > /dev/null; then
        log "✓ 节点 $node 健康检查通过"
    else
        log "✗ 节点 $node 健康检查失败"
        return 1
    fi
    
    # 检查集群成员
    local members
    members=$(curl -s "http://$node:9093/api/v1/status" | jq -r '.data.cluster.peers | length')
    log "集群成员数量: $members"
    
    if [[ $members -eq ${#ALERTMANAGER_CLUSTER[@]} ]]; then
        log "✓ 集群成员完整"
    else
        log "✗ 集群成员不完整"
    fi
}

# 验证集群恢复
verify_cluster_recovery() {
    log "验证集群恢复状态"
    
    # 等待集群同步
    sleep 60
    
    # 检查所有节点
    local healthy_nodes=0
    for node in "${ALERTMANAGER_CLUSTER[@]}"; do
        if curl -s "http://$node:9093/-/healthy" > /dev/null; then
            ((healthy_nodes++))
        fi
    done
    
    log "健康节点数量: $healthy_nodes/${#ALERTMANAGER_CLUSTER[@]}"
    
    if [[ $healthy_nodes -eq ${#ALERTMANAGER_CLUSTER[@]} ]]; then
        log "✓ 集群完全恢复"
    else
        log "✗ 集群恢复不完整"
    fi
}

# 验证服务恢复
verify_service_recovery() {
    log "验证服务恢复状态"
    
    if check_service_health; then
        log "✓ 服务恢复成功"
        
        # 检查配置
        if curl -s "http://localhost:9093/api/v1/status" | jq -e '.data.configYAML' > /dev/null; then
            log "✓ 配置加载成功"
        else
            log "✗ 配置加载失败"
        fi
    else
        log "✗ 服务恢复失败"
    fi
}

# 生成故障报告
generate_drill_report() {
    log "=== 故障演练报告 ==="
    log "演练时间: $(date)"
    log "演练场景: $1"
    
    # 统计日志中的成功/失败
    local success_count
    local failure_count
    
    success_count=$(grep -c "✓" "$LOG_FILE")
    failure_count=$(grep -c "✗" "$LOG_FILE")
    
    log "成功检查: $success_count"
    log "失败检查: $failure_count"
    
    if [[ $failure_count -eq 0 ]]; then
        log "演练结果: 通过"
    else
        log "演练结果: 需要改进"
    fi
}

# 主函数
main() {
    local scenario="${1:-all}"
    
    log "=== 开始故障恢复演练 ==="
    
    case "$scenario" in
        "node-failure")
            simulate_node_failure "${ALERTMANAGER_CLUSTER[0]}"
            generate_drill_report "节点故障"
            ;;
        "network-partition")
            simulate_network_partition
            generate_drill_report "网络分区"
            ;;
        "config-corruption")
            simulate_config_corruption
            generate_drill_report "配置损坏"
            ;;
        "data-corruption")
            simulate_data_corruption
            generate_drill_report "数据损坏"
            ;;
        "all")
            simulate_node_failure "${ALERTMANAGER_CLUSTER[0]}"
            sleep 300  # 5分钟间隔
            simulate_config_corruption
            sleep 300
            simulate_data_corruption
            generate_drill_report "全面演练"
            ;;
        *)
            echo "Usage: $0 {node-failure|network-partition|config-corruption|data-corruption|all}"
            exit 1
            ;;
    esac
    
    log "=== 故障恢复演练完成 ==="
}

main "$@"

10.5 本章小结

实战经验总结

本章通过实际案例展示了 Alertmanager 在企业环境中的应用,涵盖了以下关键实践:

  1. 企业级解决方案设计

    • 完整的监控架构规划
    • 多层次告警路由策略
    • 企业级配置管理
  2. 多环境部署实践

    • 环境配置标准化
    • 自动化部署流程
    • Kubernetes 原生部署
  3. 性能调优实战

    • 性能指标监控分析
    • 负载测试与优化
    • 资源使用优化
  4. 故障处理演练

    • 常见故障场景模拟
    • 自动化恢复流程
    • 灾难恢复验证

核心实践要点

| 实践领域 | 关键技术 | 核心价值 | 实施难度 |
| --- | --- | --- | --- |
| 架构设计 | 分层设计、路由策略 | 可扩展性、可维护性 |  |
| 多环境部署 | 配置管理、自动化 | 一致性、效率 |  |
| 性能优化 | 监控分析、负载测试 | 稳定性、性能 |  |
| 故障处理 | 自动化恢复、演练 | 可靠性、恢复能力 |  |

最佳实践建议

  1. 架构设计原则

    • 采用分层架构,职责清晰
    • 实施松耦合设计,便于扩展
    • 考虑高可用和容灾需求
  2. 运维管理策略

    • 建立标准化的部署流程
    • 实施持续监控和优化
    • 定期进行故障演练
  3. 团队协作模式

    • 建立明确的告警责任制
    • 实施有效的沟通机制
    • 持续改进和知识分享
  4. 技术选型考虑

    • 优先选择成熟稳定的技术
    • 考虑团队技术栈匹配度
    • 评估长期维护成本

项目实施路径

  1. 阶段一:基础建设(1-2周)

    • 环境搭建和基础配置
    • 核心告警规则配置
    • 基本通知渠道配置
  2. 阶段二:功能完善(2-3周)

    • 高级路由策略实施
    • 多通知渠道集成
    • 告警抑制和静默配置
  3. 阶段三:高可用部署(1-2周)

    • 集群部署和配置
    • 负载均衡和故障转移
    • 数据备份和恢复
  4. 阶段四:监控和优化(持续)

    • 性能监控和分析
    • 持续优化和调整
    • 故障演练和改进

成功关键因素

  1. 技术因素

    • 合理的架构设计
    • 完善的监控体系
    • 自动化运维能力
  2. 管理因素

    • 明确的项目目标
    • 充分的资源投入
    • 有效的项目管理
  3. 人员因素

    • 专业的技术团队
    • 良好的协作文化
    • 持续学习能力

总结

通过本教程的学习,您已经掌握了 Alertmanager 从基础概念到企业级应用的完整知识体系。从安装部署到高级配置,从单机部署到集群管理,从基本使用到性能优化,每个环节都提供了详细的指导和实战案例。

希望这些知识和经验能够帮助您在实际项目中构建稳定、高效、可靠的告警管理系统,为企业的监控体系建设贡献价值。

记住,监控和告警系统的建设是一个持续改进的过程,需要根据业务发展和技术演进不断优化和完善。保持学习和实践的态度,您将能够应对各种挑战,构建出色的监控解决方案。