7.1 高可用架构概述
高可用需求分析
在生产环境中,Alertmanager 的高可用性至关重要,需要考虑以下方面:
flowchart TD
A[高可用需求] --> B[服务可用性]
A --> C[数据一致性]
A --> D[故障恢复]
A --> E[负载分担]
B --> B1[99.9% 可用性]
B --> B2[零单点故障]
B --> B3[自动故障转移]
C --> C1[告警状态同步]
C --> C2[静默规则同步]
C --> C3[配置一致性]
D --> D1[快速故障检测]
D --> D2[自动恢复机制]
D --> D3[数据备份恢复]
E --> E1[请求负载均衡]
E --> E2[处理能力扩展]
E --> E3[地理分布部署]
集群架构模式
| 架构模式 | 节点数量 | 一致性 | 复杂度 | 适用场景 |
| --- | --- | --- | --- | --- |
| 主备模式 | 2 | 强一致 | 低 | 小规模环境 |
| 多主模式 | 3+ | 最终一致 | 中 | 中等规模 |
| 分片模式 | 6+ | 分区一致 | 高 | 大规模环境 |
| 混合模式 | 9+ | 分层一致 | 很高 | 企业级部署 |
集群通信机制
sequenceDiagram
participant A as Alertmanager-1
participant B as Alertmanager-2
participant C as Alertmanager-3
participant P as Prometheus
P->>A: 发送告警
A->>B: 同步告警状态
A->>C: 同步告警状态
B->>A: 确认同步
C->>A: 确认同步
A->>P: 返回接收确认
Note over A,C: Gossip 协议同步
B->>A: 心跳检测
C->>A: 心跳检测
A->>B: 心跳响应
A->>C: 心跳响应
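想要直观地确认 Gossip 同步的结果,可以直接查询各节点的 status 接口,比较每个节点眼中的集群成员是否一致。下面是一个最小化的 Python 示意脚本(节点地址为假设值,返回结构以 v2 API 为例):
# peer_view.py —— 查看各节点感知到的集群成员(示意脚本,节点地址为假设值)
import requests
NODES = [
    "alertmanager-1.example.com:9093",
    "alertmanager-2.example.com:9093",
    "alertmanager-3.example.com:9093",
]
for node in NODES:
    try:
        # /api/v2/status 中的 cluster 字段包含集群状态与 peers 列表
        resp = requests.get(f"http://{node}/api/v2/status", timeout=5)
        resp.raise_for_status()
        cluster = resp.json().get("cluster", {})
        peers = [p.get("address") for p in cluster.get("peers", [])]
        print(f"{node}: status={cluster.get('status')}, peers={peers}")
    except requests.RequestException as exc:
        print(f"{node}: 查询失败 - {exc}")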
7.2 集群配置基础
基础集群配置
# alertmanager-cluster.yml
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
# 集群配置
# 注意:集群参数不属于 alertmanager.yml,而是通过 Alertmanager 的启动标志传入,
# 与本配置文件配合使用,对应的标志如下:
#   --cluster.listen-address=0.0.0.0:9094           # 集群监听地址
#   --cluster.peer=alertmanager-1.example.com:9094   # 对等节点,可重复多次
#   --cluster.peer=alertmanager-2.example.com:9094
#   --cluster.peer=alertmanager-3.example.com:9094
#   --cluster.label=production-cluster               # 集群标识(较新版本支持)
#   --cluster.gossip-interval=200ms                  # Gossip 间隔
#   --cluster.pushpull-interval=60s                  # 推拉同步间隔
#   --cluster.tcp-timeout=10s                        # TCP 连接超时
#   --cluster.probe-timeout=500ms                    # 探测超时
#   --cluster.probe-interval=1s                      # 探测间隔
#   --cluster.reconnect-interval=10s                 # 重连间隔
#   --cluster.reconnect-timeout=6h                   # 重连超时
# 路由配置
route:
receiver: 'default'
group_by: ['cluster', 'alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
# 接收器配置
receivers:
- name: 'default'
email_configs:
- to: 'team@example.com'
    # email_config 不支持 subject/body 字段,主题放在 headers 中,正文使用 text/html
    headers:
      Subject: '[{{ .Status }}] {{ .GroupLabels.alertname }}'
    text: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Instance: {{ .Labels.instance }}
{{ end }}
# 抑制规则
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['instance']
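配置修改后,建议先用 amtool 做语法校验,再分发到各个集群节点。下面是一个调用 amtool check-config 的简单脚本示意(假设 amtool 已安装且在 PATH 中,文件名沿用上文示例):
# check_config.py —— 用 amtool 校验 Alertmanager 配置文件(示意)
import subprocess
import sys
CONFIG_FILE = "alertmanager-cluster.yml"  # 沿用上文示例文件名
# amtool check-config 会校验配置语法以及引用的模板文件
result = subprocess.run(
    ["amtool", "check-config", CONFIG_FILE],
    capture_output=True,
    text=True,
)
print(result.stdout)
if result.returncode != 0:
    print(result.stderr, file=sys.stderr)
    sys.exit("配置校验失败,请修复后再下发到集群节点")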
Docker Compose 集群部署
# docker-compose-cluster.yml
version: '3.8'
services:
alertmanager-1:
image: prom/alertmanager:latest
container_name: alertmanager-1
hostname: alertmanager-1
ports:
- "9093:9093"
- "9094:9094"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager-1-data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.listen-address=0.0.0.0:9093'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=alertmanager-2:9094'
- '--cluster.peer=alertmanager-3:9094'
- '--log.level=debug'
networks:
- alertmanager-cluster
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
alertmanager-2:
image: prom/alertmanager:latest
container_name: alertmanager-2
hostname: alertmanager-2
ports:
- "9193:9093"
- "9194:9094"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager-2-data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.listen-address=0.0.0.0:9093'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=alertmanager-1:9094'
- '--cluster.peer=alertmanager-3:9094'
- '--log.level=debug'
networks:
- alertmanager-cluster
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
alertmanager-3:
image: prom/alertmanager:latest
container_name: alertmanager-3
hostname: alertmanager-3
ports:
- "9293:9093"
- "9294:9094"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager-3-data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.listen-address=0.0.0.0:9093'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=alertmanager-1:9094'
- '--cluster.peer=alertmanager-2:9094'
- '--log.level=debug'
networks:
- alertmanager-cluster
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
# 负载均衡器
nginx:
image: nginx:alpine
container_name: alertmanager-lb
ports:
- "80:80"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf
depends_on:
- alertmanager-1
- alertmanager-2
- alertmanager-3
networks:
- alertmanager-cluster
restart: unless-stopped
volumes:
alertmanager-1-data:
alertmanager-2-data:
alertmanager-3-data:
networks:
alertmanager-cluster:
driver: bridge
ipam:
config:
- subnet: 172.20.0.0/16
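集群启动后,可以用下面的小脚本依次访问三个实例映射到本机的端口,快速确认健康与就绪状态(端口与上面的 docker-compose 示例一致):
# compose_health.py —— 检查 docker-compose 集群各实例的健康端点(示意)
import requests
INSTANCES = {
    "alertmanager-1": "http://localhost:9093",
    "alertmanager-2": "http://localhost:9193",
    "alertmanager-3": "http://localhost:9293",
}
for name, base in INSTANCES.items():
    try:
        healthy = requests.get(f"{base}/-/healthy", timeout=5).status_code == 200
        ready = requests.get(f"{base}/-/ready", timeout=5).status_code == 200
        print(f"{name}: healthy={healthy}, ready={ready}")
    except requests.RequestException as exc:
        print(f"{name}: 无法访问 - {exc}")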
Nginx 负载均衡配置
# nginx.conf
events {
worker_connections 1024;
}
http {
upstream alertmanager_cluster {
# 健康检查和负载均衡
server alertmanager-1:9093 max_fails=3 fail_timeout=30s;
server alertmanager-2:9093 max_fails=3 fail_timeout=30s;
server alertmanager-3:9093 max_fails=3 fail_timeout=30s;
}
# 日志格式
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for" '
'upstream_addr=$upstream_addr '
'upstream_response_time=$upstream_response_time';
access_log /var/log/nginx/access.log main;
error_log /var/log/nginx/error.log warn;
server {
listen 80;
server_name alertmanager.example.com;
# 健康检查端点
location /-/healthy {
proxy_pass http://alertmanager_cluster;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# 健康检查超时设置
proxy_connect_timeout 5s;
proxy_send_timeout 5s;
proxy_read_timeout 5s;
}
# API 端点(需要会话保持)
location /api/ {
proxy_pass http://alertmanager_cluster;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
            # 会话保持:ip_hash 是 upstream 级指令,不能写在 location 中;
            # 如需会话保持,应在 upstream 块中配置 ip_hash 或 hash $remote_addr consistent
# 超时设置
proxy_connect_timeout 10s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
}
# Web UI(可以轮询)
location / {
proxy_pass http://alertmanager_cluster;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# WebSocket 支持
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
# 超时设置
proxy_connect_timeout 10s;
proxy_send_timeout 60s;
proxy_read_timeout 60s;
}
# 状态页面
location /nginx_status {
stub_status on;
access_log off;
allow 127.0.0.1;
allow 172.20.0.0/16;
deny all;
}
}
}
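调整 nginx.conf 后,建议先做语法检查再平滑重载,避免把错误配置加载到线上。下面是一个示意脚本,容器名沿用上文 docker-compose 中的 alertmanager-lb:
# reload_nginx.py —— 校验并平滑重载负载均衡配置(示意,容器名沿用上文示例)
import subprocess
CONTAINER = "alertmanager-lb"
# nginx -t 只做语法与文件检查,不影响正在运行的 worker
check = subprocess.run(["docker", "exec", CONTAINER, "nginx", "-t"],
                       capture_output=True, text=True)
print(check.stdout or check.stderr)
if check.returncode == 0:
    # 语法通过后再发送 reload 信号,worker 平滑切换到新配置
    subprocess.run(["docker", "exec", CONTAINER, "nginx", "-s", "reload"], check=True)
    print("nginx 已平滑重载")
else:
    print("配置有误,已跳过重载")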
7.3 Kubernetes 集群部署
StatefulSet 部署
# alertmanager-statefulset.yaml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
---
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
namespace: monitoring
data:
alertmanager.yml: |
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
route:
receiver: 'default'
group_by: ['cluster', 'alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
receivers:
- name: 'default'
email_configs:
- to: 'team@example.com'
        # email_config 不支持 subject/body 字段,主题放在 headers 中,正文使用 text/html
        headers:
          Subject: '[{{ .Status }}] {{ .GroupLabels.alertname }}'
        text: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Instance: {{ .Labels.instance }}
{{ end }}
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['instance']
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: alertmanager
namespace: monitoring
labels:
app: alertmanager
spec:
serviceName: alertmanager-headless
replicas: 3
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
serviceAccountName: alertmanager
securityContext:
runAsUser: 65534
runAsGroup: 65534
fsGroup: 65534
containers:
- name: alertmanager
image: prom/alertmanager:v0.26.0
ports:
- containerPort: 9093
name: web
- containerPort: 9094
name: cluster
args:
- --config.file=/etc/alertmanager/alertmanager.yml
- --storage.path=/alertmanager
- --web.listen-address=0.0.0.0:9093
- --cluster.listen-address=0.0.0.0:9094
- --cluster.peer=alertmanager-0.alertmanager-headless.monitoring.svc.cluster.local:9094
- --cluster.peer=alertmanager-1.alertmanager-headless.monitoring.svc.cluster.local:9094
- --cluster.peer=alertmanager-2.alertmanager-headless.monitoring.svc.cluster.local:9094
- --log.level=info
- --web.external-url=http://alertmanager.example.com
env:
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
volumeMounts:
- name: config
mountPath: /etc/alertmanager
- name: storage
mountPath: /alertmanager
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "200m"
livenessProbe:
httpGet:
path: /-/healthy
port: 9093
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /-/ready
port: 9093
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 5
volumes:
- name: config
configMap:
name: alertmanager-config
volumeClaimTemplates:
- metadata:
name: storage
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: "fast-ssd"
resources:
requests:
storage: 10Gi
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager-headless
namespace: monitoring
labels:
app: alertmanager
spec:
clusterIP: None
ports:
- port: 9093
targetPort: 9093
name: web
- port: 9094
targetPort: 9094
name: cluster
selector:
app: alertmanager
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: monitoring
labels:
app: alertmanager
spec:
type: ClusterIP
ports:
- port: 9093
targetPort: 9093
name: web
selector:
app: alertmanager
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: alertmanager
rules:
- apiGroups: [""]
resources: ["nodes", "services", "endpoints", "pods"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: alertmanager
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: alertmanager
subjects:
- kind: ServiceAccount
name: alertmanager
namespace: monitoring
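StatefulSet 中的 --cluster.peer 依赖 Headless Service 提供的稳定 DNS 名称,格式为 <pod>.<service>.<namespace>.svc.cluster.local。下面的小脚本按同样的命名规则生成 peer 参数,调整副本数时可以用它同步更新启动参数(仅为示意):
# gen_peers.py —— 按 StatefulSet 命名规则生成 --cluster.peer 启动参数(示意)
def cluster_peer_flags(name="alertmanager",
                       service="alertmanager-headless",
                       namespace="monitoring",
                       replicas=3,
                       port=9094):
    """生成 <pod>.<headless-service>.<namespace>.svc.cluster.local:<port> 形式的 peer 参数"""
    return [
        f"--cluster.peer={name}-{i}.{service}.{namespace}.svc.cluster.local:{port}"
        for i in range(replicas)
    ]
if __name__ == "__main__":
    for flag in cluster_peer_flags():
        print(flag)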
Ingress 配置
# alertmanager-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager-ingress
namespace: monitoring
annotations:
kubernetes.io/ingress.class: "nginx"
nginx.ingress.kubernetes.io/rewrite-target: /
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    # 负载均衡配置(如需会话保持,可改用 nginx.ingress.kubernetes.io/upstream-hash-by: "$remote_addr")
    nginx.ingress.kubernetes.io/load-balance: "round_robin"
    # 健康检查由 Pod 的 readinessProbe 负责,开源版 ingress-nginx 不提供主动健康检查注解
    # 超时设置
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "10"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
    # 认证(可选)
    nginx.ingress.kubernetes.io/auth-type: basic
    nginx.ingress.kubernetes.io/auth-secret: alertmanager-auth
    nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
    # 限流(可选,每秒请求数)
    nginx.ingress.kubernetes.io/limit-rps: "100"
spec:
tls:
- hosts:
- alertmanager.example.com
secretName: alertmanager-tls
rules:
- host: alertmanager.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: alertmanager
port:
number: 9093
---
# 认证密钥(可选)
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-auth
namespace: monitoring
type: Opaque
data:
auth: YWRtaW46JGFwcjEkSDY1dnVhNzAkLnRiTXhPbGRBWVBSMWJUcWJKdXN2MQo= # admin:admin
---
# TLS 证书(可选)
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-tls
namespace: monitoring
type: kubernetes.io/tls
data:
tls.crt: LS0tLS1CRUdJTi... # base64 encoded certificate
tls.key: LS0tLS1CRUdJTi... # base64 encoded private key
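上面 alertmanager-auth 中的 auth 字段,是 htpasswd 输出再经 base64 编码的结果。下面的脚本演示了生成过程(假设系统已安装 htpasswd,用户名和密码仅为示例,生产环境请勿使用 admin/admin):
# gen_basic_auth.py —— 生成 Basic Auth Secret 的 auth 字段(示意)
import base64
import subprocess
USER, PASSWORD = "admin", "admin"  # 示例值,生产环境请替换
# htpasswd -nb 输出 "user:hash" 形式的一行凭据
htpasswd_line = subprocess.run(
    ["htpasswd", "-nb", USER, PASSWORD],
    capture_output=True, text=True, check=True,
).stdout.strip()
# Secret 的 data 字段要求 base64 编码
print(base64.b64encode(htpasswd_line.encode()).decode())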
HPA 自动扩缩容
# alertmanager-hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: alertmanager-hpa
namespace: monitoring
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: StatefulSet
name: alertmanager
minReplicas: 3
maxReplicas: 9
metrics:
# CPU 使用率
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
# 内存使用率
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
# 自定义指标:告警处理速率
- type: Pods
pods:
metric:
name: alertmanager_alerts_received_total
target:
type: AverageValue
averageValue: "100"
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 50
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 60
policies:
- type: Percent
value: 100
periodSeconds: 60
- type: Pods
value: 2
periodSeconds: 60
selectPolicy: Max
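扩缩容之后,应确认新的副本已经就绪并加入 gossip 集群(各实例的 alertmanager_cluster_members 指标应等于当前副本数)。下面是一个通过 kubectl 查看 HPA 与 StatefulSet 副本数的示意脚本(假设本机已配置了访问该集群的 kubectl 权限):
# check_scale.py —— 查看 HPA 与 StatefulSet 当前副本数(示意,依赖本机 kubectl 权限)
import json
import subprocess
def kubectl_json(*args):
    out = subprocess.run(["kubectl", *args, "-n", "monitoring", "-o", "json"],
                         capture_output=True, text=True, check=True).stdout
    return json.loads(out)
hpa = kubectl_json("get", "hpa", "alertmanager-hpa")
sts = kubectl_json("get", "statefulset", "alertmanager")
print("HPA desired/current:",
      hpa["status"].get("desiredReplicas"), "/", hpa["status"].get("currentReplicas"))
print("StatefulSet ready/replicas:",
      sts["status"].get("readyReplicas"), "/", sts["spec"]["replicas"])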
7.4 数据同步与一致性
Gossip 协议配置
# 高级集群配置
# 注意:集群参数是 Alertmanager 的启动标志,不能写在 alertmanager.yml 中
# --cluster.listen-address      集群监听地址
# --cluster.advertise-address   对外通告地址(NAT/容器环境需显式指定外部可访问地址)
# --cluster.peer                对等节点,可重复多次
# --cluster.gossip-interval     Gossip 间隔
# --cluster.pushpull-interval   全量状态推拉同步间隔
# --cluster.tcp-timeout         TCP 连接超时
# --cluster.probe-timeout       故障探测超时
# --cluster.probe-interval      故障探测间隔
# --cluster.reconnect-interval  重连间隔
# --cluster.reconnect-timeout   放弃重连前的最长等待时间
# --cluster.label               集群标识,防止跨集群 gossip(较新版本支持)
# --cluster.tls-config          gossip 传输 mTLS 配置文件(实验特性,文件格式参考官方文档)
alertmanager \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --cluster.listen-address=0.0.0.0:9094 \
  --cluster.advertise-address=192.168.1.10:9094 \
  --cluster.peer=192.168.1.11:9094 \
  --cluster.peer=192.168.1.12:9094 \
  --cluster.gossip-interval=200ms \
  --cluster.pushpull-interval=60s \
  --cluster.tcp-timeout=10s \
  --cluster.probe-timeout=500ms \
  --cluster.probe-interval=1s \
  --cluster.reconnect-interval=10s \
  --cluster.reconnect-timeout=6h \
  --cluster.label=production \
  --cluster.tls-config=/etc/alertmanager/cluster_tls.yml
数据同步监控
# cluster_monitor.py
import requests
import json
import time
from datetime import datetime
import logging
from typing import List, Dict, Any
class AlertmanagerClusterMonitor:
def __init__(self, nodes: List[str]):
self.nodes = nodes
self.logger = logging.getLogger(__name__)
def get_cluster_status(self, node: str) -> Dict[str, Any]:
"""获取节点集群状态"""
try:
response = requests.get(
f"http://{node}/api/v1/status",
timeout=10
)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
self.logger.error(f"Failed to get status from {node}: {e}")
return {}
    def get_cluster_peers(self, node: str) -> List[Dict[str, Any]]:
        """获取节点的对等节点信息(v1 API 没有独立的 peers 端点,从 /api/v1/status 的 cluster 字段读取)"""
        try:
            response = requests.get(
                f"http://{node}/api/v1/status",
                timeout=10
            )
            response.raise_for_status()
            return response.json().get('data', {}).get('cluster', {}).get('peers', [])
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to get peers from {node}: {e}")
            return []
def get_alerts(self, node: str) -> List[Dict[str, Any]]:
"""获取节点的告警信息"""
try:
response = requests.get(
f"http://{node}/api/v1/alerts",
timeout=10
)
response.raise_for_status()
return response.json().get('data', [])
except requests.exceptions.RequestException as e:
self.logger.error(f"Failed to get alerts from {node}: {e}")
return []
def get_silences(self, node: str) -> List[Dict[str, Any]]:
"""获取节点的静默信息"""
try:
response = requests.get(
f"http://{node}/api/v1/silences",
timeout=10
)
response.raise_for_status()
return response.json().get('data', [])
except requests.exceptions.RequestException as e:
self.logger.error(f"Failed to get silences from {node}: {e}")
return []
def check_cluster_health(self) -> Dict[str, Any]:
"""检查集群健康状态"""
health_report = {
'timestamp': datetime.utcnow().isoformat(),
'total_nodes': len(self.nodes),
'healthy_nodes': 0,
'unhealthy_nodes': [],
'cluster_consistent': True,
'data_consistency': {
'alerts_consistent': True,
'silences_consistent': True,
'peers_consistent': True
},
'nodes_status': {}
}
node_data = {}
# 收集所有节点数据
for node in self.nodes:
self.logger.info(f"Checking node: {node}")
status = self.get_cluster_status(node)
peers = self.get_cluster_peers(node)
alerts = self.get_alerts(node)
silences = self.get_silences(node)
if status:
health_report['healthy_nodes'] += 1
node_data[node] = {
'status': status,
'peers': peers,
'alerts': alerts,
'silences': silences,
'alert_count': len(alerts),
'silence_count': len(silences),
'peer_count': len(peers)
}
health_report['nodes_status'][node] = {
'healthy': True,
'alert_count': len(alerts),
'silence_count': len(silences),
'peer_count': len(peers)
}
else:
health_report['unhealthy_nodes'].append(node)
health_report['nodes_status'][node] = {
'healthy': False,
'error': 'Unable to connect'
}
# 检查数据一致性
if len(node_data) > 1:
self.check_data_consistency(node_data, health_report)
return health_report
def check_data_consistency(self, node_data: Dict, health_report: Dict):
"""检查数据一致性"""
nodes = list(node_data.keys())
# 检查告警一致性
alert_counts = [node_data[node]['alert_count'] for node in nodes]
if len(set(alert_counts)) > 1:
health_report['data_consistency']['alerts_consistent'] = False
health_report['cluster_consistent'] = False
self.logger.warning(f"Alert count inconsistency: {dict(zip(nodes, alert_counts))}")
# 检查静默一致性
silence_counts = [node_data[node]['silence_count'] for node in nodes]
if len(set(silence_counts)) > 1:
health_report['data_consistency']['silences_consistent'] = False
health_report['cluster_consistent'] = False
self.logger.warning(f"Silence count inconsistency: {dict(zip(nodes, silence_counts))}")
# 检查对等节点一致性
peer_counts = [node_data[node]['peer_count'] for node in nodes]
if len(set(peer_counts)) > 1:
health_report['data_consistency']['peers_consistent'] = False
health_report['cluster_consistent'] = False
self.logger.warning(f"Peer count inconsistency: {dict(zip(nodes, peer_counts))}")
def generate_health_report(self) -> str:
"""生成健康报告"""
health = self.check_cluster_health()
report = f"""
=== Alertmanager 集群健康报告 ===
时间: {health['timestamp']}
总节点数: {health['total_nodes']}
健康节点数: {health['healthy_nodes']}
集群一致性: {'✅ 正常' if health['cluster_consistent'] else '❌ 异常'}
节点状态:
"""
for node, status in health['nodes_status'].items():
if status['healthy']:
report += f" ✅ {node}: 健康 (告警: {status['alert_count']}, 静默: {status['silence_count']}, 对等: {status['peer_count']})\n"
else:
report += f" ❌ {node}: 不健康 - {status.get('error', 'Unknown error')}\n"
if health['unhealthy_nodes']:
report += f"\n不健康节点: {', '.join(health['unhealthy_nodes'])}\n"
consistency = health['data_consistency']
report += f"""
数据一致性:
告警一致性: {'✅' if consistency['alerts_consistent'] else '❌'}
静默一致性: {'✅' if consistency['silences_consistent'] else '❌'}
对等节点一致性: {'✅' if consistency['peers_consistent'] else '❌'}
"""
return report
def monitor_loop(self, interval: int = 60):
"""监控循环"""
self.logger.info(f"Starting cluster monitoring loop (interval: {interval}s)")
while True:
try:
report = self.generate_health_report()
print(report)
# 检查是否需要告警
health = self.check_cluster_health()
if not health['cluster_consistent'] or health['unhealthy_nodes']:
self.send_alert(health)
time.sleep(interval)
except KeyboardInterrupt:
self.logger.info("Monitoring stopped by user")
break
except Exception as e:
self.logger.error(f"Error in monitoring loop: {e}")
time.sleep(interval)
def send_alert(self, health_data: Dict):
"""发送集群异常告警"""
# 这里可以集成各种告警渠道
self.logger.error(f"Cluster health issue detected: {health_data}")
# 示例:发送到 Slack
# slack_webhook = "https://hooks.slack.com/services/..."
# requests.post(slack_webhook, json={"text": f"Alertmanager cluster issue: {health_data}"})
# 使用示例
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
# 配置集群节点
cluster_nodes = [
"alertmanager-1.example.com:9093",
"alertmanager-2.example.com:9093",
"alertmanager-3.example.com:9093"
]
# 创建监控器
monitor = AlertmanagerClusterMonitor(cluster_nodes)
# 生成一次性报告
print(monitor.generate_health_report())
# 或者启动持续监控
# monitor.monitor_loop(interval=30)
7.5 故障转移与恢复
自动故障检测
#!/bin/bash
# failover-detection.sh
# 配置
ALERTMANAGER_NODES=(
"alertmanager-1.example.com:9093"
"alertmanager-2.example.com:9093"
"alertmanager-3.example.com:9093"
)
HEALTH_CHECK_INTERVAL=30
FAILURE_THRESHOLD=3
LOG_FILE="/var/log/alertmanager-failover.log"
# 故障计数器
declare -A failure_counts
# 日志函数
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a "$LOG_FILE"
}
# 健康检查函数
check_health() {
local node="$1"
local url="http://$node/-/healthy"
if curl -s --max-time 10 "$url" > /dev/null 2>&1; then
return 0 # 健康
else
return 1 # 不健康
fi
}
# 获取集群状态
get_cluster_status() {
local node="$1"
local url="http://$node/api/v1/status"
curl -s --max-time 10 "$url" 2>/dev/null
}
# 故障转移处理
handle_failover() {
local failed_node="$1"
log "CRITICAL: Node $failed_node has failed, initiating failover procedures"
# 1. 从负载均衡器移除故障节点
remove_from_load_balancer "$failed_node"
# 2. 通知监控系统
send_failover_alert "$failed_node"
# 3. 尝试重启故障节点
restart_node "$failed_node"
# 4. 检查集群是否还有足够的健康节点
check_cluster_quorum
}
# 从负载均衡器移除节点
remove_from_load_balancer() {
local node="$1"
log "Removing $node from load balancer"
# 示例:更新 Nginx upstream 配置
# sed -i "/server $node/s/^/#/" /etc/nginx/conf.d/alertmanager.conf
# nginx -s reload
# 示例:更新 HAProxy 配置
# echo "disable server alertmanager/$node" | socat stdio /var/run/haproxy.sock
# 示例:Kubernetes 中标记 Pod 为不可用
# kubectl patch pod alertmanager-pod --type='merge' -p='{"metadata":{"labels":{"health":"unhealthy"}}}'
}
# 发送故障转移告警
send_failover_alert() {
local failed_node="$1"
    # Alertmanager 的 alerts 接口接收的是告警对象数组,而不是 {"alerts": [...]} 包装
    local alert_data='[
      {
        "labels": {
          "alertname": "AlertmanagerNodeFailure",
          "instance": "'$failed_node'",
          "severity": "critical",
          "service": "alertmanager"
        },
        "annotations": {
          "summary": "Alertmanager node failure detected",
          "description": "Alertmanager node '$failed_node' has failed and failover procedures have been initiated"
        },
        "startsAt": "'$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)'"
      }
    ]'
# 发送到其他健康的 Alertmanager 节点
for node in "${ALERTMANAGER_NODES[@]}"; do
if [ "$node" != "$failed_node" ] && check_health "$node"; then
curl -XPOST "http://$node/api/v1/alerts" \
-H "Content-Type: application/json" \
-d "$alert_data" > /dev/null 2>&1
break
fi
done
}
# 重启节点
restart_node() {
local node="$1"
log "Attempting to restart $node"
# 根据部署方式选择重启方法
# Docker 方式
# docker restart alertmanager-container
# Systemd 方式
# systemctl restart alertmanager
# Kubernetes 方式
# kubectl delete pod -l app=alertmanager
# 等待重启完成
sleep 30
# 检查重启是否成功
if check_health "$node"; then
log "SUCCESS: Node $node restarted successfully"
failure_counts["$node"]=0
add_to_load_balancer "$node"
else
log "ERROR: Node $node restart failed"
fi
}
# 添加到负载均衡器
add_to_load_balancer() {
local node="$1"
log "Adding $node back to load balancer"
# 示例:恢复 Nginx upstream 配置
# sed -i "/server $node/s/^#//" /etc/nginx/conf.d/alertmanager.conf
# nginx -s reload
# 示例:恢复 HAProxy 配置
# echo "enable server alertmanager/$node" | socat stdio /var/run/haproxy.sock
}
# 检查集群法定人数
check_cluster_quorum() {
local healthy_count=0
for node in "${ALERTMANAGER_NODES[@]}"; do
if check_health "$node"; then
((healthy_count++))
fi
done
local total_nodes=${#ALERTMANAGER_NODES[@]}
local required_quorum=$((total_nodes / 2 + 1))
if [ $healthy_count -lt $required_quorum ]; then
log "CRITICAL: Cluster quorum lost! Healthy nodes: $healthy_count, Required: $required_quorum"
send_quorum_lost_alert
else
log "INFO: Cluster quorum maintained. Healthy nodes: $healthy_count"
fi
}
# 发送法定人数丢失告警
send_quorum_lost_alert() {
log "EMERGENCY: Alertmanager cluster quorum lost - manual intervention required"
# 发送紧急通知
# 可以集成短信、电话等紧急通知渠道
}
# 主监控循环
main_loop() {
log "Starting Alertmanager failover detection"
while true; do
for node in "${ALERTMANAGER_NODES[@]}"; do
if check_health "$node"; then
# 节点健康,重置故障计数
if [ "${failure_counts[$node]:-0}" -gt 0 ]; then
log "INFO: Node $node recovered"
failure_counts["$node"]=0
fi
else
# 节点不健康,增加故障计数
failure_counts["$node"]=$((${failure_counts[$node]:-0} + 1))
log "WARNING: Node $node health check failed (${failure_counts[$node]}/$FAILURE_THRESHOLD)"
# 达到故障阈值,触发故障转移
if [ "${failure_counts[$node]}" -ge $FAILURE_THRESHOLD ]; then
handle_failover "$node"
fi
fi
done
sleep $HEALTH_CHECK_INTERVAL
done
}
# 信号处理
trap 'log "Failover detection stopped"; exit 0' SIGTERM SIGINT
# 启动主循环
main_loop
数据恢复脚本
# data_recovery.py
import os
import json
import shutil
import time
import requests
from datetime import datetime, timedelta
from pathlib import Path
import logging
import argparse
class AlertmanagerDataRecovery:
def __init__(self, data_dir: str, backup_dir: str, cluster_nodes: list):
self.data_dir = Path(data_dir)
self.backup_dir = Path(backup_dir)
self.cluster_nodes = cluster_nodes
self.logger = logging.getLogger(__name__)
def create_backup(self) -> str:
"""创建数据备份"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = self.backup_dir / f"alertmanager_backup_{timestamp}"
try:
# 创建备份目录
backup_path.mkdir(parents=True, exist_ok=True)
# 备份数据文件
if self.data_dir.exists():
shutil.copytree(self.data_dir, backup_path / "data", dirs_exist_ok=True)
# 备份集群状态
cluster_state = self.get_cluster_state()
with open(backup_path / "cluster_state.json", 'w') as f:
json.dump(cluster_state, f, indent=2)
# 备份告警和静默
alerts = self.get_all_alerts()
silences = self.get_all_silences()
with open(backup_path / "alerts.json", 'w') as f:
json.dump(alerts, f, indent=2)
with open(backup_path / "silences.json", 'w') as f:
json.dump(silences, f, indent=2)
self.logger.info(f"Backup created successfully: {backup_path}")
return str(backup_path)
except Exception as e:
self.logger.error(f"Failed to create backup: {e}")
raise
def restore_from_backup(self, backup_path: str) -> bool:
"""从备份恢复数据"""
backup_dir = Path(backup_path)
if not backup_dir.exists():
self.logger.error(f"Backup directory does not exist: {backup_path}")
return False
try:
# 停止 Alertmanager 服务
self.stop_alertmanager_service()
# 备份当前数据(以防恢复失败)
current_backup = self.create_backup()
self.logger.info(f"Current data backed up to: {current_backup}")
# 恢复数据文件
data_backup = backup_dir / "data"
if data_backup.exists():
if self.data_dir.exists():
shutil.rmtree(self.data_dir)
shutil.copytree(data_backup, self.data_dir)
self.logger.info("Data files restored")
# 启动 Alertmanager 服务
self.start_alertmanager_service()
# 等待服务启动
self.wait_for_service_ready()
# 恢复静默规则
silences_file = backup_dir / "silences.json"
if silences_file.exists():
self.restore_silences(silences_file)
self.logger.info("Data restoration completed successfully")
return True
except Exception as e:
self.logger.error(f"Failed to restore from backup: {e}")
return False
def get_cluster_state(self) -> dict:
"""获取集群状态"""
cluster_state = {
'timestamp': datetime.utcnow().isoformat(),
'nodes': {}
}
for node in self.cluster_nodes:
try:
response = requests.get(f"http://{node}/api/v1/status", timeout=10)
if response.status_code == 200:
cluster_state['nodes'][node] = response.json()
else:
cluster_state['nodes'][node] = {'error': f'HTTP {response.status_code}'}
except Exception as e:
cluster_state['nodes'][node] = {'error': str(e)}
return cluster_state
def get_all_alerts(self) -> list:
"""获取所有告警"""
for node in self.cluster_nodes:
try:
response = requests.get(f"http://{node}/api/v1/alerts", timeout=10)
if response.status_code == 200:
return response.json().get('data', [])
except Exception as e:
self.logger.warning(f"Failed to get alerts from {node}: {e}")
continue
return []
def get_all_silences(self) -> list:
"""获取所有静默"""
for node in self.cluster_nodes:
try:
response = requests.get(f"http://{node}/api/v1/silences", timeout=10)
if response.status_code == 200:
return response.json().get('data', [])
except Exception as e:
self.logger.warning(f"Failed to get silences from {node}: {e}")
continue
return []
def restore_silences(self, silences_file: Path):
"""恢复静默规则"""
try:
with open(silences_file, 'r') as f:
silences = json.load(f)
for silence in silences:
# 只恢复活跃的静默
if silence.get('status', {}).get('state') == 'active':
self.create_silence_from_backup(silence)
self.logger.info(f"Restored {len(silences)} silences")
except Exception as e:
self.logger.error(f"Failed to restore silences: {e}")
def create_silence_from_backup(self, silence_data: dict):
"""从备份创建静默规则"""
# 构造新的静默数据
new_silence = {
'matchers': silence_data.get('matchers', []),
'startsAt': datetime.utcnow().isoformat() + 'Z',
'endsAt': silence_data.get('endsAt'),
'createdBy': 'data-recovery',
'comment': f"Restored from backup: {silence_data.get('comment', '')}"
}
# 发送到健康的节点
for node in self.cluster_nodes:
try:
response = requests.post(
f"http://{node}/api/v1/silences",
json=new_silence,
timeout=10
)
if response.status_code == 200:
self.logger.info(f"Restored silence: {silence_data.get('id')}")
break
except Exception as e:
self.logger.warning(f"Failed to restore silence to {node}: {e}")
continue
def stop_alertmanager_service(self):
"""停止 Alertmanager 服务"""
# 根据部署方式选择停止方法
os.system("systemctl stop alertmanager")
# 或者: os.system("docker stop alertmanager")
# 或者: os.system("kubectl scale deployment alertmanager --replicas=0")
def start_alertmanager_service(self):
"""启动 Alertmanager 服务"""
# 根据部署方式选择启动方法
os.system("systemctl start alertmanager")
# 或者: os.system("docker start alertmanager")
# 或者: os.system("kubectl scale deployment alertmanager --replicas=3")
def wait_for_service_ready(self, timeout: int = 120):
"""等待服务就绪"""
start_time = datetime.now()
while (datetime.now() - start_time).seconds < timeout:
for node in self.cluster_nodes:
try:
response = requests.get(f"http://{node}/-/ready", timeout=5)
if response.status_code == 200:
self.logger.info(f"Service ready on {node}")
return True
except:
pass
time.sleep(5)
raise TimeoutError("Service did not become ready within timeout")
def cleanup_old_backups(self, keep_days: int = 30):
"""清理旧备份"""
cutoff_date = datetime.now() - timedelta(days=keep_days)
for backup_dir in self.backup_dir.glob("alertmanager_backup_*"):
try:
# 从目录名提取时间戳
timestamp_str = backup_dir.name.split('_')[-2] + '_' + backup_dir.name.split('_')[-1]
backup_date = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
if backup_date < cutoff_date:
shutil.rmtree(backup_dir)
self.logger.info(f"Removed old backup: {backup_dir}")
except Exception as e:
self.logger.warning(f"Failed to process backup {backup_dir}: {e}")
def main():
parser = argparse.ArgumentParser(description='Alertmanager Data Recovery Tool')
parser.add_argument('--action', choices=['backup', 'restore', 'cleanup'], required=True,
help='Action to perform')
parser.add_argument('--data-dir', default='/var/lib/alertmanager',
help='Alertmanager data directory')
parser.add_argument('--backup-dir', default='/var/backups/alertmanager',
help='Backup directory')
parser.add_argument('--backup-path', help='Backup path for restore operation')
parser.add_argument('--nodes', nargs='+',
default=['localhost:9093'],
help='Alertmanager cluster nodes')
parser.add_argument('--keep-days', type=int, default=30,
help='Days to keep backups for cleanup')
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
recovery = AlertmanagerDataRecovery(
data_dir=args.data_dir,
backup_dir=args.backup_dir,
cluster_nodes=args.nodes
)
if args.action == 'backup':
backup_path = recovery.create_backup()
print(f"Backup created: {backup_path}")
elif args.action == 'restore':
if not args.backup_path:
print("Error: --backup-path is required for restore operation")
return 1
success = recovery.restore_from_backup(args.backup_path)
if success:
print("Restore completed successfully")
else:
print("Restore failed")
return 1
elif args.action == 'cleanup':
recovery.cleanup_old_backups(args.keep_days)
print(f"Cleanup completed (kept {args.keep_days} days)")
return 0
if __name__ == '__main__':
exit(main())
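使用时直接通过上述命令行参数调用即可,例如 `python data_recovery.py --action backup --backup-dir /var/backups/alertmanager --nodes alertmanager-1.example.com:9093` 创建备份,恢复时改为 `--action restore --backup-path <备份目录>`。注意脚本中的 systemctl/docker/kubectl 启停命令只是占位示例,需要按实际部署方式调整。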
7.6 性能优化与扩展
集群性能调优
# 高性能集群配置
global:
# SMTP 连接池
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
smtp_hello: 'alertmanager.example.com'
  # HTTP 客户端配置
  # (Alertmanager 的 http_config 只支持代理、TLS、认证、重定向、HTTP/2 等字段,
  #  连接池参数由客户端内部管理,不可配置)
  http_config:
    follow_redirects: true
    enable_http2: true
# 集群优化(通过启动标志传入,不能写在 alertmanager.yml 中)
#   --cluster.listen-address=0.0.0.0:9094
#   网络优化
#   --cluster.tcp-timeout=5s
#   --cluster.probe-timeout=200ms
#   --cluster.probe-interval=500ms
#   Gossip 优化
#   --cluster.gossip-interval=100ms
#   --cluster.pushpull-interval=30s
#   重连优化
#   --cluster.reconnect-interval=5s
#   --cluster.reconnect-timeout=2h
# (memberlist 的 gossip-nodes、gossip-to-dead-time 等内部参数未以标志形式暴露)
# 路由优化
route:
receiver: 'default'
group_by: ['cluster', 'service', 'alertname']
# 时间优化
group_wait: 10s # 减少等待时间
group_interval: 2m # 增加发送频率
repeat_interval: 1h # 减少重复间隔
# 并发处理
continue: false # 避免不必要的路由继续
# 接收器优化
receivers:
- name: 'default'
email_configs:
- to: 'team@example.com'
    # 邮件正文使用 html/text 字段(email_config 不支持 subject/body),主题见下方 headers
    html: '{{ template "email.default" . }}'
# SMTP 优化
smarthost: 'smtp.example.com:587'
auth_username: 'alerts@example.com'
auth_password: 'password'
# 连接优化
require_tls: true
tls_config:
insecure_skip_verify: false
# 发送优化
headers:
Subject: '[{{ .Status }}] {{ .GroupLabels.alertname }}'
X-Priority: '1'
X-Mailer: 'Alertmanager'
# 模板优化
templates:
- '/etc/alertmanager/templates/*.tmpl'
# 抑制规则优化
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['instance', 'job']
- source_match:
alertname: 'NodeDown'
target_match_re:
alertname: 'Node.*'
equal: ['instance']
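调优前后可以直接向 Prometheus 查询通知延迟分位数来对比效果。下面的脚本通过 Prometheus HTTP API 执行一条 PromQL(Prometheus 地址为假设值):
# latency_check.py —— 查询 Alertmanager 通知延迟 95 分位(示意,Prometheus 地址为假设值)
import requests
PROMETHEUS = "http://prometheus.example.com:9090"
QUERY = ('histogram_quantile(0.95, '
         'rate(alertmanager_notification_latency_seconds_bucket{job="alertmanager"}[5m]))')
resp = requests.get(f"{PROMETHEUS}/api/v1/query", params={"query": QUERY}, timeout=10)
resp.raise_for_status()
for result in resp.json()["data"]["result"]:
    instance = result["metric"].get("instance", "unknown")
    print(f"{instance}: p95 notification latency = {float(result['value'][1]):.3f}s")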
监控指标配置
# prometheus-alertmanager-monitoring.yml
groups:
- name: alertmanager-cluster
rules:
# 集群健康监控
- alert: AlertmanagerClusterDown
expr: up{job="alertmanager"} == 0
for: 1m
labels:
severity: critical
service: alertmanager
annotations:
summary: "Alertmanager instance is down"
description: "Alertmanager instance {{ $labels.instance }} has been down for more than 1 minute."
# 集群成员监控
- alert: AlertmanagerClusterMembersMismatch
expr: |
count by (job) (up{job="alertmanager"} == 1) !=
count by (job) (alertmanager_cluster_members{job="alertmanager"})
for: 5m
labels:
severity: warning
service: alertmanager
annotations:
summary: "Alertmanager cluster members mismatch"
description: "The number of running Alertmanager instances does not match the expected cluster size."
# 配置同步监控
- alert: AlertmanagerConfigInconsistent
expr: |
count by (job) (
count by (job, config_hash) (alertmanager_config_hash{job="alertmanager"})
) > 1
for: 5m
labels:
severity: warning
service: alertmanager
annotations:
summary: "Alertmanager configuration inconsistent"
description: "Alertmanager instances have different configuration hashes."
# 告警处理延迟
- alert: AlertmanagerHighLatency
expr: |
histogram_quantile(0.95,
rate(alertmanager_notification_latency_seconds_bucket{job="alertmanager"}[5m])
) > 10
for: 5m
labels:
severity: warning
service: alertmanager
annotations:
summary: "Alertmanager high notification latency"
description: "95th percentile notification latency is {{ $value }}s."
# 通知失败率
- alert: AlertmanagerNotificationFailureRate
expr: |
rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m]) /
rate(alertmanager_notifications_total{job="alertmanager"}[5m]) > 0.1
for: 5m
labels:
severity: warning
service: alertmanager
annotations:
summary: "High notification failure rate"
description: "Notification failure rate is {{ $value | humanizePercentage }}."
# 内存使用监控
- alert: AlertmanagerHighMemoryUsage
expr: |
          process_resident_memory_bytes{job="alertmanager"} > 1.5e+09
for: 10m
labels:
severity: warning
service: alertmanager
annotations:
summary: "Alertmanager high memory usage"
description: "Alertmanager instance {{ $labels.instance }} is using {{ $value | humanizePercentage }} of available memory."
# 磁盘使用监控
- alert: AlertmanagerHighDiskUsage
expr: |
(node_filesystem_size_bytes{mountpoint="/var/lib/alertmanager"} -
node_filesystem_free_bytes{mountpoint="/var/lib/alertmanager"}) /
node_filesystem_size_bytes{mountpoint="/var/lib/alertmanager"} > 0.85
for: 5m
labels:
severity: warning
service: alertmanager
annotations:
summary: "Alertmanager high disk usage"
description: "Alertmanager data directory is {{ $value | humanizePercentage }} full."
性能调优脚本
#!/bin/bash
# performance-tuning.sh
# 系统级优化
optimize_system() {
echo "Optimizing system for Alertmanager cluster..."
# 网络优化
echo 'net.core.somaxconn = 65535' >> /etc/sysctl.conf
echo 'net.core.netdev_max_backlog = 5000' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_max_syn_backlog = 65535' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_keepalive_time = 600' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_keepalive_intvl = 60' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_keepalive_probes = 3' >> /etc/sysctl.conf
# 文件描述符限制
echo '* soft nofile 65535' >> /etc/security/limits.conf
echo '* hard nofile 65535' >> /etc/security/limits.conf
# 应用设置
sysctl -p
echo "System optimization completed"
}
# 容器优化
optimize_container() {
echo "Optimizing container for Alertmanager..."
# Docker 资源限制
cat > /etc/docker/daemon.json << EOF
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3"
},
"default-ulimits": {
"nofile": {
"Name": "nofile",
"Hard": 65535,
"Soft": 65535
}
}
}
EOF
systemctl restart docker
echo "Container optimization completed"
}
# Kubernetes 优化
optimize_kubernetes() {
echo "Applying Kubernetes optimizations..."
# 资源配额
kubectl apply -f - << EOF
apiVersion: v1
kind: ResourceQuota
metadata:
name: alertmanager-quota
namespace: monitoring
spec:
hard:
requests.cpu: "2"
requests.memory: 4Gi
limits.cpu: "4"
limits.memory: 8Gi
persistentvolumeclaims: "3"
EOF
# 网络策略
kubectl apply -f - << EOF
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: alertmanager-netpol
namespace: monitoring
spec:
podSelector:
matchLabels:
app: alertmanager
policyTypes:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
name: monitoring
- namespaceSelector:
matchLabels:
name: default
ports:
- protocol: TCP
port: 9093
- protocol: TCP
port: 9094
egress:
- {}
EOF
echo "Kubernetes optimization completed"
}
# 监控优化
optimize_monitoring() {
echo "Setting up performance monitoring..."
# Grafana 仪表板配置
cat > alertmanager-performance-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Alertmanager Cluster Performance",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(alertmanager_http_requests_total[5m])",
"legendFormat": "{{ instance }} - {{ method }}"
}
]
},
{
"title": "Response Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(alertmanager_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
}
]
},
{
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"alertmanager\"}",
"legendFormat": "{{ instance }}"
}
]
},
{
"title": "Cluster Members",
"type": "stat",
"targets": [
{
"expr": "alertmanager_cluster_members",
"legendFormat": "Members"
}
]
}
]
}
}
EOF
echo "Performance monitoring setup completed"
}
# 主函数
main() {
case "$1" in
system)
optimize_system
;;
container)
optimize_container
;;
kubernetes)
optimize_kubernetes
;;
monitoring)
optimize_monitoring
;;
all)
optimize_system
optimize_container
optimize_kubernetes
optimize_monitoring
;;
*)
echo "Usage: $0 {system|container|kubernetes|monitoring|all}"
exit 1
;;
esac
}
main "$@"
7.7 故障排除指南
常见问题诊断
#!/bin/bash
# cluster-diagnostics.sh
# 配置
ALERTMANAGER_NODES=(
"alertmanager-1.example.com:9093"
"alertmanager-2.example.com:9093"
"alertmanager-3.example.com:9093"
)
LOG_FILE="/var/log/alertmanager-diagnostics.log"
# 日志函数
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a "$LOG_FILE"
}
# 检查网络连通性
check_network_connectivity() {
log "=== 网络连通性检查 ==="
for node in "${ALERTMANAGER_NODES[@]}"; do
host=$(echo $node | cut -d: -f1)
port=$(echo $node | cut -d: -f2)
log "检查 $node..."
# TCP 连接测试
if timeout 5 bash -c "</dev/tcp/$host/$port"; then
log "✅ $node TCP 连接正常"
else
log "❌ $node TCP 连接失败"
fi
# HTTP 健康检查
if curl -s --max-time 5 "http://$node/-/healthy" > /dev/null; then
log "✅ $node HTTP 健康检查通过"
else
log "❌ $node HTTP 健康检查失败"
fi
# 集群端口检查
cluster_port=$((port + 1))
if timeout 5 bash -c "</dev/tcp/$host/$cluster_port"; then
log "✅ $node 集群端口 $cluster_port 连接正常"
else
log "❌ $node 集群端口 $cluster_port 连接失败"
fi
done
}
# 检查集群状态
check_cluster_status() {
log "=== 集群状态检查 ==="
for node in "${ALERTMANAGER_NODES[@]}"; do
log "检查节点 $node 的集群状态..."
# 获取集群成员
    # v1 API 没有 /api/v1/cluster/peers 端点,成员信息在 /api/v1/status 的 data.cluster.peers 中
    members=$(curl -s "http://$node/api/v1/status" | jq -r '.data.cluster.peers[].name' 2>/dev/null)
if [ $? -eq 0 ]; then
log "集群成员: $members"
else
log "❌ 无法获取集群成员信息"
fi
# 获取集群状态
    status=$(curl -s "http://$node/api/v1/status" | jq -r '.data.cluster.status' 2>/dev/null)
if [ $? -eq 0 ]; then
log "集群状态: $status"
else
log "❌ 无法获取集群状态"
fi
done
}
# 检查配置一致性
check_config_consistency() {
log "=== 配置一致性检查 ==="
declare -A config_hashes
for node in "${ALERTMANAGER_NODES[@]}"; do
log "检查节点 $node 的配置..."
# 获取配置哈希
    hash=$(curl -s "http://$node/api/v1/status" | jq -r '.data.config.original' 2>/dev/null | md5sum | cut -d' ' -f1)
if [ $? -eq 0 ] && [ -n "$hash" ]; then
config_hashes["$node"]="$hash"
log "配置哈希: $hash"
else
log "❌ 无法获取配置哈希"
fi
done
# 检查哈希一致性
unique_hashes=$(printf '%s\n' "${config_hashes[@]}" | sort -u | wc -l)
if [ "$unique_hashes" -eq 1 ]; then
log "✅ 所有节点配置一致"
else
log "❌ 节点配置不一致"
for node in "${!config_hashes[@]}"; do
log " $node: ${config_hashes[$node]}"
done
fi
}
# 检查资源使用
check_resource_usage() {
log "=== 资源使用检查 ==="
for node in "${ALERTMANAGER_NODES[@]}"; do
log "检查节点 $node 的资源使用..."
# 内存使用
        # 用 ^ 锚定指标名,跳过 # HELP / # TYPE 注释行
        memory=$(curl -s "http://$node/metrics" | grep '^process_resident_memory_bytes' | awk '{print $2}')
        if [ -n "$memory" ]; then
            # 指标值可能是科学计数法,用 awk 换算为 MB
            memory_mb=$(awk -v b="$memory" 'BEGIN {printf "%.0f", b / 1024 / 1024}')
            log "内存使用: ${memory_mb}MB"
        fi
        # CPU 使用
        cpu=$(curl -s "http://$node/metrics" | grep '^process_cpu_seconds_total' | awk '{print $2}')
        if [ -n "$cpu" ]; then
            log "CPU 时间: ${cpu}s"
        fi
        # 文件描述符
        fds=$(curl -s "http://$node/metrics" | grep '^process_open_fds' | awk '{print $2}')
        if [ -n "$fds" ]; then
            log "打开的文件描述符: $fds"
        fi
done
}
# 检查告警处理
check_alert_processing() {
log "=== 告警处理检查 ==="
for node in "${ALERTMANAGER_NODES[@]}"; do
log "检查节点 $node 的告警处理..."
# 活跃告警数量
alerts=$(curl -s "http://$node/api/v1/alerts" | jq '.data | length' 2>/dev/null)
if [ $? -eq 0 ]; then
log "活跃告警数量: $alerts"
fi
# 静默规则数量
silences=$(curl -s "http://$node/api/v1/silences" | jq '.data | length' 2>/dev/null)
if [ $? -eq 0 ]; then
log "静默规则数量: $silences"
fi
# 通知统计
        # 指标按 integration 标签拆分为多条,需要先累加;用 ^ 锚定跳过 HELP/TYPE 行
        notifications_total=$(curl -s "http://$node/metrics" | grep '^alertmanager_notifications_total' | awk '{sum += $2} END {print sum}')
        notifications_failed=$(curl -s "http://$node/metrics" | grep '^alertmanager_notifications_failed_total' | awk '{sum += $2} END {print sum}')
        if [ -n "$notifications_total" ] && [ "$notifications_total" != "0" ]; then
            success_rate=$(echo "scale=2; ($notifications_total - ${notifications_failed:-0}) * 100 / $notifications_total" | bc 2>/dev/null)
            log "通知成功率: ${success_rate}%"
        fi
done
}
# 检查日志错误
check_logs() {
log "=== 日志错误检查 ==="
# 检查系统日志中的错误
if command -v journalctl > /dev/null; then
log "检查 systemd 日志..."
error_count=$(journalctl -u alertmanager --since "1 hour ago" --no-pager | grep -i error | wc -l)
log "最近1小时错误日志数量: $error_count"
if [ "$error_count" -gt 0 ]; then
log "最近的错误日志:"
journalctl -u alertmanager --since "1 hour ago" --no-pager | grep -i error | tail -5 | while read line; do
log " $line"
done
fi
fi
# 检查容器日志
if command -v docker > /dev/null; then
log "检查 Docker 容器日志..."
for container in $(docker ps --filter "name=alertmanager" --format "{{.Names}}"); do
log "检查容器 $container..."
error_count=$(docker logs "$container" --since 1h 2>&1 | grep -i error | wc -l)
log "容器 $container 最近1小时错误数量: $error_count"
done
fi
}
# 生成诊断报告
generate_report() {
log "=== 生成诊断报告 ==="
report_file="/tmp/alertmanager-diagnostics-$(date +%Y%m%d_%H%M%S).txt"
{
echo "Alertmanager 集群诊断报告"
echo "生成时间: $(date)"
echo "=============================="
echo
cat "$LOG_FILE"
} > "$report_file"
log "诊断报告已生成: $report_file"
}
# 主函数
main() {
log "开始 Alertmanager 集群诊断..."
check_network_connectivity
check_cluster_status
check_config_consistency
check_resource_usage
check_alert_processing
check_logs
generate_report
log "诊断完成"
}
# 执行诊断
main "$@"
7.8 本章小结
核心概念回顾
高可用架构
- 多节点集群部署
- 数据同步与一致性
- 故障检测与自动恢复
- 负载均衡与流量分发
集群通信机制
- Gossip 协议实现
- 节点发现与维护
- 状态同步策略
- 网络分区处理
部署模式选择
- Docker Compose 集群
- Kubernetes StatefulSet
- 传统虚拟机部署
- 混合云架构
技术要点总结
| 技术领域 | 关键技术 | 实现要点 |
| --- | --- | --- |
| 集群架构 | Gossip 协议 | 节点发现、状态同步、故障检测 |
| 数据一致性 | 最终一致性 | 告警状态、静默规则、配置同步 |
| 负载均衡 | 多层负载均衡 | DNS、网关、应用层分发 |
| 故障恢复 | 自动化运维 | 健康检查、故障转移、数据恢复 |
| 性能优化 | 多维度调优 | 网络、存储、应用、监控 |
最佳实践要点
架构设计
- 奇数节点部署(3、5、7)
- 跨可用区分布
- 网络隔离与安全
- 资源预留与扩展
运维管理
- 自动化部署
- 持续监控
- 定期备份
- 故障演练
性能优化
- 系统级调优
- 应用级优化
- 网络优化
- 存储优化
运维价值
- 可用性提升: 99.9% 以上服务可用性
- 故障恢复: 秒级故障检测,分钟级自动恢复
- 扩展能力: 支持水平扩展和垂直扩展
- 运维效率: 自动化运维,减少人工干预
下一章预告: 第八章将深入探讨 Alertmanager 的监控与运维管理,包括指标收集、性能监控、日志分析、容量规划等内容,帮助您建立完善的运维体系。