2.1 系统要求

硬件要求

最小配置

  • CPU:1 核心
  • 内存:512MB
  • 磁盘:1GB 可用空间
  • 网络:100Mbps

推荐配置

  • CPU:2+ 核心
  • 内存:2GB+
  • 磁盘:10GB+ SSD
  • 网络:1Gbps

生产环境配置

  • CPU:4+ 核心
  • 内存:8GB+
  • 磁盘:50GB+ SSD(RAID 1)
  • 网络:10Gbps

软件要求

操作系统支持

  • Linux:Ubuntu 18.04+, CentOS 7+, RHEL 7+
  • Windows:Windows Server 2016+
  • macOS:macOS 10.14+

依赖软件

  • Go:1.17+(源码编译)
  • Docker:20.10+(容器部署)
  • Kubernetes:1.20+(K8s 部署)

网络要求

端口配置

  • 9093:Web UI 和 API 端口
  • 9094:集群通信端口(Gossip 协议,需同时放行 TCP 和 UDP)
  • 9095:额外的集群端口(可选)

防火墙规则

# Ubuntu/Debian (ufw)
# 9093 serves the Web UI and HTTP API.
sudo ufw allow 9093/tcp
# 9094 carries cluster gossip, which uses both TCP and UDP; opening only TCP
# leaves peers unable to exchange gossip packets.
sudo ufw allow 9094/tcp
sudo ufw allow 9094/udp

# CentOS/RHEL (firewalld)
sudo firewall-cmd --permanent --add-port=9093/tcp
sudo firewall-cmd --permanent --add-port=9094/tcp
sudo firewall-cmd --permanent --add-port=9094/udp
# --permanent rules only take effect after a reload.
sudo firewall-cmd --reload

2.2 二进制安装

下载安装包

# Release version and target platform
VERSION="0.25.0"
ARCH="linux-amd64"
TARBALL="alertmanager-${VERSION}.${ARCH}.tar.gz"
BASE_URL="https://github.com/prometheus/alertmanager/releases/download/v${VERSION}"

# Download the release archive together with the published checksum list
wget "${BASE_URL}/${TARBALL}"
wget "${BASE_URL}/sha256sums.txt"

# Verify the archive before unpacking; do not continue if this reports FAILED.
# grep selects only the line for our tarball so other artifacts listed in
# sha256sums.txt are not reported as missing.
grep "${TARBALL}" sha256sums.txt | sha256sum --check -

# Unpack the archive
tar xzf "${TARBALL}"
cd "alertmanager-${VERSION}.${ARCH}"

# List the extracted files
ls -la
# alertmanager  amtool  alertmanager.yml  LICENSE  NOTICE

安装配置

# Create a dedicated system account (no home directory, no login shell) and
# the configuration / data directories it owns
sudo useradd --system --no-create-home --shell /bin/false alertmanager
sudo mkdir -p /etc/alertmanager /var/lib/alertmanager
sudo chown alertmanager:alertmanager /etc/alertmanager /var/lib/alertmanager

# Install the binaries owned by root with mode 0755: the service account only
# needs to execute them, and must not be able to overwrite its own executable.
sudo install -o root -g root -m 0755 alertmanager amtool /usr/local/bin/

# Install the configuration file and hand it to the service account
sudo cp alertmanager.yml /etc/alertmanager/
sudo chown alertmanager:alertmanager /etc/alertmanager/alertmanager.yml

创建系统服务

# Write the systemd unit for Alertmanager.
# The heredoc delimiter is unquoted, so the shell joins the backslash-continued
# ExecStart lines into a single line before the file is written.
cat <<EOF | sudo tee /etc/systemd/system/alertmanager.service > /dev/null
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/usr/local/bin/alertmanager \
    --config.file=/etc/alertmanager/alertmanager.yml \
    --storage.path=/var/lib/alertmanager/ \
    --web.external-url=http://localhost:9093
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
EOF

# Make systemd pick up the new unit file
sudo systemctl daemon-reload

# Enable the service at boot and start it right away
sudo systemctl enable --now alertmanager

# Show the current service state
sudo systemctl status alertmanager

验证安装

# Check the process. pgrep never lists itself, whereas `ps aux | grep` always
# matches its own grep and so "succeeds" even when Alertmanager is not running.
pgrep -a alertmanager

# Check that the web port is listening
ss -tlnp | grep 9093

# Query the HTTP API. v2 is used because the v1 API is deprecated and no longer
# served by newer Alertmanager releases.
curl http://localhost:9093/api/v2/status

# Web UI
# Open in a browser: http://localhost:9093

2.3 Docker 部署

基础 Docker 部署

# Create the configuration and data directories (writing under /opt needs root)
sudo mkdir -p /opt/alertmanager/config /opt/alertmanager/data

# The official image runs as the unprivileged "nobody" user (UID/GID 65534).
# A root-owned bind mount would make writes to the storage path fail, so hand
# the data directory to that UID.
sudo chown -R 65534:65534 /opt/alertmanager/data

# Write a minimal configuration file
sudo tee /opt/alertmanager/config/alertmanager.yml > /dev/null <<EOF
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@example.com'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://127.0.0.1:5001/'
EOF

# Run the Alertmanager container. The image tag is pinned to the same release
# used throughout this guide instead of the moving "latest" tag, so the
# commands and API paths shown here keep matching the running version.
docker run -d \
  --name alertmanager \
  --restart unless-stopped \
  -p 9093:9093 \
  -v /opt/alertmanager/config:/etc/alertmanager \
  -v /opt/alertmanager/data:/alertmanager \
  prom/alertmanager:v0.25.0 \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/alertmanager \
  --web.external-url=http://localhost:9093

Docker Compose 部署

# docker-compose.yml
# One Alertmanager container with a named data volume, its own bridge network
# and an HTTP health check against /-/healthy.
version: "3.8"

networks:
  monitoring:
    driver: bridge

volumes:
  alertmanager_data:
    driver: local

services:
  alertmanager:
    container_name: alertmanager
    image: prom/alertmanager:v0.25.0
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - "9093:9093"
    volumes:
      # Configuration file and templates are mounted read-only.
      - "./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
      - "./templates:/etc/alertmanager/templates:ro"
      # Named volume mounted at the directory passed to --storage.path.
      - "alertmanager_data:/alertmanager"
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
      - "--web.external-url=http://localhost:9093"
      - "--web.route-prefix=/"
      - "--log.level=info"
    healthcheck:
      test:
        - CMD
        - wget
        - "--quiet"
        - "--tries=1"
        - "--spider"
        - "http://localhost:9093/-/healthy"
      start_period: 30s
      interval: 30s
      timeout: 10s
      retries: 3

完整监控栈部署

# monitoring-stack.yml
# Prometheus + Alertmanager + Grafana + node-exporter on one bridge network.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:v2.40.0
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=15d'
      - '--web.enable-lifecycle'
      # There is deliberately no --alertmanager.url flag: it was removed in
      # Prometheus 2.0 and makes a 2.x server refuse to start. Point Prometheus
      # at Alertmanager in prometheus.yml instead:
      #   alerting:
      #     alertmanagers:
      #       - static_configs:
      #           - targets: ['alertmanager:9093']
    networks:
      - monitoring
    depends_on:
      - alertmanager

  alertmanager:
    image: prom/alertmanager:v0.25.0
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - ./alertmanager/templates:/etc/alertmanager/templates:ro
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:9.3.0
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    environment:
      # Demo default only; export GRAFANA_ADMIN_PASSWORD (or set it in .env)
      # to override it for anything that is not a throwaway environment.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin123}
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - monitoring
    depends_on:
      - prometheus

  node-exporter:
    image: prom/node-exporter:v1.5.0
    container_name: node-exporter
    restart: unless-stopped
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Tell the exporter where the host root filesystem is mounted; without
      # this the /rootfs bind mount above is never used.
      - '--path.rootfs=/rootfs'
      # "$$" is Compose's escape for a literal "$" in the regular expression.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - monitoring

volumes:
  prometheus_data:
  alertmanager_data:
  grafana_data:

networks:
  monitoring:
    driver: bridge

启动和管理

# Wrapper so every command targets the same compose file
dc() { docker-compose -f monitoring-stack.yml "$@"; }

# Start the stack in the background
dc up -d

# Show service status
dc ps

# Show Alertmanager logs
dc logs alertmanager

# Restart Alertmanager
dc restart alertmanager

# Stop and remove the stack
dc down

# Reload configuration after editing it: send SIGHUP to PID 1 in the container
dc exec alertmanager kill -HUP 1

2.4 Kubernetes 部署

命名空间和 RBAC

# namespace.yaml
# Namespace, ServiceAccount and cluster-wide read-only RBAC for Alertmanager.
kind: Namespace
apiVersion: v1
metadata:
  name: monitoring
---
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: monitoring
  name: alertmanager
---
# NOTE(review): this grants read access to nodes/services/endpoints/pods in
# every namespace; confirm Alertmanager actually needs it before keeping it.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: alertmanager
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
---
# Binds the ClusterRole above to the ServiceAccount in the monitoring namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: alertmanager
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: alertmanager
roleRef:
  kind: ClusterRole
  name: alertmanager
  apiGroup: rbac.authorization.k8s.io

ConfigMap 配置

# configmap.yaml
# Alertmanager configuration plus a notification template. Each key under
# "data" becomes a file in the directory where this ConfigMap is mounted.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@example.com'
      smtp_auth_username: 'alerts@example.com'
      # Placeholder only; never commit a real credential in a ConfigMap.
      smtp_auth_password: 'app-password'
      resolve_timeout: 5m

    # notification.tmpl (below) is a key of this same ConfigMap, so it is
    # mounted next to alertmanager.yml. The glob must therefore match
    # /etc/alertmanager itself, not a templates/ subdirectory.
    templates:
      - '/etc/alertmanager/*.tmpl'

    route:
      group_by: ['alertname', 'cluster', 'service']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'default'
      routes:
      - match:
          severity: critical
        receiver: 'critical-alerts'
        group_wait: 5s
        repeat_interval: 30m
      - match:
          severity: warning
        receiver: 'warning-alerts'
        repeat_interval: 2h
      - match_re:
          service: '^(web|api|database).*'
        receiver: 'app-team'

    # email_configs has no "subject" or "body" keys; unknown keys make
    # Alertmanager reject the whole file. The subject is set through the
    # Subject header and the plain-text body through "text". "html" is
    # emptied so the default HTML part does not hide the custom text body.
    receivers:
    - name: 'default'
      email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: '[ALERT] {{ .GroupLabels.alertname }}'
        html: ''
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
          {{ end }}

    - name: 'critical-alerts'
      email_configs:
      - to: 'oncall@example.com'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
        html: ''
        text: |
          CRITICAL ALERT!

          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
      slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#critical-alerts'
        title: 'Critical Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

    - name: 'warning-alerts'
      email_configs:
      - to: 'team@example.com'
        headers:
          Subject: '[WARNING] {{ .GroupLabels.alertname }}'
        html: ''
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
          {{ end }}

    - name: 'app-team'
      email_configs:
      - to: 'app-team@example.com'
        headers:
          Subject: '[APP] {{ .GroupLabels.alertname }}'
        html: ''
        text: |
          Application Alert:
          {{ range .Alerts }}
          Service: {{ .Labels.service }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}

    # Suppress a warning while a critical alert with the same alertname and
    # instance is firing.
    inhibit_rules:
    - source_match:
        severity: 'critical'
      target_match:
        severity: 'warning'
      equal: ['alertname', 'instance']

  notification.tmpl: |
    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*?)\\:.*" "$1" }}{{ end }}

    {{ define "slack.default.title" }}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .GroupLabels) 0 }}({{ template "cluster" . }}){{ end }}
    {{ end }}

    {{ define "slack.default.text" }}
    {{ range .Alerts }}
    {{ if .Annotations.summary }}{{ .Annotations.summary }}{{ end }}
    {{ if .Annotations.description }}{{ .Annotations.description }}{{ end }}
    {{ end }}
    {{ end }}

Deployment 部署

# deployment.yaml
# Single-replica Alertmanager Deployment. Configuration comes from the
# alertmanager-config ConfigMap; runtime state lives in an emptyDir volume.
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: alertmanager
  labels:
    app: alertmanager
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: alertmanager
      volumes:
        - name: config-volume
          configMap:
            name: alertmanager-config
        - name: storage-volume
          emptyDir: {}
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.25.0
          args:
            - "--config.file=/etc/alertmanager/alertmanager.yml"
            - "--storage.path=/alertmanager"
            - "--web.external-url=http://alertmanager.monitoring.svc.cluster.local:9093"
            - "--web.route-prefix=/"
            - "--log.level=info"
          ports:
            - name: web
              containerPort: 9093
            - name: cluster
              containerPort: 9094
          volumeMounts:
            # ConfigMap keys appear as files under /etc/alertmanager.
            - name: config-volume
              mountPath: /etc/alertmanager
            # Matches the directory passed to --storage.path.
            - name: storage-volume
              mountPath: /alertmanager
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "200m"
              memory: "512Mi"
          readinessProbe:
            httpGet:
              port: 9093
              path: /-/ready
            initialDelaySeconds: 5
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              port: 9093
              path: /-/healthy
            initialDelaySeconds: 30
            timeoutSeconds: 10

Service 和 Ingress

# service.yaml
# ClusterIP Service in front of the Alertmanager pods, plus an Ingress that
# exposes the web port over TLS.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  selector:
    app: alertmanager
  ports:
  - name: web
    port: 9093
    targetPort: 9093
    protocol: TCP
  - name: cluster
    port: 9094
    targetPort: 9094
    protocol: TCP
  type: ClusterIP
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  namespace: monitoring
  annotations:
    # No rewrite-target annotation: with path "/" it would rewrite every
    # request (static assets, /api/..., /-/healthy) to "/", breaking the UI
    # and the API. Alertmanager is served from the root, so paths pass through
    # unchanged.
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  # The annotations above are ingress-nginx specific, so select that
  # controller explicitly. NOTE(review): adjust if your IngressClass is named
  # differently.
  ingressClassName: nginx
  tls:
  - hosts:
    - alertmanager.example.com
    secretName: alertmanager-tls
  rules:
  - host: alertmanager.example.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager
            port:
              number: 9093

部署和验证

# Apply the manifests in dependency order (namespace first)
for manifest in namespace configmap deployment service; do
  kubectl apply -f "${manifest}.yaml"
done

# Check what was created
kubectl -n monitoring get pods
kubectl -n monitoring get svc
kubectl -n monitoring get ingress

# Follow the Alertmanager logs
kubectl -n monitoring logs -f deployment/alertmanager

# Forward the web port to this machine for a quick test
kubectl -n monitoring port-forward svc/alertmanager 9093:9093

# Web UI
# Open in a browser: http://localhost:9093

2.5 高可用集群部署

集群架构设计

graph TB
    subgraph "Load Balancer"
        USERS[Users / amtool]
        LB[HAProxy/Nginx]
    end

    subgraph "Alertmanager Cluster"
        AM1[Alertmanager-1<br/>:9093]
        AM2[Alertmanager-2<br/>:9093]
        AM3[Alertmanager-3<br/>:9093]
    end

    subgraph "Prometheus Instances"
        P1[Prometheus-1]
        P2[Prometheus-2]
    end

    subgraph "Notification Channels"
        EMAIL[Email]
        SLACK[Slack]
        WEBHOOK[Webhook]
    end

    %% Each Prometheus sends every alert to ALL Alertmanager instances directly.
    %% Alert traffic must not go through a load balancer, otherwise instances
    %% see different alerts and deduplication across the cluster breaks.
    P1 --> AM1
    P1 --> AM2
    P1 --> AM3
    P2 --> AM1
    P2 --> AM2
    P2 --> AM3

    %% The load balancer only fronts the Web UI / API for human and tool access.
    USERS --> LB
    LB --> AM1
    LB --> AM2
    LB --> AM3

    AM1 -.->|Gossip| AM2
    AM2 -.->|Gossip| AM3
    AM3 -.->|Gossip| AM1

    AM1 --> EMAIL
    AM2 --> SLACK
    AM3 --> WEBHOOK

Docker Swarm 集群部署

# alertmanager-cluster.yml
# Swarm stack: three Alertmanager replicas that discover each other through
# the tasks.alertmanager DNS name, plus two nginx replicas publishing port 9093.
version: "3.8"

networks:
  monitoring:
    external: true

volumes:
  alertmanager-config:
    external: true
  alertmanager-nginx-config:
    external: true
  alertmanager-data:

services:
  alertmanager:
    image: prom/alertmanager:v0.25.0
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
      - "--web.external-url=http://alertmanager:9093"
      - "--cluster.listen-address=0.0.0.0:9094"
      - "--cluster.peer=tasks.alertmanager:9094"
      - "--cluster.reconnect-timeout=10m"
      - "--cluster.gossip-interval=200ms"
      - "--cluster.pushpull-interval=60s"
    volumes:
      - "alertmanager-config:/etc/alertmanager:ro"
      - "alertmanager-data:/alertmanager"
    networks:
      - monitoring
    healthcheck:
      test:
        - CMD
        - wget
        - "--quiet"
        - "--tries=1"
        - "--spider"
        - "http://localhost:9093/-/healthy"
      start_period: 30s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      replicas: 3
      placement:
        # At most one replica per worker node.
        max_replicas_per_node: 1
        constraints:
          - "node.role == worker"
      resources:
        reservations:
          cpus: "0.25"
          memory: 256M
        limits:
          cpus: "0.5"
          memory: 512M
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      update_config:
        # Replace one task at a time, stopping the old task before starting
        # the new one, and roll back if an update fails.
        order: stop-first
        parallelism: 1
        delay: 10s
        failure_action: rollback

  alertmanager-lb:
    image: nginx:alpine
    ports:
      - "9093:80"
    volumes:
      - "alertmanager-nginx-config:/etc/nginx/conf.d:ro"
    networks:
      - monitoring
    depends_on:
      - alertmanager
    deploy:
      replicas: 2
      placement:
        constraints:
          - "node.role == manager"
      resources:
        reservations:
          cpus: "0.1"
          memory: 64M
        limits:
          cpus: "0.25"
          memory: 128M

Kubernetes StatefulSet 部署

# statefulset.yaml
# Three-replica Alertmanager StatefulSet. Each pod lists all three stable pod
# DNS names (provided by the headless Service below) as cluster peers and gets
# its own PersistentVolumeClaim for /alertmanager.
kind: StatefulSet
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: alertmanager
spec:
  replicas: 3
  serviceName: alertmanager-headless
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: alertmanager
      volumes:
        - name: config-volume
          configMap:
            name: alertmanager-config
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.25.0
          args:
            - "--config.file=/etc/alertmanager/alertmanager.yml"
            - "--storage.path=/alertmanager"
            - "--web.external-url=http://alertmanager.monitoring.svc.cluster.local:9093"
            - "--cluster.listen-address=0.0.0.0:9094"
            - "--cluster.peer=alertmanager-0.alertmanager-headless.monitoring.svc.cluster.local:9094"
            - "--cluster.peer=alertmanager-1.alertmanager-headless.monitoring.svc.cluster.local:9094"
            - "--cluster.peer=alertmanager-2.alertmanager-headless.monitoring.svc.cluster.local:9094"
            - "--cluster.reconnect-timeout=10m"
          ports:
            - name: web
              containerPort: 9093
            - name: cluster
              containerPort: 9094
          volumeMounts:
            - name: config-volume
              mountPath: /etc/alertmanager
            # Backed by the volumeClaimTemplates entry of the same name.
            - name: storage-volume
              mountPath: /alertmanager
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "200m"
              memory: "512Mi"
          readinessProbe:
            httpGet:
              port: 9093
              path: /-/ready
            initialDelaySeconds: 5
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              port: 9093
              path: /-/healthy
            initialDelaySeconds: 30
            timeoutSeconds: 10
  volumeClaimTemplates:
    - metadata:
        name: storage-volume
      spec:
        storageClassName: "fast-ssd"
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
---
# Headless Service (clusterIP: None) referenced by serviceName above.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: alertmanager-headless
spec:
  clusterIP: None
  selector:
    app: alertmanager
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    - name: cluster
      port: 9094
      targetPort: 9094

负载均衡配置

# nginx.conf
# Reverse proxy in front of the three StatefulSet pods.

# Backend pool; new requests go to the server with the fewest active
# connections. A server is taken out for 30s after 3 failed attempts.
upstream alertmanager {
    least_conn;
    server alertmanager-0.alertmanager-headless.monitoring.svc.cluster.local:9093 max_fails=3 fail_timeout=30s;
    server alertmanager-1.alertmanager-headless.monitoring.svc.cluster.local:9093 max_fails=3 fail_timeout=30s;
    server alertmanager-2.alertmanager-headless.monitoring.svc.cluster.local:9093 max_fails=3 fail_timeout=30s;
}

server {
    server_name alertmanager.example.com;
    listen 80;

    # Health endpoint: proxied like everything else, but kept out of the
    # access log.
    location /-/healthy {
        proxy_pass http://alertmanager;
        access_log off;
    }

    location / {
        proxy_pass http://alertmanager;

        # Forward the original request details to the backend.
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts towards the backend.
        proxy_connect_timeout 5s;
        proxy_send_timeout 10s;
        proxy_read_timeout 10s;

        # Failover: on these conditions the request is passed to the next
        # server in the pool.
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
    }
}

2.6 配置验证和测试

配置文件验证

# Configuration file under test
CONFIG_FILE=/etc/alertmanager/alertmanager.yml

# Validate the configuration with amtool
amtool config check "$CONFIG_FILE"

# Print the routing tree
amtool config routes show --config.file="$CONFIG_FILE"

# Show which receiver a given label set is routed to
amtool config routes test --config.file="$CONFIG_FILE" alertname=TestAlert severity=warning

API 测试

# The v2 API is used throughout: the v1 API is deprecated and no longer served
# by newer Alertmanager releases.

# Overall status
curl -s http://localhost:9093/api/v2/status | jq .

# Loaded configuration (original YAML text)
curl -s http://localhost:9093/api/v2/status | jq -r '.config.original'

# Cluster status and peers
curl -s http://localhost:9093/api/v2/status | jq '.cluster'

# Send a test alert.
# startsAt uses second precision (%S) rather than GNU-only %3N so the command
# also works with BSD/macOS date.
curl -XPOST http://localhost:9093/api/v2/alerts -H "Content-Type: application/json" -d '[
  {
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning",
      "instance": "localhost:9090"
    },
    "annotations": {
      "summary": "This is a test alert",
      "description": "This alert is for testing purposes"
    },
    "startsAt": "'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"
  }
]'

健康检查脚本

#!/bin/bash
# healthcheck.sh
#
# Runs four independent checks against a local Alertmanager (process, port,
# HTTP API, configuration file), prints one OK/ERROR line per check and exits
# with 0 only when all of them pass.

ALERTMANAGER_URL="http://localhost:9093"
ALERTMANAGER_PORT=9093
ALERTMANAGER_CONFIG="/etc/alertmanager/alertmanager.yml"
TIMEOUT=10

# Check that the alertmanager process is running.
# Returns 1 when no such process exists.
check_service() {
    # -x matches the exact process name. -f would match any command line that
    # merely contains "alertmanager" (editors, tail, or this script when it
    # lives under a path containing that word) and report a false OK.
    if ! pgrep -x alertmanager > /dev/null; then
        echo "ERROR: Alertmanager process not running"
        return 1
    fi
    echo "OK: Alertmanager process is running"
}

# Check that the web port accepts TCP connections.
# Returns 1 when the connection cannot be established within TIMEOUT seconds.
check_port() {
    # -w bounds the connection attempt so the check cannot hang.
    if ! nc -z -w "$TIMEOUT" localhost "$ALERTMANAGER_PORT"; then
        echo "ERROR: Port $ALERTMANAGER_PORT is not listening"
        return 1
    fi
    echo "OK: Port $ALERTMANAGER_PORT is listening"
}

# Check that the HTTP API answers with a valid status document.
# Returns 1 on connection/HTTP failure or when the body is not the expected
# JSON.
check_api() {
    local response

    # -f turns HTTP error statuses into a non-zero exit code. The v2 endpoint
    # is used because the v1 API is deprecated and removed in newer releases.
    if ! response=$(curl -sf --max-time "$TIMEOUT" "$ALERTMANAGER_URL/api/v2/status"); then
        echo "ERROR: Failed to connect to Alertmanager API"
        return 1
    fi

    # A healthy v2 status document carries the build version.
    if ! echo "$response" | jq -e '.versionInfo.version' > /dev/null 2>&1; then
        echo "ERROR: Alertmanager API returned error: $response"
        return 1
    fi

    echo "OK: Alertmanager API is responding"
}

# Check that the configuration file passes amtool validation.
# Returns 1 when amtool rejects it.
check_config() {
    if ! amtool config check "$ALERTMANAGER_CONFIG" > /dev/null 2>&1; then
        echo "ERROR: Configuration file is invalid"
        return 1
    fi
    echo "OK: Configuration file is valid"
}

# Run every check (a failing check does not stop the later ones) and return
# 0 when all passed, 1 otherwise.
main() {
    echo "=== Alertmanager Health Check ==="
    echo "Timestamp: $(date)"
    echo

    local exit_code=0

    check_service || exit_code=1
    check_port || exit_code=1
    check_api || exit_code=1
    check_config || exit_code=1

    echo
    if [ "$exit_code" -eq 0 ]; then
        echo "✅ All checks passed"
    else
        echo "❌ Some checks failed"
    fi

    return "$exit_code"
}

# main is the last command, so its return value becomes the script exit code.
main "$@"

2.7 性能调优

系统级优化

# Raise the file-descriptor limit for the alertmanager account.
# sudo is applied to tee, not echo: with `sudo echo ... >> file` the redirect
# would still run as the unprivileged user and fail.
# NOTE(review): limits.conf is applied through PAM sessions; the systemd
# drop-in further down is what is expected to cover the service itself.
printf '%s\n' \
  "alertmanager soft nofile 65536" \
  "alertmanager hard nofile 65536" | sudo tee -a /etc/security/limits.conf > /dev/null

# Kernel parameters go into a dedicated drop-in file so re-running this step
# overwrites the same file instead of appending duplicate lines to
# /etc/sysctl.conf.
sudo tee /etc/sysctl.d/99-alertmanager.conf > /dev/null <<EOF
net.core.somaxconn = 1024
net.ipv4.tcp_max_syn_backlog = 1024
EOF
sudo sysctl --system

# Resource limits for the systemd service
sudo mkdir -p /etc/systemd/system/alertmanager.service.d
sudo tee /etc/systemd/system/alertmanager.service.d/limits.conf > /dev/null <<EOF
[Service]
LimitNOFILE=65536
LimitNPROC=65536
EOF

# Pick up the drop-in and restart so the new limits apply
sudo systemctl daemon-reload
sudo systemctl restart alertmanager

应用级优化

# Tuned startup flags.
# --web.max-connections is a Prometheus flag, not an Alertmanager one; passing
# it makes Alertmanager exit with an unknown-flag error. The Alertmanager knob
# for web load is --web.get-concurrency (max concurrently processed GET
# requests).
/usr/local/bin/alertmanager \
    --config.file=/etc/alertmanager/alertmanager.yml \
    --storage.path=/var/lib/alertmanager/ \
    --web.external-url=http://localhost:9093 \
    --web.get-concurrency=16 \
    --cluster.tcp-timeout=10s \
    --cluster.gossip-interval=200ms \
    --cluster.pushpull-interval=60s \
    --cluster.settle-timeout=60s \
    --log.level=warn

监控指标

# Key alerting rules for monitoring Alertmanager itself.
# This is a Prometheus rule file: rules must live under groups[].rules,
# so load it through rule_files in prometheus.yml.
groups:
- name: alertmanager
  rules:
  - alert: AlertmanagerDown
    expr: up{job="alertmanager"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Alertmanager instance is down"

  - alert: AlertmanagerHighMemoryUsage
    # Resident memory in MiB above 512.
    expr: process_resident_memory_bytes{job="alertmanager"} / 1024 / 1024 > 512
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Alertmanager memory usage is high"

  - alert: AlertmanagerConfigReloadFailed
    # The metric is a gauge that is 1 after a successful reload and 0 after a
    # failed one, so compare it directly. increase(...) == 0 would be true
    # whenever the value simply did not change, i.e. almost always.
    expr: alertmanager_config_last_reload_successful == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Alertmanager config reload failed"

本章小结

本章详细介绍了 Alertmanager 的各种安装部署方式:

部署方式对比

| 部署方式 | 适用场景 | 优点 | 缺点 |
| --- | --- | --- | --- |
| 二进制安装 | 传统环境、性能要求高 | 性能最佳、资源占用少 | 管理复杂、升级困难 |
| Docker 部署 | 开发测试、快速部署 | 部署简单、环境隔离 | 性能略低、依赖 Docker |
| Kubernetes | 云原生环境、大规模部署 | 自动化管理、高可用 | 复杂度高、学习成本大 |
| 集群部署 | 生产环境、高可用要求 | 高可用、负载分担 | 配置复杂、资源消耗大 |

最佳实践

  1. 环境选择:根据实际需求选择合适的部署方式
  2. 资源规划:合理分配 CPU、内存和存储资源
  3. 网络配置:确保端口开放和网络连通性
  4. 安全配置:使用专用用户和适当的权限
  5. 监控告警:部署监控和健康检查
  6. 备份策略:定期备份配置和数据

下一步学习

在下一章中,我们将深入学习 Alertmanager 的配置文件详解,包括:

  • 配置文件结构和语法
  • 全局配置选项
  • 路由规则配置
  • 接收器配置详解
  • 抑制规则设置


下一章: 配置文件详解