2.1 系统要求
硬件要求
最小配置
- CPU:1 核心
- 内存:512MB
- 磁盘:1GB 可用空间
- 网络:100Mbps
推荐配置
- CPU:2+ 核心
- 内存:2GB+
- 磁盘:10GB+ SSD
- 网络:1Gbps
生产环境配置
- CPU:4+ 核心
- 内存:8GB+
- 磁盘:50GB+ SSD(RAID 1)
- 网络:10Gbps
软件要求
操作系统支持
- Linux:Ubuntu 18.04+, CentOS 7+, RHEL 7+
- Windows:Windows Server 2016+
- macOS:macOS 10.14+
依赖软件
- Go:1.17+(源码编译)
- Docker:20.10+(容器部署)
- Kubernetes:1.20+(K8s 部署)
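安装前可用下列命令确认本机依赖版本(按所选部署方式检查对应项即可):
# 检查依赖版本
go version                 # 源码编译
docker --version           # 容器部署
kubectl version --client   # K8s 部署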
网络要求
端口配置
- 9093:Web UI 和 API 端口
- 9094:集群通信端口
- 9094/udp:集群 gossip 同时使用 UDP(与 TCP 同端口号,仅集群模式需要)
防火墙规则
# Ubuntu/Debian
sudo ufw allow 9093/tcp
sudo ufw allow 9094/tcp
sudo ufw allow 9094/udp
# CentOS/RHEL
sudo firewall-cmd --permanent --add-port=9093/tcp
sudo firewall-cmd --permanent --add-port=9094/tcp
sudo firewall-cmd --permanent --add-port=9094/udp
sudo firewall-cmd --reload
2.2 二进制安装
下载安装包
# 设置版本变量
VERSION="0.25.0"
ARCH="linux-amd64"
# 下载二进制包
wget https://github.com/prometheus/alertmanager/releases/download/v${VERSION}/alertmanager-${VERSION}.${ARCH}.tar.gz
# 解压安装包
tar xzf alertmanager-${VERSION}.${ARCH}.tar.gz
cd alertmanager-${VERSION}.${ARCH}
# 查看文件内容
ls -la
# alertmanager amtool alertmanager.yml LICENSE NOTICE
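另外,官方 release 会同时发布 sha256sums.txt,建议在下载目录中校验安装包完整性(以下命令为示例,需在包含 tar.gz 的目录执行):
# 校验安装包完整性
wget https://github.com/prometheus/alertmanager/releases/download/v${VERSION}/sha256sums.txt
sha256sum -c sha256sums.txt --ignore-missing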
安装配置
# 创建用户和目录
sudo useradd --no-create-home --shell /bin/false alertmanager
sudo mkdir -p /etc/alertmanager /var/lib/alertmanager
sudo chown alertmanager:alertmanager /etc/alertmanager /var/lib/alertmanager
# 复制二进制文件
sudo cp alertmanager amtool /usr/local/bin/
sudo chown alertmanager:alertmanager /usr/local/bin/alertmanager /usr/local/bin/amtool
sudo chmod +x /usr/local/bin/alertmanager /usr/local/bin/amtool
# 复制配置文件
sudo cp alertmanager.yml /etc/alertmanager/
sudo chown alertmanager:alertmanager /etc/alertmanager/alertmanager.yml
创建系统服务
# 创建 systemd 服务文件
sudo tee /etc/systemd/system/alertmanager.service > /dev/null <<EOF
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target
[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/usr/local/bin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/var/lib/alertmanager/ \
--web.external-url=http://localhost:9093
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
EOF
# 重新加载 systemd 配置
sudo systemctl daemon-reload
# 启用并启动服务
sudo systemctl enable alertmanager
sudo systemctl start alertmanager
# 检查服务状态
sudo systemctl status alertmanager
验证安装
# 检查进程
ps aux | grep alertmanager
# 检查端口
ss -tlnp | grep 9093
# 测试 API
curl http://localhost:9093/api/v1/status
# 访问 Web UI
# 浏览器访问:http://localhost:9093
2.3 Docker 部署
基础 Docker 部署
# 创建配置目录
mkdir -p /opt/alertmanager/config
mkdir -p /opt/alertmanager/data
# 创建基础配置文件
cat > /opt/alertmanager/config/alertmanager.yml <<EOF
global:
smtp_smarthost: 'localhost:587'
smtp_from: 'alertmanager@example.com'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://127.0.0.1:5001/'
EOF
# 运行 Alertmanager 容器
docker run -d \
--name alertmanager \
--restart unless-stopped \
-p 9093:9093 \
-v /opt/alertmanager/config:/etc/alertmanager \
-v /opt/alertmanager/data:/alertmanager \
prom/alertmanager:latest \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/alertmanager \
--web.external-url=http://localhost:9093
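容器启动后,可用下列命令确认运行状态,并访问健康检查端点(/-/healthy 正常时返回 OK):
# 确认容器状态与日志
docker ps --filter name=alertmanager
docker logs alertmanager
# 健康检查端点
curl -s http://localhost:9093/-/healthy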
Docker Compose 部署
# docker-compose.yml
version: '3.8'
services:
alertmanager:
image: prom/alertmanager:v0.25.0
container_name: alertmanager
restart: unless-stopped
ports:
- "9093:9093"
volumes:
- ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- ./templates:/etc/alertmanager/templates:ro
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://localhost:9093'
- '--web.route-prefix=/'
- '--log.level=info'
networks:
- monitoring
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
volumes:
alertmanager_data:
driver: local
networks:
monitoring:
driver: bridge
完整监控栈部署
# monitoring-stack.yml
version: '3.8'
services:
prometheus:
image: prom/prometheus:v2.40.0
container_name: prometheus
restart: unless-stopped
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./prometheus/rules:/etc/prometheus/rules:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=15d'
- '--web.enable-lifecycle'
networks:
- monitoring
depends_on:
- alertmanager
alertmanager:
image: prom/alertmanager:v0.25.0
container_name: alertmanager
restart: unless-stopped
ports:
- "9093:9093"
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- ./alertmanager/templates:/etc/alertmanager/templates:ro
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://localhost:9093'
networks:
- monitoring
grafana:
image: grafana/grafana:9.3.0
container_name: grafana
restart: unless-stopped
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning:ro
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
- GF_USERS_ALLOW_SIGN_UP=false
networks:
- monitoring
depends_on:
- prometheus
node-exporter:
image: prom/node-exporter:v1.5.0
container_name: node-exporter
restart: unless-stopped
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring
volumes:
prometheus_data:
alertmanager_data:
grafana_data:
networks:
monitoring:
driver: bridge
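需要注意,Prometheus 2.x 不支持通过命令行参数指定 Alertmanager 地址,而是在 prometheus.yml 的 alerting 段中配置。下面是与上述 compose 对应的最小示例(抓取目标仅作示意):
# ./prometheus/prometheus.yml(最小示例)
global:
  scrape_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
rule_files:
  - /etc/prometheus/rules/*.yml
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']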
启动和管理
# 启动服务
docker-compose -f monitoring-stack.yml up -d
# 查看服务状态
docker-compose -f monitoring-stack.yml ps
# 查看日志
docker-compose -f monitoring-stack.yml logs alertmanager
# 重启服务
docker-compose -f monitoring-stack.yml restart alertmanager
# 停止服务
docker-compose -f monitoring-stack.yml down
# 更新配置后重新加载
docker-compose -f monitoring-stack.yml exec alertmanager \
kill -HUP 1
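除向容器内 1 号进程发送 SIGHUP 外,也可以直接调用 Alertmanager 自带的热加载接口(默认开启),例如在宿主机上执行:
# 通过 HTTP 接口热加载配置
curl -X POST http://localhost:9093/-/reload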
2.4 Kubernetes 部署
命名空间和 RBAC
# namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: alertmanager
rules:
- apiGroups: [""]
resources: ["nodes", "services", "endpoints", "pods"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: alertmanager
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: alertmanager
subjects:
- kind: ServiceAccount
name: alertmanager
namespace: monitoring
ConfigMap 配置
# configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
namespace: monitoring
data:
alertmanager.yml: |
global:
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'app-password'
resolve_timeout: 5m
templates:
- '/etc/alertmanager/templates/*.tmpl'
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'default'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 5s
repeat_interval: 30m
- match:
severity: warning
receiver: 'warning-alerts'
repeat_interval: 2h
- match_re:
service: '^(web|api|database).*'
receiver: 'app-team'
receivers:
- name: 'default'
email_configs:
- to: 'admin@example.com'
subject: '[ALERT] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
{{ end }}
- name: 'critical-alerts'
email_configs:
- to: 'oncall@example.com'
subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
body: |
CRITICAL ALERT!
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Severity: {{ .Labels.severity }}
Instance: {{ .Labels.instance }}
Time: {{ .StartsAt }}
{{ end }}
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#critical-alerts'
title: 'Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'team@example.com'
subject: '[WARNING] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
{{ end }}
- name: 'app-team'
email_configs:
- to: 'app-team@example.com'
subject: '[APP] {{ .GroupLabels.alertname }}'
body: |
Application Alert:
{{ range .Alerts }}
Service: {{ .Labels.service }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
notification.tmpl: |
{{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*?)\\:.*" "$1" }}{{ end }}
{{ define "slack.default.title" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .GroupLabels) 0 }}({{ template "cluster" . }}){{ end }}
{{ end }}
{{ define "slack.default.text" }}
{{ range .Alerts }}
{{ if .Annotations.summary }}{{ .Annotations.summary }}{{ end }}
{{ if .Annotations.description }}{{ .Annotations.description }}{{ end }}
{{ end }}
{{ end }}
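也可以将 alertmanager.yml 与模板作为独立文件维护,先用 amtool 校验,再由 kubectl 生成 ConfigMap(文件名为示例):
# 校验配置并从文件生成 ConfigMap
amtool config check alertmanager.yml
kubectl create configmap alertmanager-config \
  --from-file=alertmanager.yml \
  --from-file=notification.tmpl \
  -n monitoring --dry-run=client -o yaml > configmap.yaml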
Deployment 部署
# deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: alertmanager
namespace: monitoring
labels:
app: alertmanager
spec:
replicas: 1
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
serviceAccountName: alertmanager
containers:
- name: alertmanager
image: prom/alertmanager:v0.25.0
ports:
- containerPort: 9093
name: web
- containerPort: 9094
name: cluster
args:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://alertmanager.monitoring.svc.cluster.local:9093'
- '--web.route-prefix=/'
- '--log.level=info'
volumeMounts:
- name: config-volume
mountPath: /etc/alertmanager
- name: storage-volume
mountPath: /alertmanager
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "200m"
livenessProbe:
httpGet:
path: /-/healthy
port: 9093
initialDelaySeconds: 30
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /-/ready
port: 9093
initialDelaySeconds: 5
timeoutSeconds: 5
volumes:
- name: config-volume
configMap:
name: alertmanager-config
- name: storage-volume
emptyDir: {}
Service 和 Ingress
# service.yaml
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: monitoring
labels:
app: alertmanager
spec:
selector:
app: alertmanager
ports:
- name: web
port: 9093
targetPort: 9093
protocol: TCP
- name: cluster
port: 9094
targetPort: 9094
protocol: TCP
type: ClusterIP
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager
namespace: monitoring
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
tls:
- hosts:
- alertmanager.example.com
secretName: alertmanager-tls
rules:
- host: alertmanager.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: alertmanager
port:
number: 9093
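Ingress 中引用的 alertmanager-tls Secret 需要提前创建,可由已有证书生成(证书文件路径为示例),也可交给 cert-manager 等工具自动签发:
# 从已有证书创建 TLS Secret
kubectl create secret tls alertmanager-tls \
  --cert=alertmanager.example.com.crt \
  --key=alertmanager.example.com.key \
  -n monitoring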
部署和验证
# 应用配置
kubectl apply -f namespace.yaml
kubectl apply -f configmap.yaml
kubectl apply -f deployment.yaml
kubectl apply -f service.yaml
# 检查部署状态
kubectl get pods -n monitoring
kubectl get svc -n monitoring
kubectl get ingress -n monitoring
# 查看日志
kubectl logs -f deployment/alertmanager -n monitoring
# 端口转发测试
kubectl port-forward svc/alertmanager 9093:9093 -n monitoring
# 访问 Web UI
# 浏览器访问:http://localhost:9093
2.5 高可用集群部署
集群架构设计
graph TB
subgraph "Load Balancer"
LB[HAProxy/Nginx]
end
subgraph "Alertmanager Cluster"
AM1[Alertmanager-1<br/>:9093]
AM2[Alertmanager-2<br/>:9093]
AM3[Alertmanager-3<br/>:9093]
end
subgraph "Prometheus Instances"
P1[Prometheus-1]
P2[Prometheus-2]
end
subgraph "Notification Channels"
EMAIL[Email]
SLACK[Slack]
WEBHOOK[Webhook]
end
P1 --> LB
P2 --> LB
LB --> AM1
LB --> AM2
LB --> AM3
AM1 -.->|Gossip| AM2
AM2 -.->|Gossip| AM3
AM3 -.->|Gossip| AM1
AM1 --> EMAIL
AM2 --> SLACK
AM3 --> WEBHOOK
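需要注意:官方建议 Prometheus 直接配置全部 Alertmanager 实例(告警去重由 Alertmanager 集群完成),不要让告警经过负载均衡器;图中的负载均衡层主要用于用户访问 Web UI。对应的 prometheus.yml 片段示例如下(实例地址为示意):
# prometheus.yml(alerting 片段,实例地址为示例)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager-1:9093'
            - 'alertmanager-2:9093'
            - 'alertmanager-3:9093'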
Docker Swarm 集群部署
# alertmanager-cluster.yml
version: '3.8'
services:
alertmanager:
image: prom/alertmanager:v0.25.0
networks:
- monitoring
volumes:
- alertmanager-config:/etc/alertmanager:ro
- alertmanager-data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://alertmanager:9093'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=tasks.alertmanager:9094'
- '--cluster.reconnect-timeout=10m'
- '--cluster.gossip-interval=200ms'
- '--cluster.pushpull-interval=60s'
deploy:
replicas: 3
placement:
max_replicas_per_node: 1
constraints:
- node.role == worker
resources:
limits:
memory: 512M
cpus: '0.5'
reservations:
memory: 256M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
update_config:
parallelism: 1
delay: 10s
failure_action: rollback
order: stop-first
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
alertmanager-lb:
image: nginx:alpine
ports:
- "9093:80"
networks:
- monitoring
volumes:
- alertmanager-nginx-config:/etc/nginx/conf.d:ro
deploy:
replicas: 2
placement:
constraints:
- node.role == manager
resources:
limits:
memory: 128M
cpus: '0.25'
reservations:
memory: 64M
cpus: '0.1'
depends_on:
- alertmanager
volumes:
alertmanager-config:
external: true
alertmanager-data:
alertmanager-nginx-config:
external: true
networks:
monitoring:
external: true
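该 compose 文件引用了外部网络与外部卷,部署前需要先创建并写入配置内容(卷内容的分发此处仅作示意,生产环境可改用 docker config/secret);随后用 docker stack 部署:
# 创建外部网络与卷,并部署 stack(stack 名称为示例)
docker network create --driver overlay --attachable monitoring
docker volume create alertmanager-config
docker volume create alertmanager-nginx-config
docker stack deploy -c alertmanager-cluster.yml alertmanager
# 查看服务与任务状态
docker stack services alertmanager
docker service logs alertmanager_alertmanager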
Kubernetes StatefulSet 部署
# statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: alertmanager
namespace: monitoring
spec:
serviceName: alertmanager-headless
replicas: 3
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
serviceAccountName: alertmanager
containers:
- name: alertmanager
image: prom/alertmanager:v0.25.0
ports:
- containerPort: 9093
name: web
- containerPort: 9094
name: cluster
args:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://alertmanager.monitoring.svc.cluster.local:9093'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=alertmanager-0.alertmanager-headless.monitoring.svc.cluster.local:9094'
- '--cluster.peer=alertmanager-1.alertmanager-headless.monitoring.svc.cluster.local:9094'
- '--cluster.peer=alertmanager-2.alertmanager-headless.monitoring.svc.cluster.local:9094'
- '--cluster.reconnect-timeout=10m'
volumeMounts:
- name: config-volume
mountPath: /etc/alertmanager
- name: storage-volume
mountPath: /alertmanager
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "200m"
livenessProbe:
httpGet:
path: /-/healthy
port: 9093
initialDelaySeconds: 30
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /-/ready
port: 9093
initialDelaySeconds: 5
timeoutSeconds: 5
volumes:
- name: config-volume
configMap:
name: alertmanager-config
volumeClaimTemplates:
- metadata:
name: storage-volume
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: "fast-ssd"
resources:
requests:
storage: 10Gi
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager-headless
namespace: monitoring
spec:
clusterIP: None
selector:
app: alertmanager
ports:
- name: web
port: 9093
targetPort: 9093
- name: cluster
port: 9094
targetPort: 9094
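部署后可通过任一 Pod 的 v2 status 接口确认三个实例已组成集群(返回的 peers 应包含全部成员):
# 查看 Pod 状态
kubectl -n monitoring get pods -l app=alertmanager
# 端口转发到其中一个实例并查看集群成员
kubectl -n monitoring port-forward alertmanager-0 9093:9093 &
curl -s http://localhost:9093/api/v2/status | jq '.cluster'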
负载均衡配置
# nginx.conf
upstream alertmanager {
least_conn;
server alertmanager-0.alertmanager-headless.monitoring.svc.cluster.local:9093 max_fails=3 fail_timeout=30s;
server alertmanager-1.alertmanager-headless.monitoring.svc.cluster.local:9093 max_fails=3 fail_timeout=30s;
server alertmanager-2.alertmanager-headless.monitoring.svc.cluster.local:9093 max_fails=3 fail_timeout=30s;
}
server {
listen 80;
server_name alertmanager.example.com;
location / {
proxy_pass http://alertmanager;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# 健康检查
proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
proxy_connect_timeout 5s;
proxy_send_timeout 10s;
proxy_read_timeout 10s;
}
location /-/healthy {
access_log off;
proxy_pass http://alertmanager;
}
}
2.6 配置验证和测试
配置文件验证
# 使用 amtool 验证配置
amtool config check /etc/alertmanager/alertmanager.yml
# 验证路由配置
amtool config routes show --config.file=/etc/alertmanager/alertmanager.yml
# 测试路由匹配
amtool config routes test \
--config.file=/etc/alertmanager/alertmanager.yml \
alertname=TestAlert severity=warning
API 测试
# 检查状态
curl -s http://localhost:9093/api/v1/status | jq .
# 获取配置信息
curl -s http://localhost:9093/api/v1/status | jq '.data.configYAML'
# 检查集群状态
curl -s http://localhost:9093/api/v1/status | jq '.data.clusterStatus'
# 发送测试告警
curl -XPOST http://localhost:9093/api/v1/alerts -H "Content-Type: application/json" -d '[
{
"labels": {
"alertname": "TestAlert",
"severity": "warning",
"instance": "localhost:9090"
},
"annotations": {
"summary": "This is a test alert",
"description": "This alert is for testing purposes"
},
"startsAt": "'$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)'"
}
]'
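发送后可以查询当前活动告警,确认测试告警已被接收:
# 通过 v2 API 查询活动告警
curl -s http://localhost:9093/api/v2/alerts | jq '.[].labels'
# 或使用 amtool 查询
amtool alert query --alertmanager.url=http://localhost:9093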
健康检查脚本
#!/bin/bash
# healthcheck.sh
ALERTMANAGER_URL="http://localhost:9093"
TIMEOUT=10
# 检查服务是否运行
check_service() {
if ! pgrep -f alertmanager > /dev/null; then
echo "ERROR: Alertmanager process not running"
return 1
fi
echo "OK: Alertmanager process is running"
}
# 检查端口是否监听
check_port() {
if ! nc -z localhost 9093; then
echo "ERROR: Port 9093 is not listening"
return 1
fi
echo "OK: Port 9093 is listening"
}
# 检查 API 响应
check_api() {
local response
response=$(curl -s --max-time $TIMEOUT "$ALERTMANAGER_URL/api/v1/status")
if [ $? -ne 0 ]; then
echo "ERROR: Failed to connect to Alertmanager API"
return 1
fi
if ! echo "$response" | jq -e '.status == "success"' > /dev/null 2>&1; then
echo "ERROR: Alertmanager API returned error: $response"
return 1
fi
echo "OK: Alertmanager API is responding"
}
# 检查配置文件
check_config() {
if ! amtool config check /etc/alertmanager/alertmanager.yml > /dev/null 2>&1; then
echo "ERROR: Configuration file is invalid"
return 1
fi
echo "OK: Configuration file is valid"
}
# 主检查函数
main() {
echo "=== Alertmanager Health Check ==="
echo "Timestamp: $(date)"
echo
local exit_code=0
check_service || exit_code=1
check_port || exit_code=1
check_api || exit_code=1
check_config || exit_code=1
echo
if [ $exit_code -eq 0 ]; then
echo "✅ All checks passed"
else
echo "❌ Some checks failed"
fi
return $exit_code
}
main "$@"
2.7 性能调优
系统级优化
# 调整文件描述符限制
echo "alertmanager soft nofile 65536" | sudo tee -a /etc/security/limits.conf > /dev/null
echo "alertmanager hard nofile 65536" | sudo tee -a /etc/security/limits.conf > /dev/null
# 调整内核参数
echo "net.core.somaxconn = 1024" | sudo tee -a /etc/sysctl.conf > /dev/null
echo "net.ipv4.tcp_max_syn_backlog = 1024" | sudo tee -a /etc/sysctl.conf > /dev/null
sudo sysctl -p
# 调整 systemd 服务限制
sudo mkdir -p /etc/systemd/system/alertmanager.service.d
sudo tee /etc/systemd/system/alertmanager.service.d/limits.conf > /dev/null <<EOF
[Service]
LimitNOFILE=65536
LimitNPROC=65536
EOF
sudo systemctl daemon-reload
sudo systemctl restart alertmanager
应用级优化
# 优化启动参数
/usr/local/bin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/var/lib/alertmanager/ \
--web.external-url=http://localhost:9093 \
--data.retention=120h \
--cluster.tcp-timeout=10s \
--cluster.gossip-interval=200ms \
--cluster.pushpull-interval=60s \
--cluster.settle-timeout=60s \
--log.level=warn
监控指标
# alertmanager-rules.yml:针对 Alertmanager 自身的关键监控规则(Prometheus 规则文件)
groups:
  - name: alertmanager
    rules:
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager instance is down"
      - alert: AlertmanagerHighMemoryUsage
        expr: process_resident_memory_bytes{job="alertmanager"} / 1024 / 1024 > 512
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Alertmanager memory usage is high"
      - alert: AlertmanagerConfigReloadFailed
        expr: alertmanager_config_last_reload_successful == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Alertmanager config reload failed"
本章小结
本章详细介绍了 Alertmanager 的各种安装部署方式:
部署方式对比
| 部署方式 | 适用场景 | 优点 | 缺点 |
|---|---|---|---|
| 二进制安装 | 传统环境、性能要求高 | 性能最佳、资源占用少 | 管理复杂、升级困难 |
| Docker 部署 | 开发测试、快速部署 | 部署简单、环境隔离 | 性能略低、依赖 Docker |
| Kubernetes | 云原生环境、大规模部署 | 自动化管理、高可用 | 复杂度高、学习成本大 |
| 集群部署 | 生产环境、高可用要求 | 高可用、负载分担 | 配置复杂、资源消耗大 |
最佳实践
- 环境选择:根据实际需求选择合适的部署方式
- 资源规划:合理分配 CPU、内存和存储资源
- 网络配置:确保端口开放和网络连通性
- 安全配置:使用专用用户和适当的权限
- 监控告警:部署监控和健康检查
- 备份策略:定期备份配置和数据
下一步学习
在下一章中,我们将深入学习 Alertmanager 的配置文件详解,包括:
- 配置文件结构和语法
- 全局配置选项
- 路由规则配置
- 接收器配置详解
- 抑制规则设置
下一章: 配置文件详解