学习目标
通过本章学习,您将能够:
- 掌握 Docker Swarm 常见故障的诊断方法
- 学会使用各种调试工具和命令
- 了解集群、服务和网络故障的排除技巧
- 掌握日志分析和问题定位技能
- 学会预防性维护和故障预警
1. 故障诊断基础
1.1 故障分类
故障类型概览
# Docker Swarm 故障分类:
# 1. 集群级别故障
# - 节点离线或不可达
# - 管理节点选举失败
# - 集群分裂(Split-brain)
# - 证书过期或认证失败
# 2. 服务级别故障
# - 服务启动失败
# - 副本数不足
# - 服务更新失败
# - 健康检查失败
# 3. 网络级别故障
# - 容器间通信失败
# - 负载均衡异常
# - DNS 解析问题
# - 端口冲突
# 4. 存储级别故障
# - 卷挂载失败
# - 存储空间不足
# - 权限问题
# - 数据损坏
# 5. 资源级别故障
# - CPU/内存不足
# - 磁盘空间不足
# - 网络带宽限制
# - 文件描述符耗尽
故障诊断流程
# 标准故障诊断流程:
# 1. 问题识别
# - 收集故障现象
# - 确定影响范围
# - 记录故障时间
# 2. 信息收集
# - 查看系统状态
# - 收集相关日志
# - 检查配置变更
# 3. 问题分析
# - 分析日志信息
# - 对比正常状态
# - 确定根本原因
# 4. 解决方案
# - 制定修复计划
# - 实施修复措施
# - 验证修复效果
# 5. 预防措施
# - 总结经验教训
# - 完善监控告警
# - 更新运维文档
1.2 基础诊断命令
集群状态检查
#!/bin/bash
# cluster-health-check.sh
#
# Prints a one-shot overview of a Docker Swarm host: cluster info, nodes,
# services, networks, volumes, host resources, daemon status, recent events
# and a short list of likely problems.

echo "=== Docker Swarm Cluster Health Check ==="
echo "Timestamp: $(date)"
echo

# 1. Basic cluster information
echo "1. Cluster Information:"
docker info | grep -A 10 "Swarm:"
echo

# 2. Node status
echo "2. Node Status:"
docker node ls
echo

# 3. Service status
echo "3. Service Status:"
docker service ls
echo

# 4. Network status
echo "4. Network Status:"
docker network ls
echo

# 5. Volume status
echo "5. Volume Status:"
docker volume ls
echo

# 6. Host resources
echo "6. System Resources:"
echo "CPU Usage:"
top -bn1 | grep "Cpu(s)"
echo "Memory Usage:"
free -h
echo "Disk Usage:"
df -h
echo

# 7. Docker daemon status
echo "7. Docker Daemon Status:"
systemctl status docker --no-pager
echo

# 8. Recent Docker events
# "0s" (a zero duration, i.e. "now") is used for --until because the flag
# accepts timestamps or durations, not the literal word "now".
echo "8. Recent Docker Events:"
docker events --since "1h" --until "0s" | tail -10
echo

# 9. Error detection
echo "9. Error Detection:"
echo "Failed services:"
# `docker service ls` has no desired-state filter, so list every service and
# keep those with zero running replicas out of a non-zero target. The match
# is anchored on the replicas column so "10/10" is not mistaken for "0/...".
docker service ls --format '{{.Name}} {{.Replicas}}' \
  | awk '$2 ~ /^0\/[1-9]/'
echo "Unhealthy nodes:"
# `docker node ls` cannot filter on availability; filter the columns instead
# and report every node that is not both Ready and Active.
docker node ls --format '{{.Hostname}} {{.Status}} {{.Availability}}' \
  | awk '$2 != "Ready" || $3 != "Active"'
echo "Network issues:"
# Overlay networks whose scope column is not "swarm".
docker network ls --filter "driver=overlay" \
  --format '{{.Name}} {{.Driver}} {{.Scope}}' \
  | awk '$3 != "swarm"'
echo "=== Health Check Complete ==="
详细诊断脚本
#!/bin/bash
# detailed-diagnostics.sh
#
# Shared settings for the diagnostic collectors below.
# DIAG_DIR    - directory that receives every report
# TIMESTAMP   - run identifier embedded in the report file names
# REPORT_FILE - plain-text report that log() and the collectors append to
DIAG_DIR="/var/log/swarm-diagnostics"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
REPORT_FILE="$DIAG_DIR/diagnostic-report-$TIMESTAMP.txt"

# Create the diagnostics directory. Every later step writes into it, so stop
# here with a clear message instead of failing on each individual write.
if ! mkdir -p -- "$DIAG_DIR"; then
  echo "ERROR: cannot create diagnostics directory: $DIAG_DIR" >&2
  exit 1
fi
# Logging helper: prints "<timestamp> - <message>" to stdout and appends the
# same line to the text report.
# Globals:   REPORT_FILE (appended to)
# Arguments: $1 - message text
log() {
  # The path is quoted so a report location containing spaces is written to a
  # single file instead of being split into several tee arguments.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$REPORT_FILE"
}
# Append host-level facts (OS, hardware, network) to the text report.
# Globals: REPORT_FILE (appended to)
# Calls:   log
collect_system_info() {
  log "Collecting system information..."
  {
    echo "=== System Information ==="
    echo "Hostname: $(hostname)"
    echo "Kernel: $(uname -r)"
    # PRETTY_NAME="..." -> take the value and strip the surrounding quotes.
    echo "OS: $(grep PRETTY_NAME /etc/os-release | cut -d'=' -f2 | tr -d '"')"
    echo "Uptime: $(uptime)"
    echo "Date: $(date)"
    echo

    echo "=== Hardware Information ==="
    echo "CPU Info:"
    lscpu | grep -E "Model name|CPU\(s\)|Thread\(s\)"
    echo
    echo "Memory Info:"
    free -h
    echo
    echo "Disk Info:"
    df -h
    echo

    echo "=== Network Information ==="
    echo "Network Interfaces:"
    ip addr show
    echo
    echo "Routing Table:"
    ip route show
    echo
    echo "Network Connections:"
    ss -tuln | head -n 20
    echo
  } >> "$REPORT_FILE" # quoted: an unquoted target with spaces is an "ambiguous redirect"
}
# Append Docker engine facts (version, info, disk usage, containers, images,
# networks, volumes) to the text report.
# Globals: REPORT_FILE (appended to)
# Calls:   log
collect_docker_info() {
  log "Collecting Docker information..."
  {
    echo "=== Docker Information ==="
    docker version
    echo
    docker info
    echo

    echo "=== Docker Storage ==="
    docker system df
    echo

    echo "=== Docker Processes ==="
    docker ps -a
    echo

    echo "=== Docker Images ==="
    docker images
    echo

    echo "=== Docker Networks ==="
    docker network ls
    echo

    echo "=== Docker Volumes ==="
    docker volume ls
    echo
  } >> "$REPORT_FILE" # quoted so a path with spaces stays one redirect target
}
# Append Swarm state to the text report: cluster summary, every node, every
# service (with its tasks) and every stack.
# Globals: REPORT_FILE (appended to)
# Calls:   log
collect_swarm_info() {
  local -a nodes=() services=() stacks=()
  local node service stack

  log "Collecting Swarm information..."
  {
    echo "=== Swarm Cluster Information ==="
    docker info | grep -A 20 "Swarm:"
    echo

    echo "=== Node Information ==="
    docker node ls
    echo
    # Per-node details. Names are read line by line into an array instead of
    # being word-split from a command substitution.
    mapfile -t nodes < <(docker node ls --format '{{.Hostname}}')
    for node in "${nodes[@]}"; do
      echo "--- Node: $node ---"
      docker node inspect "$node" --pretty
      echo
    done

    echo "=== Service Information ==="
    docker service ls
    echo
    # Per-service details and task list.
    mapfile -t services < <(docker service ls --format '{{.Name}}')
    for service in "${services[@]}"; do
      echo "--- Service: $service ---"
      docker service inspect "$service" --pretty
      echo
      echo "Service Tasks:"
      docker service ps "$service"
      echo
    done

    echo "=== Stack Information ==="
    docker stack ls
    echo
    # Per-stack task list.
    mapfile -t stacks < <(docker stack ls --format '{{.Name}}')
    for stack in "${stacks[@]}"; do
      echo "--- Stack: $stack ---"
      docker stack ps "$stack"
      echo
    done
  } >> "$REPORT_FILE"
}
# Append recent log excerpts to the text report: Docker daemon journal,
# system log, every service and every running container.
# Globals: REPORT_FILE (appended to)
# Calls:   log
collect_logs() {
  local -a services=() containers=()
  local service container

  log "Collecting log information..."
  {
    echo "=== System Logs ==="
    echo "Docker Daemon Logs (last 100 lines):"
    journalctl -u docker --no-pager -n 100
    echo
    echo "System Logs (last 50 lines):"
    # Try /var/log/syslog first and fall back to /var/log/messages.
    tail -n 50 /var/log/syslog 2>/dev/null || tail -n 50 /var/log/messages 2>/dev/null
    echo

    echo "=== Service Logs ==="
    mapfile -t services < <(docker service ls --format '{{.Name}}')
    for service in "${services[@]}"; do
      echo "--- Service Logs: $service (last 50 lines) ---"
      docker service logs --tail 50 "$service" 2>&1
      echo
    done

    echo "=== Container Logs ==="
    mapfile -t containers < <(docker ps --format '{{.Names}}')
    for container in "${containers[@]}"; do
      echo "--- Container Logs: $container (last 30 lines) ---"
      docker logs --tail 30 "$container" 2>&1
      echo
    done
  } >> "$REPORT_FILE"
}
# Append a performance snapshot (CPU, memory, disk I/O, network counters,
# load, top processes, container stats) to the text report.
# Globals: REPORT_FILE (appended to)
# Calls:   log
collect_performance_info() {
  local stats_rc

  log "Collecting performance information..."
  {
    echo "=== Performance Information ==="
    echo "CPU Usage:"
    top -bn1 | head -n 20
    echo
    echo "Memory Usage:"
    cat /proc/meminfo
    echo
    echo "Disk I/O:"
    iostat -x 1 3 2>/dev/null || echo "iostat not available"
    echo
    echo "Network Statistics:"
    cat /proc/net/dev
    echo
    echo "Load Average:"
    cat /proc/loadavg
    echo
    echo "Process List:"
    ps aux | head -n 20
    echo
    echo "Docker Stats:"
    timeout 10 docker stats --no-stream 2>/dev/null
    stats_rc=$?
    # timeout exits with 124 only when the time limit was hit; any other
    # failure is a different problem and must not be reported as a timeout.
    if [ "$stats_rc" -eq 124 ]; then
      echo "Docker stats timeout"
    elif [ "$stats_rc" -ne 0 ]; then
      echo "Docker stats unavailable (exit code $stats_rc)"
    fi
    echo
  } >> "$REPORT_FILE"
}
# Append network diagnostics to the text report: every Docker network's
# definition, listening ports, firewall rules, DNS configuration and two
# outbound ping tests.
# Globals: REPORT_FILE (appended to)
# Calls:   log
network_diagnostics() {
  local -a networks=()
  local network

  log "Running network diagnostics..."
  {
    echo "=== Network Diagnostics ==="

    # Docker networks
    echo "Docker Networks:"
    mapfile -t networks < <(docker network ls --format '{{.Name}}')
    for network in "${networks[@]}"; do
      echo "--- Network: $network ---"
      docker network inspect "$network"
      echo
    done

    # Listening ports (netstat first, ss as fallback)
    echo "Listening Ports:"
    netstat -tuln 2>/dev/null || ss -tuln
    echo

    # Firewall rules
    echo "Firewall Rules:"
    iptables -L -n 2>/dev/null || echo "iptables not accessible"
    echo

    # DNS configuration
    echo "DNS Configuration:"
    cat /etc/resolv.conf
    echo

    # Outbound connectivity
    echo "Network Connectivity Tests:"
    echo "Ping Google DNS:"
    ping -c 3 8.8.8.8 2>&1
    echo
    echo "Ping Docker Hub:"
    ping -c 3 registry-1.docker.io 2>&1
    echo
  } >> "$REPORT_FILE"
}
# Write the HTML summary report next to the text report: a header, an
# "Executive Summary" table (cluster state, node count, service count, failed
# services) and a link to the text report.
# Globals: DIAG_DIR, TIMESTAMP, REPORT_FILE (read)
# Calls:   log
generate_report() {
  local html_report="$DIAG_DIR/diagnostic-report-$TIMESTAMP.html"
  local docker_info cluster_status node_count service_count failed_services
  local cluster_class node_class failed_class

  log "Generating diagnostic report..."

  # The heredoc delimiter is intentionally unquoted so that $(date) and
  # $(hostname) are expanded; with a quoted delimiter they would be written
  # into the page literally. The static markup contains no other characters
  # that the shell would expand.
  cat > "$html_report" <<EOF
<!DOCTYPE html>
<html>
<head>
<title>Docker Swarm Diagnostic Report</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
.header { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
.section { margin: 20px 0; border: 1px solid #ddd; padding: 10px; border-radius: 5px; }
.error { background-color: #ffebee; }
.warning { background-color: #fff3e0; }
.success { background-color: #e8f5e8; }
pre { background-color: #f5f5f5; padding: 10px; overflow-x: auto; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #f2f2f2; }
</style>
</head>
<body>
<div class="header">
<h1>Docker Swarm Diagnostic Report</h1>
<p>Generated: $(date)</p>
<p>Hostname: $(hostname)</p>
</div>
EOF

  # Cluster state, taken from a single `docker info` call.
  docker_info=$(docker info 2>/dev/null)
  cluster_status="Unknown"
  case "$docker_info" in
    *"Swarm: active"*) cluster_status="Active" ;;
    *"Swarm: inactive"*) cluster_status="Inactive" ;;
  esac

  # Count data rows directly (no header line to subtract), so a failing
  # docker command yields 0 rather than -1.
  node_count=$(docker node ls --format '{{.ID}}' 2>/dev/null | grep -c .)
  service_count=$(docker service ls --format '{{.ID}}' 2>/dev/null | grep -c .)

  # A service counts as failed when its running-replica count is exactly 0;
  # the match is anchored so that "10/10" is not counted.
  failed_services=$(docker service ls --format '{{.Name}} {{.Replicas}}' 2>/dev/null \
    | awk '$2 ~ /^0\// { n++ } END { print n + 0 }')

  cluster_class="error"
  if [ "$cluster_status" = "Active" ]; then cluster_class="success"; fi
  node_class="error"
  if [ "$node_count" -gt 0 ]; then node_class="success"; fi
  failed_class="error"
  if [ "$failed_services" -eq 0 ]; then failed_class="success"; fi

  {
    # Summary section
    echo " <div class='section'>"
    echo " <h2>Executive Summary</h2>"
    echo " <table>"
    echo " <tr><th>Metric</th><th>Value</th><th>Status</th></tr>"
    echo " <tr><td>Cluster Status</td><td>$cluster_status</td><td class='$cluster_class'></td></tr>"
    echo " <tr><td>Node Count</td><td>$node_count</td><td class='$node_class'></td></tr>"
    echo " <tr><td>Service Count</td><td>$service_count</td><td class='success'></td></tr>"
    echo " <tr><td>Failed Services</td><td>$failed_services</td><td class='$failed_class'></td></tr>"
    echo " </table>"
    echo " </div>"
    # Link to the full text report
    echo " <div class='section'>"
    echo " <h2>Detailed Information</h2>"
    echo " <p>Full diagnostic report: <a href='diagnostic-report-$TIMESTAMP.txt'>diagnostic-report-$TIMESTAMP.txt</a></p>"
    echo " </div>"
    # Close the document
    echo "</body></html>"
  } >> "$html_report"

  log "Diagnostic report generated:"
  log " Text report: $REPORT_FILE"
  log " HTML report: $html_report"
}
# Entry point: run every collector in order, build the report, then tell the
# user where the results are.
# Globals: DIAG_DIR (read)
main() {
  local step

  log "Starting comprehensive diagnostic collection..."

  # The collectors are independent of each other; they run in the order the
  # sections appear in the report, with report generation last.
  for step in \
    collect_system_info \
    collect_docker_info \
    collect_swarm_info \
    collect_logs \
    collect_performance_info \
    network_diagnostics \
    generate_report; do
    "$step"
  done

  log "Diagnostic collection completed successfully"
  printf '%s\n' "Reports saved to: $DIAG_DIR"
}
# Run the entry point, forwarding any command-line arguments unchanged.
main "$@"
3.2 服务健康检查故障
健康检查诊断
#!/bin/bash
# health-check-diagnostics.sh
#
# Usage: health-check-diagnostics.sh <service-name>

# Swarm service to diagnose, taken from the first positional argument.
SERVICE_NAME="${1}"
# File that log() appends every message to.
LOG_FILE='/var/log/health-check-diagnostics.log'
# Logging helper: prints "<timestamp> - <message>" to stdout and appends the
# same line to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
log() {
  # Quoted path: a log location containing spaces must remain one file name.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
# Report the health check configured on a service.
# Arguments: $1 - service name
# Returns:   0 when a health check is configured (its settings are logged),
#            1 when none is configured or the service cannot be inspected
# Calls:     log
check_health_config() {
  local service=$1
  local hc='.Spec.TaskTemplate.ContainerSpec.Healthcheck'
  local health_json test_cmd interval timeout retries start_period

  log "Checking health check configuration for service: $service"

  # Read the configuration as JSON: an unset health check then renders as
  # "null". Rendering the value without the json function prints "<nil>" for
  # an unset pointer, which a comparison against "<no value>" never matches.
  if ! health_json=$(docker service inspect "$service" --format "{{json $hc}}"); then
    log "Failed to inspect service $service"
    return 1
  fi
  case "$health_json" in
    "" | "null" | "<nil>" | "<no value>")
      log "No health check configured for service $service"
      return 1
      ;;
  esac

  log "Health check configuration:"
  # Pretty-print with jq when it is installed; otherwise show the raw JSON
  # instead of losing the output.
  if command -v jq > /dev/null 2>&1; then
    printf '%s\n' "$health_json" | jq .
  else
    printf '%s\n' "$health_json"
  fi

  # Individual settings
  test_cmd=$(docker service inspect "$service" --format "{{range $hc.Test}}{{.}} {{end}}")
  interval=$(docker service inspect "$service" --format "{{$hc.Interval}}")
  timeout=$(docker service inspect "$service" --format "{{$hc.Timeout}}")
  retries=$(docker service inspect "$service" --format "{{$hc.Retries}}")
  start_period=$(docker service inspect "$service" --format "{{$hc.StartPeriod}}")

  log "Health check details:"
  log " Test command: $test_cmd"
  log " Interval: $interval"
  log " Timeout: $timeout"
  log " Retries: $retries"
  log " Start period: $start_period"
  return 0
}
# Run a service's health check command by hand inside one of its containers
# on this node and report the result.
# Arguments: $1 - service name
# Returns:   1 when no running task or no local container is found,
#            0 otherwise
# Calls:     log, analyze_health_failure
test_health_command() {
  local service=$1
  local hc='.Spec.TaskTemplate.ContainerSpec.Healthcheck'
  local -a task_refs=() hc_test=()
  local ref actual_container="" shell_cmd result exit_code

  log "Testing health check command for service: $service"

  # Task references in "<task name>.<task id>" form, which is matched against
  # local container names below.
  mapfile -t task_refs < <(docker service ps "$service" \
    --filter "desired-state=running" --format '{{.Name}}.{{.ID}}')
  if [ "${#task_refs[@]}" -eq 0 ]; then
    log "No running containers found for service $service"
    return 1
  fi

  # Use the first task that has a container visible to this Docker engine,
  # rather than giving up when the first listed task has none.
  for ref in "${task_refs[@]}"; do
    [ -n "$ref" ] || continue
    actual_container=$(docker ps --filter "name=$ref" --format '{{.ID}}' | head -n 1)
    if [ -n "$actual_container" ]; then
      break
    fi
  done
  if [ -z "$actual_container" ]; then
    log "Container not found: ${task_refs[*]}"
    return 1
  fi
  log "Testing health check on container: $actual_container"

  # One array element per entry of the Test list.
  mapfile -t hc_test < <(docker service inspect "$service" \
    --format "{{range $hc.Test}}{{println .}}{{end}}")
  if [ "${#hc_test[@]}" -eq 0 ] || [ -z "${hc_test[0]}" ]; then
    log "No health check command found"
    return 0
  fi

  # The first entry of Test is a mode marker, not part of the command:
  #   NONE      - health check disabled
  #   CMD-SHELL - remaining entry is a command line for the shell
  #   CMD       - remaining entries are an argv to execute directly
  # The exit status is captured on its own line: reading $? after
  # `local var=$(...)` would yield the status of `local`, which is always 0.
  case "${hc_test[0]}" in
    NONE)
      log "Health check is disabled for service $service"
      return 0
      ;;
    CMD-SHELL)
      shell_cmd=$(printf '%s\n' "${hc_test[@]:1}")
      log "Executing health check command: $shell_cmd"
      result=$(docker exec "$actual_container" sh -c "$shell_cmd" 2>&1)
      exit_code=$?
      ;;
    CMD)
      log "Executing health check command: ${hc_test[*]:1}"
      result=$(docker exec "$actual_container" "${hc_test[@]:1}" 2>&1)
      exit_code=$?
      ;;
    *)
      # No recognised marker: treat the whole list as the argv.
      log "Executing health check command: ${hc_test[*]}"
      result=$(docker exec "$actual_container" "${hc_test[@]}" 2>&1)
      exit_code=$?
      ;;
  esac

  log "Health check result:"
  log " Exit code: $exit_code"
  log " Output: $result"
  if [ "$exit_code" -eq 0 ]; then
    log "Health check passed"
  else
    log "Health check failed"
    # Explain the likely cause
    analyze_health_failure "$result" "$exit_code"
  fi
}
# Log a probable cause for a failed health check, based on its exit code and
# on well-known phrases in its output.
# Arguments: $1 - combined output of the health check command
#            $2 - its exit code
# Calls:     log
analyze_health_failure() {
  local output=$1
  local exit_code=$2

  log "Analyzing health check failure..."

  case "$exit_code" in
    1)
      log "Health check returned unhealthy status"
      ;;
    2)
      log "Health check command not found or permission denied"
      ;;
    126)
      log "Health check command not executable"
      ;;
    127)
      log "Health check command not found"
      ;;
    *)
      log "Health check failed with exit code: $exit_code"
      ;;
  esac

  # Look for common error phrases. grep runs with -q so that only the
  # diagnosis is printed; without it the matched output line itself would be
  # echoed to stdout a second time.
  if grep -qi "connection refused" <<< "$output"; then
    log "Issue: Connection refused - service may not be listening"
  elif grep -qi "timeout" <<< "$output"; then
    log "Issue: Timeout - service may be slow to respond"
  elif grep -qi "permission denied" <<< "$output"; then
    log "Issue: Permission denied - check file permissions"
  elif grep -qi "no such file" <<< "$output"; then
    log "Issue: File not found - check file paths"
  fi
}
# Interactive menu that changes one health check setting of a service via
# `docker service update`.
# Arguments: $1 - service name
# Input:     menu choice and new value are read from stdin
# Calls:     log
fix_health_check() {
  local service=$1
  # All prompt variables are local so they do not leak into the caller.
  local choice timeout interval retries start_period health_cmd

  log "Health check fix options for service: $service"
  echo "Available fix options:"
  echo "1. Increase health check timeout"
  echo "2. Increase health check interval"
  echo "3. Increase retry count"
  echo "4. Increase start period"
  echo "5. Update health check command"
  echo "6. Disable health check"
  echo "7. Exit without changes"

  # read -r keeps backslashes in the answers literal; all values are quoted
  # when passed on so they reach docker as single arguments.
  read -r -p "Choose a fix option (1-7): " choice
  case "$choice" in
    1)
      read -r -p "Enter new timeout (e.g., 30s): " timeout
      if [ -n "$timeout" ]; then
        log "Updating health check timeout to: $timeout"
        docker service update --health-timeout "$timeout" "$service"
      fi
      ;;
    2)
      read -r -p "Enter new interval (e.g., 30s): " interval
      if [ -n "$interval" ]; then
        log "Updating health check interval to: $interval"
        docker service update --health-interval "$interval" "$service"
      fi
      ;;
    3)
      read -r -p "Enter new retry count (e.g., 5): " retries
      if [ -n "$retries" ]; then
        log "Updating health check retries to: $retries"
        docker service update --health-retries "$retries" "$service"
      fi
      ;;
    4)
      read -r -p "Enter new start period (e.g., 60s): " start_period
      if [ -n "$start_period" ]; then
        log "Updating health check start period to: $start_period"
        docker service update --health-start-period "$start_period" "$service"
      fi
      ;;
    5)
      read -r -p "Enter new health check command: " health_cmd
      if [ -n "$health_cmd" ]; then
        log "Updating health check command to: $health_cmd"
        docker service update --health-cmd "$health_cmd" "$service"
      fi
      ;;
    6)
      log "Disabling health check..."
      docker service update --no-healthcheck "$service"
      ;;
    7)
      log "Exiting without changes"
      ;;
    *)
      log "Invalid choice"
      ;;
  esac
}
# Entry point: inspect the service's health check, test it, and optionally
# fix it; when none is configured, offer to add one.
# Globals: SERVICE_NAME (read)
# Input:   yes/no answers and the new command are read from stdin
# Exits:   1 when no service name was given
main() {
  local fix_choice add_choice health_cmd

  if [ -z "$SERVICE_NAME" ]; then
    echo "Usage: $0 <service-name>"
    exit 1
  fi

  log "Starting health check diagnostics for: $SERVICE_NAME"

  if check_health_config "$SERVICE_NAME"; then
    test_health_command "$SERVICE_NAME"
    echo
    read -r -p "Do you want to fix the health check? (yes/no): " fix_choice
    if [ "$fix_choice" = "yes" ]; then
      fix_health_check "$SERVICE_NAME"
    fi
  else
    echo "No health check configured. Would you like to add one?"
    read -r -p "(yes/no): " add_choice
    if [ "$add_choice" = "yes" ]; then
      read -r -p "Enter health check command: " health_cmd
      if [ -n "$health_cmd" ]; then
        # Only report success when the update actually succeeded.
        if docker service update --health-cmd "$health_cmd" "$SERVICE_NAME"; then
          log "Health check added to service $SERVICE_NAME"
        else
          log "Failed to add health check to service $SERVICE_NAME"
        fi
      fi
    fi
  fi

  log "Health check diagnostics completed"
}
# Run the entry point, forwarding any command-line arguments unchanged.
main "$@"
4. 网络故障排除
4.1 网络连通性问题
网络诊断脚本
#!/bin/bash
# network-diagnostics.sh

# File that log() appends every message to.
LOG_FILE='/var/log/network-diagnostics.log'
# Logging helper: prints "<timestamp> - <message>" to stdout and appends the
# same line to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
log() {
  # Quoted path: a log location containing spaces must remain one file name.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
# Print the host's network building blocks, each under a logged heading:
# Docker networks, overlay networks, interfaces, routes, the first iptables
# rules and the network namespaces.
# Calls: log
check_network_infrastructure() {
  local -a headings=(
    "Docker networks:"
    "Overlay networks:"
    "Network interfaces:"
    "Routing table:"
    "Iptables rules:"
    "Network namespaces:"
  )
  local idx

  log "Checking network infrastructure..."

  # Each heading is logged and followed by the command that belongs to it.
  for idx in "${!headings[@]}"; do
    log "${headings[$idx]}"
    case "$idx" in
      0) docker network ls ;;
      1) docker network ls --filter "driver=overlay" ;;
      2) ip addr show ;;
      3) ip route show ;;
      4) iptables -L -n | head -n 20 ;;
      5) ip netns list ;;
    esac
  done
}
# Log the network configuration of every service: attached networks,
# published ports and virtual IPs.
# Calls: log
check_service_networks() {
  local -a services=()
  local service networks ports vip

  log "Checking service network configurations..."

  # Service names are read line by line rather than word-split.
  mapfile -t services < <(docker service ls --format '{{.Name}}')
  for service in "${services[@]}"; do
    log "Service: $service"

    # Attached networks
    networks=$(docker service inspect "$service" \
      --format '{{range .Spec.TaskTemplate.Networks}}{{.Target}} {{end}}')
    log " Networks: $networks"

    # Published ports
    ports=$(docker service inspect "$service" \
      --format '{{range .Spec.EndpointSpec.Ports}}{{.PublishedPort}}:{{.TargetPort}}/{{.Protocol}} {{end}}')
    if [ -n "$ports" ]; then
      log " Published ports: $ports"
    fi

    # Service endpoint (virtual IPs)
    vip=$(docker service inspect "$service" \
      --format '{{range .Endpoint.VirtualIPs}}{{.NetworkID}}:{{.Addr}} {{end}}')
    if [ -n "$vip" ]; then
      log " Virtual IPs: $vip"
    fi
  done
}
# Ping every running container from every other running container and log
# whether each pair can reach each other.
# Returns: 1 when fewer than two containers are running
# Calls:   log
test_container_connectivity() {
  local -a containers=()
  local i j source_container target_container target_ip ping_result

  log "Testing container connectivity..."

  # All running containers
  mapfile -t containers < <(docker ps --format '{{.Names}}')
  if [ "${#containers[@]}" -lt 2 ]; then
    log "Need at least 2 containers to test connectivity"
    return 1
  fi

  for i in "${!containers[@]}"; do
    source_container=${containers[$i]}
    for j in "${!containers[@]}"; do
      [ "$i" -ne "$j" ] || continue
      target_container=${containers[$j]}
      log "Testing connectivity: $source_container -> $target_container"

      # One address per line, first non-empty one wins. Emitting the
      # addresses without a separator would glue the IPs of a multi-network
      # container together into a single invalid string.
      # NOTE(review): the first address is not necessarily on a network the
      # source container shares - confirm whether that matters here.
      target_ip=$(docker inspect "$target_container" \
        --format '{{range .NetworkSettings.Networks}}{{println .IPAddress}}{{end}}' \
        | awk 'NF { print $1; exit }')
      if [ -z "$target_ip" ]; then
        log " ✗ Could not get target IP for $target_container"
        continue
      fi

      # Judge by ping's exit status: the wording of the summary line differs
      # between ping implementations, so matching "1 received" is unreliable.
      if ping_result=$(docker exec "$source_container" ping -c 1 -W 2 "$target_ip" 2>&1); then
        log " ✓ Connectivity successful"
      else
        log " ✗ Connectivity failed"
        log " Target IP: $target_ip"
        log " Error: $(printf '%s\n' "$ping_result" | tail -n 1)"
      fi
    done
  done
}
# From inside the first running container, resolve every service name via
# DNS and ping it; log the outcome per service.
# Returns: 1 when there are no services or no running container
# Calls:   log
test_service_discovery() {
  local -a services=()
  local test_container service dns_result service_ip

  log "Testing service discovery..."

  # All services
  mapfile -t services < <(docker service ls --format '{{.Name}}')
  if [ "${#services[@]}" -eq 0 ]; then
    log "No services found"
    return 1
  fi

  # A running container to test from
  test_container=$(docker ps --format '{{.Names}}' | head -n 1)
  if [ -z "$test_container" ]; then
    log "No running containers found for testing"
    return 1
  fi
  log "Using test container: $test_container"

  for service in "${services[@]}"; do
    log "Testing service discovery for: $service"

    # DNS resolution. Only an "Address" line that follows a "Name:" line is
    # an answer; the resolver's own "Address:" line is printed even when the
    # lookup fails, so its mere presence proves nothing.
    dns_result=$(docker exec "$test_container" nslookup "$service" 2>&1)
    service_ip=$(printf '%s\n' "$dns_result" | awk '
      answer && /^Address/ { sub(/^Address[^:]*:[ \t]*/, ""); print $1; exit }
      /^Name:/ { answer = 1 }')

    if [ -n "$service_ip" ]; then
      log " ✓ DNS resolution successful: $service -> $service_ip"
      # Connectivity, judged by ping's exit status (summary wording varies
      # between ping implementations).
      if docker exec "$test_container" ping -c 1 -W 2 "$service" > /dev/null 2>&1; then
        log " ✓ Service connectivity successful"
      else
        log " ✗ Service connectivity failed"
      fi
    else
      log " ✗ DNS resolution failed for $service"
      log " Error: $(printf '%s\n' "$dns_result" | grep -i -E "error|fail|can't find|NXDOMAIN" | head -n 1)"
    fi
  done
}
# Send ten HTTP requests to the first multi-replica service from inside the
# first running container and log whether each request succeeded.
# Returns: 1 when no multi-replica service or no running container exists
# Calls:   log
test_load_balancing() {
  local -a tasks=()
  local service_with_replicas test_container i

  log "Testing load balancing..."

  # First service whose desired replica count (the number after the slash)
  # is greater than one. Excluding lines that contain "1/1" would also pick
  # "0/1" services and skip ones such as "11/11".
  service_with_replicas=$(docker service ls --format '{{.Name}} {{.Replicas}}' \
    | awk '{ split($2, r, "/"); if (r[2] + 0 > 1) { print $1; exit } }')
  if [ -z "$service_with_replicas" ]; then
    log "No services with multiple replicas found"
    return 1
  fi
  log "Testing load balancing for service: $service_with_replicas"

  # Tasks of that service
  mapfile -t tasks < <(docker service ps "$service_with_replicas" \
    --filter "desired-state=running" --format '{{.Name}}')
  log "Service tasks: ${tasks[*]}"

  # Container to send the requests from
  test_container=$(docker ps --format '{{.Names}}' | head -n 1)
  if [ -z "$test_container" ]; then
    log "No test container available"
    return 1
  fi

  log "Performing multiple requests to test load balancing..."
  for i in {1..10}; do
    # Assumes the service exposes an HTTP endpoint; adjust for other service
    # types. Success is decided by wget's exit status: comparing captured
    # output against a sentinel string misreports failures whenever wget
    # prints an error message.
    if docker exec "$test_container" wget -qO- --timeout=2 \
      "http://$service_with_replicas" > /dev/null 2>&1; then
      log " Request $i: Success"
    else
      log " Request $i: Failed"
    fi
    sleep 0.5
  done
}
# Measure latency (ping) and, when iperf3 is present in both containers,
# bandwidth between the first two running containers.
# Returns: 1 when fewer than two containers are running
# Calls:   log
test_network_performance() {
  local -a containers=()
  local source_container target_container target_ip ping_stats bandwidth_result

  log "Testing network performance..."

  # The first two running containers
  mapfile -t containers < <(docker ps --format '{{.Names}}' | head -n 2)
  if [ "${#containers[@]}" -lt 2 ]; then
    log "Need at least 2 containers for performance testing"
    return 1
  fi
  source_container=${containers[0]}
  target_container=${containers[1]}
  log "Performance test: $source_container -> $target_container"

  # One address per line, first non-empty one wins, so the IPs of a
  # multi-network container are not glued together.
  target_ip=$(docker inspect "$target_container" \
    --format '{{range .NetworkSettings.Networks}}{{println .IPAddress}}{{end}}' \
    | awk 'NF { print $1; exit }')
  if [ -z "$target_ip" ]; then
    log "Could not get target container IP"
    return
  fi

  # Latency
  log "Testing latency..."
  ping_stats=$(docker exec "$source_container" ping -c 10 "$target_ip" 2>&1 | tail -n 1)
  log " Ping statistics: $ping_stats"

  # Bandwidth. The server runs in the target and the client in the source,
  # so iperf3 has to exist in both containers.
  if docker exec "$target_container" sh -c 'command -v iperf3' > /dev/null 2>&1 \
    && docker exec "$source_container" sh -c 'command -v iperf3' > /dev/null 2>&1; then
    log "Testing bandwidth with iperf3..."
    # Start the iperf3 server in the target container
    docker exec -d "$target_container" iperf3 -s
    sleep 2
    # Run the iperf3 client in the source container
    bandwidth_result=$(docker exec "$source_container" iperf3 -c "$target_ip" -t 5 2>&1 | grep "sender")
    log " Bandwidth test result: $bandwidth_result"
    # Stop the iperf3 server
    docker exec "$target_container" pkill iperf3
  else
    log " iperf3 not available for bandwidth testing"
  fi
}
# Look for common network problems: daemon network settings, overlay
# encryption, listening ports, the DOCKER-USER firewall chain and interface
# MTUs.
# Calls: log
diagnose_network_issues() {
  local -a networks=()
  local network options encrypted listening_ports interface mtu

  log "Diagnosing common network issues..."

  # Docker daemon network configuration
  log "Docker daemon network configuration:"
  docker info | grep -A 10 "Network:"

  # Overlay encryption. The option is detected by the presence of the
  # "encrypted" key in the options map: reading the key's value directly
  # prints an empty string or "<no value>", neither of which is a usable
  # yes/no answer.
  log "Checking Swarm network encryption..."
  mapfile -t networks < <(docker network ls --filter "driver=overlay" --format '{{.Name}}')
  for network in "${networks[@]}"; do
    options=$(docker network inspect "$network" --format '{{json .Options}}')
    case "$options" in
      *'"encrypted"'*) encrypted="true" ;;
      *) encrypted="false" ;;
    esac
    log " Network $network encrypted: $encrypted"
  done

  # Listening ports (netstat first, ss as fallback)
  log "Checking for port conflicts..."
  listening_ports=$({ netstat -tuln 2>/dev/null || ss -tuln; } | grep LISTEN)
  log "Listening ports:"
  echo "$listening_ports"

  # Firewall rules that affect Docker
  log "Checking firewall rules affecting Docker..."
  iptables -L DOCKER-USER -n 2>/dev/null || log "DOCKER-USER chain not found"

  # MTU per interface, parsed from one pass over the one-line-per-link
  # listing. The "@peer" suffix of names such as "veth1@if6" is removed
  # because it is not part of the interface name.
  log "Checking MTU settings..."
  while read -r interface mtu; do
    log " Interface $interface MTU: $mtu"
  done < <(ip -o link show | awk '{
    name = $2
    sub(/:$/, "", name)
    sub(/@.*/, "", name)
    for (i = 3; i < NF; i++) {
      if ($i == "mtu") { print name, $(i + 1); break }
    }
  }')
}
# Interactive menu of network repair actions. The destructive options ask
# for confirmation first.
# Input: menu choice, confirmations and values are read from stdin
# Calls: log
fix_network_issues() {
  local -a networks=() namespaces=()
  local choice confirm network ns interface mtu

  log "Network issue fix options:"
  echo "Available fix options:"
  echo "1. Restart Docker daemon"
  echo "2. Recreate overlay networks"
  echo "3. Flush iptables rules"
  echo "4. Reset network namespaces"
  echo "5. Update network MTU"
  echo "6. Exit without changes"

  read -r -p "Choose a fix option (1-6): " choice
  case "$choice" in
    1)
      log "Restarting Docker daemon..."
      systemctl restart docker
      sleep 10
      log "Docker daemon restarted"
      ;;
    2)
      log "Recreating overlay networks..."
      # Handle with care: this can affect running services.
      echo "WARNING: This will affect running services"
      read -r -p "Continue? (yes/no): " confirm
      if [ "$confirm" = "yes" ]; then
        mapfile -t networks < <(docker network ls --filter "driver=overlay" --format '{{.Name}}')
        for network in "${networks[@]}"; do
          if [ "$network" != "ingress" ]; then
            log "Recreating network: $network"
            # NOTE: nothing is recreated yet; safely rebuilding a network
            # needs more elaborate logic than this script provides.
          fi
        done
      fi
      ;;
    3)
      log "Flushing iptables rules..."
      echo "WARNING: This will reset all iptables rules"
      read -r -p "Continue? (yes/no): " confirm
      if [ "$confirm" = "yes" ]; then
        iptables -F
        iptables -X
        iptables -t nat -F
        iptables -t nat -X
        systemctl restart docker
        log "Iptables rules flushed and Docker restarted"
      fi
      ;;
    4)
      log "Resetting network namespaces..."
      echo "WARNING: This will affect all containers"
      read -r -p "Continue? (yes/no): " confirm
      if [ "$confirm" = "yes" ]; then
        # Delete every network namespace
        mapfile -t namespaces < <(ip netns list | awk '{print $1}')
        for ns in "${namespaces[@]}"; do
          ip netns delete "$ns"
        done
        systemctl restart docker
        log "Network namespaces reset and Docker restarted"
      fi
      ;;
    5)
      read -r -p "Enter interface name: " interface
      read -r -p "Enter new MTU value: " mtu
      if [ -n "$interface" ] && [ -n "$mtu" ]; then
        # The MTU comes straight from the keyboard: accept digits only and
        # pass both values as single, quoted arguments.
        case "$mtu" in
          *[!0-9]*)
            log "Invalid MTU value: $mtu"
            ;;
          *)
            log "Setting MTU for $interface to $mtu"
            ip link set dev "$interface" mtu "$mtu"
            ;;
        esac
      fi
      ;;
    6)
      log "Exiting without changes"
      ;;
    *)
      log "Invalid choice"
      ;;
  esac
}
# Entry point: run every network check in order, then offer the interactive
# fix menu.
# Input: the yes/no answer is read from stdin
main() {
  local fix_choice

  log "Starting comprehensive network diagnostics..."
  check_network_infrastructure
  check_service_networks
  test_container_connectivity
  test_service_discovery
  test_load_balancing
  test_network_performance
  diagnose_network_issues

  echo
  # read -r: take the answer literally, without backslash processing.
  read -r -p "Do you want to attempt network fixes? (yes/no): " fix_choice
  if [ "$fix_choice" = "yes" ]; then
    fix_network_issues
  fi

  log "Network diagnostics completed"
}
# Run the entry point, forwarding any command-line arguments unchanged.
main "$@"
5. 存储故障排除
5.1 卷挂载问题
存储诊断脚本
#!/bin/bash
# storage-diagnostics.sh

# File that log() appends every message to.
LOG_FILE='/var/log/storage-diagnostics.log'
# Logging helper: prints "<timestamp> - <message>" to stdout and appends the
# same line to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
log() {
  # Quoted path: a log location containing spaces must remain one file name.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
# Print the host's storage basics: disk and inode usage, Docker-related
# mounts, the storage driver, and the location and size of Docker's root
# directory.
# Calls: log
check_storage_infrastructure() {
  local docker_root

  log "Checking storage infrastructure..."

  # Disk space
  log "Disk space usage:"
  df -h

  # Inode usage
  log "Inode usage:"
  df -i

  # Mount points
  log "Mount points:"
  mount | grep -E "(docker|overlay|tmpfs)"

  # Storage driver
  log "Docker storage driver:"
  docker info | grep -A 10 "Storage Driver"

  # Docker root directory, requested as a single field instead of being cut
  # out of the human-readable listing (which also stripped any spaces from
  # the path).
  docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null)
  log "Docker root directory: $docker_root"
  if [ -d "$docker_root" ]; then
    log "Docker root directory size:"
    du -sh "$docker_root"
  fi
}
# List every Docker volume with its definition, mountpoint, size and
# permissions, then report dangling volumes.
# Calls: log
check_docker_volumes() {
  local -a volumes=()
  local volume mountpoint dangling_volumes

  log "Checking Docker volumes..."

  # All volumes
  log "All Docker volumes:"
  docker volume ls

  # Per-volume details; names are read line by line, not word-split.
  mapfile -t volumes < <(docker volume ls --format '{{.Name}}')
  for volume in "${volumes[@]}"; do
    log "Volume: $volume"
    docker volume inspect "$volume"

    # Mountpoint on this host
    mountpoint=$(docker volume inspect "$volume" --format '{{.Mountpoint}}')
    if [ -d "$mountpoint" ]; then
      log " Mountpoint: $mountpoint"
      log " Size: $(du -sh "$mountpoint" | cut -f1)"
      log " Permissions: $(stat -c "%a %U:%G" "$mountpoint")"
    else
      log " ERROR: Mountpoint not found: $mountpoint"
    fi
  done

  # Dangling volumes
  log "Checking for dangling volumes..."
  dangling_volumes=$(docker volume ls --filter "dangling=true" --format '{{.Name}}')
  if [ -n "$dangling_volumes" ]; then
    log "Dangling volumes found:"
    echo "$dangling_volumes"
  else
    log "No dangling volumes found"
  fi
}
# For every service, log its configured mounts and check that each named
# volume or bind source exists.
# NOTE(review): existence is checked against this host's Docker engine and
# file system only - confirm that is sufficient for multi-node clusters.
# Calls: log
check_service_volumes() {
  local -a services=() mounts=()
  local service entry mount_source mount_target mount_type summary
  # Fields are separated by the ASCII unit separator (0x1f) and mounts by
  # newlines, so paths containing spaces or colons are parsed intact and an
  # empty source field is not collapsed away.
  local fmt='{{range .Spec.TaskTemplate.ContainerSpec.Mounts}}{{printf "%s\x1f%s\x1f%s\n" .Source .Target .Type}}{{end}}'

  log "Checking service volume mounts..."

  mapfile -t services < <(docker service ls --format '{{.Name}}')
  for service in "${services[@]}"; do
    log "Service: $service"

    # Mount configuration of the service
    mapfile -t mounts < <(docker service inspect "$service" --format "$fmt")
    if [ "${#mounts[@]}" -eq 0 ]; then
      log " No volume mounts configured"
      continue
    fi

    # One-line overview in source:target:type form
    summary=""
    for entry in "${mounts[@]}"; do
      IFS=$'\x1f' read -r mount_source mount_target mount_type <<< "$entry"
      summary+="$mount_source:$mount_target:$mount_type "
    done
    log " Volume mounts: $summary"

    # Check each mount
    for entry in "${mounts[@]}"; do
      IFS=$'\x1f' read -r mount_source mount_target mount_type <<< "$entry"
      log " Mount: $mount_source -> $mount_target ($mount_type)"

      # Does the volume / source path exist?
      if [ "$mount_type" = "volume" ]; then
        if docker volume inspect "$mount_source" > /dev/null 2>&1; then
          log " ✓ Volume $mount_source exists"
        else
          log " ✗ Volume $mount_source does not exist"
        fi
      elif [ "$mount_type" = "bind" ]; then
        if [ -e "$mount_source" ]; then
          log " ✓ Bind source $mount_source exists"
          log " Permissions: $(stat -c "%a %U:%G" "$mount_source")"
        else
          log " ✗ Bind source $mount_source does not exist"
        fi
      fi
    done
  done
}
# For every running container, check from inside the container that each
# mount is reachable and, for read-write directory mounts, writable.
# Side effect: briefly creates and removes "<mount>/.test_write" inside the
# container for the write test.
# Calls: log
check_container_mounts() {
  local -a containers=() mounts=()
  local container entry mount_source dest mount_type rw summary
  # Fields are separated by the ASCII unit separator (0x1f) and mounts by
  # newlines, so paths containing spaces or colons are parsed intact.
  local fmt='{{range .Mounts}}{{printf "%s\x1f%s\x1f%s\x1f%v\n" .Source .Destination .Type .RW}}{{end}}'

  log "Checking container mount status..."

  mapfile -t containers < <(docker ps --format '{{.Names}}')
  for container in "${containers[@]}"; do
    log "Container: $container"

    # Mount information of the container
    mapfile -t mounts < <(docker inspect "$container" --format "$fmt")
    if [ "${#mounts[@]}" -eq 0 ]; then
      log " No mounts found"
      continue
    fi

    # One-line overview in source:destination:type:rw form
    summary=""
    for entry in "${mounts[@]}"; do
      IFS=$'\x1f' read -r mount_source dest mount_type rw <<< "$entry"
      summary+="$mount_source:$dest:$mount_type:$rw "
    done
    log " Mounts: $summary"

    for entry in "${mounts[@]}"; do
      IFS=$'\x1f' read -r mount_source dest mount_type rw <<< "$entry"
      log " Testing mount: $dest ($mount_type, $rw)"

      # A mount can be a single file as well as a directory, so test for
      # existence rather than for a directory.
      if docker exec "$container" test -e "$dest" 2>/dev/null; then
        log " ✓ Mount point accessible"
        # Write test: only for read-write mounts that are directories.
        if [ "$rw" = "true" ] && docker exec "$container" test -d "$dest" 2>/dev/null; then
          if docker exec "$container" touch "$dest/.test_write" 2>/dev/null; then
            docker exec "$container" rm "$dest/.test_write" 2>/dev/null
            log " ✓ Write permission OK"
          else
            log " ✗ Write permission failed"
          fi
        fi
      else
        log " ✗ Mount point not accessible"
      fi
    done
  done
}
# Report disk I/O statistics, Docker's disk usage, a sequential write test
# and (for the devicemapper driver) the storage pool status.
# Arguments: $1 - size of the write test in MiB (optional, default 100)
# Calls:     log
check_storage_performance() {
  local size_mb=${1:-100}
  local test_file dd_output storage_driver

  log "Checking storage performance..."

  # Disk I/O
  log "Disk I/O statistics:"
  if command -v iostat > /dev/null; then
    iostat -x 1 3
  else
    log "iostat not available"
  fi

  # Disk usage
  log "Disk usage by Docker:"
  docker system df

  # Write test. The file is created with mktemp (no predictable name), the
  # result is judged by dd's exit status instead of by finding "MB/s" in its
  # output, and conv=fdatasync makes dd flush the data before it reports the
  # rate so the figure is not just the speed of the page cache.
  # NOTE(review): the test writes to the temp directory, which may be on a
  # different device than Docker's data - confirm this is the intended target.
  log "Testing disk write performance..."
  if test_file=$(mktemp "${TMPDIR:-/tmp}/disk_test_XXXXXX"); then
    if dd_output=$(dd if=/dev/zero of="$test_file" bs=1M count="$size_mb" conv=fdatasync 2>&1); then
      printf '%s\n' "$dd_output" | tail -n 1
      log "Disk write test completed"
    else
      log "Disk write test failed"
    fi
    rm -f -- "$test_file"
  else
    log "Disk write test failed"
  fi

  # Storage pool usage (devicemapper only)
  storage_driver=$(docker info --format '{{.Driver}}' 2>/dev/null)
  if [ "$storage_driver" = "devicemapper" ]; then
    log "Devicemapper storage pool status:"
    docker info | grep -A 10 "Pool"
  fi
}
# Interactive menu that frees disk space by pruning unused Docker objects.
# Input: the menu choice is read from stdin
# Calls: log
cleanup_storage() {
  # Local, so the answer does not overwrite a caller's variable of the same
  # name.
  local choice

  log "Storage cleanup options:"
  echo "Available cleanup options:"
  echo "1. Remove unused containers"
  echo "2. Remove unused images"
  echo "3. Remove unused volumes"
  echo "4. Remove unused networks"
  echo "5. System prune (all unused objects)"
  echo "6. Exit without cleanup"

  # read -r: take the answer literally, without backslash processing.
  read -r -p "Choose a cleanup option (1-6): " choice
  case "$choice" in
    1)
      log "Removing unused containers..."
      docker container prune -f
      ;;
    2)
      log "Removing unused images..."
      docker image prune -f
      ;;
    3)
      log "Removing unused volumes..."
      docker volume prune -f
      ;;
    4)
      log "Removing unused networks..."
      docker network prune -f
      ;;
    5)
      log "Performing system prune..."
      docker system prune -f
      ;;
    6)
      log "Exiting without cleanup"
      ;;
    *)
      log "Invalid choice"
      ;;
  esac
}
# Interactive menu of storage repair actions. The destructive options ask
# for confirmation first.
# Input: menu choice, names and confirmations are read from stdin
# Calls: log, cleanup_storage
fix_storage_issues() {
  local choice volume_name mountpoint confirm docker_root

  log "Storage issue fix options:"
  echo "Available fix options:"
  echo "1. Fix volume permissions"
  echo "2. Recreate problematic volumes"
  echo "3. Restart Docker daemon"
  echo "4. Clean up storage space"
  echo "5. Reset Docker storage"
  echo "6. Exit without changes"

  read -r -p "Choose a fix option (1-6): " choice
  case "$choice" in
    1)
      read -r -p "Enter volume name: " volume_name
      if [ -n "$volume_name" ]; then
        mountpoint=$(docker volume inspect "$volume_name" --format '{{.Mountpoint}}' 2>/dev/null)
        if [ -n "$mountpoint" ] && [ -d "$mountpoint" ]; then
          log "Fixing permissions for volume: $volume_name"
          sudo chown -R root:root "$mountpoint"
          sudo chmod -R 755 "$mountpoint"
          log "Permissions fixed"
        else
          log "Volume not found or mountpoint not accessible"
        fi
      fi
      ;;
    2)
      read -r -p "Enter volume name to recreate: " volume_name
      if [ -n "$volume_name" ]; then
        echo "WARNING: This will delete all data in the volume"
        read -r -p "Continue? (yes/no): " confirm
        if [ "$confirm" = "yes" ]; then
          log "Recreating volume: $volume_name"
          # Create the new volume only after the old one is really gone, and
          # report success only when both steps worked.
          if ! docker volume rm "$volume_name"; then
            log "Failed to remove volume: $volume_name"
          elif ! docker volume create "$volume_name"; then
            log "Failed to create volume: $volume_name"
          else
            log "Volume recreated"
          fi
        fi
      fi
      ;;
    3)
      log "Restarting Docker daemon..."
      systemctl restart docker
      sleep 10
      log "Docker daemon restarted"
      ;;
    4)
      cleanup_storage
      ;;
    5)
      echo "WARNING: This will remove all Docker data"
      read -r -p "Continue? (yes/no): " confirm
      if [ "$confirm" = "yes" ]; then
        log "Resetting Docker storage..."
        # Ask the daemon for its data directory while it is still running;
        # fall back to the default location when it does not answer.
        docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null)
        docker_root=${docker_root:-/var/lib/docker}
        if [ "$docker_root" = "/" ]; then
          log "Refusing to reset Docker storage: root directory is /"
        else
          systemctl stop docker
          # ${var:?} aborts the expansion if the path were ever empty, so the
          # glob can never turn into "/*".
          rm -rf -- "${docker_root:?}"/*
          systemctl start docker
          log "Docker storage reset completed"
        fi
      fi
      ;;
    6)
      log "Exiting without changes"
      ;;
    *)
      log "Invalid choice"
      ;;
  esac
}
# Entry point: run every storage check in order, then offer the interactive
# fix menu.
# Input: the yes/no answer is read from stdin
main() {
  local fix_choice

  log "Starting storage diagnostics..."
  check_storage_infrastructure
  check_docker_volumes
  check_service_volumes
  check_container_mounts
  check_storage_performance

  echo
  # read -r: take the answer literally, without backslash processing.
  read -r -p "Do you want to fix storage issues? (yes/no): " fix_choice
  if [ "$fix_choice" = "yes" ]; then
    fix_storage_issues
  fi

  log "Storage diagnostics completed"
}
# Run the entry point, forwarding any command-line arguments unchanged.
main "$@"
6. 性能问题诊断
6.1 性能监控和分析
性能诊断脚本
#!/bin/bash
# performance-diagnostics.sh
#
# Shared settings for the performance collectors below.
# LOG_FILE   - file that log() appends every message to
# REPORT_DIR - directory that receives the generated reports
# TIMESTAMP  - run identifier embedded in the report file names
LOG_FILE="/var/log/performance-diagnostics.log"
REPORT_DIR="/var/log/performance-reports"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Create the report directory. Every report is written into it, so stop here
# with a clear message instead of failing on each individual write.
if ! mkdir -p -- "$REPORT_DIR"; then
  echo "ERROR: cannot create report directory: $REPORT_DIR" >&2
  exit 1
fi
# Logging helper: prints "<timestamp> - <message>" to stdout and appends the
# same line to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
log() {
  # Quoted path: a log location containing spaces must remain one file name.
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
# Write a host performance report (CPU, memory, load, disk, disk I/O,
# network counters, top processes by CPU) to
# "$REPORT_DIR/system-metrics-$TIMESTAMP.txt", replacing any existing file.
# Globals: REPORT_DIR, TIMESTAMP (read)
# Calls:   log
collect_system_metrics() {
  local report_file="$REPORT_DIR/system-metrics-$TIMESTAMP.txt"

  log "Collecting system performance metrics..."
  {
    echo "=== System Performance Report ==="
    echo "Timestamp: $(date)"
    echo "Hostname: $(hostname)"
    echo

    echo "=== CPU Information ==="
    lscpu
    echo

    echo "=== CPU Usage ==="
    top -bn1 | head -n 20
    echo

    echo "=== Memory Usage ==="
    free -h
    echo
    head -n 20 /proc/meminfo
    echo

    echo "=== Load Average ==="
    uptime
    echo
    cat /proc/loadavg
    echo

    echo "=== Disk Usage ==="
    df -h
    echo

    echo "=== Disk I/O ==="
    if command -v iostat > /dev/null; then
      iostat -x 1 3
    else
      echo "iostat not available"
    fi
    echo

    echo "=== Network Statistics ==="
    cat /proc/net/dev
    echo

    echo "=== Process List ==="
    ps aux --sort=-%cpu | head -n 20
    echo
  } > "$report_file" # quoted so a report directory with spaces still works

  log "System metrics saved to: $report_file"
}
# Collect Docker-level metrics (disk usage, recent events, container stats,
# service tasks, images, volume sizes) into
# $REPORT_DIR/docker-metrics-$TIMESTAMP.txt.
#
# Globals: REPORT_DIR, TIMESTAMP (read)
# Outputs: writes the report file; logs progress via log()
collect_docker_metrics() {
  log "Collecting Docker performance metrics..."
  local report_file="$REPORT_DIR/docker-metrics-$TIMESTAMP.txt"
  local service volume mountpoint size
  local -a services=() volumes=()
  {
    echo "=== Docker Performance Report ==="
    echo "Timestamp: $(date)"
    echo

    echo "=== Docker System Info ==="
    docker system df
    echo

    echo "=== Docker Events (last 1 hour) ==="
    docker events --since="1h" --until="now" | tail -20
    echo

    echo "=== Container Statistics ==="
    # timeout guards against `docker stats` hanging on an unresponsive daemon.
    timeout 10 docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}"
    echo

    echo "=== Service Statistics ==="
    mapfile -t services < <(docker service ls --format '{{.Name}}')
    for service in "${services[@]}"; do
      [ -n "$service" ] || continue
      echo "--- Service: $service ---"
      docker service ps "$service"
      echo
    done

    echo "=== Image Statistics ==="
    docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}"
    echo

    echo "=== Volume Statistics ==="
    mapfile -t volumes < <(docker volume ls --format '{{.Name}}')
    for volume in "${volumes[@]}"; do
      [ -n "$volume" ] || continue
      mountpoint=$(docker volume inspect "$volume" --format '{{.Mountpoint}}')
      size=""
      if [ -n "$mountpoint" ]; then
        # The pipeline's status is cut's, so a failing du cannot be detected
        # with `||`; an empty result is mapped to N/A below instead.
        size=$(du -sh -- "$mountpoint" 2>/dev/null | cut -f1)
      fi
      echo "Volume: $volume, Size: ${size:-N/A}"
    done
    echo
  } > "$report_file"
  log "Docker metrics saved to: $report_file"
}
# Write a per-container analysis (image/state, resource limits, live stats,
# processes, listening sockets, disk usage) for every running container to
# $REPORT_DIR/container-analysis-$TIMESTAMP.txt.
#
# Globals: REPORT_DIR, TIMESTAMP (read)
# Outputs: writes the report file; logs progress via log()
analyze_container_performance() {
  log "Analyzing container performance..."
  local report_file="$REPORT_DIR/container-analysis-$TIMESTAMP.txt"
  local container cpu_limit memory_limit output
  local -a containers=()
  mapfile -t containers < <(docker ps --format '{{.Names}}')
  {
    echo "=== Container Performance Analysis ==="
    echo "Timestamp: $(date)"
    echo
    for container in "${containers[@]}"; do
      [ -n "$container" ] || continue
      echo "--- Container: $container ---"

      echo "Container Info:"
      docker inspect "$container" --format '{{.Config.Image}} {{.State.Status}} {{.State.StartedAt}}'
      echo

      echo "Resource Limits:"
      cpu_limit=$(docker inspect "$container" --format '{{.HostConfig.CpuQuota}}')
      memory_limit=$(docker inspect "$container" --format '{{.HostConfig.Memory}}')
      echo " CPU Quota: $cpu_limit"
      echo " Memory Limit: $memory_limit"
      echo

      echo "Current Stats:"
      timeout 5 docker stats --no-stream "$container"
      echo

      # The exec output is captured first so that a failing `docker exec`
      # (no such binary in the image, container gone) is actually detected;
      # piping straight into head would hide the failure.
      echo "Processes:"
      if output=$(docker exec "$container" ps aux 2>/dev/null); then
        printf '%s\n' "$output" | head -10
      else
        echo "Cannot access container processes"
      fi
      echo

      echo "Network Connections:"
      if output=$(docker exec "$container" netstat -tuln 2>/dev/null); then
        printf '%s\n' "$output" | head -10
      else
        echo "Cannot access network info"
      fi
      echo

      echo "Disk Usage:"
      if ! docker exec "$container" df -h 2>/dev/null; then
        echo "Cannot access disk info"
      fi
      echo
      echo "======================================"
      echo
    done
  } > "$report_file"
  log "Container analysis saved to: $report_file"
}
# Write a per-service analysis (summary row, task distribution, resource
# configuration, last update status, recent error log lines) to
# $REPORT_DIR/service-analysis-$TIMESTAMP.txt.
#
# Globals: REPORT_DIR, TIMESTAMP (read)
# Outputs: writes the report file; logs progress via log()
analyze_service_performance() {
  log "Analyzing service performance..."
  local report_file="$REPORT_DIR/service-analysis-$TIMESTAMP.txt"
  local service cpu_limit memory_limit cpu_reservation memory_reservation
  local update_status
  local -a services=()
  mapfile -t services < <(docker service ls --format '{{.Name}}')
  {
    echo "=== Service Performance Analysis ==="
    echo "Timestamp: $(date)"
    echo
    for service in "${services[@]}"; do
      [ -n "$service" ] || continue
      echo "--- Service: $service ---"

      echo "Service Info:"
      docker service ls --filter "name=$service" --format "table {{.Name}}\t{{.Mode}}\t{{.Replicas}}\t{{.Image}}"
      echo

      echo "Task Distribution:"
      docker service ps "$service" --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Error}}"
      echo

      echo "Resource Configuration:"
      cpu_limit=$(docker service inspect "$service" --format '{{.Spec.TaskTemplate.Resources.Limits.NanoCPUs}}')
      memory_limit=$(docker service inspect "$service" --format '{{.Spec.TaskTemplate.Resources.Limits.MemoryBytes}}')
      cpu_reservation=$(docker service inspect "$service" --format '{{.Spec.TaskTemplate.Resources.Reservations.NanoCPUs}}')
      memory_reservation=$(docker service inspect "$service" --format '{{.Spec.TaskTemplate.Resources.Reservations.MemoryBytes}}')
      echo " CPU Limit: $cpu_limit"
      echo " Memory Limit: $memory_limit"
      echo " CPU Reservation: $cpu_reservation"
      echo " Memory Reservation: $memory_reservation"
      echo

      # A service carries a single UpdateStatus object (state, start time,
      # message) describing its most recent update; there is no history list.
      # `with` keeps the template valid for services that were never updated.
      echo "Update History:"
      update_status=$(docker service inspect "$service" \
        --format '{{with .UpdateStatus}}{{.State}} {{.StartedAt}} {{.Message}}{{end}}' 2>/dev/null)
      echo "${update_status:-No update recorded}"
      echo

      echo "Recent Errors in Logs:"
      docker service logs --tail 20 "$service" 2>&1 | grep -i error | tail -5
      echo
      echo "======================================"
      echo
    done
  } > "$report_file"
  log "Service analysis saved to: $report_file"
}
# Dump host network state and a summary of every Docker network (with extra
# detail for overlay networks) to $REPORT_DIR/network-analysis-$TIMESTAMP.txt.
analyze_network_performance() {
  log "Analyzing network performance..."
  local out_path="$REPORT_DIR/network-analysis-$TIMESTAMP.txt"
  local net is_encrypted attached
  {
    printf '%s\n' "=== Network Performance Analysis ==="
    printf '%s\n' "Timestamp: $(date)"
    printf '\n'

    printf '%s\n' "=== Network Interfaces ==="
    ip addr show
    printf '\n'

    printf '%s\n' "=== Network Statistics ==="
    cat /proc/net/dev
    printf '\n'

    printf '%s\n' "=== Network Connections ==="
    ss -tuln | head -20
    printf '\n'

    printf '%s\n' "=== Docker Networks ==="
    for net in $(docker network ls --format '{{.Name}}'); do
      printf '%s\n' "--- Network: $net ---"
      docker network inspect $net --format '{{.Driver}} {{.Scope}} {{.IPAM.Config}}'
      printf '\n'
    done

    printf '%s\n' "=== Overlay Network Performance ==="
    for net in $(docker network ls --filter "driver=overlay" --format '{{.Name}}'); do
      printf '%s\n' "Network: $net"
      is_encrypted=$(docker network inspect $net --format '{{.Options.encrypted}}')
      printf '%s\n' " Encrypted: $is_encrypted"
      # Names of the containers currently attached to this network.
      attached=$(docker network inspect $net --format '{{range .Containers}}{{.Name}} {{end}}')
      printf '%s\n' " Connected containers: $attached"
      printf '\n'
    done
  } > $out_path
  log "Network analysis saved to: $out_path"
}
# Build the HTML summary report
# ($REPORT_DIR/performance-summary-$TIMESTAMP.html).
#
# Globals: REPORT_DIR, TIMESTAMP (read)
# Outputs: writes the HTML file; logs its path via log()
#
# Colour thresholds: CPU >80 critical / >60 warning, memory >90 / >80,
# disk (root filesystem) >90 / >80.
generate_performance_report() {
  log "Generating comprehensive performance report..."
  local summary_report="$REPORT_DIR/performance-summary-$TIMESTAMP.html"
  # awk program: succeed when v > t. awk is used instead of bc because bc is
  # often not installed; an empty or non-numeric v is treated as 0.
  local exceeds='BEGIN { exit !(v + 0 > t + 0) }'
  local cpu_usage mem_usage disk_usage
  local cpu_class="good" mem_class="good" disk_class="good"
  local service replicas running desired status_class status_text

  # Second field of top's "Cpu(s)" line (user CPU time), without any "%us".
  cpu_usage=$(top -bn1 | awk '/Cpu\(s\)/ { sub(/%.*/, "", $2); print $2; exit }')
  if awk -v v="$cpu_usage" -v t=80 "$exceeds"; then
    cpu_class="critical"
  elif awk -v v="$cpu_usage" -v t=60 "$exceeds"; then
    cpu_class="warning"
  fi

  mem_usage=$(free | awk 'NR==2{printf "%.1f", $3*100/$2 }')
  if awk -v v="$mem_usage" -v t=90 "$exceeds"; then
    mem_class="critical"
  elif awk -v v="$mem_usage" -v t=80 "$exceeds"; then
    mem_class="warning"
  fi

  # -P keeps each filesystem on one line even for long device names.
  disk_usage=$(df -P / | awk 'NR==2 { sub(/%/, "", $5); print $5 }')
  if awk -v v="$disk_usage" -v t=90 "$exceeds"; then
    disk_class="critical"
  elif awk -v v="$disk_usage" -v t=80 "$exceeds"; then
    disk_class="warning"
  fi

  {
    # Static part of the page; the quoted delimiter keeps the CSS literal.
    cat <<'EOF'
<!DOCTYPE html>
<html>
<head>
  <title>Docker Swarm Performance Report</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 20px; }
    .header { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
    .section { margin: 20px 0; border: 1px solid #ddd; padding: 10px; border-radius: 5px; }
    .metric { display: inline-block; margin: 10px; padding: 10px; border: 1px solid #ccc; border-radius: 5px; }
    .good { background-color: #e8f5e8; }
    .warning { background-color: #fff3e0; }
    .critical { background-color: #ffebee; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
    th { background-color: #f2f2f2; }
  </style>
</head>
<body>
  <div class="header">
    <h1>Docker Swarm Performance Report</h1>
EOF
    # Dynamic values are emitted outside the quoted heredoc so that they are
    # actually expanded instead of appearing as literal "$(date)".
    printf '    <p>Generated: %s</p>\n' "$(date)"
    printf '    <p>Hostname: %s</p>\n' "$(hostname)"
    echo "  </div>"

    echo " <div class='section'>"
    echo " <h2>System Overview</h2>"
    echo " <div class='metric $cpu_class'>CPU Usage: ${cpu_usage}%</div>"
    echo " <div class='metric $mem_class'>Memory Usage: ${mem_usage}%</div>"
    echo " <div class='metric $disk_class'>Disk Usage: ${disk_usage}%</div>"
    echo " </div>"

    echo " <div class='section'>"
    echo " <h2>Service Status</h2>"
    echo " <table>"
    echo " <tr><th>Service</th><th>Replicas</th><th>Status</th></tr>"
    # One listing gives name and replica count together, which avoids the
    # prefix matching of `--filter name=` returning several services.
    while read -r service replicas; do
      [ -n "$service" ] || continue
      running=${replicas%%/*}
      desired=${replicas#*/}
      desired=${desired%% *} # drop a trailing "(max N per node)" note
      if [ "$running" = "$desired" ]; then
        status_class="good"
        status_text="Healthy"
      else
        status_class="warning"
        status_text="Degraded"
      fi
      echo " <tr class='$status_class'><td>$service</td><td>$replicas</td><td>$status_text</td></tr>"
    done < <(docker service ls --format '{{.Name}} {{.Replicas}}')
    echo " </table>"
    echo " </div>"

    echo " <div class='section'>"
    echo " <h2>Detailed Reports</h2>"
    echo " <ul>"
    echo " <li><a href='system-metrics-$TIMESTAMP.txt'>System Metrics</a></li>"
    echo " <li><a href='docker-metrics-$TIMESTAMP.txt'>Docker Metrics</a></li>"
    echo " <li><a href='container-analysis-$TIMESTAMP.txt'>Container Analysis</a></li>"
    echo " <li><a href='service-analysis-$TIMESTAMP.txt'>Service Analysis</a></li>"
    echo " <li><a href='network-analysis-$TIMESTAMP.txt'>Network Analysis</a></li>"
    echo " </ul>"
    echo " </div>"
    echo "</body></html>"
  } > "$summary_report"
  log "Performance summary report generated: $summary_report"
}
# Write threshold-based tuning advice for CPU, memory, disk, services and
# overlay networks to
# $REPORT_DIR/optimization-recommendations-$TIMESTAMP.txt.
#
# Globals: REPORT_DIR, TIMESTAMP (read)
# Outputs: writes the recommendations file; logs progress via log()
generate_optimization_recommendations() {
  log "Generating optimization recommendations..."
  local recommendations_file="$REPORT_DIR/optimization-recommendations-$TIMESTAMP.txt"
  # awk program: succeed when v > t. awk replaces bc, which is frequently not
  # installed; an empty or non-numeric v is treated as 0 instead of raising a
  # syntax error.
  local exceeds='BEGIN { exit !(v + 0 > t + 0) }'
  local cpu_usage mem_usage disk_usage
  local service replicas running desired network encrypted
  local -a overlay_networks=()
  {
    echo "=== Performance Optimization Recommendations ==="
    echo "Generated: $(date)"
    echo

    # CPU: second field of top's "Cpu(s)" line (user CPU time).
    cpu_usage=$(top -bn1 | awk '/Cpu\(s\)/ { sub(/%.*/, "", $2); print $2; exit }')
    echo "=== CPU Optimization ==="
    echo "Current CPU usage: ${cpu_usage}%"
    if awk -v v="$cpu_usage" -v t=80 "$exceeds"; then
      echo "CRITICAL: High CPU usage detected"
      echo "Recommendations:"
      echo " - Scale out services to distribute load"
      echo " - Optimize application code for better CPU efficiency"
      echo " - Consider adding more nodes to the cluster"
      echo " - Review and optimize resource limits"
    elif awk -v v="$cpu_usage" -v t=60 "$exceeds"; then
      echo "WARNING: Moderate CPU usage"
      echo "Recommendations:"
      echo " - Monitor trends and prepare for scaling"
      echo " - Review service resource requests"
    else
      echo "OK: CPU usage is within acceptable range"
    fi
    echo

    # Memory
    mem_usage=$(free | awk 'NR==2{printf "%.1f", $3*100/$2 }')
    echo "=== Memory Optimization ==="
    echo "Current memory usage: ${mem_usage}%"
    if awk -v v="$mem_usage" -v t=90 "$exceeds"; then
      echo "CRITICAL: High memory usage detected"
      echo "Recommendations:"
      echo " - Immediately scale out memory-intensive services"
      echo " - Review and optimize memory limits"
      echo " - Check for memory leaks in applications"
      echo " - Consider adding more memory or nodes"
    elif awk -v v="$mem_usage" -v t=80 "$exceeds"; then
      echo "WARNING: High memory usage"
      echo "Recommendations:"
      echo " - Monitor memory trends closely"
      echo " - Optimize application memory usage"
      echo " - Review service memory limits"
    else
      echo "OK: Memory usage is within acceptable range"
    fi
    echo

    # Disk (root filesystem); -P keeps each filesystem on a single line.
    disk_usage=$(df -P / | awk 'NR==2 { sub(/%/, "", $5); print $5 }')
    echo "=== Disk Optimization ==="
    echo "Current disk usage: ${disk_usage}%"
    if awk -v v="$disk_usage" -v t=90 "$exceeds"; then
      echo "CRITICAL: High disk usage detected"
      echo "Recommendations:"
      echo " - Immediately clean up unused Docker objects"
      echo " - Implement log rotation"
      echo " - Move data to external storage"
      echo " - Add more disk space"
    elif awk -v v="$disk_usage" -v t=80 "$exceeds"; then
      echo "WARNING: High disk usage"
      echo "Recommendations:"
      echo " - Schedule regular cleanup tasks"
      echo " - Monitor disk usage trends"
      echo " - Optimize log retention policies"
    else
      echo "OK: Disk usage is within acceptable range"
    fi
    echo

    # Services: one listing yields name and replica count together, which
    # avoids the prefix matching of `--filter name=`.
    echo "=== Service Optimization ==="
    while read -r service replicas; do
      [ -n "$service" ] || continue
      running=${replicas%%/*}
      desired=${replicas#*/}
      desired=${desired%% *} # drop a trailing "(max N per node)" note
      echo "Service: $service"
      echo " Current replicas: $replicas"
      if [ "$running" != "$desired" ]; then
        echo " WARNING: Service is not at desired replica count"
        echo " Recommendations:"
        echo " - Check service logs for errors"
        echo " - Verify resource availability"
        echo " - Check placement constraints"
      else
        echo " OK: Service is healthy"
      fi
      echo
    done < <(docker service ls --format '{{.Name}} {{.Replicas}}')

    # Networks
    echo "=== Network Optimization ==="
    mapfile -t overlay_networks < <(docker network ls --filter "driver=overlay" --format '{{.Name}}')
    echo "Overlay networks: ${#overlay_networks[@]}"
    if (( ${#overlay_networks[@]} > 10 )); then
      echo "WARNING: Many overlay networks detected"
      echo "Recommendations:"
      echo " - Consolidate networks where possible"
      echo " - Remove unused networks"
    fi
    # Flag networks created with the "encrypted" driver option.
    for network in "${overlay_networks[@]}"; do
      [ -n "$network" ] || continue
      encrypted=$(docker network inspect "$network" --format '{{.Options.encrypted}}')
      if [ "$encrypted" = "true" ]; then
        echo "Network $network is encrypted (may impact performance)"
        echo " Consider: Disable encryption if not required for security"
      fi
    done
    echo
  } > "$recommendations_file"
  log "Optimization recommendations saved to: $recommendations_file"
}
# Entry point of the performance diagnostics script: run every collector and
# analyser in order, then tell the operator where the reports are.
main() {
  local stage
  log "Starting comprehensive performance diagnostics..."
  for stage in \
    collect_system_metrics \
    collect_docker_metrics \
    analyze_container_performance \
    analyze_service_performance \
    analyze_network_performance \
    generate_performance_report \
    generate_optimization_recommendations; do
    "$stage"
  done
  log "Performance diagnostics completed"
  log "Reports saved to: $REPORT_DIR"
  printf '\n%s\n%s\n%s\n' \
    "Performance diagnostics completed!" \
    "Reports available in: $REPORT_DIR" \
    "Summary report: $REPORT_DIR/performance-summary-$TIMESTAMP.html"
}

# Run the diagnostics.
main
7. 实践练习
### 练习 1:集群故障模拟与恢复
#### 目标
模拟各种集群故障场景,练习故障诊断和恢复技能。
#### 步骤
1. **准备测试环境**
```bash
# 创建测试服务
docker service create --name web-test --replicas 3 nginx
docker service create --name db-test --replicas 1 postgres:13
# 验证服务状态
docker service ls
docker service ps web-test
```
2. **模拟节点故障**
```bash
# 在工作节点上停止 Docker 服务
sudo systemctl stop docker
# 观察集群反应
docker node ls
docker service ps web-test
# 使用故障诊断脚本
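./cluster-health-check.sh
```
3. **模拟网络分区**
```bash
# 使用 iptables 模拟网络分区(将 <manager-ip> 替换为管理节点的 IP 地址)
sudo iptables -A INPUT -s <manager-ip> -j DROP
sudo iptables -A OUTPUT -d <manager-ip> -j DROP
# 观察集群状态
docker node ls
# 恢复网络
sudo iptables -D INPUT -s <manager-ip> -j DROP
sudo iptables -D OUTPUT -d <manager-ip> -j DROP
```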
4. **模拟服务故障**
```bash
# 更新服务到错误的镜像
docker service update --image nginx:nonexistent web-test
# 使用服务诊断脚本
./service-diagnostics.sh web-test
# 回滚服务
docker service rollback web-test
```
### 练习 2:网络故障排除
#### 目标
诊断和解决网络连通性问题。
#### 步骤
1. **创建网络测试环境**
```bash
# 创建自定义网络
docker network create --driver overlay test-network
# 部署测试服务
docker service create --name client-test --network test-network alpine sleep 3600
docker service create --name server-test --network test-network nginx
```
2. **测试网络连通性**
```bash
# 运行网络诊断脚本
./network-diagnostics.sh
# 手动测试连通性
docker exec $(docker ps --filter "name=client-test" --format "{{.ID}}") ping server-test
```
3. **模拟网络问题**
```bash
# 删除网络(模拟网络配置错误)
docker network rm test-network
# 观察服务状态
docker service ps client-test
docker service ps server-test
# 重新创建网络并更新服务
docker network create --driver overlay test-network
docker service update --network-add test-network client-test
docker service update --network-add test-network server-test
```
### 练习 3:性能问题诊断
#### 目标
识别和解决性能瓶颈。
#### 步骤
1. **创建高负载场景**
```bash
# 部署 CPU 密集型服务
docker service create --name cpu-stress --replicas 5 \
--limit-cpu 0.5 --reserve-cpu 0.2 \
progrium/stress --cpu 2 --timeout 300s
# 部署内存密集型服务
docker service create --name mem-stress --replicas 2 \
--limit-memory 512m --reserve-memory 256m \
progrium/stress --vm 1 --vm-bytes 400m --timeout 300s
```
2. **运行性能诊断**
```bash
# 执行性能诊断脚本
./performance-diagnostics.sh
# 监控资源使用
watch -n 2 'docker stats --no-stream'
```
3. **分析和优化**
```bash
# 查看性能报告
ls /var/log/performance-reports/
# 根据建议进行优化
docker service scale cpu-stress=2 # 减少副本数
docker service update --limit-cpu 0.3 cpu-stress # 降低 CPU 限制
```
8. 本章总结
关键要点
故障诊断方法论
- 系统化的故障诊断流程
- 从症状到根因的分析方法
- 预防性维护的重要性
集群故障处理
- 节点故障的识别和恢复
- 管理节点选举问题
- 证书和认证问题解决
服务故障排除
- 服务启动失败的常见原因
- 健康检查配置和调试
- 资源约束和调度问题
网络故障诊断
- 网络连通性测试
- 服务发现问题
- 负载均衡验证
存储故障处理
- 卷挂载问题诊断
- 存储空间管理
- 权限和访问问题
性能问题分析
- 系统性能监控
- 资源使用分析
- 性能优化建议
最佳实践
预防性维护
- 定期运行健康检查脚本
- 监控关键性能指标
- 及时更新和打补丁
故障响应
- 建立标准化的故障响应流程
- 维护详细的故障处理文档
- 定期进行故障演练
监控和告警
- 实施全面的监控策略
- 设置合理的告警阈值
- 自动化常见问题的处理
文档和知识管理
- 记录所有故障和解决方案
- 建立知识库和最佳实践
- 定期回顾和更新流程
工具和自动化
- 开发和维护诊断脚本
- 自动化重复性任务
- 集成监控和告警系统
下一步学习
在掌握了故障排除和调试技巧后,建议继续学习:
高级监控和可观测性
- 分布式追踪
- 日志聚合和分析
- 自定义指标收集
自动化运维
- Infrastructure as Code
- CI/CD 集成
- 自动化部署和回滚
灾难恢复
- 备份和恢复策略
- 多区域部署
- 业务连续性规划
通过本章的学习,您应该能够有效地诊断和解决 Docker Swarm 集群中的各种问题,确保集群的稳定运行和最佳性能。
2. 集群故障排除
2.1 节点故障处理
节点离线处理
#!/bin/bash
# node-recovery.sh
# The node to recover is taken from the first command-line argument.
NODE_NAME=$1
LOG_FILE='/var/log/node-recovery.log'

# log MESSAGE
# Print MESSAGE prefixed with the current time and append the same line to
# $LOG_FILE.
log() {
  local line
  line="$(date '+%Y-%m-%d %H:%M:%S') - $1"
  printf '%s\n' "$line" | tee -a "$LOG_FILE"
}
# check_node_status NODE
# Print the status and availability of the swarm node whose hostname is
# exactly NODE.
#
# Arguments: $1 - node hostname
# Outputs:   "Node Status: ..." and "Node Availability: ..." on stdout
# Returns:   0 only when the node exists and its status is Ready;
#            1 when it is missing or in any other state (Down, Unknown, ...)
check_node_status() {
  local node=$1
  local row status="" availability=""
  log "Checking status of node: $node"
  # `--filter name=` is a prefix match (node1 also matches node10), so the
  # hostname is compared exactly here. If the same hostname is listed more
  # than once (e.g. a stale entry after a rejoin), a Ready row wins.
  row=$(docker node ls --format '{{.Hostname}} {{.Status}} {{.Availability}}' \
    | awk -v n="$node" '
        $1 == n && (!found || $2 == "Ready") { s = $2; a = $3; found = 1 }
        END { if (found) print s, a }')
  read -r status availability <<< "$row"
  echo "Node Status: $status"
  echo "Node Availability: $availability"
  if [ -z "$status" ]; then
    log "Node $node not found in cluster"
    return 1
  fi
  [ "$status" = "Ready" ]
}
# reconnect_node NODE
# Try to bring NODE back by restarting its Docker daemon over SSH.
# Returns 0 when the node reports healthy afterwards, 1 otherwise.
reconnect_node() {
  local node=$1
  log "Attempting to reconnect node: $node"

  # Step 1: the host must answer ping.
  if ! ping -c 3 $node > /dev/null 2>&1; then
    log "Node $node is not reachable via ping"
    return 1
  fi
  log "Node $node is reachable via ping"

  # Step 2: restart Docker remotely.
  if ! ssh $node "sudo systemctl restart docker" 2>/dev/null; then
    log "Failed to restart Docker on node $node"
    return 1
  fi
  log "Docker service restarted on node $node"
  sleep 10

  # Step 3: confirm the node is back in the cluster.
  if ! check_node_status $node; then
    return 1
  fi
  log "Node $node successfully reconnected"
  return 0
}
# force_remove_node NODE
# Drain NODE, push any tasks still scheduled on it to other nodes, then
# remove it from the swarm.
#
# Arguments: $1 - node hostname or ID
# Returns:   0 when `docker node rm --force` succeeds, 1 otherwise
force_remove_node() {
  local node=$1
  local service
  local -a tasks=() services=()
  log "Force removing node: $node"

  # Stop scheduling onto the node and let the orchestrator move its tasks.
  docker node update --availability drain "$node"
  log "Waiting for tasks to migrate from node $node..."
  sleep 30

  mapfile -t tasks < <(docker node ps "$node" --filter "desired-state=running" --format '{{.Name}}')
  if (( ${#tasks[@]} > 0 )); then
    log "Warning: ${#tasks[@]} tasks still running on node $node"
    # Task names are "<service>.<slot>". `service update --force` restarts
    # the whole service, so each affected service is updated once instead of
    # once per task.
    mapfile -t services < <(printf '%s\n' "${tasks[@]%.*}" | sort -u)
    for service in "${services[@]}"; do
      [ -n "$service" ] || continue
      log "Force rescheduling service: $service"
      docker service update --force "$service"
    done
    sleep 10
  fi

  if docker node rm --force "$node"; then
    log "Node $node successfully removed from cluster"
    return 0
  fi
  log "Failed to remove node $node"
  return 1
}
# rejoin_node NODE
# Join NODE to the swarm again as a worker by running the join command on it
# over SSH. Must be run on a manager.
#
# Arguments: $1 - node hostname (must be reachable via ssh)
# Returns:   0 when the node joined and reports healthy, 1 otherwise
rejoin_node() {
  local node=$1
  local worker_token manager_ip
  log "Attempting to rejoin node: $node"

  worker_token=$(docker swarm join-token worker -q)
  manager_ip=$(docker info --format '{{.Swarm.NodeAddr}}')
  if [ -z "$worker_token" ] || [ -z "$manager_ip" ]; then
    log "Failed to obtain join token or manager address"
    return 1
  fi

  # A node removed with `docker node rm --force` still holds its old local
  # swarm state and would refuse to join, so it leaves first. The leave is
  # best-effort: it fails harmlessly when the node is not in a swarm.
  if ssh "$node" "docker swarm leave --force >/dev/null 2>&1; docker swarm join --token $worker_token $manager_ip:2377" 2>/dev/null; then
    log "Node $node successfully rejoined the cluster"
    # Give the manager a moment to register the node before checking it.
    sleep 5
    if check_node_status "$node"; then
      log "Node $node is now active in the cluster"
      return 0
    fi
  else
    log "Failed to rejoin node $node to the cluster"
  fi
  return 1
}
# recover_node NODE
# Recovery flow for one node; exits the script with 0 on success, 1 on
# failure or missing argument.
#   1. nothing to do if the node is already healthy
#   2. otherwise try restarting Docker on it
#   3. as a last resort remove it from the swarm and join it again
recover_node() {
  local node=$1
  [[ -n "$node" ]] || {
    echo "Usage: $0 <node-name>"
    exit 1
  }
  log "Starting recovery process for node: $node"

  # Step 1: current state.
  check_node_status $node && {
    log "Node $node is already active"
    exit 0
  }

  # Step 2: soft recovery.
  reconnect_node $node && {
    log "Node $node recovery completed successfully"
    exit 0
  }

  # Step 3: remove and rejoin.
  log "Attempting force removal and rejoin..."
  force_remove_node $node && {
    sleep 5
    rejoin_node $node && {
      log "Node $node recovery completed successfully"
      exit 0
    }
  }

  log "Node $node recovery failed"
  exit 1
}

# Run the recovery for the node named on the command line.
recover_node $NODE_NAME
管理节点故障处理
#!/bin/bash
# manager-node-recovery.sh
LOG_FILE='/var/log/manager-recovery.log'

# log MESSAGE
# Print MESSAGE prefixed with the current time and append the same line to
# $LOG_FILE.
log() {
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  tee -a "$LOG_FILE" <<< "$now - $1"
}
# Count manager nodes and how many of them are Ready.
# Returns 1 (after logging a warning) when fewer than two are Ready, else 0.
check_manager_status() {
  log "Checking manager node status..."
  local ready total
  ready=$(docker node ls --filter "role=manager" --format '{{.Hostname}} {{.Status}}' | grep -c "Ready")
  total=$(docker node ls --filter "role=manager" --format '{{.Hostname}}' | awk 'END { print NR }')
  log "Active managers: $ready/$total"
  # Fewer than two Ready managers is treated as insufficient.
  if (( ready < 2 )); then
    log "WARNING: Insufficient active manager nodes"
    return 1
  fi
  return 0
}
# Report whether the local manager is the Raft leader or a follower.
#
# Returns: 0 when the role could be determined,
#          1 when it could not (not a manager, no quorum, daemon error)
check_raft_status() {
  local is_leader
  log "Checking Raft consensus status..."
  # `docker info` has no "Leader:" line to grep for; the leader flag is part
  # of the node object's ManagerStatus, which `node inspect self` exposes.
  # The call itself fails when this node is not a manager or the swarm has
  # no leader, which is exactly the "unclear" case.
  if ! is_leader=$(docker node inspect self --format '{{.ManagerStatus.Leader}}' 2>/dev/null); then
    log "WARNING: Raft leader status unclear"
    return 1
  fi
  case "$is_leader" in
    true)
      log "Current node is the Raft leader"
      ;;
    false)
      log "Current node is a Raft follower"
      ;;
    *)
      log "WARNING: Raft leader status unclear"
      return 1
      ;;
  esac
  return 0
}
# promote_worker_to_manager [NODE]
# Promote a worker to manager. Without NODE, the first worker that is Ready
# and Active is chosen.
#
# Arguments: $1 - (optional) worker hostname to promote
# Returns:   0 on success, 1 when no candidate exists or promotion fails
promote_worker_to_manager() {
  local worker_node=$1
  if [ -z "$worker_node" ]; then
    # `docker node ls` has no "availability" filter (the daemon rejects it),
    # so status and availability are selected from the listing instead.
    worker_node=$(docker node ls --filter "role=worker" \
      --format '{{.Hostname}} {{.Status}} {{.Availability}}' \
      | awk '$2 == "Ready" && $3 == "Active" { print $1; exit }')
  fi
  if [ -z "$worker_node" ]; then
    log "ERROR: No available worker nodes to promote"
    return 1
  fi
  log "Promoting worker node $worker_node to manager"
  if docker node promote "$worker_node"; then
    log "Successfully promoted $worker_node to manager"
    return 0
  fi
  log "Failed to promote $worker_node to manager"
  return 1
}
# demote_manager_to_worker NODE
# Demote manager NODE to a worker, refusing when it is the only manager.
# Returns 0 on success, 1 on a missing argument, last-manager guard or
# failed demotion.
demote_manager_to_worker() {
  local manager_node=$1
  [[ -n "$manager_node" ]] || {
    log "ERROR: Manager node name required"
    return 1
  }
  log "Demoting manager node $manager_node to worker"

  # Never demote the last remaining manager.
  local managers_total
  managers_total=$(docker node ls --filter "role=manager" --format '{{.Hostname}}' | awk 'END { print NR }')
  if (( managers_total <= 1 )); then
    log "ERROR: Cannot demote the last manager node"
    return 1
  fi

  if docker node demote $manager_node; then
    log "Successfully demoted $manager_node to worker"
    return 0
  fi
  log "Failed to demote $manager_node to worker"
  return 1
}
# Bring the number of healthy (Ready) managers up to three by promoting
# workers via promote_worker_to_manager.
#
# Outputs: progress via log()
rebuild_manager_cluster() {
  local -r target_managers=3
  local -a healthy_managers=()
  local name status missing i
  log "Rebuilding manager cluster..."

  # One listing gives hostname and status together; exact rows avoid the
  # prefix matching of `--filter name=`.
  while read -r name status; do
    [ -n "$name" ] || continue
    if [ "$status" = "Ready" ]; then
      healthy_managers+=("$name")
    else
      log "Manager node $name is not healthy (status: $status)"
    fi
  done < <(docker node ls --filter "role=manager" --format '{{.Hostname}} {{.Status}}')
  log "Healthy managers: ${healthy_managers[*]}"

  # Promote exactly as many workers as are missing. The loop is bounded so
  # that a promotion which does not show up as Ready right away cannot cause
  # every worker in the cluster to be promoted.
  missing=$(( target_managers - ${#healthy_managers[@]} ))
  for (( i = 0; i < missing; i++ )); do
    if ! promote_worker_to_manager; then
      log "Failed to promote additional manager nodes"
      break
    fi
  done

  if (( missing > 0 )); then
    healthy_managers=()
    while read -r name status; do
      if [ -n "$name" ] && [ "$status" = "Ready" ]; then
        healthy_managers+=("$name")
      fi
    done < <(docker node ls --filter "role=manager" --format '{{.Hostname}} {{.Status}}')
    log "Healthy managers after promotion: ${healthy_managers[*]}"
  fi
  log "Manager cluster rebuild completed"
}
# Detect a possible network partition between managers and, after explicit
# confirmation, re-initialise the cluster from this node.
#
# Reachability is probed with ping from the local host. This side is
# considered partitioned when it cannot see a majority of all managers
# (itself included).
handle_split_brain() {
  local current_node manager confirm
  local reachable_managers=0 total_managers quorum
  local -a other_managers=()
  log "Handling potential split-brain scenario..."

  current_node=$(hostname)
  # -x/-F: drop only the exact local hostname; a substring match would also
  # discard e.g. "node10" when the local host is "node1".
  mapfile -t other_managers < <(docker node ls --filter "role=manager" --format '{{.Hostname}}' \
    | grep -vxF -- "$current_node")

  for manager in "${other_managers[@]}"; do
    [ -n "$manager" ] || continue
    if ping -c 1 -W 2 "$manager" > /dev/null 2>&1; then
      log "Manager node $manager is reachable"
      reachable_managers=$((reachable_managers + 1))
    else
      log "Manager node $manager is not reachable"
    fi
  done
  log "Reachable managers: $reachable_managers/${#other_managers[@]}"

  # Raft needs floor(N/2)+1 of N managers. This node counts as one of them.
  total_managers=$(( ${#other_managers[@]} + 1 ))
  quorum=$(( total_managers / 2 + 1 ))
  if (( reachable_managers + 1 < quorum )); then
    log "WARNING: Possible network partition detected"
    log "Consider manual intervention to resolve split-brain"
    # Re-initialising is destructive, so it requires the literal answer "yes".
    read -r -p "Force reinitialize cluster? (yes/no): " confirm
    if [ "$confirm" = "yes" ]; then
      force_reinit_cluster
    fi
  fi
}
# Dangerous: re-create the swarm from this node with
# `docker swarm init --force-new-cluster`. Service and overlay-network
# definitions are dumped as JSON under /var/backup first.
force_reinit_cluster() {
  log "WARNING: Force reinitializing cluster - this will lose cluster state!"

  # Timestamped backup directory.
  local backup_dir="/var/backup/swarm-$(date +%Y%m%d_%H%M%S)"
  mkdir -p $backup_dir

  # Dump every service definition.
  for service in $(docker service ls --format '{{.Name}}'); do
    docker service inspect $service > "$backup_dir/service-$service.json"
  done

  # Dump every overlay network definition.
  for network in $(docker network ls --filter "driver=overlay" --format '{{.Name}}'); do
    docker network inspect $network > "$backup_dir/network-$network.json"
  done
  log "Configuration backed up to: $backup_dir"

  # Re-initialise; services and other nodes must be restored by hand.
  if docker swarm init --force-new-cluster; then
    log "Cluster successfully reinitialized"
    log "Please manually restore services and rejoin other nodes"
  else
    log "Failed to reinitialize cluster"
  fi
}
# Entry point of the manager recovery script: repair the manager set when
# too few managers are healthy, then handle Raft problems if any.
main() {
  log "Starting manager node recovery process..."

  # Manager count check; rebuild on failure.
  check_manager_status || {
    log "Manager cluster needs attention"
    rebuild_manager_cluster
  }

  # Raft check; investigate a possible partition on failure.
  check_raft_status || {
    log "Raft consensus issues detected"
    handle_split_brain
  }

  log "Manager node recovery process completed"
}

# Run the recovery.
main
2.2 证书和认证问题
证书诊断脚本
#!/bin/bash
# certificate-diagnostics.sh
SWARM_DIR='/var/lib/docker/swarm'
CERT_DIR="${SWARM_DIR}/certificates"
LOG_FILE='/var/log/cert-diagnostics.log'

# log MESSAGE
# Print MESSAGE prefixed with the current time and append the same line to
# $LOG_FILE.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "$stamp - $1" | tee -a "$LOG_FILE"
}
# Verify that the three swarm certificate files exist under $CERT_DIR and
# log their permissions and size; *.crt files are also checked for validity.
check_certificate_files() {
  log "Checking certificate files..."
  local name path mode bytes
  for name in swarm-node.crt swarm-node.key swarm-root-ca.crt; do
    path="$CERT_DIR/$name"
    if [ ! -f "$path" ]; then
      log "ERROR: Certificate file missing: $name"
      continue
    fi
    log "Certificate file exists: $name"

    # Octal permission bits.
    mode=$(stat -c "%a" "$path")
    log " Permissions: $mode"

    # Size in bytes.
    bytes=$(stat -c "%s" "$path")
    log " Size: $bytes bytes"

    # Only certificates (not private keys) carry a validity period.
    case "$name" in
      *.crt) check_certificate_validity "$path" ;;
    esac
  done
}
# check_certificate_validity CERT_FILE
# Log validity period, days until expiry, subject and issuer of an X.509
# certificate.
#
# Arguments: $1 - path to a PEM certificate
# Returns:   0 when the certificate could be parsed, 1 otherwise
check_certificate_validity() {
  local cert_file=$1
  local cert_info not_before not_after subject issuer
  local expiry_date current_date days_until_expiry
  log "Checking certificate validity: $(basename "$cert_file")"

  # Declaration and assignment are separate on purpose: `local x=$(cmd)`
  # reports the status of `local`, which hides an openssl failure.
  if ! cert_info=$(openssl x509 -in "$cert_file" -text -noout 2>/dev/null); then
    log " ERROR: Cannot read certificate file"
    return 1
  fi

  not_before=$(grep -m1 "Not Before" <<< "$cert_info" | cut -d':' -f2-)
  not_after=$(grep -m1 "Not After" <<< "$cert_info" | cut -d':' -f2-)
  not_before=${not_before# }
  not_after=${not_after# }
  log " Not Before: $not_before"
  log " Not After: $not_after"

  # Warn when the certificate expires within 30 days.
  if expiry_date=$(date -d "$not_after" +%s 2>/dev/null) && [ -n "$expiry_date" ]; then
    current_date=$(date +%s)
    days_until_expiry=$(( (expiry_date - current_date) / 86400 ))
    # Compare timestamps, not whole days: integer division would report a
    # certificate that expired a few hours ago as "expires in 0 days".
    if (( expiry_date <= current_date )); then
      log " ERROR: Certificate has expired!"
    elif (( days_until_expiry < 30 )); then
      log " WARNING: Certificate expires in $days_until_expiry days"
    else
      log " OK: Certificate valid for $days_until_expiry days"
    fi
  else
    log " WARNING: Cannot parse certificate expiry date"
  fi

  subject=$(grep -m1 "Subject:" <<< "$cert_info" | cut -d':' -f2-)
  log " Subject: $subject"
  issuer=$(grep -m1 "Issuer:" <<< "$cert_info" | cut -d':' -f2-)
  log " Issuer: $issuer"
  return 0
}
# check_tls_connectivity [PORT]
# Attempt a mutual-TLS handshake with every manager using this node's swarm
# certificate, key and root CA.
#
# Arguments: $1 - (optional) TCP port to probe; defaults to 2377, the swarm
#                 cluster-management port. 2376 is the Docker daemon's TLS
#                 API port, which does not use the swarm node certificates.
# Globals:   CERT_DIR (read)
check_tls_connectivity() {
  local port=${1:-2377}
  local manager tls_test
  local -a manager_nodes=()
  log "Checking TLS connectivity..."

  mapfile -t manager_nodes < <(docker node ls --filter "role=manager" --format '{{.Hostname}}')
  for manager in "${manager_nodes[@]}"; do
    [ -n "$manager" ] || continue
    log "Testing TLS connection to manager: $manager"
    # stdin from /dev/null makes s_client exit right after the handshake;
    # timeout covers hosts that silently drop the connection.
    tls_test=$(timeout 5 openssl s_client -connect "$manager:$port" \
      -cert "$CERT_DIR/swarm-node.crt" \
      -key "$CERT_DIR/swarm-node.key" \
      -CAfile "$CERT_DIR/swarm-root-ca.crt" < /dev/null 2>&1)
    if grep -q "Verify return code: 0" <<< "$tls_test"; then
      log " TLS connection successful"
    else
      log " TLS connection failed"
      log " Error details: $(grep "verify error" <<< "$tls_test" | head -1)"
    fi
  done
}
# Back up and delete this node's swarm certificates, restart Docker, and
# tell the operator how to join the node to the swarm again.
regenerate_certificates() {
  log "Regenerating certificates..."

  # Keep a timestamped copy of the current certificates.
  local backup_dir="/var/backup/swarm-certs-$(date +%Y%m%d_%H%M%S)"
  mkdir -p "$backup_dir"
  [[ -d "$CERT_DIR" ]] && {
    cp -r "$CERT_DIR" "$backup_dir/"
    log "Certificates backed up to: $backup_dir"
  }

  # The daemon must be stopped while the certificate directory is removed.
  log "Stopping Docker service..."
  systemctl stop docker
  [[ -d "$CERT_DIR" ]] && {
    rm -rf "$CERT_DIR"
    log "Existing certificates removed"
  }

  log "Starting Docker service..."
  systemctl start docker
  # Allow the daemon some time to come up.
  sleep 10

  if ! systemctl is-active docker > /dev/null; then
    log "ERROR: Failed to restart Docker service"
    return
  fi
  log "Docker service restarted successfully"
  log "Attempting to rejoin cluster..."
  # The join token has to be fetched on a manager by the operator.
  printf '%s\n' \
    "Please run the following command on a manager node to get the join token:" \
    "docker swarm join-token worker" \
    "Then run the join command on this node."
}
# Reset ownership and modes of the swarm certificate directory:
# directory 700, *.crt 644, *.key 600, everything owned by root:root.
fix_certificate_permissions() {
  log "Fixing certificate permissions..."
  if [ ! -d "$CERT_DIR" ]; then
    log "Certificate directory not found: $CERT_DIR"
    return
  fi
  # Directory itself.
  chmod 700 "$CERT_DIR"
  # Certificates are world-readable, private keys are owner-only.
  find "$CERT_DIR" -name "*.crt" -exec chmod 644 {} +
  find "$CERT_DIR" -name "*.key" -exec chmod 600 {} +
  # Ownership.
  chown -R root:root "$CERT_DIR"
  log "Certificate permissions fixed"
}
# Entry point of the certificate diagnostics script: run the checks, then
# offer an interactive menu of repair actions.
main() {
  log "Starting certificate diagnostics..."

  # Nothing to diagnose unless this daemon is part of a swarm.
  docker info | grep -q "Swarm: active" || {
    log "ERROR: Docker is not in Swarm mode"
    exit 1
  }

  check_certificate_files
  check_tls_connectivity
  fix_certificate_permissions
  log "Certificate diagnostics completed"

  # Repair menu.
  printf '\n%s\n%s\n%s\n%s\n' \
    "Available actions:" \
    "1. Regenerate certificates (requires cluster rejoin)" \
    "2. Fix permissions only" \
    "3. Exit"
  read -p "Choose an action (1-3): " choice
  if [ "$choice" = "1" ]; then
    regenerate_certificates
  elif [ "$choice" = "2" ]; then
    fix_certificate_permissions
  elif [ "$choice" = "3" ]; then
    log "Exiting without changes"
  else
    log "Invalid choice"
  fi
}

# Run the diagnostics.
main
3. 服务故障排除
3.1 服务启动失败
服务诊断脚本
#!/bin/bash
# service-diagnostics.sh
# The service to diagnose is taken from the first command-line argument.
SERVICE_NAME=$1
LOG_FILE='/var/log/service-diagnostics.log'

# log MESSAGE
# Print MESSAGE prefixed with the current time and append the same line to
# $LOG_FILE.
log() {
  local entry
  entry="$(date '+%Y-%m-%d %H:%M:%S') - $1"
  printf '%s\n' "$entry" | tee -a "$LOG_FILE"
}
# check_service_status SERVICE
# Print the summary row, the detailed specification and the task list of a
# service.
#
# Arguments: $1 - service name or ID
# Returns:   0 when the service exists, 1 (after logging an error) otherwise
check_service_status() {
  local service=$1
  log "Checking service status: $service"

  # Existence is tested with `service inspect`: a `--format "table ..."`
  # listing always prints its header row, so its output is never empty and
  # cannot be used to detect a missing service.
  if ! docker service inspect "$service" > /dev/null 2>&1; then
    log "ERROR: Service $service not found"
    return 1
  fi

  docker service ls --filter "name=$service" --format "table {{.Name}}\t{{.Mode}}\t{{.Replicas}}\t{{.Image}}"

  log "Service details:"
  docker service inspect "$service" --pretty

  log "Task status:"
  docker service ps "$service"
  return 0
}
# analyze_task_failures SERVICE
# List the tasks of SERVICE that carry an error message and run the matching
# follow-up checks for well-known failure causes.
#
# Arguments: $1 - service name
# Outputs:   findings via log(); the failing tasks on stdout
analyze_task_failures() {
  local service=$1
  local failed_tasks
  log "Analyzing task failures for service: $service"

  # Every task with a non-empty error is considered, not only those whose
  # desired state is "shutdown": a task that cannot be scheduled stays
  # Pending with desired state Running. --no-trunc keeps the full error text
  # so the patterns below can match it.
  failed_tasks=$(docker service ps "$service" --no-trunc \
    --format '{{.Name}}|{{.CurrentState}}|{{.Error}}' \
    | awk -F'|' '$3 != "" { print $1, $2, $3 }')

  if [ -z "$failed_tasks" ]; then
    log "No failed tasks found"
    return 0
  fi

  log "Failed tasks found:"
  echo "$failed_tasks"

  if grep -qi "no suitable node" <<< "$failed_tasks"; then
    log "Issue: No suitable node found"
    log "Possible causes:"
    log " - Insufficient resources on nodes"
    log " - Placement constraints not met"
    log " - Node labels missing"
    check_placement_constraints "$service"
    check_resource_availability "$service"
  fi

  # Image errors are typically reported as "No such image: <ref>"; registry
  # failures mention a missing manifest or denied pull access.
  if grep -qiE "no such image|image.*not found|manifest.*(unknown|not found)|pull access denied" <<< "$failed_tasks"; then
    log "Issue: Image not found"
    log "Possible causes:"
    log " - Image name incorrect"
    log " - Image not available on nodes"
    log " - Registry authentication issues"
    check_image_availability "$service"
  fi

  if grep -qiE "port.*already (in use|allocated)|address already in use" <<< "$failed_tasks"; then
    log "Issue: Port conflict"
    log "Possible causes:"
    log " - Port already bound by another service"
    log " - Host port conflicts"
    check_port_conflicts "$service"
  fi
}
# check_placement_constraints SERVICE
# Log the placement constraints of SERVICE and, for node.role and
# node.labels.* constraints, the nodes they select or exclude.
#
# Arguments: $1 - service name
# Outputs:   findings via log()
check_placement_constraints() {
  local service=$1
  local constraint key op value role label
  local -a constraints=() nodes=()
  # <key> (== | !=) <value>, with optional spaces around the operator.
  local re='^[[:space:]]*([^[:space:]=!]+)[[:space:]]*(==|!=)[[:space:]]*(.*[^[:space:]])[[:space:]]*$'
  log "Checking placement constraints for service: $service"

  # One constraint per line, so an expression written with spaces
  # ("node.role == manager") stays in one piece.
  while IFS= read -r constraint; do
    [ -n "$constraint" ] && constraints+=("$constraint")
  done < <(docker service inspect "$service" \
    --format '{{range .Spec.TaskTemplate.Placement.Constraints}}{{println .}}{{end}}')

  if (( ${#constraints[@]} == 0 )); then
    log "No placement constraints defined"
    return 0
  fi

  log "Placement constraints: ${constraints[*]}"
  log "Nodes matching constraints:"
  for constraint in "${constraints[@]}"; do
    log " Constraint: $constraint"
    if [[ ! $constraint =~ $re ]]; then
      log " Cannot parse constraint"
      continue
    fi
    key=${BASH_REMATCH[1]}
    op=${BASH_REMATCH[2]}
    value=${BASH_REMATCH[3]}

    case "$key" in
      node.role)
        role=$value
        # There are only two roles, so "!=" selects the other one.
        if [ "$op" = "!=" ]; then
          if [ "$value" = "manager" ]; then role="worker"; else role="manager"; fi
        fi
        mapfile -t nodes < <(docker node ls --filter "role=$role" --format '{{.Hostname}}')
        log " Nodes with role $role: ${nodes[*]}"
        ;;
      node.labels.*)
        label=${key#node.labels.}
        log " Checking for label: $label $op $value"
        mapfile -t nodes < <(docker node ls --filter "node.label=$label=$value" --format '{{.Hostname}}')
        if [ "$op" = "==" ]; then
          log " Nodes with label $label=$value: ${nodes[*]}"
        else
          log " Nodes excluded by label $label=$value: ${nodes[*]}"
        fi
        ;;
      *)
        log " No automatic node check for: $key"
        ;;
    esac
  done
}
#######################################
# Report a service's resource limits/reservations and local node usage.
# Each requirement is read with its own "docker service inspect" call and is
# logged only when it carries a real value: empty output, "<no value>" and 0
# are all treated as "not set". CPU and memory usage are sampled with top and
# free for the node this script runs on; other nodes are only listed.
# Arguments: $1 - service name or ID
# Outputs:   findings through the script's log helper (defined outside this
#            section)
# Returns:   0
#######################################
check_resource_availability() {
  local service=$1
  local spec label unit template value
  local node local_host cpu_usage mem_usage
  local -a nodes=()
  # label|unit|template; "with" guards against unset resource sections
  local -a requirement_specs=(
    'CPU Limit|nanocpus|{{with .Spec.TaskTemplate.Resources}}{{with .Limits}}{{.NanoCPUs}}{{end}}{{end}}'
    'Memory Limit|bytes|{{with .Spec.TaskTemplate.Resources}}{{with .Limits}}{{.MemoryBytes}}{{end}}{{end}}'
    'CPU Reservation|nanocpus|{{with .Spec.TaskTemplate.Resources}}{{with .Reservations}}{{.NanoCPUs}}{{end}}{{end}}'
    'Memory Reservation|bytes|{{with .Spec.TaskTemplate.Resources}}{{with .Reservations}}{{.MemoryBytes}}{{end}}{{end}}'
  )

  log "Checking resource availability for service: $service"

  log "Resource requirements:"
  for spec in "${requirement_specs[@]}"; do
    IFS='|' read -r label unit template <<<"$spec"
    # docker prints its own error on stderr; an empty value is skipped below.
    value=$(docker service inspect "$service" --format "$template") \
      || value=""
    case "$value" in
      '' | '<no value>' | 0) continue ;;
    esac
    log "  $label: $value $unit"
  done

  log "Node resource status:"
  local_host=$(hostname)
  mapfile -t nodes < <(docker node ls --format '{{.Hostname}}')
  for node in "${nodes[@]}"; do
    log "  Node: $node"
    # Usage can only be sampled on the node this script is running on.
    if [[ "$node" == "$local_host" ]]; then
      cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
      mem_usage=$(free | awk 'NR==2{printf "%.2f", $3*100/$2 }')
      log "    CPU Usage: ${cpu_usage}%"
      log "    Memory Usage: ${mem_usage}%"
    fi
  done

  return 0
}
#######################################
# Check whether the image used by a service is present on the cluster nodes.
# Looks the image up locally with "docker image inspect" (an exact reference
# lookup, which also works for digest-pinned references that a
# Repository:Tag listing would never match), tries to pull it when it is
# missing, then repeats the lookup on every other node over ssh.
# Arguments: $1 - service name or ID
# Outputs:   findings through the script's log helper (defined outside this
#            section); "docker pull" output on stdout/stderr
# Returns:   0 (diagnostics are best-effort)
#######################################
check_image_availability() {
  local service=$1
  local image node local_host remote_cmd
  local -a nodes=()

  log "Checking image availability for service: $service"

  # docker prints its own error on stderr; an empty result is handled below.
  image=$(docker service inspect "$service" \
    --format '{{.Spec.TaskTemplate.ContainerSpec.Image}}') || image=""
  log "Service image: $image"

  if [[ -z "$image" ]]; then
    log "Unable to determine the service image"
    return 0
  fi

  if docker image inspect "$image" >/dev/null 2>&1; then
    log "Image available locally"
  else
    log "Image not available locally"
    log "Attempting to pull image..."
    if docker pull "$image"; then
      log "Image pulled successfully"
    else
      log "Failed to pull image"
      log "Possible causes:"
      log " - Image name incorrect"
      log " - Registry not accessible"
      log " - Authentication required"
    fi
  fi

  log "Checking image availability on other nodes..."
  local_host=$(hostname)
  # Quote the reference so the remote shell receives it as a single word.
  printf -v remote_cmd 'docker image inspect %q >/dev/null 2>&1' "$image"
  mapfile -t nodes < <(docker node ls --format '{{.Hostname}}')
  for node in "${nodes[@]}"; do
    if [[ "$node" == "$local_host" ]]; then
      continue
    fi
    log "  Checking node: $node"
    # ssh errors are discarded on purpose: an unreachable node is reported
    # the same way as a node without the image.
    if ssh "$node" "$remote_cmd" 2>/dev/null; then
      log "    Image available on $node"
    else
      log "    Image not available on $node"
    fi
  done

  return 0
}
#######################################
# Check whether a service's published ports are already bound on this host.
# Reads the published ports from the service spec, takes one snapshot of the
# listening sockets (netstat, or ss when netstat is not installed) and looks
# each published port up in it. A published port of 0 is skipped.
# Arguments: $1 - service name or ID
# Outputs:   findings through the script's log helper (defined outside this
#            section)
# Returns:   0 (diagnostics are best-effort)
#######################################
check_port_conflicts() {
  local service=$1
  local published_ports port_mapping published_port listeners matches process
  local -a mappings=()

  log "Checking port conflicts for service: $service"

  # "with" guards against a service that has no endpoint spec at all.
  # docker prints its own error on stderr; an empty result is handled below.
  published_ports=$(docker service inspect "$service" \
    --format '{{with .Spec.EndpointSpec}}{{range .Ports}}{{.PublishedPort}}:{{.TargetPort}}/{{.Protocol}} {{end}}{{end}}') \
    || published_ports=""

  if [[ -z "${published_ports//[[:space:]]/}" ]]; then
    log "No published ports defined"
    return 0
  fi
  log "Published ports: $published_ports"

  # One snapshot serves every port. stderr is discarded on purpose: without
  # root, -p only adds a notice that some owners cannot be shown.
  if command -v netstat >/dev/null 2>&1; then
    listeners=$(netstat -tulnp 2>/dev/null)
  elif command -v ss >/dev/null 2>&1; then
    listeners=$(ss -tulnp 2>/dev/null)
  else
    log "  Neither netstat nor ss is available; cannot check listening ports"
    return 0
  fi

  read -r -a mappings <<<"$published_ports"
  for port_mapping in "${mappings[@]}"; do
    published_port=${port_mapping%%:*}
    # NOTE(review): 0 is assumed to mean "no fixed published port" - confirm.
    if [[ -z "$published_port" || "$published_port" == "0" ]]; then
      log "  Skipping mapping without a fixed published port: $port_mapping"
      continue
    fi

    log "  Checking port: $published_port"
    matches=$(grep -F ":${published_port} " <<<"$listeners")
    if [[ -n "$matches" ]]; then
      log "    Port $published_port is in use"
      # The owner column position differs between TCP and UDP rows and
      # between netstat and ss, so find it by shape rather than by index.
      process=$(awk '{
        owner = "-"
        for (i = 1; i <= NF; i++) {
          if ($i ~ /^([0-9]+\/|users:)/) { owner = $i; break }
        }
        print owner
      }' <<<"$matches" | sort -u | paste -sd ' ' -)
      log "    Used by: $process"
    else
      log "    Port $published_port is available"
    fi
  done

  return 0
}
#######################################
# Show the most recent service logs and flag common problems in them.
# Fetches the last 50 log lines (stderr included), prints them, then searches
# case-insensitively for "error", "permission denied" and
# "connection refused". The matching lines are printed again directly above
# each finding so they are easy to spot.
# Arguments: $1 - service name or ID
# Outputs:   logs and matching lines on stdout; findings through the
#            script's log helper (defined outside this section)
# Returns:   0 (diagnostics are best-effort)
#######################################
check_service_logs() {
  local service=$1
  local logs
  local status=0

  log "Checking service logs: $service"

  # stderr is merged in so docker's own error message is captured as well.
  logs=$(docker service logs --tail 50 "$service" 2>&1) || status=$?
  if ((status != 0)); then
    # Keep going: partial output may still be worth scanning, but make it
    # clear that what follows may be docker's error rather than service logs.
    log "Warning: docker service logs exited with status $status; output may be incomplete"
  fi

  if [[ -z "$logs" ]]; then
    log "No logs available"
    return 0
  fi

  log "Recent service logs:"
  echo "$logs"

  if grep -i error <<<"$logs"; then
    log "Errors found in logs"
  fi
  if grep -i "permission denied" <<<"$logs"; then
    log "Permission issues detected"
  fi
  if grep -i "connection refused" <<<"$logs"; then
    log "Connection issues detected"
  fi

  return 0
}
#######################################
# Interactively apply one of several repair actions to a service.
# Prints a menu, reads the choice from stdin and runs the matching action:
#   1 - force-update the service
#   2 - scale to 0 and back to the replica count taken from the service spec
#       (aborted before touching the service if that count cannot be read)
#   3 - back up the service definition, then remove the service (the service
#       is kept if the backup could not be written)
#   4 - switch the service to a new image
#   5 - remove every placement constraint currently set on the service
#   6 - do nothing
# Arguments: $1 - service name or ID
# Outputs:   menu on stdout; progress through the script's log helper
#            (defined outside this section)
# Returns:   status of the last command run for the chosen option
#######################################
fix_service() {
  local service=$1
  local choice confirm new_image current_replicas backup_file
  local constraints constraint
  local -a rm_args=()

  log "Attempting to fix service: $service"
  echo "Available fix options:"
  echo "1. Restart service (force update)"
  echo "2. Scale service to 0 and back"
  echo "3. Remove and recreate service"
  echo "4. Update service image"
  echo "5. Remove placement constraints"
  echo "6. Exit without changes"
  read -r -p "Choose a fix option (1-6): " choice

  case "$choice" in
    1)
      log "Restarting service with force update..."
      docker service update --force "$service"
      ;;
    2)
      log "Scaling service to 0 and back..."
      # Read the count from the spec of exactly this service. It must be
      # known before scaling down, otherwise the service would stay at 0.
      current_replicas=$(docker service inspect "$service" \
        --format '{{with .Spec.Mode.Replicated}}{{.Replicas}}{{end}}') \
        || current_replicas=""
      if [[ ! "$current_replicas" =~ ^[0-9]+$ ]]; then
        log "Cannot determine the replica count (not a replicated service?); service left unchanged"
      elif docker service scale "$service=0"; then
        sleep 5
        docker service scale "$service=$current_replicas"
      else
        log "Failed to scale service down; service left unchanged"
      fi
      ;;
    3)
      log "WARNING: This will remove and recreate the service"
      read -r -p "Are you sure? (yes/no): " confirm
      if [[ "$confirm" == "yes" ]]; then
        # Unpredictable file name: a fixed path in a shared temp directory
        # could be pre-created by another user.
        if ! backup_file=$(mktemp \
          "${TMPDIR:-/tmp}/service-${service//\//_}-backup.XXXXXX"); then
          log "Could not create backup file; service not removed"
        elif docker service inspect "$service" >"$backup_file"; then
          log "Service configuration backed up to $backup_file"
          docker service rm "$service"
          log "Service removed. Please recreate manually using the backup configuration."
        else
          # Never remove the service without a usable backup.
          rm -f -- "$backup_file"
          log "Backup failed; service not removed"
        fi
      fi
      ;;
    4)
      read -r -p "Enter new image name: " new_image
      if [[ -n "$new_image" ]]; then
        log "Updating service image to: $new_image"
        docker service update --image "$new_image" "$service"
      fi
      ;;
    5)
      log "Removing placement constraints..."
      # --constraint-rm takes the constraint to remove as its value, so the
      # current constraints are read first and passed back one by one.
      constraints=$(docker service inspect "$service" \
        --format '{{with .Spec.TaskTemplate.Placement}}{{range .Constraints}}{{println .}}{{end}}{{end}}') \
        || constraints=""
      while IFS= read -r constraint; do
        if [[ -n "$constraint" ]]; then
          rm_args+=(--constraint-rm "$constraint")
        fi
      done <<<"$constraints"
      if ((${#rm_args[@]} == 0)); then
        log "No placement constraints to remove"
      else
        docker service update "${rm_args[@]}" "$service"
      fi
      ;;
    6)
      log "Exiting without changes"
      ;;
    *)
      log "Invalid choice"
      ;;
  esac
}
#######################################
# Main diagnostic flow for one service.
# Prints usage plus the list of known services and exits with status 1 when
# no service name is set. Otherwise checks the service status (exiting with
# status 1 if that check fails), analyzes task failures, inspects the logs
# and finally offers the interactive repair menu.
# Globals:   SERVICE_NAME (read) - name of the service to diagnose
# Outputs:   usage and prompts on stdout; progress through the script's log
#            helper (defined outside this section)
#######################################
main() {
  local fix_choice=""

  if [[ -z "${SERVICE_NAME:-}" ]]; then
    echo "Usage: $0 <service-name>"
    echo "Available services:"
    docker service ls --format '{{.Name}}'
    exit 1
  fi

  log "Starting service diagnostics for: $SERVICE_NAME"

  # Without a resolvable service none of the later steps are meaningful.
  if ! check_service_status "$SERVICE_NAME"; then
    exit 1
  fi

  analyze_task_failures "$SERVICE_NAME"
  check_service_logs "$SERVICE_NAME"

  echo
  # End of input (non-interactive run) counts as "no".
  read -r -p "Do you want to attempt to fix the service? (yes/no): " fix_choice \
    || fix_choice=""
  if [[ "$fix_choice" == "yes" ]]; then
    fix_service "$SERVICE_NAME"
  fi

  log "Service diagnostics completed for: $SERVICE_NAME"
}
# Run the main diagnostic flow
main