学习目标

通过本章学习,您将能够:

  • 掌握 Docker Swarm 常见故障的诊断方法
  • 学会使用各种调试工具和命令
  • 了解集群、服务和网络故障的排除技巧
  • 掌握日志分析和问题定位技能
  • 学会预防性维护和故障预警

1. 故障诊断基础

1.1 故障分类

故障类型概览

# Docker Swarm 故障分类:

# 1. 集群级别故障
# - 节点离线或不可达
# - 管理节点选举失败
# - 集群分裂(Split-brain)
# - 证书过期或认证失败

# 2. 服务级别故障
# - 服务启动失败
# - 副本数不足
# - 服务更新失败
# - 健康检查失败

# 3. 网络级别故障
# - 容器间通信失败
# - 负载均衡异常
# - DNS 解析问题
# - 端口冲突

# 4. 存储级别故障
# - 卷挂载失败
# - 存储空间不足
# - 权限问题
# - 数据损坏

# 5. 资源级别故障
# - CPU/内存不足
# - 磁盘空间不足
# - 网络带宽限制
# - 文件描述符耗尽

故障诊断流程

# 标准故障诊断流程:

# 1. 问题识别
# - 收集故障现象
# - 确定影响范围
# - 记录故障时间

# 2. 信息收集
# - 查看系统状态
# - 收集相关日志
# - 检查配置变更

# 3. 问题分析
# - 分析日志信息
# - 对比正常状态
# - 确定根本原因

# 4. 解决方案
# - 制定修复计划
# - 实施修复措施
# - 验证修复效果

# 5. 预防措施
# - 总结经验教训
# - 完善监控告警
# - 更新运维文档

1.2 基础诊断命令

集群状态检查

#!/bin/bash
# cluster-health-check.sh
#
# One-shot health overview of a Docker Swarm cluster: swarm info, nodes,
# services, networks, volumes, host resources, daemon status, recent events
# and a short list of likely problems. Everything is printed to stdout.

echo "=== Docker Swarm Cluster Health Check ==="
echo "Timestamp: $(date)"
echo

# 1. Basic cluster information
echo "1. Cluster Information:"
docker info | grep -A 10 "Swarm:"
echo

# 2. Node status
echo "2. Node Status:"
docker node ls
echo

# 3. Service status
echo "3. Service Status:"
docker service ls
echo

# 4. Network status
echo "4. Network Status:"
docker network ls
echo

# 5. Volume status
echo "5. Volume Status:"
docker volume ls
echo

# 6. Host resources
echo "6. System Resources:"
echo "CPU Usage:"
top -bn1 | grep "Cpu(s)"
echo "Memory Usage:"
free -h
echo "Disk Usage:"
df -h
echo

# 7. Docker daemon status
echo "7. Docker Daemon Status:"
systemctl status docker --no-pager
echo

# 8. Recent Docker events
echo "8. Recent Docker Events:"
# An explicit Unix timestamp for --until makes the command return instead of
# streaming; it is one of the documented timestamp formats.
docker events --since "1h" --until "$(date +%s)" | tail -10
echo

# 9. Error detection
echo "9. Error Detection:"
echo "Failed services:"
# `docker service ls` has no desired-state filter, so filter the output here.
# The pattern is anchored so that "10/10" is not mistaken for zero replicas.
docker service ls --format '{{.Name}} {{.Replicas}}' | awk '$2 ~ /^0\//'

echo "Unhealthy nodes:"
# `docker node ls` has no availability filter; report every node that is not
# both Ready and Active (covers Down, Drain and Pause).
docker node ls --format '{{.Hostname}} {{.Status}} {{.Availability}}' |
    awk '$2 != "Ready" || $3 != "Active"'

echo "Network issues:"
# Lists overlay networks whose row does not mention "swarm".
docker network ls --filter "driver=overlay" --format "table {{.Name}}\t{{.Driver}}\t{{.Scope}}" | grep -v "swarm"

echo "=== Health Check Complete ==="

详细诊断脚本

#!/bin/bash
# detailed-diagnostics.sh
#
# Collects system, Docker and Swarm diagnostics into a timestamped text
# report plus an HTML summary, both stored under DIAG_DIR.

DIAG_DIR="/var/log/swarm-diagnostics"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
REPORT_FILE="$DIAG_DIR/diagnostic-report-$TIMESTAMP.txt"

# Create the diagnostics directory; every later step appends to files in it.
mkdir -p -- "$DIAG_DIR" ||
    echo "WARNING: cannot create $DIAG_DIR; writing the reports will fail" >&2

# Print a timestamped message to stdout and append it to the text report.
# Arguments: $1 - message text
# Globals:   REPORT_FILE (read) - file the message is appended to
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a -- "$REPORT_FILE"
}

# Append host details (OS, hardware, network) to the text report.
# Globals: REPORT_FILE (appended to)
# Commands that are missing on the host only produce an error on stderr;
# the remaining sections are still written.
collect_system_info() {
    log "Collecting system information..."

    {
        echo "=== System Information ==="
        echo "Hostname: $(hostname)"
        echo "Kernel: $(uname -r)"
        echo "OS: $(grep PRETTY_NAME /etc/os-release | cut -d'=' -f2 | tr -d '"')"
        echo "Uptime: $(uptime)"
        echo "Date: $(date)"
        echo

        echo "=== Hardware Information ==="
        echo "CPU Info:"
        lscpu | grep -E "Model name|CPU\(s\)|Thread\(s\)"
        echo
        echo "Memory Info:"
        free -h
        echo
        echo "Disk Info:"
        df -h
        echo

        echo "=== Network Information ==="
        echo "Network Interfaces:"
        ip addr show
        echo
        echo "Routing Table:"
        ip route show
        echo
        echo "Network Connections:"
        ss -tuln | head -20
        echo

    } >> "$REPORT_FILE"   # quoted: an unquoted target breaks on whitespace
}

# Append Docker engine details (version, info, disk usage, containers,
# images, networks, volumes) to the text report.
# Globals: REPORT_FILE (appended to)
collect_docker_info() {
    log "Collecting Docker information..."

    {
        echo "=== Docker Information ==="
        docker version
        echo
        docker info
        echo

        echo "=== Docker Storage ==="
        docker system df
        echo

        echo "=== Docker Processes ==="
        docker ps -a
        echo

        echo "=== Docker Images ==="
        docker images
        echo

        echo "=== Docker Networks ==="
        docker network ls
        echo

        echo "=== Docker Volumes ==="
        docker volume ls
        echo

    } >> "$REPORT_FILE"   # quoted: an unquoted target breaks on whitespace
}

# Append Swarm details (cluster info, every node, every service with its
# tasks, every stack) to the text report.
# Globals: REPORT_FILE (appended to)
collect_swarm_info() {
    log "Collecting Swarm information..."

    local node_id node_host service stack

    {
        echo "=== Swarm Cluster Information ==="
        docker info | grep -A 20 "Swarm:"
        echo

        echo "=== Node Information ==="
        docker node ls
        echo

        # Per-node details. Nodes are inspected by ID because hostnames are
        # not guaranteed to be unique within a swarm.
        while read -r node_id node_host; do
            [[ -n "$node_id" ]] || continue
            echo "--- Node: $node_host ---"
            docker node inspect --pretty "$node_id"
            echo
        done < <(docker node ls --format '{{.ID}} {{.Hostname}}')

        echo "=== Service Information ==="
        docker service ls
        echo

        # Per-service details
        while IFS= read -r service; do
            [[ -n "$service" ]] || continue
            echo "--- Service: $service ---"
            docker service inspect --pretty "$service"
            echo
            echo "Service Tasks:"
            docker service ps "$service"
            echo
        done < <(docker service ls --format '{{.Name}}')

        echo "=== Stack Information ==="
        docker stack ls
        echo

        # Per-stack details
        while IFS= read -r stack; do
            [[ -n "$stack" ]] || continue
            echo "--- Stack: $stack ---"
            docker stack ps "$stack"
            echo
        done < <(docker stack ls --format '{{.Name}}')

    } >> "$REPORT_FILE"
}

# Append recent daemon, system, service and container logs to the text report.
# Globals: REPORT_FILE (appended to)
collect_logs() {
    log "Collecting log information..."

    local service container

    {
        echo "=== System Logs ==="
        echo "Docker Daemon Logs (last 100 lines):"
        journalctl -u docker --no-pager -n 100
        echo

        echo "System Logs (last 50 lines):"
        # Use whichever of the two common syslog locations exists.
        tail -n 50 /var/log/syslog 2>/dev/null || tail -n 50 /var/log/messages 2>/dev/null
        echo

        echo "=== Service Logs ==="
        while IFS= read -r service; do
            [[ -n "$service" ]] || continue
            echo "--- Service Logs: $service (last 50 lines) ---"
            docker service logs --tail 50 "$service" 2>&1
            echo
        done < <(docker service ls --format '{{.Name}}')

        echo "=== Container Logs ==="
        while IFS= read -r container; do
            [[ -n "$container" ]] || continue
            echo "--- Container Logs: $container (last 30 lines) ---"
            docker logs --tail 30 "$container" 2>&1
            echo
        done < <(docker ps --format '{{.Names}}')

    } >> "$REPORT_FILE"
}

# Append CPU, memory, disk I/O, network, load, process and container
# resource snapshots to the text report.
# Globals: REPORT_FILE (appended to)
collect_performance_info() {
    log "Collecting performance information..."

    local stats_rc

    {
        echo "=== Performance Information ==="
        echo "CPU Usage:"
        top -bn1 | head -20
        echo

        echo "Memory Usage:"
        cat /proc/meminfo
        echo

        echo "Disk I/O:"
        iostat -x 1 3 2>/dev/null || echo "iostat not available"
        echo

        echo "Network Statistics:"
        cat /proc/net/dev
        echo

        echo "Load Average:"
        cat /proc/loadavg
        echo

        echo "Process List:"
        ps aux | head -20
        echo

        echo "Docker Stats:"
        # timeout(1) exits with 124 when the time limit is hit; any other
        # non-zero status is a genuine failure and is reported as such.
        stats_rc=0
        timeout 10 docker stats --no-stream 2>/dev/null || stats_rc=$?
        if (( stats_rc == 124 )); then
            echo "Docker stats timeout"
        elif (( stats_rc != 0 )); then
            echo "Docker stats failed (exit code $stats_rc)"
        fi
        echo

    } >> "$REPORT_FILE"
}

# Append network diagnostics (Docker networks, listening ports, firewall
# rules, DNS configuration, outbound connectivity) to the text report.
# Globals: REPORT_FILE (appended to)
network_diagnostics() {
    log "Running network diagnostics..."

    local network

    {
        echo "=== Network Diagnostics ==="

        # Inspect every Docker network
        echo "Docker Networks:"
        while IFS= read -r network; do
            [[ -n "$network" ]] || continue
            echo "--- Network: $network ---"
            docker network inspect "$network"
            echo
        done < <(docker network ls --format '{{.Name}}')

        # Listening ports (ss is the fallback when netstat is unavailable)
        echo "Listening Ports:"
        netstat -tuln 2>/dev/null || ss -tuln
        echo

        # Firewall rules
        echo "Firewall Rules:"
        iptables -L -n 2>/dev/null || echo "iptables not accessible"
        echo

        # DNS configuration
        echo "DNS Configuration:"
        cat /etc/resolv.conf
        echo

        # Outbound connectivity; -W bounds the wait per reply so an offline
        # host does not stall the whole collection.
        echo "Network Connectivity Tests:"
        echo "Ping Google DNS:"
        ping -c 3 -W 2 8.8.8.8 2>&1
        echo

        echo "Ping Docker Hub:"
        ping -c 3 -W 2 registry-1.docker.io 2>&1
        echo

    } >> "$REPORT_FILE"
}

# Build the HTML summary report next to the text report.
# Globals: DIAG_DIR, TIMESTAMP, REPORT_FILE (read)
# Outputs: writes $DIAG_DIR/diagnostic-report-$TIMESTAMP.html and logs the
#          paths of both reports.
generate_report() {
    log "Generating diagnostic report..."

    local html_report="$DIAG_DIR/diagnostic-report-$TIMESTAMP.html"
    local swarm_info node_count service_count failed_services
    local cluster_status="Unknown"
    local cluster_class="error" node_class="error" failed_class="error"

    # Query the daemon once and classify the swarm state from its output.
    swarm_info=$(docker info)
    if grep -q "Swarm: active" <<<"$swarm_info"; then
        cluster_status="Active"
        cluster_class="success"
    elif grep -q "Swarm: inactive" <<<"$swarm_info"; then
        cluster_status="Inactive"
    fi

    # Count IDs rather than table rows minus a header, so that a failing
    # command yields 0 instead of -1.
    node_count=$(docker node ls --format '{{.ID}}' 2>/dev/null | grep -c .)
    service_count=$(docker service ls --format '{{.ID}}' 2>/dev/null | grep -c .)

    # A service counts as failed when it has zero running replicas. The
    # pattern is anchored so "10/10" is not mistaken for "0/...".
    failed_services=$(docker service ls --format '{{.Name}} {{.Replicas}}' 2>/dev/null |
        awk '$2 ~ /^0\// { n++ } END { print n + 0 }')

    (( node_count > 0 )) && node_class="success"
    (( failed_services == 0 )) && failed_class="success"

    # The heredoc delimiter is unquoted on purpose: $(date), $(hostname) and
    # the summary variables must be expanded into the page.
    cat > "$html_report" <<EOF
<!DOCTYPE html>
<html>
<head>
    <title>Docker Swarm Diagnostic Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .header { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
        .section { margin: 20px 0; border: 1px solid #ddd; padding: 10px; border-radius: 5px; }
        .error { background-color: #ffebee; }
        .warning { background-color: #fff3e0; }
        .success { background-color: #e8f5e8; }
        pre { background-color: #f5f5f5; padding: 10px; overflow-x: auto; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
    </style>
</head>
<body>
    <div class="header">
        <h1>Docker Swarm Diagnostic Report</h1>
        <p>Generated: $(date)</p>
        <p>Hostname: $(hostname)</p>
    </div>
    <div class='section'>
        <h2>Executive Summary</h2>
        <table>
            <tr><th>Metric</th><th>Value</th><th>Status</th></tr>
            <tr><td>Cluster Status</td><td>$cluster_status</td><td class='$cluster_class'></td></tr>
            <tr><td>Node Count</td><td>$node_count</td><td class='$node_class'></td></tr>
            <tr><td>Service Count</td><td>$service_count</td><td class='success'></td></tr>
            <tr><td>Failed Services</td><td>$failed_services</td><td class='$failed_class'></td></tr>
        </table>
    </div>
    <div class='section'>
        <h2>Detailed Information</h2>
        <p>Full diagnostic report: <a href='diagnostic-report-$TIMESTAMP.txt'>diagnostic-report-$TIMESTAMP.txt</a></p>
    </div>
</body></html>
EOF

    log "Diagnostic report generated:"
    log "  Text report: $REPORT_FILE"
    log "  HTML report: $html_report"
}

# Entry point: run every collector in order, build the report and tell the
# user where the results were written.
main() {
    local step

    log "Starting comprehensive diagnostic collection..."

    for step in \
        collect_system_info \
        collect_docker_info \
        collect_swarm_info \
        collect_logs \
        collect_performance_info \
        network_diagnostics \
        generate_report; do
        "$step"
    done

    log "Diagnostic collection completed successfully"
    echo "Reports saved to: $DIAG_DIR"
}

# Run the main function
main

3.2 服务健康检查故障

健康检查诊断

#!/bin/bash
# health-check-diagnostics.sh

# Name of the service to diagnose, taken from the first positional argument.
SERVICE_NAME="$1"
# File that every log() message is appended to.
LOG_FILE='/var/log/health-check-diagnostics.log'

# Print a timestamped message to stdout and append it to the log file.
# Arguments: $1 - message text
# Globals:   LOG_FILE (read) - file the message is appended to
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a -- "$LOG_FILE"
}

# Report the health check configured in a service's spec.
# Arguments: $1 - service name
# Returns:   0 when a health check is configured, 1 when none is found
#            (which is also the result when the service cannot be inspected)
check_health_config() {
    local service=$1
    local hc_path='.Spec.TaskTemplate.ContainerSpec.Healthcheck'
    local health_json test_cmd interval timeout retries start_period

    log "Checking health check configuration for service: $service"

    # Render the section as JSON: an absent health check then shows up as
    # "null". The plain template rendering would print "<nil>", which is easy
    # to miss, and would fail outright when dereferencing its fields.
    health_json=$(docker service inspect "$service" --format "{{json $hc_path}}" 2>/dev/null)

    case "$health_json" in
        '' | null | '<no value>' | '<nil>')
            log "No health check configured for service $service"
            return 1
            ;;
    esac

    log "Health check configuration:"
    # Pretty-print when jq is installed, otherwise show the raw JSON.
    if command -v jq >/dev/null 2>&1; then
        jq . <<<"$health_json"
    else
        printf '%s\n' "$health_json"
    fi

    # Individual health check settings
    test_cmd=$(docker service inspect "$service" --format "{{range $hc_path.Test}}{{.}} {{end}}")
    interval=$(docker service inspect "$service" --format "{{$hc_path.Interval}}")
    timeout=$(docker service inspect "$service" --format "{{$hc_path.Timeout}}")
    retries=$(docker service inspect "$service" --format "{{$hc_path.Retries}}")
    start_period=$(docker service inspect "$service" --format "{{$hc_path.StartPeriod}}")

    log "Health check details:"
    log "  Test command: $test_cmd"
    log "  Interval: $interval"
    log "  Timeout: $timeout"
    log "  Retries: $retries"
    log "  Start period: $start_period"

    return 0
}

# Run a service's health check command by hand inside one of its local
# containers and log the outcome.
# Arguments: $1 - service name
# Returns:   1 when no running task or no matching local container is found,
#            0 otherwise
test_health_command() {
    local service=$1
    local task_name actual_container line last result exit_code
    local -a health_test=() exec_cmd=()

    log "Testing health check command for service: $service"

    # Name of one running task, in the <task-name>.<task-id> form used to look
    # up its container below.
    task_name=$(docker service ps "$service" --filter "desired-state=running" \
        --format '{{.Name}}.{{.ID}}' | head -1)

    if [[ -z "$task_name" ]]; then
        log "No running containers found for service $service"
        return 1
    fi

    # Resolve the task to a container ID on this node.
    actual_container=$(docker ps --filter "name=$task_name" --format '{{.ID}}' | head -1)

    if [[ -z "$actual_container" ]]; then
        log "Container not found: $task_name"
        return 1
    fi

    log "Testing health check on container: $actual_container"

    # Read the Test array one element per line so that a CMD-SHELL command
    # containing spaces stays a single element.
    while IFS= read -r line; do
        health_test+=("$line")
    done < <(docker service inspect "$service" \
        --format '{{range .Spec.TaskTemplate.ContainerSpec.Healthcheck.Test}}{{println .}}{{end}}' 2>/dev/null)

    # Drop the blank trailing lines produced by the template output.
    while (( ${#health_test[@]} > 0 )); do
        last=$(( ${#health_test[@]} - 1 ))
        [[ -z "${health_test[$last]}" ]] || break
        unset "health_test[$last]"
    done

    # The first element says how to run the rest: CMD-SHELL carries one shell
    # command line, CMD carries an argv list, NONE disables the check.
    case "${health_test[0]:-}" in
        CMD-SHELL)
            exec_cmd=(sh -c "${health_test[1]:-}")
            ;;
        CMD)
            exec_cmd=("${health_test[@]:1}")
            ;;
        NONE)
            log "Health check is disabled (NONE)"
            return 0
            ;;
    esac

    if (( ${#exec_cmd[@]} == 0 )); then
        log "No health check command found"
        return 0
    fi

    log "Executing health check command: ${exec_cmd[*]}"

    # Assign separately from `local`; `local var=$(cmd)` would make $? the
    # status of `local`, which is always 0.
    result=$(docker exec "$actual_container" "${exec_cmd[@]}" 2>&1)
    exit_code=$?

    log "Health check result:"
    log "  Exit code: $exit_code"
    log "  Output: $result"

    if (( exit_code == 0 )); then
        log "Health check passed"
    else
        log "Health check failed"

        # Explain the most likely cause
        analyze_health_failure "$result" "$exit_code"
    fi
}

# Log a likely explanation for a failed health check.
# Arguments: $1 - combined output of the health check command
#            $2 - exit code of the health check command
analyze_health_failure() {
    local output="$1"
    local exit_code=$2

    log "Analyzing health check failure..."

    case "$exit_code" in
        1)
            log "Health check returned unhealthy status"
            ;;
        2)
            # Docker documents exit code 2 of a health command as reserved.
            log "Health check returned exit code 2 (reserved by Docker; health commands should exit 0 or 1)"
            ;;
        126)
            log "Health check command not executable"
            ;;
        127)
            log "Health check command not found"
            ;;
        *)
            log "Health check failed with exit code: $exit_code"
            ;;
    esac

    # Match common error patterns. grep runs quietly so the matched text is
    # not echoed a second time; the caller has already logged the output.
    if grep -qi "connection refused" <<<"$output"; then
        log "Issue: Connection refused - service may not be listening"
    elif grep -qi "timeout" <<<"$output"; then
        log "Issue: Timeout - service may be slow to respond"
    elif grep -qi "permission denied" <<<"$output"; then
        log "Issue: Permission denied - check file permissions"
    elif grep -qi "no such file" <<<"$output"; then
        log "Issue: File not found - check file paths"
    fi
}

# Prompt for a value and, when one is entered, apply it to the service with
# the given `docker service update` flag.
# Arguments: $1 - prompt text, $2 - setting name used in the log message,
#            $3 - docker flag, $4 - service name
_update_health_option() {
    local prompt=$1 label=$2 flag=$3 service=$4
    local value

    # -r keeps backslashes intact, which matters for health check commands.
    read -r -p "$prompt" value
    if [[ -n "$value" ]]; then
        log "Updating health check $label to: $value"
        docker service update "$flag" "$value" "$service"
    fi
}

# Interactive menu for adjusting or disabling a service's health check.
# Arguments: $1 - service name
# Input:     reads the menu choice and the new value from stdin
fix_health_check() {
    local service=$1
    local choice

    log "Health check fix options for service: $service"

    echo "Available fix options:"
    echo "1. Increase health check timeout"
    echo "2. Increase health check interval"
    echo "3. Increase retry count"
    echo "4. Increase start period"
    echo "5. Update health check command"
    echo "6. Disable health check"
    echo "7. Exit without changes"

    read -r -p "Choose a fix option (1-7): " choice

    case "$choice" in
        1)
            _update_health_option "Enter new timeout (e.g., 30s): " \
                "timeout" --health-timeout "$service"
            ;;
        2)
            _update_health_option "Enter new interval (e.g., 30s): " \
                "interval" --health-interval "$service"
            ;;
        3)
            _update_health_option "Enter new retry count (e.g., 5): " \
                "retries" --health-retries "$service"
            ;;
        4)
            _update_health_option "Enter new start period (e.g., 60s): " \
                "start period" --health-start-period "$service"
            ;;
        5)
            _update_health_option "Enter new health check command: " \
                "command" --health-cmd "$service"
            ;;
        6)
            log "Disabling health check..."
            docker service update --no-healthcheck "$service"
            ;;
        7)
            log "Exiting without changes"
            ;;
        *)
            log "Invalid choice"
            ;;
    esac
}

# Entry point: diagnose the health check of SERVICE_NAME and optionally fix
# it, or offer to add one when the service has none.
# Globals: SERVICE_NAME (read)
# Exits with status 1 when no service name was given.
main() {
    local fix_choice add_choice health_cmd

    if [[ -z "$SERVICE_NAME" ]]; then
        echo "Usage: $0 <service-name>" >&2
        exit 1
    fi

    log "Starting health check diagnostics for: $SERVICE_NAME"

    if check_health_config "$SERVICE_NAME"; then
        test_health_command "$SERVICE_NAME"

        echo
        read -r -p "Do you want to fix the health check? (yes/no): " fix_choice

        if [[ "$fix_choice" == "yes" ]]; then
            fix_health_check "$SERVICE_NAME"
        fi
    else
        echo "No health check configured. Would you like to add one?"
        read -r -p "(yes/no): " add_choice

        if [[ "$add_choice" == "yes" ]]; then
            # -r keeps backslashes in the entered command intact.
            read -r -p "Enter health check command: " health_cmd
            if [[ -n "$health_cmd" ]]; then
                docker service update --health-cmd "$health_cmd" "$SERVICE_NAME"
                log "Health check added to service $SERVICE_NAME"
            fi
        fi
    fi

    log "Health check diagnostics completed"
}

# Run the main function
main

4. 网络故障排除

4.1 网络连通性问题

网络诊断脚本

#!/bin/bash
# network-diagnostics.sh

# File that every log() message is appended to.
LOG_FILE='/var/log/network-diagnostics.log'

# Print a timestamped message to stdout and append it to the log file.
# Arguments: $1 - message text
# Globals:   LOG_FILE (read) - file the message is appended to
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a -- "$LOG_FILE"
}

# Print an overview of the host's network setup: Docker networks, overlay
# networks, interfaces, routes, firewall rules and network namespaces.
check_network_infrastructure() {
    local rules

    log "Checking network infrastructure..."

    # All Docker networks
    log "Docker networks:"
    docker network ls

    # Overlay networks only
    log "Overlay networks:"
    docker network ls --filter "driver=overlay"

    # Network interfaces
    log "Network interfaces:"
    ip addr show

    # Routing table
    log "Routing table:"
    ip route show

    # iptables rules. The output is captured first so that a failure (for
    # example when not running as root) is reported instead of being hidden
    # behind the exit status of `head`.
    log "Iptables rules:"
    if rules=$(iptables -L -n 2>/dev/null); then
        head -20 <<<"$rules"
    else
        log "iptables not accessible"
    fi

    # Network namespaces
    log "Network namespaces:"
    ip netns list

    # Docker keeps its sandboxes under /var/run/docker/netns, which
    # `ip netns list` does not read, so list them separately when present.
    if [[ -d /var/run/docker/netns ]]; then
        log "Docker network namespaces (/var/run/docker/netns):"
        ls -1 /var/run/docker/netns
    fi
}

# Log the attached networks, published ports and virtual IPs of every service.
check_service_networks() {
    log "Checking service network configurations..."

    local service networks ports vip

    # Read names line by line instead of word-splitting the command output.
    while IFS= read -r service; do
        [[ -n "$service" ]] || continue
        log "Service: $service"

        # Networks the service is attached to
        networks=$(docker service inspect "$service" \
            --format '{{range .Spec.TaskTemplate.Networks}}{{.Target}} {{end}}')
        log "  Networks: $networks"

        # Published ports
        ports=$(docker service inspect "$service" \
            --format '{{range .Spec.EndpointSpec.Ports}}{{.PublishedPort}}:{{.TargetPort}}/{{.Protocol}} {{end}}')
        if [[ -n "$ports" ]]; then
            log "  Published ports: $ports"
        fi

        # Service endpoint virtual IPs
        vip=$(docker service inspect "$service" \
            --format '{{range .Endpoint.VirtualIPs}}{{.NetworkID}}:{{.Addr}} {{end}}')
        if [[ -n "$vip" ]]; then
            log "  Virtual IPs: $vip"
        fi
    done < <(docker service ls --format '{{.Name}}')
}

# Ping every running container from every other running container and log
# the result of each pair.
# Returns: 1 when fewer than two containers are running.
# Note: containers that share no network are expected to fail.
test_container_connectivity() {
    log "Testing container connectivity..."

    local -a containers=()
    local name source_container target_container target_ip ping_result

    # All running containers, one name per line
    while IFS= read -r name; do
        [[ -n "$name" ]] && containers+=("$name")
    done < <(docker ps --format '{{.Names}}')

    if (( ${#containers[@]} < 2 )); then
        log "Need at least 2 containers to test connectivity"
        return 1
    fi

    for source_container in "${containers[@]}"; do
        for target_container in "${containers[@]}"; do
            [[ "$source_container" == "$target_container" ]] && continue

            log "Testing connectivity: $source_container -> $target_container"

            # Print one address per line and take the first non-empty one.
            # Without a separator the addresses of a container attached to
            # several networks would be glued into one invalid string.
            target_ip=$(docker inspect "$target_container" \
                --format '{{range .NetworkSettings.Networks}}{{println .IPAddress}}{{end}}' |
                awk 'NF { print $1; exit }')

            if [[ -z "$target_ip" ]]; then
                log "  ✗ Could not get target IP for $target_container"
                continue
            fi

            # Judge by ping's exit status; the wording of its summary line
            # differs between ping implementations.
            if ping_result=$(docker exec "$source_container" ping -c 1 -W 2 "$target_ip" 2>&1); then
                log "  ✓ Connectivity successful"
            else
                log "  ✗ Connectivity failed"
                log "    Target IP: $target_ip"
                log "    Error: $(tail -n 1 <<<"$ping_result")"
            fi
        done
    done
}

# Resolve and ping every service name from inside one running container.
# Returns: 1 when there are no services or no running container to test from.
# NOTE(review): the first running container is used as the probe; it may not
# share a network with every service - confirm before trusting failures.
test_service_discovery() {
    log "Testing service discovery..."

    local -a services=()
    local name service test_container dns_result service_ip

    # All services, one name per line
    while IFS= read -r name; do
        [[ -n "$name" ]] && services+=("$name")
    done < <(docker service ls --format '{{.Name}}')

    if (( ${#services[@]} == 0 )); then
        log "No services found"
        return 1
    fi

    # Pick a running container to run the tests from
    test_container=$(docker ps --format '{{.Names}}' | head -1)

    if [[ -z "$test_container" ]]; then
        log "No running containers found for testing"
        return 1
    fi

    log "Using test container: $test_container"

    for service in "${services[@]}"; do
        log "Testing service discovery for: $service"

        dns_result=$(docker exec "$test_container" nslookup "$service" 2>&1)

        # Only an "Address" line that follows a "Name:" line is an answer.
        # The first "Address" line belongs to the DNS server itself and is
        # printed even when the lookup fails.
        service_ip=$(awk '
            /^Name:/ { seen = 1; next }
            seen && /^Address/ { sub(/^Address[^:]*:[ \t]*/, ""); print $1; exit }
        ' <<<"$dns_result")

        if [[ -n "$service_ip" ]]; then
            log "  ✓ DNS resolution successful: $service -> $service_ip"

            # Judge connectivity by ping's exit status.
            if docker exec "$test_container" ping -c 1 -W 2 "$service" >/dev/null 2>&1; then
                log "  ✓ Service connectivity successful"
            else
                log "  ✗ Service connectivity failed"
            fi
        else
            log "  ✗ DNS resolution failed for $service"
            log "    Error: $(grep -iE "error|fail|can't find|not found|NXDOMAIN" <<<"$dns_result" | head -1)"
        fi
    done
}

# Send ten HTTP requests to the first service that wants more than one
# replica and log whether each request succeeded.
# Returns: 1 when no such service or no running container is available.
test_load_balancing() {
    log "Testing load balancing..."

    local service_with_replicas test_container task i
    local -a tasks=()

    # Pick the first service whose desired replica count (the number after
    # the slash) is greater than one.
    service_with_replicas=$(docker service ls --format '{{.Name}} {{.Replicas}}' |
        awk '{ split($2, r, "/"); if (r[2] + 0 > 1) { print $1; exit } }')

    if [[ -z "$service_with_replicas" ]]; then
        log "No services with multiple replicas found"
        return 1
    fi

    log "Testing load balancing for service: $service_with_replicas"

    # Running tasks of that service
    while IFS= read -r task; do
        [[ -n "$task" ]] && tasks+=("$task")
    done < <(docker service ps "$service_with_replicas" \
        --filter "desired-state=running" --format '{{.Name}}')

    log "Service tasks: ${tasks[*]}"

    # Container to send the requests from
    test_container=$(docker ps --format '{{.Names}}' | head -1)

    if [[ -z "$test_container" ]]; then
        log "No test container available"
        return 1
    fi

    log "Performing multiple requests to test load balancing..."

    for i in {1..10}; do
        # Assumes the service exposes an HTTP endpoint; adjust for other
        # service types. Success is judged by wget's exit status, because
        # an error message in the output must not count as a response.
        if docker exec "$test_container" wget -qO- --timeout=2 \
            "http://$service_with_replicas" >/dev/null 2>&1; then
            log "  Request $i: Success"
        else
            log "  Request $i: Failed"
        fi

        sleep 0.5
    done
}

# Measure latency (ping) and, when iperf3 is installed in both containers,
# bandwidth between the first two running containers.
# Returns: 1 when fewer than two containers are running.
test_network_performance() {
    log "Testing network performance..."

    local -a containers=()
    local name source_container target_container target_ip
    local ping_stats bandwidth_result

    # First two running containers
    while IFS= read -r name; do
        [[ -n "$name" ]] && containers+=("$name")
    done < <(docker ps --format '{{.Names}}' | head -2)

    if (( ${#containers[@]} < 2 )); then
        log "Need at least 2 containers for performance testing"
        return 1
    fi

    source_container=${containers[0]}
    target_container=${containers[1]}

    log "Performance test: $source_container -> $target_container"

    # Print one address per line and take the first non-empty one, so that a
    # container on several networks does not yield concatenated addresses.
    target_ip=$(docker inspect "$target_container" \
        --format '{{range .NetworkSettings.Networks}}{{println .IPAddress}}{{end}}' |
        awk 'NF { print $1; exit }')

    if [[ -z "$target_ip" ]]; then
        log "Could not get target container IP"
        return 0
    fi

    # Latency test
    log "Testing latency..."
    ping_stats=$(docker exec "$source_container" ping -c 10 "$target_ip" 2>&1 | tail -n 1)
    log "  Ping statistics: $ping_stats"

    # Bandwidth test: the server runs in the target and the client in the
    # source, so iperf3 has to be present in both containers.
    if docker exec "$target_container" sh -c 'command -v iperf3' >/dev/null 2>&1 &&
        docker exec "$source_container" sh -c 'command -v iperf3' >/dev/null 2>&1; then
        log "Testing bandwidth with iperf3..."

        # -1 (one-off) makes the server exit after serving a single client.
        docker exec -d "$target_container" iperf3 -s -1
        sleep 2

        bandwidth_result=$(docker exec "$source_container" iperf3 -c "$target_ip" -t 5 2>&1 | grep "sender")
        log "  Bandwidth test result: $bandwidth_result"

        # Best-effort cleanup in case the client never connected; pkill may
        # be missing in the container or the server may already be gone.
        docker exec "$target_container" pkill iperf3 >/dev/null 2>&1 || true
    else
        log "  iperf3 not available for bandwidth testing"
    fi
}

# Log checks for common network problems: daemon network plugins, overlay
# encryption, listening ports, the DOCKER-USER firewall chain and MTUs.
diagnose_network_issues() {
    log "Diagnosing common network issues..."

    local network encrypted listening_ports interface mtu

    # Docker daemon network configuration
    log "Docker daemon network configuration:"
    docker info | grep -A 10 "Network:"

    # Overlay encryption is enabled by the mere presence of the "encrypted"
    # option (its value is usually empty), so test for the key rather than
    # printing its value.
    log "Checking Swarm network encryption..."
    while IFS= read -r network; do
        [[ -n "$network" ]] || continue
        encrypted=$(docker network inspect "$network" \
            --format '{{range $k, $v := .Options}}{{if eq $k "encrypted"}}yes{{end}}{{end}}')
        log "  Network $network encrypted: ${encrypted:-no}"
    done < <(docker network ls --filter "driver=overlay" --format '{{.Name}}')

    # Listening TCP sockets plus UDP sockets. Fall back to ss when netstat is
    # not installed.
    log "Checking for port conflicts..."
    listening_ports=$({ netstat -tuln 2>/dev/null || ss -tuln; } | grep -E 'LISTEN|^udp')
    log "Listening ports:"
    echo "$listening_ports"

    # Firewall rules that apply to Docker traffic
    log "Checking firewall rules affecting Docker..."
    iptables -L DOCKER-USER -n 2>/dev/null || log "DOCKER-USER chain not found"

    # MTU per interface, parsed in one pass. Names such as "veth1@if4" are
    # trimmed to "veth1".
    log "Checking MTU settings..."
    while read -r interface mtu; do
        [[ -n "$interface" ]] || continue
        log "  Interface $interface MTU: $mtu"
    done < <(ip -o link show | awk '{
        name = $2
        sub(/:$/, "", name)
        sub(/@.*$/, "", name)
        for (i = 3; i < NF; i++) {
            if ($i == "mtu") { print name, $(i + 1); break }
        }
    }')
}

# Interactive menu of network repair actions.
# Input: reads the menu choice, confirmations and values from stdin.
# WARNING: options 1, 3 and 4 restart the Docker daemon, and option 3 removes
# all rules from the iptables filter and nat tables.
fix_network_issues() {
    local choice confirm network ns interface mtu

    log "Network issue fix options:"

    echo "Available fix options:"
    echo "1. Restart Docker daemon"
    echo "2. Recreate overlay networks"
    echo "3. Flush iptables rules"
    echo "4. Reset network namespaces"
    echo "5. Update network MTU"
    echo "6. Exit without changes"

    read -r -p "Choose a fix option (1-6): " choice

    case "$choice" in
        1)
            log "Restarting Docker daemon..."
            if systemctl restart docker; then
                sleep 10
                log "Docker daemon restarted"
            else
                log "ERROR: failed to restart Docker daemon"
            fi
            ;;
        2)
            log "Recreating overlay networks..."
            # Handle with care: this can affect running services.
            echo "WARNING: This will affect running services"
            read -r -p "Continue? (yes/no): " confirm
            if [[ "$confirm" == "yes" ]]; then
                while IFS= read -r network; do
                    [[ -n "$network" && "$network" != "ingress" ]] || continue
                    # Safe recreation needs more logic (detach services,
                    # remove, create, reattach), so nothing is changed here
                    # and the message says so.
                    log "Network $network must be recreated manually (automatic recreation is not implemented)"
                done < <(docker network ls --filter "driver=overlay" --format '{{.Name}}')
            fi
            ;;
        3)
            log "Flushing iptables rules..."
            echo "WARNING: This will reset all iptables rules"
            read -r -p "Continue? (yes/no): " confirm
            if [[ "$confirm" == "yes" ]]; then
                iptables -F
                iptables -X
                iptables -t nat -F
                iptables -t nat -X
                # Restart Docker after the flush; report if that fails.
                if systemctl restart docker; then
                    log "Iptables rules flushed and Docker restarted"
                else
                    log "ERROR: iptables rules flushed but Docker failed to restart"
                fi
            fi
            ;;
        4)
            log "Resetting network namespaces..."
            echo "WARNING: This will affect all containers"
            read -r -p "Continue? (yes/no): " confirm
            if [[ "$confirm" == "yes" ]]; then
                # Delete every namespace reported by `ip netns list`
                while read -r ns _; do
                    [[ -n "$ns" ]] || continue
                    ip netns delete "$ns"
                done < <(ip netns list)
                if systemctl restart docker; then
                    log "Network namespaces reset and Docker restarted"
                else
                    log "ERROR: network namespaces reset but Docker failed to restart"
                fi
            fi
            ;;
        5)
            read -r -p "Enter interface name: " interface
            read -r -p "Enter new MTU value: " mtu
            if [[ -n "$interface" && -n "$mtu" ]]; then
                # Reject anything that is not a plain positive integer
                # before handing it to `ip`.
                if [[ "$mtu" =~ ^[0-9]+$ ]]; then
                    log "Setting MTU for $interface to $mtu"
                    ip link set dev "$interface" mtu "$mtu"
                else
                    log "Invalid MTU value: $mtu"
                fi
            fi
            ;;
        6)
            log "Exiting without changes"
            ;;
        *)
            log "Invalid choice"
            ;;
    esac
}

# Entry point: run every network diagnostic in order, then offer the
# interactive fixes.
main() {
    local diagnostic

    log "Starting comprehensive network diagnostics..."

    for diagnostic in \
        check_network_infrastructure \
        check_service_networks \
        test_container_connectivity \
        test_service_discovery \
        test_load_balancing \
        test_network_performance \
        diagnose_network_issues; do
        "$diagnostic"
    done

    echo
    read -p "Do you want to attempt network fixes? (yes/no): " fix_choice

    # Only the literal answer "yes" starts the fix menu.
    [ "$fix_choice" != "yes" ] || fix_network_issues

    log "Network diagnostics completed"
}

# Run the main function
main

5. 存储故障排除

5.1 卷挂载问题

存储诊断脚本

#!/bin/bash
# storage-diagnostics.sh

# File that every log() message is appended to.
LOG_FILE='/var/log/storage-diagnostics.log'

# Print a timestamped message to stdout and append it to the log file.
# Arguments: $1 - message text
# Globals:   LOG_FILE (read) - file the message is appended to
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a -- "$LOG_FILE"
}

# Print disk, inode and mount information plus the Docker storage driver and
# the size of the Docker root directory.
check_storage_infrastructure() {
    local docker_root

    log "Checking storage infrastructure..."

    # Disk space
    log "Disk space usage:"
    df -h

    # Inode usage
    log "Inode usage:"
    df -i

    # Docker-related mount points
    log "Mount points:"
    mount | grep -E "(docker|overlay|tmpfs)"

    # Storage driver
    log "Docker storage driver:"
    docker info | grep -A 10 "Storage Driver"

    # Ask the daemon for the root directory directly; cutting it out of the
    # human-readable output mangles paths containing spaces or colons.
    docker_root=$(docker info --format '{{.DockerRootDir}}')
    log "Docker root directory: $docker_root"

    if [[ -d "$docker_root" ]]; then
        log "Docker root directory size:"
        du -sh "$docker_root"
    fi
}

# List every Docker volume with its details, mountpoint size and permissions,
# then report dangling (unused) volumes.
check_docker_volumes() {
    local volume mountpoint dangling_volumes

    log "Checking Docker volumes..."

    # All volumes
    log "All Docker volumes:"
    docker volume ls

    # Per-volume details; names are read line by line instead of being
    # word-split out of the command output.
    while IFS= read -r volume; do
        [[ -n "$volume" ]] || continue
        log "Volume: $volume"
        docker volume inspect "$volume"

        # Mountpoint on the local filesystem
        mountpoint=$(docker volume inspect --format '{{.Mountpoint}}' "$volume")
        if [[ -d "$mountpoint" ]]; then
            log "  Mountpoint: $mountpoint"
            log "  Size: $(du -sh "$mountpoint" | cut -f1)"
            log "  Permissions: $(stat -c "%a %U:%G" "$mountpoint")"
        else
            log "  ERROR: Mountpoint not found: $mountpoint"
        fi
    done < <(docker volume ls --format '{{.Name}}')

    # Dangling volumes
    log "Checking for dangling volumes..."
    dangling_volumes=$(docker volume ls --filter "dangling=true" --format '{{.Name}}')
    if [[ -n "$dangling_volumes" ]]; then
        log "Dangling volumes found:"
        echo "$dangling_volumes"
    else
        log "No dangling volumes found"
    fi
}

# For every Swarm service, list the mounts configured in its container spec
# and check that each volume or bind source exists.
# NOTE: the existence checks run against the node executing this script, so
# a source that only exists on another node is reported as missing here.
# Outputs:   findings via log
check_service_volumes() {
    local service mounts mount_source mount_target mount_type

    log "Checking service volume mounts..."

    for service in $(docker service ls --format '{{.Name}}'); do
        log "Service: $service"

        # One mount per line with '|' between fields. Splitting on whitespace
        # and ':' (as a "src:dst:type" list would require) breaks as soon as
        # a bind path contains a space or a colon.
        mounts=$(docker service inspect "$service" \
            --format '{{range .Spec.TaskTemplate.ContainerSpec.Mounts}}{{.Source}}|{{.Target}}|{{println .Type}}{{end}}')

        if [ -z "$mounts" ]; then
            log "  No volume mounts configured"
            continue
        fi

        log "  Volume mounts: ${mounts//$'\n'/ }"

        # '|' is a non-whitespace IFS character, so an empty source field
        # (e.g. a tmpfs mount) stays an empty field instead of shifting the rest.
        while IFS='|' read -r mount_source mount_target mount_type; do
            [ -n "$mount_target" ] || continue

            log "    Mount: $mount_source -> $mount_target ($mount_type)"

            if [ "$mount_type" = "volume" ]; then
                if docker volume inspect "$mount_source" < /dev/null > /dev/null 2>&1; then
                    log "      ✓ Volume $mount_source exists"
                else
                    log "      ✗ Volume $mount_source does not exist"
                fi
            elif [ "$mount_type" = "bind" ]; then
                if [ -e "$mount_source" ]; then
                    log "      ✓ Bind source $mount_source exists"
                    log "      Permissions: $(stat -c "%a %U:%G" "$mount_source")"
                else
                    log "      ✗ Bind source $mount_source does not exist"
                fi
            fi
        done <<< "$mounts"
    done
}

# For every running container, verify from inside the container that each
# mount destination exists and, for read-write mounts, that it is writable.
# NOTE: the probes rely on `test`, `touch` and `rm` being available inside
# the container; images without them are reported as not accessible.
# Outputs:   findings via log
check_container_mounts() {
    local container mounts _mount_source dest mount_type rw probe

    log "Checking container mount status..."

    for container in $(docker ps --format '{{.Names}}'); do
        log "Container: $container"

        # One mount per line, '|' separated, so paths containing spaces or
        # colons are not torn apart.
        mounts=$(docker inspect "$container" \
            --format '{{range .Mounts}}{{.Source}}|{{.Destination}}|{{.Type}}|{{println .RW}}{{end}}')

        if [ -z "$mounts" ]; then
            log "  No mounts found"
            continue
        fi

        log "  Mounts: ${mounts//$'\n'/ }"

        while IFS='|' read -r _mount_source dest mount_type rw; do
            [ -n "$dest" ] || continue

            log "    Testing mount: $dest ($mount_type, $rw)"

            # -e instead of -d: a mount may be a single file or socket
            # (e.g. /var/run/docker.sock), which is still a valid mount.
            if ! docker exec "$container" test -e "$dest" < /dev/null 2> /dev/null; then
                log "      ✗ Mount point not accessible"
                continue
            fi
            log "      ✓ Mount point accessible"

            [ "$rw" = "true" ] || continue

            if docker exec "$container" test -d "$dest" < /dev/null 2> /dev/null; then
                # Directory mount: create and remove a probe file. The PID
                # suffix keeps the probe from clobbering an application file.
                probe="$dest/.test_write.$$"
                if docker exec "$container" touch "$probe" < /dev/null 2> /dev/null; then
                    docker exec "$container" rm -f "$probe" < /dev/null 2> /dev/null
                    log "      ✓ Write permission OK"
                else
                    log "      ✗ Write permission failed"
                fi
            elif docker exec "$container" test -w "$dest" < /dev/null 2> /dev/null; then
                # File mount: nothing can be created "inside" it, so check
                # the write bit on the file itself.
                log "      ✓ Write permission OK"
            else
                log "      ✗ Write permission failed"
            fi
        done <<< "$mounts"
    done
}

# Report disk I/O statistics and Docker disk usage, run a 100 MiB sequential
# write test, and print devicemapper pool details when that driver is in use.
# Outputs:   headers and results via log; raw command output on stdout
check_storage_performance() {
    local test_file dd_output storage_driver

    log "Checking storage performance..."

    log "Disk I/O statistics:"
    if command -v iostat > /dev/null; then
        iostat -x 1 3
    else
        log "iostat not available"
    fi

    log "Disk usage by Docker:"
    docker system df

    log "Testing disk write performance..."
    # mktemp gives an unpredictable name; a fixed /tmp name derived from the
    # clock can be pre-created or symlinked by another local user.
    # NOTE(review): /tmp is a tmpfs on many hosts, in which case this measures
    # memory rather than the disk Docker uses; confirm for your hosts.
    if test_file=$(mktemp /tmp/disk_test_XXXXXX); then
        # conv=fdatasync (GNU dd) flushes the data before dd reports its
        # rate; without it the figure mostly reflects the page cache.
        # LC_ALL=C keeps the summary line in a parseable form.
        if dd_output=$(LC_ALL=C dd if=/dev/zero of="$test_file" bs=1M count=100 conv=fdatasync 2>&1); then
            # dd picks the unit itself (kB/s, MB/s, GB/s), so accept them all.
            printf '%s\n' "$dd_output" | grep -oE '[0-9.]+ [kMGT]?B/s'
            log "Disk write test completed"
        else
            log "Disk write test failed"
        fi
        rm -f -- "$test_file"
    else
        log "Disk write test failed"
    fi

    # Pool statistics only exist for the devicemapper storage driver.
    storage_driver=$(docker info --format '{{.Driver}}')

    if [ "$storage_driver" = "devicemapper" ]; then
        log "Devicemapper storage pool status:"
        docker info | grep -A 10 "Pool"
    fi
}

# Interactive cleanup menu: show the available prune operations, read one
# selection and run the matching `docker ... prune -f` command.
# Globals:   choice (written) - the menu entry the operator typed
# Outputs:   menu on stdout; progress via log
cleanup_storage() {
    log "Storage cleanup options:"

    printf '%s\n' \
        "Available cleanup options:" \
        "1. Remove unused containers" \
        "2. Remove unused images" \
        "3. Remove unused volumes" \
        "4. Remove unused networks" \
        "5. System prune (all unused objects)" \
        "6. Exit without cleanup"

    read -p "Choose a cleanup option (1-6): " choice

    if [ "$choice" = "1" ]; then
        log "Removing unused containers..."
        docker container prune -f
    elif [ "$choice" = "2" ]; then
        log "Removing unused images..."
        docker image prune -f
    elif [ "$choice" = "3" ]; then
        log "Removing unused volumes..."
        docker volume prune -f
    elif [ "$choice" = "4" ]; then
        log "Removing unused networks..."
        docker network prune -f
    elif [ "$choice" = "5" ]; then
        log "Performing system prune..."
        docker system prune -f
    elif [ "$choice" = "6" ]; then
        log "Exiting without cleanup"
    else
        # Anything outside 1-6, including empty input, is rejected.
        log "Invalid choice"
    fi
}

# Interactive repair menu for storage problems. Reads one selection and runs
# the matching repair; destructive options ask for an explicit "yes" first.
# Outputs:   menu and warnings on stdout; progress and results via log
fix_storage_issues() {
    local choice volume_name confirm mountpoint docker_root

    log "Storage issue fix options:"

    echo "Available fix options:"
    echo "1. Fix volume permissions"
    echo "2. Recreate problematic volumes"
    echo "3. Restart Docker daemon"
    echo "4. Clean up storage space"
    echo "5. Reset Docker storage"
    echo "6. Exit without changes"

    read -r -p "Choose a fix option (1-6): " choice

    case "$choice" in
        1)
            read -r -p "Enter volume name: " volume_name
            if [ -n "$volume_name" ]; then
                mountpoint=$(docker volume inspect "$volume_name" --format '{{.Mountpoint}}' 2> /dev/null)
                if [ -n "$mountpoint" ] && [ -d "$mountpoint" ]; then
                    log "Fixing permissions for volume: $volume_name"
                    # NOTE(review): root:root with mode 755 is a blunt reset;
                    # services that run as a non-root user may need a
                    # different owner afterwards.
                    if sudo chown -R root:root "$mountpoint" \
                        && sudo chmod -R 755 "$mountpoint"; then
                        log "Permissions fixed"
                    else
                        log "Failed to fix permissions for volume: $volume_name"
                    fi
                else
                    log "Volume not found or mountpoint not accessible"
                fi
            fi
            ;;
        2)
            read -r -p "Enter volume name to recreate: " volume_name
            if [ -n "$volume_name" ]; then
                echo "WARNING: This will delete all data in the volume"
                read -r -p "Continue? (yes/no): " confirm
                if [ "$confirm" = "yes" ]; then
                    log "Recreating volume: $volume_name"
                    # Only report success when the removal really happened;
                    # `docker volume rm` can fail (e.g. the volume is in use).
                    if ! docker volume rm "$volume_name"; then
                        log "Failed to remove volume: $volume_name"
                    elif docker volume create "$volume_name"; then
                        log "Volume recreated"
                    else
                        log "Failed to create volume: $volume_name"
                    fi
                fi
            fi
            ;;
        3)
            log "Restarting Docker daemon..."
            if systemctl restart docker; then
                # Give the daemon time to come back up.
                sleep 10
                log "Docker daemon restarted"
            else
                log "Failed to restart Docker daemon"
            fi
            ;;
        4)
            cleanup_storage
            ;;
        5)
            echo "WARNING: This will remove all Docker data"
            read -r -p "Continue? (yes/no): " confirm
            if [ "$confirm" = "yes" ]; then
                # Ask the daemon where its data lives while it is still
                # running; fall back to the default location otherwise.
                docker_root=$(docker info --format '{{.DockerRootDir}}' 2> /dev/null)
                docker_root=${docker_root:-/var/lib/docker}

                log "Resetting Docker storage..."
                if [ "$docker_root" = "/" ]; then
                    log "Refusing to reset: Docker root directory resolves to /"
                elif ! systemctl stop docker; then
                    # Never delete data underneath a daemon that is still running.
                    log "Failed to stop Docker daemon; storage left untouched"
                else
                    # ${var:?} aborts instead of expanding to "/*" should the
                    # variable ever be empty.
                    rm -rf -- "${docker_root:?}"/*
                    systemctl start docker
                    log "Docker storage reset completed"
                fi
            fi
            ;;
        6)
            log "Exiting without changes"
            ;;
        *)
            log "Invalid choice"
            ;;
    esac
}

# Entry point: run every storage check in order, then optionally open the
# interactive repair menu.
# Globals:   fix_choice (written) - the operator's answer to the repair prompt
# Outputs:   progress messages via log; a yes/no prompt
main() {
    local _stage

    log "Starting storage diagnostics..."

    for _stage in \
        check_storage_infrastructure \
        check_docker_volumes \
        check_service_volumes \
        check_container_mounts \
        check_storage_performance; do
        "$_stage"
    done

    echo
    read -p "Do you want to fix storage issues? (yes/no): " fix_choice

    # Only the literal answer "yes" opens the repair menu.
    case "$fix_choice" in
        yes) fix_storage_issues ;;
    esac

    log "Storage diagnostics completed"
}

# Run the diagnostics when the script is executed.
main

6. 性能问题诊断

6.1 性能监控和分析

性能诊断脚本

#!/bin/bash
# performance-diagnostics.sh
#
# Collects system, Docker, container, service and network metrics into
# timestamped report files and writes an HTML summary plus recommendations.

# Every message passed to log() is appended to this file.
LOG_FILE="/var/log/performance-diagnostics.log"
# Directory that receives all report files of a run.
REPORT_DIR="/var/log/performance-reports"
# Shared suffix so that all files produced by one run can be matched up.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Create the report directory (no error if it already exists).
mkdir -p $REPORT_DIR

# Print a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read) - path of the file the message is appended to
# Arguments: $1 - message text
# Outputs:   "<YYYY-MM-DD HH:MM:SS> - <message>" on stdout and in LOG_FILE
log() {
    # LOG_FILE is quoted so a path containing spaces stays a single tee target.
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Write a snapshot of host metrics (CPU, memory, load, disk, network,
# top processes) to system-metrics-<TIMESTAMP>.txt in the report directory.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the report file; progress via log
collect_system_metrics() {
    log "Collecting system performance metrics..."

    local out_path="$REPORT_DIR/system-metrics-$TIMESTAMP.txt"

    {
        printf '%s\n' "=== System Performance Report ==="
        printf 'Timestamp: %s\n' "$(date)"
        printf 'Hostname: %s\n\n' "$(hostname)"

        printf '%s\n' "=== CPU Information ==="
        lscpu
        echo

        printf '%s\n' "=== CPU Usage ==="
        top -bn1 | head -n 20
        echo

        printf '%s\n' "=== Memory Usage ==="
        free -h
        echo
        head -n 20 /proc/meminfo
        echo

        printf '%s\n' "=== Load Average ==="
        uptime
        echo
        cat /proc/loadavg
        echo

        printf '%s\n' "=== Disk Usage ==="
        df -h
        echo

        printf '%s\n' "=== Disk I/O ==="
        # iostat is optional; note its absence in the report instead of failing.
        if ! command -v iostat > /dev/null; then
            echo "iostat not available"
        else
            iostat -x 1 3
        fi
        echo

        printf '%s\n' "=== Network Statistics ==="
        cat /proc/net/dev
        echo

        printf '%s\n' "=== Process List ==="
        ps aux --sort=-%cpu | head -n 20
        echo
    } > "$out_path"

    log "System metrics saved to: $out_path"
}

# Write Docker-level metrics (disk usage, recent events, container stats,
# service tasks, images, volume sizes) to docker-metrics-<TIMESTAMP>.txt.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the report file; progress via log
collect_docker_metrics() {
    local report_file service volume mountpoint size

    log "Collecting Docker performance metrics..."

    report_file="$REPORT_DIR/docker-metrics-$TIMESTAMP.txt"

    {
        echo "=== Docker Performance Report ==="
        echo "Timestamp: $(date)"
        echo

        echo "=== Docker System Info ==="
        docker system df
        echo

        echo "=== Docker Events (last 1 hour) ==="
        # --until is required, otherwise `docker events` keeps streaming.
        # It takes a timestamp or a duration; the word "now" is neither, so
        # pass the current time as a Unix timestamp.
        docker events --since "1h" --until "$(date +%s)" | tail -20
        echo

        echo "=== Container Statistics ==="
        timeout 10 docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}"
        echo

        echo "=== Service Statistics ==="
        for service in $(docker service ls --format '{{.Name}}'); do
            echo "--- Service: $service ---"
            docker service ps "$service"
            echo
        done

        echo "=== Image Statistics ==="
        docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}"
        echo

        echo "=== Volume Statistics ==="
        for volume in $(docker volume ls --format '{{.Name}}'); do
            mountpoint=$(docker volume inspect "$volume" --format '{{.Mountpoint}}')
            # The status of `du | cut` is cut's, so a `|| echo N/A` fallback
            # never runs; substitute the placeholder when the size is empty.
            size=$(du -sh "$mountpoint" 2> /dev/null | cut -f1)
            echo "Volume: $volume, Size: ${size:-N/A}"
        done
        echo
    } > "$report_file"

    log "Docker metrics saved to: $report_file"
}

# For every running container, record image/state, configured CPU and memory
# limits, a stats sample, and process, socket and disk listings taken from
# inside the container, in container-analysis-<TIMESTAMP>.txt.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the report file; progress via log
analyze_container_performance() {
    local report_file container cpu_limit memory_limit probe_output

    log "Analyzing container performance..."

    report_file="$REPORT_DIR/container-analysis-$TIMESTAMP.txt"

    {
        echo "=== Container Performance Analysis ==="
        echo "Timestamp: $(date)"
        echo

        for container in $(docker ps --format '{{.Names}}'); do
            echo "--- Container: $container ---"

            echo "Container Info:"
            docker inspect "$container" --format '{{.Config.Image}} {{.State.Status}} {{.State.StartedAt}}'
            echo

            echo "Resource Limits:"
            cpu_limit=$(docker inspect "$container" --format '{{.HostConfig.CpuQuota}}')
            memory_limit=$(docker inspect "$container" --format '{{.HostConfig.Memory}}')
            echo "  CPU Quota: $cpu_limit"
            echo "  Memory Limit: $memory_limit"
            echo

            echo "Current Stats:"
            timeout 5 docker stats --no-stream "$container"
            echo

            # Capture first, then truncate. In `docker exec ... | head || echo`
            # the status tested is head's, so the fallback message would never
            # appear even when the exec itself fails.
            echo "Processes:"
            if probe_output=$(docker exec "$container" ps aux 2> /dev/null); then
                printf '%s\n' "$probe_output" | head -10
            else
                echo "Cannot access container processes"
            fi
            echo

            echo "Network Connections:"
            if probe_output=$(docker exec "$container" netstat -tuln 2> /dev/null); then
                printf '%s\n' "$probe_output" | head -10
            else
                echo "Cannot access network info"
            fi
            echo

            echo "Disk Usage:"
            docker exec "$container" df -h 2> /dev/null || echo "Cannot access disk info"
            echo

            echo "======================================"
            echo
        done
    } > "$report_file"

    log "Container analysis saved to: $report_file"
}

# For every Swarm service, record its listing, task distribution, resource
# limits/reservations, last update status and recent error log lines in
# service-analysis-<TIMESTAMP>.txt.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the report file; progress via log
analyze_service_performance() {
    local report_file service
    local cpu_limit memory_limit cpu_reservation memory_reservation
    # `with` guards skip the field when Resources, Limits or Reservations is
    # unset, instead of failing with a nil-pointer template error.
    local limits_tpl='{{with .Spec.TaskTemplate.Resources}}{{with .Limits}}'
    local reserv_tpl='{{with .Spec.TaskTemplate.Resources}}{{with .Reservations}}'

    log "Analyzing service performance..."

    report_file="$REPORT_DIR/service-analysis-$TIMESTAMP.txt"

    {
        echo "=== Service Performance Analysis ==="
        echo "Timestamp: $(date)"
        echo

        for service in $(docker service ls --format '{{.Name}}'); do
            echo "--- Service: $service ---"

            # NOTE: the name filter is a prefix match, so services whose names
            # start with "$service" are listed here as well.
            echo "Service Info:"
            docker service ls --filter "name=$service" --format "table {{.Name}}\t{{.Mode}}\t{{.Replicas}}\t{{.Image}}"
            echo

            echo "Task Distribution:"
            docker service ps "$service" --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Error}}"
            echo

            echo "Resource Configuration:"
            cpu_limit=$(docker service inspect "$service" --format "${limits_tpl}{{.NanoCPUs}}{{end}}{{end}}")
            memory_limit=$(docker service inspect "$service" --format "${limits_tpl}{{.MemoryBytes}}{{end}}{{end}}")
            cpu_reservation=$(docker service inspect "$service" --format "${reserv_tpl}{{.NanoCPUs}}{{end}}{{end}}")
            memory_reservation=$(docker service inspect "$service" --format "${reserv_tpl}{{.MemoryBytes}}{{end}}{{end}}")

            echo "  CPU Limit: ${cpu_limit:-not set}"
            echo "  Memory Limit: ${memory_limit:-not set}"
            echo "  CPU Reservation: ${cpu_reservation:-not set}"
            echo "  Memory Reservation: ${memory_reservation:-not set}"
            echo

            # UpdateStatus describes only the most recent update and has no
            # History list to range over; it is absent for a service that was
            # never updated, hence the `with` guard.
            echo "Update History:"
            docker service inspect "$service" \
                --format '{{with .UpdateStatus}}{{.State}} {{.StartedAt}} {{.Message}}{{end}}' | tail -5
            echo

            echo "Recent Errors in Logs:"
            docker service logs --tail 20 "$service" 2>&1 | grep -i error | tail -5
            echo

            echo "======================================"
            echo
        done
    } > "$report_file"

    log "Service analysis saved to: $report_file"
}

# Record host interfaces, interface counters, listening sockets and details
# of every Docker network in network-analysis-<TIMESTAMP>.txt.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the report file; progress via log
analyze_network_performance() {
    local report_file network encrypted containers
    # Encryption is requested with `--opt encrypted`, which stores the key
    # with an empty value. Test for the key's presence, not for "true".
    local encrypted_tpl='{{range $k, $v := .Options}}{{if eq $k "encrypted"}}true{{end}}{{end}}'

    log "Analyzing network performance..."

    report_file="$REPORT_DIR/network-analysis-$TIMESTAMP.txt"

    {
        echo "=== Network Performance Analysis ==="
        echo "Timestamp: $(date)"
        echo

        echo "=== Network Interfaces ==="
        ip addr show
        echo

        echo "=== Network Statistics ==="
        cat /proc/net/dev
        echo

        echo "=== Network Connections ==="
        ss -tuln | head -20
        echo

        echo "=== Docker Networks ==="
        for network in $(docker network ls --format '{{.Name}}'); do
            echo "--- Network: $network ---"
            docker network inspect "$network" --format '{{.Driver}} {{.Scope}} {{.IPAM.Config}}'
            echo
        done

        echo "=== Overlay Network Performance ==="
        for network in $(docker network ls --filter "driver=overlay" --format '{{.Name}}'); do
            echo "Network: $network"
            encrypted=$(docker network inspect "$network" --format "$encrypted_tpl")
            echo "  Encrypted: ${encrypted:-false}"

            # NOTE(review): inspect is expected to list only containers on the
            # node where this runs, not cluster-wide; confirm before relying on it.
            containers=$(docker network inspect "$network" --format '{{range .Containers}}{{.Name}} {{end}}')
            echo "  Connected containers: $containers"
            echo
        done
    } > "$report_file"

    log "Network analysis saved to: $report_file"
}

# Print the CSS class for a usage percentage: "critical", "warning" or "good".
# Arguments: $1 - value (may be fractional or empty)
#            $2 - warning threshold, $3 - critical threshold
_perf_usage_class() {
    # awk handles fractional values, so no dependency on bc; "+ 0" forces a
    # numeric comparison even when the value is empty.
    awk -v v="$1" -v warn="$2" -v crit="$3" 'BEGIN {
        if (v + 0 > crit + 0) print "critical"
        else if (v + 0 > warn + 0) print "warning"
        else print "good"
    }'
}

# Build performance-summary-<TIMESTAMP>.html: colour-coded CPU, memory and
# disk usage, a health row per service, and links to the detailed reports.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the HTML file; progress via log
generate_performance_report() {
    local summary_report cpu_usage mem_usage disk_usage
    local cpu_class mem_class disk_class
    local service replicas running desired status_class status_text _rest

    log "Generating comprehensive performance report..."

    summary_report="$REPORT_DIR/performance-summary-$TIMESTAMP.html"

    # The delimiter is unquoted on purpose: $(date) and $(hostname) must be
    # expanded. The markup below holds no other shell-special characters.
    cat > "$summary_report" << EOF
<!DOCTYPE html>
<html>
<head>
    <title>Docker Swarm Performance Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .header { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
        .section { margin: 20px 0; border: 1px solid #ddd; padding: 10px; border-radius: 5px; }
        .metric { display: inline-block; margin: 10px; padding: 10px; border: 1px solid #ccc; border-radius: 5px; }
        .good { background-color: #e8f5e8; }
        .warning { background-color: #fff3e0; }
        .critical { background-color: #ffebee; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
    </style>
</head>
<body>
    <div class="header">
        <h1>Docker Swarm Performance Report</h1>
        <p>Generated: $(date)</p>
        <p>Hostname: $(hostname)</p>
    </div>
EOF

    # First figure on top's "Cpu(s)" line. Stripping the label first copes
    # with both "%Cpu(s):  1.2 us," and "Cpu(s):  1.2%us,", and with top
    # gluing the number to the label. Every metric falls back to 0 so that
    # a missing tool cannot break the comparisons below.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '/Cpu\(s\)/ { sub(/^.*Cpu\(s\):[ \t]*/, ""); print $1 + 0; exit }')
    cpu_usage=${cpu_usage:-0}

    mem_usage=$(free | awk 'NR == 2 && $2 > 0 { printf "%.1f", $3 * 100 / $2 }')
    mem_usage=${mem_usage:-0}

    # -P keeps each filesystem on one line even for long device names.
    disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
    disk_usage=${disk_usage:-0}

    cpu_class=$(_perf_usage_class "$cpu_usage" 60 80)
    mem_class=$(_perf_usage_class "$mem_usage" 80 90)
    disk_class=$(_perf_usage_class "$disk_usage" 80 90)

    {
        echo "    <div class='section'>"
        echo "        <h2>System Overview</h2>"
        echo "        <div class='metric $cpu_class'>CPU Usage: ${cpu_usage}%</div>"
        echo "        <div class='metric $mem_class'>Memory Usage: ${mem_usage}%</div>"
        echo "        <div class='metric $disk_class'>Disk Usage: ${disk_usage}%</div>"
        echo "    </div>"

        echo "    <div class='section'>"
        echo "        <h2>Service Status</h2>"
        echo "        <table>"
        echo "            <tr><th>Service</th><th>Replicas</th><th>Status</th></tr>"

        # Read name and replica count from one listing. A per-service
        # `--filter name=` lookup is a prefix match and returns several rows
        # when names share a prefix. Trailing text such as "(max 1 per node)"
        # lands in _rest.
        while read -r service replicas _rest; do
            [ -n "$service" ] || continue
            running=${replicas%%/*}
            desired=${replicas#*/}

            if [ "$running" = "$desired" ]; then
                status_class="good"
                status_text="Healthy"
            else
                status_class="warning"
                status_text="Degraded"
            fi

            echo "            <tr class='$status_class'><td>$service</td><td>$replicas</td><td>$status_text</td></tr>"
        done < <(docker service ls --format '{{.Name}} {{.Replicas}}')

        echo "        </table>"
        echo "    </div>"

        echo "    <div class='section'>"
        echo "        <h2>Detailed Reports</h2>"
        echo "        <ul>"
        echo "            <li><a href='system-metrics-$TIMESTAMP.txt'>System Metrics</a></li>"
        echo "            <li><a href='docker-metrics-$TIMESTAMP.txt'>Docker Metrics</a></li>"
        echo "            <li><a href='container-analysis-$TIMESTAMP.txt'>Container Analysis</a></li>"
        echo "            <li><a href='service-analysis-$TIMESTAMP.txt'>Service Analysis</a></li>"
        echo "            <li><a href='network-analysis-$TIMESTAMP.txt'>Network Analysis</a></li>"
        echo "        </ul>"
        echo "    </div>"

        echo "</body></html>"
    } >> "$summary_report"

    log "Performance summary report generated: $summary_report"
}

# Succeed when a (possibly fractional or empty) value exceeds a threshold.
# Arguments: $1 - value, $2 - threshold
_perf_exceeds() {
    # awk instead of bc: handles fractions, is always installed, and treats
    # an empty value as 0 instead of raising a syntax error.
    awk -v v="$1" -v limit="$2" 'BEGIN { if (v + 0 > limit + 0) exit 0; exit 1 }'
}

# Write tuning advice for CPU, memory, disk, services and overlay networks,
# based on current usage, to optimization-recommendations-<TIMESTAMP>.txt.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   the recommendations file; progress via log
generate_optimization_recommendations() {
    local recommendations_file cpu_usage mem_usage disk_usage
    local service replicas running desired _rest
    local overlay_names overlay_networks network encrypted

    log "Generating optimization recommendations..."

    recommendations_file="$REPORT_DIR/optimization-recommendations-$TIMESTAMP.txt"

    {
        echo "=== Performance Optimization Recommendations ==="
        echo "Generated: $(date)"
        echo

        # First figure on top's "Cpu(s)" line; stripping the label first
        # copes with both "%Cpu(s):  1.2 us," and "Cpu(s):  1.2%us,".
        cpu_usage=$(LC_ALL=C top -bn1 | awk '/Cpu\(s\)/ { sub(/^.*Cpu\(s\):[ \t]*/, ""); print $1 + 0; exit }')
        cpu_usage=${cpu_usage:-0}
        echo "=== CPU Optimization ==="
        echo "Current CPU usage: ${cpu_usage}%"

        if _perf_exceeds "$cpu_usage" 80; then
            echo "CRITICAL: High CPU usage detected"
            echo "Recommendations:"
            echo "  - Scale out services to distribute load"
            echo "  - Optimize application code for better CPU efficiency"
            echo "  - Consider adding more nodes to the cluster"
            echo "  - Review and optimize resource limits"
        elif _perf_exceeds "$cpu_usage" 60; then
            echo "WARNING: Moderate CPU usage"
            echo "Recommendations:"
            echo "  - Monitor trends and prepare for scaling"
            echo "  - Review service resource requests"
        else
            echo "OK: CPU usage is within acceptable range"
        fi
        echo

        mem_usage=$(free | awk 'NR == 2 && $2 > 0 { printf "%.1f", $3 * 100 / $2 }')
        mem_usage=${mem_usage:-0}
        echo "=== Memory Optimization ==="
        echo "Current memory usage: ${mem_usage}%"

        if _perf_exceeds "$mem_usage" 90; then
            echo "CRITICAL: High memory usage detected"
            echo "Recommendations:"
            echo "  - Immediately scale out memory-intensive services"
            echo "  - Review and optimize memory limits"
            echo "  - Check for memory leaks in applications"
            echo "  - Consider adding more memory or nodes"
        elif _perf_exceeds "$mem_usage" 80; then
            echo "WARNING: High memory usage"
            echo "Recommendations:"
            echo "  - Monitor memory trends closely"
            echo "  - Optimize application memory usage"
            echo "  - Review service memory limits"
        else
            echo "OK: Memory usage is within acceptable range"
        fi
        echo

        # -P keeps each filesystem on one line even for long device names.
        disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
        disk_usage=${disk_usage:-0}
        echo "=== Disk Optimization ==="
        echo "Current disk usage: ${disk_usage}%"

        if _perf_exceeds "$disk_usage" 90; then
            echo "CRITICAL: High disk usage detected"
            echo "Recommendations:"
            echo "  - Immediately clean up unused Docker objects"
            echo "  - Implement log rotation"
            echo "  - Move data to external storage"
            echo "  - Add more disk space"
        elif _perf_exceeds "$disk_usage" 80; then
            echo "WARNING: High disk usage"
            echo "Recommendations:"
            echo "  - Schedule regular cleanup tasks"
            echo "  - Monitor disk usage trends"
            echo "  - Optimize log retention policies"
        else
            echo "OK: Disk usage is within acceptable range"
        fi
        echo

        echo "=== Service Optimization ==="
        # Read name and replica count from one listing. A per-service
        # `--filter name=` lookup is a prefix match and returns several rows
        # when names share a prefix. Trailing text such as "(max 1 per node)"
        # lands in _rest.
        while read -r service replicas _rest; do
            [ -n "$service" ] || continue
            running=${replicas%%/*}
            desired=${replicas#*/}

            echo "Service: $service"
            echo "  Current replicas: $replicas"

            if [ "$running" != "$desired" ]; then
                echo "  WARNING: Service is not at desired replica count"
                echo "  Recommendations:"
                echo "    - Check service logs for errors"
                echo "    - Verify resource availability"
                echo "    - Check placement constraints"
            else
                echo "  OK: Service is healthy"
            fi
            echo
        done < <(docker service ls --format '{{.Name}} {{.Replicas}}')

        echo "=== Network Optimization ==="
        overlay_names=$(docker network ls --filter "driver=overlay" --format '{{.Name}}')
        overlay_networks=$(printf '%s' "$overlay_names" | grep -c .)
        echo "Overlay networks: $overlay_networks"

        if [ "$overlay_networks" -gt 10 ]; then
            echo "WARNING: Many overlay networks detected"
            echo "Recommendations:"
            echo "  - Consolidate networks where possible"
            echo "  - Remove unused networks"
        fi

        for network in $overlay_names; do
            # `--opt encrypted` stores the key with an empty value, so test
            # for the key's presence rather than for the value "true".
            encrypted=$(docker network inspect "$network" \
                --format '{{range $k, $v := .Options}}{{if eq $k "encrypted"}}true{{end}}{{end}}')
            if [ "$encrypted" = "true" ]; then
                echo "Network $network is encrypted (may impact performance)"
                echo "  Consider: Disable encryption if not required for security"
            fi
        done
        echo
    } > "$recommendations_file"

    log "Optimization recommendations saved to: $recommendations_file"
}

# Entry point: run every collection and analysis stage in order, then tell
# the operator where the reports were written.
# Globals:   REPORT_DIR, TIMESTAMP (read)
# Outputs:   progress via log; a closing summary on stdout
main() {
    local _phase

    log "Starting comprehensive performance diagnostics..."

    for _phase in \
        collect_system_metrics \
        collect_docker_metrics \
        analyze_container_performance \
        analyze_service_performance \
        analyze_network_performance \
        generate_performance_report \
        generate_optimization_recommendations; do
        "$_phase"
    done

    log "Performance diagnostics completed"
    log "Reports saved to: $REPORT_DIR"

    printf '\n%s\n' "Performance diagnostics completed!"
    printf 'Reports available in: %s\n' "$REPORT_DIR"
    printf 'Summary report: %s\n' "$REPORT_DIR/performance-summary-$TIMESTAMP.html"
}

# Run the diagnostics when the script is executed.
main

7. 实践练习

练习 1:集群故障模拟与恢复

目标

模拟各种集群故障场景,练习故障诊断和恢复技能。

步骤

1. **准备测试环境**
```bash
# 创建测试服务
docker service create --name web-test --replicas 3 nginx
docker service create --name db-test --replicas 1 postgres:13

# 验证服务状态
docker service ls
docker service ps web-test
```

2. **模拟节点故障**
```bash
# 在工作节点上停止 Docker 服务
sudo systemctl stop docker

# 观察集群反应
docker node ls
docker service ps web-test

# 使用故障诊断脚本
./cluster-health-check.sh
```

3. **模拟网络分区**
```bash
# 使用 iptables 模拟网络分区(将 <MANAGER_IP> 替换为管理节点的 IP 地址)
sudo iptables -A INPUT -s <MANAGER_IP> -j DROP
sudo iptables -A OUTPUT -d <MANAGER_IP> -j DROP

# 观察集群状态
docker node ls

# 恢复网络
sudo iptables -D INPUT -s <MANAGER_IP> -j DROP
sudo iptables -D OUTPUT -d <MANAGER_IP> -j DROP
```

4. **模拟服务故障**
```bash
# 更新服务到错误的镜像
docker service update --image nginx:nonexistent web-test

# 使用服务诊断脚本
./service-diagnostics.sh web-test

# 回滚服务
docker service rollback web-test
```

练习 2:网络故障排除

目标

诊断和解决网络连通性问题。

步骤

1. **创建网络测试环境**
```bash
# 创建自定义网络
docker network create --driver overlay test-network

# 部署测试服务
docker service create --name client-test --network test-network alpine sleep 3600
docker service create --name server-test --network test-network nginx
```

2. **测试网络连通性**
```bash
# 运行网络诊断脚本
./network-diagnostics.sh

# 手动测试连通性
docker exec $(docker ps --filter "name=client-test" --format "{{.ID}}") ping server-test
```

3. **模拟网络问题**
```bash
# 删除网络(模拟网络配置错误)
docker network rm test-network

# 观察服务状态
docker service ps client-test
docker service ps server-test

# 重新创建网络并更新服务
docker network create --driver overlay test-network
docker service update --network-add test-network client-test
docker service update --network-add test-network server-test
```


练习 3:性能问题诊断

目标

识别和解决性能瓶颈。

步骤

1. **创建高负载场景**
```bash
# 部署 CPU 密集型服务
docker service create --name cpu-stress --replicas 5 \
  --limit-cpu 0.5 --reserve-cpu 0.2 \
  progrium/stress --cpu 2 --timeout 300s

# 部署内存密集型服务
docker service create --name mem-stress --replicas 2 \
  --limit-memory 512m --reserve-memory 256m \
  progrium/stress --vm 1 --vm-bytes 400m --timeout 300s
```

2. **运行性能诊断**
```bash
# 执行性能诊断脚本
./performance-diagnostics.sh

# 监控资源使用
watch -n 2 'docker stats --no-stream'
```

3. **分析和优化**
```bash
# 查看性能报告
ls /var/log/performance-reports/

# 根据建议进行优化
docker service scale cpu-stress=2  # 减少副本数
docker service update --limit-cpu 0.3 cpu-stress  # 降低 CPU 限制
```

8. 本章总结

关键要点

  1. 故障诊断方法论

    • 系统化的故障诊断流程
    • 从症状到根因的分析方法
    • 预防性维护的重要性
  2. 集群故障处理

    • 节点故障的识别和恢复
    • 管理节点选举问题
    • 证书和认证问题解决
  3. 服务故障排除

    • 服务启动失败的常见原因
    • 健康检查配置和调试
    • 资源约束和调度问题
  4. 网络故障诊断

    • 网络连通性测试
    • 服务发现问题
    • 负载均衡验证
  5. 存储故障处理

    • 卷挂载问题诊断
    • 存储空间管理
    • 权限和访问问题
  6. 性能问题分析

    • 系统性能监控
    • 资源使用分析
    • 性能优化建议

最佳实践

  1. 预防性维护

    • 定期运行健康检查脚本
    • 监控关键性能指标
    • 及时更新和打补丁
  2. 故障响应

    • 建立标准化的故障响应流程
    • 维护详细的故障处理文档
    • 定期进行故障演练
  3. 监控和告警

    • 实施全面的监控策略
    • 设置合理的告警阈值
    • 自动化常见问题的处理
  4. 文档和知识管理

    • 记录所有故障和解决方案
    • 建立知识库和最佳实践
    • 定期回顾和更新流程
  5. 工具和自动化

    • 开发和维护诊断脚本
    • 自动化重复性任务
    • 集成监控和告警系统

下一步学习

在掌握了故障排除和调试技巧后,建议继续学习:

  1. 高级监控和可观测性

    • 分布式追踪
    • 日志聚合和分析
    • 自定义指标收集
  2. 自动化运维

    • Infrastructure as Code
    • CI/CD 集成
    • 自动化部署和回滚
  3. 灾难恢复

    • 备份和恢复策略
    • 多区域部署
    • 业务连续性规划

通过本章的学习,您应该能够有效地诊断和解决 Docker Swarm 集群中的各种问题,确保集群的稳定运行和最佳性能。

2. 集群故障排除

2.1 节点故障处理

节点离线处理

#!/bin/bash
# node-recovery.sh
#
# Helpers for recovering a Swarm node that has dropped out of the cluster.

# Name of the node to operate on, taken from the first command-line argument.
NODE_NAME=$1
# Every message passed to log() is appended to this file.
LOG_FILE="/var/log/node-recovery.log"

# Print a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read) - path of the file the message is appended to
# Arguments: $1 - message text
# Outputs:   "<YYYY-MM-DD HH:MM:SS> - <message>" on stdout and in LOG_FILE
log() {
    # LOG_FILE is quoted so a path containing spaces stays a single tee target.
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Look up a node by hostname and print its status and availability.
# Arguments: $1 - node hostname
# Outputs:   "Node Status: ..." and "Node Availability: ..." on stdout
# Returns:   1 when the node is Down or is not part of the cluster,
#            0 otherwise
check_node_status() {
    local node=$1
    local node_row status availability

    log "Checking status of node: $node"

    # `--filter name=` is a prefix match ("node1" also selects "node10"),
    # so list all nodes and pick the row whose hostname matches exactly.
    # Appending "" forces awk to compare the names as strings.
    node_row=$(docker node ls --format '{{.Hostname}} {{.Status}} {{.Availability}}' \
        | awk -v n="$node" '($1 "") == (n "") { print $2, $3; exit }')
    status=${node_row%% *}
    availability=${node_row#* }

    echo "Node Status: $status"
    echo "Node Availability: $availability"

    # An empty status means the node is unknown to the cluster; treating
    # that as healthy would let callers report a successful recovery.
    if [ -z "$status" ]; then
        log "Node $node not found in cluster"
        return 1
    fi

    if [ "$status" = "Down" ]; then
        return 1
    fi
    return 0
}

# Try to bring a node back: check that it answers ping, restart Docker on it
# over SSH, wait, then re-check its status in the cluster.
# Arguments: $1 - node hostname
# Returns:   0 when the node is reported healthy again, 1 otherwise
reconnect_node() {
    local target=$1

    log "Attempting to reconnect node: $target"

    # No point trying SSH when the host does not even answer ping.
    if ! ping -c 3 "$target" > /dev/null 2>&1; then
        log "Node $target is not reachable via ping"
        return 1
    fi
    log "Node $target is reachable via ping"

    if ! ssh "$target" "sudo systemctl restart docker" 2>/dev/null; then
        log "Failed to restart Docker on node $target"
        return 1
    fi
    log "Docker service restarted on node $target"

    # Give the daemon time to start before checking the node's status.
    sleep 10

    if check_node_status "$target"; then
        log "Node $target successfully reconnected"
        return 0
    fi

    return 1
}

# Drain a node, push any remaining tasks off it and remove it from the swarm.
# Arguments: $1 - node hostname or ID
# Returns:   0 when `docker node rm --force` succeeds, 1 otherwise
force_remove_node() {
    local node=$1
    local task service
    local -a tasks=()
    local updated=" "

    log "Force removing node: $node"

    # Drain first so the scheduler stops placing work on the node.
    docker node update --availability drain "$node"

    log "Waiting for tasks to migrate from node $node..."
    sleep 30

    # Collect the tasks that are still supposed to run on this node.
    while IFS= read -r task; do
        [ -n "$task" ] && tasks+=("$task")
    done < <(docker node ps "$node" --filter "desired-state=running" --format '{{.Name}}')

    if [ "${#tasks[@]}" -gt 0 ]; then
        log "Warning: ${#tasks[@]} tasks still running on node $node"

        for task in "${tasks[@]}"; do
            log "Force stopping task: $task"

            # Task names are "<service>.<slot-or-node-id>"; strip only the
            # last component so service names containing dots survive.
            service=${task%.*}

            # A forced update restarts the whole service, so once per
            # service is enough even when several of its tasks remain.
            case "$updated" in
                *" $service "*) continue ;;
            esac
            updated="$updated$service "

            docker service update --force "$service"
        done

        sleep 10
    fi

    if docker node rm --force "$node"; then
        log "Node $node successfully removed from cluster"
        return 0
    fi

    log "Failed to remove node $node"
    return 1
}

# Re-attach a previously removed node to the swarm as a worker.
# Runs on a manager: fetches the worker join token and this manager's
# address, then executes the join on the target node over SSH.
# Arguments: $1 - node hostname (reachable via ssh)
# Returns:   0 when the join succeeds and check_node_status reports the node
#            healthy, 1 otherwise
rejoin_node() {
    local node=$1
    local worker_token manager_ip

    log "Attempting to rejoin node: $node"

    worker_token=$(docker swarm join-token worker -q)
    manager_ip=$(docker info --format '{{.Swarm.NodeAddr}}')

    # Without both values the remote join command would be malformed.
    if [ -z "$worker_token" ] || [ -z "$manager_ip" ]; then
        log "Failed to rejoin node $node to the cluster"
        return 1
    fi

    # `docker node rm` on the manager does not clear the swarm state kept on
    # the node itself, and a node that still believes it is a member refuses
    # to join. Leave first; a failure there (node already out) is harmless.
    if ssh -n "$node" "docker swarm leave --force > /dev/null 2>&1; docker swarm join --token $worker_token $manager_ip:2377" 2>/dev/null; then
        log "Node $node successfully rejoined the cluster"

        # Let the manager register the new member before checking.
        sleep 5

        if check_node_status "$node"; then
            log "Node $node is now active in the cluster"
            return 0
        fi
    else
        log "Failed to rejoin node $node to the cluster"
    fi

    return 1
}

# Main recovery flow for one node. Terminates the script via exit.
# Escalates in three steps: status check, remote Docker restart, and finally
# forced removal followed by a rejoin.
# Arguments: $1 - node hostname
# Exits:     0 when the node is (or becomes) healthy, 1 on a usage error or
#            when every recovery step failed
recover_node() {
    local node=$1

    if [ -z "$node" ]; then
        echo "Usage: $0 <node-name>"
        exit 1
    fi

    log "Starting recovery process for node: $node"

    # Step 1: nothing to do when the node is already healthy.
    if check_node_status "$node"; then
        log "Node $node is already active"
        exit 0
    fi

    # Step 2: least invasive fix, restart Docker on the node.
    if reconnect_node "$node"; then
        log "Node $node recovery completed successfully"
        exit 0
    fi

    # Step 3: last resort, remove the node and join it again.
    log "Attempting force removal and rejoin..."

    if force_remove_node "$node"; then
        sleep 5

        if rejoin_node "$node"; then
            log "Node $node recovery completed successfully"
            exit 0
        fi
    fi

    log "Node $node recovery failed"
    exit 1
}

# Run the recovery for the node named on the command line.
# Quoted so the argument reaches recover_node as exactly one word.
recover_node "$NODE_NAME"

管理节点故障处理

#!/bin/bash
# manager-node-recovery.sh
# Checks the health of the Swarm manager nodes and attempts repairs.

# File that log() appends every message to.
LOG_FILE="/var/log/manager-recovery.log"

# Logging helper.
# Writes "<timestamp> - <message>" to stdout and appends the same line to
# the file named by the global LOG_FILE.
# Arguments: $1 - message text
log() {
    # Quoting LOG_FILE keeps a path with spaces from being split in two.
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Count the manager nodes and how many of them report status Ready.
# Returns: 0 when at least two managers are Ready, 1 otherwise
check_manager_status() {
    local host state
    local manager_count=0 total_managers=0

    log "Checking manager node status..."

    # One listing supplies both numbers. The status column is compared as a
    # whole field, so a hostname that merely contains "Ready" is not counted.
    while read -r host state; do
        [ -n "$host" ] || continue
        total_managers=$((total_managers + 1))
        if [ "$state" = "Ready" ]; then
            manager_count=$((manager_count + 1))
        fi
    done < <(docker node ls --filter "role=manager" --format '{{.Hostname}} {{.Status}}')

    log "Active managers: $manager_count/$total_managers"

    # Fewer than two Ready managers leaves no redundancy.
    if [ "$manager_count" -lt 2 ]; then
        log "WARNING: Insufficient active manager nodes"
        return 1
    fi

    return 0
}

# Determine whether this manager currently sees itself as Raft leader or
# follower.
# Returns: 0 when the role could be determined, 1 when it could not
#          (for example the command failed or printed something unexpected)
check_raft_status() {
    local is_leader

    log "Checking Raft consensus status..."

    # Ask the node object directly instead of grepping `docker info` for a
    # "Leader:" line. NOTE(review): that line does not appear in the
    # documented `docker info` Swarm section, so the old check would always
    # have ended in the "unclear" branch - confirm against your Docker
    # version.
    is_leader=$(docker node inspect self --format '{{.ManagerStatus.Leader}}' 2>/dev/null)

    case "$is_leader" in
        true)
            log "Current node is the Raft leader"
            ;;
        false)
            log "Current node is a Raft follower"
            ;;
        *)
            log "WARNING: Raft leader status unclear"
            return 1
            ;;
    esac

    return 0
}

# Promote a worker node to manager.
# Arguments: $1 - worker hostname (optional). When omitted, the first worker
#                 whose status is Ready and availability is Active is used.
# Returns:   0 when the promotion succeeds, 1 when no candidate exists or
#            `docker node promote` fails
promote_worker_to_manager() {
    local worker_node=$1

    if [ -z "$worker_node" ]; then
        # Pick a healthy worker automatically. `docker node ls` documents
        # filters for id, label, node.label, membership, name and role only,
        # so status and availability are checked here from the listed
        # columns instead of via --filter.
        worker_node=$(docker node ls --filter "role=worker" \
            --format '{{.Hostname}} {{.Status}} {{.Availability}}' \
            | awk '$2 == "Ready" && $3 == "Active" { print $1; exit }')
    fi

    if [ -z "$worker_node" ]; then
        log "ERROR: No available worker nodes to promote"
        return 1
    fi

    log "Promoting worker node $worker_node to manager"

    if docker node promote "$worker_node"; then
        log "Successfully promoted $worker_node to manager"
        return 0
    fi

    log "Failed to promote $worker_node to manager"
    return 1
}

# Demote a manager node to worker.
# Arguments: $1 - manager hostname (required)
# Returns:   0 when the demotion succeeds; 1 when no name was given, when
#            the node is the only manager, or when `docker node demote` fails
demote_manager_to_worker() {
    local manager_node=$1
    local manager_count

    if [ -z "$manager_node" ]; then
        log "ERROR: Manager node name required"
        return 1
    fi

    log "Demoting manager node $manager_node to worker"

    # Refuse to demote the last manager: the swarm would be left without one.
    manager_count=$(docker node ls --filter "role=manager" --format '{{.Hostname}}' | grep -c .)

    if [ "$manager_count" -le 1 ]; then
        log "ERROR: Cannot demote the last manager node"
        return 1
    fi

    if docker node demote "$manager_node"; then
        log "Successfully demoted $manager_node to worker"
        return 0
    fi

    log "Failed to demote $manager_node to worker"
    return 1
}

# 重建管理节点集群
rebuild_manager_cluster() {
    log "Rebuilding manager cluster..."
    
    # 获取当前管理节点列表
    local managers=($(docker node ls --filter "role=manager" --format '{{.Hostname}}'))
    local healthy_managers=()
    
    # 检查每个管理节点的健康状态
    for manager in "${managers[@]}"; do
        local status=$(docker node ls --filter "name=$manager" --format '{{.Status}}')
        if [ "$status" = "Ready" ]; then
            healthy_managers+=("$manager")
        else
            log "Manager node $manager is not healthy (status: $status)"
        fi
    done
    
    log "Healthy managers: ${healthy_managers[*]}"
    
    # 如果健康的管理节点少于 3 个,提升工作节点
    while [ ${#healthy_managers[@]} -lt 3 ]; do
        if promote_worker_to_manager; then
            # 重新获取健康管理节点列表
            healthy_managers=($(docker node ls --filter "role=manager" --filter "availability=active" --format '{{.Hostname}}'))
        else
            log "Failed to promote additional manager nodes"
            break
        fi
    done
    
    log "Manager cluster rebuild completed"
}

# Probe the other managers with ping and warn when this node can no longer
# see a majority. On a suspected partition the operator is asked whether to
# force a cluster re-initialisation (force_reinit_cluster).
handle_split_brain() {
    local current_node manager confirm
    local -a other_managers=()
    local reachable_managers=0
    local total_managers quorum

    log "Handling potential split-brain scenario..."

    current_node=$(hostname)

    # Exclude this host by exact comparison; a substring match would also
    # drop "m10" when the local host is "m1".
    while IFS= read -r manager; do
        [ -n "$manager" ] || continue
        [ "$manager" = "$current_node" ] && continue
        other_managers+=("$manager")
    done < <(docker node ls --filter "role=manager" --format '{{.Hostname}}')

    for manager in "${other_managers[@]}"; do
        if ping -c 1 -W 2 "$manager" > /dev/null 2>&1; then
            log "Manager node $manager is reachable"
            reachable_managers=$((reachable_managers + 1))
        else
            log "Manager node $manager is not reachable"
        fi
    done

    log "Reachable managers: $reachable_managers/${#other_managers[@]}"

    # Majority check including this node: with N managers in total, at least
    # N/2 + 1 must be visible. Halving the peer count alone rounds down and
    # misses, for example, a two-manager cluster whose only peer is gone.
    total_managers=$(( ${#other_managers[@]} + 1 ))
    quorum=$(( total_managers / 2 + 1 ))

    if [ $(( reachable_managers + 1 )) -lt "$quorum" ]; then
        log "WARNING: Possible network partition detected"
        log "Consider manual intervention to resolve split-brain"

        # Optional and dangerous: let the operator decide explicitly.
        read -r -p "Force reinitialize cluster? (yes/no): " confirm
        if [ "$confirm" = "yes" ]; then
            force_reinit_cluster
        fi
    fi
}

# Force-reinitialise the swarm from this node (dangerous operation).
# Service and overlay-network definitions are exported first; the re-init is
# skipped when the backup directory cannot be created.
# Globals:  SWARM_BACKUP_ROOT (read, optional) - parent directory for the
#           backup, default /var/backup
# Returns:  0 when `docker swarm init --force-new-cluster` succeeds,
#           1 otherwise
force_reinit_cluster() {
    local backup_root=${SWARM_BACKUP_ROOT:-/var/backup}
    local backup_dir service network

    log "WARNING: Force reinitializing cluster - this will lose cluster state!"

    backup_dir="$backup_root/swarm-$(date +%Y%m%d_%H%M%S)"

    # Refuse to continue without somewhere to put the backup.
    if ! mkdir -p "$backup_dir"; then
        log "ERROR: Cannot create backup directory: $backup_dir"
        return 1
    fi

    # Export every service definition.
    while IFS= read -r service; do
        [ -n "$service" ] || continue
        docker service inspect "$service" > "$backup_dir/service-$service.json"
    done < <(docker service ls --format '{{.Name}}')

    # Export every overlay network definition.
    while IFS= read -r network; do
        [ -n "$network" ] || continue
        docker network inspect "$network" > "$backup_dir/network-$network.json"
    done < <(docker network ls --filter "driver=overlay" --format '{{.Name}}')

    log "Configuration backed up to: $backup_dir"

    if docker swarm init --force-new-cluster; then
        log "Cluster successfully reinitialized"
        log "Please manually restore services and rejoin other nodes"
        return 0
    fi

    log "Failed to reinitialize cluster"
    return 1
}

# Entry point of the manager recovery script.
# Runs the manager-count check and the Raft check in that order; each failed
# check triggers its matching repair routine.
main() {
    log "Starting manager node recovery process..."

    # Too few healthy managers: try to rebuild the manager set.
    check_manager_status || {
        log "Manager cluster needs attention"
        rebuild_manager_cluster
    }

    # Leader state unclear: look for a possible network partition.
    check_raft_status || {
        log "Raft consensus issues detected"
        handle_split_brain
    }

    log "Manager node recovery process completed"
}

# Run the main flow.
main

2.2 证书和认证问题

证书诊断脚本

#!/bin/bash
# certificate-diagnostics.sh
# Inspects the Swarm node certificates on this host and offers repairs.

# Base directory of Docker's swarm state.
SWARM_DIR="/var/lib/docker/swarm"
# Directory in which the script looks for the node certificate, key and root CA.
CERT_DIR="$SWARM_DIR/certificates"
# File that log() appends every message to.
LOG_FILE="/var/log/cert-diagnostics.log"

# Logging helper.
# Emits "<timestamp> - <message>" on stdout and appends it to the file named
# by the global LOG_FILE.
# Arguments: $1 - message text
log() {
    # LOG_FILE is quoted so it is always passed to tee as one path.
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Verify that the expected certificate files exist under CERT_DIR and log
# their permissions and size. Files ending in .crt are additionally passed
# to check_certificate_validity.
# Globals: CERT_DIR (read)
check_certificate_files() {
    local -a expected_files=(
        "swarm-node.crt"
        "swarm-node.key"
        "swarm-root-ca.crt"
    )
    # Everything is declared local so no loop state leaks into the caller.
    local name file_path permissions size

    log "Checking certificate files..."

    for name in "${expected_files[@]}"; do
        file_path="$CERT_DIR/$name"

        if [ ! -f "$file_path" ]; then
            log "ERROR: Certificate file missing: $name"
            continue
        fi

        log "Certificate file exists: $name"

        # Octal permission bits.
        permissions=$(stat -c "%a" "$file_path")
        log "  Permissions: $permissions"

        # Size in bytes.
        size=$(stat -c "%s" "$file_path")
        log "  Size: $size bytes"

        # Only certificates carry a validity period; keys are skipped.
        case "$name" in
            *.crt) check_certificate_validity "$file_path" ;;
        esac
    done
}

# Log validity period, subject and issuer of an X.509 certificate and
# classify its remaining lifetime (expired / under 30 days / OK).
# Arguments: $1 - path to a certificate file
# Returns:   1 when openssl cannot read the file, otherwise the status of
#            the last log call
check_certificate_validity() {
    local cert_file=$1
    local cert_info not_before not_after
    local expiry_date current_date days_until_expiry
    local subject issuer

    log "Checking certificate validity: $(basename "$cert_file")"

    # Assign separately from `local`: `local v=$(cmd)` would report the
    # status of `local` (always 0) and hide an openssl failure.
    if ! cert_info=$(openssl x509 -in "$cert_file" -text -noout 2>/dev/null); then
        log "  ERROR: Cannot read certificate file"
        return 1
    fi

    # Validity window as printed by openssl.
    not_before=$(printf '%s\n' "$cert_info" | grep "Not Before" | cut -d':' -f2-)
    not_after=$(printf '%s\n' "$cert_info" | grep "Not After" | cut -d':' -f2-)

    log "  Not Before: $not_before"
    log "  Not After: $not_after"

    # Convert the expiry to epoch seconds. An empty or unparseable date must
    # not be reported as an expired certificate.
    expiry_date=""
    if [ -n "$not_after" ]; then
        expiry_date=$(date -d "$not_after" +%s 2>/dev/null) || expiry_date=""
    fi

    if [ -z "$expiry_date" ]; then
        log "  WARNING: Cannot determine certificate expiry date"
    else
        current_date=$(date +%s)
        days_until_expiry=$(( (expiry_date - current_date) / 86400 ))

        # Compare timestamps rather than whole days: integer division turns
        # "expired a few hours ago" into 0 days, which would pass as valid.
        if [ "$expiry_date" -le "$current_date" ]; then
            log "  ERROR: Certificate has expired!"
        elif [ "$days_until_expiry" -lt 30 ]; then
            log "  WARNING: Certificate expires in $days_until_expiry days"
        else
            log "  OK: Certificate valid for $days_until_expiry days"
        fi
    fi

    # Certificate subject.
    subject=$(printf '%s\n' "$cert_info" | grep "Subject:" | cut -d':' -f2-)
    log "  Subject: $subject"

    # Certificate issuer.
    issuer=$(printf '%s\n' "$cert_info" | grep "Issuer:" | cut -d':' -f2-)
    log "  Issuer: $issuer"
}

# Attempt a TLS handshake with every manager using this node's swarm
# certificate, key and root CA, and log whether chain verification passed.
# Arguments: $1 - TCP port to probe (optional, default 2377, the swarm
#                 cluster-management port that these certificates belong to;
#                 2376 is conventionally the daemon's own TLS API port)
# Globals:   CERT_DIR (read)
check_tls_connectivity() {
    local port=${1:-2377}
    local manager tls_test
    local -a manager_nodes=()

    log "Checking TLS connectivity..."

    # Collect the manager hostnames.
    while IFS= read -r manager; do
        [ -n "$manager" ] && manager_nodes+=("$manager")
    done < <(docker node ls --filter "role=manager" --format '{{.Hostname}}')

    for manager in "${manager_nodes[@]}"; do
        log "Testing TLS connection to manager: $manager"

        # stdin comes from /dev/null so s_client exits right after the
        # handshake; timeout bounds a hanging connection.
        tls_test=$(timeout 5 openssl s_client -connect "$manager:$port" \
            -cert "$CERT_DIR/swarm-node.crt" \
            -key "$CERT_DIR/swarm-node.key" \
            -CAfile "$CERT_DIR/swarm-root-ca.crt" < /dev/null 2>&1)

        if printf '%s\n' "$tls_test" | grep -q "Verify return code: 0"; then
            log "  TLS connection successful"
        else
            log "  TLS connection failed"
            log "  Error details: $(printf '%s\n' "$tls_test" | grep "verify error" | head -1)"
        fi
    done
}

# Discard the node's swarm certificates so they can be re-issued on rejoin.
# Sequence: back up CERT_DIR, stop Docker, delete CERT_DIR, start Docker,
# then print instructions for rejoining. Nothing is deleted unless both the
# backup and the Docker stop succeeded.
# Globals:  CERT_DIR (read)
#           SWARM_BACKUP_ROOT (read, optional) - parent directory for the
#           backup, default /var/backup
# Returns:  0 when Docker is active again afterwards, 1 on any failure
regenerate_certificates() {
    local backup_root=${SWARM_BACKUP_ROOT:-/var/backup}
    local backup_dir

    log "Regenerating certificates..."

    # Back up the existing certificates first.
    backup_dir="$backup_root/swarm-certs-$(date +%Y%m%d_%H%M%S)"
    if ! mkdir -p "$backup_dir"; then
        log "ERROR: Cannot create backup directory: $backup_dir"
        return 1
    fi

    if [ -d "$CERT_DIR" ]; then
        if ! cp -r "$CERT_DIR" "$backup_dir/"; then
            log "ERROR: Certificate backup failed; existing certificates left untouched"
            return 1
        fi
        log "Certificates backed up to: $backup_dir"
    fi

    log "Stopping Docker service..."
    if ! systemctl stop docker; then
        log "ERROR: Failed to stop Docker service"
        return 1
    fi

    # Remove the existing certificates. ${CERT_DIR:?} aborts instead of
    # expanding to an empty path.
    if [ -d "$CERT_DIR" ]; then
        rm -rf -- "${CERT_DIR:?}"
        log "Existing certificates removed"
    fi

    log "Starting Docker service..."
    systemctl start docker

    # Give the daemon time to come up.
    sleep 10

    if systemctl is-active docker > /dev/null; then
        log "Docker service restarted successfully"

        log "Attempting to rejoin cluster..."

        # The join token has to come from a manager; this script cannot
        # obtain it from the node being repaired.
        # NOTE(review): the node may still consider itself a swarm member
        # after the restart, in which case `docker swarm leave --force` is
        # needed before the join - confirm on a test node.
        echo "Please run the following command on a manager node to get the join token:"
        echo "docker swarm join-token worker"
        echo "Then run the join command on this node."
        return 0
    fi

    log "ERROR: Failed to restart Docker service"
    return 1
}

# Reset mode and ownership of the swarm certificate directory:
# directory 700, *.crt 644, *.key 600, everything owned by root:root.
# Globals: CERT_DIR (read)
fix_certificate_permissions() {
    log "Fixing certificate permissions..."

    # Nothing to repair when the directory is absent.
    if [ ! -d "$CERT_DIR" ]; then
        log "Certificate directory not found: $CERT_DIR"
        return
    fi

    # Directory itself: owner only.
    chmod 700 "$CERT_DIR"

    # Certificates are public material, private keys are not.
    find "$CERT_DIR" -name "*.crt" -exec chmod 644 {} +
    find "$CERT_DIR" -name "*.key" -exec chmod 600 {} +

    # Hand the whole tree to root.
    chown -R root:root "$CERT_DIR"

    log "Certificate permissions fixed"
}

# Entry point of the certificate diagnostics script.
# Exits with status 1 when Docker is not in swarm mode. Otherwise it runs the
# file check, the TLS check and the permission fix, then asks the operator
# which follow-up action to perform.
main() {
    log "Starting certificate diagnostics..."

    # Swarm mode is a precondition for everything below.
    docker info | grep -q "Swarm: active" || {
        log "ERROR: Docker is not in Swarm mode"
        exit 1
    }

    check_certificate_files
    check_tls_connectivity
    fix_certificate_permissions

    log "Certificate diagnostics completed"

    # Menu of follow-up actions, preceded by one blank line.
    printf '\n%s\n%s\n%s\n%s\n' \
        "Available actions:" \
        "1. Regenerate certificates (requires cluster rejoin)" \
        "2. Fix permissions only" \
        "3. Exit"

    read -p "Choose an action (1-3): " choice

    if [ "$choice" = "1" ]; then
        regenerate_certificates
    elif [ "$choice" = "2" ]; then
        fix_certificate_permissions
    elif [ "$choice" = "3" ]; then
        log "Exiting without changes"
    else
        log "Invalid choice"
    fi
}

# Run the main flow.
main

3. 服务故障排除

3.1 服务启动失败

服务诊断脚本

#!/bin/bash
# service-diagnostics.sh
# Diagnoses a Swarm service and optionally applies a fix.
# Usage: service-diagnostics.sh <service-name>

# Service to diagnose, taken from the first command-line argument.
SERVICE_NAME=$1
# File that log() appends every message to.
LOG_FILE="/var/log/service-diagnostics.log"

# Logging helper.
# Prints "<timestamp> - <message>" and appends the same line to the file
# named by the global LOG_FILE.
# Arguments: $1 - message text
log() {
    # Quoted LOG_FILE: a path containing spaces remains one tee argument.
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Print an overview of a service: summary row, pretty-printed spec and tasks.
# Arguments: $1 - service name
# Returns:   0 when the service exists, 1 when it does not
check_service_status() {
    local service=$1

    log "Checking service status: $service"

    # Existence is tested with inspect. The "table" listing below always
    # contains a header row, so its output is never empty and cannot serve
    # as a "not found" indicator.
    if ! docker service inspect "$service" > /dev/null 2>&1; then
        log "ERROR: Service $service not found"
        return 1
    fi

    # Summary row(s) for the service.
    docker service ls --filter "name=$service" --format "table {{.Name}}\t{{.Mode}}\t{{.Replicas}}\t{{.Image}}"

    log "Service details:"
    docker service inspect "$service" --pretty

    log "Task status:"
    docker service ps "$service"

    return 0
}

# List the tasks of a service that carry an error message and run the
# matching follow-up checks for well-known failure causes.
# Arguments: $1 - service name
analyze_task_failures() {
    local service=$1
    local failed_tasks

    log "Analyzing task failures for service: $service"

    # Every task with a non-empty error is of interest, whatever its desired
    # state: a task that cannot be scheduled stays pending with desired
    # state "running". --no-trunc keeps the error text complete so the
    # patterns below can match it.
    failed_tasks=$(docker service ps "$service" --no-trunc \
        --format '{{.Name}}|{{.CurrentState}}|{{.Error}}' \
        | awk -F'|' '$3 != "" { line = $0; sub(/\|/, " ", line); sub(/\|/, " ", line); print line }')

    if [ -z "$failed_tasks" ]; then
        log "No failed tasks found"
        return
    fi

    log "Failed tasks found:"
    echo "$failed_tasks"

    # Scheduler could not place the task.
    if printf '%s\n' "$failed_tasks" | grep -q "no suitable node"; then
        log "Issue: No suitable node found"
        log "Possible causes:"
        log "  - Insufficient resources on nodes"
        log "  - Placement constraints not met"
        log "  - Node labels missing"

        check_placement_constraints "$service"
        check_resource_availability "$service"
    fi

    # Image missing or not pullable.
    if printf '%s\n' "$failed_tasks" | grep -qiE "no such image|image.*not found"; then
        log "Issue: Image not found"
        log "Possible causes:"
        log "  - Image name incorrect"
        log "  - Image not available on nodes"
        log "  - Registry authentication issues"

        check_image_availability "$service"
    fi

    # Published port already taken.
    if printf '%s\n' "$failed_tasks" | grep -qiE "port.*already (in use|allocated)"; then
        log "Issue: Port conflict"
        log "Possible causes:"
        log "  - Port already bound by another service"
        log "  - Host port conflicts"

        check_port_conflicts "$service"
    fi
}

# Show a service's placement constraints and, where the constraint can be
# expressed as a `docker node ls` filter, the nodes that satisfy it.
# Arguments: $1 - service name
check_placement_constraints() {
    local service=$1
    local -a constraints=()
    local constraint key op value role label matching_nodes
    # <key> (== | !=) <value>, with optional blanks around the operator.
    local pattern='^[[:space:]]*([^=![:space:]]+)[[:space:]]*(==|!=)[[:space:]]*(.*)$'

    log "Checking placement constraints for service: $service"

    # One constraint per line: a constraint such as "node.role == manager"
    # contains spaces and must not be word-split. {{with}} guards against a
    # service that has no placement section.
    while IFS= read -r constraint; do
        [ -n "$constraint" ] && constraints+=("$constraint")
    done < <(docker service inspect "$service" \
        --format '{{with .Spec.TaskTemplate.Placement}}{{range .Constraints}}{{println .}}{{end}}{{end}}')

    if [ "${#constraints[@]}" -eq 0 ]; then
        log "No placement constraints defined"
        return
    fi

    log "Placement constraints: ${constraints[*]}"
    log "Nodes matching constraints:"

    for constraint in "${constraints[@]}"; do
        log "  Constraint: $constraint"

        if [[ ! $constraint =~ $pattern ]]; then
            continue
        fi
        key=${BASH_REMATCH[1]}
        op=${BASH_REMATCH[2]}
        value=${BASH_REMATCH[3]}

        case "$key" in
            node.role)
                role=$value
                # "!=" selects the opposite of the two possible roles.
                if [ "$op" = "!=" ]; then
                    if [ "$value" = "manager" ]; then
                        role=worker
                    else
                        role=manager
                    fi
                fi
                matching_nodes=$(docker node ls --filter "role=$role" --format '{{.Hostname}}')
                log "    Nodes with role $role: $matching_nodes"
                ;;
            node.labels.*)
                label="${key#node.labels.}=$value"
                log "    Checking for label: $label"
                # Only equality maps onto the node.label filter.
                if [ "$op" = "==" ]; then
                    matching_nodes=$(docker node ls --filter "node.label=$label" --format '{{.Hostname}}')
                    log "    Nodes with label $label: $matching_nodes"
                fi
                ;;
        esac
    done
}

# Print one field of a service's resource spec, or nothing when the field is
# unset (empty output, template failure, "<no value>", "<nil>" or 0).
# Arguments: $1 - service name
#            $2 - field path below .Spec.TaskTemplate.Resources
_service_resource_value() {
    local value
    value=$(docker service inspect "$1" \
        --format "{{.Spec.TaskTemplate.Resources.$2}}" 2>/dev/null) || return 0
    case "$value" in
        "" | "0" | "<no value>" | "<nil>") ;;
        *) printf '%s\n' "$value" ;;
    esac
}

# Log a service's resource limits and reservations, followed by the current
# CPU and memory usage of the local node (other nodes are only listed).
# Arguments: $1 - service name
check_resource_availability() {
    local service=$1
    local cpu_limit memory_limit cpu_reservation memory_reservation
    local node this_host cpu_usage mem_usage

    log "Checking resource availability for service: $service"

    # Resource requirements declared by the service.
    cpu_limit=$(_service_resource_value "$service" Limits.NanoCPUs)
    memory_limit=$(_service_resource_value "$service" Limits.MemoryBytes)
    cpu_reservation=$(_service_resource_value "$service" Reservations.NanoCPUs)
    memory_reservation=$(_service_resource_value "$service" Reservations.MemoryBytes)

    log "Resource requirements:"
    if [ -n "$cpu_limit" ]; then
        log "  CPU Limit: $cpu_limit nanocpus"
    fi
    if [ -n "$memory_limit" ]; then
        log "  Memory Limit: $memory_limit bytes"
    fi
    if [ -n "$cpu_reservation" ]; then
        log "  CPU Reservation: $cpu_reservation nanocpus"
    fi
    if [ -n "$memory_reservation" ]; then
        log "  Memory Reservation: $memory_reservation bytes"
    fi

    log "Node resource status:"
    # The hostname does not change while looping; look it up once.
    this_host=$(hostname)

    while IFS= read -r node; do
        [ -n "$node" ] || continue
        log "  Node: $node"

        # Usage figures can only be gathered on the machine running this
        # script; remote nodes would need their own invocation.
        if [ "$node" = "$this_host" ]; then
            cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
            mem_usage=$(free | awk 'NR==2{printf "%.2f", $3*100/$2 }')
            log "    CPU Usage: ${cpu_usage}%"
            log "    Memory Usage: ${mem_usage}%"
        fi
    done < <(docker node ls --format '{{.Hostname}}')
}

# Check whether the image used by a service is present locally (pulling it
# if not) and, over SSH, on every other node of the swarm.
# Arguments: $1 - service name
check_image_availability() {
    local service=$1
    local image node this_host

    log "Checking image availability for service: $service"

    # Image reference recorded in the service spec.
    image=$(docker service inspect "$service" --format '{{.Spec.TaskTemplate.ContainerSpec.Image}}')

    log "Service image: $image"

    # `image inspect` resolves the reference exactly as given, including
    # digest-pinned forms (name:tag@sha256:...), which a text match against
    # "repository:tag" listings never finds.
    if docker image inspect "$image" > /dev/null 2>&1; then
        log "Image available locally"
    else
        log "Image not available locally"

        log "Attempting to pull image..."
        if docker pull "$image"; then
            log "Image pulled successfully"
        else
            log "Failed to pull image"
            log "Possible causes:"
            log "  - Image name incorrect"
            log "  - Registry not accessible"
            log "  - Authentication required"
        fi
    fi

    log "Checking image availability on other nodes..."
    this_host=$(hostname)

    while IFS= read -r node; do
        [ -n "$node" ] || continue
        [ "$node" = "$this_host" ] && continue

        log "  Checking node: $node"
        # -n stops ssh from swallowing the node list this loop reads from.
        if ssh -n "$node" "docker image inspect '$image' > /dev/null 2>&1" 2>/dev/null; then
            log "    Image available on $node"
        else
            log "    Image not available on $node"
        fi
    done < <(docker node ls --format '{{.Hostname}}')
}

# Print the listening TCP/UDP sockets of this host using ss, or netstat when
# ss is not installed. Returns 1 when neither tool is available.
_listening_sockets() {
    if command -v ss > /dev/null 2>&1; then
        ss -tulnp 2>/dev/null
    elif command -v netstat > /dev/null 2>&1; then
        netstat -tulnp 2>/dev/null
    else
        return 1
    fi
}

# Check whether the ports published by a service are already bound on this
# host.
# Arguments: $1 - service name
# Returns:   1 when no socket-listing tool is available
check_port_conflicts() {
    local service=$1
    local published_ports port_mapping published_port sockets process

    log "Checking port conflicts for service: $service"

    # Ports published by the service, as "published:target/proto" tokens.
    published_ports=$(docker service inspect "$service" --format '{{range .Spec.EndpointSpec.Ports}}{{.PublishedPort}}:{{.TargetPort}}/{{.Protocol}} {{end}}')

    if [ -z "$published_ports" ]; then
        log "No published ports defined"
        return
    fi

    log "Published ports: $published_ports"

    # Take one snapshot for all ports. Without a listing tool every port
    # would look free, so say so instead of reporting a false "available".
    if ! sockets=$(_listening_sockets); then
        log "  WARNING: neither ss nor netstat is available; cannot check ports"
        return 1
    fi

    # Word-splitting is intended: tokens are space-separated without blanks.
    for port_mapping in $published_ports; do
        published_port=${port_mapping%%:*}

        log "  Checking port: $published_port"

        # The local address is column 4 in netstat and column 5 in ss
        # output; compare the part after the last ":" with the port. The
        # owning process, when shown, is the last column for TCP and UDP.
        process=$(printf '%s\n' "$sockets" | awk -v p="$published_port" '
            {
                for (i = 4; i <= 5; i++) {
                    n = split($i, parts, ":")
                    if (n > 1 && parts[n] == p) { print $NF; next }
                }
            }')

        if [ -n "$process" ]; then
            log "    Port $published_port is in use"
            log "    Used by: $process"
        else
            log "    Port $published_port is available"
        fi
    done
}

# Print the last 50 log lines of a service and flag common problem keywords.
# Arguments: $1 - service name
check_service_logs() {
    local service=$1
    local logs

    log "Checking service logs: $service"

    # Most recent log lines (stderr included).
    logs=$(docker service logs --tail 50 "$service" 2>&1)

    if [ -z "$logs" ]; then
        log "No logs available"
        return
    fi

    log "Recent service logs:"
    echo "$logs"

    # The logs were printed once above; grep -q only tests for a match so
    # the matching lines are not echoed a second and third time.
    if printf '%s\n' "$logs" | grep -qi error; then
        log "Errors found in logs"
    fi

    if printf '%s\n' "$logs" | grep -qi "permission denied"; then
        log "Permission issues detected"
    fi

    if printf '%s\n' "$logs" | grep -qi "connection refused"; then
        log "Connection issues detected"
    fi
}

# Interactive repair menu for a service.
# Arguments: $1 - service name
# Input:     the chosen option (1-6) and any follow-up answers from stdin
fix_service() {
    local service=$1
    local choice confirm new_image current_replicas constraint backup_file
    local -a rm_args=()

    log "Attempting to fix service: $service"

    echo "Available fix options:"
    echo "1. Restart service (force update)"
    echo "2. Scale service to 0 and back"
    echo "3. Remove and recreate service"
    echo "4. Update service image"
    echo "5. Remove placement constraints"
    echo "6. Exit without changes"

    read -r -p "Choose a fix option (1-6): " choice

    case "$choice" in
        1)
            log "Restarting service with force update..."
            docker service update --force "$service"
            ;;
        2)
            log "Scaling service to 0 and back..."
            # Read the configured replica count from the spec. The REPLICAS
            # column of `service ls` may carry extra text such as
            # "3/3 (max 1 per node)" and is matched by name prefix.
            current_replicas=$(docker service inspect "$service" \
                --format '{{with .Spec.Mode.Replicated}}{{.Replicas}}{{end}}' 2>/dev/null)
            case "$current_replicas" in
                '' | *[!0-9]*)
                    log "Cannot determine replica count for $service (global-mode services cannot be scaled)"
                    ;;
                *)
                    docker service scale "$service=0"
                    sleep 5
                    docker service scale "$service=$current_replicas"
                    ;;
            esac
            ;;
        3)
            log "WARNING: This will remove and recreate the service"
            read -r -p "Are you sure? (yes/no): " confirm
            if [ "$confirm" = "yes" ]; then
                # Export the service definition; remove the service only
                # once the backup has actually been written.
                backup_file="/tmp/service-$service-backup.json"
                if docker service inspect "$service" > "$backup_file"; then
                    log "Service configuration backed up to $backup_file"

                    docker service rm "$service"
                    log "Service removed. Please recreate manually using the backup configuration."
                else
                    log "ERROR: Backup failed; service $service was not removed"
                fi
            fi
            ;;
        4)
            read -r -p "Enter new image name: " new_image
            if [ -n "$new_image" ]; then
                log "Updating service image to: $new_image"
                docker service update --image "$new_image" "$service"
            fi
            ;;
        5)
            log "Removing placement constraints..."
            # --constraint-rm takes the constraint text as its value, so
            # each existing constraint is passed explicitly and the service
            # name comes last.
            while IFS= read -r constraint; do
                [ -n "$constraint" ] && rm_args+=(--constraint-rm "$constraint")
            done < <(docker service inspect "$service" \
                --format '{{with .Spec.TaskTemplate.Placement}}{{range .Constraints}}{{println .}}{{end}}{{end}}')

            if [ "${#rm_args[@]}" -eq 0 ]; then
                log "No placement constraints defined"
            else
                docker service update "${rm_args[@]}" "$service"
            fi
            ;;
        6)
            log "Exiting without changes"
            ;;
        *)
            log "Invalid choice"
            ;;
    esac
}

# Entry point of the service diagnostics script.
# Globals: SERVICE_NAME (read) - service to diagnose
# Exits:   1 when no service name was given or the service does not exist
main() {
    local fix_choice

    if [ -z "$SERVICE_NAME" ]; then
        echo "Usage: $0 <service-name>"
        echo "Available services:"
        docker service ls --format '{{.Name}}'
        exit 1
    fi

    log "Starting service diagnostics for: $SERVICE_NAME"

    # Stop early when the service cannot be found.
    if ! check_service_status "$SERVICE_NAME"; then
        exit 1
    fi

    analyze_task_failures "$SERVICE_NAME"
    check_service_logs "$SERVICE_NAME"

    # Repairs are only attempted on explicit confirmation.
    echo
    read -r -p "Do you want to attempt to fix the service? (yes/no): " fix_choice

    if [ "$fix_choice" = "yes" ]; then
        fix_service "$SERVICE_NAME"
    fi

    log "Service diagnostics completed for: $SERVICE_NAME"
}

# Run the main flow.
main