学习目标

通过本章学习,您将能够:

- 掌握 Docker Swarm 的高级配置和优化技巧
- 了解生产环境部署的最佳实践
- 学习多环境管理和 CI/CD 集成
- 掌握高可用性和灾难恢复策略
- 了解性能调优和容量规划
- 学习安全加固和合规性要求

1. 高级集群配置

1.1 集群拓扑优化

多可用区部署

# multi-az-cluster.yml
version: '3.8'

services:
  web:
    image: nginx:alpine
    deploy:
      replicas: 6
      placement:
        constraints:
          - node.role == worker
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '0.5'
          memory: 512M
        reservations:
          cpus: '0.25'
          memory: 256M
      update_config:
        parallelism: 2
        delay: 10s
        failure_action: rollback
        monitor: 60s
        max_failure_ratio: 0.3
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
    networks:
      - frontend
    ports:
      - "80:80"

  app:
    image: myapp:latest
    deploy:
      replicas: 4
      placement:
        constraints:
          - node.role == worker
          - node.labels.tier == app
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    networks:
      - frontend
      - backend
    environment:
      - NODE_ENV=production
      - DB_HOST=db
    secrets:
      - db_password
      - api_key

  db:
    image: postgres:13
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == worker
          - node.labels.tier == db
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 5
        window: 300s
    networks:
      - backend
    environment:
      - POSTGRES_DB=myapp
      - POSTGRES_USER=myapp
      - POSTGRES_PASSWORD_FILE=/run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - db_data:/var/lib/postgresql/data

networks:
  frontend:
    driver: overlay
    attachable: true
    driver_opts:
      encrypted: "true"
  backend:
    driver: overlay
    driver_opts:
      encrypted: "true"

volumes:
  db_data:
    driver: local

secrets:
  db_password:
    external: true
  api_key:
    external: true

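上面的 Compose 文件将 db_password 和 api_key 声明为外部密钥(external: true),因此部署堆栈之前需要先在集群中创建它们。下面是一个简单的创建示例(密钥内容仅作演示,请替换为真实凭据):

```bash
# 在管理节点上创建 Compose 文件引用的外部密钥(示例值,仅作演示)
openssl rand -base64 24 | docker secret create db_password -
echo "demo-api-key" | docker secret create api_key -

# 确认密钥已创建
docker secret ls
```
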
节点标签管理脚本

#!/bin/bash
# node-labeling.sh

# 设置节点标签的脚本
set_node_labels() {
    local node_name=$1
    local zone=$2
    local tier=$3
    local instance_type=$4
    
    echo "Setting labels for node: $node_name"
    
    # 设置可用区标签
    docker node update --label-add zone=$zone $node_name
    
    # 设置层级标签
    docker node update --label-add tier=$tier $node_name
    
    # 设置实例类型标签
    docker node update --label-add instance_type=$instance_type $node_name
    
    # 设置环境标签
    docker node update --label-add env=production $node_name
    
    echo "Labels set successfully for $node_name"
}

# 批量设置节点标签
batch_label_nodes() {
    # 管理节点
    set_node_labels "manager-1" "us-east-1a" "management" "t3.large"
    set_node_labels "manager-2" "us-east-1b" "management" "t3.large"
    set_node_labels "manager-3" "us-east-1c" "management" "t3.large"
    
    # 应用层工作节点
    set_node_labels "worker-app-1" "us-east-1a" "app" "c5.xlarge"
    set_node_labels "worker-app-2" "us-east-1b" "app" "c5.xlarge"
    set_node_labels "worker-app-3" "us-east-1c" "app" "c5.xlarge"
    
    # 数据库层工作节点
    set_node_labels "worker-db-1" "us-east-1a" "db" "r5.2xlarge"
    set_node_labels "worker-db-2" "us-east-1b" "db" "r5.2xlarge"
    
    # 监控节点
    set_node_labels "worker-monitor-1" "us-east-1a" "monitoring" "t3.medium"
}

# 显示节点标签
show_node_labels() {
    echo "Current node labels:"
    for node in $(docker node ls --format '{{.Hostname}}'); do
        echo "Node: $node"
        docker node inspect $node --format '{{range $k, $v := .Spec.Labels}}{{$k}}={{$v}} {{end}}'
        echo
    done
}

# 验证标签分布
validate_label_distribution() {
    echo "Validating label distribution..."
    
    echo "Zones:"
    docker node ls --format 'table {{.Hostname}}\t{{.Status}}\t{{.Availability}}' \
        --filter "node.label=zone"
    
    echo -e "\nTiers:"
    for tier in management app db monitoring; do
        echo "Tier: $tier"
        docker node ls --format '{{.Hostname}}' --filter "node.label=tier=$tier"
    done
    
    echo -e "\nInstance types:"
    for type in t3.large t3.medium c5.xlarge r5.2xlarge; do
        echo "Type: $type"
        docker node ls --format '{{.Hostname}}' --filter "node.label=instance_type=$type"
    done
}

# 主函数
main() {
    case "$1" in
        "set")
            batch_label_nodes
            ;;
        "show")
            show_node_labels
            ;;
        "validate")
            validate_label_distribution
            ;;
        *)
            echo "Usage: $0 {set|show|validate}"
            echo "  set      - Set labels on all nodes"
            echo "  show     - Show current node labels"
            echo "  validate - Validate label distribution"
            exit 1
            ;;
    esac
}

main "$@"

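以下是该标签脚本与前文 Compose 文件配合使用的一个简单流程示例(此处以 myapp 作为堆栈名,仅作演示):

```bash
# 为所有节点打标签并验证分布
chmod +x node-labeling.sh
./node-labeling.sh set
./node-labeling.sh validate

# 部署前文的多可用区堆栈,并确认 web 副本按 zone 分散
docker stack deploy -c multi-az-cluster.yml myapp
docker service ps myapp_web --format "table {{.Name}}\t{{.Node}}"
```
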
5. 实践练习

练习 5.1:多可用区高可用部署

目标:在多个可用区部署高可用的 Docker Swarm 集群

步骤

1. **初始化多可用区集群**:
```bash
# 在第一个可用区初始化管理节点
docker swarm init --advertise-addr 10.0.1.10

# 获取添加管理节点所需的 join 命令
docker swarm join-token manager

# 在其他可用区的管理节点上执行
docker swarm join --token SWMTKN-xxx 10.0.1.10:2377
```

2. **配置节点标签**:
```bash
# 标记节点所在的可用区
docker node update --label-add zone=us-east-1a node1
docker node update --label-add zone=us-east-1b node2
docker node update --label-add zone=us-east-1c node3

# 标记节点类型
docker node update --label-add type=compute node4
docker node update --label-add type=storage node5
```

3. **部署跨可用区服务**:
```yaml
# multi-zone-app.yml
version: '3.8'

services:
  web:
    image: nginx:alpine
    deploy:
      replicas: 6
      placement:
        max_replicas_per_node: 2
        constraints:
          - node.labels.type == compute
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '0.5'
          memory: 512M
        reservations:
          cpus: '0.25'
          memory: 256M
    networks:
      - app-network

  database:
    image: postgres:13
    deploy:
      replicas: 3
      placement:
        constraints:
          - node.labels.type == storage
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: user
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - db_data:/var/lib/postgresql/data
    networks:
      - app-network

networks:
  app-network:
    driver: overlay
    attachable: true
    driver_opts:
      encrypted: "true"

volumes:
  db_data:
    driver: local

secrets:
  db_password:
    external: true
```

4. **验证部署**:
```bash
# 部署服务
docker stack deploy -c multi-zone-app.yml myapp

# 检查服务分布
docker service ps myapp_web
docker service ps myapp_database

# 验证可用区分布
docker service ps myapp_web --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}"
```

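作为可选的进一步验证,可以模拟单个可用区故障,观察任务是否被重新调度到其他可用区(以下假设 node1 位于 us-east-1a,仅作演示):

```bash
# 将 us-east-1a 的节点置为 drain,模拟该可用区故障
docker node update --availability drain node1

# 观察 web 服务任务被重新调度到其他可用区
docker service ps myapp_web --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}"

# 验证完成后恢复节点
docker node update --availability active node1
```
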
练习 5.2:CI/CD 集成与蓝绿部署

目标:实现自动化的 CI/CD 流水线和蓝绿部署

步骤

1. **创建应用代码**:
```javascript
// app.js
const express = require('express');
const app = express();
const port = process.env.PORT || 3000;
const version = process.env.APP_VERSION || '1.0.0';

app.get('/', (req, res) => {
  res.json({
    message: 'Hello from Docker Swarm!',
    version: version,
    hostname: require('os').hostname(),
    timestamp: new Date().toISOString()
  });
});

app.get('/health', (req, res) => {
  res.json({ status: 'healthy', version: version });
});

app.listen(port, () => {
  console.log(`App version ${version} listening on port ${port}`);
});
```

2. **创建 Dockerfile**:
```dockerfile
# Dockerfile
FROM node:16-alpine

WORKDIR /app

COPY package*.json ./
RUN npm ci --only=production

COPY . .

EXPOSE 3000

USER node

CMD ["node", "app.js"]
```

3. **配置 GitLab CI**:
```yaml
# .gitlab-ci.yml
stages:
  - build
  - test
  - deploy-blue
  - verify
  - deploy-green
  - cleanup

variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  IMAGE_NAME: "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHA"
  BLUE_SERVICE: "myapp-blue"
  GREEN_SERVICE: "myapp-green"

build:
  stage: build
  image: docker:20.10.16
  services:
    - docker:20.10.16-dind
  before_script:
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
  script:
    - docker build -t $IMAGE_NAME .
    - docker push $IMAGE_NAME
  only:
    - main

test:
  stage: test
  image: node:16-alpine
  script:
    - npm ci
    - npm test
  only:
    - main

deploy-blue:
  stage: deploy-blue
  image: docker:20.10.16
  before_script:
    - apk add --no-cache curl
  script:
    - |
      # 检查当前活跃服务
      ACTIVE_SERVICE=$(curl -s http://load-balancer/api/active-service || echo "green")

      if [ "$ACTIVE_SERVICE" = "green" ]; then
        TARGET_SERVICE="$BLUE_SERVICE"
      else
        TARGET_SERVICE="$GREEN_SERVICE"
      fi

      echo "Deploying to $TARGET_SERVICE"

      # 更新目标服务
      docker service update \
        --image $IMAGE_NAME \
        --env-add APP_VERSION=$CI_COMMIT_SHA \
        $TARGET_SERVICE

      # 等待服务更新完成
      ./scripts/wait-for-service.sh $TARGET_SERVICE
  only:
    - main

verify:
  stage: verify
  image: alpine:latest
  before_script:
    - apk add --no-cache curl jq
  script:
    - |
      # 获取目标服务端点
      TARGET_SERVICE=$(curl -s http://load-balancer/api/inactive-service)
      TARGET_URL="http://$TARGET_SERVICE:3000"

      # 健康检查
      for i in $(seq 1 30); do
        if curl -f $TARGET_URL/health; then
          echo "Health check passed"
          break
        fi
        echo "Waiting for service to be healthy..."
        sleep 10
      done

      # 功能测试
      RESPONSE=$(curl -s $TARGET_URL)
      VERSION=$(echo $RESPONSE | jq -r '.version')

      if [ "$VERSION" = "$CI_COMMIT_SHA" ]; then
        echo "Version verification passed"
      else
        echo "Version verification failed"
        exit 1
      fi
  only:
    - main

deploy-green:
  stage: deploy-green
  image: alpine:latest
  before_script:
    - apk add --no-cache curl
  script:
    - |
      # 切换流量到新版本
      TARGET_SERVICE=$(curl -s http://load-balancer/api/inactive-service)
      curl -X POST http://load-balancer/api/switch-traffic -d "{\"target\": \"$TARGET_SERVICE\"}"

      echo "Traffic switched to $TARGET_SERVICE"
  only:
    - main
  when: manual

cleanup:
  stage: cleanup
  image: alpine:latest
  script:
    - echo "Deployment completed successfully"
    - echo "Old version can be cleaned up if needed"
  only:
    - main
  when: manual
```

4. **蓝绿部署脚本**:
```bash
#!/bin/bash
# blue-green-deploy.sh

SERVICE_NAME="myapp"
BLUE_SERVICE="${SERVICE_NAME}-blue"
GREEN_SERVICE="${SERVICE_NAME}-green"
LOAD_BALANCER="traefik"
NEW_IMAGE="$1"
NEW_VERSION="$2"

if [[ -z $NEW_IMAGE || -z $NEW_VERSION ]]; then
    echo "Usage: $0 <new-image> <new-version>"
    exit 1
fi

# 获取当前活跃服务
get_active_service() {
    # 通过负载均衡器 API 或标签获取当前活跃服务
    docker service inspect $BLUE_SERVICE --format '{{.Spec.Labels.active}}' 2>/dev/null | grep -q "true" && echo "blue" || echo "green"
}

# 等待服务更新完成
wait_for_service_update() {
    local service_name=$1
    local timeout=300
    local elapsed=0
    
    echo "Waiting for service $service_name to update..."
    
    while [[ $elapsed -lt $timeout ]]; do
        local update_status=$(docker service inspect $service_name --format '{{.UpdateStatus.State}}')
        
        if [[ $update_status == "completed" ]]; then
            echo "Service $service_name updated successfully"
            return 0
        elif [[ $update_status == "paused" || $update_status == "rollback_completed" ]]; then
            echo "Service $service_name update failed: $update_status"
            return 1
        fi
        
        sleep 5
        elapsed=$((elapsed + 5))
    done
    
    echo "Timeout waiting for service $service_name to update"
    return 1
}

# 健康检查
health_check() {
    local service_name=$1
    local max_attempts=30
    local attempt=0
    
    echo "Performing health check for $service_name..."
    
    while [[ $attempt -lt $max_attempts ]]; do
        # 获取运行该服务任务的节点地址,通过节点 IP 和发布端口做健康检查
        local task_ip=$(docker service ps $service_name --format '{{.Node}}' | head -1 | xargs docker node inspect --format '{{.Status.Addr}}')
        
        if curl -f -s "http://$task_ip:3000/health" > /dev/null; then
            echo "Health check passed for $service_name"
            return 0
        fi
        
        echo "Health check attempt $((attempt + 1))/$max_attempts failed, retrying..."
        sleep 10
        attempt=$((attempt + 1))
    done
    
    echo "Health check failed for $service_name"
    return 1
}

# 切换流量
switch_traffic() {
    local new_active_service=$1
    local old_active_service=$2
    
    echo "Switching traffic from $old_active_service to $new_active_service..."
    
    # 更新服务标签
    docker service update --label-rm active $old_active_service
    docker service update --label-add active=true $new_active_service
    
    # 更新负载均衡器配置(这里使用 Traefik 作为示例)
    docker service update \
        --label-add "traefik.http.routers.${SERVICE_NAME}.service=${new_active_service}" \
        $new_active_service
    
    echo "Traffic switched to $new_active_service"
}

# 回滚
rollback() {
    local service_name=$1
    
    echo "Rolling back $service_name..."
    docker service rollback $service_name
    
    if wait_for_service_update $service_name; then
        echo "Rollback completed for $service_name"
    else
        echo "Rollback failed for $service_name"
        return 1
    fi
}

# 主部署流程
main() {
    echo "Starting blue-green deployment..."
    echo "New image: $NEW_IMAGE"
    echo "New version: $NEW_VERSION"
    
    # 确定当前活跃服务和目标服务
    local active_service=$(get_active_service)
    local target_service
    
    if [[ $active_service == "blue" ]]; then
        target_service=$GREEN_SERVICE
    else
        target_service=$BLUE_SERVICE
    fi
    
    echo "Current active service: $active_service"
    echo "Target service: $target_service"
    
    # 更新目标服务
    echo "Updating $target_service with new image..."
    docker service update \
        --image $NEW_IMAGE \
        --env-add APP_VERSION=$NEW_VERSION \
        --label-rm active \
        $target_service
    
    # 等待更新完成
    if ! wait_for_service_update $target_service; then
        echo "Failed to update $target_service"
        exit 1
    fi
    
    # 健康检查
    if ! health_check $target_service; then
        echo "Health check failed, rolling back..."
        rollback $target_service
        exit 1
    fi
    
    # 询问是否切换流量
    echo "New version is ready. Switch traffic? (y/n)"
    read -r response
    
    if [[ $response == "y" || $response == "Y" ]]; then
        switch_traffic $target_service "${SERVICE_NAME}-${active_service}"
        echo "Blue-green deployment completed successfully!"
    else
        echo "Traffic not switched. New version is available at $target_service"
    fi
}

main
```

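下面是调用该脚本的一个示例(镜像地址和版本号仅作演示,脚本假设 myapp-blue 与 myapp-green 两个服务已经存在):

```bash
# 赋予执行权限并执行蓝绿部署
chmod +x blue-green-deploy.sh
./blue-green-deploy.sh registry.example.com/myapp:2.0.0 2.0.0
```
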
练习 5.3:性能调优和监控

目标:实现全面的性能监控和自动调优

步骤

1. **部署监控栈**:
```bash
# 使用之前创建的性能监控脚本
./performance-tuning.sh collect
./performance-tuning.sh analyze /opt/performance-reports/performance_*.json
./performance-tuning.sh optimize
```

2. **配置自动扩缩容**:
```bash
# 初始化自动扩缩容配置
./intelligent-autoscaler.sh init

# 启动自动扩缩容服务
./intelligent-autoscaler.sh start
```

3. **压力测试**:
```bash
# 使用 Apache Bench 进行压力测试
ab -n 10000 -c 100 http://your-app-url/

# 使用 wrk 进行更复杂的测试
wrk -t12 -c400 -d30s --script=load-test.lua http://your-app-url/
```

4. **监控扩缩容行为**:
```bash
# 观察服务副本数变化
watch -n 5 'docker service ls'

# 查看自动扩缩容日志
tail -f /var/log/swarm-autoscaler.log
```

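压测期间还可以定期采样各容器的资源占用,作为判断扩缩容阈值是否合理的参考(输出路径仅作演示):

```bash
# 每 10 秒采样一次容器 CPU / 内存占用,写入 CSV 供压测后分析
while true; do
    docker stats --no-stream --format "{{.Name}},{{.CPUPerc}},{{.MemPerc}}" \
        >> /tmp/load-test-stats.csv
    sleep 10
done
```
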
6. 本章总结

6.1 关键要点

  1. 高级配置管理

    • 多可用区部署策略
    • 节点标签和约束管理
    • 高级网络配置
    • 性能优化配置
  2. 生产环境最佳实践

    • 安全配置和加固
    • 资源管理和限制
    • 监控和日志管理
    • 备份和恢复策略
  3. CI/CD 集成

    • 自动化部署流水线
    • 蓝绿部署和金丝雀发布
    • 自动化测试和验证
    • 回滚和故障恢复
  4. 高可用性和灾难恢复

    • 多区域部署
    • 自动故障转移
    • 数据备份和恢复
    • 业务连续性规划
  5. 性能调优和容量规划

    • 性能监控和分析
    • 自动扩缩容策略
    • 资源优化配置
    • 容量规划和预测

6.2 最佳实践总结

  1. 架构设计

    • 采用微服务架构
    • 实现服务解耦
    • 设计容错机制
    • 考虑扩展性需求
  2. 运维管理

    • 实施基础设施即代码
    • 建立完善的监控体系
    • 制定标准化流程
    • 持续优化和改进
  3. 安全管理

    • 实施最小权限原则
    • 定期安全审计
    • 加密敏感数据
    • 及时更新和修补
  4. 性能优化

    • 持续性能监控
    • 主动容量规划
    • 自动化扩缩容
    • 定期性能调优

6.3 进阶学习方向

  1. 容器编排进阶

    • Kubernetes 集群管理
    • 服务网格(Service Mesh)
    • 无服务器容器(Serverless Containers)
    • 边缘计算部署
  2. 云原生技术栈

    • CNCF 生态系统
    • GitOps 工作流
    • 可观测性(Observability)
    • 混沌工程(Chaos Engineering)
  3. 企业级实践

    • 多云和混合云策略
    • 合规性和治理
    • 成本优化
    • 组织和流程变革

通过本章的学习,您已经掌握了 Docker Swarm 的高级特性和生产环境最佳实践。这些知识将帮助您构建和管理大规模、高可用的容器化应用系统。


恭喜!您已经完成了 Docker Swarm 完整教程的学习。

这个教程涵盖了从基础概念到高级实践的全部内容,包括:

- 集群搭建和管理
- 服务部署和编排
- 网络和存储管理
- 安全配置和监控
- 故障排除和性能优化
- 生产环境最佳实践

希望这个教程能够帮助您在容器编排和微服务架构的道路上取得成功!


### 1.2 高级网络配置

#### 多网络架构

```bash
#!/bin/bash
# advanced-networking.sh

# 创建多层网络架构
create_network_architecture() {
    echo "Creating advanced network architecture..."
    
    # 前端网络(公共访问)
    docker network create \
        --driver overlay \
        --subnet 10.1.0.0/24 \
        --gateway 10.1.0.1 \
        --attachable \
        --opt encrypted=false \
        frontend-public
    
    # 应用网络(内部通信)
    docker network create \
        --driver overlay \
        --subnet 10.2.0.0/24 \
        --gateway 10.2.0.1 \
        --opt encrypted=true \
        app-internal
    
    # 数据库网络(高安全)
    docker network create \
        --driver overlay \
        --subnet 10.3.0.0/24 \
        --gateway 10.3.0.1 \
        --opt encrypted=true \
        --opt com.docker.network.driver.mtu=1450 \
        database-secure
    
    # 监控网络(管理访问)
    docker network create \
        --driver overlay \
        --subnet 10.4.0.0/24 \
        --gateway 10.4.0.1 \
        --attachable \
        --opt encrypted=false \
        monitoring-mgmt
    
    # 日志网络(日志收集)
    docker network create \
        --driver overlay \
        --subnet 10.5.0.0/24 \
        --gateway 10.5.0.1 \
        --opt encrypted=false \
        logging-collect
    
    echo "Network architecture created successfully"
}

# 配置网络策略
configure_network_policies() {
    echo "Configuring network policies..."
    
    # 这里可以集成 Calico 或其他网络策略引擎
    # 示例:限制数据库网络只能从应用网络访问
    
    cat > network-policy.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: database-access-policy
spec:
  podSelector:
    matchLabels:
      tier: database
  policyTypes:
  - Ingress
  - Egress
  ingress:
  - from:
    - podSelector:
        matchLabels:
          tier: app
    ports:
    - protocol: TCP
      port: 5432
  egress:
  - to: []
    ports:
    - protocol: TCP
      port: 53
    - protocol: UDP
      port: 53
EOF
    
    echo "Network policies configured"
}

# 网络性能优化
optimize_network_performance() {
    echo "Optimizing network performance..."
    
    # 调整内核网络参数
    cat > /etc/sysctl.d/99-docker-swarm.conf << 'EOF'
# Docker Swarm 网络优化
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 65536 134217728
net.ipv4.tcp_wmem = 4096 65536 134217728
net.core.netdev_max_backlog = 5000
net.ipv4.tcp_congestion_control = bbr
net.ipv4.tcp_slow_start_after_idle = 0
net.ipv4.tcp_tw_reuse = 1
EOF
    
    # 应用设置
    sysctl -p /etc/sysctl.d/99-docker-swarm.conf
    
    # 优化 Docker 网络设置
    cat > /etc/docker/daemon.json << 'EOF'
{
  "storage-driver": "overlay2",
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  },
  "default-address-pools": [
    {
      "base": "172.30.0.0/16",
      "size": 24
    }
  ],
  "bip": "172.26.0.1/16",
  "mtu": 1450,
  "experimental": true,
  "metrics-addr": "0.0.0.0:9323"
}
EOF
    
    echo "Network performance optimization completed"
}

# 网络监控设置
setup_network_monitoring() {
    echo "Setting up network monitoring..."
    
    # 部署网络监控服务
    docker service create \
        --name network-monitor \
        --network monitoring-mgmt \
        --mount type=bind,source=/proc,target=/host/proc,readonly \
        --mount type=bind,source=/sys,target=/host/sys,readonly \
        --mount type=bind,source=/,target=/rootfs,readonly \
        --publish 9100:9100 \
        prom/node-exporter:latest \
        --path.procfs=/host/proc \
        --path.sysfs=/host/sys \
        --collector.filesystem.ignored-mount-points="^/(sys|proc|dev|host|etc)($|/)"
    
    # 部署网络流量分析
    docker service create \
        --name traffic-analyzer \
        --network monitoring-mgmt \
        --cap-add NET_ADMIN \
        --cap-add SYS_ADMIN \
        --mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock \
        --publish 8080:8080 \
        ntopng/ntopng:latest
    
    echo "Network monitoring setup completed"
}

# 主函数
main() {
    case "$1" in
        "create")
            create_network_architecture
            ;;
        "policies")
            configure_network_policies
            ;;
        "optimize")
            optimize_network_performance
            ;;
        "monitor")
            setup_network_monitoring
            ;;
        "all")
            create_network_architecture
            configure_network_policies
            optimize_network_performance
            setup_network_monitoring
            ;;
        *)
            echo "Usage: $0 {create|policies|optimize|monitor|all}"
            exit 1
            ;;
    esac
}

main "$@"

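该脚本的典型调用方式如下(在管理节点上执行):

```bash
# 一次性完成网络创建、策略配置、内核优化与监控部署
chmod +x advanced-networking.sh
sudo ./advanced-networking.sh all

# 确认各层 overlay 网络已创建
docker network ls --filter driver=overlay
```
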
2. 生产环境最佳实践

2.1 环境管理

多环境配置管理

#!/bin/bash
# environment-manager.sh

ENV_DIR="/opt/swarm-environments"
CONFIG_DIR="$ENV_DIR/configs"
SECRETS_DIR="$ENV_DIR/secrets"

# 环境配置
declare -A ENVIRONMENTS=(
    ["dev"]="development"
    ["staging"]="staging"
    ["prod"]="production"
)

# 初始化环境目录
init_environment_structure() {
    echo "Initializing environment structure..."
    
    mkdir -p $ENV_DIR/{configs,secrets,compose,scripts}
    
    for env in "${!ENVIRONMENTS[@]}"; do
        mkdir -p $CONFIG_DIR/$env
        mkdir -p $SECRETS_DIR/$env
        mkdir -p $ENV_DIR/compose/$env
    done
    
    echo "Environment structure initialized"
}

# 创建环境配置
create_environment_config() {
    local env=$1
    local config_file="$CONFIG_DIR/$env/config.env"
    
    case $env in
        "dev")
            cat > $config_file << 'EOF'
# Development Environment Configuration
ENVIRONMENT=development
DEBUG=true
LOG_LEVEL=debug
REPLICAS_WEB=1
REPLICAS_APP=1
REPLICAS_DB=1
CPU_LIMIT_WEB=0.5
MEMORY_LIMIT_WEB=512m
CPU_LIMIT_APP=1.0
MEMORY_LIMIT_APP=1g
CPU_LIMIT_DB=1.0
MEMORY_LIMIT_DB=2g
HEALTH_CHECK_INTERVAL=30s
HEALTH_CHECK_TIMEOUT=10s
HEALTH_CHECK_RETRIES=3
UPDATE_PARALLELISM=1
UPDATE_DELAY=10s
EOF
            ;;
        "staging")
            cat > $config_file << 'EOF'
# Staging Environment Configuration
ENVIRONMENT=staging
DEBUG=false
LOG_LEVEL=info
REPLICAS_WEB=2
REPLICAS_APP=2
REPLICAS_DB=1
CPU_LIMIT_WEB=0.5
MEMORY_LIMIT_WEB=512m
CPU_LIMIT_APP=1.0
MEMORY_LIMIT_APP=1g
CPU_LIMIT_DB=2.0
MEMORY_LIMIT_DB=4g
HEALTH_CHECK_INTERVAL=30s
HEALTH_CHECK_TIMEOUT=10s
HEALTH_CHECK_RETRIES=3
UPDATE_PARALLELISM=1
UPDATE_DELAY=30s
EOF
            ;;
        "prod")
            cat > $config_file << 'EOF'
# Production Environment Configuration
ENVIRONMENT=production
DEBUG=false
LOG_LEVEL=warn
REPLICAS_WEB=3
REPLICAS_APP=4
REPLICAS_DB=2
CPU_LIMIT_WEB=1.0
MEMORY_LIMIT_WEB=1g
CPU_LIMIT_APP=2.0
MEMORY_LIMIT_APP=2g
CPU_LIMIT_DB=4.0
MEMORY_LIMIT_DB=8g
HEALTH_CHECK_INTERVAL=30s
HEALTH_CHECK_TIMEOUT=10s
HEALTH_CHECK_RETRIES=5
UPDATE_PARALLELISM=2
UPDATE_DELAY=60s
EOF
            ;;
    esac
    
    echo "Configuration created for $env environment"
}

# 部署到指定环境
deploy_to_environment() {
    local env=$1
    local stack_name=$2
    local compose_file="$ENV_DIR/compose/$env/docker-compose.yml"
    local config_file="$CONFIG_DIR/$env/config.env"
    
    if [[ ! -f $compose_file ]]; then
        echo "Error: Compose file not found for $env environment"
        return 1
    fi
    
    if [[ ! -f $config_file ]]; then
        echo "Error: Config file not found for $env environment"
        return 1
    fi
    
    echo "Deploying $stack_name to $env environment..."
    
    # 加载环境配置
    source $config_file
    
    # 验证必要的密钥
    validate_secrets $env
    
    # 部署堆栈
    docker stack deploy \
        --compose-file $compose_file \
        --with-registry-auth \
        $stack_name-$env
    
    echo "Deployment to $env completed"
}

# 验证密钥
validate_secrets() {
    local env=$1
    local secrets_file="$SECRETS_DIR/$env/secrets.txt"
    
    if [[ ! -f $secrets_file ]]; then
        echo "Warning: Secrets file not found for $env"
        return 1
    fi
    
    echo "Validating secrets for $env environment..."
    
    while IFS= read -r secret_name; do
        if ! docker secret inspect "${secret_name}-${env}" > /dev/null 2>&1; then
            echo "Error: Secret ${secret_name}-${env} not found"
            return 1
        fi
    done < $secrets_file
    
    echo "All secrets validated for $env"
}

# 环境健康检查
health_check_environment() {
    local env=$1
    local stack_name=$2
    
    echo "Performing health check for $env environment..."
    
    # 检查服务状态
    echo "Service Status:"
    docker service ls --filter "label=com.docker.stack.namespace=$stack_name-$env"
    
    # 检查任务状态
    echo "\nTask Status:"
    for service in $(docker service ls --filter "label=com.docker.stack.namespace=$stack_name-$env" --format '{{.Name}}'); do
        echo "Service: $service"
        docker service ps $service --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Error}}"
        echo
    done
    
    # 检查网络连通性
    echo "Network Connectivity:"
    test_network_connectivity $env
    
    # 检查资源使用
    echo "Resource Usage:"
    docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
}

# 测试网络连通性
test_network_connectivity() {
    local env=$1
    
    # 这里可以添加具体的网络连通性测试
    echo "Testing network connectivity for $env..."
    
    # 示例:测试服务间连通性
    local test_container=$(docker ps --filter "label=com.docker.stack.namespace=myapp-$env" --format '{{.Names}}' | head -1)
    
    if [[ -n $test_container ]]; then
        echo "Testing from container: $test_container"
        docker exec $test_container ping -c 3 google.com > /dev/null 2>&1
        if [[ $? -eq 0 ]]; then
            echo "✓ External connectivity OK"
        else
            echo "✗ External connectivity failed"
        fi
    fi
}

# 环境切换
switch_environment() {
    local from_env=$1
    local to_env=$2
    local stack_name=$3
    
    echo "Switching from $from_env to $to_env environment..."
    
    # 备份当前环境状态
    backup_environment_state $from_env $stack_name
    
    # 部署到新环境
    deploy_to_environment $to_env $stack_name
    
    # 验证新环境
    sleep 30
    health_check_environment $to_env $stack_name
    
    echo "Environment switch completed"
}

# 备份环境状态
backup_environment_state() {
    local env=$1
    local stack_name=$2
    local backup_dir="$ENV_DIR/backups/$env/$(date +%Y%m%d_%H%M%S)"
    
    mkdir -p $backup_dir
    
    echo "Backing up $env environment state..."
    
    # 备份服务配置
    for service in $(docker service ls --filter "label=com.docker.stack.namespace=$stack_name-$env" --format '{{.Name}}'); do
        docker service inspect $service > "$backup_dir/${service}.json"
    done
    
    # 备份网络配置
    for network in $(docker network ls --filter "label=com.docker.stack.namespace=$stack_name-$env" --format '{{.Name}}'); do
        docker network inspect $network > "$backup_dir/${network}-network.json"
    done
    
    echo "Environment state backed up to: $backup_dir"
}

# 主函数
main() {
    case "$1" in
        "init")
            init_environment_structure
            for env in "${!ENVIRONMENTS[@]}"; do
                create_environment_config $env
            done
            ;;
        "deploy")
            if [[ $# -lt 3 ]]; then
                echo "Usage: $0 deploy <environment> <stack-name>"
                exit 1
            fi
            deploy_to_environment $2 $3
            ;;
        "health")
            if [[ $# -lt 3 ]]; then
                echo "Usage: $0 health <environment> <stack-name>"
                exit 1
            fi
            health_check_environment $2 $3
            ;;
        "switch")
            if [[ $# -lt 4 ]]; then
                echo "Usage: $0 switch <from-env> <to-env> <stack-name>"
                exit 1
            fi
            switch_environment $2 $3 $4
            ;;
        "backup")
            if [[ $# -lt 3 ]]; then
                echo "Usage: $0 backup <environment> <stack-name>"
                exit 1
            fi
            backup_environment_state $2 $3
            ;;
        *)
            echo "Usage: $0 {init|deploy|health|switch|backup}"
            echo "  init                           - Initialize environment structure"
            echo "  deploy <env> <stack>           - Deploy to environment"
            echo "  health <env> <stack>           - Health check environment"
            echo "  switch <from> <to> <stack>     - Switch between environments"
            echo "  backup <env> <stack>           - Backup environment state"
            exit 1
            ;;
    esac
}

main "$@"

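以下是该脚本的一个典型使用流程示例(堆栈名 myapp 仅作演示):

```bash
# 初始化目录结构并生成 dev / staging / prod 三套配置
./environment-manager.sh init

# 部署到预发布环境并执行健康检查
./environment-manager.sh deploy staging myapp
./environment-manager.sh health staging myapp

# 验证通过后部署到生产环境
./environment-manager.sh deploy prod myapp
```
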
2.2 CI/CD 集成

GitLab CI 配置

# .gitlab-ci.yml
stages:
  - build
  - test
  - security
  - deploy-dev
  - deploy-staging
  - deploy-production

variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  REGISTRY: $CI_REGISTRY
  IMAGE_NAME: $CI_REGISTRY_IMAGE
  SWARM_MANAGER: $SWARM_MANAGER_HOST

before_script:
  - docker info
  - echo $CI_REGISTRY_PASSWORD | docker login -u $CI_REGISTRY_USER --password-stdin $CI_REGISTRY

# 构建阶段
build:
  stage: build
  services:
    - docker:20.10.16-dind
  script:
    - docker build -t $IMAGE_NAME:$CI_COMMIT_SHA .
    - docker build -t $IMAGE_NAME:latest .
    - docker push $IMAGE_NAME:$CI_COMMIT_SHA
    - docker push $IMAGE_NAME:latest
  only:
    - main
    - develop
    - /^release\/.*$/

# 测试阶段
unit-tests:
  stage: test
  services:
    - docker:20.10.16-dind
  script:
    - docker run --rm $IMAGE_NAME:$CI_COMMIT_SHA npm test
  coverage: '/Coverage: \d+\.\d+%/'
  artifacts:
    reports:
      coverage_report:
        coverage_format: cobertura
        path: coverage/cobertura-coverage.xml
  only:
    - main
    - develop
    - merge_requests

integration-tests:
  stage: test
  services:
    - docker:20.10.16-dind
    - postgres:13
    - redis:6
  variables:
    POSTGRES_DB: testdb
    POSTGRES_USER: testuser
    POSTGRES_PASSWORD: testpass
    DATABASE_URL: postgresql://testuser:testpass@postgres:5432/testdb
    REDIS_URL: redis://redis:6379
  script:
    - docker run --rm --network host $IMAGE_NAME:$CI_COMMIT_SHA npm run test:integration
  only:
    - main
    - develop

# 安全扫描
security-scan:
  stage: security
  services:
    - docker:20.10.16-dind
  script:
    # 使用 Trivy 进行镜像安全扫描
    - |
      docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \
        aquasec/trivy:latest image --exit-code 1 --severity HIGH,CRITICAL \
        $IMAGE_NAME:$CI_COMMIT_SHA
  allow_failure: true
  artifacts:
    reports:
      container_scanning: gl-container-scanning-report.json
  only:
    - main
    - develop

# 开发环境部署
deploy-dev:
  stage: deploy-dev
  environment:
    name: development
    url: https://dev.myapp.com
  script:
    - apk add --no-cache openssh-client
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $SWARM_MANAGER >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
    - |
      ssh $SWARM_USER@$SWARM_MANAGER << EOF
        export IMAGE_TAG=$CI_COMMIT_SHA
        export ENVIRONMENT=development
        cd /opt/swarm-deployments
        ./deploy.sh dev myapp \$IMAGE_TAG
      EOF
  only:
    - develop
  when: manual

# 预发布环境部署
deploy-staging:
  stage: deploy-staging
  environment:
    name: staging
    url: https://staging.myapp.com
  script:
    - apk add --no-cache openssh-client
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $SWARM_MANAGER >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
    - |
      ssh $SWARM_USER@$SWARM_MANAGER << EOF
        export IMAGE_TAG=$CI_COMMIT_SHA
        export ENVIRONMENT=staging
        cd /opt/swarm-deployments
        ./deploy.sh staging myapp \$IMAGE_TAG
        ./health-check.sh staging myapp
      EOF
  only:
    - main
  when: manual

# 生产环境部署
deploy-production:
  stage: deploy-production
  environment:
    name: production
    url: https://myapp.com
  script:
    - apk add --no-cache openssh-client
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $SWARM_MANAGER >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
    - |
      ssh $SWARM_USER@$SWARM_MANAGER << EOF
        export IMAGE_TAG=$CI_COMMIT_SHA
        export ENVIRONMENT=production
        cd /opt/swarm-deployments
        # 创建部署前备份
        ./backup.sh production myapp
        # 执行蓝绿部署
        ./blue-green-deploy.sh production myapp \$IMAGE_TAG
        # 验证部署
        ./health-check.sh production myapp
        # 如果验证失败,自动回滚
        if [ \$? -ne 0 ]; then
          echo "Health check failed, rolling back..."
          ./rollback.sh production myapp
          exit 1
        fi
      EOF
  only:
    - /^release\/.*$/
  when: manual
  allow_failure: false

# 部署后清理
cleanup:
  stage: .post
  script:
    - docker system prune -f
  when: always

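上面的流水线通过 SSH 在 Swarm 管理节点上调用 /opt/swarm-deployments/deploy.sh 等脚本,这些脚本需要预先放置在管理节点上。下面给出 deploy.sh 的一个最小化示意实现(参数约定沿用上文,Compose 文件中使用 ${IMAGE_TAG} 占位符是本示例的假设,仅供参考):

```bash
#!/bin/bash
# /opt/swarm-deployments/deploy.sh : 最小化示意脚本
# 用法: ./deploy.sh <environment> <stack-name> <image-tag>
set -euo pipefail

ENVIRONMENT=$1
STACK_NAME=$2
IMAGE_TAG=$3

# 假设各环境的 Compose 文件按前文 environment-manager.sh 的目录约定存放
COMPOSE_FILE="/opt/swarm-environments/compose/${ENVIRONMENT}/docker-compose.yml"

# 将镜像标签导出为环境变量,供 Compose 文件中的 ${IMAGE_TAG} 占位符使用(假设)
export IMAGE_TAG

docker stack deploy \
    --compose-file "$COMPOSE_FILE" \
    --with-registry-auth \
    "${STACK_NAME}-${ENVIRONMENT}"
```
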
蓝绿部署脚本

#!/bin/bash
# blue-green-deploy.sh

ENVIRONMENT=$1
STACK_NAME=$2
IMAGE_TAG=$3

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <environment> <stack-name> <image-tag>"
    exit 1
fi

BLUE_STACK="${STACK_NAME}-${ENVIRONMENT}-blue"
GREEN_STACK="${STACK_NAME}-${ENVIRONMENT}-green"
CURRENT_STACK="${STACK_NAME}-${ENVIRONMENT}"
LOAD_BALANCER_CONFIG="/etc/nginx/conf.d/${STACK_NAME}-${ENVIRONMENT}.conf"

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}

# 检查当前活跃的堆栈
get_active_stack() {
    if docker stack ls | grep -q "$BLUE_STACK"; then
        echo "blue"
    elif docker stack ls | grep -q "$GREEN_STACK"; then
        echo "green"
    else
        echo "none"
    fi
}

# 获取非活跃的堆栈
get_inactive_stack() {
    local active=$(get_active_stack)
    case $active in
        "blue") echo "green" ;;
        "green") echo "blue" ;;
        "none") echo "blue" ;;
    esac
}

# 部署到非活跃环境
deploy_to_inactive() {
    local inactive_color=$(get_inactive_stack)
    local inactive_stack="${STACK_NAME}-${ENVIRONMENT}-${inactive_color}"
    
    log "Deploying to inactive environment: $inactive_color"
    
    # 准备 compose 文件
    local compose_file="/tmp/docker-compose-${inactive_color}.yml"
    
    # 生成带有新镜像标签的 compose 文件
    envsubst < "/opt/swarm-environments/compose/${ENVIRONMENT}/docker-compose.yml" > $compose_file
    
    # 替换镜像标签
    sed -i "s|image: .*:latest|image: ${CI_REGISTRY_IMAGE}:${IMAGE_TAG}|g" $compose_file
    
    # 部署堆栈
    docker stack deploy --compose-file $compose_file $inactive_stack
    
    log "Deployment to $inactive_color completed"
    
    # 等待服务启动
    wait_for_services $inactive_stack
    
    return $?
}

# 等待服务启动
wait_for_services() {
    local stack_name=$1
    local max_wait=300  # 5分钟
    local wait_time=0
    
    log "Waiting for services in $stack_name to be ready..."
    
    while [ $wait_time -lt $max_wait ]; do
        local services_ready=true
        
        for service in $(docker service ls --filter "label=com.docker.stack.namespace=$stack_name" --format '{{.Name}}'); do
            local replicas=$(docker service ls --filter "name=$service" --format '{{.Replicas}}')
            local desired=$(echo $replicas | cut -d'/' -f2)
            local running=$(echo $replicas | cut -d'/' -f1)
            
            if [[ "$running" != "$desired" ]]; then
                services_ready=false
                break
            fi
        done
        
        if $services_ready; then
            log "All services are ready"
            return 0
        fi
        
        sleep 10
        wait_time=$((wait_time + 10))
        log "Waiting... ($wait_time/$max_wait seconds)"
    done
    
    log "Timeout waiting for services to be ready"
    return 1
}

# 健康检查
health_check() {
    local stack_name=$1
    local health_check_url="http://localhost/health"
    
    log "Performing health check for $stack_name"
    
    # 获取服务端口
    local web_service="${stack_name}_web"
    local port=$(docker service inspect $web_service --format '{{range .Endpoint.Ports}}{{.PublishedPort}}{{end}}')
    
    if [[ -n $port ]]; then
        health_check_url="http://localhost:$port/health"
    fi
    
    # 执行健康检查
    local max_attempts=10
    local attempt=1
    
    while [ $attempt -le $max_attempts ]; do
        if curl -f -s $health_check_url > /dev/null; then
            log "Health check passed for $stack_name"
            return 0
        fi
        
        log "Health check attempt $attempt/$max_attempts failed"
        sleep 10
        attempt=$((attempt + 1))
    done
    
    log "Health check failed for $stack_name"
    return 1
}

# 切换流量
switch_traffic() {
    local new_active_color=$1
    local new_active_stack="${STACK_NAME}-${ENVIRONMENT}-${new_active_color}"
    
    log "Switching traffic to $new_active_color environment"
    
    # 获取新环境的服务端口
    local web_service="${new_active_stack}_web"
    local new_port=$(docker service inspect $web_service --format '{{range .Endpoint.Ports}}{{.PublishedPort}}{{end}}')
    
    if [[ -z $new_port ]]; then
        log "Error: Could not get port for $web_service"
        return 1
    fi
    
    # 更新负载均衡器配置
    cat > $LOAD_BALANCER_CONFIG << EOF
upstream ${STACK_NAME}_${ENVIRONMENT} {
    server localhost:$new_port;
}

server {
    listen 80;
    server_name ${STACK_NAME}-${ENVIRONMENT}.local;
    
    location / {
        proxy_pass http://${STACK_NAME}_${ENVIRONMENT};
        proxy_set_header Host \$host;
        proxy_set_header X-Real-IP \$remote_addr;
        proxy_set_header X-Forwarded-For \$proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto \$scheme;
    }
    
    location /health {
        proxy_pass http://${STACK_NAME}_${ENVIRONMENT}/health;
    }
}
EOF
    
    # 重新加载 Nginx 配置
    nginx -t && nginx -s reload
    
    if [[ $? -eq 0 ]]; then
        log "Traffic switched to $new_active_color successfully"
        return 0
    else
        log "Failed to switch traffic to $new_active_color"
        return 1
    fi
}

# 清理旧环境
cleanup_old_environment() {
    local old_color=$1
    local old_stack="${STACK_NAME}-${ENVIRONMENT}-${old_color}"
    
    log "Cleaning up old environment: $old_color"
    
    # 等待一段时间确保流量已切换
    sleep 30
    
    # 删除旧堆栈
    docker stack rm $old_stack
    
    log "Old environment $old_color cleaned up"
}

# 回滚函数
rollback() {
    local current_active=$(get_active_stack)
    
    if [[ $current_active == "none" ]]; then
        log "No active environment to rollback to"
        return 1
    fi
    
    log "Rolling back to $current_active environment"
    
    # 切换回原来的环境
    switch_traffic $current_active
    
    return $?
}

# 主部署流程
main() {
    log "Starting blue-green deployment for $STACK_NAME in $ENVIRONMENT"
    log "Image tag: $IMAGE_TAG"
    
    local active_color=$(get_active_stack)
    local inactive_color=$(get_inactive_stack)
    
    log "Current active environment: $active_color"
    log "Deploying to inactive environment: $inactive_color"
    
    # 部署到非活跃环境
    if ! deploy_to_inactive; then
        log "Deployment failed"
        exit 1
    fi
    
    # 健康检查
    local inactive_stack="${STACK_NAME}-${ENVIRONMENT}-${inactive_color}"
    if ! health_check $inactive_stack; then
        log "Health check failed, cleaning up failed deployment"
        docker stack rm $inactive_stack
        exit 1
    fi
    
    # 切换流量
    if ! switch_traffic $inactive_color; then
        log "Traffic switch failed, rolling back"
        docker stack rm $inactive_stack
        exit 1
    fi
    
    # 清理旧环境
    if [[ $active_color != "none" ]]; then
        cleanup_old_environment $active_color
    fi
    
    log "Blue-green deployment completed successfully"
    log "New active environment: $inactive_color"
}

# 执行主函数
main

3. 高可用性和灾难恢复

3.1 高可用性配置

多区域集群设置

#!/bin/bash
# multi-region-setup.sh

# 多区域集群配置
REGIONS=("us-east-1" "us-west-2" "eu-west-1")
MANAGER_NODES_PER_REGION=1
WORKER_NODES_PER_REGION=2

# 集群配置
setup_multi_region_cluster() {
    echo "Setting up multi-region Docker Swarm cluster..."
    
    # 主区域初始化
    local primary_region="${REGIONS[0]}"
    echo "Initializing primary region: $primary_region"
    
    # 在主区域初始化 Swarm
    docker swarm init --advertise-addr $(get_primary_ip)
    
    # 获取 join tokens
    local manager_token=$(docker swarm join-token manager -q)
    local worker_token=$(docker swarm join-token worker -q)
    
    # 设置主区域节点标签
    docker node update --label-add region=$primary_region $(hostname)
    docker node update --label-add zone=${primary_region}a $(hostname)
    
    # 在其他区域添加节点
    for region in "${REGIONS[@]:1}"; do
        setup_region_nodes $region $manager_token $worker_token
    done
    
    # 配置跨区域网络
    setup_cross_region_networking
    
    echo "Multi-region cluster setup completed"
}

# 设置区域节点
setup_region_nodes() {
    local region=$1
    local manager_token=$2
    local worker_token=$3
    
    echo "Setting up nodes in region: $region"
    
    # 这里需要根据实际的基础设施自动化工具来实现
    # 例如使用 Terraform、AWS CLI、Azure CLI 等
    
    # 示例:使用 AWS CLI 创建实例
    create_aws_instances $region $manager_token $worker_token
}

# 创建 AWS 实例(示例)
create_aws_instances() {
    local region=$1
    local manager_token=$2
    local worker_token=$3
    
    # 管理节点
    for i in $(seq 1 $MANAGER_NODES_PER_REGION); do
        local instance_name="swarm-manager-${region}-${i}"
        
        aws ec2 run-instances \
            --region $region \
            --image-id ami-0abcdef1234567890 \
            --instance-type t3.medium \
            --key-name my-key-pair \
            --security-group-ids sg-12345678 \
            --subnet-id subnet-12345678 \
            --user-data "$(generate_manager_userdata $manager_token $region)" \
            --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$instance_name},{Key=Role,Value=manager},{Key=Region,Value=$region}]"
    done
    
    # 工作节点
    for i in $(seq 1 $WORKER_NODES_PER_REGION); do
        local instance_name="swarm-worker-${region}-${i}"
        
        aws ec2 run-instances \
            --region $region \
            --image-id ami-0abcdef1234567890 \
            --instance-type t3.large \
            --key-name my-key-pair \
            --security-group-ids sg-12345678 \
            --subnet-id subnet-12345678 \
            --user-data "$(generate_worker_userdata $worker_token $region)" \
            --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$instance_name},{Key=Role,Value=worker},{Key=Region,Value=$region}]"
    done
}

# 生成管理节点用户数据
generate_manager_userdata() {
    local manager_token=$1
    local region=$2
    
    cat << EOF
#!/bin/bash
# 安装 Docker
curl -fsSL https://get.docker.com -o get-docker.sh
sh get-docker.sh
systemctl enable docker
systemctl start docker

# 加入 Swarm 作为管理节点
docker swarm join --token $manager_token $(get_primary_ip):2377

# 设置节点标签
sleep 30
docker node update --label-add region=$region \$(hostname)
docker node update --label-add zone=${region}a \$(hostname)
docker node update --label-add role=manager \$(hostname)
EOF
}

# 生成工作节点用户数据
generate_worker_userdata() {
    local worker_token=$1
    local region=$2
    
    cat << EOF
#!/bin/bash
# 安装 Docker
curl -fsSL https://get.docker.com -o get-docker.sh
sh get-docker.sh
systemctl enable docker
systemctl start docker

# 加入 Swarm 作为工作节点
docker swarm join --token $worker_token $(get_primary_ip):2377

# 设置节点标签(需要在管理节点上执行)
# 这部分通常通过配置管理工具来完成
EOF
}

# 设置跨区域网络
setup_cross_region_networking() {
    echo "Setting up cross-region networking..."
    
    # 创建全局网络
    docker network create \
        --driver overlay \
        --subnet 10.10.0.0/16 \
        --opt encrypted=true \
        global-network
    
    # 创建区域特定网络
    for region in "${REGIONS[@]}"; do
        docker network create \
            --driver overlay \
            --subnet 10.$(get_region_subnet $region).0.0/24 \
            --opt encrypted=true \
            ${region}-network
    done
    
    echo "Cross-region networking setup completed"
}

# 获取区域子网
get_region_subnet() {
    local region=$1
    case $region in
        "us-east-1") echo "11" ;;
        "us-west-2") echo "12" ;;
        "eu-west-1") echo "13" ;;
        *) echo "20" ;;
    esac
}

# 获取主节点 IP
get_primary_ip() {
    # 这里需要根据实际环境获取主节点的 IP
    hostname -I | awk '{print $1}'
}

# 主函数
main() {
    case "$1" in
        "setup")
            setup_multi_region_cluster
            ;;
        "status")
            echo "Cluster Status:"
            docker node ls
            echo "\nNetworks:"
            docker network ls
            ;;
        *)
            echo "Usage: $0 {setup|status}"
            exit 1
            ;;
    esac
}

main "$@"

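多区域集群搭建完成后,建议确认管理节点数量为奇数且分布在不同区域,以维持 Raft 仲裁。下面是一个简单的检查示例:

```bash
# 列出管理节点及其状态
docker node ls --filter "role=manager" --format 'table {{.Hostname}}\t{{.ManagerStatus}}'

# 逐个查看管理节点的 region 标签(由上文脚本设置)
for node in $(docker node ls --filter "role=manager" --format '{{.Hostname}}'); do
    echo -n "$node: "
    docker node inspect $node --format '{{index .Spec.Labels "region"}}'
done
```
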
3.2 灾难恢复策略

自动备份和恢复系统

#!/bin/bash
# disaster-recovery.sh

BACKUP_DIR="/opt/swarm-backups"
S3_BUCKET="my-swarm-backups"
RETENTION_DAYS=30
LOG_FILE="/var/log/swarm-backup.log"

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a $LOG_FILE
}

# 创建完整集群备份
create_full_backup() {
    local backup_timestamp=$(date +%Y%m%d_%H%M%S)
    local backup_path="$BACKUP_DIR/full_backup_$backup_timestamp"
    
    log "Starting full cluster backup..."
    
    mkdir -p $backup_path
    
    # 备份 Swarm 状态
    backup_swarm_state $backup_path
    
    # 备份服务配置
    backup_services $backup_path
    
    # 备份网络配置
    backup_networks $backup_path
    
    # 备份卷数据
    backup_volumes $backup_path
    
    # 备份密钥(加密)
    backup_secrets $backup_path
    
    # 备份配置文件
    backup_configs $backup_path
    
    # 创建备份清单
    create_backup_manifest $backup_path
    
    # 压缩备份
    compress_backup $backup_path
    
    # 上传到云存储
    upload_to_cloud $backup_path
    
    log "Full cluster backup completed: $backup_path"
}

# 备份 Swarm 状态
backup_swarm_state() {
    local backup_path=$1
    
    log "Backing up Swarm state..."
    
    # 节点信息
    docker node ls --format json > "$backup_path/nodes.json"
    
    # 详细节点信息
    mkdir -p "$backup_path/nodes"
    for node in $(docker node ls --format '{{.Hostname}}'); do
        docker node inspect $node > "$backup_path/nodes/${node}.json"
    done
    
    # Swarm 信息
    docker info --format json > "$backup_path/swarm-info.json"
    
    log "Swarm state backup completed"
}

# 备份服务配置
backup_services() {
    local backup_path=$1
    
    log "Backing up services..."
    
    mkdir -p "$backup_path/services"
    
    # 服务列表
    docker service ls --format json > "$backup_path/services.json"
    
    # 详细服务配置
    for service in $(docker service ls --format '{{.Name}}'); do
        docker service inspect $service > "$backup_path/services/${service}.json"
    done
    
    log "Services backup completed"
}

# 备份网络配置
backup_networks() {
    local backup_path=$1
    
    log "Backing up networks..."
    
    mkdir -p "$backup_path/networks"
    
    # 网络列表
    docker network ls --format json > "$backup_path/networks.json"
    
    # 详细网络配置
    for network in $(docker network ls --format '{{.Name}}'); do
        if [[ $network != "bridge" && $network != "host" && $network != "none" ]]; then
            docker network inspect $network > "$backup_path/networks/${network}.json"
        fi
    done
    
    log "Networks backup completed"
}

# 备份卷数据
backup_volumes() {
    local backup_path=$1
    
    log "Backing up volumes..."
    
    mkdir -p "$backup_path/volumes"
    
    # 卷列表
    docker volume ls --format json > "$backup_path/volumes.json"
    
    # 备份卷数据
    for volume in $(docker volume ls --format '{{.Name}}'); do
        log "Backing up volume: $volume"
        
        # 获取卷挂载点
        local mountpoint=$(docker volume inspect $volume --format '{{.Mountpoint}}')
        
        if [[ -d $mountpoint ]]; then
            # 创建卷数据备份
            tar -czf "$backup_path/volumes/${volume}.tar.gz" -C "$mountpoint" .
            
            # 保存卷元数据
            docker volume inspect $volume > "$backup_path/volumes/${volume}.json"
        fi
    done
    
    log "Volumes backup completed"
}

# 备份密钥(加密)
backup_secrets() {
    local backup_path=$1
    
    log "Backing up secrets..."
    
    mkdir -p "$backup_path/secrets"
    
    # 密钥列表(不包含实际内容)
    docker secret ls --format json > "$backup_path/secrets.json"
    
    # 密钥元数据
    for secret in $(docker secret ls --format '{{.Name}}'); do
        docker secret inspect $secret > "$backup_path/secrets/${secret}.json"
    done
    
    log "Secrets metadata backup completed"
    log "WARNING: Secret values are not backed up for security reasons"
}

# 备份配置
backup_configs() {
    local backup_path=$1
    
    log "Backing up configs..."
    
    mkdir -p "$backup_path/configs"
    
    # 配置列表
    docker config ls --format json > "$backup_path/configs.json"
    
    # 配置内容
    for config in $(docker config ls --format '{{.Name}}'); do
        docker config inspect $config > "$backup_path/configs/${config}.json"
    done
    
    log "Configs backup completed"
}

# 创建备份清单
create_backup_manifest() {
    local backup_path=$1
    local manifest_file="$backup_path/manifest.json"
    
    log "Creating backup manifest..."
    
    cat > $manifest_file << EOF
{
  "backup_timestamp": "$(date -Iseconds)",
  "backup_type": "full",
  "swarm_info": {
    "cluster_id": "$(docker info --format '{{.Swarm.Cluster.ID}}')",
    "node_id": "$(docker info --format '{{.Swarm.NodeID}}')",
    "nodes_count": $(docker node ls --format '{{.Hostname}}' | wc -l),
    "services_count": $(docker service ls --format '{{.Name}}' | wc -l),
    "networks_count": $(docker network ls --filter driver=overlay --format '{{.Name}}' | wc -l),
    "volumes_count": $(docker volume ls --format '{{.Name}}' | wc -l)
  },
  "backup_components": [
    "swarm_state",
    "services",
    "networks",
    "volumes",
    "secrets_metadata",
    "configs"
  ],
  "backup_size": "$(du -sh $backup_path | cut -f1)",
  "backup_location": "$backup_path"
}
EOF
    
    log "Backup manifest created"
}

# 压缩备份
compress_backup() {
    local backup_path=$1
    local compressed_file="${backup_path}.tar.gz"
    
    log "Compressing backup..."
    
    tar -czf $compressed_file -C $(dirname $backup_path) $(basename $backup_path)
    
    # 验证压缩文件
    if [[ -f $compressed_file ]]; then
        log "Backup compressed successfully: $compressed_file"
        # 删除原始备份目录
        rm -rf $backup_path
    else
        log "ERROR: Failed to compress backup"
        return 1
    fi
}

# 上传到云存储
upload_to_cloud() {
    local backup_path=$1
    local compressed_file="${backup_path}.tar.gz"
    
    log "Uploading backup to cloud storage..."
    
    # 上传到 S3
    aws s3 cp $compressed_file s3://$S3_BUCKET/$(basename $compressed_file)
    
    if [[ $? -eq 0 ]]; then
        log "Backup uploaded successfully to S3"
    else
        log "ERROR: Failed to upload backup to S3"
        return 1
    fi
}

# 增量备份
create_incremental_backup() {
    local last_backup_timestamp=$1
    local backup_timestamp=$(date +%Y%m%d_%H%M%S)
    local backup_path="$BACKUP_DIR/incremental_backup_$backup_timestamp"
    
    log "Starting incremental backup since $last_backup_timestamp..."
    
    mkdir -p $backup_path
    
    # 备份自上次备份以来的变更
    backup_changed_services $backup_path $last_backup_timestamp
    backup_changed_volumes $backup_path $last_backup_timestamp
    
    # 创建增量备份清单
    create_incremental_manifest $backup_path $last_backup_timestamp
    
    # 压缩和上传
    compress_backup $backup_path
    upload_to_cloud $backup_path
    
    log "Incremental backup completed: $backup_path"
}

# 恢复集群
restore_cluster() {
    local backup_file=$1
    local restore_path="/tmp/swarm-restore-$(date +%Y%m%d_%H%M%S)"
    
    log "Starting cluster restore from: $backup_file"
    
    # 解压备份
    mkdir -p $restore_path
    tar -xzf $backup_file -C $restore_path
    
    local backup_dir=$(find $restore_path -maxdepth 1 -type d -name "full_backup_*" | head -1)
    
    if [[ ! -d $backup_dir ]]; then
        log "ERROR: Invalid backup file structure"
        return 1
    fi
    
    # 验证备份完整性
    if ! validate_backup $backup_dir; then
        log "ERROR: Backup validation failed"
        return 1
    fi
    
    # 恢复网络
    restore_networks $backup_dir
    
    # 恢复卷
    restore_volumes $backup_dir
    
    # 恢复密钥和配置
    restore_secrets_and_configs $backup_dir
    
    # 恢复服务
    restore_services $backup_dir
    
    log "Cluster restore completed successfully"
}

# 验证备份
validate_backup() {
    local backup_dir=$1
    local manifest_file="$backup_dir/manifest.json"
    
    log "Validating backup..."
    
    if [[ ! -f $manifest_file ]]; then
        log "ERROR: Backup manifest not found"
        return 1
    fi
    
    # 检查必要的备份组件
    local required_components=("services" "networks" "volumes" "configs")
    
    for component in "${required_components[@]}"; do
        if [[ ! -d "$backup_dir/$component" ]]; then
            log "ERROR: Missing backup component: $component"
            return 1
        fi
    done
    
    log "Backup validation passed"
    return 0
}

# 恢复网络
restore_networks() {
    local backup_dir=$1
    
    log "Restoring networks..."
    
    if [[ -f "$backup_dir/networks.json" ]]; then
        while IFS= read -r network_info; do
            local network_name=$(echo $network_info | jq -r '.Name')
            
            if [[ $network_name != "bridge" && $network_name != "host" && $network_name != "none" ]]; then
                local network_file="$backup_dir/networks/${network_name}.json"
                
                if [[ -f $network_file ]]; then
                    # 从备份文件重建网络
                    restore_single_network $network_file
                fi
            fi
        done < "$backup_dir/networks.json"
    fi
    
    log "Networks restoration completed"
}

# 恢复单个网络
restore_single_network() {
    local network_file=$1
    
    local network_config=$(cat $network_file)
    local network_name=$(echo $network_config | jq -r '.[0].Name')
    local driver=$(echo $network_config | jq -r '.[0].Driver')
    local subnet=$(echo $network_config | jq -r '.[0].IPAM.Config[0].Subnet // empty')
    local gateway=$(echo $network_config | jq -r '.[0].IPAM.Config[0].Gateway // empty')
    
    # 检查网络是否已存在
    if docker network inspect $network_name > /dev/null 2>&1; then
        log "Network $network_name already exists, skipping"
        return
    fi
    
    # 创建网络
    local create_cmd="docker network create --driver $driver"
    
    if [[ -n $subnet && $subnet != "null" ]]; then
        create_cmd="$create_cmd --subnet $subnet"
    fi
    
    if [[ -n $gateway && $gateway != "null" ]]; then
        create_cmd="$create_cmd --gateway $gateway"
    fi
    
    create_cmd="$create_cmd $network_name"
    
    eval $create_cmd
    
    if [[ $? -eq 0 ]]; then
        log "Network $network_name restored successfully"
    else
        log "ERROR: Failed to restore network $network_name"
    fi
}

# 恢复卷
restore_volumes() {
    local backup_dir=$1
    
    log "Restoring volumes..."
    
    if [[ -f "$backup_dir/volumes.json" ]]; then
        while IFS= read -r volume_info; do
            local volume_name=$(echo $volume_info | jq -r '.Name')
            local volume_file="$backup_dir/volumes/${volume_name}.json"
            local volume_data="$backup_dir/volumes/${volume_name}.tar.gz"
            
            if [[ -f $volume_file && -f $volume_data ]]; then
                restore_single_volume $volume_name $volume_file $volume_data
            fi
        done < "$backup_dir/volumes.json"
    fi
    
    log "Volumes restoration completed"
}

# 恢复单个卷
restore_single_volume() {
    local volume_name=$1
    local volume_file=$2
    local volume_data=$3
    
    # 检查卷是否已存在
    if docker volume inspect $volume_name > /dev/null 2>&1; then
        log "Volume $volume_name already exists, skipping"
        return
    fi
    
    # 创建卷
    docker volume create $volume_name
    
    if [[ $? -eq 0 ]]; then
        # 恢复卷数据
        local mountpoint=$(docker volume inspect $volume_name --format '{{.Mountpoint}}')
        
        if [[ -d $mountpoint ]]; then
            tar -xzf $volume_data -C $mountpoint
            log "Volume $volume_name restored successfully"
        else
            log "ERROR: Failed to get mountpoint for volume $volume_name"
        fi
    else
        log "ERROR: Failed to create volume $volume_name"
    fi
}

# 定期备份调度
schedule_backups() {
    log "Setting up backup schedule..."
    
    # 创建 cron 任务
    cat > /etc/cron.d/swarm-backup << 'EOF'
# Docker Swarm 自动备份
# 每天凌晨 2 点执行完整备份
0 2 * * * root /opt/scripts/disaster-recovery.sh full-backup

# 每 6 小时执行增量备份
0 */6 * * * root /opt/scripts/disaster-recovery.sh incremental-backup

# 每周日凌晨 1 点清理旧备份
0 1 * * 0 root /opt/scripts/disaster-recovery.sh cleanup
EOF
    
    # 重启 cron 服务
    systemctl restart cron
    
    log "Backup schedule configured"
}

# 清理旧备份
cleanup_old_backups() {
    log "Cleaning up old backups..."
    
    # 清理本地备份
    find $BACKUP_DIR -name "*.tar.gz" -mtime +$RETENTION_DAYS -delete
    
    # 清理云存储备份
    aws s3 ls s3://$S3_BUCKET/ | while read -r line; do
        local file_date=$(echo $line | awk '{print $1" "$2}')
        local file_name=$(echo $line | awk '{print $4}')
        
        if [[ -n $file_name ]]; then
            local file_age=$(( ($(date +%s) - $(date -d "$file_date" +%s)) / 86400 ))
            
            if [[ $file_age -gt $RETENTION_DAYS ]]; then
                aws s3 rm s3://$S3_BUCKET/$file_name
                log "Deleted old backup: $file_name"
            fi
        fi
    done
    
    log "Old backups cleanup completed"
}
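
# 补充(示意):S3 端的过期清理也可以交给存储桶生命周期规则,避免脚本逐个删除对象;
# 示例中的过期天数写死为 30,实际应与 RETENTION_DAYS 保持一致。
setup_s3_lifecycle() {
    aws s3api put-bucket-lifecycle-configuration \
        --bucket "$S3_BUCKET" \
        --lifecycle-configuration '{
            "Rules": [{
                "ID": "expire-old-swarm-backups",
                "Status": "Enabled",
                "Filter": {"Prefix": ""},
                "Expiration": {"Days": 30}
            }]
        }'
}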

# 主函数
main() {
    case "$1" in
        "full-backup")
            create_full_backup
            ;;
        "incremental-backup")
            local last_backup=$(ls -t $BACKUP_DIR/full_backup_*.tar.gz 2>/dev/null | head -1)
            if [[ -n $last_backup ]]; then
                local timestamp=$(basename $last_backup | sed 's/full_backup_\(.*\)\.tar\.gz/\1/')
                create_incremental_backup $timestamp
            else
                log "No previous backup found, creating full backup"
                create_full_backup
            fi
            ;;
        "restore")
            if [[ $# -lt 2 ]]; then
                echo "Usage: $0 restore <backup-file>"
                exit 1
            fi
            restore_cluster $2
            ;;
        "schedule")
            schedule_backups
            ;;
        "cleanup")
            cleanup_old_backups
            ;;
        *)
            echo "Usage: $0 {full-backup|incremental-backup|restore|schedule|cleanup}"
            echo "  full-backup        - Create full cluster backup"
            echo "  incremental-backup - Create incremental backup"
            echo "  restore <file>     - Restore cluster from backup"
            echo "  schedule           - Setup automatic backup schedule"
            echo "  cleanup            - Clean up old backups"
            exit 1
            ;;
    esac
}

main "$@"
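
以上灾难恢复脚本覆盖了完整备份、增量备份、恢复与调度。下面给出一段示意性的使用流程;脚本安装路径沿用 cron 配置中的 /opt/scripts/,备份文件名仅为示例:

```bash
# 部署脚本并赋予执行权限
install -m 0755 disaster-recovery.sh /opt/scripts/disaster-recovery.sh

# 先手动执行一次完整备份,确认流程可用
/opt/scripts/disaster-recovery.sh full-backup

# 配置自动备份调度(写入 /etc/cron.d/swarm-backup)
/opt/scripts/disaster-recovery.sh schedule

# 灾难发生后,从指定备份文件恢复集群(文件名为示例)
/opt/scripts/disaster-recovery.sh restore full_backup_20240101_020000.tar.gz
```

建议定期在隔离环境中演练 restore 流程,验证备份确实可恢复,而不是等故障发生时才第一次执行。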

## 4. 性能调优和容量规划

### 4.1 性能监控和分析

#### 综合性能监控脚本

````bash
#!/bin/bash
# performance-tuning.sh

MONITOR_DURATION=300  # 5分钟监控周期
REPORT_DIR="/opt/performance-reports"
THRESHOLD_CPU=80
THRESHOLD_MEMORY=85
THRESHOLD_DISK=90

# 性能数据收集
collect_performance_data() {
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local report_file="$REPORT_DIR/performance_$timestamp.json"
    
    echo "Collecting performance data..."
    
    mkdir -p $REPORT_DIR
    
    # 系统性能数据
    local system_data=$(cat << EOF
{
  "timestamp": "$(date -Iseconds)",
  "system": {
    "cpu": {
      "usage": $(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//'),
      "load_average": "$(uptime | awk -F'load average:' '{print $2}')",
      "cores": $(nproc)
    },
    "memory": {
      "total": $(free -m | awk 'NR==2{print $2}'),
      "used": $(free -m | awk 'NR==2{print $3}'),
      "free": $(free -m | awk 'NR==2{print $4}'),
      "usage_percent": $(free | awk 'NR==2{printf "%.2f", $3*100/$2}')
    },
    "disk": {
      "usage": $(df -h / | awk 'NR==2 {print $5}' | sed 's/%//'),
      "available": "$(df -h / | awk 'NR==2 {print $4}')"
    },
    "network": {
      "connections": $(netstat -an | wc -l),
      "tcp_established": $(netstat -an | grep ESTABLISHED | wc -l)
    }
  },
EOF
)
    
    # Docker 性能数据
    local docker_data=$(get_docker_performance_data)
    
    # Swarm 性能数据
    local swarm_data=$(get_swarm_performance_data)
    
    # 合并数据
    echo "$system_data" > $report_file
    echo "  \"docker\": $docker_data," >> $report_file
    echo "  \"swarm\": $swarm_data" >> $report_file
    echo "}" >> $report_file
    
    echo "Performance data collected: $report_file"
}

# 获取 Docker 性能数据
get_docker_performance_data() {
    local containers_count=$(docker ps -aq | wc -l)  # 所有容器(含已停止)
    local images_count=$(docker images -q | wc -l)
    local volumes_count=$(docker volume ls -q | wc -l)
    local networks_count=$(docker network ls -q | wc -l)
    
    # 容器资源使用统计
    local container_stats=$(docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}" | tail -n +2)
    
    cat << EOF
{
  "containers": {
    "total": $containers_count,
    "running": $(docker ps -q | wc -l),
    "stopped": $(docker ps -aq --filter status=exited | wc -l)
  },
  "images": {
    "total": $images_count,
    "size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | head -1)"
  },
  "volumes": {
    "total": $volumes_count
  },
  "networks": {
    "total": $networks_count
  },
  "system_usage": {
    "containers_size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | sed -n '2p')",
    "images_size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | head -1)",
    "volumes_size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | sed -n '3p')"
  }
}
EOF
}

# 获取 Swarm 性能数据
get_swarm_performance_data() {
    local nodes_count=$(docker node ls --format '{{.Hostname}}' | wc -l)
    local services_count=$(docker service ls --format '{{.Name}}' | wc -l)
    local tasks_running=0
    local tasks_failed=0
    
    # 统计任务状态
    for service in $(docker service ls --format '{{.Name}}'); do
        local service_tasks=$(docker service ps $service --format '{{.CurrentState}}')
        tasks_running=$((tasks_running + $(echo "$service_tasks" | grep -c "Running")))
        tasks_failed=$((tasks_failed + $(echo "$service_tasks" | grep -c "Failed")))
    done
    
    cat << EOF
{
  "cluster": {
    "nodes": {
      "total": $nodes_count,
      "managers": $(docker node ls --filter role=manager --format '{{.Hostname}}' | wc -l),
      "workers": $(docker node ls --filter role=worker --format '{{.Hostname}}' | wc -l),
      "active": $(docker node ls --filter availability=active --format '{{.Hostname}}' | wc -l)
    },
    "services": {
      "total": $services_count,
      "replicated": $(docker service ls --filter mode=replicated --format '{{.Name}}' | wc -l),
      "global": $(docker service ls --filter mode=global --format '{{.Name}}' | wc -l)
    },
    "tasks": {
      "running": $tasks_running,
      "failed": $tasks_failed
    }
  }
}
EOF
}
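
# 补充(示意):任务状态也可以一次性统计,减少循环中对 docker CLI 的调用次数,例如:
#   docker service ps $(docker service ls -q) --format '{{.CurrentState}}' | grep -c "^Running"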

# 性能分析和建议
analyze_performance() {
    local report_file=$1
    
    echo "Analyzing performance data..."
    
    if [[ ! -f $report_file ]]; then
        echo "Error: Report file not found: $report_file"
        return 1
    fi
    
    local cpu_usage=$(jq -r '.system.cpu.usage' $report_file | sed 's/%//')
    local memory_usage=$(jq -r '.system.memory.usage_percent' $report_file)
    local disk_usage=$(jq -r '.system.disk.usage' $report_file)
    
    echo "Performance Analysis Report"
    echo "==========================="
    echo "Timestamp: $(jq -r '.timestamp' $report_file)"
    echo
    
    # CPU 分析
    echo "CPU Analysis:"
    echo "  Usage: ${cpu_usage}%"
    if (( $(echo "$cpu_usage > $THRESHOLD_CPU" | bc -l) )); then
        echo "  ⚠️  HIGH CPU USAGE DETECTED!"
        echo "  Recommendations:"
        echo "    - Scale out services with high CPU usage"
        echo "    - Optimize application code"
        echo "    - Consider adding more worker nodes"
    else
        echo "  ✅ CPU usage is within normal range"
    fi
    echo
    
    # 内存分析
    echo "Memory Analysis:"
    echo "  Usage: ${memory_usage}%"
    if (( $(echo "$memory_usage > $THRESHOLD_MEMORY" | bc -l) )); then
        echo "  ⚠️  HIGH MEMORY USAGE DETECTED!"
        echo "  Recommendations:"
        echo "    - Increase memory limits for services"
        echo "    - Scale out memory-intensive services"
        echo "    - Add nodes with more memory"
    else
        echo "  ✅ Memory usage is within normal range"
    fi
    echo
    
    # 磁盘分析
    echo "Disk Analysis:"
    echo "  Usage: ${disk_usage}%"
    if (( disk_usage > THRESHOLD_DISK )); then
        echo "  ⚠️  HIGH DISK USAGE DETECTED!"
        echo "  Recommendations:"
        echo "    - Clean up unused Docker images and containers"
        echo "    - Implement log rotation"
        echo "    - Add more storage capacity"
    else
        echo "  ✅ Disk usage is within normal range"
    fi
    echo
    
    # Swarm 特定分析
    analyze_swarm_performance $report_file
}

# Swarm 性能分析
analyze_swarm_performance() {
    local report_file=$1
    
    echo "Swarm Cluster Analysis:"
    
    local total_nodes=$(jq -r '.swarm.cluster.nodes.total' $report_file)
    local active_nodes=$(jq -r '.swarm.cluster.nodes.active' $report_file)
    local failed_tasks=$(jq -r '.swarm.cluster.tasks.failed' $report_file)
    
    echo "  Nodes: $active_nodes/$total_nodes active"
    echo "  Failed tasks: $failed_tasks"
    
    if [[ $failed_tasks -gt 0 ]]; then
        echo "  ⚠️  FAILED TASKS DETECTED!"
        echo "  Recommendations:"
        echo "    - Check service logs for errors"
        echo "    - Verify resource availability"
        echo "    - Check node health"
    fi
    
    # 节点分布分析
    local manager_nodes=$(jq -r '.swarm.cluster.nodes.managers' $report_file)
    if [[ $manager_nodes -lt 3 ]]; then
        echo "  ⚠️  INSUFFICIENT MANAGER NODES!"
        echo "  Recommendations:"
        echo "    - Add more manager nodes for high availability"
        echo "    - Ensure odd number of managers (3, 5, 7)"
    fi
}

# 自动优化建议
generate_optimization_recommendations() {
    echo "Generating optimization recommendations..."
    
    local recommendations_file="$REPORT_DIR/optimization_recommendations_$(date +%Y%m%d_%H%M%S).md"
    
    cat > $recommendations_file << 'EOF'
# Docker Swarm 性能优化建议

## 系统级优化

### 内核参数调优
```bash
# 网络性能优化
echo 'net.core.rmem_max = 134217728' >> /etc/sysctl.conf
echo 'net.core.wmem_max = 134217728' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_rmem = 4096 65536 134217728' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_wmem = 4096 65536 134217728' >> /etc/sysctl.conf

# 文件描述符限制
echo 'fs.file-max = 2097152' >> /etc/sysctl.conf

# 应用设置
sysctl -p

```

### Docker 守护进程优化

```json
{
  "storage-driver": "overlay2",
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  },
  "default-ulimits": {
    "nofile": {
      "Name": "nofile",
      "Hard": 64000,
      "Soft": 64000
    }
  },
  "max-concurrent-downloads": 10,
  "max-concurrent-uploads": 5
}
```

## 服务级优化

### 资源限制最佳实践

```yaml
deploy:
  resources:
    limits:
      cpus: '2.0'
      memory: 2G
    reservations:
      cpus: '0.5'
      memory: 512M
  restart_policy:
    condition: on-failure
    delay: 5s
    max_attempts: 3
    window: 120s
```

### 健康检查优化

```yaml
healthcheck:
  test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
```

## 网络优化

### Overlay 网络调优

```bash
# 创建优化的 overlay 网络
docker network create \
  --driver overlay \
  --opt encrypted=true \
  --opt com.docker.network.driver.mtu=1450 \
  optimized-network
```

### 负载均衡优化

```yaml
ports:
  - target: 80
    published: 80
    protocol: tcp
    mode: ingress
```

## 存储优化

### 卷性能调优

```bash
# 使用本地卷提高性能
docker volume create \
  --driver local \
  --opt type=tmpfs \
  --opt device=tmpfs \
  --opt o=size=1g \
  fast-cache
```

### 日志管理

```yaml
logging:
  driver: "json-file"
  options:
    max-size: "10m"
    max-file: "3"
```

EOF

    echo "Optimization recommendations generated: $recommendations_file"
}

# 容量规划
capacity_planning() {
    echo "Performing capacity planning analysis..."
    
    local planning_report="$REPORT_DIR/capacity_planning_$(date +%Y%m%d_%H%M%S).json"
    
    # 收集历史数据(占位函数,需根据实际监控数据源实现)
    local historical_data=$(get_historical_performance_data)
    
    # 预测未来需求(占位函数)
    local future_projections=$(calculate_future_projections)
    
    # 生成容量规划报告
    cat > $planning_report << EOF
{
  "timestamp": "$(date -Iseconds)",
  "current_capacity": {
    "nodes": $(docker node ls --format '{{.Hostname}}' | wc -l),
    "total_cpu_cores": $(docker node ls --format '{{.Hostname}}' | xargs -I {} docker node inspect {} --format '{{.Description.Resources.NanoCPUs}}' | awk '{sum += $1/1000000000} END {print sum}'),
    "total_memory_gb": $(docker node ls --format '{{.Hostname}}' | xargs -I {} docker node inspect {} --format '{{.Description.Resources.MemoryBytes}}' | awk '{sum += $1/1073741824} END {print sum}')
  },
  "current_usage": {
    "services": $(docker service ls --format '{{.Name}}' | wc -l),
    "total_replicas": $(docker service ls --format '{{.Replicas}}' | awk -F'/' '{sum += $2} END {print sum}')
  },
  "recommendations": {
    "short_term": [
      "Monitor CPU usage trends",
      "Implement auto-scaling for high-demand services",
      "Optimize resource allocation"
    ],
    "long_term": [
      "Plan for 50% capacity growth over next 6 months",
      "Consider multi-region deployment",
      "Implement predictive scaling"
    ]
  }
}
EOF
    
    echo "Capacity planning report generated: $planning_report"
}

# 主函数
main() {
    case "$1" in
        "collect")
            collect_performance_data
            ;;
        "analyze")
            if [[ $# -lt 2 ]]; then
                echo "Usage: $0 analyze <report-file>"
                exit 1
            fi
            analyze_performance $2
            ;;
        "optimize")
            generate_optimization_recommendations
            ;;
        "capacity")
            capacity_planning
            ;;
        "monitor")
            echo "Starting continuous monitoring (interval: ${MONITOR_DURATION}s)..."
            while true; do
                collect_performance_data
                sleep $MONITOR_DURATION
            done
            ;;
        *)
            echo "Usage: $0 {collect|analyze|optimize|capacity|monitor}"
            echo "  collect            - Collect current performance data"
            echo "  analyze <report>   - Analyze performance report"
            echo "  optimize           - Generate optimization recommendations"
            echo "  capacity           - Perform capacity planning"
            echo "  monitor            - Start continuous monitoring"
            exit 1
            ;;
    esac
}

main "$@"
````
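
上面的脚本支持单次采集、离线分析与持续监控,也可以配合 cron 做周期性采集;下面是一个示意用法,脚本路径为假设,报告目录沿用脚本中的 /opt/performance-reports:

```bash
# 每小时采集一次性能数据(示意的 cron 配置)
echo '0 * * * * root /opt/scripts/performance-tuning.sh collect' > /etc/cron.d/swarm-performance

# 手动分析最近一份报告
/opt/scripts/performance-tuning.sh analyze \
  "$(ls -t /opt/performance-reports/performance_*.json | head -1)"
```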


### 4.2 自动扩缩容策略

#### 智能自动扩缩容系统

```bash
#!/bin/bash
# intelligent-autoscaler.sh

CONFIG_FILE="/etc/swarm-autoscaler/config.json"
LOG_FILE="/var/log/swarm-autoscaler.log"
METRICS_ENDPOINT="http://prometheus:9090"
CHECK_INTERVAL=60  # 检查间隔(秒)

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a $LOG_FILE
}

# 初始化配置
init_autoscaler_config() {
    mkdir -p $(dirname $CONFIG_FILE)
    
    cat > $CONFIG_FILE << 'EOF'
{
  "services": {
    "web": {
      "min_replicas": 2,
      "max_replicas": 10,
      "target_cpu_percent": 70,
      "target_memory_percent": 80,
      "scale_up_threshold": 80,
      "scale_down_threshold": 30,
      "scale_up_cooldown": 300,
      "scale_down_cooldown": 600,
      "scale_factor": 2
    },
    "api": {
      "min_replicas": 3,
      "max_replicas": 15,
      "target_cpu_percent": 60,
      "target_memory_percent": 75,
      "scale_up_threshold": 75,
      "scale_down_threshold": 25,
      "scale_up_cooldown": 180,
      "scale_down_cooldown": 900,
      "scale_factor": 1.5
    },
    "worker": {
      "min_replicas": 1,
      "max_replicas": 20,
      "target_cpu_percent": 80,
      "target_memory_percent": 85,
      "scale_up_threshold": 85,
      "scale_down_threshold": 40,
      "scale_up_cooldown": 120,
      "scale_down_cooldown": 1200,
      "scale_factor": 3,
      "queue_based_scaling": {
        "enabled": true,
        "queue_name": "work_queue",
        "target_queue_length": 100,
        "messages_per_replica": 50
      }
    }
  },
  "global_settings": {
    "enabled": true,
    "dry_run": false,
    "metrics_retention": 3600,
    "prediction_window": 1800,
    "enable_predictive_scaling": true
  }
}
EOF
    
    log "Autoscaler configuration initialized"
}

# 获取服务指标
get_service_metrics() {
    local service_name=$1
    
    # 从 Prometheus 获取 CPU 使用率(查询串含空格与引号,需用 --data-urlencode 编码)
    local cpu_query="avg(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=\"$service_name\"}[5m])) * 100"
    local cpu_usage=$(curl -s -G --data-urlencode "query=$cpu_query" "$METRICS_ENDPOINT/api/v1/query" | jq -r '.data.result[0].value[1] // 0')
    
    # 从 Prometheus 获取内存使用率
    local memory_query="avg(container_memory_usage_bytes{container_label_com_docker_swarm_service_name=\"$service_name\"} / container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name=\"$service_name\"}) * 100"
    local memory_usage=$(curl -s -G --data-urlencode "query=$memory_query" "$METRICS_ENDPOINT/api/v1/query" | jq -r '.data.result[0].value[1] // 0')
    
    # 获取当前副本数
    local current_replicas=$(docker service inspect $service_name --format '{{.Spec.Mode.Replicated.Replicas}}')
    
    # 获取队列长度(如果启用)
    local queue_length=0
    local queue_config=$(jq -r ".services.$service_name.queue_based_scaling.enabled // false" $CONFIG_FILE)
    
    if [[ $queue_config == "true" ]]; then
        local queue_name=$(jq -r ".services.$service_name.queue_based_scaling.queue_name" $CONFIG_FILE)
        queue_length=$(get_queue_length $queue_name)
    fi
    
    cat << EOF
{
  "service": "$service_name",
  "cpu_usage": $cpu_usage,
  "memory_usage": $memory_usage,
  "current_replicas": $current_replicas,
  "queue_length": $queue_length,
  "timestamp": "$(date -Iseconds)"
}
EOF
}

# 获取队列长度
get_queue_length() {
    local queue_name=$1
    
    # 这里需要根据实际的消息队列系统实现
    # 示例:Redis 队列
    redis-cli llen $queue_name 2>/dev/null || echo 0
}
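
# 补充示例(示意,非原方案):如果使用 RabbitMQ 而非 Redis,可改为调用其管理 API;
# 地址、vhost 与凭据均为假设。
get_queue_length_rabbitmq() {
    local queue_name=$1
    curl -s -u guest:guest \
        "http://rabbitmq:15672/api/queues/%2F/$queue_name" | jq -r '.messages // 0'
}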

# 计算所需副本数
calculate_desired_replicas() {
    local service_name=$1
    local metrics=$2
    
    local cpu_usage=$(echo $metrics | jq -r '.cpu_usage')
    local memory_usage=$(echo $metrics | jq -r '.memory_usage')
    local current_replicas=$(echo $metrics | jq -r '.current_replicas')
    local queue_length=$(echo $metrics | jq -r '.queue_length')
    
    # 获取服务配置
    local config=$(jq ".services.$service_name" $CONFIG_FILE)
    local min_replicas=$(echo $config | jq -r '.min_replicas')
    local max_replicas=$(echo $config | jq -r '.max_replicas')
    local target_cpu=$(echo $config | jq -r '.target_cpu_percent')
    local target_memory=$(echo $config | jq -r '.target_memory_percent')
    
    # 基于 CPU 的扩缩容计算
    local cpu_desired_replicas=$current_replicas
    if (( $(echo "$cpu_usage > 0" | bc -l) )); then
        cpu_desired_replicas=$(echo "scale=0; $current_replicas * $cpu_usage / $target_cpu" | bc)
    fi
    
    # 基于内存的扩缩容计算
    local memory_desired_replicas=$current_replicas
    if (( $(echo "$memory_usage > 0" | bc -l) )); then
        memory_desired_replicas=$(echo "scale=0; $current_replicas * $memory_usage / $target_memory" | bc)
    fi
    
    # 基于队列的扩缩容计算
    local queue_desired_replicas=$current_replicas
    local queue_enabled=$(echo $config | jq -r '.queue_based_scaling.enabled // false')
    
    if [[ $queue_enabled == "true" && $queue_length -gt 0 ]]; then
        local messages_per_replica=$(echo $config | jq -r '.queue_based_scaling.messages_per_replica')
        queue_desired_replicas=$(echo "scale=0; ($queue_length + $messages_per_replica - 1) / $messages_per_replica" | bc)
    fi
    
    # 取最大值作为期望副本数
    local desired_replicas=$cpu_desired_replicas
    
    if (( memory_desired_replicas > desired_replicas )); then
        desired_replicas=$memory_desired_replicas
    fi
    
    if (( queue_desired_replicas > desired_replicas )); then
        desired_replicas=$queue_desired_replicas
    fi
    
    # 应用最小和最大限制
    if (( desired_replicas < min_replicas )); then
        desired_replicas=$min_replicas
    elif (( desired_replicas > max_replicas )); then
        desired_replicas=$max_replicas
    fi
    
    echo $desired_replicas
}
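
# 计算示例(帮助理解上面的公式,数值为假设):
#   current_replicas=4、cpu_usage=90、target_cpu=70 时,基于 CPU 的期望副本数为
#   4*90/70 ≈ 5.14,bc 在 scale=0 下截断为 5;
#   若同时 queue_length=260、messages_per_replica=50,队列维度给出 (260+49)/50 = 6;
#   三者取最大值 6,再套用 min_replicas/max_replicas 限制。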

# 检查冷却期
check_cooldown() {
    local service_name=$1
    local action=$2  # "scale_up" or "scale_down"
    
    local last_action_file="/tmp/autoscaler_${service_name}_${action}"
    local cooldown_key="${action}_cooldown"
    local cooldown_period=$(jq -r ".services.$service_name.$cooldown_key" $CONFIG_FILE)
    
    if [[ -f $last_action_file ]]; then
        local last_action_time=$(cat $last_action_file)
        local current_time=$(date +%s)
        local time_diff=$((current_time - last_action_time))
        
        if (( time_diff < cooldown_period )); then
            log "Service $service_name is in cooldown period for $action (${time_diff}s/${cooldown_period}s)"
            return 1
        fi
    fi
    
    return 0
}

# 执行扩缩容
perform_scaling() {
    local service_name=$1
    local current_replicas=$2
    local desired_replicas=$3
    
    local dry_run=$(jq -r '.global_settings.dry_run' $CONFIG_FILE)
    
    if [[ $current_replicas -eq $desired_replicas ]]; then
        return 0
    fi
    
    local action="scale_up"
    if (( desired_replicas < current_replicas )); then
        action="scale_down"
    fi
    
    # 检查冷却期
    if ! check_cooldown $service_name $action; then
        return 1
    fi
    
    log "Scaling service $service_name from $current_replicas to $desired_replicas replicas ($action)"
    
    if [[ $dry_run == "true" ]]; then
        log "DRY RUN: Would scale $service_name to $desired_replicas replicas"
    else
        # 执行扩缩容
        docker service scale $service_name=$desired_replicas
        
        if [[ $? -eq 0 ]]; then
            log "Successfully scaled $service_name to $desired_replicas replicas"
            
            # 记录操作时间
            echo $(date +%s) > "/tmp/autoscaler_${service_name}_${action}"
            
            # 发送通知
            send_scaling_notification $service_name $current_replicas $desired_replicas $action
        else
            log "ERROR: Failed to scale $service_name"
            return 1
        fi
    fi
    
    return 0
}

# 发送扩缩容通知
send_scaling_notification() {
    local service_name=$1
    local old_replicas=$2
    local new_replicas=$3
    local action=$4
    
    # 这里可以集成 Slack、邮件或其他通知系统
    local message="🔄 Autoscaler: Service '$service_name' scaled from $old_replicas to $new_replicas replicas ($action)"
    
    # 示例:发送到 Slack
    # curl -X POST -H 'Content-type: application/json' \
    #     --data "{\"text\":\"$message\"}" \
    #     $SLACK_WEBHOOK_URL
    
    log "Notification sent: $message"
}

# 预测性扩缩容
predictive_scaling() {
    local service_name=$1
    
    local prediction_enabled=$(jq -r '.global_settings.enable_predictive_scaling' $CONFIG_FILE)
    
    if [[ $prediction_enabled != "true" ]]; then
        return 0
    fi
    
    log "Performing predictive scaling analysis for $service_name"
    
    # 获取历史数据
    local prediction_window=$(jq -r '.global_settings.prediction_window' $CONFIG_FILE)
    local historical_data=$(get_historical_metrics $service_name $prediction_window)
    
    # 简单的线性预测(实际应用中可以使用更复杂的机器学习模型)
    local predicted_load=$(calculate_load_prediction "$historical_data")
    
    log "Predicted load for $service_name: $predicted_load%"
    
    # 如果预测负载较高,提前扩容
    if (( $(echo "$predicted_load > 80" | bc -l) )); then
        local current_replicas=$(docker service inspect $service_name --format '{{.Spec.Mode.Replicated.Replicas}}')
        local predicted_replicas=$(( (current_replicas * 12 + 9) / 10 ))  # 约 1.2 倍并向上取整,避免出现小数副本数
        
        log "Predictive scaling: Preparing for high load, pre-scaling $service_name to $predicted_replicas replicas"
        
        # 执行预测性扩容
        perform_scaling $service_name $current_replicas $predicted_replicas
    fi
}

# 获取历史指标
get_historical_metrics() {
    local service_name=$1
    local window_seconds=$2
    
    # 从 Prometheus 获取历史数据
    # 用子查询取回一段按分钟采样的时间序列(matrix 结果),并对查询串做 URL 编码
    local query="(avg(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=\"$service_name\"}[5m])) * 100)[${window_seconds}s:1m]"
    
    curl -s -G --data-urlencode "query=$query" "$METRICS_ENDPOINT/api/v1/query" | jq -r '.data.result[0].values[][]' | paste - -
}

# 计算负载预测
calculate_load_prediction() {
    local historical_data="$1"
    
    # 简单的移动平均预测
    local avg_load=$(echo "$historical_data" | awk '{sum += $2; count++} END {if (count > 0) print sum/count; else print 0}')
    
    echo $avg_load
}

# 主监控循环
start_autoscaler() {
    log "Starting Docker Swarm Autoscaler"
    
    local enabled=$(jq -r '.global_settings.enabled' $CONFIG_FILE)
    
    if [[ $enabled != "true" ]]; then
        log "Autoscaler is disabled in configuration"
        exit 0
    fi
    
    while true; do
        # 获取所有配置的服务
        local services=$(jq -r '.services | keys[]' $CONFIG_FILE)
        
        for service in $services; do
            # 检查服务是否存在
            if ! docker service inspect $service > /dev/null 2>&1; then
                log "Service $service not found, skipping"
                continue
            fi
            
            # 获取服务指标
            local metrics=$(get_service_metrics $service)
            
            if [[ -z $metrics ]]; then
                log "Failed to get metrics for service $service"
                continue
            fi
            
            # 计算期望副本数
            local current_replicas=$(echo $metrics | jq -r '.current_replicas')
            local desired_replicas=$(calculate_desired_replicas $service "$metrics")
            
            log "Service $service: current=$current_replicas, desired=$desired_replicas"
            
            # 执行扩缩容
            perform_scaling $service $current_replicas $desired_replicas
            
            # 预测性扩缩容
            predictive_scaling $service
        done
        
        sleep $CHECK_INTERVAL
    done
}

# 主函数
main() {
    case "$1" in
        "init")
            init_autoscaler_config
            ;;
        "start")
            start_autoscaler
            ;;
        "status")
            echo "Autoscaler Status:"
            echo "Configuration: $CONFIG_FILE"
            echo "Log file: $LOG_FILE"
            echo "Enabled: $(jq -r '.global_settings.enabled' $CONFIG_FILE)"
            echo "Dry run: $(jq -r '.global_settings.dry_run' $CONFIG_FILE)"
            ;;
        "test")
            if [[ $# -lt 2 ]]; then
                echo "Usage: $0 test <service-name>"
                exit 1
            fi
            local metrics=$(get_service_metrics $2)
            echo "Metrics for service $2:"
            echo $metrics | jq .
            ;;
        *)
            echo "Usage: $0 {init|start|status|test}"
            echo "  init              - Initialize autoscaler configuration"
            echo "  start             - Start autoscaler daemon"
            echo "  status            - Show autoscaler status"
            echo "  test <service>    - Test metrics collection for service"
            exit 1
            ;;
    esac
}

main "$@"
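
# 使用示例(脚本路径为假设):
#   /opt/scripts/intelligent-autoscaler.sh init       # 生成默认配置
#   /opt/scripts/intelligent-autoscaler.sh test web   # 验证某个服务的指标采集
#   /opt/scripts/intelligent-autoscaler.sh start      # 启动扩缩容守护循环(生产中建议交由 systemd 托管)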