## 学习目标

通过本章学习,您将能够:

- 掌握 Docker Swarm 的高级配置和优化技巧
- 了解生产环境部署的最佳实践
- 学习多环境管理和 CI/CD 集成
- 掌握高可用性和灾难恢复策略
- 了解性能调优和容量规划
- 学习安全加固和合规性要求
## 1. 高级集群配置

### 1.1 集群拓扑优化

#### 多可用区部署

```yaml
# multi-az-cluster.yml
version: '3.8'

services:
  web:
    image: nginx:alpine
    deploy:
      replicas: 6
      placement:
        constraints:
          - node.role == worker
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '0.5'
          memory: 512M
        reservations:
          cpus: '0.25'
          memory: 256M
      update_config:
        parallelism: 2
        delay: 10s
        failure_action: rollback
        monitor: 60s
        max_failure_ratio: 0.3
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
    networks:
      - frontend
    ports:
      - "80:80"

  app:
    image: myapp:latest
    deploy:
      replicas: 4
      placement:
        constraints:
          - node.role == worker
          - node.labels.tier == app
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    networks:
      - frontend
      - backend
    environment:
      - NODE_ENV=production
      - DB_HOST=db
    secrets:
      - db_password
      - api_key

  db:
    image: postgres:13
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == worker
          - node.labels.tier == db
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 5
        window: 300s
    networks:
      - backend
    environment:
      - POSTGRES_DB=myapp
      - POSTGRES_USER=myapp
      - POSTGRES_PASSWORD_FILE=/run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - db_data:/var/lib/postgresql/data

networks:
  frontend:
    driver: overlay
    attachable: true
    driver_opts:
      encrypted: "true"
  backend:
    driver: overlay
    driver_opts:
      encrypted: "true"

volumes:
  db_data:
    driver: local

secrets:
  db_password:
    external: true
  api_key:
    external: true
```
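在部署上述 Compose 文件之前,需要先创建其引用的两个外部密钥(external secrets)。下面给出一个简要示例,其中的密钥值仅为演示用途的占位值:

```bash
# 创建 Compose 文件中引用的外部密钥(示例值,生产环境请改用安全的密钥来源)
printf 'changeme' | docker secret create db_password -
printf 'demo-api-key' | docker secret create api_key -

# 部署多可用区栈,并观察副本在各节点上的分布情况
docker stack deploy -c multi-az-cluster.yml multi-az
docker stack services multi-az
docker service ps multi-az_web --format 'table {{.Name}}\t{{.Node}}\t{{.CurrentState}}'
```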
#### 节点标签管理脚本

```bash
#!/bin/bash
# node-labeling.sh
# 设置节点标签的脚本
set_node_labels() {
local node_name=$1
local zone=$2
local tier=$3
local instance_type=$4
echo "Setting labels for node: $node_name"
# 设置可用区标签
docker node update --label-add zone=$zone $node_name
# 设置层级标签
docker node update --label-add tier=$tier $node_name
# 设置实例类型标签
docker node update --label-add instance_type=$instance_type $node_name
# 设置环境标签
docker node update --label-add env=production $node_name
echo "Labels set successfully for $node_name"
}
# 批量设置节点标签
batch_label_nodes() {
# 管理节点
set_node_labels "manager-1" "us-east-1a" "management" "t3.large"
set_node_labels "manager-2" "us-east-1b" "management" "t3.large"
set_node_labels "manager-3" "us-east-1c" "management" "t3.large"
# 应用层工作节点
set_node_labels "worker-app-1" "us-east-1a" "app" "c5.xlarge"
set_node_labels "worker-app-2" "us-east-1b" "app" "c5.xlarge"
set_node_labels "worker-app-3" "us-east-1c" "app" "c5.xlarge"
# 数据库层工作节点
set_node_labels "worker-db-1" "us-east-1a" "db" "r5.2xlarge"
set_node_labels "worker-db-2" "us-east-1b" "db" "r5.2xlarge"
# 监控节点
set_node_labels "worker-monitor-1" "us-east-1a" "monitoring" "t3.medium"
}
# 显示节点标签
show_node_labels() {
echo "Current node labels:"
for node in $(docker node ls --format '{{.Hostname}}'); do
echo "Node: $node"
docker node inspect $node --format '{{range $k, $v := .Spec.Labels}}{{$k}}={{$v}} {{end}}'
echo
done
}
# 验证标签分布
validate_label_distribution() {
echo "Validating label distribution..."
echo "Zones:"
docker node ls --format 'table {{.Hostname}}\t{{.Status}}\t{{.Availability}}' \
--filter "label=zone"
echo "\nTiers:"
for tier in management app db monitoring; do
echo "Tier: $tier"
docker node ls --format '{{.Hostname}}' --filter "label=tier=$tier"
done
echo "\nInstance types:"
for type in t3.large t3.medium c5.xlarge r5.2xlarge; do
echo "Type: $type"
docker node ls --format '{{.Hostname}}' --filter "label=instance_type=$type"
done
}
# 主函数
main() {
case "$1" in
"set")
batch_label_nodes
;;
"show")
show_node_labels
;;
"validate")
validate_label_distribution
;;
*)
echo "Usage: $0 {set|show|validate}"
echo " set - Set labels on all nodes"
echo " show - Show current node labels"
echo " validate - Validate label distribution"
exit 1
;;
esac
}
main "$@"
## 5. 实践练习

### 练习 5.1:多可用区高可用部署

**目标**:在多个可用区部署高可用的 Docker Swarm 集群

**步骤**:
1. **初始化多可用区集群**:
```bash
# 在第一个可用区初始化管理节点
docker swarm init --advertise-addr 10.0.1.10

# 获取管理节点加入令牌
docker swarm join-token manager

# 在其他可用区的节点上执行
docker swarm join --token SWMTKN-xxx 10.0.1.10:2377
```
2. **配置节点标签**:
```bash
# 标记节点所在的可用区
docker node update --label-add zone=us-east-1a node1
docker node update --label-add zone=us-east-1b node2
docker node update --label-add zone=us-east-1c node3
# 标记节点类型
docker node update --label-add type=compute node4
docker node update --label-add type=storage node5
```

3. **部署跨可用区服务**:
```yaml
# multi-zone-app.yml
version: '3.8'

services:
  web:
    image: nginx:alpine
    deploy:
      replicas: 6
      placement:
        max_replicas_per_node: 2
        constraints:
          - node.labels.type == compute
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '0.5'
          memory: 512M
        reservations:
          cpus: '0.25'
          memory: 256M
    networks:
      - app-network

  database:
    image: postgres:13
    deploy:
      replicas: 3
      placement:
        constraints:
          - node.labels.type == storage
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: user
      POSTGRES_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password
    volumes:
      - db_data:/var/lib/postgresql/data
    networks:
      - app-network

networks:
  app-network:
    driver: overlay
    attachable: true
    driver_opts:
      encrypted: "true"

volumes:
  db_data:
    driver: local

secrets:
  db_password:
    external: true
```
4. **验证部署**:
```bash
# 部署服务
docker stack deploy -c multi-zone-app.yml myapp
# 检查服务分布
docker service ps myapp_web
docker service ps myapp_database
# 验证可用区分布
docker service ps myapp_web --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}"
```
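如果想进一步按可用区统计副本分布,可以结合节点的 zone 标签做一个简单汇总。下面是一个示例脚本,假设栈名为 myapp、节点已按第 2 步打好 zone 标签:

```bash
# 按 zone 标签统计 web 服务正在运行的副本数
for node in $(docker node ls --format '{{.Hostname}}'); do
  zone=$(docker node inspect "$node" --format '{{index .Spec.Labels "zone"}}')
  count=$(docker service ps myapp_web --filter desired-state=running \
    --format '{{.Node}}' | grep -c "^${node}$")
  echo "zone=${zone} node=${node} replicas=${count}"
done
```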
### 练习 5.2:CI/CD 集成与蓝绿部署

**目标**:实现自动化的 CI/CD 流水线和蓝绿部署

**步骤**:

1. **创建应用代码**:
```javascript
// app.js
const express = require('express');
const app = express();
const port = process.env.PORT || 3000;
const version = process.env.APP_VERSION || '1.0.0';

app.get('/', (req, res) => {
  res.json({
    message: 'Hello from Docker Swarm!',
    version: version,
    hostname: require('os').hostname(),
    timestamp: new Date().toISOString()
  });
});

app.get('/health', (req, res) => {
  res.json({ status: 'healthy', version: version });
});

app.listen(port, () => {
  console.log(`App version ${version} listening on port ${port}`);
});
```
2. **创建 Dockerfile**:
```dockerfile
# Dockerfile
FROM node:16-alpine
WORKDIR /app
COPY package*.json ./
RUN npm ci --only=production
COPY . .
EXPOSE 3000
USER node
CMD ["node", "app.js"]
```

3. **配置 GitLab CI**:
```yaml
# .gitlab-ci.yml
stages:
  - build
  - test
  - deploy-blue
  - verify
  - deploy-green
  - cleanup

variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  IMAGE_NAME: "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHA"
  BLUE_SERVICE: "myapp-blue"
  GREEN_SERVICE: "myapp-green"

build:
  stage: build
  image: docker:20.10.16
  services:
    - docker:20.10.16-dind
  before_script:
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
  script:
    - docker build -t $IMAGE_NAME .
    - docker push $IMAGE_NAME
  only:
    - main

test:
  stage: test
  image: node:16-alpine
  script:
    - npm ci
    - npm test
  only:
    - main

deploy-blue:
  stage: deploy-blue
  image: docker:20.10.16
  before_script:
    - apk add --no-cache curl
  script:
    - |
      # 检查当前活跃服务
      ACTIVE_SERVICE=$(curl -s http://load-balancer/api/active-service || echo "green")
      if [ "$ACTIVE_SERVICE" = "green" ]; then
        TARGET_SERVICE="$BLUE_SERVICE"
      else
        TARGET_SERVICE="$GREEN_SERVICE"
      fi
      echo "Deploying to $TARGET_SERVICE"
      # 更新目标服务
      docker service update \
        --image $IMAGE_NAME \
        --env-add APP_VERSION=$CI_COMMIT_SHA \
        $TARGET_SERVICE
      # 等待服务更新完成
      ./scripts/wait-for-service.sh $TARGET_SERVICE
  only:
    - main

verify:
  stage: verify
  image: alpine:latest
  before_script:
    - apk add --no-cache curl jq
  script:
    - |
      # 获取目标服务端点
      TARGET_SERVICE=$(curl -s http://load-balancer/api/inactive-service)
      TARGET_URL="http://$TARGET_SERVICE:3000"
      # 健康检查
      for i in $(seq 1 30); do
        if curl -f $TARGET_URL/health; then
          echo "Health check passed"
          break
        fi
        echo "Waiting for service to be healthy..."
        sleep 10
      done
      # 功能测试
      RESPONSE=$(curl -s $TARGET_URL)
      VERSION=$(echo $RESPONSE | jq -r '.version')
      if [ "$VERSION" = "$CI_COMMIT_SHA" ]; then
        echo "Version verification passed"
      else
        echo "Version verification failed"
        exit 1
      fi
  only:
    - main

deploy-green:
  stage: deploy-green
  image: alpine:latest
  before_script:
    - apk add --no-cache curl
  script:
    - |
      # 切换流量到新版本
      TARGET_SERVICE=$(curl -s http://load-balancer/api/inactive-service)
      curl -X POST http://load-balancer/api/switch-traffic -d "{\"target\": \"$TARGET_SERVICE\"}"
      echo "Traffic switched to $TARGET_SERVICE"
  only:
    - main
  when: manual

cleanup:
  stage: cleanup
  image: alpine:latest
  script:
    - echo "Deployment completed successfully"
    - echo "Old version can be cleaned up if needed"
  only:
    - main
  when: manual
```
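流水线中引用的 `scripts/wait-for-service.sh` 在本文中没有给出。下面是一个假设性的最小实现草稿,思路与后文蓝绿部署脚本中的 wait_for_service_update 相同,即轮询 `docker service inspect` 的 UpdateStatus 字段:

```bash
#!/bin/bash
# scripts/wait-for-service.sh:等待服务滚动更新结束的最小示例(假设脚本,可按需调整)
SERVICE_NAME=$1
TIMEOUT=${2:-300}
ELAPSED=0

while [ $ELAPSED -lt $TIMEOUT ]; do
  STATE=$(docker service inspect "$SERVICE_NAME" --format '{{.UpdateStatus.State}}' 2>/dev/null)
  # 服务从未更新过时 UpdateStatus 为空,这里将其视为已就绪
  if [ -z "$STATE" ] || [ "$STATE" = "completed" ]; then
    echo "Service $SERVICE_NAME is ready"
    exit 0
  fi
  if [ "$STATE" = "paused" ] || [ "$STATE" = "rollback_completed" ]; then
    echo "Service $SERVICE_NAME update failed: $STATE"
    exit 1
  fi
  sleep 5
  ELAPSED=$((ELAPSED + 5))
done

echo "Timed out waiting for $SERVICE_NAME"
exit 1
```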
4. **蓝绿部署脚本**:
```bash
#!/bin/bash
# blue-green-deploy.sh
SERVICE_NAME="myapp"
BLUE_SERVICE="${SERVICE_NAME}-blue"
GREEN_SERVICE="${SERVICE_NAME}-green"
LOAD_BALANCER="traefik"
NEW_IMAGE="$1"
NEW_VERSION="$2"
if [[ -z $NEW_IMAGE || -z $NEW_VERSION ]]; then
echo "Usage: $0 <new-image> <new-version>"
exit 1
fi
# 获取当前活跃服务
get_active_service() {
# 通过负载均衡器 API 或标签获取当前活跃服务
docker service inspect $BLUE_SERVICE --format '{{.Spec.Labels.active}}' 2>/dev/null | grep -q "true" && echo "blue" || echo "green"
}
# 等待服务更新完成
wait_for_service_update() {
local service_name=$1
local timeout=300
local elapsed=0
echo "Waiting for service $service_name to update..."
while [[ $elapsed -lt $timeout ]]; do
local update_status=$(docker service inspect $service_name --format '{{.UpdateStatus.State}}')
if [[ $update_status == "completed" ]]; then
echo "Service $service_name updated successfully"
return 0
elif [[ $update_status == "paused" || $update_status == "rollback_completed" ]]; then
echo "Service $service_name update failed: $update_status"
return 1
fi
sleep 5
elapsed=$((elapsed + 5))
done
echo "Timeout waiting for service $service_name to update"
return 1
}
# 健康检查
health_check() {
local service_name=$1
local max_attempts=30
local attempt=0
echo "Performing health check for $service_name..."
while [[ $attempt -lt $max_attempts ]]; do
# 获取服务的一个任务 IP
local task_ip=$(docker service ps $service_name --format '{{.Node}}' | head -1 | xargs docker node inspect --format '{{.Status.Addr}}')
if curl -f -s "http://$task_ip:3000/health" > /dev/null; then
echo "Health check passed for $service_name"
return 0
fi
echo "Health check attempt $((attempt + 1))/$max_attempts failed, retrying..."
sleep 10
attempt=$((attempt + 1))
done
echo "Health check failed for $service_name"
return 1
}
# 切换流量
switch_traffic() {
local new_active_service=$1
local old_active_service=$2
echo "Switching traffic from $old_active_service to $new_active_service..."
# 更新服务标签
docker service update --label-rm active $old_active_service
docker service update --label-add active=true $new_active_service
# 更新负载均衡器配置(这里使用 Traefik 作为示例)
docker service update \
--label-add "traefik.http.routers.${SERVICE_NAME}.service=${new_active_service}" \
$new_active_service
echo "Traffic switched to $new_active_service"
}
# 回滚
rollback() {
local service_name=$1
echo "Rolling back $service_name..."
docker service rollback $service_name
if wait_for_service_update $service_name; then
echo "Rollback completed for $service_name"
else
echo "Rollback failed for $service_name"
return 1
fi
}
# 主部署流程
main() {
echo "Starting blue-green deployment..."
echo "New image: $NEW_IMAGE"
echo "New version: $NEW_VERSION"
# 确定当前活跃服务和目标服务
local active_service=$(get_active_service)
local target_service
if [[ $active_service == "blue" ]]; then
target_service=$GREEN_SERVICE
else
target_service=$BLUE_SERVICE
fi
echo "Current active service: $active_service"
echo "Target service: $target_service"
# 更新目标服务
echo "Updating $target_service with new image..."
docker service update \
--image $NEW_IMAGE \
--env-add APP_VERSION=$NEW_VERSION \
--label-rm active \
$target_service
# 等待更新完成
if ! wait_for_service_update $target_service; then
echo "Failed to update $target_service"
exit 1
fi
# 健康检查
if ! health_check $target_service; then
echo "Health check failed, rolling back..."
rollback $target_service
exit 1
fi
# 询问是否切换流量
echo "New version is ready. Switch traffic? (y/n)"
read -r response
if [[ $response == "y" || $response == "Y" ]]; then
switch_traffic $target_service "${SERVICE_NAME}-${active_service}"
echo "Blue-green deployment completed successfully!"
else
echo "Traffic not switched. New version is available at $target_service"
fi
}
main
```
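脚本的调用方式如下,镜像地址与版本号均为示例值:

```bash
# 用新镜像执行一次蓝绿部署,健康检查通过后脚本会询问是否切换流量
./blue-green-deploy.sh registry.example.com/myapp:1.2.0 1.2.0
```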
### 练习 5.3:性能调优和监控

**目标**:实现全面的性能监控和自动调优

**步骤**:

1. **部署监控栈**:
```bash
# 使用之前创建的性能监控脚本
./performance-tuning.sh collect
./performance-tuning.sh analyze /opt/performance-reports/performance_*.json
./performance-tuning.sh optimize
```

2. **配置自动扩缩容**:
```bash
# 初始化自动扩缩容配置
./intelligent-autoscaler.sh init

# 启动自动扩缩容服务
./intelligent-autoscaler.sh start
```

3. **压力测试**:
```bash
# 使用 Apache Bench 进行压力测试
ab -n 10000 -c 100 http://your-app-url/

# 使用 wrk 进行更复杂的测试
wrk -t12 -c400 -d30s --script=load-test.lua http://your-app-url/
```
4. **监控扩缩容行为**:
```bash
# 观察服务副本数变化
watch -n 5 'docker service ls'
# 查看自动扩缩容日志
tail -f /var/log/swarm-autoscaler.log
```
## 6. 本章总结

### 6.1 关键要点

#### 高级配置管理
- 多可用区部署策略
- 节点标签和约束管理
- 高级网络配置
- 性能优化配置
#### 生产环境最佳实践
- 安全配置和加固
- 资源管理和限制
- 监控和日志管理
- 备份和恢复策略
#### CI/CD 集成
- 自动化部署流水线
- 蓝绿部署和金丝雀发布
- 自动化测试和验证
- 回滚和故障恢复
#### 高可用性和灾难恢复
- 多区域部署
- 自动故障转移
- 数据备份和恢复
- 业务连续性规划
#### 性能调优和容量规划
- 性能监控和分析
- 自动扩缩容策略
- 资源优化配置
- 容量规划和预测
### 6.2 最佳实践总结

#### 架构设计
- 采用微服务架构
- 实现服务解耦
- 设计容错机制
- 考虑扩展性需求
#### 运维管理
- 实施基础设施即代码
- 建立完善的监控体系
- 制定标准化流程
- 持续优化和改进
#### 安全管理
- 实施最小权限原则
- 定期安全审计
- 加密敏感数据
- 及时更新和修补
#### 性能优化
- 持续性能监控
- 主动容量规划
- 自动化扩缩容
- 定期性能调优
### 6.3 进阶学习方向

#### 容器编排进阶
- Kubernetes 集群管理
- 服务网格(Service Mesh)
- 无服务器容器(Serverless Containers)
- 边缘计算部署
#### 云原生技术栈
- CNCF 生态系统
- GitOps 工作流
- 可观测性(Observability)
- 混沌工程(Chaos Engineering)
#### 企业级实践
- 多云和混合云策略
- 合规性和治理
- 成本优化
- 组织和流程变革
通过本章的学习,您已经掌握了 Docker Swarm 的高级特性和生产环境最佳实践。这些知识将帮助您构建和管理大规模、高可用的容器化应用系统。
**恭喜!您已经完成了 Docker Swarm 完整教程的学习。**

这个教程涵盖了从基础概念到高级实践的全部内容,包括:

- 集群搭建和管理
- 服务部署和编排
- 网络和存储管理
- 安全配置和监控
- 故障排除和性能优化
- 生产环境最佳实践
希望这个教程能够帮助您在容器编排和微服务架构的道路上取得成功!
### 1.2 高级网络配置
#### 多网络架构
```bash
#!/bin/bash
# advanced-networking.sh
# 创建多层网络架构
create_network_architecture() {
echo "Creating advanced network architecture..."
# 前端网络(公共访问)
docker network create \
--driver overlay \
--subnet 10.1.0.0/24 \
--gateway 10.1.0.1 \
--attachable \
--opt encrypted=false \
frontend-public
# 应用网络(内部通信)
docker network create \
--driver overlay \
--subnet 10.2.0.0/24 \
--gateway 10.2.0.1 \
--opt encrypted=true \
app-internal
# 数据库网络(高安全)
docker network create \
--driver overlay \
--subnet 10.3.0.0/24 \
--gateway 10.3.0.1 \
--opt encrypted=true \
--opt com.docker.network.driver.mtu=1450 \
database-secure
# 监控网络(管理访问)
docker network create \
--driver overlay \
--subnet 10.4.0.0/24 \
--gateway 10.4.0.1 \
--attachable \
--opt encrypted=false \
monitoring-mgmt
# 日志网络(日志收集)
docker network create \
--driver overlay \
--subnet 10.5.0.0/24 \
--gateway 10.5.0.1 \
--opt encrypted=false \
logging-collect
echo "Network architecture created successfully"
}
# 配置网络策略
configure_network_policies() {
echo "Configuring network policies..."
# 这里可以集成 Calico 或其他网络策略引擎
# 示例:限制数据库网络只能从应用网络访问
cat > network-policy.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: database-access-policy
spec:
podSelector:
matchLabels:
tier: database
policyTypes:
- Ingress
- Egress
ingress:
- from:
- podSelector:
matchLabels:
tier: app
ports:
- protocol: TCP
port: 5432
egress:
- to: []
ports:
- protocol: TCP
port: 53
- protocol: UDP
port: 53
EOF
echo "Network policies configured"
}
# 网络性能优化
optimize_network_performance() {
echo "Optimizing network performance..."
# 调整内核网络参数
cat > /etc/sysctl.d/99-docker-swarm.conf << 'EOF'
# Docker Swarm 网络优化
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 65536 134217728
net.ipv4.tcp_wmem = 4096 65536 134217728
net.core.netdev_max_backlog = 5000
net.ipv4.tcp_congestion_control = bbr
net.ipv4.tcp_slow_start_after_idle = 0
net.ipv4.tcp_tw_reuse = 1
EOF
# 应用设置
sysctl -p /etc/sysctl.d/99-docker-swarm.conf
# 优化 Docker 网络设置
cat > /etc/docker/daemon.json << 'EOF'
{
"storage-driver": "overlay2",
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3"
},
"default-address-pools": [
{
"base": "172.30.0.0/16",
"size": 24
}
],
"bip": "172.26.0.1/16",
"mtu": 1450,
"experimental": true,
"metrics-addr": "0.0.0.0:9323"
}
EOF
echo "Network performance optimization completed"
}
# 网络监控设置
setup_network_monitoring() {
echo "Setting up network monitoring..."
# 部署网络监控服务
docker service create \
--name network-monitor \
--network monitoring-mgmt \
--mount type=bind,source=/proc,target=/host/proc,readonly \
--mount type=bind,source=/sys,target=/host/sys,readonly \
--mount type=bind,source=/,target=/rootfs,readonly \
--publish 9100:9100 \
prom/node-exporter:latest \
--path.procfs=/host/proc \
--path.sysfs=/host/sys \
--collector.filesystem.ignored-mount-points="^/(sys|proc|dev|host|etc)($|/)"
# 部署网络流量分析
docker service create \
--name traffic-analyzer \
--network monitoring-mgmt \
--cap-add NET_ADMIN \
--cap-add SYS_ADMIN \
--mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock \
--publish 8080:8080 \
ntopng/ntopng:latest
echo "Network monitoring setup completed"
}
# 主函数
main() {
case "$1" in
"create")
create_network_architecture
;;
"policies")
configure_network_policies
;;
"optimize")
optimize_network_performance
;;
"monitor")
setup_network_monitoring
;;
"all")
create_network_architecture
configure_network_policies
optimize_network_performance
setup_network_monitoring
;;
*)
echo "Usage: $0 {create|policies|optimize|monitor|all}"
exit 1
;;
esac
}
main "$@"
## 2. 生产环境最佳实践

### 2.1 环境管理

#### 多环境配置管理

```bash
#!/bin/bash
# environment-manager.sh
ENV_DIR="/opt/swarm-environments"
CONFIG_DIR="$ENV_DIR/configs"
SECRETS_DIR="$ENV_DIR/secrets"
# 环境配置
declare -A ENVIRONMENTS=(
["dev"]="development"
["staging"]="staging"
["prod"]="production"
)
# 初始化环境目录
init_environment_structure() {
echo "Initializing environment structure..."
mkdir -p $ENV_DIR/{configs,secrets,compose,scripts}
for env in "${!ENVIRONMENTS[@]}"; do
mkdir -p $CONFIG_DIR/$env
mkdir -p $SECRETS_DIR/$env
mkdir -p $ENV_DIR/compose/$env
done
echo "Environment structure initialized"
}
# 创建环境配置
create_environment_config() {
local env=$1
local config_file="$CONFIG_DIR/$env/config.env"
case $env in
"dev")
cat > $config_file << 'EOF'
# Development Environment Configuration
ENVIRONMENT=development
DEBUG=true
LOG_LEVEL=debug
REPLICAS_WEB=1
REPLICAS_APP=1
REPLICAS_DB=1
CPU_LIMIT_WEB=0.5
MEMORY_LIMIT_WEB=512m
CPU_LIMIT_APP=1.0
MEMORY_LIMIT_APP=1g
CPU_LIMIT_DB=1.0
MEMORY_LIMIT_DB=2g
HEALTH_CHECK_INTERVAL=30s
HEALTH_CHECK_TIMEOUT=10s
HEALTH_CHECK_RETRIES=3
UPDATE_PARALLELISM=1
UPDATE_DELAY=10s
EOF
;;
"staging")
cat > $config_file << 'EOF'
# Staging Environment Configuration
ENVIRONMENT=staging
DEBUG=false
LOG_LEVEL=info
REPLICAS_WEB=2
REPLICAS_APP=2
REPLICAS_DB=1
CPU_LIMIT_WEB=0.5
MEMORY_LIMIT_WEB=512m
CPU_LIMIT_APP=1.0
MEMORY_LIMIT_APP=1g
CPU_LIMIT_DB=2.0
MEMORY_LIMIT_DB=4g
HEALTH_CHECK_INTERVAL=30s
HEALTH_CHECK_TIMEOUT=10s
HEALTH_CHECK_RETRIES=3
UPDATE_PARALLELISM=1
UPDATE_DELAY=30s
EOF
;;
"prod")
cat > $config_file << 'EOF'
# Production Environment Configuration
ENVIRONMENT=production
DEBUG=false
LOG_LEVEL=warn
REPLICAS_WEB=3
REPLICAS_APP=4
REPLICAS_DB=2
CPU_LIMIT_WEB=1.0
MEMORY_LIMIT_WEB=1g
CPU_LIMIT_APP=2.0
MEMORY_LIMIT_APP=2g
CPU_LIMIT_DB=4.0
MEMORY_LIMIT_DB=8g
HEALTH_CHECK_INTERVAL=30s
HEALTH_CHECK_TIMEOUT=10s
HEALTH_CHECK_RETRIES=5
UPDATE_PARALLELISM=2
UPDATE_DELAY=60s
EOF
;;
esac
echo "Configuration created for $env environment"
}
# 部署到指定环境
deploy_to_environment() {
local env=$1
local stack_name=$2
local compose_file="$ENV_DIR/compose/$env/docker-compose.yml"
local config_file="$CONFIG_DIR/$env/config.env"
if [[ ! -f $compose_file ]]; then
echo "Error: Compose file not found for $env environment"
return 1
fi
if [[ ! -f $config_file ]]; then
echo "Error: Config file not found for $env environment"
return 1
fi
echo "Deploying $stack_name to $env environment..."
# 加载环境配置
source $config_file
# 验证必要的密钥
validate_secrets $env
# 部署堆栈
docker stack deploy \
--compose-file $compose_file \
--with-registry-auth \
$stack_name-$env
echo "Deployment to $env completed"
}
# 验证密钥
validate_secrets() {
local env=$1
local secrets_file="$SECRETS_DIR/$env/secrets.txt"
if [[ ! -f $secrets_file ]]; then
echo "Warning: Secrets file not found for $env"
return 1
fi
echo "Validating secrets for $env environment..."
while IFS= read -r secret_name; do
if ! docker secret inspect "${secret_name}-${env}" > /dev/null 2>&1; then
echo "Error: Secret ${secret_name}-${env} not found"
return 1
fi
done < $secrets_file
echo "All secrets validated for $env"
}
# 环境健康检查
health_check_environment() {
local env=$1
local stack_name=$2
echo "Performing health check for $env environment..."
# 检查服务状态
echo "Service Status:"
docker service ls --filter "label=com.docker.stack.namespace=$stack_name-$env"
# 检查任务状态
echo "\nTask Status:"
for service in $(docker service ls --filter "label=com.docker.stack.namespace=$stack_name-$env" --format '{{.Name}}'); do
echo "Service: $service"
docker service ps $service --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Error}}"
echo
done
# 检查网络连通性
echo "Network Connectivity:"
test_network_connectivity $env
# 检查资源使用
echo "Resource Usage:"
docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
}
# 测试网络连通性
test_network_connectivity() {
local env=$1
# 这里可以添加具体的网络连通性测试
echo "Testing network connectivity for $env..."
# 示例:测试服务间连通性
local test_container=$(docker ps --filter "label=com.docker.stack.namespace=myapp-$env" --format '{{.Names}}' | head -1)
if [[ -n $test_container ]]; then
echo "Testing from container: $test_container"
docker exec $test_container ping -c 3 google.com > /dev/null 2>&1
if [[ $? -eq 0 ]]; then
echo "✓ External connectivity OK"
else
echo "✗ External connectivity failed"
fi
fi
}
# 环境切换
switch_environment() {
local from_env=$1
local to_env=$2
local stack_name=$3
echo "Switching from $from_env to $to_env environment..."
# 备份当前环境状态
backup_environment_state $from_env $stack_name
# 部署到新环境
deploy_to_environment $to_env $stack_name
# 验证新环境
sleep 30
health_check_environment $to_env $stack_name
echo "Environment switch completed"
}
# 备份环境状态
backup_environment_state() {
local env=$1
local stack_name=$2
local backup_dir="$ENV_DIR/backups/$env/$(date +%Y%m%d_%H%M%S)"
mkdir -p $backup_dir
echo "Backing up $env environment state..."
# 备份服务配置
for service in $(docker service ls --filter "label=com.docker.stack.namespace=$stack_name-$env" --format '{{.Name}}'); do
docker service inspect $service > "$backup_dir/${service}.json"
done
# 备份网络配置
for network in $(docker network ls --filter "label=com.docker.stack.namespace=$stack_name-$env" --format '{{.Name}}'); do
docker network inspect $network > "$backup_dir/${network}-network.json"
done
echo "Environment state backed up to: $backup_dir"
}
# 主函数
main() {
case "$1" in
"init")
init_environment_structure
for env in "${!ENVIRONMENTS[@]}"; do
create_environment_config $env
done
;;
"deploy")
if [[ $# -lt 3 ]]; then
echo "Usage: $0 deploy <environment> <stack-name>"
exit 1
fi
deploy_to_environment $2 $3
;;
"health")
if [[ $# -lt 3 ]]; then
echo "Usage: $0 health <environment> <stack-name>"
exit 1
fi
health_check_environment $2 $3
;;
"switch")
if [[ $# -lt 4 ]]; then
echo "Usage: $0 switch <from-env> <to-env> <stack-name>"
exit 1
fi
switch_environment $2 $3 $4
;;
"backup")
if [[ $# -lt 3 ]]; then
echo "Usage: $0 backup <environment> <stack-name>"
exit 1
fi
backup_environment_state $2 $3
;;
*)
echo "Usage: $0 {init|deploy|health|switch|backup}"
echo " init - Initialize environment structure"
echo " deploy <env> <stack> - Deploy to environment"
echo " health <env> <stack> - Health check environment"
echo " switch <from> <to> <stack> - Switch between environments"
echo " backup <env> <stack> - Backup environment state"
exit 1
;;
esac
}
main "$@"
### 2.2 CI/CD 集成

#### GitLab CI 配置

```yaml
# .gitlab-ci.yml
stages:
- build
- test
- security
- deploy-dev
- deploy-staging
- deploy-production
variables:
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: "/certs"
REGISTRY: $CI_REGISTRY
IMAGE_NAME: $CI_REGISTRY_IMAGE
SWARM_MANAGER: $SWARM_MANAGER_HOST
before_script:
- docker info
- echo $CI_REGISTRY_PASSWORD | docker login -u $CI_REGISTRY_USER --password-stdin $CI_REGISTRY
# 构建阶段
build:
stage: build
services:
- docker:20.10.16-dind
script:
- docker build -t $IMAGE_NAME:$CI_COMMIT_SHA .
- docker build -t $IMAGE_NAME:latest .
- docker push $IMAGE_NAME:$CI_COMMIT_SHA
- docker push $IMAGE_NAME:latest
only:
- main
- develop
- /^release\/.*$/
# 测试阶段
unit-tests:
stage: test
services:
- docker:20.10.16-dind
script:
- docker run --rm $IMAGE_NAME:$CI_COMMIT_SHA npm test
coverage: '/Coverage: \d+\.\d+%/'
artifacts:
reports:
coverage_report:
coverage_format: cobertura
path: coverage/cobertura-coverage.xml
only:
- main
- develop
- merge_requests
integration-tests:
stage: test
services:
- docker:20.10.16-dind
- postgres:13
- redis:6
variables:
POSTGRES_DB: testdb
POSTGRES_USER: testuser
POSTGRES_PASSWORD: testpass
DATABASE_URL: postgresql://testuser:testpass@postgres:5432/testdb
REDIS_URL: redis://redis:6379
script:
- docker run --rm --network host $IMAGE_NAME:$CI_COMMIT_SHA npm run test:integration
only:
- main
- develop
# 安全扫描
security-scan:
stage: security
services:
- docker:20.10.16-dind
script:
# 使用 Trivy 进行镜像安全扫描
- docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \
aquasec/trivy:latest image --exit-code 1 --severity HIGH,CRITICAL \
$IMAGE_NAME:$CI_COMMIT_SHA
allow_failure: true
artifacts:
reports:
container_scanning: gl-container-scanning-report.json
only:
- main
- develop
# 开发环境部署
deploy-dev:
stage: deploy-dev
environment:
name: development
url: https://dev.myapp.com
script:
- apk add --no-cache openssh-client
- eval $(ssh-agent -s)
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
- mkdir -p ~/.ssh
- chmod 700 ~/.ssh
- ssh-keyscan $SWARM_MANAGER >> ~/.ssh/known_hosts
- chmod 644 ~/.ssh/known_hosts
- |
ssh $SWARM_USER@$SWARM_MANAGER << EOF
export IMAGE_TAG=$CI_COMMIT_SHA
export ENVIRONMENT=development
cd /opt/swarm-deployments
./deploy.sh dev myapp $IMAGE_TAG
EOF
only:
- develop
when: manual
# 预发布环境部署
deploy-staging:
stage: deploy-staging
environment:
name: staging
url: https://staging.myapp.com
script:
- apk add --no-cache openssh-client
- eval $(ssh-agent -s)
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
- mkdir -p ~/.ssh
- chmod 700 ~/.ssh
- ssh-keyscan $SWARM_MANAGER >> ~/.ssh/known_hosts
- chmod 644 ~/.ssh/known_hosts
- |
ssh $SWARM_USER@$SWARM_MANAGER << EOF
export IMAGE_TAG=$CI_COMMIT_SHA
export ENVIRONMENT=staging
cd /opt/swarm-deployments
./deploy.sh staging myapp $IMAGE_TAG
./health-check.sh staging myapp
EOF
only:
- main
when: manual
# 生产环境部署
deploy-production:
stage: deploy-production
environment:
name: production
url: https://myapp.com
script:
- apk add --no-cache openssh-client
- eval $(ssh-agent -s)
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
- mkdir -p ~/.ssh
- chmod 700 ~/.ssh
- ssh-keyscan $SWARM_MANAGER >> ~/.ssh/known_hosts
- chmod 644 ~/.ssh/known_hosts
- |
ssh $SWARM_USER@$SWARM_MANAGER << EOF
export IMAGE_TAG=$CI_COMMIT_SHA
export ENVIRONMENT=production
cd /opt/swarm-deployments
# 创建部署前备份
./backup.sh production myapp
# 执行蓝绿部署
./blue-green-deploy.sh production myapp $IMAGE_TAG
# 验证部署
./health-check.sh production myapp
# 如果验证失败,自动回滚
if [ $? -ne 0 ]; then
echo "Health check failed, rolling back..."
./rollback.sh production myapp
exit 1
fi
EOF
only:
- /^release\/.*$/
when: manual
allow_failure: false
# 部署后清理
cleanup:
stage: .post
script:
- docker system prune -f
when: always
```
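流水线在 Swarm 管理节点上调用的 `deploy.sh`、`health-check.sh`、`rollback.sh`、`backup.sh` 等脚本需要提前准备好。下面给出 `health-check.sh` 的一个假设性最小实现,思路是检查栈内所有服务的副本是否全部处于运行状态:

```bash
#!/bin/bash
# health-check.sh <environment> <stack-name>:检查栈内服务副本是否全部就绪(示例脚本)
ENVIRONMENT=$1
STACK_NAME=$2
STACK="${STACK_NAME}-${ENVIRONMENT}"

FAILED=0
for service in $(docker service ls \
    --filter "label=com.docker.stack.namespace=$STACK" --format '{{.Name}}'); do
  replicas=$(docker service ls --filter "name=$service" --format '{{.Replicas}}')
  running=${replicas%%/*}
  desired=${replicas##*/}
  # 注意:desired 可能带有 " (max ... per node)" 之类的后缀,这里做了简化处理
  if [ "$running" != "${desired%% *}" ]; then
    echo "Service $service is not healthy: $replicas"
    FAILED=1
  fi
done

exit $FAILED
```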
#### 蓝绿部署脚本

```bash
#!/bin/bash
# blue-green-deploy.sh
ENVIRONMENT=$1
STACK_NAME=$2
IMAGE_TAG=$3
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <environment> <stack-name> <image-tag>"
exit 1
fi
BLUE_STACK="${STACK_NAME}-${ENVIRONMENT}-blue"
GREEN_STACK="${STACK_NAME}-${ENVIRONMENT}-green"
CURRENT_STACK="${STACK_NAME}-${ENVIRONMENT}"
LOAD_BALANCER_CONFIG="/etc/nginx/conf.d/${STACK_NAME}-${ENVIRONMENT}.conf"
# 日志函数
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}
# 检查当前活跃的堆栈
get_active_stack() {
if docker stack ls | grep -q "$BLUE_STACK"; then
echo "blue"
elif docker stack ls | grep -q "$GREEN_STACK"; then
echo "green"
else
echo "none"
fi
}
# 获取非活跃的堆栈
get_inactive_stack() {
local active=$(get_active_stack)
case $active in
"blue") echo "green" ;;
"green") echo "blue" ;;
"none") echo "blue" ;;
esac
}
# 部署到非活跃环境
deploy_to_inactive() {
local inactive_color=$(get_inactive_stack)
local inactive_stack="${STACK_NAME}-${ENVIRONMENT}-${inactive_color}"
log "Deploying to inactive environment: $inactive_color"
# 准备 compose 文件
local compose_file="/tmp/docker-compose-${inactive_color}.yml"
# 生成带有新镜像标签的 compose 文件
envsubst < "/opt/swarm-environments/compose/${ENVIRONMENT}/docker-compose.yml" > $compose_file
# 替换镜像标签
sed -i "s|image: .*:latest|image: ${CI_REGISTRY_IMAGE}:${IMAGE_TAG}|g" $compose_file
# 部署堆栈
docker stack deploy --compose-file $compose_file $inactive_stack
log "Deployment to $inactive_color completed"
# 等待服务启动
wait_for_services $inactive_stack
return $?
}
# 等待服务启动
wait_for_services() {
local stack_name=$1
local max_wait=300 # 5分钟
local wait_time=0
log "Waiting for services in $stack_name to be ready..."
while [ $wait_time -lt $max_wait ]; do
local services_ready=true
for service in $(docker service ls --filter "label=com.docker.stack.namespace=$stack_name" --format '{{.Name}}'); do
local replicas=$(docker service ls --filter "name=$service" --format '{{.Replicas}}')
local desired=$(echo $replicas | cut -d'/' -f2)
local running=$(echo $replicas | cut -d'/' -f1)
if [[ "$running" != "$desired" ]]; then
services_ready=false
break
fi
done
if $services_ready; then
log "All services are ready"
return 0
fi
sleep 10
wait_time=$((wait_time + 10))
log "Waiting... ($wait_time/$max_wait seconds)"
done
log "Timeout waiting for services to be ready"
return 1
}
# 健康检查
health_check() {
local stack_name=$1
local health_check_url="http://localhost/health"
log "Performing health check for $stack_name"
# 获取服务端口
local web_service="${stack_name}_web"
local port=$(docker service inspect $web_service --format '{{range .Endpoint.Ports}}{{.PublishedPort}}{{end}}')
if [[ -n $port ]]; then
health_check_url="http://localhost:$port/health"
fi
# 执行健康检查
local max_attempts=10
local attempt=1
while [ $attempt -le $max_attempts ]; do
if curl -f -s $health_check_url > /dev/null; then
log "Health check passed for $stack_name"
return 0
fi
log "Health check attempt $attempt/$max_attempts failed"
sleep 10
attempt=$((attempt + 1))
done
log "Health check failed for $stack_name"
return 1
}
# 切换流量
switch_traffic() {
local new_active_color=$1
local new_active_stack="${STACK_NAME}-${ENVIRONMENT}-${new_active_color}"
log "Switching traffic to $new_active_color environment"
# 获取新环境的服务端口
local web_service="${new_active_stack}_web"
local new_port=$(docker service inspect $web_service --format '{{range .Endpoint.Ports}}{{.PublishedPort}}{{end}}')
if [[ -z $new_port ]]; then
log "Error: Could not get port for $web_service"
return 1
fi
# 更新负载均衡器配置
cat > $LOAD_BALANCER_CONFIG << EOF
upstream ${STACK_NAME}_${ENVIRONMENT} {
server localhost:$new_port;
}
server {
listen 80;
server_name ${STACK_NAME}-${ENVIRONMENT}.local;
location / {
proxy_pass http://${STACK_NAME}_${ENVIRONMENT};
proxy_set_header Host \$host;
proxy_set_header X-Real-IP \$remote_addr;
proxy_set_header X-Forwarded-For \$proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto \$scheme;
}
location /health {
proxy_pass http://${STACK_NAME}_${ENVIRONMENT}/health;
}
}
EOF
# 重新加载 Nginx 配置
nginx -t && nginx -s reload
if [[ $? -eq 0 ]]; then
log "Traffic switched to $new_active_color successfully"
return 0
else
log "Failed to switch traffic to $new_active_color"
return 1
fi
}
# 清理旧环境
cleanup_old_environment() {
local old_color=$1
local old_stack="${STACK_NAME}-${ENVIRONMENT}-${old_color}"
log "Cleaning up old environment: $old_color"
# 等待一段时间确保流量已切换
sleep 30
# 删除旧堆栈
docker stack rm $old_stack
log "Old environment $old_color cleaned up"
}
# 回滚函数
rollback() {
local current_active=$(get_active_stack)
if [[ $current_active == "none" ]]; then
log "No active environment to rollback to"
return 1
fi
log "Rolling back to $current_active environment"
# 切换回原来的环境
switch_traffic $current_active
return $?
}
# 主部署流程
main() {
log "Starting blue-green deployment for $STACK_NAME in $ENVIRONMENT"
log "Image tag: $IMAGE_TAG"
local active_color=$(get_active_stack)
local inactive_color=$(get_inactive_stack)
log "Current active environment: $active_color"
log "Deploying to inactive environment: $inactive_color"
# 部署到非活跃环境
if ! deploy_to_inactive; then
log "Deployment failed"
exit 1
fi
# 健康检查
local inactive_stack="${STACK_NAME}-${ENVIRONMENT}-${inactive_color}"
if ! health_check $inactive_stack; then
log "Health check failed, cleaning up failed deployment"
docker stack rm $inactive_stack
exit 1
fi
# 切换流量
if ! switch_traffic $inactive_color; then
log "Traffic switch failed, rolling back"
docker stack rm $inactive_stack
exit 1
fi
# 清理旧环境
if [[ $active_color != "none" ]]; then
cleanup_old_environment $active_color
fi
log "Blue-green deployment completed successfully"
log "New active environment: $inactive_color"
}
# 执行主函数
main
```
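该脚本由 CI 在 Swarm 管理节点上调用,参数依次为环境、栈名和镜像标签,例如(标签为示例值):

```bash
./blue-green-deploy.sh production myapp 3f2c1ab
```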
## 3. 高可用性和灾难恢复

### 3.1 高可用性配置

#### 多区域集群设置

```bash
#!/bin/bash
# multi-region-setup.sh
# 多区域集群配置
REGIONS=("us-east-1" "us-west-2" "eu-west-1")
MANAGER_NODES_PER_REGION=1
WORKER_NODES_PER_REGION=2
# 集群配置
setup_multi_region_cluster() {
echo "Setting up multi-region Docker Swarm cluster..."
# 主区域初始化
local primary_region="${REGIONS[0]}"
echo "Initializing primary region: $primary_region"
# 在主区域初始化 Swarm
docker swarm init --advertise-addr $(get_primary_ip)
# 获取 join tokens
local manager_token=$(docker swarm join-token manager -q)
local worker_token=$(docker swarm join-token worker -q)
# 设置主区域节点标签
docker node update --label-add region=$primary_region $(hostname)
docker node update --label-add zone=${primary_region}a $(hostname)
# 在其他区域添加节点
for region in "${REGIONS[@]:1}"; do
setup_region_nodes $region $manager_token $worker_token
done
# 配置跨区域网络
setup_cross_region_networking
echo "Multi-region cluster setup completed"
}
# 设置区域节点
setup_region_nodes() {
local region=$1
local manager_token=$2
local worker_token=$3
echo "Setting up nodes in region: $region"
# 这里需要根据实际的基础设施自动化工具来实现
# 例如使用 Terraform、AWS CLI、Azure CLI 等
# 示例:使用 AWS CLI 创建实例
create_aws_instances $region $manager_token $worker_token
}
# 创建 AWS 实例(示例)
create_aws_instances() {
local region=$1
local manager_token=$2
local worker_token=$3
# 管理节点
for i in $(seq 1 $MANAGER_NODES_PER_REGION); do
local instance_name="swarm-manager-${region}-${i}"
aws ec2 run-instances \
--region $region \
--image-id ami-0abcdef1234567890 \
--instance-type t3.medium \
--key-name my-key-pair \
--security-group-ids sg-12345678 \
--subnet-id subnet-12345678 \
--user-data "$(generate_manager_userdata $manager_token $region)" \
--tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$instance_name},{Key=Role,Value=manager},{Key=Region,Value=$region}]"
done
# 工作节点
for i in $(seq 1 $WORKER_NODES_PER_REGION); do
local instance_name="swarm-worker-${region}-${i}"
aws ec2 run-instances \
--region $region \
--image-id ami-0abcdef1234567890 \
--instance-type t3.large \
--key-name my-key-pair \
--security-group-ids sg-12345678 \
--subnet-id subnet-12345678 \
--user-data "$(generate_worker_userdata $worker_token $region)" \
--tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$instance_name},{Key=Role,Value=worker},{Key=Region,Value=$region}]"
done
}
# 生成管理节点用户数据
generate_manager_userdata() {
local manager_token=$1
local region=$2
cat << EOF
#!/bin/bash
# 安装 Docker
curl -fsSL https://get.docker.com -o get-docker.sh
sh get-docker.sh
systemctl enable docker
systemctl start docker
# 加入 Swarm 作为管理节点
docker swarm join --token $manager_token $(get_primary_ip):2377
# 设置节点标签
sleep 30
docker node update --label-add region=$region \$(hostname)
docker node update --label-add zone=${region}a \$(hostname)
docker node update --label-add role=manager \$(hostname)
EOF
}
# 生成工作节点用户数据
generate_worker_userdata() {
local worker_token=$1
local region=$2
cat << EOF
#!/bin/bash
# 安装 Docker
curl -fsSL https://get.docker.com -o get-docker.sh
sh get-docker.sh
systemctl enable docker
systemctl start docker
# 加入 Swarm 作为工作节点
docker swarm join --token $worker_token $(get_primary_ip):2377
# 设置节点标签(需要在管理节点上执行)
# 这部分通常通过配置管理工具来完成
EOF
}
# 设置跨区域网络
setup_cross_region_networking() {
echo "Setting up cross-region networking..."
# 创建全局网络
docker network create \
--driver overlay \
--subnet 10.10.0.0/16 \
--opt encrypted=true \
global-network
# 创建区域特定网络
for region in "${REGIONS[@]}"; do
docker network create \
--driver overlay \
--subnet "10.$(get_region_subnet $region).0.0/24" \
--opt encrypted=true \
${region}-network
done
echo "Cross-region networking setup completed"
}
# 获取区域子网
get_region_subnet() {
local region=$1
case $region in
"us-east-1") echo "11" ;;
"us-west-2") echo "12" ;;
"eu-west-1") echo "13" ;;
*) echo "20" ;;
esac
}
# 获取主节点 IP
get_primary_ip() {
# 这里需要根据实际环境获取主节点的 IP
hostname -I | awk '{print $1}'
}
# 主函数
main() {
case "$1" in
"setup")
setup_multi_region_cluster
;;
"status")
echo "Cluster Status:"
docker node ls
echo "\nNetworks:"
docker network ls
;;
*)
echo "Usage: $0 {setup|status}"
exit 1
;;
esac
}
main "$@"
3.2 灾难恢复策略
自动备份和恢复系统
#!/bin/bash
# disaster-recovery.sh
BACKUP_DIR="/opt/swarm-backups"
S3_BUCKET="my-swarm-backups"
RETENTION_DAYS=30
LOG_FILE="/var/log/swarm-backup.log"
# 日志函数
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a $LOG_FILE
}
# 创建完整集群备份
create_full_backup() {
local backup_timestamp=$(date +%Y%m%d_%H%M%S)
local backup_path="$BACKUP_DIR/full_backup_$backup_timestamp"
log "Starting full cluster backup..."
mkdir -p $backup_path
# 备份 Swarm 状态
backup_swarm_state $backup_path
# 备份服务配置
backup_services $backup_path
# 备份网络配置
backup_networks $backup_path
# 备份卷数据
backup_volumes $backup_path
# 备份密钥(加密)
backup_secrets $backup_path
# 备份配置文件
backup_configs $backup_path
# 创建备份清单
create_backup_manifest $backup_path
# 压缩备份
compress_backup $backup_path
# 上传到云存储
upload_to_cloud $backup_path
log "Full cluster backup completed: $backup_path"
}
# 备份 Swarm 状态
backup_swarm_state() {
local backup_path=$1
log "Backing up Swarm state..."
# 节点信息
docker node ls --format json > "$backup_path/nodes.json"
# 详细节点信息
mkdir -p "$backup_path/nodes"
for node in $(docker node ls --format '{{.Hostname}}'); do
docker node inspect $node > "$backup_path/nodes/${node}.json"
done
# Swarm 信息
docker info --format json > "$backup_path/swarm-info.json"
log "Swarm state backup completed"
}
# 备份服务配置
backup_services() {
local backup_path=$1
log "Backing up services..."
mkdir -p "$backup_path/services"
# 服务列表
docker service ls --format json > "$backup_path/services.json"
# 详细服务配置
for service in $(docker service ls --format '{{.Name}}'); do
docker service inspect $service > "$backup_path/services/${service}.json"
done
log "Services backup completed"
}
# 备份网络配置
backup_networks() {
local backup_path=$1
log "Backing up networks..."
mkdir -p "$backup_path/networks"
# 网络列表
docker network ls --format json > "$backup_path/networks.json"
# 详细网络配置
for network in $(docker network ls --format '{{.Name}}'); do
if [[ $network != "bridge" && $network != "host" && $network != "none" ]]; then
docker network inspect $network > "$backup_path/networks/${network}.json"
fi
done
log "Networks backup completed"
}
# 备份卷数据
backup_volumes() {
local backup_path=$1
log "Backing up volumes..."
mkdir -p "$backup_path/volumes"
# 卷列表
docker volume ls --format json > "$backup_path/volumes.json"
# 备份卷数据
for volume in $(docker volume ls --format '{{.Name}}'); do
log "Backing up volume: $volume"
# 获取卷挂载点
local mountpoint=$(docker volume inspect $volume --format '{{.Mountpoint}}')
if [[ -d $mountpoint ]]; then
# 创建卷数据备份
tar -czf "$backup_path/volumes/${volume}.tar.gz" -C "$mountpoint" .
# 保存卷元数据
docker volume inspect $volume > "$backup_path/volumes/${volume}.json"
fi
done
log "Volumes backup completed"
}
# 备份密钥(加密)
backup_secrets() {
local backup_path=$1
log "Backing up secrets..."
mkdir -p "$backup_path/secrets"
# 密钥列表(不包含实际内容)
docker secret ls --format json > "$backup_path/secrets.json"
# 密钥元数据
for secret in $(docker secret ls --format '{{.Name}}'); do
docker secret inspect $secret > "$backup_path/secrets/${secret}.json"
done
log "Secrets metadata backup completed"
log "WARNING: Secret values are not backed up for security reasons"
}
# 备份配置
backup_configs() {
local backup_path=$1
log "Backing up configs..."
mkdir -p "$backup_path/configs"
# 配置列表
docker config ls --format json > "$backup_path/configs.json"
# 配置内容
for config in $(docker config ls --format '{{.Name}}'); do
docker config inspect $config > "$backup_path/configs/${config}.json"
done
log "Configs backup completed"
}
# 创建备份清单
create_backup_manifest() {
local backup_path=$1
local manifest_file="$backup_path/manifest.json"
log "Creating backup manifest..."
cat > $manifest_file << EOF
{
"backup_timestamp": "$(date -Iseconds)",
"backup_type": "full",
"swarm_info": {
"cluster_id": "$(docker info --format '{{.Swarm.Cluster.ID}}')",
"node_id": "$(docker info --format '{{.Swarm.NodeID}}')",
"nodes_count": $(docker node ls --format '{{.Hostname}}' | wc -l),
"services_count": $(docker service ls --format '{{.Name}}' | wc -l),
"networks_count": $(docker network ls --filter driver=overlay --format '{{.Name}}' | wc -l),
"volumes_count": $(docker volume ls --format '{{.Name}}' | wc -l)
},
"backup_components": [
"swarm_state",
"services",
"networks",
"volumes",
"secrets_metadata",
"configs"
],
"backup_size": "$(du -sh $backup_path | cut -f1)",
"backup_location": "$backup_path"
}
EOF
log "Backup manifest created"
}
# 压缩备份
compress_backup() {
local backup_path=$1
local compressed_file="${backup_path}.tar.gz"
log "Compressing backup..."
tar -czf $compressed_file -C $(dirname $backup_path) $(basename $backup_path)
# 验证压缩文件
if [[ -f $compressed_file ]]; then
log "Backup compressed successfully: $compressed_file"
# 删除原始备份目录
rm -rf $backup_path
else
log "ERROR: Failed to compress backup"
return 1
fi
}
# 上传到云存储
upload_to_cloud() {
local backup_path=$1
local compressed_file="${backup_path}.tar.gz"
log "Uploading backup to cloud storage..."
# 上传到 S3
aws s3 cp $compressed_file s3://$S3_BUCKET/$(basename $compressed_file)
if [[ $? -eq 0 ]]; then
log "Backup uploaded successfully to S3"
else
log "ERROR: Failed to upload backup to S3"
return 1
fi
}
# 增量备份
create_incremental_backup() {
local last_backup_timestamp=$1
local backup_timestamp=$(date +%Y%m%d_%H%M%S)
local backup_path="$BACKUP_DIR/incremental_backup_$backup_timestamp"
log "Starting incremental backup since $last_backup_timestamp..."
mkdir -p $backup_path
# 备份自上次备份以来的变更
backup_changed_services $backup_path $last_backup_timestamp
backup_changed_volumes $backup_path $last_backup_timestamp
# 创建增量备份清单
create_incremental_manifest $backup_path $last_backup_timestamp
# 压缩和上传
compress_backup $backup_path
upload_to_cloud $backup_path
log "Incremental backup completed: $backup_path"
}
# 恢复集群
restore_cluster() {
local backup_file=$1
local restore_path="/tmp/swarm-restore-$(date +%Y%m%d_%H%M%S)"
log "Starting cluster restore from: $backup_file"
# 解压备份
mkdir -p $restore_path
tar -xzf $backup_file -C $restore_path
local backup_dir=$(find $restore_path -maxdepth 1 -type d -name "full_backup_*" | head -1)
if [[ ! -d $backup_dir ]]; then
log "ERROR: Invalid backup file structure"
return 1
fi
# 验证备份完整性
if ! validate_backup $backup_dir; then
log "ERROR: Backup validation failed"
return 1
fi
# 恢复网络
restore_networks $backup_dir
# 恢复卷
restore_volumes $backup_dir
# 恢复密钥和配置
restore_secrets_and_configs $backup_dir
# 恢复服务
restore_services $backup_dir
log "Cluster restore completed successfully"
}
# 验证备份
validate_backup() {
local backup_dir=$1
local manifest_file="$backup_dir/manifest.json"
log "Validating backup..."
if [[ ! -f $manifest_file ]]; then
log "ERROR: Backup manifest not found"
return 1
fi
# 检查必要的备份组件
local required_components=("services" "networks" "volumes" "configs")
for component in "${required_components[@]}"; do
if [[ ! -d "$backup_dir/$component" ]]; then
log "ERROR: Missing backup component: $component"
return 1
fi
done
log "Backup validation passed"
return 0
}
# 恢复网络
restore_networks() {
local backup_dir=$1
log "Restoring networks..."
if [[ -f "$backup_dir/networks.json" ]]; then
while IFS= read -r network_info; do
local network_name=$(echo $network_info | jq -r '.Name')
if [[ $network_name != "bridge" && $network_name != "host" && $network_name != "none" ]]; then
local network_file="$backup_dir/networks/${network_name}.json"
if [[ -f $network_file ]]; then
# 从备份文件重建网络
restore_single_network $network_file
fi
fi
done < "$backup_dir/networks.json"
fi
log "Networks restoration completed"
}
# 恢复单个网络
restore_single_network() {
local network_file=$1
local network_config=$(cat $network_file)
local network_name=$(echo $network_config | jq -r '.[0].Name')
local driver=$(echo $network_config | jq -r '.[0].Driver')
local subnet=$(echo $network_config | jq -r '.[0].IPAM.Config[0].Subnet // empty')
local gateway=$(echo $network_config | jq -r '.[0].IPAM.Config[0].Gateway // empty')
# 检查网络是否已存在
if docker network inspect $network_name > /dev/null 2>&1; then
log "Network $network_name already exists, skipping"
return
fi
# 创建网络
local create_cmd="docker network create --driver $driver"
if [[ -n $subnet && $subnet != "null" ]]; then
create_cmd="$create_cmd --subnet $subnet"
fi
if [[ -n $gateway && $gateway != "null" ]]; then
create_cmd="$create_cmd --gateway $gateway"
fi
create_cmd="$create_cmd $network_name"
eval $create_cmd
if [[ $? -eq 0 ]]; then
log "Network $network_name restored successfully"
else
log "ERROR: Failed to restore network $network_name"
fi
}
# 恢复卷
restore_volumes() {
local backup_dir=$1
log "Restoring volumes..."
if [[ -f "$backup_dir/volumes.json" ]]; then
while IFS= read -r volume_info; do
local volume_name=$(echo $volume_info | jq -r '.Name')
local volume_file="$backup_dir/volumes/${volume_name}.json"
local volume_data="$backup_dir/volumes/${volume_name}.tar.gz"
if [[ -f $volume_file && -f $volume_data ]]; then
restore_single_volume $volume_name $volume_file $volume_data
fi
done < "$backup_dir/volumes.json"
fi
log "Volumes restoration completed"
}
# 恢复单个卷
restore_single_volume() {
local volume_name=$1
local volume_file=$2
local volume_data=$3
# 检查卷是否已存在
if docker volume inspect $volume_name > /dev/null 2>&1; then
log "Volume $volume_name already exists, skipping"
return
fi
# 创建卷
docker volume create $volume_name
if [[ $? -eq 0 ]]; then
# 恢复卷数据
local mountpoint=$(docker volume inspect $volume_name --format '{{.Mountpoint}}')
if [[ -d $mountpoint ]]; then
tar -xzf $volume_data -C $mountpoint
log "Volume $volume_name restored successfully"
else
log "ERROR: Failed to get mountpoint for volume $volume_name"
fi
else
log "ERROR: Failed to create volume $volume_name"
fi
}
# 定期备份调度
schedule_backups() {
log "Setting up backup schedule..."
# 创建 cron 任务
cat > /etc/cron.d/swarm-backup << 'EOF'
# Docker Swarm 自动备份
# 每天凌晨 2 点执行完整备份
0 2 * * * root /opt/scripts/disaster-recovery.sh full-backup
# 每 6 小时执行增量备份
0 */6 * * * root /opt/scripts/disaster-recovery.sh incremental-backup
# 每周日凌晨 1 点清理旧备份
0 1 * * 0 root /opt/scripts/disaster-recovery.sh cleanup
EOF
# 重启 cron 服务
systemctl restart cron
log "Backup schedule configured"
}
# 清理旧备份
cleanup_old_backups() {
log "Cleaning up old backups..."
# 清理本地备份
find $BACKUP_DIR -name "*.tar.gz" -mtime +$RETENTION_DAYS -delete
# 清理云存储备份
aws s3 ls s3://$S3_BUCKET/ | while read -r line; do
local file_date=$(echo $line | awk '{print $1" "$2}')
local file_name=$(echo $line | awk '{print $4}')
if [[ -n $file_name ]]; then
local file_age=$(( ($(date +%s) - $(date -d "$file_date" +%s)) / 86400 ))
if [[ $file_age -gt $RETENTION_DAYS ]]; then
aws s3 rm s3://$S3_BUCKET/$file_name
log "Deleted old backup: $file_name"
fi
fi
done
log "Old backups cleanup completed"
}
# 主函数
main() {
case "$1" in
"full-backup")
create_full_backup
;;
"incremental-backup")
local last_backup=$(ls -t $BACKUP_DIR/full_backup_*.tar.gz 2>/dev/null | head -1)
if [[ -n $last_backup ]]; then
local timestamp=$(basename $last_backup | sed 's/full_backup_\(.*\)\.tar\.gz/\1/')
create_incremental_backup $timestamp
else
log "No previous backup found, creating full backup"
create_full_backup
fi
;;
"restore")
if [[ $# -lt 2 ]]; then
echo "Usage: $0 restore <backup-file>"
exit 1
fi
restore_cluster $2
;;
"schedule")
schedule_backups
;;
"cleanup")
cleanup_old_backups
;;
*)
echo "Usage: $0 {full-backup|incremental-backup|restore|schedule|cleanup}"
echo " full-backup - Create full cluster backup"
echo " incremental-backup - Create incremental backup"
echo " restore <file> - Restore cluster from backup"
echo " schedule - Setup automatic backup schedule"
echo " cleanup - Clean up old backups"
exit 1
;;
esac
}
main "$@"
## 4. 性能调优和容量规划

### 4.1 性能监控和分析

#### 综合性能监控脚本

````bash
#!/bin/bash
# performance-tuning.sh
MONITOR_DURATION=300 # 5分钟监控周期
REPORT_DIR="/opt/performance-reports"
THRESHOLD_CPU=80
THRESHOLD_MEMORY=85
THRESHOLD_DISK=90
# 性能数据收集
collect_performance_data() {
local timestamp=$(date +%Y%m%d_%H%M%S)
local report_file="$REPORT_DIR/performance_$timestamp.json"
echo "Collecting performance data..."
mkdir -p $REPORT_DIR
# 系统性能数据
local system_data=$(cat << EOF
{
"timestamp": "$(date -Iseconds)",
"system": {
"cpu": {
"usage": $(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//'),
"load_average": "$(uptime | awk -F'load average:' '{print $2}')",
"cores": $(nproc)
},
"memory": {
"total": $(free -m | awk 'NR==2{print $2}'),
"used": $(free -m | awk 'NR==2{print $3}'),
"free": $(free -m | awk 'NR==2{print $4}'),
"usage_percent": $(free | awk 'NR==2{printf "%.2f", $3*100/$2}')
},
"disk": {
"usage": $(df -h / | awk 'NR==2 {print $5}' | sed 's/%//'),
"available": "$(df -h / | awk 'NR==2 {print $4}')"
},
"network": {
"connections": $(netstat -an | wc -l),
"tcp_established": $(netstat -an | grep ESTABLISHED | wc -l)
}
},
EOF
)
# Docker 性能数据
local docker_data=$(get_docker_performance_data)
# Swarm 性能数据
local swarm_data=$(get_swarm_performance_data)
# 合并数据
echo "$system_data" > $report_file
echo " \"docker\": $docker_data," >> $report_file
echo " \"swarm\": $swarm_data" >> $report_file
echo "}" >> $report_file
echo "Performance data collected: $report_file"
}
# 获取 Docker 性能数据
get_docker_performance_data() {
local containers_count=$(docker ps -q | wc -l)
local images_count=$(docker images -q | wc -l)
local volumes_count=$(docker volume ls -q | wc -l)
local networks_count=$(docker network ls -q | wc -l)
# 容器资源使用统计
local container_stats=$(docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}" | tail -n +2)
cat << EOF
{
"containers": {
"total": $containers_count,
"running": $(docker ps -q | wc -l),
"stopped": $(docker ps -aq | wc -l)
},
"images": {
"total": $images_count,
"size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | head -1)"
},
"volumes": {
"total": $volumes_count
},
"networks": {
"total": $networks_count
},
"system_usage": {
"containers_size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | sed -n '2p')",
"images_size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | head -1)",
"volumes_size": "$(docker system df --format 'table {{.Size}}' | tail -n +2 | sed -n '3p')"
}
}
EOF
}
# 获取 Swarm 性能数据
get_swarm_performance_data() {
local nodes_count=$(docker node ls --format '{{.Hostname}}' | wc -l)
local services_count=$(docker service ls --format '{{.Name}}' | wc -l)
local tasks_running=0
local tasks_failed=0
# 统计任务状态
for service in $(docker service ls --format '{{.Name}}'); do
local service_tasks=$(docker service ps $service --format '{{.CurrentState}}')
tasks_running=$((tasks_running + $(echo "$service_tasks" | grep -c "Running")))
tasks_failed=$((tasks_failed + $(echo "$service_tasks" | grep -c "Failed")))
done
cat << EOF
{
"cluster": {
"nodes": {
"total": $nodes_count,
"managers": $(docker node ls --filter role=manager --format '{{.Hostname}}' | wc -l),
"workers": $(docker node ls --filter role=worker --format '{{.Hostname}}' | wc -l),
"active": $(docker node ls --filter availability=active --format '{{.Hostname}}' | wc -l)
},
"services": {
"total": $services_count,
"replicated": $(docker service ls --filter mode=replicated --format '{{.Name}}' | wc -l),
"global": $(docker service ls --filter mode=global --format '{{.Name}}' | wc -l)
},
"tasks": {
"running": $tasks_running,
"failed": $tasks_failed
}
}
}
EOF
}
# 性能分析和建议
analyze_performance() {
local report_file=$1
echo "Analyzing performance data..."
if [[ ! -f $report_file ]]; then
echo "Error: Report file not found: $report_file"
return 1
fi
local cpu_usage=$(jq -r '.system.cpu.usage' $report_file | sed 's/%//')
local memory_usage=$(jq -r '.system.memory.usage_percent' $report_file)
local disk_usage=$(jq -r '.system.disk.usage' $report_file)
echo "Performance Analysis Report"
echo "==========================="
echo "Timestamp: $(jq -r '.timestamp' $report_file)"
echo
# CPU 分析
echo "CPU Analysis:"
echo " Usage: ${cpu_usage}%"
if (( $(echo "$cpu_usage > $THRESHOLD_CPU" | bc -l) )); then
echo " ⚠️ HIGH CPU USAGE DETECTED!"
echo " Recommendations:"
echo " - Scale out services with high CPU usage"
echo " - Optimize application code"
echo " - Consider adding more worker nodes"
else
echo " ✅ CPU usage is within normal range"
fi
echo
# 内存分析
echo "Memory Analysis:"
echo " Usage: ${memory_usage}%"
if (( $(echo "$memory_usage > $THRESHOLD_MEMORY" | bc -l) )); then
echo " ⚠️ HIGH MEMORY USAGE DETECTED!"
echo " Recommendations:"
echo " - Increase memory limits for services"
echo " - Scale out memory-intensive services"
echo " - Add nodes with more memory"
else
echo " ✅ Memory usage is within normal range"
fi
echo
# 磁盘分析
echo "Disk Analysis:"
echo " Usage: ${disk_usage}%"
if (( disk_usage > THRESHOLD_DISK )); then
echo " ⚠️ HIGH DISK USAGE DETECTED!"
echo " Recommendations:"
echo " - Clean up unused Docker images and containers"
echo " - Implement log rotation"
echo " - Add more storage capacity"
else
echo " ✅ Disk usage is within normal range"
fi
echo
# Swarm 特定分析
analyze_swarm_performance $report_file
}
# Swarm 性能分析
analyze_swarm_performance() {
local report_file=$1
echo "Swarm Cluster Analysis:"
local total_nodes=$(jq -r '.swarm.cluster.nodes.total' $report_file)
local active_nodes=$(jq -r '.swarm.cluster.nodes.active' $report_file)
local failed_tasks=$(jq -r '.swarm.cluster.tasks.failed' $report_file)
echo " Nodes: $active_nodes/$total_nodes active"
echo " Failed tasks: $failed_tasks"
if [[ $failed_tasks -gt 0 ]]; then
echo " ⚠️ FAILED TASKS DETECTED!"
echo " Recommendations:"
echo " - Check service logs for errors"
echo " - Verify resource availability"
echo " - Check node health"
fi
# 节点分布分析
local manager_nodes=$(jq -r '.swarm.cluster.nodes.managers' $report_file)
if [[ $manager_nodes -lt 3 ]]; then
echo " ⚠️ INSUFFICIENT MANAGER NODES!"
echo " Recommendations:"
echo " - Add more manager nodes for high availability"
echo " - Ensure odd number of managers (3, 5, 7)"
fi
}
# 自动优化建议
generate_optimization_recommendations() {
echo "Generating optimization recommendations..."
local recommendations_file="$REPORT_DIR/optimization_recommendations_$(date +%Y%m%d_%H%M%S).md"
cat > $recommendations_file << 'EOF'
# Docker Swarm 性能优化建议
## 系统级优化
### 内核参数调优
```bash
# 网络性能优化
echo 'net.core.rmem_max = 134217728' >> /etc/sysctl.conf
echo 'net.core.wmem_max = 134217728' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_rmem = 4096 65536 134217728' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_wmem = 4096 65536 134217728' >> /etc/sysctl.conf
# 文件描述符限制
echo 'fs.file-max = 2097152' >> /etc/sysctl.conf
# 应用设置
sysctl -p
Docker 守护进程优化
{
"storage-driver": "overlay2",
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3"
},
"default-ulimits": {
"nofile": {
"Name": "nofile",
"Hard": 64000,
"Soft": 64000
}
},
"max-concurrent-downloads": 10,
"max-concurrent-uploads": 5
}
服务级优化
资源限制最佳实践
deploy:
resources:
limits:
cpus: '2.0'
memory: 2G
reservations:
cpus: '0.5'
memory: 512M
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
健康检查优化
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
网络优化
Overlay 网络调优
# 创建优化的 overlay 网络
docker network create \
--driver overlay \
--opt encrypted=true \
--opt com.docker.network.driver.mtu=1450 \
optimized-network
负载均衡优化
ports:
- target: 80
published: 80
protocol: tcp
mode: ingress
存储优化
卷性能调优
# 使用本地卷提高性能
docker volume create \
--driver local \
--opt type=tmpfs \
--opt device=tmpfs \
--opt o=size=1g \
fast-cache
日志管理
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
EOF
echo "Optimization recommendations generated: $recommendations_file"
}
# 容量规划
capacity_planning() {
    echo "Performing capacity planning analysis..."
local planning_report="$REPORT_DIR/capacity_planning_$(date +%Y%m%d_%H%M%S).json"
# 收集历史数据
local historical_data=$(get_historical_performance_data)
# 预测未来需求
local future_projections=$(calculate_future_projections)
# 生成容量规划报告
cat > $planning_report << EOF
{
"timestamp": "$(date -Iseconds)",
"current_capacity": {
"nodes": $(docker node ls --format '{{.Hostname}}' | wc -l),
"total_cpu_cores": $(docker node ls --format '{{.Hostname}}' | xargs -I {} docker node inspect {} --format '{{.Description.Resources.NanoCPUs}}' | awk '{sum += $1/1000000000} END {print sum}'),
"total_memory_gb": $(docker node ls --format '{{.Hostname}}' | xargs -I {} docker node inspect {} --format '{{.Description.Resources.MemoryBytes}}' | awk '{sum += $1/1073741824} END {print sum}')
},
"current_usage": {
"services": $(docker service ls --format '{{.Name}}' | wc -l),
"total_replicas": $(docker service ls --format '{{.Replicas}}' | awk -F'/' '{sum += $2} END {print sum}')
},
"recommendations": {
"short_term": [
"Monitor CPU usage trends",
"Implement auto-scaling for high-demand services",
"Optimize resource allocation"
],
"long_term": [
"Plan for 50% capacity growth over next 6 months",
"Consider multi-region deployment",
"Implement predictive scaling"
]
}
}
EOF
echo "Capacity planning report generated: $planning_report"
}
# 主函数
main() {
case "$1" in
"collect") collect_performance_data ;;
"analyze")
if [[ $# -lt 2 ]]; then
echo "Usage: $0 analyze <report-file>"
exit 1
fi
analyze_swarm_performance "$2"
;;
"recommend") generate_optimization_recommendations ;;
"capacity") capacity_planning ;;
*) echo "Usage: $0 {collect|analyze|recommend|capacity}"; exit 1 ;;
esac
}
main "$@"
```
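下面是该脚本的一个最小使用示意(假设脚本保存为 performance-analysis.sh,文件名仅作示例;analyze 的参数为 collect 生成的报告文件):

```bash
# 采集一次性能数据并生成报告
./performance-analysis.sh collect
# 分析 collect 生成的报告文件
./performance-analysis.sh analyze <report-file>
# 也可通过 cron 定期采集,例如每小时一次
# 0 * * * * /usr/local/bin/performance-analysis.sh collect
```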
### 4.2 自动扩缩容策略
#### 智能自动扩缩容系统
```bash
#!/bin/bash
# intelligent-autoscaler.sh
CONFIG_FILE="/etc/swarm-autoscaler/config.json"
LOG_FILE="/var/log/swarm-autoscaler.log"
METRICS_ENDPOINT="http://prometheus:9090"
CHECK_INTERVAL=60 # 检查间隔(秒)
# 日志函数
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a $LOG_FILE
}
# 初始化配置
init_autoscaler_config() {
mkdir -p $(dirname $CONFIG_FILE)
cat > $CONFIG_FILE << 'EOF'
{
"services": {
"web": {
"min_replicas": 2,
"max_replicas": 10,
"target_cpu_percent": 70,
"target_memory_percent": 80,
"scale_up_threshold": 80,
"scale_down_threshold": 30,
"scale_up_cooldown": 300,
"scale_down_cooldown": 600,
"scale_factor": 2
},
"api": {
"min_replicas": 3,
"max_replicas": 15,
"target_cpu_percent": 60,
"target_memory_percent": 75,
"scale_up_threshold": 75,
"scale_down_threshold": 25,
"scale_up_cooldown": 180,
"scale_down_cooldown": 900,
"scale_factor": 1.5
},
"worker": {
"min_replicas": 1,
"max_replicas": 20,
"target_cpu_percent": 80,
"target_memory_percent": 85,
"scale_up_threshold": 85,
"scale_down_threshold": 40,
"scale_up_cooldown": 120,
"scale_down_cooldown": 1200,
"scale_factor": 3,
"queue_based_scaling": {
"enabled": true,
"queue_name": "work_queue",
"target_queue_length": 100,
"messages_per_replica": 50
}
}
},
"global_settings": {
"enabled": true,
"dry_run": false,
"metrics_retention": 3600,
"prediction_window": 1800,
"enable_predictive_scaling": true
}
}
EOF
log "Autoscaler configuration initialized"
}
# 获取服务指标
get_service_metrics() {
local service_name=$1
# 从 Prometheus 获取 CPU 使用率
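# 注:container_* 系列指标与 container_label_com_docker_swarm_service_name 标签
# 通常由 cAdvisor 采集并导出(需在采集端保留容器标签),请按实际监控栈调整查询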
local cpu_query="avg(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=\"$service_name\"}[5m])) * 100"
local cpu_usage=$(curl -s -G "$METRICS_ENDPOINT/api/v1/query" --data-urlencode "query=$cpu_query" | jq -r '.data.result[0].value[1] // 0')
# 从 Prometheus 获取内存使用率
local memory_query="avg(container_memory_usage_bytes{container_label_com_docker_swarm_service_name=\"$service_name\"} / container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name=\"$service_name\"}) * 100"
local memory_usage=$(curl -s -G "$METRICS_ENDPOINT/api/v1/query" --data-urlencode "query=$memory_query" | jq -r '.data.result[0].value[1] // 0')
# 获取当前副本数
local current_replicas=$(docker service inspect $service_name --format '{{.Spec.Mode.Replicated.Replicas}}')
# 获取队列长度(如果启用)
local queue_length=0
local queue_config=$(jq -r ".services.$service_name.queue_based_scaling.enabled // false" $CONFIG_FILE)
if [[ $queue_config == "true" ]]; then
local queue_name=$(jq -r ".services.$service_name.queue_based_scaling.queue_name" $CONFIG_FILE)
queue_length=$(get_queue_length $queue_name)
fi
cat << EOF
{
"service": "$service_name",
"cpu_usage": $cpu_usage,
"memory_usage": $memory_usage,
"current_replicas": $current_replicas,
"queue_length": $queue_length,
"timestamp": "$(date -Iseconds)"
}
EOF
}
# 获取队列长度
get_queue_length() {
local queue_name=$1
# 这里需要根据实际的消息队列系统实现
# 示例:Redis 队列
redis-cli llen $queue_name 2>/dev/null || echo 0
}
# 计算所需副本数
calculate_desired_replicas() {
local service_name=$1
local metrics=$2
local cpu_usage=$(echo $metrics | jq -r '.cpu_usage')
local memory_usage=$(echo $metrics | jq -r '.memory_usage')
local current_replicas=$(echo $metrics | jq -r '.current_replicas')
local queue_length=$(echo $metrics | jq -r '.queue_length')
# 获取服务配置
local config=$(jq ".services.$service_name" $CONFIG_FILE)
local min_replicas=$(echo $config | jq -r '.min_replicas')
local max_replicas=$(echo $config | jq -r '.max_replicas')
local target_cpu=$(echo $config | jq -r '.target_cpu_percent')
local target_memory=$(echo $config | jq -r '.target_memory_percent')
# 基于 CPU 的扩缩容计算
local cpu_desired_replicas=$current_replicas
if (( $(echo "$cpu_usage > 0" | bc -l) )); then
cpu_desired_replicas=$(echo "scale=0; $current_replicas * $cpu_usage / $target_cpu" | bc)
fi
# 基于内存的扩缩容计算
local memory_desired_replicas=$current_replicas
if (( $(echo "$memory_usage > 0" | bc -l) )); then
memory_desired_replicas=$(echo "scale=0; $current_replicas * $memory_usage / $target_memory" | bc)
fi
# 基于队列的扩缩容计算
local queue_desired_replicas=$current_replicas
local queue_enabled=$(echo $config | jq -r '.queue_based_scaling.enabled // false')
if [[ $queue_enabled == "true" && $queue_length -gt 0 ]]; then
local messages_per_replica=$(echo $config | jq -r '.queue_based_scaling.messages_per_replica')
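# 向上取整:(队列长度 + 每副本处理数 - 1) / 每副本处理数,确保积压消息都有副本处理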
queue_desired_replicas=$(echo "scale=0; ($queue_length + $messages_per_replica - 1) / $messages_per_replica" | bc)
fi
# 取最大值作为期望副本数
local desired_replicas=$cpu_desired_replicas
if (( memory_desired_replicas > desired_replicas )); then
desired_replicas=$memory_desired_replicas
fi
if (( queue_desired_replicas > desired_replicas )); then
desired_replicas=$queue_desired_replicas
fi
# 应用最小和最大限制
if (( desired_replicas < min_replicas )); then
desired_replicas=$min_replicas
elif (( desired_replicas > max_replicas )); then
desired_replicas=$max_replicas
fi
echo $desired_replicas
}
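# 计算示例:若某服务当前 4 个副本、CPU 使用率 90%、目标 70%,
# 则 cpu_desired_replicas = 4 * 90 / 70,scale=0 截断后为 5;
# 内存、队列两路按同样方式计算,三者取最大值后再收敛到 [min_replicas, max_replicas] 区间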
# 检查冷却期
check_cooldown() {
local service_name=$1
local action=$2 # "scale_up" or "scale_down"
local last_action_file="/tmp/autoscaler_${service_name}_${action}"
local cooldown_key="${action}_cooldown"
local cooldown_period=$(jq -r ".services.$service_name.$cooldown_key" $CONFIG_FILE)
if [[ -f $last_action_file ]]; then
local last_action_time=$(cat $last_action_file)
local current_time=$(date +%s)
local time_diff=$((current_time - last_action_time))
if (( time_diff < cooldown_period )); then
log "Service $service_name is in cooldown period for $action (${time_diff}s/${cooldown_period}s)"
return 1
fi
fi
return 0
}
# 执行扩缩容
perform_scaling() {
local service_name=$1
local current_replicas=$2
local desired_replicas=$3
local dry_run=$(jq -r '.global_settings.dry_run' $CONFIG_FILE)
if [[ $current_replicas -eq $desired_replicas ]]; then
return 0
fi
local action="scale_up"
if (( desired_replicas < current_replicas )); then
action="scale_down"
fi
# 检查冷却期
if ! check_cooldown $service_name $action; then
return 1
fi
log "Scaling service $service_name from $current_replicas to $desired_replicas replicas ($action)"
if [[ $dry_run == "true" ]]; then
log "DRY RUN: Would scale $service_name to $desired_replicas replicas"
else
# 执行扩缩容
docker service scale $service_name=$desired_replicas
if [[ $? -eq 0 ]]; then
log "Successfully scaled $service_name to $desired_replicas replicas"
# 记录操作时间
echo $(date +%s) > "/tmp/autoscaler_${service_name}_${action}"
# 发送通知
send_scaling_notification $service_name $current_replicas $desired_replicas $action
else
log "ERROR: Failed to scale $service_name"
return 1
fi
fi
return 0
}
# 发送扩缩容通知
send_scaling_notification() {
local service_name=$1
local old_replicas=$2
local new_replicas=$3
local action=$4
# 这里可以集成 Slack、邮件或其他通知系统
local message="🔄 Autoscaler: Service '$service_name' scaled from $old_replicas to $new_replicas replicas ($action)"
# 示例:发送到 Slack
# curl -X POST -H 'Content-type: application/json' \
# --data "{\"text\":\"$message\"}" \
# $SLACK_WEBHOOK_URL
log "Notification sent: $message"
}
# 预测性扩缩容
predictive_scaling() {
local service_name=$1
local prediction_enabled=$(jq -r '.global_settings.enable_predictive_scaling' $CONFIG_FILE)
if [[ $prediction_enabled != "true" ]]; then
return 0
fi
log "Performing predictive scaling analysis for $service_name"
# 获取历史数据
local prediction_window=$(jq -r '.global_settings.prediction_window' $CONFIG_FILE)
local historical_data=$(get_historical_metrics $service_name $prediction_window)
# 简单的线性预测(实际应用中可以使用更复杂的机器学习模型)
local predicted_load=$(calculate_load_prediction "$historical_data")
log "Predicted load for $service_name: $predicted_load%"
# 如果预测负载较高,提前扩容
if (( $(echo "$predicted_load > 80" | bc -l) )); then
local current_replicas=$(docker service inspect $service_name --format '{{.Spec.Mode.Replicated.Replicas}}')
local predicted_replicas=$(( (current_replicas * 12 + 9) / 10 ))  # 预扩容 20%,向上取整为整数副本数
log "Predictive scaling: Preparing for high load, pre-scaling $service_name to $predicted_replicas replicas"
# 执行预测性扩容
perform_scaling $service_name $current_replicas $predicted_replicas
fi
}
# 获取历史指标
get_historical_metrics() {
local service_name=$1
local window_seconds=$2
# 从 Prometheus 获取历史数据
local query="(avg(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name=\"$service_name\"}[5m])) * 100)[${window_seconds}s:1m]"
curl -s -G "$METRICS_ENDPOINT/api/v1/query" --data-urlencode "query=$query" | jq -r '.data.result[0].values[][]' | paste - -
}
# 计算负载预测
calculate_load_prediction() {
local historical_data="$1"
# 简单的移动平均预测
local avg_load=$(echo "$historical_data" | awk '{sum += $2; count++} END {if (count > 0) print sum/count; else print 0}')
echo $avg_load
}
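# 可选替代方案(示意,非上文既有实现):calculate_load_prediction 使用的是简单移动平均,
# 对持续上升的负载反应偏慢;下面是一个基于最小二乘线性趋势、外推 5 分钟的版本,
# 假设输入为 get_historical_metrics 输出的 "时间戳 数值" 两列
calculate_load_prediction_trend() {
local historical_data="$1"
echo "$historical_data" | awk '
NR == 1 { x0 = $1 }
{ x = $1 - x0; n++; sx += x; sy += $2; sxx += x*x; sxy += x*$2; last = x }
END {
if (n < 2) { print (n ? sy/n : 0); exit }
d = n*sxx - sx*sx
if (d == 0) { print sy/n; exit }
a = (n*sxy - sx*sy) / d    # 斜率
b = (sy - a*sx) / n        # 截距
p = b + a*(last + 300)     # 外推 300 秒后的负载
if (p < 0) p = 0
print p
}'
}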
# 主监控循环
start_autoscaler() {
log "Starting Docker Swarm Autoscaler"
local enabled=$(jq -r '.global_settings.enabled' $CONFIG_FILE)
if [[ $enabled != "true" ]]; then
log "Autoscaler is disabled in configuration"
exit 0
fi
while true; do
# 获取所有配置的服务
local services=$(jq -r '.services | keys[]' $CONFIG_FILE)
for service in $services; do
# 检查服务是否存在
if ! docker service inspect $service > /dev/null 2>&1; then
log "Service $service not found, skipping"
continue
fi
# 获取服务指标
local metrics=$(get_service_metrics $service)
if [[ -z $metrics ]]; then
log "Failed to get metrics for service $service"
continue
fi
# 计算期望副本数
local current_replicas=$(echo $metrics | jq -r '.current_replicas')
local desired_replicas=$(calculate_desired_replicas $service "$metrics")
log "Service $service: current=$current_replicas, desired=$desired_replicas"
# 执行扩缩容
perform_scaling $service $current_replicas $desired_replicas
# 预测性扩缩容
predictive_scaling $service
done
sleep $CHECK_INTERVAL
done
}
# 主函数
main() {
case "$1" in
"init")
init_autoscaler_config
;;
"start")
start_autoscaler
;;
"status")
echo "Autoscaler Status:"
echo "Configuration: $CONFIG_FILE"
echo "Log file: $LOG_FILE"
echo "Enabled: $(jq -r '.global_settings.enabled' $CONFIG_FILE)"
echo "Dry run: $(jq -r '.global_settings.dry_run' $CONFIG_FILE)"
;;
"test")
if [[ $# -lt 2 ]]; then
echo "Usage: $0 test <service-name>"
exit 1
fi
local metrics=$(get_service_metrics $2)
echo "Metrics for service $2:"
echo $metrics | jq .
;;
*)
echo "Usage: $0 {init|start|status|test}"
echo " init - Initialize autoscaler configuration"
echo " start - Start autoscaler daemon"
echo " status - Show autoscaler status"
echo " test <service> - Test metrics collection for service"
exit 1
;;
esac
}
main "$@"