14.1 概述
Kubernetes作为容器编排的事实标准,正在不断演进以适应新的技术趋势和业务需求。本章将探讨Kubernetes的未来发展方向,包括边缘计算、AI/ML工作负载、WebAssembly集成、多云和混合云、可持续发展等前沿技术和趋势。
14.1.1 技术发展趋势
graph TB
A[Kubernetes核心] --> B[边缘计算]
A --> C[AI/ML工作负载]
A --> D[WebAssembly]
A --> E[多云混合云]
A --> F[可持续发展]
A --> G[安全增强]
A --> H[开发者体验]
B --> B1[边缘节点管理]
B --> B2[离线运行]
B --> B3[低延迟应用]
C --> C1[GPU调度]
C --> C2[模型服务]
C --> C3[分布式训练]
D --> D1[轻量级运行时]
D --> D2[多语言支持]
D --> D3[安全沙箱]
E --> E1[联邦集群]
E --> E2[跨云迁移]
E --> E3[统一管理]
F --> F1[绿色计算]
F --> F2[资源优化]
F --> F3[碳中和]
G --> G1[零信任]
G --> G2[供应链安全]
G --> G3[运行时保护]
H --> H1[简化部署]
H --> H2[可视化工具]
H --> H3[自动化运维]
14.2 边缘计算和Kubernetes
14.2.1 边缘计算架构
边缘计算将计算能力推向网络边缘,Kubernetes在边缘计算中扮演重要角色。
# 边缘节点配置
apiVersion: v1
kind: Node
metadata:
name: edge-node-001
labels:
node-type: edge
location: beijing-datacenter
hardware: arm64
network-zone: zone-a
spec:
# 边缘节点特殊配置
taints:
- key: edge-node
value: "true"
effect: NoSchedule
# 资源容量(capacity实际由kubelet上报到status,此处仅作节点规格示意)
capacity:
cpu: "4"
memory: "8Gi"
storage: "100Gi"
nvidia.com/gpu: "1"
---
# 边缘工作负载调度策略
apiVersion: v1
kind: Pod
metadata:
name: edge-application
namespace: edge-apps
spec:
nodeSelector:
node-type: edge
location: beijing-datacenter
tolerations:
- key: edge-node
operator: Equal
value: "true"
effect: NoSchedule
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: network-zone
operator: In
values: ["zone-a", "zone-b"]
containers:
- name: edge-app
image: edge-app:v1.0
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
env:
- name: EDGE_MODE
value: "true"
- name: OFFLINE_CAPABLE
value: "true"
14.2.2 K3s轻量级Kubernetes
#!/bin/bash
# K3s边缘Kubernetes部署脚本
echo "=== K3s边缘Kubernetes部署 ==="
# 安装K3s服务器
install_k3s_server() {
echo "安装K3s服务器..."
# 下载并安装K3s
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable traefik --disable servicelb" sh -
# 等待K3s启动
echo "等待K3s启动..."
sleep 30
# 检查状态
sudo k3s kubectl get nodes
# 获取节点token
sudo cat /var/lib/rancher/k3s/server/node-token
}
# 安装K3s代理节点
install_k3s_agent() {
local server_ip=$1
local node_token=$2
echo "安装K3s代理节点..."
# 安装代理节点
curl -sfL https://get.k3s.io | K3S_URL=https://${server_ip}:6443 K3S_TOKEN=${node_token} sh -
echo "K3s代理节点安装完成"
}
# 配置边缘应用
configure_edge_apps() {
echo "配置边缘应用..."
# 创建边缘应用命名空间
cat <<EOF | sudo k3s kubectl apply -f -
apiVersion: v1
kind: Namespace
metadata:
name: edge-apps
labels:
type: edge
---
# 边缘应用部署
apiVersion: apps/v1
kind: Deployment
metadata:
name: edge-sensor-app
namespace: edge-apps
spec:
replicas: 1
selector:
matchLabels:
app: edge-sensor
template:
metadata:
labels:
app: edge-sensor
spec:
containers:
- name: sensor-collector
image: nginx:alpine
ports:
- containerPort: 80
env:
- name: EDGE_LOCATION
valueFrom:
fieldRef:
fieldPath: spec.nodeName
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
volumeMounts:
- name: sensor-data
mountPath: /data
volumes:
- name: sensor-data
hostPath:
path: /opt/sensor-data
type: DirectoryOrCreate
nodeSelector:
kubernetes.io/arch: arm64
EOF
}
# 配置离线运行
configure_offline_mode() {
echo "配置离线运行模式..."
# 创建离线配置
cat <<EOF | sudo k3s kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: offline-config
namespace: edge-apps
data:
offline.conf: |
# 离线模式配置
offline_mode=true
cache_duration=3600
sync_interval=300
local_storage=/data/cache
# 网络配置
network_timeout=30
retry_attempts=3
fallback_mode=local
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: offline-sync
namespace: edge-apps
spec:
selector:
matchLabels:
app: offline-sync
template:
metadata:
labels:
app: offline-sync
spec:
containers:
- name: sync-agent
image: alpine:latest
command: ["/bin/sh"]
args: ["-c", "while true; do echo 'Syncing data...'; sleep 300; done"]
volumeMounts:
- name: config
mountPath: /etc/config
- name: data-cache
mountPath: /data/cache
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumes:
- name: config
configMap:
name: offline-config
- name: data-cache
hostPath:
path: /opt/edge-cache
type: DirectoryOrCreate
hostNetwork: true
EOF
}
# 监控边缘节点
setup_edge_monitoring() {
echo "设置边缘节点监控..."
# 部署轻量级监控
cat <<EOF | sudo k3s kubectl apply -f -
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: edge-monitor
namespace: kube-system
spec:
selector:
matchLabels:
app: edge-monitor
template:
metadata:
labels:
app: edge-monitor
spec:
containers:
- name: node-exporter
image: prom/node-exporter:latest
ports:
- containerPort: 9100
args:
- '--path.rootfs=/host'
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+)($|/)'
- '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$'
volumeMounts:
- name: proc
mountPath: /host/proc
readOnly: true
- name: sys
mountPath: /host/sys
readOnly: true
- name: root
mountPath: /host
readOnly: true
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 50m
memory: 128Mi
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
hostNetwork: true
hostPID: true
EOF
}
# 主函数
case "$1" in
server)
install_k3s_server
;;
agent)
if [ -z "$2" ] || [ -z "$3" ]; then
echo "用法: $0 agent <server_ip> <node_token>"
exit 1
fi
install_k3s_agent "$2" "$3"
;;
apps)
configure_edge_apps
;;
offline)
configure_offline_mode
;;
monitoring)
setup_edge_monitoring
;;
all)
install_k3s_server
configure_edge_apps
configure_offline_mode
setup_edge_monitoring
;;
*)
echo "用法: $0 {server|agent|apps|offline|monitoring|all}"
echo " server - 安装K3s服务器"
echo " agent - 安装K3s代理节点"
echo " apps - 配置边缘应用"
echo " offline - 配置离线模式"
echo " monitoring - 设置监控"
echo " all - 执行所有配置"
exit 1
;;
esac
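脚本执行完成后,可以在server节点上用以下命令确认代理节点已加入集群、边缘应用正常运行(输出以实际环境为准):
# 验证K3s集群与边缘应用状态
sudo k3s kubectl get nodes -o wide
sudo k3s kubectl get pods -n edge-apps -o wide
# 如有节点未就绪,可查看K3s服务日志排查
sudo journalctl -u k3s --no-pager | tail -n 50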
14.6 可持续发展和绿色计算
14.6.1 碳足迹监控
# 碳足迹监控配置
apiVersion: v1
kind: ConfigMap
metadata:
name: carbon-footprint-config
namespace: sustainability
data:
carbon-metrics.yaml: |
# 碳排放计算配置
carbon_intensity:
# 不同云提供商的碳强度 (gCO2/kWh)
aws:
us-east-1: 415.755
us-west-2: 350.993
eu-west-1: 316.0
gcp:
us-central1: 479.0
us-west1: 350.0
europe-west1: 167.0
azure:
eastus: 415.755
westus2: 350.993
westeurope: 316.0
# 资源功耗系数
power_consumption:
cpu_per_core: 3.5 # 瓦特/核心
memory_per_gb: 0.375 # 瓦特/GB
storage_per_gb: 0.65 # 瓦特/GB
network_per_gbps: 5.0 # 瓦特/Gbps
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: carbon-footprint-exporter
namespace: sustainability
spec:
replicas: 1
selector:
matchLabels:
app: carbon-footprint-exporter
template:
metadata:
labels:
app: carbon-footprint-exporter
spec:
containers:
- name: exporter
image: carbon-footprint-exporter:latest
ports:
- containerPort: 8080
env:
- name: CLOUD_PROVIDER
value: "aws"
- name: REGION
value: "us-west-2"
- name: CARBON_INTENSITY
valueFrom:
configMapKeyRef:
name: carbon-footprint-config
key: carbon-metrics.yaml
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
volumeMounts:
- name: config
mountPath: /etc/config
volumes:
- name: config
configMap:
name: carbon-footprint-config
serviceAccountName: carbon-footprint-exporter
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: carbon-footprint-exporter
namespace: sustainability
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: carbon-footprint-exporter
rules:
- apiGroups: [""]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: carbon-footprint-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: carbon-footprint-exporter
subjects:
- kind: ServiceAccount
name: carbon-footprint-exporter
namespace: sustainability
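在exporter尚未就绪时,也可以参照ConfigMap中的功耗系数,用几条命令对集群功耗和每小时碳排放做粗略估算(计算方式和系数均为示意,kubectl top依赖metrics-server):
# 粗略估算集群功耗与碳排放(示意)
CPU_MILLICORES=$(kubectl top nodes --no-headers | awk '{sum+=$2} END {print sum}')
MEM_MI=$(kubectl top nodes --no-headers | awk '{sum+=$4} END {print sum}')
# 按3.5W/核、0.375W/GB的示例系数估算功耗(瓦特)
POWER_W=$(echo "scale=2; ${CPU_MILLICORES}/1000*3.5 + ${MEM_MI}/1024*0.375" | bc)
# 按350 gCO2/kWh的示例碳强度估算每小时碳排放(kgCO2)
echo "scale=4; ${POWER_W}/1000*350/1000" | bc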
14.6.2 绿色计算优化脚本
#!/bin/bash
# Kubernetes绿色计算优化脚本
echo "=== Kubernetes绿色计算优化 ==="
# 安装可持续性监控
install_sustainability_monitoring() {
echo "安装可持续性监控..."
# 创建命名空间
kubectl create namespace sustainability --dry-run=client -o yaml | kubectl apply -f -
# 部署Kepler (Kubernetes Efficient Power Level Exporter)
kubectl apply -f https://raw.githubusercontent.com/sustainable-computing-io/kepler/main/manifests/kubernetes/deployment.yaml
# 部署碳足迹计算器
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: carbon-calculator
namespace: sustainability
spec:
replicas: 1
selector:
matchLabels:
app: carbon-calculator
template:
metadata:
labels:
app: carbon-calculator
spec:
containers:
- name: calculator
image: python:3.9-slim
command: ["/bin/sh"]
args: ["-c", "while true; do python /app/carbon_calculator.py; sleep 300; done"]
env:
- name: PROMETHEUS_URL
value: "http://prometheus:9090"
- name: CARBON_INTENSITY_API
value: "https://api.carbonintensity.org.uk/intensity"
volumeMounts:
- name: calculator-script
mountPath: /app
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: calculator-script
configMap:
name: carbon-calculator-script
defaultMode: 0755
---
apiVersion: v1
kind: ConfigMap
metadata:
name: carbon-calculator-script
namespace: sustainability
data:
carbon_calculator.py: |
#!/usr/bin/env python3
import requests
import json
import os
import time
from datetime import datetime
def get_cluster_metrics():
"""获取集群资源使用指标"""
prometheus_url = os.getenv('PROMETHEUS_URL', 'http://prometheus:9090')
queries = {
'cpu_usage': 'sum(rate(container_cpu_usage_seconds_total[5m]))',
'memory_usage': 'sum(container_memory_working_set_bytes)',
'network_rx': 'sum(rate(container_network_receive_bytes_total[5m]))',
'network_tx': 'sum(rate(container_network_transmit_bytes_total[5m]))'
}
metrics = {}
for name, query in queries.items():
try:
response = requests.get(f'{prometheus_url}/api/v1/query',
params={'query': query})
data = response.json()
if data['status'] == 'success' and data['data']['result']:
metrics[name] = float(data['data']['result'][0]['value'][1])
else:
metrics[name] = 0
except Exception as e:
print(f"Error fetching {name}: {e}")
metrics[name] = 0
return metrics
def get_carbon_intensity():
"""获取当前碳强度"""
try:
# 这里使用英国的碳强度API作为示例
response = requests.get('https://api.carbonintensity.org.uk/intensity')
data = response.json()
return data['data'][0]['intensity']['actual']
except:
# 如果API不可用,使用默认值
return 350 # gCO2/kWh
def calculate_carbon_footprint(metrics, carbon_intensity):
"""计算碳足迹"""
# 功耗计算 (瓦特)
cpu_power = metrics['cpu_usage'] * 3.5 # 3.5W per CPU core
memory_power = (metrics['memory_usage'] / 1024**3) * 0.375 # 0.375W per GB
network_power = ((metrics['network_rx'] + metrics['network_tx']) * 8 / 1e9) * 5.0 # 5W per Gbps,字节换算为比特
total_power = cpu_power + memory_power + network_power # 瓦特
# 碳排放计算 (gCO2)
carbon_emission = (total_power / 1000) * (carbon_intensity / 1000) # kgCO2/hour
return {
'timestamp': datetime.now().isoformat(),
'power_consumption': {
'cpu_watts': cpu_power,
'memory_watts': memory_power,
'network_watts': network_power,
'total_watts': total_power
},
'carbon_intensity': carbon_intensity,
'carbon_emission_kg_per_hour': carbon_emission,
'metrics': metrics
}
def main():
print("Carbon footprint calculator started")
while True:
try:
# 获取指标
metrics = get_cluster_metrics()
carbon_intensity = get_carbon_intensity()
# 计算碳足迹
result = calculate_carbon_footprint(metrics, carbon_intensity)
# 输出结果
print(json.dumps(result, indent=2))
# 这里可以将结果发送到监控系统
# send_to_prometheus(result)
except Exception as e:
print(f"Error in carbon calculation: {e}")
time.sleep(300) # 5分钟间隔
if __name__ == '__main__':
main()
EOF
}
# 实施绿色调度策略
implement_green_scheduling() {
echo "实施绿色调度策略..."
# 创建绿色调度器配置
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: green-scheduler-config
namespace: kube-system
data:
config.yaml: |
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
- schedulerName: green-scheduler
plugins:
score:
enabled:
- name: NodeResourcesFit
- name: NodeAffinity
- name: PodTopologySpread
- name: GreenScoring # 自定义绿色评分插件
pluginConfig:
- name: NodeResourcesFit
args:
scoringStrategy:
type: LeastAllocated
resources:
- name: cpu
weight: 1
- name: memory
weight: 1
- name: carbon-efficiency
weight: 2 # 给碳效率更高的权重
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: green-scheduler
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: green-scheduler
template:
metadata:
labels:
app: green-scheduler
spec:
serviceAccountName: green-scheduler
containers:
- name: kube-scheduler
image: registry.k8s.io/kube-scheduler:v1.28.0
command:
- kube-scheduler
- --config=/etc/kubernetes/config.yaml
- --v=2
volumeMounts:
- name: config
mountPath: /etc/kubernetes
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: config
configMap:
name: green-scheduler-config
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: green-scheduler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: green-scheduler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:kube-scheduler
subjects:
- kind: ServiceAccount
name: green-scheduler
namespace: kube-system
EOF
# 创建使用绿色调度器的Pod示例
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: green-app
namespace: default
spec:
replicas: 3
selector:
matchLabels:
app: green-app
template:
metadata:
labels:
app: green-app
annotations:
scheduler.alpha.kubernetes.io/preferred-carbon-efficiency: "high"
spec:
schedulerName: green-scheduler
containers:
- name: app
image: nginx:alpine
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
env:
- name: GREEN_MODE
value: "enabled"
nodeSelector:
carbon-efficiency: high
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: energy-source
operator: In
values: ["renewable", "solar", "wind"]
- weight: 50
preference:
matchExpressions:
- key: carbon-intensity
operator: In
values: ["low", "very-low"]
EOF
}
# 优化资源使用
optimize_resource_usage() {
echo "优化资源使用..."
# 部署VPA (Vertical Pod Autoscaler)
git clone https://github.com/kubernetes/autoscaler.git
cd autoscaler/vertical-pod-autoscaler
./hack/vpa-up.sh
cd ../..
rm -rf autoscaler
# 创建VPA配置
cat <<EOF | kubectl apply -f -
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
name: green-app-vpa
namespace: default
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: green-app
updatePolicy:
updateMode: "Auto"
resourcePolicy:
containerPolicies:
- containerName: app
maxAllowed:
cpu: 500m
memory: 512Mi
minAllowed:
cpu: 10m
memory: 32Mi
controlledResources: ["cpu", "memory"]
EOF
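# VPA部署后需要数分钟收集指标,之后可用以下命令查看资源推荐值(示意):
# kubectl get vpa green-app-vpa -n default
# kubectl describe vpa green-app-vpa -n default | grep -A 10 Recommendation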
# 创建资源清理CronJob(heredoc用引号包裹EOF,避免内嵌脚本中的$变量被外层shell提前展开)
cat <<'EOF' | kubectl apply -f -
apiVersion: batch/v1
kind: CronJob
metadata:
name: resource-cleanup
namespace: sustainability
spec:
schedule: "0 2 * * *" # 每天凌晨2点执行
jobTemplate:
spec:
template:
spec:
containers:
- name: cleanup
image: bitnami/kubectl:latest
command:
- /bin/sh
- -c
- |
echo "Starting resource cleanup..."
# 清理已完成的Jobs
kubectl get jobs --all-namespaces --field-selector status.successful=1 -o json | \
jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' | \
while read namespace name; do
echo "Deleting completed job: $namespace/$name"
kubectl delete job "$name" -n "$namespace"
done
# 清理失败的Pods
kubectl get pods --all-namespaces --field-selector status.phase=Failed -o json | \
jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' | \
while read namespace name; do
echo "Deleting failed pod: $namespace/$name"
kubectl delete pod "$name" -n "$namespace"
done
# 清理未使用的ConfigMaps (超过30天)
kubectl get configmaps --all-namespaces -o json | \
jq -r '.items[] | select(.metadata.creationTimestamp < (now - 30*24*3600 | strftime("%Y-%m-%dT%H:%M:%SZ"))) | "\(.metadata.namespace) \(.metadata.name)"' | \
while read namespace name; do
# 检查是否被使用
used=$(kubectl get pods -n $namespace -o json | jq -r --arg cm "$name" '.items[] | select(.spec.volumes[]?.configMap.name == $cm) | .metadata.name')
if [ -z "$used" ]; then
echo "Deleting unused configmap: $namespace/$name"
kubectl delete configmap "$name" -n "$namespace"
fi
done
echo "Resource cleanup completed"
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
restartPolicy: OnFailure
serviceAccountName: resource-cleanup
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: resource-cleanup
namespace: sustainability
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: resource-cleanup
rules:
- apiGroups: [""]
resources: ["pods", "configmaps", "secrets"]
verbs: ["get", "list", "delete"]
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "list", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: resource-cleanup
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: resource-cleanup
subjects:
- kind: ServiceAccount
name: resource-cleanup
namespace: sustainability
EOF
}
# 生成可持续性报告
generate_sustainability_report() {
echo "生成可持续性报告..."
local report_file="sustainability-report-$(date +%Y%m%d).md"
cat > "$report_file" <<EOF
# Kubernetes集群可持续性报告
生成时间: $(date)
## 1. 资源使用概览
### 节点资源使用
\`\`\`
$(kubectl top nodes 2>/dev/null || echo "需要安装metrics-server")
\`\`\`
### Pod资源使用 (Top 10)
\`\`\`
$(kubectl top pods --all-namespaces --sort-by=cpu 2>/dev/null | head -11 || echo "需要安装metrics-server")
\`\`\`
## 2. 资源效率分析
### 未使用的资源
- **未绑定的PV**: $(kubectl get pv | grep Available | wc -l)
- **失败的Pod**: $(kubectl get pods --all-namespaces --field-selector status.phase=Failed --no-headers 2>/dev/null | wc -l)
- **已完成的Job**: $(kubectl get jobs --all-namespaces --field-selector status.successful=1 --no-headers 2>/dev/null | wc -l)
### 资源配置分析
- **没有资源限制的Pod**: $(kubectl get pods --all-namespaces -o json | jq '[.items[] | select(.spec.containers[].resources.limits == null)] | length')
- **没有资源请求的Pod**: $(kubectl get pods --all-namespaces -o json | jq '[.items[] | select(.spec.containers[].resources.requests == null)] | length')
## 3. 碳足迹估算
基于当前资源使用情况的碳足迹估算:
- **CPU使用**: $(kubectl top nodes --no-headers 2>/dev/null | awk '{sum+=$2} END {print sum}' || echo "N/A") millicores
- **内存使用**: $(kubectl top nodes --no-headers 2>/dev/null | awk '{sum+=$4} END {print sum}' || echo "N/A") Mi
- **估算功耗**: 基于标准服务器功耗计算
- **估算碳排放**: 基于地区电网碳强度计算
## 4. 优化建议
### 立即行动项
1. 为所有Pod设置资源请求和限制
2. 清理未使用的资源
3. 实施自动扩缩容
4. 使用更高效的镜像
### 中期优化
1. 实施绿色调度策略
2. 优化应用架构
3. 使用可再生能源
4. 实施碳感知调度
### 长期目标
1. 实现碳中和运营
2. 建立可持续性KPI
3. 持续监控和优化
4. 参与绿色计算倡议
## 5. 监控指标
建议监控以下可持续性指标:
- 每个工作负载的碳足迹
- 资源利用率
- 能源效率
- 可再生能源使用比例
EOF
echo "可持续性报告已生成: $report_file"
}
# 主函数
case "$1" in
monitoring)
install_sustainability_monitoring
;;
scheduling)
implement_green_scheduling
;;
optimize)
optimize_resource_usage
;;
report)
generate_sustainability_report
;;
all)
install_sustainability_monitoring
implement_green_scheduling
optimize_resource_usage
generate_sustainability_report
;;
*)
echo "用法: $0 {monitoring|scheduling|optimize|report|all}"
echo " monitoring - 安装可持续性监控"
echo " scheduling - 实施绿色调度"
echo " optimize - 优化资源使用"
echo " report - 生成可持续性报告"
echo " all - 执行所有操作"
exit 1
;;
esac
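Kepler部署成功后会以Prometheus格式暴露能耗指标。下面的查询示例假设指标名为kepler_container_joules_total,且集群内已有可访问的Prometheus(地址与指标名以实际部署为准):
# 通过Prometheus API查询各命名空间的容器能耗(示意)
curl -s 'http://prometheus.monitoring:9090/api/v1/query' \
  --data-urlencode 'query=sum(rate(kepler_container_joules_total[5m])) by (container_namespace)' | jq .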
14.7 安全增强和零信任
14.7.1 零信任架构
# 零信任网络策略
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: zero-trust-default-deny
namespace: production
spec:
podSelector: {}
policyTypes:
- Ingress
- Egress
# 默认拒绝所有流量
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: zero-trust-web-tier
namespace: production
spec:
podSelector:
matchLabels:
tier: web
policyTypes:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
name: ingress-nginx
ports:
- protocol: TCP
port: 80
- protocol: TCP
port: 443
egress:
- to:
- podSelector:
matchLabels:
tier: api
ports:
- protocol: TCP
port: 8080
- to: [] # DNS
ports:
- protocol: TCP
port: 53
- protocol: UDP
port: 53
---
# 服务网格安全策略
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: default
namespace: production
spec:
mtls:
mode: STRICT
---
apiVersion: security.istio.io/v1beta1
kind: AuthorizationPolicy
metadata:
name: zero-trust-authz
namespace: production
spec:
selector:
matchLabels:
app: web-app
rules:
- from:
- source:
principals: ["cluster.local/ns/production/sa/web-service"]
- to:
- operation:
methods: ["GET", "POST"]
paths: ["/api/*"]
- when:
- key: source.ip
values: ["10.0.0.0/8"]
14.7.2 供应链安全
#!/bin/bash
# Kubernetes供应链安全脚本
echo "=== Kubernetes供应链安全 ==="
# 安装镜像扫描工具
install_image_scanning() {
echo "安装镜像扫描工具..."
# 安装Trivy
curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin
# 部署Trivy Operator
kubectl apply -f https://raw.githubusercontent.com/aquasecurity/trivy-operator/main/deploy/static/trivy-operator.yaml
# 配置镜像扫描策略
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: trivy-operator-config
namespace: trivy-system
data:
trivy.repository: "ghcr.io/aquasecurity/trivy"
trivy.tag: "latest"
trivy.severity: "CRITICAL,HIGH,MEDIUM"
trivy.ignoreUnfixed: "false"
trivy.timeout: "5m0s"
trivy.skipUpdate: "false"
---
apiVersion: aquasecurity.github.io/v1alpha1
kind: ConfigAuditReport
metadata:
name: scan-policy
namespace: trivy-system
spec:
scanner:
name: Trivy
vendor: Aqua Security
version: "0.45.0"
summary:
criticalCount: 0
highCount: 0
mediumCount: 5
lowCount: 10
EOF
}
# 实施镜像签名验证
implement_image_signing() {
echo "实施镜像签名验证..."
# 安装Cosign
curl -O -L "https://github.com/sigstore/cosign/releases/latest/download/cosign-linux-amd64"
sudo mv cosign-linux-amd64 /usr/local/bin/cosign
sudo chmod +x /usr/local/bin/cosign
# 生成密钥对
cosign generate-key-pair
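# 生成密钥后,可对镜像进行签名与验证(示例命令,镜像地址为假设值):
# cosign sign --key cosign.key registry.example.com/myapp:v1.0
# cosign verify --key cosign.pub registry.example.com/myapp:v1.0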
# 创建密钥Secret
kubectl create secret generic cosign-keys \
--from-file=cosign.key=cosign.key \
--from-file=cosign.pub=cosign.pub \
-n kube-system
# 部署镜像验证Webhook
cat <<EOF | kubectl apply -f -
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: image-signature-webhook
webhooks:
- name: verify-signature.example.com
clientConfig:
service:
name: image-signature-webhook
namespace: kube-system
path: "/verify"
rules:
- operations: ["CREATE", "UPDATE"]
apiGroups: [""]
apiVersions: ["v1"]
resources: ["pods"]
- operations: ["CREATE", "UPDATE"]
apiGroups: ["apps"]
apiVersions: ["v1"]
resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
admissionReviewVersions: ["v1", "v1beta1"]
sideEffects: None
failurePolicy: Fail
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: image-signature-webhook
namespace: kube-system
spec:
replicas: 2
selector:
matchLabels:
app: image-signature-webhook
template:
metadata:
labels:
app: image-signature-webhook
spec:
containers:
- name: webhook
image: image-signature-webhook:latest
ports:
- containerPort: 8443
env:
- name: TLS_CERT_FILE
value: "/etc/certs/tls.crt"
- name: TLS_PRIVATE_KEY_FILE
value: "/etc/certs/tls.key"
- name: COSIGN_PUBLIC_KEY
value: "/etc/cosign/cosign.pub"
volumeMounts:
- name: certs
mountPath: /etc/certs
readOnly: true
- name: cosign-keys
mountPath: /etc/cosign
readOnly: true
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: certs
secret:
secretName: webhook-certs
- name: cosign-keys
secret:
secretName: cosign-keys
EOF
}
# 配置SBOM生成
setup_sbom_generation() {
echo "配置SBOM生成..."
# 安装Syft
curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b /usr/local/bin
# 创建security命名空间并部署SBOM生成Job(heredoc用引号包裹EOF,避免内嵌脚本中的$变量被外层shell提前展开)
kubectl create namespace security --dry-run=client -o yaml | kubectl apply -f -
cat <<'EOF' | kubectl apply -f -
apiVersion: batch/v1
kind: CronJob
metadata:
name: sbom-generator
namespace: security
spec:
schedule: "0 2 * * *" # 每天凌晨2点
jobTemplate:
spec:
template:
spec:
containers:
- name: sbom-generator
image: anchore/syft:latest # 注意:该镜像不包含kubectl,实际使用时需换成同时包含kubectl与syft的自定义镜像
command:
- /bin/sh
- -c
- |
echo "Generating SBOMs for all images..."
# 获取所有运行中的镜像
kubectl get pods --all-namespaces -o jsonpath='{.items[*].spec.containers[*].image}' | \
tr ' ' '\n' | sort -u | while read image; do
echo "Generating SBOM for $image"
# 生成SBOM
syft "$image" -o spdx-json > "/sbom/$(echo $image | tr '/' '_' | tr ':' '_').spdx.json"
# 上传到存储
# aws s3 cp "/sbom/$(echo $image | tr '/' '_' | tr ':' '_').spdx.json" s3://sbom-bucket/
done
echo "SBOM generation completed"
volumeMounts:
- name: sbom-storage
mountPath: /sbom
- name: docker-socket
mountPath: /var/run/docker.sock
resources:
requests:
cpu: 200m
memory: 256Mi
limits:
cpu: 1
memory: 1Gi
volumes:
- name: sbom-storage
persistentVolumeClaim:
claimName: sbom-storage-pvc
- name: docker-socket
hostPath:
path: /var/run/docker.sock
restartPolicy: OnFailure
serviceAccountName: sbom-generator
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: sbom-generator
namespace: security
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: sbom-generator
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: sbom-generator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: sbom-generator
subjects:
- kind: ServiceAccount
name: sbom-generator
namespace: security
EOF
}
# 实施运行时安全监控
implement_runtime_security() {
echo "实施运行时安全监控..."
# 部署Falco
helm repo add falcosecurity https://falcosecurity.github.io/charts
helm repo update
helm install falco falcosecurity/falco \
--namespace falco-system \
--create-namespace \
--set falco.grpc.enabled=true \
--set falco.grpcOutput.enabled=true
# 配置自定义Falco规则
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: falco-custom-rules
namespace: falco-system
data:
custom_rules.yaml: |
# 自定义安全规则
# 检测特权容器
- rule: Privileged Container Started
desc: Detect privileged container started
condition: >
spawned_process and container and
k8s_audit and ka.verb=create and ka.target.resource=pods and
ka.req.pod.containers.privileged=true
output: >
Privileged container started (user=%ka.user.name verb=%ka.verb
pod=%ka.target.name container=%ka.req.pod.containers.name
image=%ka.req.pod.containers.image)
priority: WARNING
tags: [container, privilege_escalation, k8s_audit]
# 检测敏感文件访问
- rule: Sensitive File Access
desc: Detect access to sensitive files
condition: >
open_read and sensitive_files and not proc_name_exists and
not user_known_read_sensitive_files_activities
output: >
Sensitive file opened for reading (user=%user.name command=%proc.cmdline
file=%fd.name parent=%proc.pname pcmdline=%proc.pcmdline gparent=%proc.aname[2])
priority: WARNING
tags: [filesystem, sensitive_files]
# 检测网络连接异常
- rule: Unexpected Network Connection
desc: Detect unexpected network connections
condition: >
inbound_outbound and fd.sockfamily=ip and
not proc.name in (known_network_tools) and
not fd.sport in (known_ports) and
not fd.dport in (known_ports)
output: >
Unexpected network connection (user=%user.name command=%proc.cmdline
connection=%fd.name proto=%fd.l4proto)
priority: NOTICE
tags: [network, anomaly]
EOF
# 配置Falco输出到Prometheus
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: falco-exporter-config
namespace: falco-system
data:
config.yaml: |
listenAddress: 0.0.0.0
listenPort: 9376
falcoGrpcAddress: falco.falco-system.svc.cluster.local
falcoGrpcPort: 5060
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: falco-exporter
namespace: falco-system
spec:
replicas: 1
selector:
matchLabels:
app: falco-exporter
template:
metadata:
labels:
app: falco-exporter
spec:
containers:
- name: falco-exporter
image: falcosecurity/falco-exporter:latest
ports:
- containerPort: 9376
volumeMounts:
- name: config
mountPath: /etc/falco-exporter
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
volumes:
- name: config
configMap:
name: falco-exporter-config
EOF
}
# 生成安全报告
generate_security_report() {
echo "生成安全报告..."
local report_file="security-report-$(date +%Y%m%d).md"
cat > "$report_file" <<EOF
# Kubernetes集群安全报告
生成时间: $(date)
## 1. 镜像安全扫描结果
### 高危漏洞统计
\`\`\`
$(kubectl get vulnerabilityreports --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.report.summary.criticalCount > 0) | "\(.metadata.namespace)/\(.metadata.name): Critical=\(.report.summary.criticalCount), High=\(.report.summary.highCount)"' || echo "需要安装Trivy Operator")
\`\`\`
### 配置安全检查
\`\`\`
$(kubectl get configauditreports --all-namespaces -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace)/\(.metadata.name): Danger=\(.report.summary.dangerCount), Warning=\(.report.summary.warningCount)"' || echo "需要安装配置审计工具")
\`\`\`
## 2. 网络安全策略
### 网络策略覆盖率
- **总命名空间数**: $(kubectl get namespaces --no-headers | wc -l)
- **有网络策略的命名空间**: $(kubectl get networkpolicies --all-namespaces --no-headers | awk '{print $1}' | sort -u | wc -l)
- **覆盖率**: $(echo "scale=2; $(kubectl get networkpolicies --all-namespaces --no-headers | awk '{print $1}' | sort -u | wc -l) * 100 / $(kubectl get namespaces --no-headers | wc -l)" | bc)%
### 默认拒绝策略
\`\`\`
$(kubectl get networkpolicies --all-namespaces -o json | jq -r '.items[] | select(.spec.podSelector == {}) | "\(.metadata.namespace): \(.metadata.name)"')
\`\`\`
## 3. RBAC安全分析
### 特权角色
\`\`\`
$(kubectl get clusterrolebindings -o json | jq -r '.items[] | select(.roleRef.name == "cluster-admin") | "\(.metadata.name): \(.subjects[].name)"')
\`\`\`
### 服务账户统计
- **总服务账户数**: $(kubectl get serviceaccounts --all-namespaces --no-headers | wc -l)
- **有自定义权限的SA**: $(kubectl get rolebindings,clusterrolebindings --all-namespaces -o json | jq '[.items[] | select(.subjects[]?.kind == "ServiceAccount")] | length')
## 4. Pod安全标准
### 特权Pod
\`\`\`
$(kubectl get pods --all-namespaces -o json | jq -r '.items[] | select(.spec.securityContext.privileged == true or .spec.containers[].securityContext.privileged == true) | "\(.metadata.namespace)/\(.metadata.name)"')
\`\`\`
### HostNetwork Pod
\`\`\`
$(kubectl get pods --all-namespaces -o json | jq -r '.items[] | select(.spec.hostNetwork == true) | "\(.metadata.namespace)/\(.metadata.name)"')
\`\`\`
## 5. 运行时安全事件
### Falco告警统计 (最近24小时)
\`\`\`
$(kubectl logs -n falco-system -l app.kubernetes.io/name=falco --since=24h | grep -c "Priority:" || echo "Falco未部署或无日志")
\`\`\`
## 6. 安全建议
### 立即修复
1. 修复所有Critical和High级别的漏洞
2. 为所有命名空间配置网络策略
3. 移除不必要的特权权限
4. 实施Pod安全标准
### 安全加固
1. 启用镜像签名验证
2. 实施零信任网络架构
3. 配置运行时安全监控
4. 定期进行安全审计
### 合规要求
1. 实施SBOM管理
2. 建立安全基线
3. 配置审计日志
4. 实施访问控制
EOF
echo "安全报告已生成: $report_file"
}
# 主函数
case "$1" in
scanning)
install_image_scanning
;;
signing)
implement_image_signing
;;
sbom)
setup_sbom_generation
;;
runtime)
implement_runtime_security
;;
report)
generate_security_report
;;
all)
install_image_scanning
implement_image_signing
setup_sbom_generation
implement_runtime_security
generate_security_report
;;
*)
echo "用法: $0 {scanning|signing|sbom|runtime|report|all}"
echo " scanning - 安装镜像扫描"
echo " signing - 实施镜像签名"
echo " sbom - 配置SBOM生成"
echo " runtime - 运行时安全监控"
echo " report - 生成安全报告"
echo " all - 执行所有配置"
exit 1
;;
esac
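除了集群内的自动化扫描,也可以在CI流水线或本地直接用Trivy和Syft对单个镜像做漏洞扫描与SBOM生成,在镜像入库前提前拦截问题(镜像名仅为示例):
# 手动扫描镜像漏洞(只报告高危及以上)
trivy image --severity CRITICAL,HIGH nginx:alpine
# 为镜像生成SPDX格式的SBOM
syft nginx:alpine -o spdx-json > nginx-alpine.spdx.json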
14.8 开发者体验和平台工程
14.8.1 内部开发者平台 (IDP)
# Backstage 开发者门户配置
apiVersion: v1
kind: ConfigMap
metadata:
name: backstage-config
namespace: platform
data:
app-config.yaml: |
app:
title: Internal Developer Platform
baseUrl: http://localhost:3000
organization:
name: My Company
backend:
baseUrl: http://localhost:7007
listen:
port: 7007
csp:
connect-src: ["'self'", 'http:', 'https:']
cors:
origin: http://localhost:3000
methods: [GET, HEAD, PATCH, POST, PUT, DELETE]
credentials: true
database:
client: better-sqlite3
connection: ':memory:'
integrations:
github:
- host: github.com
token: ${GITHUB_TOKEN}
gitlab:
- host: gitlab.com
token: ${GITLAB_TOKEN}
techdocs:
builder: 'local'
generator:
runIn: 'local'
publisher:
type: 'local'
auth:
providers:
github:
development:
clientId: ${AUTH_GITHUB_CLIENT_ID}
clientSecret: ${AUTH_GITHUB_CLIENT_SECRET}
scaffolder:
defaultAuthor:
name: Platform Team
email: platform@company.com
defaultCommitMessage: 'Initial commit'
catalog:
import:
entityFilename: catalog-info.yaml
pullRequestBranchName: backstage-integration
rules:
- allow: [Component, System, API, Resource, Location]
locations:
- type: file
target: ../../examples/entities.yaml
- type: file
target: ../../examples/template/template.yaml
rules:
- allow: [Template]
- type: file
target: ../../examples/org.yaml
rules:
- allow: [User, Group]
kubernetes:
serviceLocatorMethod:
type: 'multiTenant'
clusterLocatorMethods:
- type: 'config'
clusters:
- url: https://kubernetes.default.svc
name: local-cluster
authProvider: 'serviceAccount'
skipTLSVerify: true
skipMetricsLookup: false
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: backstage
namespace: platform
spec:
replicas: 1
selector:
matchLabels:
app: backstage
template:
metadata:
labels:
app: backstage
spec:
containers:
- name: backstage
image: backstage:latest
ports:
- containerPort: 7007
- containerPort: 3000
env:
- name: POSTGRES_HOST
value: postgres
- name: POSTGRES_PORT
value: "5432"
- name: POSTGRES_USER
value: backstage
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: postgres-secrets
key: password
- name: GITHUB_TOKEN
valueFrom:
secretKeyRef:
name: backstage-secrets
key: github-token
- name: AUTH_GITHUB_CLIENT_ID
valueFrom:
secretKeyRef:
name: backstage-secrets
key: github-client-id
- name: AUTH_GITHUB_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: backstage-secrets
key: github-client-secret
volumeMounts:
- name: config
mountPath: /app/app-config.yaml
subPath: app-config.yaml
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: 1
memory: 1Gi
volumes:
- name: config
configMap:
name: backstage-config
serviceAccountName: backstage
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: backstage
namespace: platform
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: backstage-reader
rules:
- apiGroups: [""]
resources: ["pods", "services", "configmaps", "secrets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "replicasets", "statefulsets", "daemonsets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["networking.k8s.io"]
resources: ["ingresses"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: backstage-reader
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: backstage-reader
subjects:
- kind: ServiceAccount
name: backstage
namespace: platform
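Backstage部署完成后,可以先通过端口转发在本地访问后端,确认catalog API能正常返回已注册的实体(端口与路径基于上面的示例配置,属于示意性检查):
# 本地访问Backstage并检查catalog API(示意)
kubectl port-forward -n platform deploy/backstage 7007:7007 >/dev/null 2>&1 &
sleep 3
curl -s http://localhost:7007/api/catalog/entities | jq 'length'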
14.8.2 应用模板和脚手架
#!/bin/bash
# 开发者体验优化脚本
echo "=== 开发者体验优化 ==="
# 部署应用模板系统
deploy_app_templates() {
echo "部署应用模板系统..."
# 创建模板仓库(heredoc用引号包裹EOF,避免模板中的${{ }}表达式被外层shell当作变量展开)
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: app-templates
namespace: platform
data:
microservice-template.yaml: |
apiVersion: scaffolder.backstage.io/v1beta3
kind: Template
metadata:
name: microservice-template
title: Microservice Template
description: Create a new microservice with best practices
tags:
- recommended
- microservice
- go
spec:
owner: platform-team
type: service
parameters:
- title: Service Information
required:
- name
- description
properties:
name:
title: Name
type: string
description: Unique name of the service
pattern: '^([a-z0-9-]+)$'
description:
title: Description
type: string
description: Help others understand what this service is for
owner:
title: Owner
type: string
description: Owner of the component
ui:field: OwnerPicker
ui:options:
allowedKinds:
- Group
- title: Choose a location
required:
- repoUrl
properties:
repoUrl:
title: Repository Location
type: string
ui:field: RepoUrlPicker
ui:options:
allowedHosts:
- github.com
steps:
- id: fetch-base
name: Fetch Base
action: fetch:template
input:
url: ./content
values:
name: ${{ parameters.name }}
description: ${{ parameters.description }}
owner: ${{ parameters.owner }}
- id: publish
name: Publish
action: publish:github
input:
allowedHosts: ['github.com']
description: This is ${{ parameters.name }}
repoUrl: ${{ parameters.repoUrl }}
- id: register
name: Register
action: catalog:register
input:
repoContentsUrl: ${{ steps.publish.output.repoContentsUrl }}
catalogInfoPath: '/catalog-info.yaml'
output:
links:
- title: Repository
url: ${{ steps.publish.output.remoteUrl }}
- title: Open in catalog
icon: catalog
entityRef: ${{ steps.register.output.entityRef }}
web-app-template.yaml: |
apiVersion: scaffolder.backstage.io/v1beta3
kind: Template
metadata:
name: web-app-template
title: Web Application Template
description: Create a new web application with React and TypeScript
tags:
- recommended
- web
- react
- typescript
spec:
owner: frontend-team
type: website
parameters:
- title: Application Information
required:
- name
- description
properties:
name:
title: Name
type: string
description: Unique name of the application
description:
title: Description
type: string
description: Help others understand what this application does
framework:
title: Framework
type: string
description: Choose the frontend framework
default: react
enum:
- react
- vue
- angular
enumNames:
- React
- Vue.js
- Angular
steps:
- id: fetch-base
name: Fetch Base
action: fetch:template
input:
url: ./web-content
values:
name: ${{ parameters.name }}
description: ${{ parameters.description }}
framework: ${{ parameters.framework }}
EOF
# 部署模板内容
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: microservice-content
namespace: platform
data:
catalog-info.yaml: |
apiVersion: backstage.io/v1alpha1
kind: Component
metadata:
name: \${{ values.name }}
description: \${{ values.description }}
annotations:
github.com/project-slug: \${{ values.destination.owner }}/\${{ values.destination.repo }}
spec:
type: service
lifecycle: experimental
owner: \${{ values.owner }}
Dockerfile: |
FROM golang:1.19-alpine AS builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -o main .
FROM alpine:latest
RUN apk --no-cache add ca-certificates
WORKDIR /root/
COPY --from=builder /app/main .
CMD ["./main"]
main.go: |
package main
import (
"fmt"
"log"
"net/http"
"os"
)
func main() {
port := os.Getenv("PORT")
if port == "" {
port = "8080"
}
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
fmt.Fprintf(w, "Hello from \${{ values.name }}!")
})
http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
fmt.Fprintf(w, "OK")
})
log.Printf("Server starting on port %s", port)
log.Fatal(http.ListenAndServe(":"+port, nil))
}
go.mod: |
module \${{ values.name }}
go 1.19
k8s-deployment.yaml: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: \${{ values.name }}
labels:
app: \${{ values.name }}
spec:
replicas: 3
selector:
matchLabels:
app: \${{ values.name }}
template:
metadata:
labels:
app: \${{ values.name }}
spec:
containers:
- name: \${{ values.name }}
image: \${{ values.name }}:latest
ports:
- containerPort: 8080
env:
- name: PORT
value: "8080"
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 5
periodSeconds: 5
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
---
apiVersion: v1
kind: Service
metadata:
name: \${{ values.name }}
spec:
selector:
app: \${{ values.name }}
ports:
- protocol: TCP
port: 80
targetPort: 8080
type: ClusterIP
EOF
}
# 部署开发环境管理
deploy_dev_environments() {
echo "部署开发环境管理..."
# 部署Okteto开发环境
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Namespace
metadata:
name: dev-environments
labels:
name: dev-environments
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dev-env-controller
namespace: dev-environments
spec:
replicas: 1
selector:
matchLabels:
app: dev-env-controller
template:
metadata:
labels:
app: dev-env-controller
spec:
containers:
- name: controller
image: dev-env-controller:latest
ports:
- containerPort: 8080
env:
- name: NAMESPACE_PREFIX
value: "dev-"
- name: DEFAULT_RESOURCES
value: |
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 1
memory: 1Gi
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
serviceAccountName: dev-env-controller
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: dev-env-controller
namespace: dev-environments
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: dev-env-controller
rules:
- apiGroups: [""]
resources: ["namespaces", "pods", "services", "configmaps", "secrets"]
verbs: ["*"]
- apiGroups: ["apps"]
resources: ["deployments", "replicasets"]
verbs: ["*"]
- apiGroups: ["networking.k8s.io"]
resources: ["ingresses"]
verbs: ["*"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: dev-env-controller
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: dev-env-controller
subjects:
- kind: ServiceAccount
name: dev-env-controller
namespace: dev-environments
EOF
# 创建开发环境模板
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: dev-env-templates
namespace: dev-environments
data:
basic-dev-env.yaml: |
apiVersion: v1
kind: Namespace
metadata:
name: dev-{{.Username}}-{{.ProjectName}}
labels:
type: development
owner: {{.Username}}
project: {{.ProjectName}}
annotations:
dev-env/created-at: {{.CreatedAt}}
dev-env/expires-at: {{.ExpiresAt}}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{.ProjectName}}
namespace: dev-{{.Username}}-{{.ProjectName}}
spec:
replicas: 1
selector:
matchLabels:
app: {{.ProjectName}}
template:
metadata:
labels:
app: {{.ProjectName}}
spec:
containers:
- name: dev-container
image: {{.Image}}
ports:
- containerPort: {{.Port}}
env:
- name: ENV
value: "development"
- name: DEBUG
value: "true"
volumeMounts:
- name: code
mountPath: /workspace
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 1
memory: 1Gi
volumes:
- name: code
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: {{.ProjectName}}
namespace: dev-{{.Username}}-{{.ProjectName}}
spec:
selector:
app: {{.ProjectName}}
ports:
- protocol: TCP
port: 80
targetPort: {{.Port}}
type: ClusterIP
EOF
}
# 配置CI/CD集成
setup_cicd_integration() {
echo "配置CI/CD集成..."
# 部署Tekton Pipelines、Triggers和Dashboard(Pipeline/Task/EventListener等CRD依赖这些组件)
kubectl apply --filename https://storage.googleapis.com/tekton-releases/pipeline/latest/release.yaml
kubectl apply --filename https://storage.googleapis.com/tekton-releases/triggers/latest/release.yaml
kubectl apply --filename https://storage.googleapis.com/tekton-releases/triggers/latest/interceptors.yaml
kubectl apply --filename https://storage.googleapis.com/tekton-releases/dashboard/latest/release.yaml
# 创建通用Pipeline模板(heredoc用引号包裹EOF,避免$(params.*)等Tekton变量被外层shell当作命令替换)
cat <<'EOF' | kubectl apply -f -
apiVersion: tekton.dev/v1beta1
kind: Pipeline
metadata:
name: build-and-deploy
namespace: tekton-pipelines
spec:
description: |
This pipeline clones a git repo, builds a Docker image with Kaniko and
deploys it to Kubernetes
params:
- name: repo-url
type: string
- name: image-reference
type: string
- name: deployment-name
type: string
- name: namespace
type: string
default: default
workspaces:
- name: shared-data
description: |
This workspace contains the cloned repo files, so they can be read by the
next task.
- name: docker-credentials
description: |
This workspace contains docker credentials for pushing images.
tasks:
- name: fetch-source
taskRef:
name: git-clone
workspaces:
- name: output
workspace: shared-data
params:
- name: url
value: $(params.repo-url)
- name: build-image
runAfter: ["fetch-source"]
taskRef:
name: kaniko
workspaces:
- name: source
workspace: shared-data
- name: dockerconfig
workspace: docker-credentials
params:
- name: IMAGE
value: $(params.image-reference)
- name: deploy
runAfter: ["build-image"]
taskRef:
name: kubernetes-actions
params:
- name: script
value: |
kubectl set image deployment/$(params.deployment-name) \
$(params.deployment-name)=$(params.image-reference) \
-n $(params.namespace)
kubectl rollout status deployment/$(params.deployment-name) -n $(params.namespace)
---
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
name: kubernetes-actions
namespace: tekton-pipelines
spec:
description: |
This task performs kubernetes actions like apply, delete, etc.
params:
- name: script
description: The kubectl script to run
type: string
steps:
- name: kubectl
image: bitnami/kubectl:latest
script: |
#!/bin/bash
set -e
$(params.script)
EOF
# 创建自动化触发器(同样用引号包裹EOF,保留$(body.*)、$(tt.params.*)等触发器变量)
cat <<'EOF' | kubectl apply -f -
apiVersion: triggers.tekton.dev/v1beta1
kind: EventListener
metadata:
name: github-listener
namespace: tekton-pipelines
spec:
serviceAccountName: tekton-triggers-sa
triggers:
- name: github-push
interceptors:
- ref:
name: "github"
params:
- name: "secretRef"
value:
secretName: github-secret
secretKey: secretToken
- name: "eventTypes"
value: ["push"]
bindings:
- ref: github-push-binding
template:
ref: build-and-deploy-template
---
apiVersion: triggers.tekton.dev/v1beta1
kind: TriggerBinding
metadata:
name: github-push-binding
namespace: tekton-pipelines
spec:
params:
- name: repo-url
value: $(body.repository.clone_url)
- name: repo-name
value: $(body.repository.name)
- name: revision
value: $(body.head_commit.id)
---
apiVersion: triggers.tekton.dev/v1beta1
kind: TriggerTemplate
metadata:
name: build-and-deploy-template
namespace: tekton-pipelines
spec:
params:
- name: repo-url
- name: repo-name
- name: revision
resourcetemplates:
- apiVersion: tekton.dev/v1beta1
kind: PipelineRun
metadata:
generateName: build-and-deploy-run-
spec:
pipelineRef:
name: build-and-deploy
params:
- name: repo-url
value: $(tt.params.repo-url)
- name: image-reference
value: registry.example.com/$(tt.params.repo-name):$(tt.params.revision)
- name: deployment-name
value: $(tt.params.repo-name)
workspaces:
- name: shared-data
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
- name: docker-credentials
secret:
secretName: docker-credentials
EOF
}
# 部署开发者工具
deploy_developer_tools() {
echo "部署开发者工具..."
# 部署代码质量检查
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: sonarqube
namespace: platform
spec:
replicas: 1
selector:
matchLabels:
app: sonarqube
template:
metadata:
labels:
app: sonarqube
spec:
containers:
- name: sonarqube
image: sonarqube:community
ports:
- containerPort: 9000
env:
- name: SONAR_JDBC_URL
value: jdbc:postgresql://postgres:5432/sonar
- name: SONAR_JDBC_USERNAME
value: sonar
- name: SONAR_JDBC_PASSWORD
valueFrom:
secretKeyRef:
name: sonar-secrets
key: password
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2
memory: 4Gi
volumeMounts:
- name: sonarqube-data
mountPath: /opt/sonarqube/data
- name: sonarqube-logs
mountPath: /opt/sonarqube/logs
- name: sonarqube-extensions
mountPath: /opt/sonarqube/extensions
volumes:
- name: sonarqube-data
persistentVolumeClaim:
claimName: sonarqube-data-pvc
- name: sonarqube-logs
persistentVolumeClaim:
claimName: sonarqube-logs-pvc
- name: sonarqube-extensions
persistentVolumeClaim:
claimName: sonarqube-extensions-pvc
---
apiVersion: v1
kind: Service
metadata:
name: sonarqube
namespace: platform
spec:
selector:
app: sonarqube
ports:
- protocol: TCP
port: 9000
targetPort: 9000
type: ClusterIP
EOF
# 部署API文档生成
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: swagger-ui
namespace: platform
spec:
replicas: 1
selector:
matchLabels:
app: swagger-ui
template:
metadata:
labels:
app: swagger-ui
spec:
containers:
- name: swagger-ui
image: swaggerapi/swagger-ui:latest
ports:
- containerPort: 8080
env:
- name: SWAGGER_JSON_URL
value: "http://api-docs-aggregator/swagger.json"
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
---
apiVersion: v1
kind: Service
metadata:
name: swagger-ui
namespace: platform
spec:
selector:
app: swagger-ui
ports:
- protocol: TCP
port: 80
targetPort: 8080
type: ClusterIP
EOF
}
# 生成开发者体验报告
generate_dx_report() {
echo "生成开发者体验报告..."
local report_file="developer-experience-report-$(date +%Y%m%d).md"
cat > "$report_file" <<EOF
# 开发者体验报告
生成时间: $(date)
## 1. 平台概览
### 开发者门户状态
- **Backstage状态**: $(kubectl get pods -n platform -l app=backstage -o jsonpath='{.items[0].status.phase}' 2>/dev/null || echo "未部署")
- **可用模板数量**: $(kubectl get configmaps -n platform -l type=template --no-headers 2>/dev/null | wc -l)
- **注册组件数量**: 通过Backstage API获取
### 开发环境
- **活跃开发环境**: $(kubectl get namespaces -l type=development --no-headers 2>/dev/null | wc -l)
- **环境资源使用**: $(kubectl top pods -n dev-environments 2>/dev/null | tail -n +2 | awk '{sum+=$2} END {print sum "m CPU"}' || echo "N/A")
## 2. CI/CD 管道
### Pipeline 统计
- **总Pipeline数**: $(kubectl get pipelines -n tekton-pipelines --no-headers 2>/dev/null | wc -l)
- **最近24小时运行**: $(kubectl get pipelineruns -n tekton-pipelines --no-headers 2>/dev/null | wc -l)
- **成功率**: 通过Tekton Dashboard API计算
### 部署频率
\`\`\`
$(kubectl get pipelineruns -n tekton-pipelines -o json 2>/dev/null | jq -r '.items[] | select(.status.completionTime != null) | .metadata.creationTimestamp' | head -10 || echo "无数据")
\`\`\`
## 3. 开发工具
### 代码质量
- **SonarQube状态**: $(kubectl get pods -n platform -l app=sonarqube -o jsonpath='{.items[0].status.phase}' 2>/dev/null || echo "未部署")
- **扫描项目数**: 通过SonarQube API获取
- **代码覆盖率**: 通过SonarQube API获取
### API文档
- **Swagger UI状态**: $(kubectl get pods -n platform -l app=swagger-ui -o jsonpath='{.items[0].status.phase}' 2>/dev/null || echo "未部署")
- **文档化API数量**: 通过API聚合器获取
## 4. 开发者满意度指标
### 部署时间
- **平均部署时间**: 通过Pipeline运行时间计算
- **从代码到生产时间**: 端到端时间测量
### 错误率
- **Pipeline失败数**: $(kubectl get pipelineruns -n tekton-pipelines -o json 2>/dev/null | jq '[.items[] | select(.status.conditions[0].status == "False")] | length' || echo "0")
- **环境创建失败率**: 通过开发环境控制器日志分析
## 5. 改进建议
### 立即改进
1. 优化Pipeline性能
2. 增加更多应用模板
3. 改善文档和培训
4. 实施自动化测试
### 中期目标
1. 实施GitOps工作流
2. 增强监控和可观测性
3. 实施安全扫描集成
4. 优化资源使用
### 长期愿景
1. 实现完全自助服务
2. 建立开发者社区
3. 持续改进开发体验
4. 实施AI辅助开发
## 6. 关键指标
建议跟踪以下开发者体验指标:
- 部署频率
- 变更前置时间
- 平均恢复时间
- 变更失败率
- 开发者满意度评分
EOF
echo "开发者体验报告已生成: $report_file"
}
# 主函数
case "$1" in
templates)
deploy_app_templates
;;
environments)
deploy_dev_environments
;;
cicd)
setup_cicd_integration
;;
tools)
deploy_developer_tools
;;
report)
generate_dx_report
;;
all)
deploy_app_templates
deploy_dev_environments
setup_cicd_integration
deploy_developer_tools
generate_dx_report
;;
*)
echo "用法: $0 {templates|environments|cicd|tools|report|all}"
echo " templates - 部署应用模板"
echo " environments - 部署开发环境管理"
echo " cicd - 配置CI/CD集成"
echo " tools - 部署开发者工具"
echo " report - 生成开发者体验报告"
echo " all - 执行所有配置"
exit 1
;;
esac
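Pipeline与触发器就绪后,可以先手动启动一次PipelineRun验证整条流水线,再通过tkn CLI跟踪日志(仓库地址和镜像地址均为示例值,需已安装tkn,且Pipeline引用的git-clone、kaniko等Task需事先从Tekton Hub安装):
# 手动触发一次Pipeline并跟踪日志(示意)
tkn pipeline start build-and-deploy -n tekton-pipelines \
  -p repo-url=https://github.com/example/demo-app.git \
  -p image-reference=registry.example.com/demo-app:dev \
  -p deployment-name=demo-app \
  -w name=shared-data,emptyDir="" \
  -w name=docker-credentials,secret=docker-credentials \
  --showlog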
14.9 总结与展望
14.9.1 技术趋势总结
# 未来技术趋势配置图
apiVersion: v1
kind: ConfigMap
metadata:
name: k8s-future-trends
namespace: kube-system
data:
trends-summary.yaml: |
# Kubernetes未来发展趋势
trends:
edge_computing:
description: "边缘计算和IoT设备管理"
maturity: "成长期"
adoption_timeline: "2024-2026"
key_technologies:
- K3s
- KubeEdge
- OpenYurt
- Akri
use_cases:
- IoT设备管理
- 边缘AI推理
- 实时数据处理
- 低延迟应用
ai_ml_workloads:
description: "AI/ML工作负载原生支持"
maturity: "快速发展期"
adoption_timeline: "2024-2025"
key_technologies:
- Kubeflow
- KServe
- Volcano
- GPU Operator
use_cases:
- 模型训练
- 模型推理
- 数据管道
- MLOps
webassembly:
description: "WebAssembly运行时集成"
maturity: "早期阶段"
adoption_timeline: "2025-2027"
key_technologies:
- Wasmtime
- WasmEdge
- Krustlet
- WASI
use_cases:
- 轻量级函数
- 多语言支持
- 安全沙箱
- 边缘计算
multi_cloud:
description: "多云和混合云管理"
maturity: "成熟期"
adoption_timeline: "2024-2025"
key_technologies:
- Cluster API
- Admiral
- Submariner
- Liqo
use_cases:
- 多云部署
- 灾难恢复
- 成本优化
- 避免厂商锁定
sustainability:
description: "可持续发展和绿色计算"
maturity: "新兴期"
adoption_timeline: "2024-2026"
key_technologies:
- Kepler
- Carbon Aware Scheduler
- Green Metrics
- Sustainable Computing
use_cases:
- 碳足迹监控
- 绿色调度
- 能效优化
- 可持续运营
security_enhancement:
description: "安全增强和零信任"
maturity: "快速发展期"
adoption_timeline: "2024-2025"
key_technologies:
- Sigstore
- SPIFFE/SPIRE
- Falco
- OPA Gatekeeper
use_cases:
- 供应链安全
- 零信任架构
- 运行时安全
- 合规管理
developer_experience:
description: "开发者体验和平台工程"
maturity: "快速发展期"
adoption_timeline: "2024-2025"
key_technologies:
- Backstage
- Crossplane
- Tekton
- ArgoCD
use_cases:
- 内部开发者平台
- 自助服务
- GitOps
- 应用模板
adoption_recommendations:
immediate_focus:
- "实施GitOps工作流"
- "加强安全基线"
- "优化资源使用"
- "建立监控体系"
short_term:
- "探索边缘计算场景"
- "实施AI/ML工作负载"
- "建设开发者平台"
- "实施多云策略"
long_term:
- "采用WebAssembly技术"
- "实现可持续运营"
- "建立零信任架构"
- "实施智能运维"
success_metrics:
technical:
- "部署频率"
- "变更前置时间"
- "平均恢复时间"
- "变更失败率"
business:
- "开发者生产力"
- "运营成本"
- "安全事件数量"
- "合规达成率"
sustainability:
- "碳足迹"
- "能源效率"
- "资源利用率"
- "可再生能源使用率"
14.9.2 实施路线图
#!/bin/bash
# Kubernetes未来技术实施路线图脚本
echo "=== Kubernetes未来技术实施路线图 ==="
# 生成实施路线图
generate_roadmap() {
echo "生成实施路线图..."
local roadmap_file="k8s-future-roadmap-$(date +%Y%m%d).md"
cat > "$roadmap_file" <<EOF
# Kubernetes未来技术实施路线图
生成时间: $(date)
## 阶段一:基础设施现代化 (0-6个月)
### 目标
- 建立现代化的Kubernetes基础设施
- 实施基本的安全和监控
- 优化资源使用和成本
### 关键任务
1. **集群升级和标准化**
- 升级到最新稳定版本
- 统一集群配置
- 实施基础安全策略
2. **监控和可观测性**
- 部署Prometheus + Grafana
- 实施日志聚合
- 配置告警规则
3. **CI/CD现代化**
- 实施GitOps工作流
- 自动化部署管道
- 集成安全扫描
4. **资源优化**
- 实施资源配额
- 配置自动扩缩容
- 优化镜像和存储
### 成功指标
- 集群可用性 > 99.9%
- 部署时间 < 10分钟
- 资源利用率 > 70%
- 安全扫描覆盖率 100%
## 阶段二:平台工程和开发者体验 (6-12个月)
### 目标
- 建设内部开发者平台
- 提升开发者生产力
- 实施自助服务模式
### 关键任务
1. **开发者门户**
- 部署Backstage平台
- 创建应用模板
- 集成开发工具链
2. **自助服务**
- 实施环境自动化
- 配置资源自助申请
- 建立服务目录
3. **质量保证**
- 集成代码质量检查
- 自动化测试流水线
- 实施性能测试
4. **文档和培训**
- 建立知识库
- 开发者培训计划
- 最佳实践指南
### 成功指标
- 开发者满意度 > 4.5/5
- 环境创建时间 < 5分钟
- 自助服务使用率 > 80%
- 文档覆盖率 > 90%
## 阶段三:高级特性和新兴技术 (12-18个月)
### 目标
- 探索和实施新兴技术
- 扩展到边缘和多云
- 支持AI/ML工作负载
### 关键任务
1. **边缘计算**
- 部署边缘集群
- 实施边缘应用管理
- 配置边缘监控
2. **AI/ML平台**
- 部署Kubeflow
- 配置GPU资源池
- 实施模型管理
3. **多云管理**
- 实施集群联邦
- 配置跨云网络
- 建立灾难恢复
4. **WebAssembly探索**
- 试点WASM运行时
- 开发WASM应用
- 评估性能和安全性
### 成功指标
- 边缘节点数量 > 10
- ML模型部署时间 < 30分钟
- 多云部署成功率 > 95%
- WASM应用性能提升 > 20%
## 阶段四:可持续发展和智能运维 (18-24个月)
### 目标
- 实现可持续运营
- 建立智能运维体系
- 实施零信任安全
### 关键任务
1. **绿色计算**
- 实施碳足迹监控
- 配置绿色调度
- 优化能源使用
2. **智能运维**
- 实施AIOps
- 自动化故障处理
- 预测性维护
3. **零信任安全**
- 实施微分段
- 配置身份验证
- 建立安全基线
4. **持续优化**
- 性能调优自动化
- 成本优化算法
- 容量规划智能化
### 成功指标
- 碳足迹减少 > 30%
- 故障自动修复率 > 80%
- 安全事件减少 > 50%
- 运维效率提升 > 40%
## 技术选型建议
### 立即采用
- **GitOps**: ArgoCD, Flux
- **监控**: Prometheus, Grafana, Jaeger
- **安全**: OPA Gatekeeper, Falco
- **CI/CD**: Tekton, GitHub Actions
### 短期评估
- **平台工程**: Backstage, Crossplane
- **服务网格**: Istio, Linkerd
- **多云**: Cluster API, Admiral
- **AI/ML**: Kubeflow, KServe
### 长期探索
- **边缘计算**: K3s, KubeEdge
- **WebAssembly**: Krustlet, WasmEdge
- **可持续性**: Kepler, Carbon Aware Scheduler
- **智能运维**: AIOps平台, 预测分析
## 风险和缓解策略
### 技术风险
- **新技术不成熟**: 先试点,后推广
- **兼容性问题**: 充分测试,渐进升级
- **性能影响**: 基准测试,监控对比
### 组织风险
- **技能差距**: 培训计划,外部支持
- **变更阻力**: 沟通策略,激励机制
- **资源不足**: 分阶段实施,优先级管理
### 业务风险
- **服务中断**: 蓝绿部署,回滚策略
- **成本超支**: 预算控制,成本监控
- **合规问题**: 安全审计,合规检查
## 投资回报分析
### 成本节约
- 运维自动化: 节约人力成本30-50%
- 资源优化: 降低基础设施成本20-40%
- 故障减少: 减少业务损失60-80%
### 效率提升
- 部署速度: 提升5-10倍
- 开发效率: 提升30-50%
- 问题解决: 缩短70-90%
### 创新能力
- 新技术采用: 缩短6-12个月
- 产品上市: 加速30-50%
- 竞争优势: 建立技术护城河
EOF
echo "实施路线图已生成: $roadmap_file"
}
# 创建技术评估矩阵
create_assessment_matrix() {
echo "创建技术评估矩阵..."
cat > "technology-assessment-matrix.csv" <<EOF
技术,成熟度,复杂度,投资成本,预期收益,风险等级,推荐优先级
GitOps,高,中,低,高,低,1
服务网格,高,高,中,高,中,2
边缘计算,中,高,高,中,中,3
AI/ML平台,中,高,高,高,中,2
WebAssembly,低,中,低,中,高,4
可持续计算,低,中,中,中,中,3
零信任安全,中,高,高,高,中,2
平台工程,中,中,中,高,低,1
多云管理,中,高,高,中,中,3
智能运维,低,高,高,高,高,4
EOF
echo "技术评估矩阵已创建: technology-assessment-matrix.csv"
}
# 生成决策支持报告
generate_decision_support() {
echo "生成决策支持报告..."
local decision_file="decision-support-$(date +%Y%m%d).md"
cat > "$decision_file" <<EOF
# Kubernetes未来技术决策支持报告
## 执行摘要
Kubernetes生态系统正在快速发展,新兴技术为企业带来了前所未有的机遇和挑战。本报告基于当前技术趋势、市场需求和最佳实践,为组织制定Kubernetes未来技术策略提供决策支持。
## 关键发现
### 1. 技术成熟度分析
- **成熟技术**: GitOps、服务网格、监控可观测性
- **快速发展**: AI/ML平台、平台工程、零信任安全
- **新兴技术**: WebAssembly、可持续计算、智能运维
### 2. 业务价值评估
- **高价值**: 平台工程(开发效率)、AI/ML(创新能力)、安全增强(风险控制)
- **中等价值**: 边缘计算(新场景)、多云管理(灵活性)
- **长期价值**: 可持续计算(合规要求)、智能运维(成本优化)
### 3. 实施复杂度
- **低复杂度**: GitOps、基础监控、应用模板
- **中等复杂度**: 平台工程、WebAssembly、可持续计算
- **高复杂度**: 服务网格、AI/ML平台、零信任架构
## 推荐策略
### 短期策略 (6-12个月)
1. **优先实施GitOps和平台工程**
- 投资回报高,风险低
- 为后续技术奠定基础
- 显著提升开发者体验
2. **加强安全基线**
- 实施基础安全策略
- 配置监控和告警
- 建立合规框架
3. **优化现有工作负载**
- 资源使用优化
- 性能调优
- 成本控制
### 中期策略 (12-18个月)
1. **探索AI/ML工作负载**
- 评估业务需求
- 试点项目实施
- 建立ML运维能力
2. **实施服务网格**
- 微服务治理
- 流量管理
- 安全策略
3. **扩展到边缘场景**
- 识别边缘用例
- 部署边缘基础设施
- 建立边缘管理能力
### 长期策略 (18-24个月)
1. **建立可持续运营**
- 碳足迹监控
- 绿色计算实践
- 可持续发展目标
2. **实施智能运维**
- AIOps平台
- 自动化运维
- 预测性维护
3. **探索前沿技术**
- WebAssembly应用
- 量子计算准备
- 下一代架构
## 投资建议
### 预算分配
- **基础设施现代化**: 40%
- **平台工程**: 25%
- **安全增强**: 20%
- **新兴技术**: 15%
### 人员配置
- **平台工程师**: 2-3人
- **DevOps工程师**: 3-4人
- **安全工程师**: 1-2人
- **AI/ML工程师**: 1-2人
### 培训投入
- **内部培训**: 每季度技术分享
- **外部培训**: 年度技术大会
- **认证考试**: 鼓励获得相关认证
- **实践项目**: 20%时间用于技术探索
## 风险管理
### 技术风险
- **缓解策略**: 试点项目、渐进实施、充分测试
- **应急计划**: 回滚策略、备用方案、专家支持
### 组织风险
- **变更管理**: 沟通计划、培训支持、激励机制
- **技能建设**: 招聘计划、培训投入、知识管理
### 业务风险
- **连续性保障**: 蓝绿部署、灾难恢复、监控告警
- **合规要求**: 安全审计、数据保护、行业标准
## 成功指标
### 技术指标
- 部署频率提升 > 5倍
- 故障恢复时间 < 1小时
- 系统可用性 > 99.9%
- 安全事件 < 1次/月
### 业务指标
- 开发效率提升 > 50%
- 运维成本降低 > 30%
- 新功能上线时间 < 2周
- 客户满意度 > 4.5/5
### 创新指标
- 新技术采用数量 > 3个/年
- 技术专利申请 > 2个/年
- 开源贡献 > 10个PR/年
- 技术影响力提升
## 结论
Kubernetes未来技术的采用需要平衡创新与稳定、收益与风险。建议采用渐进式策略,优先实施成熟度高、价值明确的技术,同时保持对新兴技术的关注和试点。成功的关键在于建立强大的平台工程能力、持续的学习文化和有效的风险管理机制。
EOF
echo "决策支持报告已生成: $decision_file"
}
# 主函数
case "$1" in
roadmap)
generate_roadmap
;;
matrix)
create_assessment_matrix
;;
decision)
generate_decision_support
;;
all)
generate_roadmap
create_assessment_matrix
generate_decision_support
;;
*)
echo "用法: $0 {roadmap|matrix|decision|all}"
echo " roadmap - 生成实施路线图"
echo " matrix - 创建技术评估矩阵"
echo " decision - 生成决策支持报告"
echo " all - 生成所有文档"
exit 1
;;
esac
本章详细介绍了Kubernetes的未来发展趋势和新兴技术,包括边缘计算、AI/ML工作负载、WebAssembly集成、多云混合云、可持续发展、安全增强、开发者体验等方面。通过实际的配置示例、脚本和最佳实践,帮助读者了解和准备这些前沿技术。
下一章预告: 第15章将总结整个Kubernetes学习之旅,回顾核心概念,展望技术发展,并提供持续学习的建议和资源。
14.2.3 边缘计算最佳实践
# 边缘计算资源配额
apiVersion: v1
kind: ResourceQuota
metadata:
name: edge-quota
namespace: edge-apps
spec:
hard:
requests.cpu: "2"
requests.memory: 4Gi
limits.cpu: "4"
limits.memory: 8Gi
persistentvolumeclaims: "5"
pods: "10"
---
# 边缘网络策略
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: edge-network-policy
namespace: edge-apps
spec:
podSelector:
matchLabels:
tier: edge
policyTypes:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
name: edge-apps
- podSelector:
matchLabels:
role: edge-gateway
ports:
- protocol: TCP
port: 80
- protocol: TCP
port: 443
egress:
- to:
- namespaceSelector:
matchLabels:
name: kube-system
- to: []
ports:
- protocol: TCP
port: 53
- protocol: UDP
port: 53
---
# 边缘Pod中断预算
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: edge-app-pdb
namespace: edge-apps
spec:
minAvailable: 1
selector:
matchLabels:
app: edge-sensor
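上面的配额、网络策略和PodDisruptionBudget生效后,可以用下面几条命令快速验证(示例验证命令,资源名与命名空间沿用本节的 edge-apps 配置):
# 查看配额定义与当前使用量, 超出requests/limits的Pod会被准入控制拒绝
kubectl describe resourcequota edge-quota -n edge-apps
# 确认网络策略已选中目标Pod(Pod需带有 tier: edge 标签)
kubectl describe networkpolicy edge-network-policy -n edge-apps
# 查看中断预算状态, ALLOWED DISRUPTIONS为0时节点维护会被阻塞
kubectl get pdb edge-app-pdb -n edge-apps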
14.3 AI/ML工作负载支持
14.3.1 GPU资源管理
# GPU节点配置
apiVersion: v1
kind: Node
metadata:
name: gpu-node-001
labels:
accelerator: nvidia-tesla-v100
gpu-count: "4"
node-type: gpu
# 说明: capacity/allocatable 属于Node的status字段, 由kubelet和NVIDIA device plugin自动上报, 不是由用户在spec中手工设置
status:
capacity:
nvidia.com/gpu: "4"
allocatable:
nvidia.com/gpu: "4"
---
# GPU工作负载
apiVersion: batch/v1
kind: Job
metadata:
name: ml-training-job
namespace: ml-workloads
spec:
template:
spec:
containers:
- name: pytorch-training
image: pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
command: ["python"]
args: ["/app/train.py"]
resources:
requests:
nvidia.com/gpu: 2
cpu: "4"
memory: "16Gi"
limits:
nvidia.com/gpu: 2
cpu: "8"
memory: "32Gi"
env:
- name: CUDA_VISIBLE_DEVICES
value: "0,1"
- name: NCCL_DEBUG
value: "INFO"
volumeMounts:
- name: training-data
mountPath: /data
- name: model-output
mountPath: /output
- name: shm
mountPath: /dev/shm
volumes:
- name: training-data
persistentVolumeClaim:
claimName: training-data-pvc
- name: model-output
persistentVolumeClaim:
claimName: model-output-pvc
- name: shm
emptyDir:
medium: Memory
sizeLimit: 8Gi
restartPolicy: Never
nodeSelector:
accelerator: nvidia-tesla-v100
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
backoffLimit: 3
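nvidia.com/gpu 这类扩展资源只有在GPU节点上安装了NVIDIA驱动和device plugin之后才会出现在节点的allocatable中。下面是一组示例命令,演示如何隔离GPU节点并确认资源已上报;其中device plugin清单的版本号只是假定值,请以NVIDIA/k8s-device-plugin项目的发布页为准:
# 为GPU节点打污点, 与上面Job中的toleration配对, 避免普通负载占用GPU节点
kubectl taint nodes gpu-node-001 nvidia.com/gpu=present:NoSchedule
# 部署NVIDIA device plugin(版本号仅为示例)
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.1/nvidia-device-plugin.yml
# 确认节点已上报GPU资源
kubectl describe node gpu-node-001 | grep -i nvidia.com/gpu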
14.3.2 Kubeflow机器学习平台
#!/bin/bash
# Kubeflow部署脚本
echo "=== Kubeflow机器学习平台部署 ==="
# 安装Kubeflow
install_kubeflow() {
echo "安装Kubeflow..."
# 说明: kfctl自Kubeflow 1.3起已弃用, 1.7版本官方推荐基于kubeflow/manifests仓库用kustomize安装
# (假定本机已安装git和kustomize 5.x)
KUBEFLOW_BRANCH="v1.7-branch"
BASE_DIR=/opt/kubeflow
mkdir -p ${BASE_DIR}
cd ${BASE_DIR}
# 获取官方manifests
git clone -b ${KUBEFLOW_BRANCH} https://github.com/kubeflow/manifests.git
cd manifests
# 反复apply直到全部CRD与资源创建成功(官方文档推荐的安装循环)
while ! kustomize build example | kubectl apply -f -; do
echo "部分资源尚未就绪, 10秒后重试..."
sleep 10
done
echo "等待Kubeflow组件启动..."
kubectl wait --for=condition=available --timeout=600s deployment --all -n kubeflow
kubectl wait --for=condition=available --timeout=600s deployment --all -n istio-system
}
# 配置Jupyter Notebook
setup_jupyter() {
echo "配置Jupyter Notebook..."
cat <<EOF | kubectl apply -f -
apiVersion: kubeflow.org/v1
kind: Notebook
metadata:
name: ml-notebook
namespace: kubeflow-user-example-com
spec:
template:
spec:
containers:
- name: notebook
image: jupyter/tensorflow-notebook:latest
resources:
requests:
cpu: "1"
memory: "2Gi"
nvidia.com/gpu: "1"
limits:
cpu: "2"
memory: "4Gi"
nvidia.com/gpu: "1"
env:
- name: JUPYTER_ENABLE_LAB
value: "yes"
volumeMounts:
- name: workspace
mountPath: /home/jovyan/work
volumes:
- name: workspace
persistentVolumeClaim:
claimName: ml-workspace-pvc
EOF
}
# 创建训练Pipeline
create_training_pipeline() {
echo "创建训练Pipeline..."
cat <<EOF > training_pipeline.py
import kfp
from kfp import dsl
from kfp.components import create_component_from_func
# 数据预处理组件
@create_component_from_func
def preprocess_data(input_path: str, output_path: str) -> str:
import os
import pandas as pd
import numpy as np
# 模拟数据预处理
print(f"Processing data from {input_path}")
# 这里添加实际的数据预处理逻辑
# 注意: KFP轻量组件的各步骤运行在不同Pod中, 真实流水线需要通过共享存储(如PVC)或组件输出来传递数据
os.makedirs(output_path, exist_ok=True)  # 确保输出目录存在
data = pd.DataFrame(np.random.randn(1000, 10))
data.to_csv(f"{output_path}/processed_data.csv", index=False)
return output_path
# 模型训练组件
@create_component_from_func
def train_model(data_path: str, model_path: str) -> str:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
print(f"Training model with data from {data_path}")
# 加载数据
data = pd.read_csv(f"{data_path}/processed_data.csv")
X = data.iloc[:, :-1]
y = np.random.randint(0, 2, len(data)) # 模拟标签
# 训练模型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
# 保存模型(确保输出目录存在)
import os
os.makedirs(model_path, exist_ok=True)
joblib.dump(model, f"{model_path}/model.pkl")
return model_path
# 模型评估组件
@create_component_from_func
def evaluate_model(model_path: str, data_path: str) -> float:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
print(f"Evaluating model from {model_path}")
# 加载模型和数据
model = joblib.load(f"{model_path}/model.pkl")
data = pd.read_csv(f"{data_path}/processed_data.csv")
X = data.iloc[:, :-1]
y = np.random.randint(0, 2, len(data)) # 模拟标签
# 评估模型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model accuracy: {accuracy}")
return accuracy
# 定义Pipeline
@dsl.pipeline(
name='ML Training Pipeline',
description='A machine learning training pipeline'
)
def ml_training_pipeline(
input_data_path: str = '/data/input',
processed_data_path: str = '/data/processed',
model_output_path: str = '/data/models'
):
# 数据预处理步骤
preprocess_task = preprocess_data(
input_path=input_data_path,
output_path=processed_data_path
)
# 模型训练步骤
train_task = train_model(
data_path=preprocess_task.output,
model_path=model_output_path
)
# 模型评估步骤
evaluate_task = evaluate_model(
model_path=train_task.output,
data_path=preprocess_task.output
)
# 设置GPU资源
train_task.set_gpu_limit(1)
train_task.add_node_selector_constraint('accelerator', 'nvidia-tesla-v100')
if __name__ == '__main__':
# 编译Pipeline
kfp.compiler.Compiler().compile(ml_training_pipeline, 'ml_training_pipeline.yaml')
print("Pipeline compiled successfully!")
EOF
# 运行Pipeline编译
python training_pipeline.py
echo "训练Pipeline已创建"
}
# 部署模型服务
deploy_model_serving() {
echo "部署模型服务..."
cat <<EOF | kubectl apply -f -
# 说明: Kubeflow 1.7集成的模型服务组件是KServe, 对应API组为serving.kserve.io
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
name: sklearn-iris
namespace: kubeflow-user-example-com
spec:
predictor:
sklearn:
storageUri: "gs://kfserving-examples/models/sklearn/iris"
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
transformer:
containers:
- name: transformer
image: kfserving/image-transformer:latest
env:
- name: STORAGE_URI
value: "gs://kfserving-examples/models/sklearn/iris"
EOF
}
# 设置监控
setup_ml_monitoring() {
echo "设置ML监控..."
cat <<EOF | kubectl apply -f -
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kubeflow-monitoring
namespace: kubeflow
spec:
selector:
matchLabels:
app: kubeflow
endpoints:
- port: http
interval: 30s
path: /metrics
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kubeflow-alerts
namespace: kubeflow
spec:
groups:
- name: kubeflow.rules
rules:
- alert: KubeflowPipelineFailure
expr: increase(kubeflow_pipeline_runs_failed_total[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: "Kubeflow pipeline failure detected"
description: "Pipeline {{ \$labels.pipeline_name }} has failed"
- alert: ModelServingDown
expr: up{job="model-serving"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Model serving is down"
description: "Model serving endpoint has been down for more than 5 minutes"
EOF
}
# 主函数
case "$1" in
install)
install_kubeflow
;;
jupyter)
setup_jupyter
;;
pipeline)
create_training_pipeline
;;
serving)
deploy_model_serving
;;
monitoring)
setup_ml_monitoring
;;
all)
install_kubeflow
setup_jupyter
create_training_pipeline
deploy_model_serving
setup_ml_monitoring
;;
*)
echo "用法: $0 {install|jupyter|pipeline|serving|monitoring|all}"
echo " install - 安装Kubeflow"
echo " jupyter - 配置Jupyter Notebook"
echo " pipeline - 创建训练Pipeline"
echo " serving - 部署模型服务"
echo " monitoring - 设置监控"
echo " all - 执行所有配置"
exit 1
;;
esac
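InferenceService部署完成后,可以按KServe的v1推理协议做一次冒烟测试。下面的命令是示例,假定集群通过Istio Ingress Gateway对外暴露推理服务;如果LoadBalancer没有分配外部IP,也可以改用kubectl port-forward:
# 查看InferenceService状态与对外URL
kubectl get inferenceservice sklearn-iris -n kubeflow-user-example-com
# 构造一条符合v1协议的请求体
cat > iris-input.json <<'EOF'
{"instances": [[6.8, 2.8, 4.8, 1.4], [6.0, 3.4, 4.5, 1.6]]}
EOF
# 通过Istio入口发送预测请求
INGRESS_HOST=$(kubectl -n istio-system get svc istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
SERVICE_HOSTNAME=$(kubectl get inferenceservice sklearn-iris -n kubeflow-user-example-com -o jsonpath='{.status.url}' | cut -d/ -f3)
curl -H "Host: ${SERVICE_HOSTNAME}" "http://${INGRESS_HOST}/v1/models/sklearn-iris:predict" -d @iris-input.json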
14.4 WebAssembly (WASM) 集成
14.4.1 WASM运行时支持
WebAssembly为Kubernetes提供了轻量级、安全的运行时选择。
# WASM运行时配置
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: wasmtime
handler: wasmtime
overhead:
podFixed:
cpu: 10m
memory: 32Mi
scheduling:
nodeSelector:
wasm-runtime: "true"
tolerations:
- key: wasm-runtime
operator: Equal
value: "true"
effect: NoSchedule
---
# WASM应用部署
apiVersion: apps/v1
kind: Deployment
metadata:
name: wasm-app
namespace: wasm-apps
spec:
replicas: 3
selector:
matchLabels:
app: wasm-app
template:
metadata:
labels:
app: wasm-app
spec:
runtimeClassName: wasmtime
containers:
- name: wasm-container
image: wasm-app:latest
resources:
requests:
cpu: 5m
memory: 16Mi
limits:
cpu: 50m
memory: 128Mi
env:
- name: WASM_MODULE_PATH
value: "/app/module.wasm"
ports:
- containerPort: 8080
protocol: TCP
nodeSelector:
wasm-runtime: "true"
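上面的Deployment通过runtimeClassName、nodeSelector和toleration把Pod固定到安装了WASM shim的节点上,因此部署前需要先给这些节点打上对应的标签和污点(示例命令,节点名为占位符):
# 给安装了wasmtime shim的节点打标签, 与Deployment的nodeSelector匹配
kubectl label node <节点名> wasm-runtime=true
# 可选: 打污点, 只允许声明了toleration的WASM负载调度到该节点
kubectl taint node <节点名> wasm-runtime=true:NoSchedule
# 确认RuntimeClass已创建
kubectl get runtimeclass wasmtime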
14.4.2 WASM应用开发
#!/bin/bash
# WASM应用开发和部署脚本
echo "=== WebAssembly应用开发和部署 ==="
# 安装WASM运行时
install_wasm_runtime() {
echo "安装WASM运行时..."
# 安装wasmtime
curl https://wasmtime.dev/install.sh -sSf | bash
source ~/.bashrc
# 安装containerd的wasmtime shim(由runwasi项目提供)
# 说明: 发布包与解压后的文件名随版本不同, 请以 https://github.com/containerd/runwasi/releases 页面为准;
# 下面的文件名仅为示例, 关键是最终把shim以 containerd-shim-wasmtime-v1 的名字放到PATH中,
# 这样containerd才能按runtime_type io.containerd.wasmtime.v1 找到它
wget https://github.com/containerd/runwasi/releases/download/containerd-wasm-shims%2Fv0.3.3/containerd-wasm-shims-v1-linux-x86_64.tar.gz
tar -xzf containerd-wasm-shims-v1-linux-x86_64.tar.gz
sudo mv containerd-shim-wasmtime-v1 /usr/local/bin/
# 配置containerd
sudo tee -a /etc/containerd/config.toml <<EOF
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.wasmtime]
runtime_type = "io.containerd.wasmtime.v1"
EOF
# 重启containerd
sudo systemctl restart containerd
}
# 创建WASM应用示例
create_wasm_app() {
echo "创建WASM应用示例..."
# 创建Rust项目
mkdir -p wasm-hello-world
cd wasm-hello-world
# 创建Cargo.toml
# 说明: 目标是wasm32-wasi下的独立可执行模块, 不需要面向浏览器的wasm-bindgen/web-sys,
# 也不需要tokio, 保持零依赖即可编译
cat > Cargo.toml <<EOF
[package]
name = "wasm-hello-world"
version = "0.1.0"
edition = "2021"
[dependencies]
EOF
# 创建源代码
mkdir -p src
cat > src/lib.rs <<EOF
// 以C ABI导出若干函数, 宿主运行时(如wasmtime)可以直接调用这些导出项
#[no_mangle]
pub extern "C" fn add(a: i32, b: i32) -> i32 {
a + b
}
#[no_mangle]
pub extern "C" fn fibonacci(n: i32) -> i32 {
if n <= 1 {
return n;
}
fibonacci(n - 1) + fibonacci(n - 2)
}
EOF
# 创建HTTP服务器
# 说明: WASI preview 1对TCP socket的支持取决于具体运行时(例如WasmEdge的socket扩展,
# 或wasmtime以预打开监听端口的方式提供), 此示例只演示最基本的请求-响应逻辑
cat > src/main.rs <<EOF
use std::io::prelude::*;
use std::net::{TcpListener, TcpStream};
fn main() {
// wasm32-wasi下不依赖线程, 顺序处理每个连接
let listener = TcpListener::bind("0.0.0.0:8080").expect("绑定端口失败(需要运行时提供socket支持)");
println!("WASM HTTP Server listening on port 8080");
for stream in listener.incoming() {
if let Ok(stream) = stream {
handle_connection(stream);
}
}
}
fn handle_connection(mut stream: TcpStream) {
let mut buffer = [0; 1024];
let _ = stream.read(&mut buffer);
let response = "HTTP/1.1 200 OK\r\nContent-Length: 16\r\n\r\nHello from WASM!";
let _ = stream.write_all(response.as_bytes());
let _ = stream.flush();
}
EOF
# 编译为WASM
rustup target add wasm32-wasi
cargo build --target wasm32-wasi --release
echo "WASM应用编译完成"
cd ..
}
# 构建WASM容器镜像
build_wasm_image() {
echo "构建WASM容器镜像..."
# 创建Dockerfile
cat > Dockerfile.wasm <<EOF
FROM scratch
COPY wasm-hello-world/target/wasm32-wasi/release/wasm-hello-world.wasm /app/module.wasm
EXPOSE 8080
ENTRYPOINT ["/app/module.wasm"]
EOF
# 构建镜像
docker build -f Dockerfile.wasm -t wasm-app:latest .
echo "WASM镜像构建完成"
}
# 部署WASM应用
deploy_wasm_app() {
echo "部署WASM应用..."
# 创建命名空间
kubectl create namespace wasm-apps --dry-run=client -o yaml | kubectl apply -f -
# 部署应用
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: wasm-hello-world
namespace: wasm-apps
spec:
replicas: 2
selector:
matchLabels:
app: wasm-hello-world
template:
metadata:
labels:
app: wasm-hello-world
spec:
runtimeClassName: wasmtime
containers:
- name: wasm-app
image: wasm-app:latest
ports:
- containerPort: 8080
resources:
requests:
cpu: 1m
memory: 8Mi
limits:
cpu: 10m
memory: 32Mi
livenessProbe:
httpGet:
path: /
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 8080
initialDelaySeconds: 2
periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
name: wasm-hello-world-service
namespace: wasm-apps
spec:
selector:
app: wasm-hello-world
ports:
- protocol: TCP
port: 80
targetPort: 8080
type: ClusterIP
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: wasm-hello-world-ingress
namespace: wasm-apps
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
spec:
rules:
- host: wasm-app.local
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: wasm-hello-world-service
port:
number: 80
EOF
}
# 性能测试
performance_test() {
echo "WASM应用性能测试..."
# 等待应用启动
kubectl wait --for=condition=available --timeout=300s deployment/wasm-hello-world -n wasm-apps
# 端口转发
kubectl port-forward service/wasm-hello-world-service 8080:80 -n wasm-apps &
PF_PID=$!
sleep 5
# 性能测试
echo "开始性能测试..."
curl -s http://localhost:8080
# 使用ab进行压力测试
if command -v ab &> /dev/null; then
echo "执行压力测试..."
ab -n 1000 -c 10 http://localhost:8080/
else
echo "apache2-utils未安装,跳过压力测试"
fi
# 清理
kill $PF_PID
}
# 主函数
case "$1" in
runtime)
install_wasm_runtime
;;
create)
create_wasm_app
;;
build)
build_wasm_image
;;
deploy)
deploy_wasm_app
;;
test)
performance_test
;;
all)
install_wasm_runtime
create_wasm_app
build_wasm_image
deploy_wasm_app
performance_test
;;
*)
echo "用法: $0 {runtime|create|build|deploy|test|all}"
echo " runtime - 安装WASM运行时"
echo " create - 创建WASM应用"
echo " build - 构建WASM镜像"
echo " deploy - 部署WASM应用"
echo " test - 性能测试"
echo " all - 执行所有步骤"
exit 1
;;
esac
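把镜像推到集群之前,可以先在本地用wasmtime做一次冒烟测试,确认模块能够正常实例化;如前面代码注释所述,TCP监听能否成功取决于所用运行时对WASI socket的支持(例如WasmEdge的socket扩展或wasmtime的相关实验特性),以下命令仅作示意:
# 本地加载并运行wasm模块
wasmtime run target/wasm32-wasi/release/wasm-hello-world.wasm
# 若运行时支持socket, 可在另一个终端验证
curl http://localhost:8080/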
14.5 多云和混合云
14.5.1 集群联邦
# 集群联邦配置
apiVersion: core.kubefed.io/v1beta1
kind: KubeFedCluster
metadata:
name: cluster-aws
namespace: kube-federation-system
spec:
apiEndpoint: https://aws-cluster.example.com
caBundle: LS0tLS1CRUdJTi...
secretRef:
name: cluster-aws-secret
---
apiVersion: core.kubefed.io/v1beta1
kind: KubeFedCluster
metadata:
name: cluster-gcp
namespace: kube-federation-system
spec:
apiEndpoint: https://gcp-cluster.example.com
caBundle: LS0tLS1CRUdJTi...
secretRef:
name: cluster-gcp-secret
---
# 联邦部署
apiVersion: types.kubefed.io/v1beta1
kind: FederatedDeployment
metadata:
name: multi-cloud-app
namespace: default
spec:
template:
metadata:
labels:
app: multi-cloud-app
spec:
replicas: 3
selector:
matchLabels:
app: multi-cloud-app
template:
metadata:
labels:
app: multi-cloud-app
spec:
containers:
- name: app
image: nginx:latest
ports:
- containerPort: 80
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
placement:
clusters:
- name: cluster-aws
- name: cluster-gcp
overrides:
- clusterName: cluster-aws
clusterOverrides:
- path: "/spec/replicas"
value: 5
- path: "/spec/template/spec/containers/0/env"
op: "add"
value:
- name: CLOUD_PROVIDER
value: "aws"
- clusterName: cluster-gcp
clusterOverrides:
- path: "/spec/replicas"
value: 2
- path: "/spec/template/spec/containers/0/env"
op: "add"
value:
- name: CLOUD_PROVIDER
value: "gcp"
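KubeFedCluster对象通常不需要手工编写:在宿主集群安装KubeFed控制面后,用kubefedctl把成员集群加入联邦时会自动创建这些对象和对应的凭据Secret。下面是一个示例流程,其中的上下文名称(host/aws/gcp)为假定值:
# 在宿主集群安装KubeFed控制面
helm repo add kubefed-charts https://raw.githubusercontent.com/kubernetes-sigs/kubefed/master/charts
helm install kubefed kubefed-charts/kubefed --namespace kube-federation-system --create-namespace
# 将成员集群加入联邦
kubefedctl join cluster-aws --cluster-context aws --host-cluster-context host --v=2
kubefedctl join cluster-gcp --cluster-context gcp --host-cluster-context host --v=2
# 确认成员集群状态为Ready
kubectl get kubefedclusters -n kube-federation-system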
14.5.2 多云管理脚本
#!/bin/bash
# 多云Kubernetes管理脚本
echo "=== 多云Kubernetes管理 ==="
# 配置多集群访问
setup_multi_cluster_access() {
echo "配置多集群访问..."
# 合并kubeconfig
export KUBECONFIG=~/.kube/config:~/.kube/aws-config:~/.kube/gcp-config:~/.kube/azure-config
kubectl config view --flatten > ~/.kube/merged-config
export KUBECONFIG=~/.kube/merged-config
# 设置集群别名
kubectl config rename-context aws-cluster aws
kubectl config rename-context gcp-cluster gcp
kubectl config rename-context azure-cluster azure
# 显示可用集群
echo "可用集群:"
kubectl config get-contexts
}
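# 辅助函数(示例): 批量检查各集群连通性, 上下文名称假定为 aws/gcp/azure
check_cluster_connectivity() {
for ctx in aws gcp azure; do
echo "== 检查 ${ctx} 集群连通性 =="
kubectl --context "${ctx}" cluster-info --request-timeout=10s || echo "${ctx} 集群不可达"
done
}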
# 部署跨云应用
deploy_cross_cloud_app() {
echo "部署跨云应用..."
# AWS集群部署
kubectl config use-context aws
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: cross-cloud-app-aws
namespace: default
labels:
cloud: aws
region: us-west-2
spec:
replicas: 3
selector:
matchLabels:
app: cross-cloud-app
cloud: aws
template:
metadata:
labels:
app: cross-cloud-app
cloud: aws
spec:
containers:
- name: app
image: nginx:latest
env:
- name: CLOUD_PROVIDER
value: "AWS"
- name: REGION
value: "us-west-2"
ports:
- containerPort: 80
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
nodeSelector:
kubernetes.io/arch: amd64
---
apiVersion: v1
kind: Service
metadata:
name: cross-cloud-app-aws-service
namespace: default
spec:
selector:
app: cross-cloud-app
cloud: aws
ports:
- port: 80
targetPort: 80
type: LoadBalancer
EOF
# GCP集群部署
kubectl config use-context gcp
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: cross-cloud-app-gcp
namespace: default
labels:
cloud: gcp
region: us-central1
spec:
replicas: 2
selector:
matchLabels:
app: cross-cloud-app
cloud: gcp
template:
metadata:
labels:
app: cross-cloud-app
cloud: gcp
spec:
containers:
- name: app
image: nginx:latest
env:
- name: CLOUD_PROVIDER
value: "GCP"
- name: REGION
value: "us-central1"
ports:
- containerPort: 80
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
nodeSelector:
kubernetes.io/arch: amd64
---
apiVersion: v1
kind: Service
metadata:
name: cross-cloud-app-gcp-service
namespace: default
spec:
selector:
app: cross-cloud-app
cloud: gcp
ports:
- port: 80
targetPort: 80
type: LoadBalancer
EOF
}
# 配置跨云网络
setup_cross_cloud_networking() {
echo "配置跨云网络..."
# 安装Submariner
curl -Ls https://get.submariner.io | bash
export PATH=$PATH:~/.local/bin
# 部署Submariner Broker
kubectl config use-context aws
subctl deploy-broker
# 加入AWS集群
subctl join broker-info.subm --kubeconfig ~/.kube/aws-config --clusterid aws-cluster
# 加入GCP集群
kubectl config use-context gcp
subctl join broker-info.subm --kubeconfig ~/.kube/gcp-config --clusterid gcp-cluster
# 验证连接
subctl show connections
subctl show endpoints
}
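# 辅助函数(示例): 通过Submariner导出服务并验证跨集群服务发现
# 说明: subctl export会创建ServiceExport, 导出后其他集群可通过 <服务名>.<命名空间>.svc.clusterset.local 访问;
# 默认参数沿用上面deploy_cross_cloud_app创建的Service名称
verify_cross_cluster_service() {
local svc_name=${1:-cross-cloud-app-aws-service}
local namespace=${2:-default}
kubectl config use-context aws
subctl export service "${svc_name}" --namespace "${namespace}"
# 在GCP集群内解析clusterset域名, 验证Lighthouse跨集群服务发现是否生效
kubectl config use-context gcp
kubectl run dns-test --rm -it --restart=Never --image=busybox:1.36 -- nslookup "${svc_name}.${namespace}.svc.clusterset.local" || true
}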
# 监控多云环境
setup_multi_cloud_monitoring() {
echo "设置多云监控..."
# 在每个集群部署Prometheus
for context in aws gcp azure; do
if kubectl config get-contexts | grep -q $context; then
echo "在$context集群部署监控..."
kubectl config use-context $context
# 部署Prometheus
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- name: prometheus
image: prom/prometheus:latest
ports:
- containerPort: 9090
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus/'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
- '--web.external-url=http://prometheus-$context.example.com'
volumeMounts:
- name: prometheus-config
mountPath: /etc/prometheus/
- name: prometheus-storage
mountPath: /prometheus/
env:
- name: CLUSTER_NAME
value: "$context"
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-storage
emptyDir: {}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 15s
external_labels:
cluster: '$context'
scrape_configs:
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
---
apiVersion: v1
kind: Service
metadata:
name: prometheus-service
namespace: monitoring
spec:
selector:
app: prometheus
ports:
- port: 9090
targetPort: 9090
type: LoadBalancer
EOF
fi
done
}
# 灾难恢复
setup_disaster_recovery() {
echo "设置灾难恢复..."
# 安装Velero
wget https://github.com/vmware-tanzu/velero/releases/download/v1.11.0/velero-v1.11.0-linux-amd64.tar.gz
tar -xzf velero-v1.11.0-linux-amd64.tar.gz
sudo mv velero-v1.11.0-linux-amd64/velero /usr/local/bin/
# 在每个集群配置Velero
for context in aws gcp; do
if kubectl config get-contexts | grep -q $context; then
echo "在$context集群配置Velero..."
kubectl config use-context $context
# 创建备份存储位置
if [ "$context" = "aws" ]; then
velero install \
--provider aws \
--plugins velero/velero-plugin-for-aws:v1.7.0 \
--bucket velero-backups-aws \
--backup-location-config region=us-west-2 \
--snapshot-location-config region=us-west-2
elif [ "$context" = "gcp" ]; then
velero install \
--provider gcp \
--plugins velero/velero-plugin-for-gcp:v1.7.0 \
--bucket velero-backups-gcp \
--backup-location-config project=my-project
fi
# 创建定期备份
velero schedule create daily-backup --schedule="0 2 * * *" --ttl 720h0m0s
fi
done
}
# 成本优化
optimize_multi_cloud_costs() {
echo "多云成本优化..."
# 分析各云资源使用情况
for context in aws gcp azure; do
if kubectl config get-contexts | grep -q $context; then
echo "分析$context集群资源使用..."
kubectl config use-context $context
echo "节点资源使用:"
kubectl top nodes 2>/dev/null || echo "需要安装metrics-server"
echo "Pod资源使用:"
kubectl top pods --all-namespaces 2>/dev/null | head -20
echo "未使用的资源:"
kubectl get pv | grep Available
kubectl get pvc --all-namespaces | grep Pending
fi
done
# 生成成本报告
cat > multi-cloud-cost-report.md <<EOF
# 多云成本分析报告
生成时间: $(date)
## 集群资源概览
| 集群 | 节点数 | CPU总量 | 内存总量 | 存储总量 |
|------|--------|---------|----------|----------|
EOF
for context in aws gcp azure; do
if kubectl config get-contexts | grep -q $context; then
kubectl config use-context $context
nodes=$(kubectl get nodes --no-headers | wc -l)
echo "| $context | $nodes | - | - | - |" >> multi-cloud-cost-report.md
fi
done
cat >> multi-cloud-cost-report.md <<EOF
## 优化建议
1. **资源规格调整 (Right-sizing)**: 根据实际使用情况调整Pod的资源请求和限制
2. **自动扩缩容**: 实施HPA和VPA减少资源浪费
3. **Spot实例**: 在合适的工作负载中使用Spot/Preemptible实例
4. **存储优化**: 清理未使用的PV和PVC
5. **跨云负载均衡**: 根据成本和性能优化工作负载分布
EOF
echo "成本报告已生成: multi-cloud-cost-report.md"
}
# 主函数
case "$1" in
setup)
setup_multi_cluster_access
;;
deploy)
deploy_cross_cloud_app
;;
network)
setup_cross_cloud_networking
;;
monitoring)
setup_multi_cloud_monitoring
;;
backup)
setup_disaster_recovery
;;
cost)
optimize_multi_cloud_costs
;;
all)
setup_multi_cluster_access
deploy_cross_cloud_app
setup_cross_cloud_networking
setup_multi_cloud_monitoring
setup_disaster_recovery
optimize_multi_cloud_costs
;;
*)
echo "用法: $0 {setup|deploy|network|monitoring|backup|cost|all}"
echo " setup - 配置多集群访问"
echo " deploy - 部署跨云应用"
echo " network - 配置跨云网络"
echo " monitoring - 设置监控"
echo " backup - 设置灾难恢复"
echo " cost - 成本优化"
echo " all - 执行所有配置"
exit 1
;;
esac
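灾难恢复配置完成后,建议定期做恢复演练,确认备份真正可用。下面是一个在当前集群内把备份恢复到隔离命名空间的简化演练示例(命名空间与备份名称为假定值);如果要做跨云恢复,还需要让两个集群的Velero指向同一个备份存储位置,而上面的脚本为AWS和GCP配置的是各自独立的bucket:
# 查看已有备份
velero backup get
# 触发一次即席备份(仅包含default命名空间)并等待完成
velero backup create dr-drill-backup --include-namespaces default --wait
# 把备份恢复到隔离的dr-drill命名空间, 验证备份内容可用
velero restore create dr-drill-restore --from-backup dr-drill-backup --namespace-mappings default:dr-drill
velero restore describe dr-drill-restore
kubectl get all -n dr-drill
# 演练结束后清理
kubectl delete namespace dr-drill
velero backup delete dr-drill-backup --confirm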