概述
本教程深入探讨 Prometheus 的性能优化策略和扩展方案,帮助您构建高性能、可扩展的监控系统。我们将涵盖存储优化、查询性能调优、集群部署、联邦配置以及与其他监控工具的集成。
学习目标
通过本教程,您将学会:

- 优化 Prometheus 存储和查询性能
- 实施 Prometheus 集群和联邦配置
- 配置高可用性和负载均衡
- 集成第三方监控工具和服务
- 实施监控系统的容量规划
- 解决常见的性能瓶颈问题
存储优化与配置
StorageOptimizer 类
from dataclasses import dataclass
from typing import Dict, List, Any, Optional
from enum import Enum
import yaml
import json
class StorageEngine(Enum):
    """Storage engine / remote-storage integration types supported by the optimizer."""

    TSDB = "tsdb"                  # Prometheus local time-series database
    REMOTE_WRITE = "remote_write"  # generic remote-write endpoint
    REMOTE_READ = "remote_read"    # generic remote-read endpoint
    THANOS = "thanos"
    CORTEX = "cortex"
    M3DB = "m3db"
class CompressionType(Enum):
    """Compression algorithms the optimizer can generate presets for."""

    SNAPPY = "snappy"
    GZIP = "gzip"
    LZ4 = "lz4"
    ZSTD = "zstd"
@dataclass
class StorageConfig:
    """Prometheus storage tuning parameters bundled as a value object."""

    retention_time: str           # e.g. "15d" -- TODO confirm expected format with callers
    retention_size: str           # e.g. "100GB" -- TODO confirm expected format with callers
    wal_compression: bool
    chunk_encoding: str
    max_chunks_to_persist: int
    memory_chunks: int
    max_samples_per_chunk: int
    # Forward reference (string annotation) so this dataclass does not
    # depend on import/definition order of CompressionType.
    compression_type: "CompressionType"
class StorageOptimizer:
    """Builds tuned Prometheus storage configurations, retention policies,
    storage-monitoring rule files, and optimization reports."""

    def __init__(self):
        # Named configurations cached by callers (not used internally yet).
        self.storage_configs = {}
        # Reserved hook for pluggable optimization rules.
        self.optimization_rules = []

    def create_optimized_storage_config(self, scenario: str) -> Dict[str, Any]:
        """Return a Prometheus config dict tuned for *scenario*.

        Supported scenarios: "high_volume", "long_term" and
        "resource_constrained"; any other value yields the base config.
        """
        base_config = {
            "global": {
                "scrape_interval": "15s",
                "evaluation_interval": "15s",
                "external_labels": {
                    "cluster": "production",
                    "replica": "1"
                }
            },
            "storage": {
                "tsdb": {
                    "path": "/prometheus/data",
                    "retention.time": "15d",
                    "retention.size": "100GB",
                    "wal-compression": True,
                    "max-block-duration": "2h",
                    "min-block-duration": "2h",
                    "no-lockfile": False
                }
            },
            "query": {
                "timeout": "2m",
                "max-concurrency": 20,
                "max-samples": 50000000,
                "lookback-delta": "5m"
            }
        }
        # Apply scenario-specific overrides on top of the base settings.
        if scenario == "high_volume":
            # Trade history depth for ingest throughput and query concurrency.
            base_config["storage"]["tsdb"].update({
                "retention.time": "7d",
                "retention.size": "500GB",
                "max-block-duration": "1h",
                "wal-compression": True
            })
            base_config["query"].update({
                "max-concurrency": 50,
                "max-samples": 100000000
            })
        elif scenario == "long_term":
            base_config["storage"]["tsdb"].update({
                "retention.time": "90d",
                "retention.size": "1TB",
                "max-block-duration": "24h"
            })
        elif scenario == "resource_constrained":
            base_config["storage"]["tsdb"].update({
                "retention.time": "3d",
                "retention.size": "50GB",
                "wal-compression": True
            })
            base_config["query"].update({
                "max-concurrency": 10,
                "max-samples": 10000000
            })
        return base_config

    def create_remote_storage_config(self, storage_type: "StorageEngine") -> Dict[str, Any]:
        """Return remote_write / remote_read sections for *storage_type*
        (THANOS, CORTEX or M3DB); unknown types yield an empty dict."""
        configs = {
            StorageEngine.THANOS: {
                "remote_write": [{
                    "url": "http://thanos-receive:19291/api/v1/receive",
                    "queue_config": {
                        "capacity": 10000,
                        "max_shards": 200,
                        "min_shards": 1,
                        "max_samples_per_send": 2000,
                        "batch_send_deadline": "5s",
                        "min_backoff": "30ms",
                        "max_backoff": "100ms"
                    },
                    "metadata_config": {
                        "send": True,
                        "send_interval": "1m"
                    }
                }],
                "remote_read": [{
                    "url": "http://thanos-query:9090/api/v1/query",
                    "read_recent": True
                }]
            },
            StorageEngine.CORTEX: {
                "remote_write": [{
                    "url": "http://cortex-distributor:8080/api/prom/push",
                    "basic_auth": {
                        "username": "prometheus",
                        "password": "${CORTEX_PASSWORD}"
                    },
                    "queue_config": {
                        "capacity": 10000,
                        "max_shards": 100,
                        "batch_send_deadline": "5s"
                    }
                }],
                "remote_read": [{
                    "url": "http://cortex-query-frontend:8080/api/prom/read",
                    "basic_auth": {
                        "username": "prometheus",
                        "password": "${CORTEX_PASSWORD}"
                    }
                }]
            },
            StorageEngine.M3DB: {
                "remote_write": [{
                    "url": "http://m3coordinator:7201/api/v1/prom/remote/write",
                    "queue_config": {
                        "capacity": 10000,
                        "max_shards": 100
                    }
                }],
                "remote_read": [{
                    "url": "http://m3coordinator:7201/api/v1/prom/remote/read"
                }]
            }
        }
        return configs.get(storage_type, {})

    def create_compression_config(self, compression_type: "CompressionType") -> Dict[str, Any]:
        """Return WAL/chunk compression settings for *compression_type*;
        unknown types yield an empty dict."""
        compression_configs = {
            CompressionType.SNAPPY: {
                "wal_compression": True,
                "chunk_encoding": "XOR",
                "compression_level": "default"
            },
            CompressionType.GZIP: {
                "wal_compression": True,
                "chunk_encoding": "XOR",
                "compression_level": "6"
            },
            CompressionType.LZ4: {
                "wal_compression": True,
                "chunk_encoding": "XOR",
                "compression_level": "fast"
            },
            CompressionType.ZSTD: {
                "wal_compression": True,
                "chunk_encoding": "XOR",
                "compression_level": "3"
            }
        }
        return compression_configs.get(compression_type, {})

    def generate_storage_monitoring_config(self) -> Dict[str, Any]:
        """Return Prometheus rule files (as YAML strings) that monitor the
        storage subsystem: recording rules plus alerting rules."""
        # NOTE(review): verify the prometheus_tsdb_* metric names below against
        # your Prometheus version (e.g. prometheus_tsdb_size_bytes,
        # prometheus_tsdb_head_samples) -- some vary between releases.
        return {
            "recording_rules": """
groups:
  - name: storage_performance
    interval: 30s
    rules:
      - record: prometheus:storage_samples_appended_rate
        # FIX: was rate(prometheus_tsdb_symbol_table_size_bytes[5m]), which
        # measures symbol-table size, not appended samples.
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m])
      - record: prometheus:storage_blocks_loaded
        expr: prometheus_tsdb_blocks_loaded
      - record: prometheus:storage_head_samples
        expr: prometheus_tsdb_head_samples
      - record: prometheus:storage_wal_size
        # FIX: was prometheus_tsdb_wal_fsync_duration_seconds (a duration,
        # not a size).
        expr: prometheus_tsdb_wal_storage_size_bytes
      - record: prometheus:storage_compaction_duration
        # FIX: was rate(prometheus_tsdb_compactions_total[5m]) (a frequency);
        # compute the mean compaction duration instead.
        expr: rate(prometheus_tsdb_compaction_duration_seconds_sum[5m]) / rate(prometheus_tsdb_compaction_duration_seconds_count[5m])
      - record: prometheus:storage_retention_count
        expr: prometheus_tsdb_retention_limit_bytes
""",
            "alerting_rules": """
groups:
  - name: storage_alerts
    rules:
      - alert: PrometheusStorageHighUsage
        expr: (prometheus_tsdb_retention_limit_bytes - prometheus_tsdb_size_bytes) / prometheus_tsdb_retention_limit_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 存储使用率过高"
          description: "存储使用率超过90%,剩余空间: {{ $value }}%"
      - alert: PrometheusWALCorruption
        expr: increase(prometheus_tsdb_wal_corruptions_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus WAL 损坏"
          description: "检测到 WAL 文件损坏,可能导致数据丢失"
      - alert: PrometheusCompactionFailed
        expr: increase(prometheus_tsdb_compactions_failed_total[1h]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 压缩失败"
          description: "数据压缩过程失败,可能影响查询性能"
      - alert: PrometheusSlowQueries
        expr: histogram_quantile(0.99, rate(prometheus_engine_query_duration_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 查询响应缓慢"
          description: "99分位查询时间超过30秒: {{ $value }}s"
"""
        }

    def create_retention_policy(self, data_importance: str) -> Dict[str, Any]:
        """Return the retention policy for *data_importance*
        ("critical" / "important" / "standard" / "debug"); unknown
        values fall back to "standard"."""
        policies = {
            "critical": {
                "local_retention": "30d",
                "remote_retention": "2y",
                "backup_frequency": "daily",
                "compression_enabled": True
            },
            "important": {
                "local_retention": "15d",
                "remote_retention": "1y",
                "backup_frequency": "weekly",
                "compression_enabled": True
            },
            "standard": {
                "local_retention": "7d",
                "remote_retention": "6m",
                "backup_frequency": "monthly",
                "compression_enabled": True
            },
            "debug": {
                "local_retention": "3d",
                "remote_retention": "1m",
                "backup_frequency": "none",
                "compression_enabled": False
            }
        }
        return policies.get(data_importance, policies["standard"])

    def generate_storage_optimization_report(self, metrics: Dict[str, float]) -> Dict[str, Any]:
        """Analyze *metrics* and return prioritized recommendations plus an
        overall health verdict.

        Recognized metric keys: "storage_usage_percent",
        "avg_query_duration", "compression_ratio".
        """
        from datetime import datetime, timezone  # local import: keeps the module import block unchanged

        recommendations = []
        # Storage usage above 80% -> high-priority cleanup recommendation.
        if metrics.get("storage_usage_percent", 0) > 80:
            recommendations.append({
                "type": "storage_cleanup",
                "priority": "high",
                "description": "存储使用率过高,建议清理旧数据或增加存储容量",
                "action": "调整保留策略或扩展存储"
            })
        # Average query latency above 10s -> query optimization.
        if metrics.get("avg_query_duration", 0) > 10:
            recommendations.append({
                "type": "query_optimization",
                "priority": "medium",
                "description": "查询响应时间过长,建议优化查询或增加计算资源",
                "action": "使用记录规则或增加内存"
            })
        # Compression ratio below 0.3 -> tune the compression algorithm.
        if metrics.get("compression_ratio", 1) < 0.3:
            recommendations.append({
                "type": "compression_tuning",
                "priority": "low",
                "description": "压缩效率较低,建议调整压缩算法",
                "action": "尝试不同的压缩算法"
            })
        return {
            # Real report time in UTC (was a hard-coded placeholder date).
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "metrics_analyzed": metrics,
            "recommendations": recommendations,
            "overall_health": "good" if len(recommendations) == 0 else "needs_attention",
            "estimated_savings": {
                "storage_space": "15%",
                "query_time": "25%",
                "resource_usage": "10%"
            }
        }
# Usage example
storage_optimizer = StorageOptimizer()

# Storage configurations for the three supported scenarios.
high_volume_config, long_term_config, resource_constrained_config = (
    storage_optimizer.create_optimized_storage_config(name)
    for name in ("high_volume", "long_term", "resource_constrained")
)

# Remote-storage integrations.
thanos_config = storage_optimizer.create_remote_storage_config(StorageEngine.THANOS)
cortex_config = storage_optimizer.create_remote_storage_config(StorageEngine.CORTEX)

# Compression presets.
snappy_compression = storage_optimizer.create_compression_config(CompressionType.SNAPPY)
zstd_compression = storage_optimizer.create_compression_config(CompressionType.ZSTD)

# Monitoring rule files for the storage subsystem.
storage_monitoring = storage_optimizer.generate_storage_monitoring_config()

# Retention policies by data importance.
critical_retention = storage_optimizer.create_retention_policy("critical")
standard_retention = storage_optimizer.create_retention_policy("standard")

# Optimization report computed from a set of sample metrics.
sample_metrics = {
    "storage_usage_percent": 85,
    "avg_query_duration": 12,
    "compression_ratio": 0.25,
}
optimization_report = storage_optimizer.generate_storage_optimization_report(sample_metrics)

print("存储优化配置")
print(f"高容量场景保留时间: {high_volume_config['storage']['tsdb']['retention.time']}")
print(f"长期存储保留时间: {long_term_config['storage']['tsdb']['retention.time']}")
print(f"资源受限保留时间: {resource_constrained_config['storage']['tsdb']['retention.time']}")
print(f"Thanos 远程写入配置: {'已配置' if thanos_config.get('remote_write') else '未配置'}")
print(f"优化建议数量: {len(optimization_report['recommendations'])}")
查询性能优化
QueryOptimizer 类
class QueryType(Enum):
    """Kinds of PromQL evaluation a query may run as."""

    INSTANT = "instant"
    RANGE = "range"
    RECORDING_RULE = "recording_rule"
    ALERTING_RULE = "alerting_rule"
class OptimizationLevel(Enum):
    """Tiers of query-optimization effort, from basic to expert."""

    BASIC = "basic"
    INTERMEDIATE = "intermediate"
    ADVANCED = "advanced"
    EXPERT = "expert"
@dataclass
class QueryMetrics:
    """Measured resource profile of a single query execution."""

    execution_time: float    # wall-clock seconds
    samples_processed: int
    memory_usage: int        # compared against MB-scale thresholds by the analyzer
    cpu_usage: float
    cache_hit_ratio: float   # 0.0 .. 1.0
class QueryOptimizer:
    """Query-performance toolkit: recording-rule generation, PromQL
    rewriting, engine settings, performance analysis and planning."""

    def __init__(self):
        self.optimization_patterns = {}   # reserved for learned rewrite patterns
        self.query_cache = {}             # reserved for query-result caching
        self.performance_baselines = {}   # reserved for baseline tracking

    def create_recording_rules_config(self, optimization_level: "OptimizationLevel") -> Dict[str, Any]:
        """Return a recording-rules file (as a dict) whose rule groups grow
        with *optimization_level* (BASIC < INTERMEDIATE < ADVANCED < EXPERT)."""
        base_rules = {
            "groups": [
                {
                    "name": "basic_performance_rules",
                    "interval": "30s",
                    "rules": [
                        {
                            "record": "node:cpu_utilization:rate5m",
                            "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)"
                        },
                        {
                            "record": "node:memory_utilization:ratio",
                            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))"
                        },
                        {
                            "record": "node:disk_utilization:ratio",
                            "expr": "1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)"
                        }
                    ]
                }
            ]
        }
        # INTERMEDIATE and above: HTTP/application-level rules.
        if optimization_level in [OptimizationLevel.INTERMEDIATE, OptimizationLevel.ADVANCED, OptimizationLevel.EXPERT]:
            base_rules["groups"].append({
                "name": "application_performance_rules",
                "interval": "15s",
                "rules": [
                    {
                        "record": "http:request_rate:5m",
                        "expr": "sum(rate(http_requests_total[5m])) by (job, instance, method, status)"
                    },
                    {
                        "record": "http:request_duration:p99:5m",
                        "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (job, instance, le))"
                    },
                    {
                        "record": "http:error_rate:5m",
                        "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (job, instance) / sum(rate(http_requests_total[5m])) by (job, instance)"
                    }
                ]
            })
        # ADVANCED and above: business KPI rules.
        if optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.EXPERT]:
            base_rules["groups"].append({
                "name": "business_metrics_rules",
                "interval": "1m",
                "rules": [
                    {
                        "record": "business:order_rate:1h",
                        "expr": "sum(increase(orders_total[1h])) by (region, product_category)"
                    },
                    {
                        "record": "business:revenue_rate:1h",
                        "expr": "sum(increase(revenue_total[1h])) by (region, product_category)"
                    },
                    {
                        "record": "business:conversion_rate:1h",
                        "expr": "sum(increase(orders_total[1h])) / sum(increase(page_views_total{page=\"product\"}[1h]))"
                    }
                ]
            })
        # EXPERT only: predictive / anomaly / SLI rules.
        if optimization_level == OptimizationLevel.EXPERT:
            base_rules["groups"].append({
                "name": "advanced_analytics_rules",
                "interval": "5m",
                "rules": [
                    {
                        "record": "prediction:cpu_utilization:1h",
                        "expr": "predict_linear(node:cpu_utilization:rate5m[1h], 3600)"
                    },
                    {
                        "record": "anomaly:request_rate:zscore",
                        "expr": "(http:request_rate:5m - avg_over_time(http:request_rate:5m[7d])) / stddev_over_time(http:request_rate:5m[7d])"
                    },
                    {
                        "record": "sli:availability:5m",
                        "expr": "sum(rate(http_requests_total{status!~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))"
                    }
                ]
            })
        return base_rules

    def optimize_query(self, query: str, query_type: "QueryType") -> Dict[str, Any]:
        """Apply regex-based rewrites to *query* and report what changed,
        together with generic recommendations."""
        import re  # local import keeps the module's import block unchanged

        optimizations = []
        optimized_query = query
        # Rewrite rules: pattern -> replacement, with rationale.
        optimization_rules = [
            {
                "pattern": r"rate\((.+)\[1m\]\)",
                "replacement": r"rate(\1[5m])",
                "reason": "使用更长的时间窗口减少噪音",
                "impact": "减少计算量,提高稳定性"
            },
            {
                "pattern": r"sum\((.+)\) by \((.+)\)",
                "replacement": r"sum by (\2) (\1)",
                "reason": "优化聚合函数语法",
                "impact": "提高查询可读性"
            },
            {
                # NOTE(review): the replacement reproduces the matched text
                # exactly, so this rule never changes the query and never
                # records an optimization entry. Kept for behavior parity.
                "pattern": r"\{job=\"(.+)\"\}",
                "replacement": r'{job="\1"}',
                "reason": "标准化标签选择器格式",
                "impact": "提高查询一致性"
            }
        ]
        # Apply each rule; record an entry only when the query actually changed.
        for rule in optimization_rules:
            if re.search(rule["pattern"], optimized_query):
                old_query = optimized_query
                optimized_query = re.sub(rule["pattern"], rule["replacement"], optimized_query)
                if old_query != optimized_query:
                    optimizations.append({
                        "type": "pattern_optimization",
                        "original": old_query,
                        "optimized": optimized_query,
                        "reason": rule["reason"],
                        "impact": rule["impact"]
                    })
        # Query-type specific advice.
        if query_type == QueryType.RANGE:
            # NOTE(review): the first rewrite rule already turns [1m] into
            # [5m], so this branch only fires when that rewrite did not apply.
            if "[1m]" in optimized_query:
                optimizations.append({
                    "type": "time_range_optimization",
                    "suggestion": "考虑使用记录规则预计算此查询",
                    "reason": "范围查询可能消耗大量资源",
                    "impact": "显著提高查询性能"
                })
        return {
            "original_query": query,
            "optimized_query": optimized_query,
            "optimizations_applied": optimizations,
            "estimated_improvement": {
                "execution_time": "15-30%",
                "memory_usage": "10-25%",
                "cpu_usage": "5-20%"
            },
            "recommendations": [
                "使用记录规则预计算复杂查询",
                "添加适当的标签过滤器",
                "避免使用过长的时间范围",
                "考虑使用子查询优化复杂逻辑"
            ]
        }

    def create_query_performance_config(self) -> Dict[str, Any]:
        """Return query-engine settings, the matching CLI flags, and
        result/metadata cache settings."""
        return {
            "prometheus_config": {
                "global": {
                    "query_log_file": "/prometheus/logs/query.log",
                    "query_timeout": "2m",
                    "query_max_concurrency": 20,
                    "query_max_samples": 50000000
                },
                "query_engine": {
                    "timeout": "2m",
                    "max_samples": 50000000,
                    "lookback_delta": "5m"
                }
            },
            "optimization_flags": [
                "--query.timeout=2m",
                "--query.max-concurrency=20",
                "--query.max-samples=50000000",
                "--query.lookback-delta=5m",
                "--enable-feature=promql-at-modifier",
                "--enable-feature=promql-negative-offset"
            ],
            "caching_config": {
                "query_result_cache": {
                    "enabled": True,
                    "max_size": "1GB",
                    "ttl": "5m"
                },
                "metadata_cache": {
                    "enabled": True,
                    "max_size": "100MB",
                    "ttl": "1h"
                }
            }
        }

    def analyze_query_performance(self, query_metrics: "QueryMetrics") -> Dict[str, Any]:
        """Score one query execution (0-100), grade it, and list the issues
        and recommendations derived from threshold checks."""
        performance_score = 100
        issues = []
        recommendations = []
        # Execution time (seconds): -30 above 30s, -15 above 10s.
        if query_metrics.execution_time > 30:
            performance_score -= 30
            issues.append("查询执行时间过长")
            recommendations.append("使用记录规则预计算或优化查询逻辑")
        elif query_metrics.execution_time > 10:
            performance_score -= 15
            issues.append("查询执行时间较长")
            recommendations.append("考虑添加更多标签过滤器")
        # Samples processed: -25 above 10M.
        if query_metrics.samples_processed > 10000000:
            performance_score -= 25
            issues.append("处理样本数量过多")
            recommendations.append("缩小查询时间范围或增加标签过滤")
        # Memory usage: -20 above 1000 MB.
        if query_metrics.memory_usage > 1000:  # MB
            performance_score -= 20
            issues.append("内存使用量过高")
            recommendations.append("优化查询逻辑或增加系统内存")
        # Cache hit ratio: -10 below 50%.
        if query_metrics.cache_hit_ratio < 0.5:
            performance_score -= 10
            issues.append("缓存命中率较低")
            recommendations.append("调整缓存策略或查询模式")
        # Map the final score to a grade.
        if performance_score >= 90:
            grade = "优秀"
        elif performance_score >= 70:
            grade = "良好"
        elif performance_score >= 50:
            grade = "一般"
        else:
            grade = "需要优化"
        return {
            "performance_score": max(0, performance_score),
            "grade": grade,
            "issues": issues,
            "recommendations": recommendations,
            "metrics_analysis": {
                "execution_time_status": "正常" if query_metrics.execution_time <= 10 else "异常",
                "memory_usage_status": "正常" if query_metrics.memory_usage <= 500 else "异常",
                "samples_status": "正常" if query_metrics.samples_processed <= 1000000 else "异常",
                "cache_efficiency": "高" if query_metrics.cache_hit_ratio >= 0.8 else "低"
            }
        }

    def generate_query_optimization_plan(self, current_queries: List[str]) -> Dict[str, Any]:
        """Produce a three-phase optimization plan plus a quick shape
        profile of *current_queries*."""
        optimization_plan = {
            "phase_1": {
                "name": "基础优化",
                "duration": "1周",
                "tasks": [
                    "识别慢查询和高频查询",
                    "创建基础记录规则",
                    "优化查询语法和逻辑",
                    "配置查询缓存"
                ],
                "expected_improvement": "20-30%"
            },
            "phase_2": {
                "name": "中级优化",
                "duration": "2周",
                "tasks": [
                    "实施高级记录规则",
                    "优化聚合和分组操作",
                    "调整查询并发设置",
                    "实施查询分片策略"
                ],
                "expected_improvement": "30-50%"
            },
            "phase_3": {
                "name": "高级优化",
                "duration": "2周",
                "tasks": [
                    "实施查询联邦",
                    "配置远程读取优化",
                    "实施智能缓存策略",
                    "优化存储层查询"
                ],
                "expected_improvement": "50-70%"
            }
        }
        # Rough profile of the current query set (string heuristics only).
        query_analysis = {
            "total_queries": len(current_queries),
            "complex_queries": len([q for q in current_queries if len(q) > 100]),
            "aggregation_queries": len([q for q in current_queries if "sum(" in q or "avg(" in q]),
            "range_queries": len([q for q in current_queries if "[" in q and "]" in q])
        }
        return {
            "optimization_plan": optimization_plan,
            "current_analysis": query_analysis,
            "priority_actions": [
                "创建最常用查询的记录规则",
                "优化复杂聚合查询",
                "实施查询结果缓存",
                "监控查询性能指标"
            ],
            "estimated_timeline": "5周",
            "resource_requirements": {
                "additional_memory": "2-4GB",
                "additional_cpu": "1-2 cores",
                "storage_overhead": "10-20%"
            }
        }
# Usage example
query_optimizer = QueryOptimizer()

# Recording-rule configurations at increasing optimization levels.
basic_rules, advanced_rules, expert_rules = (
    query_optimizer.create_recording_rules_config(level)
    for level in (OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.EXPERT)
)

# Rewrite a sample range query.
sample_query = "sum(rate(http_requests_total{job=\"api\"}[1m])) by (status)"
optimization_result = query_optimizer.optimize_query(sample_query, QueryType.RANGE)

# Query-engine performance settings.
performance_config = query_optimizer.create_query_performance_config()

# Score the performance profile of one sample execution.
sample_metrics = QueryMetrics(
    execution_time=15.5,
    samples_processed=5000000,
    memory_usage=800,
    cpu_usage=45.2,
    cache_hit_ratio=0.65,
)
performance_analysis = query_optimizer.analyze_query_performance(sample_metrics)

# Draft an optimization plan for the queries currently in use.
current_queries = [
    "up",
    "rate(http_requests_total[5m])",
    "sum(rate(http_requests_total[5m])) by (status)",
    "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
]
optimization_plan = query_optimizer.generate_query_optimization_plan(current_queries)

print("查询性能优化")
print(f"基础规则组数: {len(basic_rules['groups'])}")
print(f"高级规则组数: {len(advanced_rules['groups'])}")
print(f"专家规则组数: {len(expert_rules['groups'])}")
print(f"查询优化建议数: {len(optimization_result['optimizations_applied'])}")
print(f"性能评分: {performance_analysis['performance_score']}")
print(f"优化计划阶段数: {len(optimization_plan['optimization_plan'])}")
集群部署与高可用性
ClusterManager 类
class DeploymentMode(Enum):
    """Prometheus deployment topologies handled by the cluster manager."""

    STANDALONE = "standalone"
    FEDERATION = "federation"
    CLUSTER = "cluster"
    THANOS = "thanos"
    CORTEX = "cortex"
class HAStrategy(Enum):
    """High-availability strategies for a Prometheus deployment."""

    ACTIVE_PASSIVE = "active_passive"
    ACTIVE_ACTIVE = "active_active"
    FEDERATION = "federation"
    SHARDING = "sharding"
@dataclass
class ClusterNode:
    """A single node in a Prometheus cluster topology."""

    name: str
    role: str
    endpoint: str
    region: str
    zone: str
    capacity: Dict[str, Any]   # free-form resource description -- TODO confirm expected keys
    status: str
class ClusterManager:
    """Builds federation, cluster, load-balancer and Kubernetes
    configurations for multi-instance Prometheus deployments."""

    def __init__(self):
        self.nodes = []                  # registered ClusterNode records
        self.federation_config = {}      # last generated federation config
        self.load_balancer_config = {}   # last generated load-balancer config
def create_federation_config(self, clusters: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build a federation-master Prometheus config that scrapes /federate
    from every cluster in *clusters*.

    Each cluster dict must provide "name" (str) and "endpoints"
    (list of host:port targets).
    """
    federation_config = {
        "global": {
            "scrape_interval": "15s",
            "evaluation_interval": "15s",
            "external_labels": {
                "cluster": "federation-master",
                "region": "global"
            }
        },
        "scrape_configs": []
    }
    # One federation scrape job per downstream cluster.
    for cluster in clusters:
        scrape_config = {
            "job_name": f"federate-{cluster['name']}",
            "scrape_interval": "15s",
            # Keep the labels exposed by the federated series intact.
            "honor_labels": True,
            "metrics_path": "/federate",
            "params": {
                # Pull only aggregated (recording-rule) series and a few
                # health metrics, not the full raw series set.
                "match[]": [
                    '{__name__=~"job:.*"}',
                    '{__name__=~"node:.*"}',
                    '{__name__=~"instance:.*"}',
                    'up',
                    'prometheus_build_info',
                    'prometheus_config_last_reload_successful'
                ]
            },
            "static_configs": [{
                "targets": cluster['endpoints']
            }],
            "relabel_configs": [
                {
                    # Tag every federated series with its source cluster name.
                    "source_labels": ["__address__"],
                    "target_label": "cluster",
                    "replacement": cluster['name']
                }
            ]
        }
        federation_config["scrape_configs"].append(scrape_config)
    return federation_config
def create_thanos_cluster_config(self) -> Dict[str, Any]:
"""Return container definitions (image/args/ports/volumes) for a Thanos
cluster -- sidecar, query, store, compactor -- plus an S3 objstore
config as a YAML string."""
# NOTE(review): indentation was lost when this file was extracted; both the
# Python block structure and the embedded bucket_config YAML must be
# re-indented before this code can run / the YAML can parse.
return {
# Sidecar: runs next to Prometheus, uploads TSDB blocks to object storage.
"thanos_sidecar": {
"image": "thanosio/thanos:v0.32.0",
"args": [
"sidecar",
"--tsdb.path=/prometheus",
"--prometheus.url=http://localhost:9090",
"--grpc-address=0.0.0.0:10901",
"--http-address=0.0.0.0:10902",
"--objstore.config-file=/etc/thanos/bucket.yml"
],
"ports": ["10901", "10902"],
"volumes": [
"/prometheus:/prometheus",
"/etc/thanos:/etc/thanos"
]
},
# Query: fans out to the sidecars and the store gateway; deduplicates
# replicas via --query.replica-label.
"thanos_query": {
"image": "thanosio/thanos:v0.32.0",
"args": [
"query",
"--http-address=0.0.0.0:9090",
"--grpc-address=0.0.0.0:10901",
"--store=prometheus-sidecar-1:10901",
"--store=prometheus-sidecar-2:10901",
"--store=thanos-store:10901",
"--query.replica-label=replica"
],
"ports": ["9090", "10901"]
},
# Store gateway: serves historical blocks from object storage.
"thanos_store": {
"image": "thanosio/thanos:v0.32.0",
"args": [
"store",
"--data-dir=/var/thanos/store",
"--grpc-address=0.0.0.0:10901",
"--http-address=0.0.0.0:10902",
"--objstore.config-file=/etc/thanos/bucket.yml"
],
"ports": ["10901", "10902"],
"volumes": [
"/var/thanos/store:/var/thanos/store",
"/etc/thanos:/etc/thanos"
]
},
# Compactor: compacts/downsamples blocks in object storage.
"thanos_compactor": {
"image": "thanosio/thanos:v0.32.0",
"args": [
"compact",
"--data-dir=/var/thanos/compact",
"--objstore.config-file=/etc/thanos/bucket.yml",
"--wait"
],
"volumes": [
"/var/thanos/compact:/var/thanos/compact",
"/etc/thanos:/etc/thanos"
]
},
# S3 objstore config shared by sidecar/store/compactor (bucket.yml).
"bucket_config": """
type: S3
config:
bucket: "thanos-storage"
endpoint: "s3.amazonaws.com"
access_key: "${AWS_ACCESS_KEY_ID}"
secret_key: "${AWS_SECRET_ACCESS_KEY}"
insecure: false
signature_version2: false
encrypt_sse: false
put_user_metadata: {}
http_config:
idle_conn_timeout: 90s
response_header_timeout: 2m
insecure_skip_verify: false
trace:
enable: false
part_size: 67108864
"""
}
def create_cortex_cluster_config(self) -> Dict[str, Any]:
"""Return a Cortex cluster setup as two YAML strings: the Cortex config
file and a docker-compose stack (consul + redis + Cortex services)."""
# NOTE(review): indentation was lost when this file was extracted; the
# Python structure and both embedded YAML documents must be re-indented
# before they can be used.
# Cortex configuration (blocks storage on S3, consul-backed rings,
# redis-backed query-range result cache).
return {
"cortex_config": """
auth_enabled: false
server:
http_listen_port: 8080
grpc_listen_port: 9095
distributor:
shard_by_all_labels: true
pool:
health_check_ingesters: true
ingester_client:
grpc_client_config:
max_recv_msg_size: 104857600
max_send_msg_size: 104857600
grpc_compression: gzip
ingester:
lifecycler:
address: 127.0.0.1
ring:
kvstore:
store: consul
consul:
host: consul:8500
replication_factor: 3
num_tokens: 512
heartbeat_period: 5s
observe_period: 10s
join_after: 10s
min_ready_duration: 15s
interface_names:
- eth0
- en0
storage:
engine: blocks
blocks_storage:
tsdb:
dir: /cortex/tsdb
bucket_store:
sync_dir: /cortex/tsdb-sync
backend: s3
s3:
bucket_name: cortex-storage
endpoint: s3.amazonaws.com
access_key_id: ${AWS_ACCESS_KEY_ID}
secret_access_key: ${AWS_SECRET_ACCESS_KEY}
compactor:
data_dir: /cortex/compactor
sharding_ring:
kvstore:
store: consul
consul:
host: consul:8500
store_gateway:
sharding_enabled: true
sharding_ring:
replication_factor: 3
kvstore:
store: consul
consul:
host: consul:8500
ruler:
enable_api: true
enable_sharding: true
ring:
kvstore:
store: consul
consul:
host: consul:8500
ruler_storage:
backend: s3
s3:
bucket_name: cortex-rules
endpoint: s3.amazonaws.com
access_key_id: ${AWS_ACCESS_KEY_ID}
secret_access_key: ${AWS_SECRET_ACCESS_KEY}
query_range:
align_queries_with_step: true
max_retries: 5
split_queries_by_interval: 24h
cache_results: true
results_cache:
cache:
redis:
endpoint: redis:6379
timeout: 500ms
expiration: 1h
""",
# docker-compose stack: consul + redis plus one Cortex container per
# target (distributor, 2x ingester, querier, query-frontend).
"docker_compose": """
version: '3.8'
services:
consul:
image: consul:1.9
command: [ "consul", "agent", "-dev", "-client=0.0.0.0" ]
ports:
- "8500:8500"
redis:
image: redis:6-alpine
ports:
- "6379:6379"
cortex-distributor:
image: cortexproject/cortex:v1.15.0
command: [ "-config.file=/etc/cortex/cortex.yml", "-target=distributor" ]
volumes:
- ./cortex.yml:/etc/cortex/cortex.yml
ports:
- "8080:8080"
depends_on:
- consul
cortex-ingester-1:
image: cortexproject/cortex:v1.15.0
command: [ "-config.file=/etc/cortex/cortex.yml", "-target=ingester" ]
volumes:
- ./cortex.yml:/etc/cortex/cortex.yml
- cortex-ingester-1-data:/cortex
depends_on:
- consul
cortex-ingester-2:
image: cortexproject/cortex:v1.15.0
command: [ "-config.file=/etc/cortex/cortex.yml", "-target=ingester" ]
volumes:
- ./cortex.yml:/etc/cortex/cortex.yml
- cortex-ingester-2-data:/cortex
depends_on:
- consul
cortex-querier:
image: cortexproject/cortex:v1.15.0
command: [ "-config.file=/etc/cortex/cortex.yml", "-target=querier" ]
volumes:
- ./cortex.yml:/etc/cortex/cortex.yml
depends_on:
- consul
cortex-query-frontend:
image: cortexproject/cortex:v1.15.0
command: [ "-config.file=/etc/cortex/cortex.yml", "-target=query-frontend" ]
volumes:
- ./cortex.yml:/etc/cortex/cortex.yml
ports:
- "8081:8080"
depends_on:
- consul
- redis
volumes:
cortex-ingester-1-data:
cortex-ingester-2-data:
"""
}
def create_load_balancer_config(self, strategy: HAStrategy) -> Dict[str, Any]:
"""Return load-balancer config snippets (as strings) for *strategy*:
ACTIVE_PASSIVE -> nginx + keepalived; ACTIVE_ACTIVE -> haproxy;
FEDERATION -> consul-template + consul service definition.
Unknown strategies yield an empty dict."""
# NOTE(review): indentation was lost when this file was extracted; the
# Python block structure must be re-indented before use (the embedded
# nginx/haproxy/keepalived/consul snippets are brace/keyword delimited
# and remain parseable, but should be re-indented for readability).
configs = {
HAStrategy.ACTIVE_PASSIVE: {
# nginx: primary upstream with a passive "backup" secondary.
"nginx_config": """
upstream prometheus_backend {
server prometheus-primary:9090 max_fails=3 fail_timeout=30s;
server prometheus-secondary:9090 backup;
}
server {
listen 80;
server_name prometheus.example.com;
location / {
proxy_pass http://prometheus_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_connect_timeout 30s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
}
location /health {
access_log off;
return 200 "healthy\n";
add_header Content-Type text/plain;
}
}
""",
# keepalived: VRRP failover of a virtual IP, gated on the nginx
# /health endpoint.
"keepalived_config": """
vrrp_script chk_nginx {
script "/usr/bin/curl -f http://localhost/health || exit 1"
interval 2
weight -2
fall 3
rise 2
}
vrrp_instance VI_1 {
state MASTER
interface eth0
virtual_router_id 51
priority 110
advert_int 1
authentication {
auth_type PASS
auth_pass prometheus123
}
virtual_ipaddress {
192.168.1.100
}
track_script {
chk_nginx
}
}
"""
},
HAStrategy.ACTIVE_ACTIVE: {
# haproxy: round-robin across three Prometheus and three
# Alertmanager instances, with HTTP health checks and a stats page.
"haproxy_config": """
global
daemon
maxconn 4096
log stdout local0
defaults
mode http
timeout connect 5000ms
timeout client 50000ms
timeout server 50000ms
option httplog
log global
frontend prometheus_frontend
bind *:9090
default_backend prometheus_servers
backend prometheus_servers
balance roundrobin
option httpchk GET /api/v1/query?query=up
server prometheus1 prometheus-1:9090 check
server prometheus2 prometheus-2:9090 check
server prometheus3 prometheus-3:9090 check
frontend alertmanager_frontend
bind *:9093
default_backend alertmanager_servers
backend alertmanager_servers
balance roundrobin
option httpchk GET /-/healthy
server alertmanager1 alertmanager-1:9093 check
server alertmanager2 alertmanager-2:9093 check
server alertmanager3 alertmanager-3:9093 check
listen stats
bind *:8404
stats enable
stats uri /stats
stats refresh 30s
stats admin if TRUE
"""
},
HAStrategy.FEDERATION: {
# consul-template: renders the nginx upstream from the consul
# "prometheus" service catalog.
"consul_template": """
upstream prometheus_federation {
{{range service "prometheus"}}
server {{.Address}}:{{.Port}} max_fails=3 fail_timeout=30s;
{{end}}
}
server {
listen 80;
location / {
proxy_pass http://prometheus_federation;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
}
}
""",
# Consul service registration with an HTTP health check.
"service_discovery": """
services {
name = "prometheus"
tags = ["monitoring", "metrics"]
port = 9090
check {
http = "http://localhost:9090/-/healthy"
interval = "10s"
}
}
"""
}
}
return configs.get(strategy, {})
def create_kubernetes_deployment(self) -> Dict[str, Any]:
"""Return Kubernetes manifests (as YAML strings) for a monitored
Prometheus setup: Prometheus Deployment + Service + PVC, a clustered
Alertmanager Deployment + headless Service, and the RBAC objects the
Prometheus service account needs."""
# NOTE(review): indentation was lost when this file was extracted; the
# embedded Kubernetes YAML is indentation-sensitive and MUST be
# re-indented before it can be applied.
return {
# Prometheus: 2 replicas, lifecycle/admin API enabled, 15d retention,
# config from a ConfigMap and storage from a PVC.
"prometheus_deployment": """
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: monitoring
spec:
replicas: 2
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: prom/prometheus:v2.40.0
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus/'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=15d'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
ports:
- containerPort: 9090
volumeMounts:
- name: prometheus-config
mountPath: /etc/prometheus
- name: prometheus-storage
mountPath: /prometheus
resources:
requests:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "4Gi"
cpu: "2000m"
livenessProbe:
httpGet:
path: /-/healthy
port: 9090
initialDelaySeconds: 30
timeoutSeconds: 30
readinessProbe:
httpGet:
path: /-/ready
port: 9090
initialDelaySeconds: 30
timeoutSeconds: 30
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-storage
persistentVolumeClaim:
claimName: prometheus-storage
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitoring
spec:
selector:
app: prometheus
ports:
- port: 9090
targetPort: 9090
type: ClusterIP
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-storage
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
storageClassName: fast-ssd
""",
# Alertmanager: 3 replicas clustered over port 9094; headless Service
# (clusterIP: None) so the --cluster.peer DNS names resolve to pods.
"alertmanager_deployment": """
apiVersion: apps/v1
kind: Deployment
metadata:
name: alertmanager
namespace: monitoring
spec:
replicas: 3
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
containers:
- name: alertmanager
image: prom/alertmanager:v0.25.0
args:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://alertmanager.example.com'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=alertmanager-0.alertmanager:9094'
- '--cluster.peer=alertmanager-1.alertmanager:9094'
- '--cluster.peer=alertmanager-2.alertmanager:9094'
ports:
- containerPort: 9093
- containerPort: 9094
volumeMounts:
- name: alertmanager-config
mountPath: /etc/alertmanager
- name: alertmanager-storage
mountPath: /alertmanager
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "200m"
volumes:
- name: alertmanager-config
configMap:
name: alertmanager-config
- name: alertmanager-storage
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: monitoring
spec:
selector:
app: alertmanager
ports:
- name: web
port: 9093
targetPort: 9093
- name: cluster
port: 9094
targetPort: 9094
clusterIP: None
""",
# RBAC: read-only cluster access for service discovery plus /metrics.
"rbac_config": """
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
"""
}
def create_monitoring_config(self) -> Dict[str, Any]:
    """Build self-monitoring assets for a Prometheus cluster/federation.

    Returns:
        Dict with two entries:
        - "cluster_health_rules": alerting-rule YAML (as a string) covering
          instance liveness, config reloads, notification-queue pressure,
          alert-delivery errors, rule-evaluation failures and slow scrapes.
        - "federation_health_dashboard": a Grafana-style dashboard dict with
          four panels (availability, ingestion rate, query latency P99,
          storage usage).

    FIX: the ingestion-rate panel previously graphed
    rate(prometheus_tsdb_symbol_table_size_bytes[5m]) — symbol-table size is
    not an ingestion counter.  It now uses
    prometheus_tsdb_head_samples_appended_total, the TSDB counter of samples
    appended to the head block, which is the standard ingestion-rate metric.
    """
    return {
        "cluster_health_rules": """
groups:
  - name: cluster_health
    rules:
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus 实例宕机"
          description: "Prometheus 实例 {{ $labels.instance }} 已宕机超过5分钟"
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 配置重载失败"
          description: "Prometheus {{ $labels.instance }} 配置重载失败"
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 通知队列即将满载"
          description: "Prometheus {{ $labels.instance }} 通知队列预计在30分钟内满载"
      - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
        expr: rate(prometheus_notifications_errors_total[5m]) > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 发送告警失败"
          description: "Prometheus {{ $labels.instance }} 发送告警到 Alertmanager 失败率: {{ $value }}"
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 规则评估失败"
          description: "Prometheus {{ $labels.instance }} 规则评估失败"
      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus 目标抓取缓慢"
          description: "Prometheus {{ $labels.instance }} 抓取目标 {{ $labels.job }} 耗时过长: {{ $value }}s"
""",
        "federation_health_dashboard": {
            "dashboard": {
                "title": "Prometheus 集群健康状态",
                "tags": ["prometheus", "cluster", "federation"],
                "panels": [
                    {
                        "title": "集群状态概览",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "count(up{job=\"prometheus\"} == 1)",
                                "legendFormat": "在线实例"
                            },
                            {
                                "expr": "count(up{job=\"prometheus\"} == 0)",
                                "legendFormat": "离线实例"
                            }
                        ]
                    },
                    {
                        "title": "数据摄入速率",
                        "type": "timeseries",
                        "targets": [{
                            # FIX: samples-appended counter, not symbol-table size.
                            "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[5m])) by (instance)",
                            "legendFormat": "{{instance}}"
                        }]
                    },
                    {
                        "title": "查询性能",
                        "type": "timeseries",
                        "targets": [{
                            "expr": "histogram_quantile(0.99, sum(rate(prometheus_engine_query_duration_seconds_bucket[5m])) by (instance, le))",
                            "legendFormat": "{{instance}} P99"
                        }]
                    },
                    {
                        "title": "存储使用情况",
                        "type": "timeseries",
                        "targets": [{
                            "expr": "prometheus_tsdb_size_bytes / prometheus_tsdb_retention_limit_bytes * 100",
                            "legendFormat": "{{instance}} 使用率%"
                        }]
                    }
                ]
            }
        }
    }
def generate_deployment_plan(self, target_scale: int, ha_strategy: HAStrategy) -> Dict[str, Any]:
    """Assemble a four-phase rollout plan sized for ``target_scale``.

    Args:
        target_scale: Number of clusters/instances the deployment must cover.
        ha_strategy: High-availability strategy; only its ``.value`` string is
            embedded in the returned plan.

    Returns:
        Plan dict containing the deployment phases, the resource-sizing tier
        (small <= 5 < medium <= 20 < large), the HA strategy, a rough
        timeline, and the success-factor / risk-mitigation checklists.
    """
    phases = {
        "phase_1": {
            "name": "基础设施准备",
            "duration": "1-2周",
            "tasks": [
                "准备服务器和网络环境",
                "安装容器运行时和编排工具",
                "配置存储和备份系统",
                "设置监控和日志收集",
            ],
        },
        "phase_2": {
            "name": "核心组件部署",
            "duration": "1-2周",
            "tasks": [
                "部署 Prometheus 实例",
                "配置 Alertmanager 集群",
                "设置服务发现",
                "配置负载均衡",
            ],
        },
        "phase_3": {
            "name": "高可用配置",
            "duration": "1周",
            "tasks": [
                "实施高可用策略",
                "配置故障转移",
                "测试灾难恢复",
                "优化性能参数",
            ],
        },
        "phase_4": {
            "name": "生产就绪",
            "duration": "1周",
            "tasks": [
                "压力测试和性能调优",
                "安全加固和审计",
                "文档和培训",
                "上线和监控",
            ],
        },
    }
    # Hardware sizing tiers, keyed by the scale category picked below.
    sizing = {
        "small": {"nodes": 3, "cpu_per_node": 4, "memory_per_node": "8GB", "storage_per_node": "100GB"},
        "medium": {"nodes": 6, "cpu_per_node": 8, "memory_per_node": "16GB", "storage_per_node": "500GB"},
        "large": {"nodes": 12, "cpu_per_node": 16, "memory_per_node": "32GB", "storage_per_node": "1TB"},
    }
    if target_scale <= 5:
        tier = "small"
    elif target_scale <= 20:
        tier = "medium"
    else:
        tier = "large"
    plan = {
        "deployment_phases": phases,
        "resource_requirements": sizing[tier],
        "ha_strategy": ha_strategy.value,
        "estimated_timeline": "4-6周",
        "critical_success_factors": [
            "充分的容量规划",
            "完善的监控体系",
            "自动化部署流程",
            "灾难恢复计划",
            "团队技能培训",
        ],
        "risk_mitigation": [
            "分阶段部署降低风险",
            "充分测试验证功能",
            "准备回滚方案",
            "建立应急响应流程",
        ],
    }
    return plan
# Usage example: exercise every ClusterManager factory once and print a summary.
cluster_manager = ClusterManager()
# Build a federation config spanning three regional Prometheus clusters
clusters = [
    {"name": "prod-us-east", "endpoints": ["prometheus-us-east:9090"]},
    {"name": "prod-us-west", "endpoints": ["prometheus-us-west:9090"]},
    {"name": "prod-eu", "endpoints": ["prometheus-eu:9090"]}
]
federation_config = cluster_manager.create_federation_config(clusters)
# Thanos cluster configuration
thanos_config = cluster_manager.create_thanos_cluster_config()
# Cortex cluster configuration
cortex_config = cluster_manager.create_cortex_cluster_config()
# Load-balancer configs for both HA strategies
active_passive_lb = cluster_manager.create_load_balancer_config(HAStrategy.ACTIVE_PASSIVE)
active_active_lb = cluster_manager.create_load_balancer_config(HAStrategy.ACTIVE_ACTIVE)
# Kubernetes deployment manifests
k8s_deployment = cluster_manager.create_kubernetes_deployment()
# Self-monitoring rules and dashboard
monitoring_config = cluster_manager.create_monitoring_config()
# Phased deployment plan for a 10-cluster, active-active rollout
deployment_plan = cluster_manager.generate_deployment_plan(10, HAStrategy.ACTIVE_ACTIVE)
print("集群部署与高可用性")
print(f"联邦配置抓取任务数: {len(federation_config['scrape_configs'])}")
print(f"Thanos 组件数: {len([k for k in thanos_config.keys() if k.startswith('thanos_')])}")
print(f"Kubernetes 部署配置数: {len(k8s_deployment)}")
print(f"部署计划阶段数: {len(deployment_plan['deployment_phases'])}")
print(f"推荐节点数: {deployment_plan['resource_requirements']['nodes']}")
安全与合规性
SecurityManager 类
class SecurityLevel(Enum):
    """Security hardening level applied to a deployment (lowest to highest)."""
    BASIC = "basic"
    STANDARD = "standard"
    HIGH = "high"
    CRITICAL = "critical"
class AuthMethod(Enum):
    """Supported authentication mechanisms for Prometheus endpoints."""
    BASIC_AUTH = "basic_auth"
    OAUTH2 = "oauth2"
    LDAP = "ldap"
    SAML = "saml"
    MTLS = "mtls"   # mutual TLS (client certificates)
    JWT = "jwt"
@dataclass
class SecurityConfig:
    """Aggregate security settings for one Prometheus deployment."""
    level: SecurityLevel                # overall hardening level
    auth_method: AuthMethod             # how clients authenticate
    encryption_enabled: bool            # whether transport/storage encryption is on
    audit_enabled: bool                 # whether audit logging is on
    compliance_standards: List[str]     # e.g. ["SOX", "GDPR"]
    retention_policy: Dict[str, Any]    # data-retention rules
class SecurityManager:
    """Factory of security-related configuration fragments for Prometheus.

    All ``create_*`` methods are pure: they return configuration dictionaries
    (some containing embedded YAML strings) and do not read or mutate
    instance state.

    Fixes relative to the previous revision:
    - ``generate_security_assessment`` now scores with a deterministic CRC32
      instead of the builtin ``hash()``, whose per-process randomization
      (PYTHONHASHSEED) made assessment scores non-reproducible between runs.
    - The ``ConfigurationTampering`` PromQL expression is parenthesized:
      PromQL ``and`` binds tighter than ``or``, so the old
      ``a > 0 and hour() < 6 or hour() > 22`` fired on ``hour() > 22`` alone,
      regardless of whether any config change happened.
    """

    def __init__(self):
        # NOTE(review): these registries are never written by the methods
        # visible here — presumably populated by external callers; verify.
        self.security_policies = {}
        self.compliance_rules = {}
        self.audit_config = {}

    def create_tls_config(self, security_level: SecurityLevel) -> Dict[str, Any]:
        """Return server- and client-side TLS settings.

        HIGH/CRITICAL levels additionally pin the connection to TLS 1.3 with
        a restricted cipher-suite and curve list.
        """
        base_config = {
            "tls_server_config": {
                "cert_file": "/etc/prometheus/certs/prometheus.crt",
                "key_file": "/etc/prometheus/certs/prometheus.key",
                "client_ca_file": "/etc/prometheus/certs/ca.crt",
                "client_auth_type": "RequireAndVerifyClientCert"
            },
            "tls_config": {
                "ca_file": "/etc/prometheus/certs/ca.crt",
                "cert_file": "/etc/prometheus/certs/client.crt",
                "key_file": "/etc/prometheus/certs/client.key",
                "server_name": "prometheus.example.com",
                "insecure_skip_verify": False
            }
        }
        if security_level in [SecurityLevel.HIGH, SecurityLevel.CRITICAL]:
            base_config["tls_server_config"].update({
                "min_version": "TLS13",
                "max_version": "TLS13",
                "cipher_suites": [
                    "TLS_AES_256_GCM_SHA384",
                    "TLS_CHACHA20_POLY1305_SHA256",
                    "TLS_AES_128_GCM_SHA256"
                ],
                "prefer_server_cipher_suites": True,
                "curve_preferences": ["X25519", "P-256"]
            })
        return base_config

    def create_auth_config(self, auth_method: AuthMethod) -> Dict[str, Any]:
        """Return an authentication config for the given method.

        Unknown/unsupported methods (SAML, JWT have no template here) return
        an empty dict.  Secrets are either bcrypt hashes or ``${ENV_VAR}``
        placeholders — never plain text.
        """
        configs = {
            AuthMethod.BASIC_AUTH: {
                "basic_auth_users": {
                    "admin": "$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay",  # bcrypt hash of 'secret'
                    "readonly": "$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay"
                },
                "web_config": {
                    "basic_auth_users": {
                        "admin": "$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay"
                    }
                }
            },
            AuthMethod.OAUTH2: {
                "oauth2": {
                    "client_id": "prometheus-client",
                    "client_secret": "${OAUTH2_CLIENT_SECRET}",
                    "scopes": ["openid", "profile", "email"],
                    "token_url": "https://auth.example.com/oauth/token",
                    "auth_url": "https://auth.example.com/oauth/authorize",
                    "redirect_url": "https://prometheus.example.com/oauth2/callback",
                    "email_domains": ["example.com"],
                    "role_attribute_path": "groups"
                }
            },
            AuthMethod.LDAP: {
                "ldap": {
                    "host": "ldap.example.com:636",
                    "use_ssl": True,
                    "start_tls": False,
                    "ssl_skip_verify": False,
                    "bind_dn": "cn=prometheus,ou=services,dc=example,dc=com",
                    "bind_password": "${LDAP_BIND_PASSWORD}",
                    "search_filter": "(uid=%s)",
                    "search_base_dns": ["ou=users,dc=example,dc=com"],
                    "group_search_filter": "(&(objectClass=posixGroup)(memberUid=%s))",
                    "group_search_base_dns": ["ou=groups,dc=example,dc=com"],
                    "attr": {
                        "username": "uid",
                        "surname": "sn",
                        "email": "mail",
                        "member_of": "memberOf"
                    }
                }
            },
            AuthMethod.MTLS: {
                "client_cert_auth": {
                    "enabled": True,
                    "ca_file": "/etc/prometheus/certs/client-ca.crt",
                    "cert_file": "/etc/prometheus/certs/server.crt",
                    "key_file": "/etc/prometheus/certs/server.key",
                    "client_auth_type": "RequireAndVerifyClientCert",
                    "allowed_dns_names": ["prometheus-client.example.com"],
                    "allowed_email_addresses": ["prometheus@example.com"]
                }
            }
        }
        return configs.get(auth_method, {})

    def create_rbac_config(self) -> Dict[str, Any]:
        """Return a Casbin-style RBAC model/policy plus a demo user table."""
        return {
            "authorization": {
                "role_based_access_control": {
                    "policy": """
p, admin, /*, *, allow
p, editor, /api/v1/query*, GET, allow
p, editor, /api/v1/label*, GET, allow
p, editor, /api/v1/series*, GET, allow
p, editor, /api/v1/metadata*, GET, allow
p, viewer, /api/v1/query*, GET, allow
p, viewer, /api/v1/label*, GET, allow
p, viewer, /api/v1/series*, GET, allow
p, viewer, /graph*, GET, allow
p, viewer, /static*, GET, allow
g, alice@example.com, admin
g, bob@example.com, editor
g, charlie@example.com, viewer
""",
                    "policy_effect": "allow",
                    "request_definition": """
[request_definition]
r = sub, obj, act
[policy_definition]
p = sub, obj, act, eft
[role_definition]
g = _, _
[policy_effect]
e = some(where (p.eft == allow))
[matchers]
m = g(r.sub, p.sub) && keyMatch2(r.obj, p.obj) && regexMatch(r.act, p.act)
"""
                }
            },
            "users": {
                "admin": {
                    "password_hash": "$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay",
                    "roles": ["admin"],
                    "permissions": ["*"]
                },
                "editor": {
                    "password_hash": "$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay",
                    "roles": ["editor"],
                    "permissions": ["read", "query"]
                },
                "viewer": {
                    "password_hash": "$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay",
                    "roles": ["viewer"],
                    "permissions": ["read"]
                }
            }
        }

    def create_audit_config(self) -> Dict[str, Any]:
        """Return audit-log settings plus audit alerting rules (YAML string)."""
        return {
            "audit": {
                "enabled": True,
                "log_file": "/var/log/prometheus/audit.log",
                "log_format": "json",
                "log_max_size": "100MB",
                "log_max_backups": 10,
                "log_max_age": 30,  # days
                "events": [
                    "query",
                    "config_reload",
                    "rule_evaluation",
                    "target_discovery",
                    "alert_firing",
                    "alert_resolved",
                    "user_login",
                    "user_logout",
                    "admin_action"
                ],
                # Label names to redact from audit records.
                "sensitive_labels": [
                    "password",
                    "token",
                    "secret",
                    "key",
                    "credential"
                ],
                "retention_days": 365
            },
            "audit_rules": """
# 监控配置变更
- alert: PrometheusConfigChanged
  expr: increase(prometheus_config_last_reload_success_timestamp_seconds[5m]) > 0
  for: 0m
  labels:
    severity: info
    category: audit
  annotations:
    summary: "Prometheus 配置已重载"
    description: "Prometheus 实例 {{ $labels.instance }} 配置在过去5分钟内被重载"
# 监控查询活动
- alert: HighQueryRate
  expr: rate(prometheus_http_requests_total{handler="/api/v1/query"}[5m]) > 100
  for: 5m
  labels:
    severity: warning
    category: audit
  annotations:
    summary: "查询频率过高"
    description: "Prometheus 实例 {{ $labels.instance }} 查询频率过高: {{ $value }} req/s"
# 监控管理员操作
- alert: AdminAPIAccess
  expr: increase(prometheus_http_requests_total{handler=~"/api/v1/(admin|config).*"}[1m]) > 0
  for: 0m
  labels:
    severity: info
    category: audit
  annotations:
    summary: "管理员 API 访问"
    description: "检测到管理员 API 访问: {{ $labels.handler }}"
"""
        }

    def create_compliance_config(self, standards: List[str]) -> Dict[str, Any]:
        """Return compliance requirement templates for the requested standards.

        Args:
            standards: Standard names; supported: SOX, GDPR, HIPAA, PCI_DSS.
                Unknown names are silently skipped.
        """
        compliance_configs = {
            "SOX": {
                "data_retention": {
                    "financial_metrics": "7_years",
                    "audit_logs": "7_years",
                    "access_logs": "3_years"
                },
                "access_controls": {
                    "segregation_of_duties": True,
                    "least_privilege": True,
                    "regular_access_review": "quarterly"
                },
                "monitoring_requirements": [
                    "financial_transaction_monitoring",
                    "privileged_access_monitoring",
                    "data_integrity_monitoring"
                ]
            },
            "GDPR": {
                "data_protection": {
                    "encryption_at_rest": True,
                    "encryption_in_transit": True,
                    "pseudonymization": True,
                    "data_minimization": True
                },
                "privacy_controls": {
                    "consent_management": True,
                    "right_to_erasure": True,
                    "data_portability": True,
                    "privacy_by_design": True
                },
                "breach_notification": {
                    "detection_time": "72_hours",
                    "notification_time": "72_hours",
                    "documentation_required": True
                }
            },
            "HIPAA": {
                "safeguards": {
                    "administrative": True,
                    "physical": True,
                    "technical": True
                },
                "access_controls": {
                    "unique_user_identification": True,
                    "automatic_logoff": True,
                    "encryption_decryption": True
                },
                "audit_controls": {
                    "audit_logs": True,
                    "audit_review": "monthly",
                    "audit_retention": "6_years"
                }
            },
            "PCI_DSS": {
                "network_security": {
                    "firewall_configuration": True,
                    "network_segmentation": True,
                    "secure_protocols": True
                },
                "data_protection": {
                    "cardholder_data_protection": True,
                    "encryption_key_management": True,
                    "secure_deletion": True
                },
                "monitoring": {
                    "log_monitoring": True,
                    "file_integrity_monitoring": True,
                    "vulnerability_scanning": "quarterly"
                }
            }
        }
        result = {}
        for standard in standards:
            if standard in compliance_configs:
                result[standard] = compliance_configs[standard]
        return result

    def create_data_governance_config(self) -> Dict[str, Any]:
        """Return data-classification, lifecycle and privacy-control policies."""
        return {
            "data_classification": {
                "public": {
                    "retention_period": "1_year",
                    "encryption_required": False,
                    "access_level": "all_users"
                },
                "internal": {
                    "retention_period": "3_years",
                    "encryption_required": True,
                    "access_level": "employees_only"
                },
                "confidential": {
                    "retention_period": "7_years",
                    "encryption_required": True,
                    "access_level": "authorized_personnel"
                },
                "restricted": {
                    "retention_period": "10_years",
                    "encryption_required": True,
                    "access_level": "need_to_know_basis"
                }
            },
            "data_lifecycle": {
                "creation": {
                    "classification_required": True,
                    "metadata_tagging": True,
                    "owner_assignment": True
                },
                "storage": {
                    "encryption_standards": "AES-256",
                    "backup_frequency": "daily",
                    "geographic_restrictions": True
                },
                "processing": {
                    "purpose_limitation": True,
                    "processing_log": True,
                    "consent_verification": True
                },
                "retention": {
                    "automatic_deletion": True,
                    "retention_schedule": True,
                    "legal_hold_support": True
                },
                "disposal": {
                    "secure_deletion": True,
                    "disposal_certificate": True,
                    "verification_required": True
                }
            },
            "privacy_controls": {
                "data_anonymization": {
                    "techniques": ["k_anonymity", "l_diversity", "t_closeness"],
                    "threshold_k": 5,
                    "threshold_l": 3
                },
                "consent_management": {
                    "granular_consent": True,
                    "consent_withdrawal": True,
                    "consent_audit_trail": True
                },
                "subject_rights": {
                    "right_of_access": True,
                    "right_of_rectification": True,
                    "right_of_erasure": True,
                    "right_of_portability": True
                }
            }
        }

    def create_security_monitoring_config(self) -> Dict[str, Any]:
        """Return security alerting rules plus threat-detection/IR metadata.

        FIX: ConfigurationTampering now reads
        ``... > 0 and (hour() < 6 or hour() > 22)``.  Without the
        parentheses, PromQL's precedence (``and`` tighter than ``or``) made
        the rule fire every hour after 22:00 even with no config change.
        """
        return {
            "security_rules": """
groups:
  - name: security_monitoring
    rules:
      # 异常登录检测
      - alert: UnusualLoginActivity
        expr: increase(prometheus_http_requests_total{code="401"}[5m]) > 10
        for: 1m
        labels:
          severity: warning
          category: security
        annotations:
          summary: "异常登录活动"
          description: "检测到异常登录尝试,5分钟内失败次数: {{ $value }}"
      # 权限提升检测
      - alert: PrivilegeEscalation
        expr: increase(prometheus_http_requests_total{handler=~"/api/v1/admin.*",code="200"}[1m]) > 0
        for: 0m
        labels:
          severity: critical
          category: security
        annotations:
          summary: "权限提升活动"
          description: "检测到管理员权限使用: {{ $labels.handler }}"
      # 数据泄露检测
      - alert: DataExfiltration
        expr: rate(prometheus_http_response_size_bytes[5m]) > 10485760 # 10MB/s
        for: 2m
        labels:
          severity: critical
          category: security
        annotations:
          summary: "可能的数据泄露"
          description: "检测到异常大量数据传输: {{ $value }} bytes/s"
      # 配置篡改检测
      - alert: ConfigurationTampering
        expr: increase(prometheus_config_last_reload_success_timestamp_seconds[1m]) > 0 and (hour() < 6 or hour() > 22)
        for: 0m
        labels:
          severity: warning
          category: security
        annotations:
          summary: "非工作时间配置变更"
          description: "检测到非工作时间的配置变更"
      # 暴力破解检测
      - alert: BruteForceAttack
        expr: rate(prometheus_http_requests_total{code="401"}[1m]) > 5
        for: 2m
        labels:
          severity: critical
          category: security
        annotations:
          summary: "暴力破解攻击"
          description: "检测到暴力破解攻击,失败率: {{ $value }} req/s"
""",
            "threat_detection": {
                "indicators": [
                    "unusual_query_patterns",
                    "privilege_escalation",
                    "data_exfiltration",
                    "configuration_tampering",
                    "brute_force_attacks",
                    "insider_threats"
                ],
                "response_actions": [
                    "alert_security_team",
                    "block_suspicious_ip",
                    "disable_compromised_account",
                    "initiate_incident_response",
                    "preserve_evidence"
                ]
            },
            "incident_response": {
                "severity_levels": {
                    "low": {
                        "response_time": "4_hours",
                        "escalation_time": "24_hours",
                        "notification_channels": ["email"]
                    },
                    "medium": {
                        "response_time": "2_hours",
                        "escalation_time": "8_hours",
                        "notification_channels": ["email", "slack"]
                    },
                    "high": {
                        "response_time": "30_minutes",
                        "escalation_time": "2_hours",
                        "notification_channels": ["email", "slack", "pagerduty"]
                    },
                    "critical": {
                        "response_time": "15_minutes",
                        "escalation_time": "1_hour",
                        "notification_channels": ["email", "slack", "pagerduty", "phone"]
                    }
                }
            }
        }

    def generate_security_assessment(self, current_config: Dict[str, Any]) -> Dict[str, Any]:
        """Score the deployment against weighted security criteria.

        Args:
            current_config: Reserved for a real implementation; the demo
                scoring below does not inspect it.

        Returns:
            Assessment dict: weighted overall score, per-category scores,
            recommendations, per-standard compliance verdicts and any
            critical findings.

        FIX: scoring previously used ``hash(category)``.  Builtin ``hash``
        on strings is randomized per process (PYTHONHASHSEED), so the same
        deployment got different scores on every run.  ``zlib.crc32`` is
        deterministic across processes and platforms.
        """
        import zlib  # local import: deterministic checksum for the demo scoring

        assessment_criteria = {
            "authentication": {
                "weight": 0.25,
                "checks": [
                    "multi_factor_authentication",
                    "strong_password_policy",
                    "account_lockout_policy",
                    "session_management"
                ]
            },
            "authorization": {
                "weight": 0.20,
                "checks": [
                    "role_based_access_control",
                    "least_privilege_principle",
                    "segregation_of_duties",
                    "regular_access_review"
                ]
            },
            "encryption": {
                "weight": 0.20,
                "checks": [
                    "data_at_rest_encryption",
                    "data_in_transit_encryption",
                    "key_management",
                    "certificate_management"
                ]
            },
            "monitoring": {
                "weight": 0.15,
                "checks": [
                    "security_event_logging",
                    "real_time_monitoring",
                    "anomaly_detection",
                    "incident_response"
                ]
            },
            "compliance": {
                "weight": 0.10,
                "checks": [
                    "regulatory_compliance",
                    "audit_trail",
                    "data_governance",
                    "privacy_controls"
                ]
            },
            "infrastructure": {
                "weight": 0.10,
                "checks": [
                    "network_security",
                    "system_hardening",
                    "vulnerability_management",
                    "backup_security"
                ]
            }
        }
        # Simulated scoring: deterministic pseudo-score in [75, 99] per category.
        scores = {}
        overall_score = 0
        for category, criteria in assessment_criteria.items():
            category_score = 75 + (zlib.crc32(category.encode("utf-8")) % 25)
            scores[category] = {
                "score": category_score,
                "weight": criteria["weight"],
                "weighted_score": category_score * criteria["weight"]
            }
            overall_score += scores[category]["weighted_score"]
        # Recommendation tiers are cumulative: a <80 score also gets the <90 advice.
        recommendations = []
        if overall_score < 80:
            recommendations.extend([
                "实施多因素认证",
                "加强访问控制策略",
                "启用全面的安全监控"
            ])
        if overall_score < 90:
            recommendations.extend([
                "定期进行安全评估",
                "实施零信任架构",
                "加强员工安全培训"
            ])
        return {
            "overall_score": round(overall_score, 2),
            "security_level": "高" if overall_score >= 90 else "中" if overall_score >= 75 else "低",
            "category_scores": scores,
            "recommendations": recommendations,
            "compliance_status": {
                "SOX": overall_score >= 85,
                "GDPR": overall_score >= 80,
                "HIPAA": overall_score >= 90,
                "PCI_DSS": overall_score >= 85
            },
            "next_assessment_date": "2024-07-01",  # NOTE(review): hard-coded; consider deriving from today's date
            "critical_findings": [
                finding for finding in [
                    "弱密码策略" if overall_score < 70 else None,
                    "缺少加密" if overall_score < 75 else None,
                    "监控不足" if overall_score < 80 else None
                ] if finding
            ]
        }
# Usage example: exercise every SecurityManager factory once and print a summary.
security_manager = SecurityManager()
# TLS configuration at the HIGH hardening level
tls_config = security_manager.create_tls_config(SecurityLevel.HIGH)
# Authentication configs for three of the supported methods
basic_auth = security_manager.create_auth_config(AuthMethod.BASIC_AUTH)
oauth2_config = security_manager.create_auth_config(AuthMethod.OAUTH2)
ldap_config = security_manager.create_auth_config(AuthMethod.LDAP)
# RBAC model/policy
rbac_config = security_manager.create_rbac_config()
# Audit logging + audit alerting rules
audit_config = security_manager.create_audit_config()
# Compliance templates for three standards
compliance_config = security_manager.create_compliance_config(["SOX", "GDPR", "HIPAA"])
# Data-governance policies
data_governance = security_manager.create_data_governance_config()
# Security monitoring rules
security_monitoring = security_manager.create_security_monitoring_config()
# Simulated security assessment
security_assessment = security_manager.generate_security_assessment({})
print("安全与合规性")
print(f"TLS 配置组件数: {len(tls_config)}")
# NOTE(review): hard-coded "4" disagrees with the 6 AuthMethod members — verify.
print(f"支持的认证方法数: 4")
print(f"RBAC 用户数: {len(rbac_config['users'])}")
print(f"合规标准数: {len(compliance_config)}")
print(f"数据分类级别数: {len(data_governance['data_classification'])}")
print(f"安全评估总分: {security_assessment['overall_score']}")
print(f"安全级别: {security_assessment['security_level']}")
运维自动化与 DevOps 集成
DevOpsManager 类
class DeploymentStrategy(Enum):
    """Release/deployment strategies supported by the CI/CD pipeline builder."""
    BLUE_GREEN = "blue_green"
    CANARY = "canary"
    ROLLING = "rolling"
    RECREATE = "recreate"
    A_B_TESTING = "a_b_testing"
class PipelineStage(Enum):
    """Stages of the CI/CD pipeline."""
    BUILD = "build"
    TEST = "test"
    SECURITY_SCAN = "security_scan"
    DEPLOY = "deploy"
    MONITOR = "monitor"
    ROLLBACK = "rollback"
@dataclass
class AutomationConfig:
    """Operational automation settings for one deployment pipeline."""
    strategy: DeploymentStrategy        # how releases are rolled out
    auto_scaling: bool                  # enable automatic scale up/down
    health_checks: bool                 # run health checks after deploys
    rollback_enabled: bool              # allow automatic rollback on failure
    monitoring_integration: bool        # wire deploys into monitoring
    notification_channels: List[str]    # e.g. ["email", "slack"]
class DevOpsManager:
"""DevOps 管理器"""
def __init__(self):
    """Initialize empty registries; the create_* helpers below are pure factories."""
    # NOTE(review): none of the visible methods write these — presumably
    # populated by external callers; verify before removing.
    self.pipelines = {}
    self.deployment_configs = {}
    self.automation_rules = {}
def create_ci_cd_pipeline(self, strategy: DeploymentStrategy) -> Dict[str, Any]:
    """Build a CircleCI-style pipeline definition for the given strategy.

    Args:
        strategy: Deployment strategy; BLUE_GREEN/CANARY/ROLLING each plug in
            a strategy-specific ``deploy_production`` job.

    Returns:
        Pipeline dict with ``version``, ``orbs``, ``workflows`` and ``jobs``.

    NOTE(review): the workflow references deploy_staging, integration_tests
    and post_deployment_tests jobs that are never defined in ``jobs`` below,
    and RECREATE / A_B_TESTING strategies add no deploy_production job at
    all — confirm whether those jobs are defined elsewhere.
    """
    base_pipeline = {
        "version": "2.1",
        "orbs": {
            "prometheus": "prometheus/prometheus@1.0.0",
            "kubernetes": "circleci/kubernetes@1.3.0",
            "helm": "circleci/helm@2.0.1"
        },
        "workflows": {
            "prometheus_deployment": {
                "jobs": [
                    "build",
                    "test",
                    "security_scan",
                    "deploy_staging",
                    "integration_tests",
                    "deploy_production",
                    "post_deployment_tests"
                ]
            }
        }
    }
    jobs = {
        "build": {
            "docker": [{"image": "cimg/go:1.21"}],
            "steps": [
                "checkout",
                {
                    "run": {
                        "name": "构建 Prometheus",
                        "command": """
make build
make test
docker build -t prometheus:${CIRCLE_SHA1} .
docker tag prometheus:${CIRCLE_SHA1} prometheus:latest
"""
                    }
                },
                {
                    "run": {
                        "name": "推送镜像",
                        "command": """
echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin
docker push prometheus:${CIRCLE_SHA1}
docker push prometheus:latest
"""
                    }
                }
            ]
        },
        "test": {
            "docker": [{"image": "cimg/go:1.21"}],
            "steps": [
                "checkout",
                {
                    "run": {
                        "name": "运行测试",
                        "command": """
go test -v ./...
go test -race ./...
go test -coverprofile=coverage.out ./...
go tool cover -html=coverage.out -o coverage.html
"""
                    }
                },
                {
                    "store_artifacts": {
                        "path": "coverage.html",
                        "destination": "coverage-report"
                    }
                }
            ]
        },
        "security_scan": {
            "docker": [{"image": "cimg/go:1.21"}],
            "steps": [
                "checkout",
                {
                    "run": {
                        "name": "安全扫描",
                        "command": """
# 依赖漏洞扫描
go list -json -m all | nancy sleuth
# 代码安全扫描
gosec ./...
# 容器镜像扫描
trivy image prometheus:${CIRCLE_SHA1}
# 配置安全检查
kube-score score k8s-manifests/
"""
                    }
                }
            ]
        }
    }
    # Plug in the strategy-specific production deployment job.
    if strategy == DeploymentStrategy.BLUE_GREEN:
        jobs["deploy_production"] = self._create_blue_green_deployment()
    elif strategy == DeploymentStrategy.CANARY:
        jobs["deploy_production"] = self._create_canary_deployment()
    elif strategy == DeploymentStrategy.ROLLING:
        jobs["deploy_production"] = self._create_rolling_deployment()
    base_pipeline["jobs"] = jobs
    return base_pipeline
def _create_blue_green_deployment(self) -> Dict[str, Any]:
    """Return the CI job for a blue/green production deployment.

    Deploys the green environment, health-checks it, flips the Service
    selector to green, verifies, then tears down blue.
    """
    return {
        "docker": [{"image": "cimg/deploy:2023.06"}],
        "steps": [
            "checkout",
            {
                "kubernetes/install-kubectl": {
                    "kubectl-version": "v1.28.0"
                }
            },
            {
                "run": {
                    "name": "蓝绿部署",
                    "command": """
# 部署到绿色环境
kubectl apply -f k8s-manifests/green/
# 等待绿色环境就绪
kubectl wait --for=condition=ready pod -l app=prometheus,env=green --timeout=300s
# 运行健康检查
./scripts/health-check.sh green
# 切换流量到绿色环境
kubectl patch service prometheus-service -p '{"spec":{"selector":{"env":"green"}}}'
# 验证切换成功
./scripts/verify-deployment.sh
# 清理蓝色环境
kubectl delete -f k8s-manifests/blue/
"""
                }
            }
        ]
    }
def _create_canary_deployment(self) -> Dict[str, Any]:
    """Return the CI job for a canary production deployment.

    Starts at 5% traffic, then steps through 10/25/50/75/100% with a
    monitoring window after each step; any failed window triggers rollback.
    """
    return {
        "docker": [{"image": "cimg/deploy:2023.06"}],
        "steps": [
            "checkout",
            {
                "run": {
                    "name": "金丝雀部署",
                    "command": """
# 部署金丝雀版本 (5% 流量)
kubectl apply -f k8s-manifests/canary/
kubectl patch deployment prometheus-canary -p '{"spec":{"replicas":1}}'
# 配置流量分割
kubectl apply -f istio-manifests/virtual-service-canary.yaml
# 监控金丝雀指标
./scripts/monitor-canary.sh 300 # 监控5分钟
# 如果指标正常,逐步增加流量
for traffic in 10 25 50 75 100; do
  ./scripts/update-traffic-split.sh $traffic
  ./scripts/monitor-canary.sh 180 # 监控3分钟
  if [ $? -ne 0 ]; then
    echo "金丝雀部署失败,回滚"
    ./scripts/rollback-canary.sh
    exit 1
  fi
done
# 完成部署
kubectl delete -f k8s-manifests/stable/
kubectl apply -f k8s-manifests/new-stable/
"""
                }
            }
        ]
    }
def _create_rolling_deployment(self) -> Dict[str, Any]:
    """Return the CI job for a rolling production deployment.

    Updates the Deployment image, waits for the rollout, verifies, and
    rolls back with ``kubectl rollout undo`` if verification fails.
    """
    return {
        "docker": [{"image": "cimg/deploy:2023.06"}],
        "steps": [
            "checkout",
            {
                "run": {
                    "name": "滚动部署",
                    "command": """
# 更新部署镜像
kubectl set image deployment/prometheus prometheus=prometheus:${CIRCLE_SHA1}
# 等待滚动更新完成
kubectl rollout status deployment/prometheus --timeout=600s
# 验证部署
./scripts/verify-deployment.sh
# 如果验证失败,回滚
if [ $? -ne 0 ]; then
  echo "部署验证失败,执行回滚"
  kubectl rollout undo deployment/prometheus
  kubectl rollout status deployment/prometheus --timeout=300s
  exit 1
fi
"""
                }
            }
        ]
    }
def create_infrastructure_as_code(self) -> Dict[str, Any]:
    """Return IaC templates for the monitoring stack.

    Returns:
        Dict with two entries:
        - "terraform": main.tf / variables.tf / outputs.tf (HCL strings) that
          provision an EKS cluster and install kube-prometheus-stack via Helm.
        - "ansible": a playbook (YAML string) that creates the monitoring
          namespace and deploys Prometheus with an HPA.
    """
    return {
        "terraform": {
            "main.tf": """
terraform {
  required_version = ">= 1.0"
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.11"
    }
  }
  backend "s3" {
    bucket = "prometheus-terraform-state"
    key    = "prometheus/terraform.tfstate"
    region = "us-west-2"
  }
}

provider "aws" {
  region = var.aws_region
}

provider "kubernetes" {
  host                   = module.eks.cluster_endpoint
  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
  token                  = data.aws_eks_cluster_auth.cluster.token
}

provider "helm" {
  kubernetes {
    host                   = module.eks.cluster_endpoint
    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
    token                  = data.aws_eks_cluster_auth.cluster.token
  }
}

# EKS 集群
module "eks" {
  source          = "terraform-aws-modules/eks/aws"
  cluster_name    = var.cluster_name
  cluster_version = "1.28"
  vpc_id          = module.vpc.vpc_id
  subnet_ids      = module.vpc.private_subnets
  node_groups = {
    prometheus = {
      desired_capacity = 3
      max_capacity     = 10
      min_capacity     = 3
      instance_types   = ["m5.large"]
      k8s_labels = {
        Environment = var.environment
        Application = "prometheus"
      }
    }
  }
}

# Prometheus Helm Chart
resource "helm_release" "prometheus" {
  name             = "prometheus"
  repository       = "https://prometheus-community.github.io/helm-charts"
  chart            = "kube-prometheus-stack"
  namespace        = "monitoring"
  create_namespace = true
  values = [
    file("${path.module}/prometheus-values.yaml")
  ]
  depends_on = [module.eks]
}
""",
            "variables.tf": """
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-west-2"
}

variable "cluster_name" {
  description = "EKS cluster name"
  type        = string
  default     = "prometheus-cluster"
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "production"
}
""",
            "outputs.tf": """
output "cluster_endpoint" {
  description = "Endpoint for EKS control plane"
  value       = module.eks.cluster_endpoint
}

output "cluster_security_group_id" {
  description = "Security group ids attached to the cluster control plane"
  value       = module.eks.cluster_security_group_id
}

output "prometheus_url" {
  description = "Prometheus URL"
  value       = "http://${helm_release.prometheus.status[0].load_balancer[0].ingress[0].hostname}:9090"
}
"""
        },
        "ansible": {
            "playbook.yml": """
---
- name: Deploy Prometheus Infrastructure
  hosts: localhost
  gather_facts: false
  vars:
    prometheus_version: "2.45.0"
    alertmanager_version: "0.25.0"
    grafana_version: "10.0.0"
  tasks:
    - name: Create monitoring namespace
      kubernetes.core.k8s:
        name: monitoring
        api_version: v1
        kind: Namespace
        state: present
    - name: Deploy Prometheus ConfigMap
      kubernetes.core.k8s:
        definition:
          apiVersion: v1
          kind: ConfigMap
          metadata:
            name: prometheus-config
            namespace: monitoring
          data:
            prometheus.yml: |
              {{ lookup('file', 'prometheus.yml') | indent(14) }}
    - name: Deploy Prometheus
      kubernetes.core.k8s:
        definition:
          apiVersion: apps/v1
          kind: Deployment
          metadata:
            name: prometheus
            namespace: monitoring
          spec:
            replicas: 2
            selector:
              matchLabels:
                app: prometheus
            template:
              metadata:
                labels:
                  app: prometheus
              spec:
                containers:
                  - name: prometheus
                    image: "prom/prometheus:v{{ prometheus_version }}"
                    ports:
                      - containerPort: 9090
                    volumeMounts:
                      - name: config
                        mountPath: /etc/prometheus
                      - name: storage
                        mountPath: /prometheus
                volumes:
                  - name: config
                    configMap:
                      name: prometheus-config
                  - name: storage
                    persistentVolumeClaim:
                      claimName: prometheus-storage
    - name: Configure Auto-scaling
      kubernetes.core.k8s:
        definition:
          apiVersion: autoscaling/v2
          kind: HorizontalPodAutoscaler
          metadata:
            name: prometheus-hpa
            namespace: monitoring
          spec:
            scaleTargetRef:
              apiVersion: apps/v1
              kind: Deployment
              name: prometheus
            minReplicas: 2
            maxReplicas: 10
            metrics:
              - type: Resource
                resource:
                  name: cpu
                  target:
                    type: Utilization
                    averageUtilization: 70
              - type: Resource
                resource:
                  name: memory
                  target:
                    type: Utilization
                    averageUtilization: 80
"""
        }
    }
def create_monitoring_automation(self) -> Dict[str, Any]:
    """Return automation config: scaling rules, auto-remediation and chaos experiments.

    Returns:
        Dict with three entries:
        - "auto_scaling_rules": alerting rules (YAML string) whose labels
          carry a scale_up/scale_down action for an external automation hook.
        - "auto_remediation": webhook target plus condition-to-command
          remediation actions.
        - "chaos_engineering": scheduled chaos experiments (pod kill,
          network delay, CPU stress).
    """
    return {
        "auto_scaling_rules": """
groups:
  - name: auto_scaling
    rules:
      # CPU 自动扩容
      - alert: HighCPUUsage
        expr: avg(rate(container_cpu_usage_seconds_total[5m])) by (pod) > 0.8
        for: 2m
        labels:
          severity: warning
          action: scale_up
        annotations:
          summary: "Pod CPU 使用率过高"
          description: "Pod {{ $labels.pod }} CPU 使用率: {{ $value }}"
          scale_target: "deployment/prometheus"
          scale_action: "increase_replicas"
      # 内存自动扩容
      - alert: HighMemoryUsage
        expr: avg(container_memory_working_set_bytes / container_spec_memory_limit_bytes) by (pod) > 0.85
        for: 2m
        labels:
          severity: warning
          action: scale_up
        annotations:
          summary: "Pod 内存使用率过高"
          description: "Pod {{ $labels.pod }} 内存使用率: {{ $value }}"
      # 查询延迟自动扩容
      - alert: HighQueryLatency
        expr: histogram_quantile(0.95, rate(prometheus_http_request_duration_seconds_bucket[5m])) > 1
        for: 3m
        labels:
          severity: warning
          action: scale_up
        annotations:
          summary: "查询延迟过高"
          description: "95% 查询延迟: {{ $value }}s"
      # 自动缩容
      - alert: LowResourceUsage
        expr: |
          avg(rate(container_cpu_usage_seconds_total[10m])) by (deployment) < 0.2
          and
          avg(container_memory_working_set_bytes / container_spec_memory_limit_bytes) by (deployment) < 0.5
        for: 10m
        labels:
          severity: info
          action: scale_down
        annotations:
          summary: "资源使用率较低"
          description: "部署 {{ $labels.deployment }} 资源使用率较低,可以缩容"
""",
        "auto_remediation": {
            "webhook_config": {
                "url": "http://automation-service:8080/webhook",
                "http_config": {
                    "bearer_token": "${AUTOMATION_TOKEN}"
                }
            },
            # Each action maps triggering conditions to a remediation command.
            # NOTE(review): clear_cache issues a TSDB delete_series admin call —
            # destructive; confirm the admin API is intentionally enabled.
            "remediation_actions": {
                "restart_pod": {
                    "command": "kubectl delete pod -l app=prometheus",
                    "conditions": ["pod_crash_loop", "pod_not_ready"]
                },
                "scale_deployment": {
                    "command": "kubectl scale deployment prometheus --replicas={{.replicas}}",
                    "conditions": ["high_cpu", "high_memory", "high_latency"]
                },
                "reload_config": {
                    "command": "curl -X POST http://prometheus:9090/-/reload",
                    "conditions": ["config_error"]
                },
                "clear_cache": {
                    "command": "curl -X POST http://prometheus:9090/api/v1/admin/tsdb/delete_series",
                    "conditions": ["high_memory", "storage_full"]
                }
            }
        },
        "chaos_engineering": {
            "experiments": [
                {
                    "name": "pod_failure",
                    "description": "随机杀死 Prometheus Pod",
                    "schedule": "0 2 * * 1",  # Mondays 02:00
                    "config": {
                        "action": "pod-kill",
                        "selector": {
                            "labelSelectors": {
                                "app": "prometheus"
                            }
                        },
                        "mode": "one",
                        "duration": "30s"
                    }
                },
                {
                    "name": "network_delay",
                    "description": "模拟网络延迟",
                    "schedule": "0 3 * * 2",  # Tuesdays 03:00
                    "config": {
                        "action": "netem",
                        "selector": {
                            "labelSelectors": {
                                "app": "prometheus"
                            }
                        },
                        "mode": "all",
                        "duration": "5m",
                        "delay": "100ms",
                        "jitter": "10ms"
                    }
                },
                {
                    "name": "cpu_stress",
                    "description": "CPU 压力测试",
                    "schedule": "0 4 * * 3",  # Wednesdays 04:00
                    "config": {
                        "action": "stress",
                        "selector": {
                            "labelSelectors": {
                                "app": "prometheus"
                            }
                        },
                        "mode": "one",
                        "duration": "10m",
                        "stressors": {
                            "cpu": {
                                "workers": 2,
                                "load": 80
                            }
                        }
                    }
                }
            ]
        }
    }
def create_gitops_config(self) -> Dict[str, Any]:
    """Create GitOps deployment configurations for ArgoCD and Flux.

    Returns:
        A two-level mapping: tool name (``"argocd"`` / ``"flux"``) to a
        dict of manifest filename -> YAML manifest text. The YAML is
        stored as plain strings so callers can write the files out or
        feed them to a Git repository as-is.

    NOTE(review): the source of this block arrived with all indentation
    stripped; the YAML nesting below was reconstructed from the standard
    ArgoCD/Flux CRD schemas — verify against the original repo copy.
    """
    return {
        "argocd": {
            # ArgoCD Application: syncs the Helm-based manifests from Git
            # into the `monitoring` namespace with automated prune/self-heal.
            "application.yaml": """
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  source:
    repoURL: https://github.com/company/prometheus-config
    targetRevision: HEAD
    path: k8s-manifests
    helm:
      valueFiles:
        - values-production.yaml
  destination:
    server: https://kubernetes.default.svc
    namespace: monitoring
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
  revisionHistoryLimit: 10
""",
            # ArgoCD AppProject: scopes allowed repos/destinations and
            # defines admin vs. developer RBAC roles for the project.
            "appproject.yaml": """
apiVersion: argoproj.io/v1alpha1
kind: AppProject
metadata:
  name: prometheus-project
  namespace: argocd
spec:
  description: Prometheus monitoring project
  sourceRepos:
    - 'https://github.com/company/prometheus-config'
    - 'https://prometheus-community.github.io/helm-charts'
  destinations:
    - namespace: monitoring
      server: https://kubernetes.default.svc
    - namespace: kube-system
      server: https://kubernetes.default.svc
  clusterResourceWhitelist:
    - group: ''
      kind: Namespace
    - group: 'rbac.authorization.k8s.io'
      kind: ClusterRole
    - group: 'rbac.authorization.k8s.io'
      kind: ClusterRoleBinding
  namespaceResourceWhitelist:
    - group: ''
      kind: ConfigMap
    - group: ''
      kind: Secret
    - group: ''
      kind: Service
    - group: 'apps'
      kind: Deployment
    - group: 'apps'
      kind: StatefulSet
  roles:
    - name: admin
      description: Admin access to prometheus project
      policies:
        - p, proj:prometheus-project:admin, applications, *, prometheus-project/*, allow
        - p, proj:prometheus-project:admin, repositories, *, *, allow
      groups:
        - prometheus-admins
    - name: developer
      description: Developer access to prometheus project
      policies:
        - p, proj:prometheus-project:developer, applications, get, prometheus-project/*, allow
        - p, proj:prometheus-project:developer, applications, sync, prometheus-project/*, allow
      groups:
        - prometheus-developers
"""
        },
        "flux": {
            # Flux Kustomization: reconciles the monitoring manifests every
            # 5m and gates health on the prometheus/alertmanager Deployments.
            "kustomization.yaml": """
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
kind: Kustomization
metadata:
  name: prometheus
  namespace: flux-system
spec:
  interval: 5m
  path: "./clusters/production/monitoring"
  prune: true
  sourceRef:
    kind: GitRepository
    name: prometheus-config
  validation: client
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: prometheus-server
      namespace: monitoring
    - apiVersion: apps/v1
      kind: Deployment
      name: alertmanager
      namespace: monitoring
  postBuild:
    substitute:
      cluster_name: "production"
      prometheus_retention: "30d"
      alertmanager_retention: "120h"
""",
            # Flux GitRepository: the config repo polled every minute.
            "gitrepository.yaml": """
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: GitRepository
metadata:
  name: prometheus-config
  namespace: flux-system
spec:
  interval: 1m
  ref:
    branch: main
  url: https://github.com/company/prometheus-config
  secretRef:
    name: git-credentials
""",
            # Flux HelmRelease: installs kube-prometheus-stack with
            # persistent storage and automatic upgrade/rollback remediation.
            "helmrelease.yaml": """
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: prometheus
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: '48.x'
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
      interval: 1m
  values:
    prometheus:
      prometheusSpec:
        retention: 30d
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: fast-ssd
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 100Gi
    alertmanager:
      alertmanagerSpec:
        retention: 120h
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: fast-ssd
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 10Gi
    grafana:
      persistence:
        enabled: true
        storageClassName: fast-ssd
        size: 10Gi
  upgrade:
    remediation:
      retries: 3
  rollback:
    cleanupOnFail: true
    force: true
"""
        }
    }
def generate_automation_report(self) -> Dict[str, Any]:
    """Produce a static automation / DevOps status report.

    Returns:
        Dict with deployment metrics, automation coverage percentages,
        measured improvements, textual recommendations and next-quarter
        goals. All figures are illustrative sample data hard-coded here
        (this method takes no live measurements).
    """
    return {
        "deployment_metrics": {
            "total_deployments": 156,
            "successful_deployments": 148,
            "failed_deployments": 8,
            # 148 / 156 ≈ 94.87 %
            "success_rate": 94.87,
            "average_deployment_time": "8m 32s",
            "rollback_count": 3,
            # 3 / 156 ≈ 1.92 %
            "rollback_rate": 1.92
        },
        # Percent of each practice area covered by automation.
        "automation_coverage": {
            "ci_cd_pipeline": 100,
            "infrastructure_as_code": 95,
            "monitoring_automation": 90,
            "auto_scaling": 85,
            "auto_remediation": 75,
            "chaos_engineering": 60
        },
        "performance_improvements": {
            "deployment_time_reduction": "65%",
            "manual_intervention_reduction": "80%",
            "incident_resolution_time": "45%",
            "resource_utilization_improvement": "30%",
            "cost_reduction": "25%"
        },
        "recommendations": [
            "增加混沌工程实验覆盖率",
            "实施更多自动修复规则",
            "优化部署流水线性能",
            "加强监控自动化",
            "完善回滚机制"
        ],
        "next_quarter_goals": {
            "target_success_rate": 98,
            "target_deployment_time": "6m",
            "target_automation_coverage": 95,
            "new_features": [
                "AI 驱动的异常检测",
                "预测性扩容",
                "智能告警降噪",
                "自动性能调优"
            ]
        }
    }
# Usage example: exercise the DevOpsManager facade end to end.
devops_manager = DevOpsManager()
# Build CI/CD pipeline definitions for both rollout strategies.
canary_pipeline = devops_manager.create_ci_cd_pipeline(DeploymentStrategy.CANARY)
blue_green_pipeline = devops_manager.create_ci_cd_pipeline(DeploymentStrategy.BLUE_GREEN)
# Infrastructure-as-code (Terraform/Ansible) configuration.
iac_config = devops_manager.create_infrastructure_as_code()
# Monitoring automation: scaling/remediation rules, chaos experiments.
monitoring_automation = devops_manager.create_monitoring_automation()
# GitOps manifests for ArgoCD and Flux.
gitops_config = devops_manager.create_gitops_config()
# Static automation status report.
automation_report = devops_manager.generate_automation_report()
print("运维自动化与 DevOps 集成")
print(f"CI/CD 流水线作业数: {len(canary_pipeline['jobs'])}")
print(f"基础设施组件数: {len(iac_config)}")
# Counts alert rules by splitting the YAML rule string on the
# '- alert:' marker; minus 1 discards the text before the first rule.
print(f"自动化规则数: {len(monitoring_automation['auto_scaling_rules'].split('- alert:')) - 1}")
print(f"GitOps 配置文件数: {len(gitops_config['argocd']) + len(gitops_config['flux'])}")
print(f"部署成功率: {automation_report['deployment_metrics']['success_rate']}%")
print(f"自动化覆盖率: {automation_report['automation_coverage']['ci_cd_pipeline']}%")
总结
核心要点
通过本教程,我们深入探讨了 Prometheus 的性能优化与扩展策略:
1. 存储优化
- 存储引擎配置:根据不同场景优化 TSDB 配置
- 远程存储集成:支持 Thanos、Cortex、M3DB 等解决方案
- 数据压缩:实施有效的压缩策略减少存储成本
- 保留策略:制定合理的数据保留和清理策略
2. 查询性能优化
- 记录规则:预计算常用查询以提高响应速度
- 查询优化:识别和优化慢查询
- 缓存策略:实施查询结果缓存机制
- 性能监控:持续监控查询性能指标
3. 集群部署与高可用
- 联邦架构:构建多层级监控体系
- Thanos 集群:实现长期存储和全局查询
- 负载均衡:配置 Active-Passive 和 Active-Active 模式
- Kubernetes 部署:容器化部署和自动化管理
4. 安全与合规
- TLS 加密:保护数据传输安全
- 身份认证:支持多种认证方式
- RBAC 授权:细粒度权限控制
- 合规性:满足 SOX、GDPR、HIPAA 等合规要求
5. 运维自动化
- CI/CD 流水线:自动化构建、测试和部署
- 基础设施即代码:使用 Terraform 和 Ansible 管理基础设施
- GitOps:通过 ArgoCD 和 Flux 实现声明式部署
- 混沌工程:提高系统韧性和可靠性
最佳实践总结
性能优化
- 监控优先:建立完善的性能监控体系
- 渐进优化:从最影响性能的问题开始优化
- 容量规划:基于历史数据进行容量预测
- 定期评估:持续评估和调整优化策略
扩展策略
- 水平扩展:优先考虑水平扩展而非垂直扩展
- 分层架构:构建分层的监控架构
- 数据分片:合理分片数据以提高查询性能
- 缓存利用:充分利用各级缓存机制
安全实践
- 最小权限:遵循最小权限原则
- 定期审计:定期进行安全审计和评估
- 加密传输:所有数据传输都应加密
- 合规遵循:严格遵循相关合规要求
自动化原则
- 一切即代码:基础设施、配置、流程都应代码化
- 持续集成:实施完整的 CI/CD 流水线
- 自动修复:建立自动故障检测和修复机制
- 监控驱动:基于监控数据驱动自动化决策
下一步学习建议
深入学习方向
- 云原生技术:深入学习 Kubernetes、Istio 等云原生技术
- 可观测性:学习分布式追踪、日志聚合等可观测性技术
- 机器学习:探索 AIOps 和智能运维技术
- 边缘计算:了解边缘环境下的监控挑战
实践项目建议
- 构建企业级监控平台:设计和实施完整的企业监控解决方案
- 性能优化项目:针对特定场景进行深度性能优化
- 多云监控:实现跨多个云平台的统一监控
- 智能运维系统:结合 AI/ML 技术构建智能运维平台
社区参与
- 开源贡献:参与 Prometheus 生态系统的开源项目
- 技术分享:在技术会议和社区分享实践经验
- 最佳实践:总结和分享企业级实践案例
- 工具开发:开发和分享有用的监控工具
结语
Prometheus 作为现代监控系统的核心,其性能优化和扩展能力直接影响整个监控体系的效果。通过本系列教程的学习,您应该已经掌握了从基础配置到高级优化的完整知识体系。
监控技术在不断发展,新的挑战和解决方案也在不断涌现。保持学习的热情,关注技术发展趋势,结合实际业务需求,持续优化和改进监控系统,是每个监控工程师的使命。
希望这个教程能够帮助您在 Prometheus 的学习和实践道路上更进一步,构建出高效、可靠、安全的监控系统!
Prometheus 性能优化与扩展教程完成!
本教程涵盖了 Prometheus 性能优化的各个方面,从存储优化到集群部署,从安全配置到运维自动化,为您提供了完整的企业级 Prometheus 部署和优化指南。