Overview
Prometheus is an open-source systems monitoring and alerting toolkit originally developed at SoundCloud. It features a multi-dimensional data model, a flexible query language (PromQL), an efficient time series database, and powerful alerting capabilities.
Learning Objectives
After completing this chapter, you will be able to:
- Understand the core concepts and architecture of Prometheus
- Install and configure Prometheus
- Understand the Prometheus data model and metric types
- Perform basic Prometheus configuration and day-to-day usage
Prometheus Architecture Overview
1. Core Components
from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
from datetime import datetime
class ComponentType(Enum):
"""Prometheus组件类型"""
PROMETHEUS_SERVER = "prometheus_server"
PUSHGATEWAY = "pushgateway"
ALERTMANAGER = "alertmanager"
EXPORTER = "exporter"
GRAFANA = "grafana"
CLIENT_LIBRARY = "client_library"
class MetricType(Enum):
"""指标类型"""
COUNTER = "counter"
GAUGE = "gauge"
HISTOGRAM = "histogram"
SUMMARY = "summary"
class ScrapeProtocol(Enum):
"""抓取协议"""
HTTP = "http"
HTTPS = "https"
@dataclass
class PrometheusComponent:
"""Prometheus组件"""
name: str
type: ComponentType
version: str
port: int
description: str
dependencies: List[str]
config_file: Optional[str] = None
@dataclass
class ScrapeConfig:
"""抓取配置"""
job_name: str
targets: List[str]
scrape_interval: str
scrape_timeout: str
metrics_path: str
scheme: ScrapeProtocol
params: Dict[str, List[str]]
basic_auth: Optional[Dict[str, str]] = None
bearer_token: Optional[str] = None
tls_config: Optional[Dict[str, Any]] = None
class PrometheusArchitecture:
"""Prometheus架构管理器"""
def __init__(self):
self.components = []
self.scrape_configs = []
self.alerting_rules = []
self.recording_rules = []
def get_core_components(self) -> List[PrometheusComponent]:
"""获取核心组件列表"""
return [
PrometheusComponent(
name="Prometheus Server",
type=ComponentType.PROMETHEUS_SERVER,
version="2.45.0",
port=9090,
description="核心服务器,负责数据收集、存储和查询",
dependencies=[],
config_file="prometheus.yml"
),
PrometheusComponent(
name="Pushgateway",
type=ComponentType.PUSHGATEWAY,
version="1.6.2",
port=9091,
description="用于短期作业的指标推送网关",
dependencies=["Prometheus Server"]
),
PrometheusComponent(
name="Alertmanager",
type=ComponentType.ALERTMANAGER,
version="0.25.0",
port=9093,
description="处理告警的路由、分组、静默和抑制",
dependencies=["Prometheus Server"],
config_file="alertmanager.yml"
),
PrometheusComponent(
name="Node Exporter",
type=ComponentType.EXPORTER,
version="1.6.1",
port=9100,
description="收集系统级指标(CPU、内存、磁盘等)",
dependencies=[]
),
PrometheusComponent(
name="Grafana",
type=ComponentType.GRAFANA,
version="10.0.0",
port=3000,
description="数据可视化和仪表板平台",
dependencies=["Prometheus Server"]
)
]
def create_architecture_diagram(self) -> str:
"""创建架构图"""
return """
┌─────────────────────────────────────────────────────────────────┐
│               Prometheus Monitoring Architecture               │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Grafana │ │ Alertmanager│ │ Pushgateway │ │
│ │ :3000 │ │ :9093 │ │ :9091 │ │
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
│ │ │ │ │
│ │ Query │ Alerts │ Push │
│ │ │ │ │
│ ┌──────▼──────────────────▼──────────────────▼──────┐ │
│ │ Prometheus Server │ │
│ │ :9090 │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
│ │ │ Storage │ │ Rules │ │ Scraper │ │ │
│ │ │ Engine │ │ Engine │ │ Engine │ │ │
│ │ └─────────────┘ └─────────────┘ └──────┬──────┘ │ │
│ └─────────────────────────────────────────────┬──────┘ │
│ │ │
│ Scrape │ │
│ │ │
│ ┌─────────────┐ ┌─────────────┐ ┌──────▼──────┐ │
│ │ MySQL │ │ Redis │ │ Node │ │
│ │ Exporter │ │ Exporter │ │ Exporter │ │
│ │ :9104 │ │ :9121 │ │ :9100 │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Application │ │ Nginx │ │ Kubernetes │ │
│ │ Metrics │ │ Exporter │ │ Metrics │ │
│ │ :8080 │ │ :9113 │ │ :10250 │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────┘
"""
def create_data_flow_diagram(self) -> str:
"""创建数据流图"""
return """
数据流向图:
1. 数据收集阶段
Target Applications → Exporters → Prometheus Server
2. 数据存储阶段
Prometheus Server → Time Series Database (TSDB)
3. 规则处理阶段
TSDB → Rules Engine → Recording Rules / Alerting Rules
4. 告警处理阶段
Alerting Rules → Alertmanager → Notification Channels
5. 数据查询阶段
TSDB → PromQL Engine → Grafana / API Clients
6. 短期作业处理
Batch Jobs → Pushgateway → Prometheus Server
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Target │────▶│ Exporter │────▶│ Prometheus │
│ Application │ │ │ │ Server │
└─────────────┘ └─────────────┘ └──────┬──────┘
│
┌─────────────┐ ┌─────────────┐ │
│ Batch Jobs │────▶│ Pushgateway │───────────┘
└─────────────┘ └─────────────┘
│
┌─────────────┐ ▼
│ Alertmanager│◀──┌─────────────┐
└─────────────┘ │ TSDB │
└──────┬──────┘
┌─────────────┐ │
│ Grafana │◀─────────┘
└─────────────┘
"""
# Usage example
architecture = PrometheusArchitecture()
# Get the core components
components = architecture.get_core_components()
print(f"Number of Prometheus core components: {len(components)}")
for component in components:
print(f"- {component.name} ({component.type.value}): {component.description}")
# Print the architecture diagram
print("\n" + architecture.create_architecture_diagram())
# Print the data-flow diagram
print("\n" + architecture.create_data_flow_diagram())
Installing Prometheus
1. Binary Installation
class PrometheusInstaller:
"""Prometheus安装器"""
def __init__(self):
self.version = "2.45.0"
self.platform = "linux-amd64"
self.install_dir = "/opt/prometheus"
self.config_dir = "/etc/prometheus"
self.data_dir = "/var/lib/prometheus"
self.user = "prometheus"
self.group = "prometheus"
def generate_download_script(self) -> str:
"""生成下载脚本"""
return f"""
#!/bin/bash
# Prometheus binary installation script
set -e
# Configuration variables
VERSION="{self.version}"
PLATFORM="{self.platform}"
INSTALL_DIR="{self.install_dir}"
CONFIG_DIR="{self.config_dir}"
DATA_DIR="{self.data_dir}"
USER="{self.user}"
GROUP="{self.group}"
echo "开始安装 Prometheus $VERSION..."
# 1. 创建用户和组
sudo groupadd --system $GROUP 2>/dev/null || true
sudo useradd --system --gid $GROUP --no-create-home --shell /bin/false $USER 2>/dev/null || true
# 2. Create directories
sudo mkdir -p $INSTALL_DIR $CONFIG_DIR $DATA_DIR
sudo chown $USER:$GROUP $INSTALL_DIR $CONFIG_DIR $DATA_DIR
# 3. Download and extract
cd /tmp
wget https://github.com/prometheus/prometheus/releases/download/v$VERSION/prometheus-$VERSION.$PLATFORM.tar.gz
tar xzf prometheus-$VERSION.$PLATFORM.tar.gz
# 4. Install the binaries
sudo cp prometheus-$VERSION.$PLATFORM/prometheus $INSTALL_DIR/
sudo cp prometheus-$VERSION.$PLATFORM/promtool $INSTALL_DIR/
sudo chown $USER:$GROUP $INSTALL_DIR/prometheus $INSTALL_DIR/promtool
sudo chmod +x $INSTALL_DIR/prometheus $INSTALL_DIR/promtool
# 5. Create symlinks
sudo ln -sf $INSTALL_DIR/prometheus /usr/local/bin/prometheus
sudo ln -sf $INSTALL_DIR/promtool /usr/local/bin/promtool
# 6. Copy the default configuration file
sudo cp prometheus-$VERSION.$PLATFORM/prometheus.yml $CONFIG_DIR/
sudo chown $USER:$GROUP $CONFIG_DIR/prometheus.yml
# 7. Clean up temporary files
rm -rf prometheus-$VERSION.$PLATFORM*
echo "Prometheus 安装完成!"
echo "配置文件位置: $CONFIG_DIR/prometheus.yml"
echo "数据目录: $DATA_DIR"
echo "安装目录: $INSTALL_DIR"
"""
def generate_systemd_service(self) -> str:
"""生成systemd服务文件"""
return f"""
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/docs/
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
User={self.user}
Group={self.group}
ExecReload=/bin/kill -HUP $MAINPID
ExecStart={self.install_dir}/prometheus \
--config.file={self.config_dir}/prometheus.yml \
--storage.tsdb.path={self.data_dir} \
--web.console.templates={self.install_dir}/consoles \
--web.console.libraries={self.install_dir}/console_libraries \
--web.listen-address=0.0.0.0:9090 \
--web.external-url=
SyslogIdentifier=prometheus
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
"""
def generate_docker_compose(self) -> str:
"""生成Docker Compose配置"""
return f"""
version: '3.8'
services:
prometheus:
image: prom/prometheus:v{self.version}
container_name: prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./rules:/etc/prometheus/rules
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.listen-address=0.0.0.0:9090'
- '--web.external-url='
- '--storage.tsdb.retention.time=15d'
- '--storage.tsdb.retention.size=10GB'
- '--web.enable-lifecycle'
restart: unless-stopped
networks:
- monitoring
node-exporter:
image: prom/node-exporter:v1.6.1
container_name: node-exporter
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
restart: unless-stopped
networks:
- monitoring
alertmanager:
image: prom/alertmanager:v0.25.0
container_name: alertmanager
ports:
- "9093:9093"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url='
restart: unless-stopped
networks:
- monitoring
grafana:
image: grafana/grafana:10.0.0
container_name: grafana
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
restart: unless-stopped
networks:
- monitoring
volumes:
prometheus_data:
alertmanager_data:
grafana_data:
networks:
monitoring:
driver: bridge
"""
def generate_kubernetes_manifests(self) -> Dict[str, str]:
"""生成Kubernetes部署清单"""
return {
"namespace.yaml": """
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
""",
"prometheus-configmap.yaml": """
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "/etc/prometheus/rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
""",
"prometheus-deployment.yaml": f"""
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: prom/prometheus:v{self.version}
ports:
- containerPort: 9090
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.listen-address=0.0.0.0:9090'
- '--web.external-url='
- '--storage.tsdb.retention.time=15d'
- '--storage.tsdb.retention.size=10GB'
- '--web.enable-lifecycle'
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus
- name: storage-volume
mountPath: /prometheus
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
volumes:
- name: config-volume
configMap:
name: prometheus-config
- name: storage-volume
persistentVolumeClaim:
claimName: prometheus-pvc
""",
"prometheus-service.yaml": """
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
type: ClusterIP
ports:
- port: 9090
targetPort: 9090
protocol: TCP
selector:
app: prometheus
""",
"prometheus-pvc.yaml": """
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-pvc
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
""",
"prometheus-rbac.yaml": """
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
"""
}
# Usage example
installer = PrometheusInstaller()
# Generate the installation script
install_script = installer.generate_download_script()
print("Binary installation script generated")
# Generate the systemd unit
systemd_service = installer.generate_systemd_service()
print("systemd unit file generated")
# Generate the Docker Compose file
docker_compose = installer.generate_docker_compose()
print("Docker Compose configuration generated")
# Generate the Kubernetes manifests
k8s_manifests = installer.generate_kubernetes_manifests()
print(f"Kubernetes manifests generated: {len(k8s_manifests)} files")
Basic Configuration
1. The Prometheus Configuration File
class PrometheusConfig:
"""Prometheus配置管理器"""
def __init__(self):
self.global_config = {}
self.scrape_configs = []
self.rule_files = []
self.alerting_config = {}
self.remote_write_configs = []
self.remote_read_configs = []
def create_basic_config(self) -> str:
"""创建基础配置文件"""
return """
# Prometheus basic configuration file
global:
# Global scrape interval
scrape_interval: 15s
# Global rule evaluation interval
evaluation_interval: 15s
# External labels (used for federation and remote storage)
external_labels:
cluster: 'production'
region: 'us-west-1'
# Rule file paths
rule_files:
- "rules/*.yml"
- "alerts/*.yml"
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
timeout: 10s
api_version: v2
# Scrape configurations
scrape_configs:
# Prometheus self-monitoring
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
scrape_interval: 5s
metrics_path: /metrics
# Node Exporter monitoring
- job_name: 'node-exporter'
static_configs:
- targets:
- 'node1:9100'
- 'node2:9100'
- 'node3:9100'
scrape_interval: 15s
# Application monitoring
- job_name: 'web-app'
static_configs:
- targets:
- 'app1:8080'
- 'app2:8080'
metrics_path: /actuator/prometheus
scrape_interval: 30s
# Database monitoring
- job_name: 'mysql-exporter'
static_configs:
- targets: ['mysql-exporter:9104']
# Redis monitoring
- job_name: 'redis-exporter'
static_configs:
- targets: ['redis-exporter:9121']
# Nginx monitoring
- job_name: 'nginx-exporter'
static_configs:
- targets: ['nginx-exporter:9113']
# Remote write configuration (optional)
# remote_write:
# - url: "https://prometheus-remote-write.example.com/api/v1/write"
# basic_auth:
# username: "user"
# password: "password"
# Remote read configuration (optional)
# remote_read:
# - url: "https://prometheus-remote-read.example.com/api/v1/read"
# basic_auth:
# username: "user"
# password: "password"
"""
def create_advanced_config(self) -> str:
"""创建高级配置文件"""
return """
# Prometheus advanced configuration file
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_timeout: 10s
external_labels:
cluster: 'production'
region: 'us-west-1'
environment: 'prod'
rule_files:
- "rules/recording_rules.yml"
- "rules/alerting_rules.yml"
- "rules/custom_rules.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager-1:9093
- alertmanager-2:9093
- alertmanager-3:9093
timeout: 10s
api_version: v2
path_prefix: /alertmanager
scrape_configs:
# Prometheus cluster monitoring
- job_name: 'prometheus'
static_configs:
- targets:
- 'prometheus-1:9090'
- 'prometheus-2:9090'
scrape_interval: 5s
honor_labels: true
# Service discovery - Consul
- job_name: 'consul-services'
consul_sd_configs:
- server: 'consul:8500'
services: []
relabel_configs:
- source_labels: [__meta_consul_tags]
regex: .*,prometheus,.*
action: keep
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_node]
target_label: instance
# Service discovery - Kubernetes
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
# Blackbox monitoring
- job_name: 'blackbox-http'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- https://example.com
- https://api.example.com
- https://app.example.com
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# SNMP monitoring
- job_name: 'snmp-devices'
static_configs:
- targets:
- 192.168.1.1 # Router
- 192.168.1.2 # Switch
metrics_path: /snmp
params:
module: [if_mib]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: snmp-exporter:9116
# Custom exporter
- job_name: 'custom-app'
static_configs:
- targets: ['custom-exporter:9200']
basic_auth:
username: 'prometheus'
password: 'secret'
tls_config:
ca_file: /etc/prometheus/certs/ca.pem
cert_file: /etc/prometheus/certs/client.pem
key_file: /etc/prometheus/certs/client-key.pem
insecure_skip_verify: false
# Federation configuration
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job=~"prometheus|node-exporter"}
- '{__name__=~"job:.*"}'
static_configs:
- targets:
- 'prometheus-dc1:9090'
- 'prometheus-dc2:9090'
# Remote storage configuration
remote_write:
- url: "https://cortex.example.com/api/prom/push"
basic_auth:
username: "prometheus"
password: "secret"
write_relabel_configs:
- source_labels: [__name__]
regex: 'go_.*'
action: drop
queue_config:
capacity: 10000
max_shards: 200
min_shards: 1
max_samples_per_send: 1000
batch_send_deadline: 5s
min_backoff: 30ms
max_backoff: 100ms
remote_read:
- url: "https://cortex.example.com/api/prom/read"
basic_auth:
username: "prometheus"
password: "secret"
read_recent: true
"""
def create_scrape_config_template(self, job_name: str, targets: List[str],
scrape_interval: str = "15s",
metrics_path: str = "/metrics",
scheme: str = "http") -> Dict:
"""创建抓取配置模板"""
return {
"job_name": job_name,
"static_configs": [
{"targets": targets}
],
"scrape_interval": scrape_interval,
"metrics_path": metrics_path,
"scheme": scheme
}
def validate_config(self, config_content: str) -> Dict[str, Any]:
"""Validate the configuration file"""
# Simulated configuration validation
validation_result = {
"valid": True,
"errors": [],
"warnings": [],
"suggestions": []
}
# Check required sections
required_sections = ["global", "scrape_configs"]
for section in required_sections:
if section not in config_content:
validation_result["errors"].append(f"Missing required configuration section: {section}")
validation_result["valid"] = False
# Check the scrape interval
if "scrape_interval: 1s" in config_content:
validation_result["warnings"].append("A very short scrape interval may cause performance problems")
# Offer optimization suggestions
if "scrape_timeout" not in config_content:
validation_result["suggestions"].append("Consider setting scrape_timeout to avoid scrape timeouts")
return validation_result
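# String matching (as in validate_config above) is only a rough check. In practice,
# a configuration is better validated with promtool, which ships with Prometheus.
# Minimal sketch, assuming promtool is on the PATH; check_config_with_promtool is a
# helper added here for illustration only.
import subprocess
import tempfile

def check_config_with_promtool(config_content: str) -> bool:
    """Write the config to a temporary file and run `promtool check config` on it."""
    with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as handle:
        handle.write(config_content)
        config_path = handle.name
    result = subprocess.run(
        ["promtool", "check", "config", config_path],
        capture_output=True,
        text=True,
    )
    print(result.stdout or result.stderr)
    return result.returncode == 0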
# Usage example
config_manager = PrometheusConfig()
# Create the basic configuration
basic_config = config_manager.create_basic_config()
print("Basic configuration file generated")
# Create the advanced configuration
advanced_config = config_manager.create_advanced_config()
print("Advanced configuration file generated")
# Create a custom scrape configuration
custom_scrape = config_manager.create_scrape_config_template(
job_name="my-app",
targets=["app1:8080", "app2:8080"],
scrape_interval="30s",
metrics_path="/metrics"
)
print(f"Custom scrape configuration: {custom_scrape}")
# Validate the configuration
validation = config_manager.validate_config(basic_config)
print(f"Validation result: {validation}")
Data Model and Metric Types
1. The Time Series Data Model
class TimeSeriesModel:
"""Time series data model"""
def __init__(self):
self.metric_name = ""
self.labels = {}
self.samples = []
def explain_data_model(self) -> str:
"""Explain the Prometheus data model"""
return """
The Prometheus time series data model:
1. Metric name
- Describes the measured aspect of the system
- Format: [a-zA-Z_:][a-zA-Z0-9_:]*
- Examples: http_requests_total, cpu_usage_percent
2. Labels
- Distinguish different dimensions of the same metric
- Format: {label_name="label_value"}
- Example: {method="GET", status="200", instance="web-1"}
3. Samples
- Each sample consists of a timestamp and a value
- Timestamp: Unix timestamp with millisecond precision
- Value: a 64-bit floating-point number
4. Time series identity
- Metric name + label set = one unique time series
- Example: http_requests_total{method="GET", status="200"}
Data format example (metric name, label set, sample value, timestamp):
http_requests_total{method="GET", status="200", instance="web-1"} 1234 @1609459200
"""
def create_metric_examples(self) -> Dict[str, List[str]]:
"""创建指标示例"""
return {
"Counter": [
"http_requests_total{method='GET', status='200'} 1234",
"errors_total{type='timeout', service='api'} 56",
"bytes_sent_total{instance='web-1'} 1048576"
],
"Gauge": [
"cpu_usage_percent{cpu='0', mode='user'} 45.2",
"memory_usage_bytes{instance='db-1'} 2147483648",
"temperature_celsius{sensor='cpu', location='server-room'} 68.5"
],
"Histogram": [
"http_request_duration_seconds_bucket{le='0.1'} 100",
"http_request_duration_seconds_bucket{le='0.5'} 200",
"http_request_duration_seconds_bucket{le='+Inf'} 250",
"http_request_duration_seconds_sum 45.2",
"http_request_duration_seconds_count 250"
],
"Summary": [
"http_request_duration_seconds{quantile='0.5'} 0.12",
"http_request_duration_seconds{quantile='0.9'} 0.35",
"http_request_duration_seconds{quantile='0.99'} 0.8",
"http_request_duration_seconds_sum 45.2",
"http_request_duration_seconds_count 250"
]
}
class MetricTypeExplainer:
"""Metric type explainer"""
def explain_counter(self) -> str:
"""Explain the Counter metric type"""
return """
Counter:
Characteristics:
- Can only increase, or be reset to 0
- Used for cumulative values
- Resets when the process restarts
Typical uses:
- Total HTTP requests
- Total errors
- Total tasks processed
- Total bytes sent
Example:
# HELP http_requests_total Total number of HTTP requests
# TYPE http_requests_total counter
http_requests_total{method="GET", status="200"} 1234
http_requests_total{method="POST", status="201"} 567
http_requests_total{method="GET", status="404"} 89
Example PromQL queries:
# Per-second request rate
rate(http_requests_total[5m])
# Request increase over the last 5 minutes
increase(http_requests_total[5m])
# Request rate grouped by status code
sum(rate(http_requests_total[5m])) by (status)
Notes:
- Do not use a counter for values that can decrease
- Process counters with rate() or increase()
- Counters reset after a restart; rate() and increase() account for such resets
"""
def explain_gauge(self) -> str:
"""Explain the Gauge metric type"""
return """
Gauge:
Characteristics:
- Can go up and down arbitrarily
- Represents an instantaneous value
- Suitable for current state
Typical uses:
- CPU usage
- Memory usage
- Temperature
- Current number of connections
- Queue length
Example:
# HELP cpu_usage_percent Current CPU usage percentage
# TYPE cpu_usage_percent gauge
cpu_usage_percent{cpu="0", mode="user"} 45.2
cpu_usage_percent{cpu="0", mode="system"} 12.8
cpu_usage_percent{cpu="1", mode="user"} 38.9
Example PromQL queries:
# Current CPU usage
cpu_usage_percent
# Average CPU usage per instance
avg(cpu_usage_percent) by (instance)
# Maximum memory usage
max(memory_usage_percent)
# Predict the value one hour ahead
predict_linear(disk_usage_percent[1h], 3600)
Notes:
- Use the raw value directly
- No need for rate()
- Well suited to alert thresholds
"""
def explain_histogram(self) -> str:
"""Explain the Histogram metric type"""
return """
Histogram:
Characteristics:
- Assigns observations to predefined buckets
- Provides cumulative counts per bucket
- Includes a running sum and a count
Components:
- _bucket{le="x"}: number of observations less than or equal to x
- _sum: sum of all observed values
- _count: total number of observations
Typical uses:
- Request latency distribution
- Response size distribution
- Task execution time distribution
Example:
# HELP http_request_duration_seconds HTTP request latency
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{le="0.1"} 100
http_request_duration_seconds_bucket{le="0.5"} 200
http_request_duration_seconds_bucket{le="1.0"} 240
http_request_duration_seconds_bucket{le="+Inf"} 250
http_request_duration_seconds_sum 45.2
http_request_duration_seconds_count 250
Example PromQL queries:
# 95th-percentile latency
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Average latency
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])
# SLA: percentage of requests completed within 1 second
(
rate(http_request_duration_seconds_bucket{le="1.0"}[5m]) /
rate(http_request_duration_seconds_count[5m])
) * 100
Bucket design tips:
- Use exponentially growing bucket boundaries
- Cover the expected value range
- Consider how the data will be queried
"""
def explain_summary(self) -> str:
"""Explain the Summary metric type"""
return """
Summary:
Characteristics:
- Quantiles are computed on the client side
- Provides precise quantile values
- Includes a running sum and a count
Components:
- {quantile="x"}: value of the x quantile
- _sum: sum of all observed values
- _count: total number of observations
Typical uses:
- Scenarios that require precise quantiles
- Clients with sufficient compute capacity
- Fixed, known quantile requirements
Example:
# HELP http_request_duration_seconds HTTP request latency
# TYPE http_request_duration_seconds summary
http_request_duration_seconds{quantile="0.5"} 0.12
http_request_duration_seconds{quantile="0.9"} 0.35
http_request_duration_seconds{quantile="0.99"} 0.8
http_request_duration_seconds_sum 45.2
http_request_duration_seconds_count 250
Example PromQL queries:
# Read a quantile directly
http_request_duration_seconds{quantile="0.99"}
# Average latency
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])
# Request rate
rate(http_request_duration_seconds_count[5m])
Histogram vs Summary:
Histogram advantages:
- Can be aggregated across instances
- Arbitrary quantiles can be computed at query time
- Quantiles are computed on the server
Summary advantages:
- Precise quantiles
- Client-side computation reduces server load
- Fixed memory usage
Choosing between them:
- Need aggregation: use a Histogram
- Need precise quantiles: use a Summary
- Distributed systems: prefer Histograms
"""
# Usage example
ts_model = TimeSeriesModel()
metric_explainer = MetricTypeExplainer()
# Explain the data model
print(ts_model.explain_data_model())
# Create metric examples
examples = ts_model.create_metric_examples()
for metric_type, metric_examples in examples.items():
print(f"\n{metric_type} examples:")
for example in metric_examples:
print(f" {example}")
# Explain each metric type
print("\n" + "="*50)
print(metric_explainer.explain_counter())
print("\n" + "="*50)
print(metric_explainer.explain_gauge())
print("\n" + "="*50)
print(metric_explainer.explain_histogram())
print("\n" + "="*50)
print(metric_explainer.explain_summary())
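To see all four metric types in code rather than in exposition format, the sketch below instruments a fictional request handler with the official Python client library, assuming it is installed (pip install prometheus-client); every metric name here is made up for illustration. Prometheus can then scrape the exposed endpoint on port 8000.

from prometheus_client import Counter, Gauge, Histogram, Summary, start_http_server
import random
import time

# One metric of each type for a fictional request handler.
REQUESTS = Counter("demo_requests_total", "Total requests handled", ["method"])
IN_PROGRESS = Gauge("demo_in_progress_requests", "Requests currently being handled")
LATENCY_HISTOGRAM = Histogram(
    "demo_request_duration_seconds", "Request latency", buckets=(0.1, 0.5, 1.0, 2.5)
)
LATENCY_SUMMARY = Summary("demo_request_latency_seconds", "Request latency (summary)")

def handle_request() -> None:
    REQUESTS.labels(method="GET").inc()   # Counter: only ever increases
    IN_PROGRESS.inc()                     # Gauge: goes up and down
    duration = random.uniform(0.05, 1.5)
    LATENCY_HISTOGRAM.observe(duration)   # Histogram: observation lands in a bucket
    LATENCY_SUMMARY.observe(duration)     # Summary: _sum/_count (the Python client does not compute quantiles by default)
    IN_PROGRESS.dec()

if __name__ == "__main__":
    start_http_server(8000)               # expose /metrics on :8000 for Prometheus to scrape
    while True:
        handle_request()
        time.sleep(1)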
Summary
Key Takeaways
Understanding the Prometheus architecture
- The core components and their roles
- How data flows through the system
- Distributed architecture design
Deployment options
- Binary installation: suited to traditional environments
- Docker deployment: suited to containerized environments
- Kubernetes deployment: suited to cloud-native environments
Configuration management
- Global configuration parameters
- Scrape target configuration
- Service discovery configuration
- Remote storage configuration
Mastering the data model
- Time series identity
- The four metric types
- Label usage conventions
- Understanding the data format
Best Practices
Architecture design
- Plan component placement carefully
- Design for high availability
- Define a data retention policy
- Plan storage capacity
Configuration tuning
- Choose sensible scrape intervals
- Use labels efficiently
- Configure appropriate timeouts
- Use service discovery
Monitoring strategy
- Choose the right metric type for each measurement
- Design meaningful labels
- Avoid high-cardinality labels
- Regularly clean up unused metrics
Next Steps
Go deeper into the PromQL query language
- Master the basic query syntax
- Learn the aggregation functions
- Understand time series operations
Explore the exporter ecosystem
- Get to know the common exporters
- Learn to build custom exporters
- Master service discovery configuration
Configure alerting rules
- Learn the alerting rule syntax
- Configure Alertmanager
- Design an alerting strategy
Performance optimization
- Monitor Prometheus itself
- Optimize query efficiency
- Configure remote storage
After completing this chapter, you should be able to:
- Understand the core concepts and architecture of Prometheus
- Install and configure Prometheus successfully
- Write basic configuration files
- Understand the time series data model and metric types
Continue with the following chapters to build a complete monitoring system!