概述
数据可视化是监控系统的重要组成部分,它将复杂的指标数据转化为直观的图表和仪表板,帮助运维人员快速理解系统状态、发现问题并做出决策。Grafana 作为业界领先的开源可视化平台,与 Prometheus 完美集成,提供了强大的数据展示和分析能力。
学习目标
通过本章学习,你将掌握:
- 🎨 Grafana 基础配置和数据源管理
- 📊 创建和定制各种类型的可视化图表
- 📋 构建综合性的监控仪表板
- 🔍 高级查询和数据分析技巧
- ⚡ 性能优化和最佳实践
- 🚨 可视化告警和通知集成
- 🔧 自动化部署和管理策略
Grafana 基础配置
GrafanaConfigManager 类
from typing import Dict, List, Any, Optional
from enum import Enum
from dataclasses import dataclass
import json
import yaml
class DataSourceType(Enum):
    """Enumeration of the data source type identifiers used by this module.

    Each member's value is the lowercase type string placed in a data source
    definition (for example ``"prometheus"``).
    """

    PROMETHEUS = "prometheus"
    INFLUXDB = "influxdb"
    ELASTICSEARCH = "elasticsearch"
    MYSQL = "mysql"
    POSTGRES = "postgres"
    CLOUDWATCH = "cloudwatch"
    GRAPHITE = "graphite"
    LOKI = "loki"
    JAEGER = "jaeger"
    ZIPKIN = "zipkin"
class PanelType(Enum):
    """Enumeration of the panel type identifiers used by this module.

    Each member's value is the string stored in a panel's ``"type"`` field.
    Note that ``NODE_GRAPH`` is camelCase (``"nodeGraph"``) while the other
    values are lowercase.
    """

    GRAPH = "graph"
    SINGLESTAT = "singlestat"
    TABLE = "table"
    HEATMAP = "heatmap"
    GAUGE = "gauge"
    BAR_GAUGE = "bargauge"
    STAT = "stat"
    PIE_CHART = "piechart"
    WORLDMAP = "worldmap"
    TEXT = "text"
    LOGS = "logs"
    NODE_GRAPH = "nodeGraph"
@dataclass
class DataSource:
    """Configuration record for a single data source.

    Only ``name``, ``type`` and ``url`` are required; every other field has a
    default. ``json_data`` and ``secure_json_data`` default to ``None`` (not
    an empty dict), so callers must check for ``None`` before indexing them.
    """

    name: str
    type: DataSourceType
    url: str
    access: str = "proxy"
    is_default: bool = False
    basic_auth: bool = False
    basic_auth_user: str = ""
    basic_auth_password: str = ""
    with_credentials: bool = False
    # Annotated Optional because the default really is None; the previous
    # plain Dict annotation contradicted the default value.
    json_data: Optional[Dict[str, Any]] = None
    secure_json_data: Optional[Dict[str, Any]] = None
class GrafanaConfigManager:
    """Generate Grafana-related configuration artifacts in memory.

    The methods return plain dictionaries or text (data source definitions,
    a ``grafana.ini`` body, provisioning YAML and a Docker Compose file).
    Nothing is written to disk and no method reads the instance attributes
    initialised in ``__init__``.
    """

    def __init__(self):
        # Empty registries; none of the generator methods below populate them.
        self.data_sources = []
        self.dashboards = []
        self.organizations = []
        self.users = []

    def create_prometheus_datasource(self,
                                     name: str = "Prometheus",
                                     url: str = "http://prometheus:9090",
                                     scrape_interval: str = "15s") -> Dict[str, Any]:
        """Return a Prometheus data source definition.

        Args:
            name: Display name of the data source.
            url: Base URL of the Prometheus server.
            scrape_interval: Value stored as ``jsonData.timeInterval``.

        Returns:
            A dict that is always marked ``isDefault`` and links exemplar
            trace ids to the data source uid ``"jaeger"``.
        """
        json_data = {
            "httpMethod": "POST",
            "timeInterval": scrape_interval,
            "queryTimeout": "60s",
            "exemplarTraceIdDestinations": [
                {"name": "trace_id", "datasourceUid": "jaeger"},
            ],
        }
        return {
            "name": name,
            "type": "prometheus",
            "access": "proxy",
            "url": url,
            "isDefault": True,
            "jsonData": json_data,
            "secureJsonData": {},
            "version": 1,
            "editable": True,
        }

    def create_loki_datasource(self,
                               name: str = "Loki",
                               url: str = "http://loki:3100") -> Dict[str, Any]:
        """Return a Loki data source definition.

        Args:
            name: Display name of the data source.
            url: Base URL of the Loki server.

        Returns:
            A dict with one derived field that extracts ``trace_id=<word>``
            from log lines and links it to the data source uid ``"jaeger"``.
        """
        trace_link = {
            "matcherRegex": r"trace_id=(\w+)",
            "name": "TraceID",
            "url": "${__value.raw}",
            "datasourceUid": "jaeger",
        }
        return {
            "name": name,
            "type": "loki",
            "access": "proxy",
            "url": url,
            "jsonData": {
                "maxLines": 1000,
                "derivedFields": [trace_link],
            },
            "version": 1,
            "editable": True,
        }

    def create_jaeger_datasource(self,
                                 name: str = "Jaeger",
                                 url: str = "http://jaeger:16686") -> Dict[str, Any]:
        """Return a Jaeger data source definition.

        Args:
            name: Display name of the data source.
            url: Base URL of the Jaeger query service.

        Returns:
            A dict with trace-to-logs correlation pointing at the data source
            uid ``"loki"`` and the node graph enabled.
        """
        traces_to_logs = {
            "datasourceUid": "loki",
            "tags": ["job", "instance", "pod", "namespace"],
            "mappedTags": [
                {"key": "service.name", "value": "service"},
            ],
            "mapTagNamesEnabled": True,
            "spanStartTimeShift": "1h",
            "spanEndTimeShift": "1h",
        }
        return {
            "name": name,
            "type": "jaeger",
            "access": "proxy",
            "url": url,
            "jsonData": {
                "tracesToLogs": traces_to_logs,
                "nodeGraph": {"enabled": True},
            },
            "version": 1,
            "editable": True,
        }

    def create_grafana_config(self) -> str:
        """Return the text of a ``grafana.ini`` main configuration file.

        The ini value of ``content_security_policy_template`` is itself wrapped
        in triple double quotes. Those quotes are backslash-escaped below so
        they do not terminate the surrounding Python string literal (which is
        what the previous, truncated line did).

        NOTE(review): ``admin_password = admin`` and the fixed ``secret_key``
        are placeholder values; replace them before any real deployment.
        """
        return """
# Grafana 配置文件 (grafana.ini)
[default]
instance_name = prometheus-monitoring
[paths]
data = /var/lib/grafana
logs = /var/log/grafana
plugins = /var/lib/grafana/plugins
provisioning = /etc/grafana/provisioning
[server]
protocol = http
http_addr = 0.0.0.0
http_port = 3000
domain = localhost
enforce_domain = false
root_url = %(protocol)s://%(domain)s:%(http_port)s/
serve_from_sub_path = false
router_logging = false
static_root_path = public
enable_gzip = false
cert_file =
cert_key =
socket = /tmp/grafana.sock
cdn_url =
read_timeout = 0
[database]
type = sqlite3
host = 127.0.0.1:3306
name = grafana
user = root
password =
url =
ssl_mode = disable
ca_cert_path =
client_key_path =
client_cert_path =
server_cert_name =
path = /var/lib/grafana/grafana.db
max_idle_conn = 2
max_open_conn =
conn_max_lifetime = 14400
log_queries =
cache_mode = private
[session]
provider = file
provider_config = sessions
cookie_name = grafana_sess
cookie_secure = false
session_life_time = 86400
gc_interval_time = 86400
conn_max_lifetime = 14400
[dataproxy]
logging = false
timeout = 30
dialTimeout = 10
keep_alive_seconds = 30
tls_handshake_timeout_seconds = 10
expect_continue_timeout_seconds = 1
max_conns_per_host = 0
max_idle_connections = 100
idle_conn_timeout_seconds = 90
send_user_header = false
[analytics]
reporting_enabled = false
check_for_updates = false
google_analytics_ua_id =
google_tag_manager_id =
rudderstack_write_key =
rudderstack_data_plane_url =
rudderstack_sdk_url =
application_insights_connection_string =
application_insights_endpoint_url =
[security]
admin_user = admin
admin_password = admin
secret_key = SW2YcwTIb9zpOOhoPsMm
login_remember_days = 7
cookie_username = grafana_user
cookie_remember_name = grafana_remember
disable_gravatar = false
data_source_proxy_whitelist =
disable_brute_force_login_protection = false
cookie_samesite = lax
allow_embedding = false
strict_transport_security = false
strict_transport_security_max_age_seconds = 86400
strict_transport_security_preload = false
strict_transport_security_subdomains = false
x_content_type_options = true
x_xss_protection = true
content_security_policy = false
content_security_policy_template = \"\"\"script-src 'self' 'unsafe-eval' 'unsafe-inline' 'strict-dynamic' $NONCE;object-src 'none';font-src 'self';style-src 'self' 'unsafe-inline' blob:;img-src * data:;base-uri 'self';connect-src 'self' grafana.com ws://$ROOT_PATH wss://$ROOT_PATH;manifest-src 'self';media-src 'none';form-action 'self';\"\"\"
[snapshots]
external_enabled = true
external_snapshot_url = https://snapshots-origin.raintank.io
external_snapshot_name = Publish to snapshot.raintank.io
remove_expired = true
ttl_days = 90
[dashboards]
versions_to_keep = 20
min_refresh_interval = 5s
default_home_dashboard_path =
[users]
allow_sign_up = false
allow_org_create = false
auto_assign_org = true
auto_assign_org_id = 1
auto_assign_org_role = Viewer
verify_email_enabled = false
login_hint = email or username
password_hint = password
default_theme = dark
home_page =
external_manage_link_url =
external_manage_link_name =
external_manage_info =
viewers_can_edit = false
editors_can_admin = false
user_invite_max_lifetime_duration = 24h
hidden_users =
[auth]
login_cookie_name = grafana_session
login_maximum_inactive_lifetime_duration =
login_maximum_lifetime_duration =
token_rotation_interval_minutes = 10
disable_login_form = false
disable_signout_menu = false
signout_redirect_url =
oauth_auto_login = false
oauth_state_cookie_max_age = 600
api_key_max_seconds_to_live = -1
sigv4_auth_enabled = false
[auth.anonymous]
enabled = false
org_name = Main Org.
org_role = Viewer
hide_version = false
[auth.github]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email,read:org
auth_url = https://github.com/login/oauth/authorize
token_url = https://github.com/login/oauth/access_token
api_url = https://api.github.com/user
team_ids =
allowed_organizations =
allowed_domains =
[auth.google]
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret = some_client_secret
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
api_url = https://www.googleapis.com/oauth2/v1/userinfo
allowed_domains =
hosted_domain =
[auth.ldap]
enabled = false
config_file = /etc/grafana/ldap.toml
allow_sign_up = true
sync_cron = "0 0 1 * * *"
active_sync_enabled = true
[smtp]
enabled = false
host = localhost:587
user =
password =
cert_file =
key_file =
skip_verify = false
from_address = admin@grafana.localhost
from_name = Grafana
ehlo_identity = dashboard.example.com
startTLS_policy =
[emails]
welcome_email_on_sign_up = false
templates_pattern = emails/*.html, emails/*.txt
content_types = text/html
[log]
mode = console
level = info
filters =
[log.console]
level =
format = console
[log.file]
level =
format = text
log_rotate = true
max_lines = 1000000
max_size_shift = 28
daily_rotate = true
max_days = 7
[log.syslog]
level =
format = text
network =
address =
facility =
tag =
[metrics]
enabled = true
interval_seconds = 10
disable_total_stats = false
basic_auth_username =
basic_auth_password =
[metrics.graphite]
address =
prefix = prod.grafana.%(instance_name)s.
[tracing.jaeger]
address = localhost:6831
always_included_tag =
sampler_type = const
sampler_param = 1
sampling_server_url =
process_tags =
[grafana_net]
url = https://grafana.net
[external_image_storage]
provider =
[external_image_storage.s3]
endpoint =
path_style_access =
bucket_url =
bucket =
region =
path =
access_key =
secret_key =
[external_image_storage.webdav]
url =
username =
password =
public_url =
[external_image_storage.gcs]
key_file =
bucket =
path =
[external_image_storage.azure_blob]
account_name =
account_key =
container_name =
[external_image_storage.local]
path =
[rendering]
server_url =
callback_url =
concurrent_render_request_limit = 30
[panels]
enable_alpha = false
disable_sanitize_html = false
[plugins]
enable_alpha = false
app_tls_skip_verify_insecure = false
allow_loading_unsigned_plugins =
marketplace_url = https://grafana.com/grafana/plugins/
[live]
max_connections = 100
allowed_origins =
[feature_toggles]
enable =
"""

    def create_provisioning_config(self) -> Dict[str, str]:
        """Return the provisioning files as a ``{filename: yaml_text}`` mapping.

        The mapping has three entries: ``datasources.yml``, ``dashboards.yml``
        and ``notifiers.yml``. The YAML nesting is significant, so the text is
        kept flush-left inside the string literals.
        """
        # Data sources. The regex uses a single-quoted YAML scalar because
        # "\\w" is not a valid escape inside a double-quoted YAML scalar.
        datasources_config = """
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    jsonData:
      httpMethod: POST
      timeInterval: 15s
      queryTimeout: 60s
      exemplarTraceIdDestinations:
        - name: trace_id
          datasourceUid: jaeger
    version: 1
    editable: true
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    jsonData:
      maxLines: 1000
      derivedFields:
        - matcherRegex: 'trace_id=(\\w+)'
          name: TraceID
          url: "${__value.raw}"
          datasourceUid: jaeger
    version: 1
    editable: true
  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    jsonData:
      tracesToLogs:
        datasourceUid: loki
        tags: ["job", "instance", "pod", "namespace"]
        mappedTags:
          - key: service.name
            value: service
        mapTagNamesEnabled: true
        spanStartTimeShift: 1h
        spanEndTimeShift: 1h
      nodeGraph:
        enabled: true
    version: 1
    editable: true
  - name: AlertManager
    type: alertmanager
    access: proxy
    url: http://alertmanager:9093
    jsonData:
      implementation: prometheus
    version: 1
    editable: true
"""
        # Dashboard providers: one file-based provider per folder.
        dashboards_config = """
apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
  - name: 'infrastructure'
    orgId: 1
    folder: 'Infrastructure'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards/infrastructure
  - name: 'applications'
    orgId: 1
    folder: 'Applications'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards/applications
  - name: 'business'
    orgId: 1
    folder: 'Business Metrics'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards/business
"""
        # Notification channels (email, Slack, webhook).
        notifiers_config = """
apiVersion: 1
notifiers:
  - name: email-notifications
    type: email
    uid: email001
    org_id: 1
    is_default: true
    send_reminder: true
    disable_resolve_message: false
    frequency: 10m
    settings:
      addresses: "admin@example.com;ops@example.com"
      subject: "Grafana Alert - {{ .GroupLabels.alertname }}"
      body: |
        {{ range .Alerts }}
        Alert: {{ .Annotations.summary }}
        Description: {{ .Annotations.description }}
        Value: {{ .ValueString }}
        {{ end }}
  - name: slack-notifications
    type: slack
    uid: slack001
    org_id: 1
    is_default: false
    send_reminder: true
    disable_resolve_message: false
    frequency: 5m
    settings:
      url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
      channel: "#alerts"
      username: "Grafana"
      title: "Grafana Alert"
      text: |
        {{ range .Alerts }}
        *Alert:* {{ .Annotations.summary }}
        *Description:* {{ .Annotations.description }}
        *Value:* {{ .ValueString }}
        {{ end }}
  - name: webhook-notifications
    type: webhook
    uid: webhook001
    org_id: 1
    is_default: false
    send_reminder: false
    disable_resolve_message: false
    frequency: 1m
    settings:
      url: "http://webhook-service:8080/alerts"
      httpMethod: "POST"
      username: ""
      password: ""
      authorization_scheme: ""
      authorization_credentials: ""
      maxAlerts: 0
"""
        return {
            "datasources.yml": datasources_config,
            "dashboards.yml": dashboards_config,
            "notifiers.yml": notifiers_config,
        }

    def create_docker_compose_config(self) -> str:
        """Return the text of a Docker Compose file for the monitoring stack.

        The file declares five services (grafana, prometheus, alertmanager,
        node-exporter, cadvisor), three named volumes and one bridge network.

        NOTE(review): the Grafana admin password in the environment section is
        a hard-coded placeholder; override it outside of demos.
        """
        return """
version: '3.8'
services:
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-worldmap-panel,grafana-piechart-panel
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
      - ./grafana/dashboards:/var/lib/grafana/dashboards
      - ./grafana/grafana.ini:/etc/grafana/grafana.ini
    networks:
      - monitoring
    depends_on:
      - prometheus
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.grafana.rule=Host(`grafana.localhost`)"
      - "traefik.http.routers.grafana.entrypoints=web"
      - "traefik.http.services.grafana.loadbalancer.server.port=3000"
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - ./prometheus:/etc/prometheus
      - prometheus-storage:/prometheus
    networks:
      - monitoring
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager
      - alertmanager-storage:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - monitoring
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    ports:
      - "9100:9100"
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    networks:
      - monitoring
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    networks:
      - monitoring
volumes:
  grafana-storage:
  prometheus-storage:
  alertmanager-storage:
networks:
  monitoring:
    driver: bridge
"""
# Usage example: build every artifact once and report its size.
config_manager = GrafanaConfigManager()

# Data source definitions (plain dicts).
prometheus_ds = config_manager.create_prometheus_datasource()
loki_ds = config_manager.create_loki_datasource()
jaeger_ds = config_manager.create_jaeger_datasource()

# Configuration file bodies (text, or a filename -> text mapping).
grafana_config = config_manager.create_grafana_config()
provisioning_configs = config_manager.create_provisioning_config()
docker_compose = config_manager.create_docker_compose_config()

# Emit the same seven report lines in one call.
print("\n".join([
    "Grafana 配置已生成",
    f"Prometheus 数据源配置: {len(json.dumps(prometheus_ds))} 字符",
    f"Loki 数据源配置: {len(json.dumps(loki_ds))} 字符",
    f"Jaeger 数据源配置: {len(json.dumps(jaeger_ds))} 字符",
    f"主配置文件长度: {len(grafana_config)} 字符",
    f"自动配置文件数量: {len(provisioning_configs)}",
    f"Docker Compose 配置长度: {len(docker_compose)} 字符",
]))
仪表板设计与创建
DashboardManager 类
class DashboardManager:
    """Build Grafana dashboard definitions as plain dictionaries.

    The three ``create_*_dashboard`` methods each return a ``{"dashboard":
    {...}}`` wrapper. Repeated panel boilerplate lives in the private static
    helpers so that every stat panel shares one definition of its layout and
    display options.
    """

    def __init__(self):
        # Empty registries; the builder methods below do not populate them.
        self.dashboards = []
        self.panels = []
        self.variables = []

    @staticmethod
    def _target(expr, legend, ref_id="A"):
        """Return one query target (expression, legend format, reference id)."""
        return {"expr": expr, "legendFormat": legend, "refId": ref_id}

    @staticmethod
    def _row_panel(title):
        """Return the full-width, expanded row panel (id 1) that heads a dashboard."""
        return {
            "id": 1,
            "title": title,
            "type": "row",
            "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
            "collapsed": False,
        }

    @staticmethod
    def _stat_panel(panel_id, title, x, targets, unit, steps, percent_range=False):
        """Return a 6x8 stat panel placed on grid row 1 at column ``x``.

        ``steps`` is a sequence of ``(color, value)`` pairs; ``percent_range``
        adds ``min``/``max`` of 0/100 between ``unit`` and ``thresholds``.
        """
        defaults = {"unit": unit}
        if percent_range:
            defaults["min"] = 0
            defaults["max"] = 100
        defaults["thresholds"] = {
            "steps": [{"color": color, "value": value} for color, value in steps]
        }
        return {
            "id": panel_id,
            "title": title,
            "type": "stat",
            "gridPos": {"h": 8, "w": 6, "x": x, "y": 1},
            "targets": targets,
            "fieldConfig": {"defaults": defaults},
            "options": {
                "reduceOptions": {
                    "values": False,
                    "calcs": ["lastNotNull"],
                    "fields": "",
                },
                "orientation": "auto",
                "textMode": "auto",
                "colorMode": "value",
                "graphMode": "area",
                "justifyMode": "auto",
            },
        }

    @staticmethod
    def _graph_panel(panel_id, title, grid_pos, targets, y_axes, **extra):
        """Return a graph panel; ``extra`` keys are appended after ``yAxes``."""
        panel = {
            "id": panel_id,
            "title": title,
            "type": "graph",
            "gridPos": grid_pos,
            "targets": targets,
            "yAxes": y_axes,
        }
        panel.update(extra)
        return panel

    @staticmethod
    def _query_variable(name, query, **extra):
        """Return a multi-select query variable; ``extra`` keys are appended last."""
        variable = {
            "name": name,
            "type": "query",
            "query": query,
            "refresh": 1,
            "includeAll": True,
            "multi": True,
        }
        variable.update(extra)
        return variable

    def create_infrastructure_dashboard(self) -> Dict[str, Any]:
        """Return the "Infrastructure Overview" dashboard.

        Contains a row header plus four stat panels (CPU, memory, root-disk
        usage in percent, and network throughput in bits per second).
        """
        target = self._target
        panels = [
            self._row_panel("System Overview"),
            self._stat_panel(
                2, "CPU Usage", 0,
                [target('100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)',
                        "{{instance}}")],
                "percent",
                [("green", None), ("yellow", 70), ("red", 90)],
                percent_range=True,
            ),
            self._stat_panel(
                3, "Memory Usage", 6,
                [target("(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
                        "{{instance}}")],
                "percent",
                [("green", None), ("yellow", 80), ("red", 95)],
                percent_range=True,
            ),
            self._stat_panel(
                4, "Disk Usage", 12,
                [target('100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / '
                        'node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"})',
                        "{{instance}}")],
                "percent",
                [("green", None), ("yellow", 80), ("red", 90)],
                percent_range=True,
            ),
            self._stat_panel(
                5, "Network Traffic", 18,
                [
                    # Byte counters are multiplied by 8 to match the "bps" unit.
                    target('rate(node_network_receive_bytes_total{device!="lo"}[5m]) * 8',
                           "RX {{device}}", "A"),
                    target('rate(node_network_transmit_bytes_total{device!="lo"}[5m]) * 8',
                           "TX {{device}}", "B"),
                ],
                "bps",
                # Thresholds at 100 Mbps and 1 Gbps.
                [("green", None), ("yellow", 100000000), ("red", 1000000000)],
            ),
        ]
        return {
            "dashboard": {
                "id": None,
                "title": "Infrastructure Overview",
                "tags": ["infrastructure", "overview"],
                "timezone": "browser",
                "panels": panels,
                "time": {"from": "now-1h", "to": "now"},
                "timepicker": {},
                "templating": {
                    "list": [
                        self._query_variable(
                            "instance", "label_values(up, instance)", allValue=".*"
                        ),
                    ]
                },
                "annotations": {
                    "list": [
                        {
                            "name": "Annotations & Alerts",
                            "enable": True,
                            "iconColor": "rgba(0, 211, 255, 1)",
                            "type": "dashboard",
                            "builtIn": 1,
                            "hide": True,
                        }
                    ]
                },
                "refresh": "30s",
                "schemaVersion": 27,
                "version": 1,
                "links": [],
            }
        }

    def create_application_dashboard(self) -> Dict[str, Any]:
        """Return the "Application Performance Monitoring" dashboard.

        Contains a row header, an HTTP request-rate graph, error-rate and
        p95 response-time stat panels, and graphs for the database connection
        pool and JVM memory.
        """
        target = self._target
        hidden_second_axis = {"show": False}
        panels = [
            self._row_panel("Application Overview"),
            self._graph_panel(
                2, "HTTP Request Rate",
                {"h": 8, "w": 12, "x": 0, "y": 1},
                [target("sum(rate(http_requests_total[5m])) by (service, method, status)",
                        "{{service}} {{method}} {{status}}")],
                [{"label": "Requests/sec", "min": 0}, dict(hidden_second_axis)],
                xAxis={"show": True},
                legend={
                    "show": True,
                    "values": True,
                    "current": True,
                    "max": True,
                    "avg": True,
                },
            ),
            self._stat_panel(
                3, "HTTP Error Rate", 12,
                [target('sum(rate(http_requests_total{status=~"5.."}[5m])) / '
                        'sum(rate(http_requests_total[5m])) * 100',
                        "Error Rate")],
                "percent",
                [("green", None), ("yellow", 1), ("red", 5)],
                percent_range=True,
            ),
            self._stat_panel(
                4, "Response Time", 18,
                [target("histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
                        "95th percentile")],
                "s",
                [("green", None), ("yellow", 0.5), ("red", 1)],
            ),
            self._graph_panel(
                5, "Database Connection Pool",
                {"h": 8, "w": 12, "x": 0, "y": 9},
                [
                    target("db_connections_active", "Active Connections", "A"),
                    target("db_connections_idle", "Idle Connections", "B"),
                    target("db_connections_max", "Max Connections", "C"),
                ],
                [{"label": "Connections", "min": 0}, dict(hidden_second_axis)],
            ),
            self._graph_panel(
                6, "JVM Memory Usage",
                {"h": 8, "w": 12, "x": 12, "y": 9},
                [
                    target('jvm_memory_used_bytes{area="heap"}', "Heap Used", "A"),
                    target('jvm_memory_max_bytes{area="heap"}', "Heap Max", "B"),
                    target('jvm_memory_used_bytes{area="nonheap"}', "Non-Heap Used", "C"),
                ],
                [{"label": "Bytes", "min": 0}, dict(hidden_second_axis)],
            ),
        ]
        return {
            "dashboard": {
                "id": None,
                "title": "Application Performance Monitoring",
                "tags": ["application", "performance", "apm"],
                "timezone": "browser",
                "panels": panels,
                "time": {"from": "now-1h", "to": "now"},
                "templating": {
                    "list": [
                        self._query_variable(
                            "service", "label_values(http_requests_total, service)"
                        ),
                    ]
                },
                "refresh": "30s",
                "schemaVersion": 27,
                "version": 1,
            }
        }

    def create_business_dashboard(self) -> Dict[str, Any]:
        """Return the "Business Metrics Dashboard".

        Contains a row header, four hourly KPI stat panels (orders, revenue,
        new users, conversion rate) and an order/revenue trend graph. The KPI
        thresholds run red -> yellow -> green because higher is better.
        """
        target = self._target
        panels = [
            self._row_panel("Business Overview"),
            self._stat_panel(
                2, "Orders per Hour", 0,
                [target("sum(increase(orders_total[1h]))", "Orders/Hour")],
                "short",
                [("red", None), ("yellow", 100), ("green", 500)],
            ),
            self._stat_panel(
                3, "Revenue per Hour", 6,
                [target("sum(increase(revenue_total[1h]))", "Revenue/Hour")],
                "currencyUSD",
                [("red", None), ("yellow", 10000), ("green", 50000)],
            ),
            self._stat_panel(
                4, "New Users per Hour", 12,
                [target("sum(increase(user_registrations_total[1h]))", "New Users/Hour")],
                "short",
                [("red", None), ("yellow", 10), ("green", 50)],
            ),
            self._stat_panel(
                5, "Conversion Rate", 18,
                [target('(sum(increase(orders_total[1h])) / '
                        'sum(increase(page_views_total{page="product"}[1h]))) * 100',
                        "Conversion Rate")],
                "percent",
                [("red", None), ("yellow", 2), ("green", 5)],
                percent_range=True,
            ),
            self._graph_panel(
                6, "Order Trends",
                {"h": 8, "w": 24, "x": 0, "y": 9},
                [
                    target("sum(rate(orders_total[5m])) * 3600", "Orders per Hour", "A"),
                    target("sum(rate(revenue_total[5m])) * 3600", "Revenue per Hour", "B"),
                ],
                [
                    {"label": "Orders/Hour", "min": 0},
                    {"label": "Revenue/Hour", "min": 0},
                ],
                # Revenue is drawn against the second (right-hand) Y axis.
                seriesOverrides=[{"alias": "Revenue per Hour", "yAxis": 2}],
            ),
        ]
        return {
            "dashboard": {
                "id": None,
                "title": "Business Metrics Dashboard",
                "tags": ["business", "metrics", "kpi"],
                "timezone": "browser",
                "panels": panels,
                "time": {"from": "now-24h", "to": "now"},
                "templating": {
                    "list": [
                        self._query_variable(
                            "region", "label_values(orders_total, region)"
                        ),
                    ]
                },
                "refresh": "1m",
                "schemaVersion": 27,
                "version": 1,
            }
        }

    def create_custom_panel(self, panel_config: Dict[str, Any]) -> Dict[str, Any]:
        """Return a panel built from defaults overlaid with ``panel_config``.

        The merge is shallow: a top-level key in ``panel_config`` (for example
        ``"fieldConfig"``) replaces the whole default value for that key.
        ``None`` or an empty mapping yields the unmodified defaults.
        """
        panel = {
            "id": 1,
            "title": "Custom Panel",
            "type": "graph",
            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
            "targets": [],
            "fieldConfig": {
                "defaults": {
                    "unit": "short",
                    "thresholds": {
                        "steps": [
                            {"color": "green", "value": None},
                        ]
                    },
                }
            },
        }
        # Treat None as "no overrides" instead of raising TypeError.
        panel.update(panel_config or {})
        return panel

    def export_dashboard_json(self, dashboard: Dict[str, Any], filename: str) -> str:
        """Write ``dashboard`` to ``filename`` as indented UTF-8 JSON.

        Non-ASCII characters are written unescaped. Returns a confirmation
        message containing the file name.
        """
        payload = json.dumps(dashboard, indent=2, ensure_ascii=False)
        with open(filename, "w", encoding="utf-8") as handle:
            handle.write(payload)
        return f"仪表板已导出到: {filename}"
# Usage example: build the three stock dashboards and one custom panel.
dashboard_manager = DashboardManager()

# Stock dashboards.
infra_dashboard = dashboard_manager.create_infrastructure_dashboard()
app_dashboard = dashboard_manager.create_application_dashboard()
business_dashboard = dashboard_manager.create_business_dashboard()

# Custom panel: only the title and targets override the defaults.
custom_panel = dashboard_manager.create_custom_panel({
    "title": "Custom Metric",
    "targets": [
        {"expr": "custom_metric_total", "legendFormat": "Custom Metric", "refId": "A"},
    ],
})

# Report the panel count of each dashboard in one call.
print("\n".join([
    "仪表板创建完成",
    f"基础设施仪表板面板数: {len(infra_dashboard['dashboard']['panels'])}",
    f"应用程序仪表板面板数: {len(app_dashboard['dashboard']['panels'])}",
    f"业务指标仪表板面板数: {len(business_dashboard['dashboard']['panels'])}",
]))
高级查询与数据分析
QueryOptimizer 类
class QueryOptimizer:
"""查询优化器"""
def __init__(self):
self.query_templates = {}
self.optimization_rules = []
def create_advanced_queries(self) -> Dict[str, str]:
    """Return example PromQL queries keyed by a short identifier.

    Each value is a multi-line PromQL snippet that starts with a ``#``
    comment line. The ``memory_growth`` query uses ``deriv``, which is the
    PromQL function name; ``derivative`` is not a PromQL function.
    """
    return {
        # SLI / SLO queries
        "availability_sli": """
# 可用性 SLI (99.9%)
sum(rate(http_requests_total{status!~"5.."}[5m])) /
sum(rate(http_requests_total[5m]))
""",
        "latency_sli": """
# 延迟 SLI (95% < 500ms)
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) < 0.5
""",
        "error_budget": """
# 错误预算消耗率
(
1 - (
sum(rate(http_requests_total{status!~"5.."}[30d])) /
sum(rate(http_requests_total[30d]))
)
) / 0.001 # 0.1% error budget
""",
        # Capacity-planning queries
        "cpu_trend": """
# CPU 使用趋势预测
predict_linear(
avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m]))[1h:5m],
7*24*3600 # 7天预测
)
""",
        "memory_growth": """
# 内存增长率
deriv(
avg(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[1h:5m]
)
""",
        # Anomaly detection
        "anomaly_detection": """
# 基于历史数据的异常检测
abs(
rate(http_requests_total[5m]) -
avg_over_time(rate(http_requests_total[5m])[7d:5m] offset 7d)
) > 2 * stddev_over_time(rate(http_requests_total[5m])[7d:5m] offset 7d)
""",
        # Multi-dimensional aggregation
        "service_health_score": """
# 服务健康评分
(
# 可用性权重 40%
0.4 * (
sum(rate(http_requests_total{status!~"5.."}[5m])) by (service) /
sum(rate(http_requests_total[5m])) by (service)
) +
# 性能权重 30%
0.3 * (
1 - clamp_max(
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)
) / 2, 1
)
) +
# 资源使用权重 30%
0.3 * (
1 - clamp_max(
avg(rate(container_cpu_usage_seconds_total[5m])) by (service), 1
)
)
) * 100
""",
        # Business-metric correlation
        "revenue_per_request": """
# 每请求收入
sum(increase(revenue_total[1h])) /
sum(increase(http_requests_total{path=~"/api/orders.*"}[1h]))
""",
        "user_journey_funnel": """
# 用户转化漏斗
label_replace(
(
sum(increase(page_views_total{page="product"}[1h])) or vector(0)
), "stage", "1_product_view", "", ""
) or
label_replace(
(
sum(increase(page_views_total{page="cart"}[1h])) or vector(0)
), "stage", "2_add_to_cart", "", ""
) or
label_replace(
(
sum(increase(page_views_total{page="checkout"}[1h])) or vector(0)
), "stage", "3_checkout", "", ""
) or
label_replace(
(
sum(increase(orders_total[1h])) or vector(0)
), "stage", "4_purchase", "", ""
)
""",
    }
def create_recording_rules(self) -> str:
    """Return the text of a Prometheus recording-rules file.

    The YAML defines four rule groups (SLI, infrastructure, business and
    capacity planning) with 18 recording rules in total. Nesting is
    significant in YAML, so the text is kept flush-left inside the literal.

    The request-trend rule uses a subquery over ``http_requests_total``:
    a plain ``[24h]`` range cannot be applied to the result of ``sum(...)``,
    and no rule in this file records ``http:requests:rate5m``.
    """
    return """
# Prometheus 记录规则配置
groups:
  - name: sli_rules
    interval: 30s
    rules:
      # HTTP 可用性 SLI
      - record: http:availability:rate5m
        expr: |
          sum(rate(http_requests_total{status!~"5.."}[5m])) by (service) /
          sum(rate(http_requests_total[5m])) by (service)
        labels:
          sli_type: "availability"
      # HTTP 延迟 SLI
      - record: http:latency:p95:rate5m
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)
          )
        labels:
          sli_type: "latency"
          percentile: "95"
      # HTTP 延迟 SLI (99th percentile)
      - record: http:latency:p99:rate5m
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)
          )
        labels:
          sli_type: "latency"
          percentile: "99"
      # 错误率
      - record: http:error_rate:rate5m
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (service) /
          sum(rate(http_requests_total[5m])) by (service)
        labels:
          sli_type: "error_rate"
  - name: infrastructure_rules
    interval: 30s
    rules:
      # CPU 使用率
      - record: node:cpu_utilization:rate5m
        expr: |
          100 - (
            avg by (instance) (
              irate(node_cpu_seconds_total{mode="idle"}[5m])
            ) * 100
          )
      # 内存使用率
      - record: node:memory_utilization:ratio
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes * 100
      # 磁盘使用率
      - record: node:disk_utilization:ratio
        expr: |
          100 - (
            node_filesystem_avail_bytes{fstype!="tmpfs"} /
            node_filesystem_size_bytes{fstype!="tmpfs"} * 100
          )
      # 网络流量
      - record: node:network_receive_bytes:rate5m
        expr: |
          sum(rate(node_network_receive_bytes_total{device!="lo"}[5m])) by (instance)
      - record: node:network_transmit_bytes:rate5m
        expr: |
          sum(rate(node_network_transmit_bytes_total{device!="lo"}[5m])) by (instance)
  - name: business_rules
    interval: 60s
    rules:
      # 订单率
      - record: business:orders:rate1h
        expr: |
          sum(increase(orders_total[1h])) by (region, product_category)
      # 收入率
      - record: business:revenue:rate1h
        expr: |
          sum(increase(revenue_total[1h])) by (region, product_category)
      # 用户注册率
      - record: business:user_registrations:rate1h
        expr: |
          sum(increase(user_registrations_total[1h])) by (region)
      # 转换率
      - record: business:conversion_rate:ratio1h
        expr: |
          sum(increase(orders_total[1h])) by (region) /
          sum(increase(page_views_total{page="product"}[1h])) by (region) * 100
      # 平均订单价值
      - record: business:average_order_value:ratio1h
        expr: |
          sum(increase(revenue_total[1h])) by (region) /
          sum(increase(orders_total[1h])) by (region)
  - name: capacity_planning_rules
    interval: 300s
    rules:
      # CPU 趋势预测 (7天)
      - record: capacity:cpu_trend:predict7d
        expr: |
          predict_linear(node:cpu_utilization:rate5m[24h], 7*24*3600)
      # 内存趋势预测 (7天)
      - record: capacity:memory_trend:predict7d
        expr: |
          predict_linear(node:memory_utilization:ratio[24h], 7*24*3600)
      # 磁盘趋势预测 (30天)
      - record: capacity:disk_trend:predict30d
        expr: |
          predict_linear(node:disk_utilization:ratio[7d], 30*24*3600)
      # 请求量趋势预测 (7天)
      - record: capacity:request_trend:predict7d
        expr: |
          predict_linear(sum(rate(http_requests_total[5m]))[24h:5m], 7*24*3600)
"""
def optimize_query(self, query: str) -> Dict[str, Any]:
    """Inspect a PromQL query and report likely performance problems.

    The query text is checked with simple textual heuristics. It is neither
    parsed nor rewritten, so ``optimized_query`` is always the original text.

    Args:
        query: PromQL expression to inspect.

    Returns:
        A dict with the keys ``original_query``, ``optimized_query``,
        ``optimizations`` (a list of ``{"type", "message", "severity"}``
        findings, in the order the checks run) and
        ``estimated_performance`` (whatever
        ``self._estimate_performance(query)`` returns).
    """
    import re  # function-scope import: the file's import header is outside this block

    optimizations = []
    optimized_query = query

    def report(kind: str, message: str, severity: str) -> None:
        """Append one finding in the shape callers expect."""
        optimizations.append({
            "type": kind,
            "message": message,
            "severity": severity
        })

    # rate() used while the conventional [5m] window appears nowhere in the query.
    if "rate(" in query and "[5m]" not in query:
        report("rate_interval", "建议使用适当的时间窗口,如 [5m]", "warning")

    # sum() with no grouping modifier. "by(l)" (no space) and "without (l)"
    # are also explicit groupings, so the literal "by (" alone is not a
    # reliable test and used to produce false positives.
    has_grouping = re.search(r"\b(?:by|without)\s*\(", query) is not None
    if "sum(" in query and not has_grouping:
        report("aggregation", "考虑添加 'by' 子句来减少基数", "info")

    # Many opening parentheses are used as a proxy for deep nesting.
    if query.count("(") > 5:
        report("complexity", "查询过于复杂,考虑使用记录规则", "warning")

    # Match only the standalone ``offset`` keyword; a plain substring test
    # also fired on metric names such as ``kafka_consumergroup_current_offset``.
    if re.search(r"\boffset\b", query):
        report("offset_usage", "offset 操作可能影响性能,谨慎使用", "info")

    return {
        "original_query": query,
        "optimized_query": optimized_query,
        "optimizations": optimizations,
        "estimated_performance": self._estimate_performance(query)
    }
def _estimate_performance(self, query: str) -> str:
"""估算查询性能"""
complexity_score = 0
# 基于查询复杂度评分
complexity_score += query.count("(") * 2
complexity_score += query.count("rate(") * 3
complexity_score += query.count("histogram_quantile(") * 5
complexity_score += query.count("predict_linear(") * 4
complexity_score += query.count("offset") * 3
if complexity_score < 10:
return "fast"
elif complexity_score < 20:
return "medium"
else:
return "slow"
def create_query_library(self) -> Dict[str, Dict[str, str]]:
    """Return a catalogue of ready-made PromQL queries grouped by domain.

    Returns:
        A dict whose keys are ``infrastructure``, ``application``,
        ``database`` and ``business``; each value maps a short query name
        to its PromQL expression.
    """
    # Host-level metrics.
    infrastructure_queries = {
        "cpu_usage": '100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)',
        "memory_usage": '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100',
        "disk_usage": '100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes)',
        "network_in": 'rate(node_network_receive_bytes_total[5m]) * 8',
        "network_out": 'rate(node_network_transmit_bytes_total[5m]) * 8',
        "load_average": 'node_load1',
        "disk_io_read": 'rate(node_disk_read_bytes_total[5m])',
        "disk_io_write": 'rate(node_disk_written_bytes_total[5m])',
    }

    # HTTP service metrics, grouped per service.
    application_queries = {
        "request_rate": 'sum(rate(http_requests_total[5m])) by (service)',
        "error_rate": 'sum(rate(http_requests_total{status=~"5.."}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service) * 100',
        "response_time_p95": 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))',
        "response_time_p99": 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))',
        "active_connections": 'sum(http_connections_active) by (service)',
        "queue_size": 'sum(queue_size) by (service)',
        "cache_hit_rate": 'sum(rate(cache_hits_total[5m])) by (service) / sum(rate(cache_requests_total[5m])) by (service) * 100',
    }

    # Database metrics.
    database_queries = {
        "connection_pool_usage": 'db_connections_active / db_connections_max * 100',
        "query_duration_p95": 'histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le))',
        "slow_queries": 'sum(rate(db_slow_queries_total[5m]))',
        "deadlocks": 'sum(rate(db_deadlocks_total[5m]))',
        "table_size": 'sum(db_table_size_bytes) by (table)',
        "index_usage": 'sum(rate(db_index_scans_total[5m])) by (index)',
    }

    # Business KPIs.
    business_queries = {
        "orders_per_minute": 'sum(rate(orders_total[1m])) * 60',
        "revenue_per_hour": 'sum(increase(revenue_total[1h]))',
        "new_users_per_hour": 'sum(increase(user_registrations_total[1h]))',
        "conversion_rate": 'sum(increase(orders_total[1h])) / sum(increase(page_views_total{page="product"}[1h])) * 100',
        "cart_abandonment_rate": '(sum(increase(cart_created_total[1h])) - sum(increase(orders_total[1h]))) / sum(increase(cart_created_total[1h])) * 100',
        "average_order_value": 'sum(increase(revenue_total[1h])) / sum(increase(orders_total[1h]))',
    }

    library = {}
    library["infrastructure"] = infrastructure_queries
    library["application"] = application_queries
    library["database"] = database_queries
    library["business"] = business_queries
    return library
# Usage example: exercise the optimizer helpers once and print a summary.
query_optimizer = QueryOptimizer()

# Build the advanced queries, the recording rules and the query library.
advanced_queries = query_optimizer.create_advanced_queries()
recording_rules = query_optimizer.create_recording_rules()
query_library = query_optimizer.create_query_library()

# Analyse one representative query.
sample_query = "sum(rate(http_requests_total[5m])) by (service)"
optimization_result = query_optimizer.optimize_query(sample_query)

# A single print call with a newline separator emits the same six lines.
print(
    "高级查询与数据分析",
    f"高级查询数量: {len(advanced_queries)}",
    f"记录规则配置长度: {len(recording_rules)} 字符",
    f"查询库分类数: {len(query_library)}",
    f"查询优化建议数: {len(optimization_result['optimizations'])}",
    f"查询性能评估: {optimization_result['estimated_performance']}",
    sep="\n",
)
可视化优化与性能调优
VisualizationOptimizer 类
class VisualizationOptimizer:
    """Build and assess Grafana dashboards with performance in mind.

    Provides a ready-made dashboard that monitors Grafana itself, heuristic
    scoring of panel and dashboard query load, a static optimization guide
    and Prometheus alerting rules for Grafana health.
    """

    def __init__(self):
        # Both containers start empty; no method of this class fills them.
        self.performance_metrics = {}
        self.optimization_rules = []

    def create_performance_dashboard(self) -> Dict[str, Any]:
        """Return the dashboard definition used to monitor Grafana itself.

        Returns:
            A dict with a single ``"dashboard"`` key holding five panels,
            a ``now-1h`` to ``now`` time window and a 30-second refresh.
        """
        return {
            "dashboard": {
                "id": None,
                "title": "Grafana 性能监控",
                "tags": ["grafana", "performance", "monitoring"],
                "timezone": "browser",
                "panels": [
                    {
                        "id": 1,
                        "title": "查询响应时间",
                        "type": "stat",
                        "targets": [{
                            "expr": "histogram_quantile(0.95, sum(rate(grafana_api_request_duration_seconds_bucket[5m])) by (le))",
                            "legendFormat": "95th percentile"
                        }],
                        "fieldConfig": {
                            "defaults": {
                                "unit": "s",
                                "thresholds": {
                                    "steps": [
                                        {"color": "green", "value": None},
                                        {"color": "yellow", "value": 1},
                                        {"color": "red", "value": 5}
                                    ]
                                }
                            }
                        },
                        "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
                    },
                    {
                        "id": 2,
                        "title": "活跃用户数",
                        "type": "stat",
                        "targets": [{
                            "expr": "grafana_stat_totals_dashboard_views",
                            "legendFormat": "Dashboard Views"
                        }],
                        "fieldConfig": {
                            "defaults": {
                                "unit": "short",
                                "color": {"mode": "palette-classic"}
                            }
                        },
                        "gridPos": {"h": 8, "w": 6, "x": 6, "y": 0}
                    },
                    {
                        "id": 3,
                        "title": "数据源查询性能",
                        "type": "timeseries",
                        "targets": [{
                            "expr": "rate(prometheus_tsdb_symbol_table_size_bytes[5m])",
                            "legendFormat": "Symbol Table Growth Rate"
                        }],
                        "fieldConfig": {
                            "defaults": {
                                "custom": {
                                    "drawStyle": "line",
                                    "lineInterpolation": "linear",
                                    "lineWidth": 1,
                                    "fillOpacity": 10
                                }
                            }
                        },
                        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
                    },
                    {
                        "id": 4,
                        "title": "内存使用情况",
                        "type": "timeseries",
                        "targets": [
                            {
                                "expr": "process_resident_memory_bytes{job=\"grafana\"}",
                                "legendFormat": "Resident Memory"
                            },
                            {
                                "expr": "go_memstats_heap_inuse_bytes{job=\"grafana\"}",
                                "legendFormat": "Heap In Use"
                            }
                        ],
                        "fieldConfig": {
                            "defaults": {
                                "unit": "bytes",
                                "custom": {
                                    "drawStyle": "line",
                                    "lineInterpolation": "linear",
                                    "lineWidth": 2
                                }
                            }
                        },
                        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
                    },
                    {
                        "id": 5,
                        "title": "慢查询统计",
                        "type": "table",
                        "targets": [{
                            "expr": "topk(10, sum by (query) (rate(grafana_api_request_duration_seconds_count[5m])))",
                            "format": "table",
                            "instant": True
                        }],
                        "transformations": [
                            {
                                "id": "organize",
                                "options": {
                                    "excludeByName": {},
                                    "indexByName": {},
                                    "renameByName": {
                                        "query": "查询",
                                        "Value": "频率"
                                    }
                                }
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
                    }
                ],
                "time": {
                    "from": "now-1h",
                    "to": "now"
                },
                "refresh": "30s"
            }
        }

    def optimize_panel_queries(self, panel_config: Dict[str, Any]) -> Dict[str, Any]:
        """Suggest query improvements for one panel and return a tuned copy.

        Args:
            panel_config: Panel definition. Its ``targets`` list (empty when
                missing) is inspected; each target's ``expr`` is read.

        Returns:
            A dict with ``original_panel`` (the input object itself),
            ``optimized_panel`` (a shallow copy whose targets are copies
            carrying ``interval`` ``'30s'`` and ``maxDataPoints`` ``1000``,
            plus ``cacheTimeout`` ``'300s'``), ``optimizations`` (a list of
            ``{"type", "message", "original"}`` findings) and
            ``performance_score`` computed from the original panel.
        """
        optimizations = []
        optimized_targets = []
        for target in panel_config.get('targets', []):
            query = target.get('expr', '')
            # rate() used while no [5m] window appears in the query.
            if 'rate(' in query and '[5m]' not in query:
                optimizations.append({
                    "type": "interval",
                    "message": "建议使用标准的5分钟间隔",
                    "original": query
                })
            # More than three opening parentheses is treated as "complex".
            if query.count('(') > 3:
                optimizations.append({
                    "type": "complexity",
                    "message": "查询过于复杂,考虑使用记录规则",
                    "original": query
                })
            # Copy the target so the caller's dict is left untouched.
            optimized_target = target.copy()
            optimized_target['interval'] = '30s'
            optimized_target['maxDataPoints'] = 1000
            optimized_targets.append(optimized_target)

        optimized_panel = panel_config.copy()
        optimized_panel['targets'] = optimized_targets
        optimized_panel['cacheTimeout'] = '300s'
        return {
            "original_panel": panel_config,
            "optimized_panel": optimized_panel,
            "optimizations": optimizations,
            "performance_score": self._calculate_performance_score(panel_config)
        }

    def _calculate_performance_score(self, panel_config: Dict[str, Any]) -> int:
        """Score a panel from 0 (worst) to 100 (best).

        Penalties: 10 per target beyond five; 5 per complexity point above
        ten for each target (complexity is the number of ``(`` plus twice the
        number of ``rate(``); 20 when ``timeFrom`` spans days or longer.

        Args:
            panel_config: Panel definition; ``targets`` and ``timeFrom`` are
                read, both optional.

        Returns:
            The score, never below 0.
        """
        import re  # function-scope import: the file's import header is outside this block

        score = 100
        targets = panel_config.get('targets', [])

        # Penalty for the number of queries.
        if len(targets) > 5:
            score -= (len(targets) - 5) * 10

        # Penalty for query complexity.
        for target in targets:
            query = target.get('expr', '')
            complexity = query.count('(') + query.count('rate(') * 2
            if complexity > 10:
                score -= (complexity - 10) * 5

        # Penalty for long time ranges. A missing or None ``timeFrom`` is
        # treated as "no override" instead of raising TypeError. Only a
        # number followed by a day/week/month/year unit counts, so strings
        # like "now-1h" are no longer penalised just because "now" contains
        # the letter "w".
        time_range = panel_config.get('timeFrom') or ''
        if re.search(r"\d\s*[dwMy]\b", str(time_range)):
            score -= 20

        return max(0, score)

    def create_optimization_guide(self) -> Dict[str, Any]:
        """Return a static guide of optimization advice and settings.

        Returns:
            A dict with the sections ``query_optimization``,
            ``dashboard_optimization`` and ``data_source_optimization``.
        """
        return {
            "query_optimization": {
                "best_practices": [
                    "使用记录规则预计算复杂查询",
                    "选择合适的时间间隔(通常5分钟)",
                    "避免过度聚合和高基数标签",
                    "使用 rate() 而不是 increase() 进行速率计算",
                    "限制查询的时间范围",
                    "使用 topk() 限制返回的时间序列数量"
                ],
                "common_issues": [
                    {
                        "issue": "查询超时",
                        "cause": "查询过于复杂或时间范围过大",
                        "solution": "使用记录规则或减少时间范围"
                    },
                    {
                        "issue": "内存使用过高",
                        "cause": "高基数标签或大量时间序列",
                        "solution": "使用标签过滤或聚合"
                    },
                    {
                        "issue": "仪表板加载缓慢",
                        "cause": "面板查询过多或过于复杂",
                        "solution": "减少面板数量或优化查询"
                    }
                ]
            },
            "dashboard_optimization": {
                "layout_tips": [
                    "将最重要的指标放在顶部",
                    "使用合理的面板大小和布局",
                    "避免在单个仪表板中放置过多面板",
                    "使用变量来创建动态仪表板",
                    "合理设置刷新间隔"
                ],
                "performance_tips": [
                    "启用查询缓存",
                    "使用合适的数据点数量限制",
                    "避免使用过短的刷新间隔",
                    "使用时间范围变量",
                    "优化面板查询的复杂度"
                ]
            },
            "data_source_optimization": {
                "prometheus_settings": {
                    "query_timeout": "60s",
                    "http_method": "POST",
                    "cache_level": "High",
                    "max_concurrent_queries": 20
                },
                "connection_pooling": {
                    "max_idle_connections": 100,
                    "max_open_connections": 100,
                    "connection_max_lifetime": "14400s"
                }
            }
        }

    def create_alerting_rules(self) -> str:
        """Return alerting rules for Grafana health as YAML text.

        The text is nested as ``groups`` -> ``rules`` -> per-alert fields;
        without that nesting it cannot be parsed as a rule group.

        Returns:
            A YAML document with one rule group, ``grafana_performance``,
            containing five alerts.
        """
        return """
# Grafana 告警规则配置
groups:
  - name: grafana_performance
    rules:
      - alert: GrafanaHighQueryLatency
        expr: histogram_quantile(0.95, sum(rate(grafana_api_request_duration_seconds_bucket[5m])) by (le)) > 5
        for: 5m
        labels:
          severity: warning
          service: grafana
        annotations:
          summary: "Grafana 查询延迟过高"
          description: "Grafana 95% 查询延迟超过 5 秒,当前值: {{ $value }}s"
      - alert: GrafanaHighMemoryUsage
        expr: process_resident_memory_bytes{job="grafana"} / 1024 / 1024 / 1024 > 2
        for: 10m
        labels:
          severity: warning
          service: grafana
        annotations:
          summary: "Grafana 内存使用过高"
          description: "Grafana 内存使用超过 2GB,当前值: {{ $value }}GB"
      - alert: GrafanaDashboardErrors
        expr: increase(grafana_api_response_status_total{code=~"5.."}[5m]) > 10
        for: 5m
        labels:
          severity: critical
          service: grafana
        annotations:
          summary: "Grafana 仪表板错误率过高"
          description: "Grafana 在过去5分钟内出现 {{ $value }} 个5xx错误"
      - alert: PrometheusQueryTimeout
        expr: increase(prometheus_query_duration_seconds_count{quantile="0.99"}[5m]) > 100
        for: 5m
        labels:
          severity: warning
          service: prometheus
        annotations:
          summary: "Prometheus 查询超时频繁"
          description: "Prometheus 99% 查询延迟过高,可能影响 Grafana 性能"
      - alert: GrafanaDataSourceDown
        expr: up{job="prometheus"} == 0
        for: 1m
        labels:
          severity: critical
          service: grafana
        annotations:
          summary: "Grafana 数据源不可用"
          description: "Prometheus 数据源连接失败,Grafana 无法获取数据"
"""

    def generate_performance_report(self, dashboard_config: Dict[str, Any]) -> Dict[str, Any]:
        """Summarise the query load of a dashboard and grade it.

        Scoring starts at 100 and subtracts 2 per query beyond 20, 10 per
        complex query beyond 5 and, when more than 2 slow queries exist,
        15 for every slow query. The result is clamped at 0.

        Args:
            dashboard_config: Dict with an optional ``dashboard`` entry whose
                ``panels`` each carry an optional ``targets`` list.

        Returns:
            A dict with the dashboard title (``'Unknown'`` when absent),
            panel and query counts, the score and its grade, textual
            recommendations and rough load-time and memory estimates.
        """
        import re  # function-scope import: the file's import header is outside this block

        panels = dashboard_config.get('dashboard', {}).get('panels', [])
        total_queries = sum(len(panel.get('targets', [])) for panel in panels)
        complex_queries = 0
        slow_queries = 0
        for panel in panels:
            for target in panel.get('targets', []):
                query = target.get('expr', '')
                complexity = query.count('(') + query.count('rate(') * 2
                if complexity > 10:
                    complex_queries += 1
                # Match the ``offset`` keyword only; a substring test also
                # counted metric names that merely contain "offset".
                if re.search(r"\boffset\b", query) or 'predict_linear' in query:
                    slow_queries += 1

        performance_score = 100
        if total_queries > 20:
            performance_score -= (total_queries - 20) * 2
        if complex_queries > 5:
            performance_score -= (complex_queries - 5) * 10
        if slow_queries > 2:
            performance_score -= slow_queries * 15
        performance_score = max(0, performance_score)

        recommendations = []
        if total_queries > 20:
            recommendations.append("减少查询数量或使用记录规则")
        if complex_queries > 5:
            recommendations.append("简化复杂查询或使用预计算")
        if slow_queries > 2:
            recommendations.append("避免使用 offset 和 predict_linear 等慢查询")

        return {
            "dashboard_title": dashboard_config.get('dashboard', {}).get('title', 'Unknown'),
            "total_panels": len(panels),
            "total_queries": total_queries,
            "complex_queries": complex_queries,
            "slow_queries": slow_queries,
            "performance_score": performance_score,
            "performance_grade": self._get_performance_grade(performance_score),
            "recommendations": recommendations,
            "estimated_load_time": self._estimate_load_time(total_queries, complex_queries),
            "memory_usage_estimate": self._estimate_memory_usage(total_queries)
        }

    def _get_performance_grade(self, score: int) -> str:
        """Map a 0-100 score to a letter grade (A >= 90, B >= 80, C >= 70, D >= 60, else F)."""
        if score >= 90:
            return "A (优秀)"
        elif score >= 80:
            return "B (良好)"
        elif score >= 70:
            return "C (一般)"
        elif score >= 60:
            return "D (较差)"
        else:
            return "F (很差)"

    def _estimate_load_time(self, total_queries: int, complex_queries: int) -> str:
        """Return a rough load-time estimate as a labelled string.

        The estimate is 2 s base + 0.1 s per query + 0.5 s per complex query;
        below 5 s is labelled fast, below 10 s medium, otherwise slow.
        """
        base_time = 2  # base load time in seconds
        query_time = total_queries * 0.1  # each query adds 0.1 s
        complex_time = complex_queries * 0.5  # each complex query adds 0.5 s
        total_time = base_time + query_time + complex_time
        if total_time < 5:
            return f"{total_time:.1f}秒 (快)"
        elif total_time < 10:
            return f"{total_time:.1f}秒 (中等)"
        else:
            return f"{total_time:.1f}秒 (慢)"

    def _estimate_memory_usage(self, total_queries: int) -> str:
        """Return a rough memory estimate as a labelled string.

        The estimate is 50 MB base + 5 MB per query; below 200 MB is labelled
        low, below 500 MB medium, otherwise high.
        """
        base_memory = 50  # base memory usage in MB
        query_memory = total_queries * 5  # each query adds 5 MB
        total_memory = base_memory + query_memory
        if total_memory < 200:
            return f"{total_memory}MB (低)"
        elif total_memory < 500:
            return f"{total_memory}MB (中等)"
        else:
            return f"{total_memory}MB (高)"
# Usage example: build the optimizer artefacts and grade a tiny dashboard.
vis_optimizer = VisualizationOptimizer()

# Build the self-monitoring dashboard, the guide and the alerting rules.
perf_dashboard = vis_optimizer.create_performance_dashboard()
optimization_guide = vis_optimizer.create_optimization_guide()
alerting_rules = vis_optimizer.create_alerting_rules()

# Grade a minimal dashboard: one panel carrying two queries.
sample_dashboard = {
    "dashboard": {
        "title": "示例仪表板",
        "panels": [
            {"targets": [
                {"expr": "rate(http_requests_total[5m])"},
                {"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"},
            ]},
        ],
    },
}
performance_report = vis_optimizer.generate_performance_report(sample_dashboard)

# A single print call with a newline separator emits the same eight lines.
print(
    "可视化优化与性能调优",
    f"性能仪表板面板数: {len(perf_dashboard['dashboard']['panels'])}",
    f"优化指南分类数: {len(optimization_guide)}",
    f"告警规则长度: {len(alerting_rules)} 字符",
    f"性能报告评分: {performance_report['performance_score']}",
    f"性能等级: {performance_report['performance_grade']}",
    f"预估加载时间: {performance_report['estimated_load_time']}",
    f"预估内存使用: {performance_report['memory_usage_estimate']}",
    sep="\n",
)
实战案例与最佳实践
CaseStudyManager 类
class CaseStudyManager:
    """Catalogue of worked monitoring scenarios and rollout guidance.

    Each ``create_*`` method returns a self-contained description of one
    scenario (requirements, dashboard definition and supporting
    configuration); ``generate_implementation_plan`` returns a phased plan.
    """

    def __init__(self):
        # Both containers start empty; no method of this class fills them.
        self.case_studies = {}
        self.best_practices = []

    def create_ecommerce_monitoring(self) -> Dict[str, Any]:
        """Return the e-commerce monitoring case study.

        Returns:
            A dict with ``scenario``, ``requirements``, ``dashboard_config``
            (five panels), ``alerting_rules`` (YAML text nested as
            ``groups`` -> ``rules`` -> per-alert fields) and
            ``implementation_steps``.
        """
        return {
            "scenario": "电商平台监控",
            "requirements": [
                "实时监控订单量和收入",
                "跟踪用户行为和转化率",
                "监控系统性能和可用性",
                "检测异常和欺诈行为"
            ],
            "dashboard_config": {
                "dashboard": {
                    "title": "电商平台监控中心",
                    "tags": ["ecommerce", "business", "monitoring"],
                    "panels": [
                        {
                            "title": "实时订单量",
                            "type": "stat",
                            "targets": [{
                                "expr": "sum(rate(orders_total[1m])) * 60",
                                "legendFormat": "订单/分钟"
                            }],
                            "fieldConfig": {
                                "defaults": {
                                    "unit": "short",
                                    "color": {"mode": "thresholds"},
                                    "thresholds": {
                                        "steps": [
                                            {"color": "red", "value": 0},
                                            {"color": "yellow", "value": 10},
                                            {"color": "green", "value": 50}
                                        ]
                                    }
                                }
                            }
                        },
                        {
                            "title": "收入趋势",
                            "type": "timeseries",
                            "targets": [
                                {
                                    "expr": "sum(increase(revenue_total[5m]))",
                                    "legendFormat": "总收入"
                                },
                                {
                                    "expr": "sum(increase(revenue_total[5m])) by (region)",
                                    "legendFormat": "{{region}} 收入"
                                }
                            ]
                        },
                        {
                            "title": "转化漏斗",
                            "type": "piechart",
                            "targets": [{
                                "expr": "sum(increase(page_views_total{page=\"product\"}[1h]))",
                                "legendFormat": "产品页面浏览"
                            }, {
                                "expr": "sum(increase(cart_additions_total[1h]))",
                                "legendFormat": "加入购物车"
                            }, {
                                "expr": "sum(increase(checkout_starts_total[1h]))",
                                "legendFormat": "开始结账"
                            }, {
                                "expr": "sum(increase(orders_total[1h]))",
                                "legendFormat": "完成订单"
                            }]
                        },
                        {
                            "title": "系统健康状态",
                            "type": "table",
                            "targets": [{
                                "expr": "up{job=~\".*ecommerce.*\"}",
                                "format": "table",
                                "instant": True
                            }]
                        },
                        {
                            "title": "异常检测",
                            "type": "timeseries",
                            "targets": [{
                                "expr": "abs(rate(orders_total[5m]) - avg_over_time(rate(orders_total[5m])[7d:5m] offset 7d)) > 2 * stddev_over_time(rate(orders_total[5m])[7d:5m] offset 7d)",
                                "legendFormat": "订单异常"
                            }]
                        }
                    ]
                }
            },
            # The YAML nesting below is significant: without it the text
            # cannot be parsed as a rule group.
            "alerting_rules": """
groups:
  - name: ecommerce_alerts
    rules:
      - alert: LowOrderRate
        expr: sum(rate(orders_total[5m])) * 60 < 10
        for: 5m
        labels:
          severity: warning
          team: business
        annotations:
          summary: "订单率过低"
          description: "每分钟订单数低于10个,当前: {{ $value }}"
      - alert: HighCartAbandonmentRate
        expr: (sum(increase(cart_additions_total[1h])) - sum(increase(orders_total[1h]))) / sum(increase(cart_additions_total[1h])) * 100 > 80
        for: 10m
        labels:
          severity: warning
          team: business
        annotations:
          summary: "购物车放弃率过高"
          description: "购物车放弃率超过80%,当前: {{ $value }}%"
      - alert: PaymentSystemDown
        expr: up{job="payment-service"} == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "支付系统不可用"
          description: "支付服务连接失败,影响订单处理"
""",
            "implementation_steps": [
                "1. 配置业务指标采集(订单、收入、用户行为)",
                "2. 设置系统监控(服务可用性、性能指标)",
                "3. 创建实时仪表板展示关键业务指标",
                "4. 配置告警规则监控异常情况",
                "5. 设置自动化报告和通知"
            ]
        }

    def create_microservices_monitoring(self) -> Dict[str, Any]:
        """Return the microservices monitoring case study.

        Returns:
            A dict with ``scenario``, ``requirements``, ``dashboard_config``
            (five panels) and ``service_discovery``, whose ``kubernetes``
            entry is scrape-configuration YAML text.
        """
        return {
            "scenario": "微服务架构监控",
            "requirements": [
                "服务间调用链路追踪",
                "API 性能和错误率监控",
                "资源使用情况监控",
                "服务依赖关系可视化"
            ],
            "dashboard_config": {
                "dashboard": {
                    "title": "微服务监控中心",
                    "tags": ["microservices", "api", "performance"],
                    "panels": [
                        {
                            "title": "服务地图",
                            "type": "nodeGraph",
                            "targets": [{
                                "expr": "sum(rate(http_requests_total[5m])) by (source_service, destination_service)",
                                "legendFormat": "{{source_service}} -> {{destination_service}}"
                            }]
                        },
                        {
                            "title": "API 请求率",
                            "type": "timeseries",
                            "targets": [{
                                "expr": "sum(rate(http_requests_total[5m])) by (service)",
                                "legendFormat": "{{service}}"
                            }]
                        },
                        {
                            "title": "错误率分布",
                            "type": "heatmap",
                            "targets": [{
                                "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service) * 100",
                                "legendFormat": "{{service}}"
                            }]
                        },
                        {
                            "title": "响应时间分位数",
                            "type": "timeseries",
                            "targets": [
                                {
                                    "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))",
                                    "legendFormat": "{{service}} P50"
                                },
                                {
                                    "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))",
                                    "legendFormat": "{{service}} P95"
                                },
                                {
                                    "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))",
                                    "legendFormat": "{{service}} P99"
                                }
                            ]
                        },
                        {
                            "title": "服务资源使用",
                            "type": "timeseries",
                            "targets": [
                                {
                                    "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (service) * 100",
                                    "legendFormat": "{{service}} CPU"
                                },
                                {
                                    "expr": "sum(container_memory_usage_bytes) by (service) / 1024 / 1024",
                                    "legendFormat": "{{service}} Memory (MB)"
                                }
                            ]
                        }
                    ]
                }
            },
            "service_discovery": {
                # The YAML nesting is significant. The doubled backslashes
                # keep a literal ``\d`` in the emitted regex.
                "kubernetes": """
# Kubernetes 服务发现配置
scrape_configs:
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\\d+)?;(\\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
"""
            }
        }

    def create_infrastructure_monitoring(self) -> Dict[str, Any]:
        """Return the infrastructure monitoring case study.

        Returns:
            A dict with ``scenario``, ``requirements``, ``dashboard_config``
            (five panels) and ``capacity_planning`` (three forecast queries).
        """
        return {
            "scenario": "基础设施监控",
            "requirements": [
                "服务器资源监控",
                "网络性能监控",
                "存储系统监控",
                "容器和集群监控"
            ],
            "dashboard_config": {
                "dashboard": {
                    "title": "基础设施监控中心",
                    "tags": ["infrastructure", "servers", "network"],
                    "panels": [
                        {
                            "title": "集群概览",
                            "type": "stat",
                            "targets": [
                                {
                                    "expr": "count(up{job=\"node-exporter\"} == 1)",
                                    "legendFormat": "在线节点"
                                },
                                {
                                    "expr": "count(up{job=\"node-exporter\"} == 0)",
                                    "legendFormat": "离线节点"
                                }
                            ]
                        },
                        {
                            "title": "CPU 使用率热力图",
                            "type": "heatmap",
                            "targets": [{
                                "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                                "legendFormat": "{{instance}}"
                            }]
                        },
                        {
                            "title": "内存使用情况",
                            "type": "timeseries",
                            "targets": [{
                                "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
                                "legendFormat": "{{instance}}"
                            }]
                        },
                        {
                            "title": "磁盘 I/O",
                            "type": "timeseries",
                            "targets": [
                                {
                                    "expr": "rate(node_disk_read_bytes_total[5m])",
                                    "legendFormat": "{{instance}} 读取"
                                },
                                {
                                    "expr": "rate(node_disk_written_bytes_total[5m])",
                                    "legendFormat": "{{instance}} 写入"
                                }
                            ]
                        },
                        {
                            "title": "网络流量",
                            "type": "timeseries",
                            "targets": [
                                {
                                    "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8",
                                    "legendFormat": "{{instance}} 入站"
                                },
                                {
                                    "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8",
                                    "legendFormat": "{{instance}} 出站"
                                }
                            ]
                        }
                    ]
                }
            },
            "capacity_planning": {
                "cpu_forecast": "predict_linear(node:cpu_utilization:rate5m[24h], 7*24*3600)",
                "memory_forecast": "predict_linear(node:memory_utilization:ratio[24h], 7*24*3600)",
                "disk_forecast": "predict_linear(node:disk_utilization:ratio[7d], 30*24*3600)"
            }
        }

    def create_best_practices_guide(self) -> Dict[str, Any]:
        """Return a static best-practices guide.

        Returns:
            A dict with the sections ``dashboard_design``,
            ``query_optimization``, ``alerting_best_practices``,
            ``security_considerations`` and ``maintenance_checklist``.
        """
        return {
            "dashboard_design": {
                "principles": [
                    "遵循5秒规则:用户应在5秒内理解仪表板内容",
                    "使用一致的颜色方案和视觉元素",
                    "将最重要的信息放在左上角",
                    "避免信息过载,每个仪表板专注特定主题",
                    "使用有意义的标题和描述"
                ],
                "layout_guidelines": [
                    "使用网格系统保持对齐",
                    "相关面板应放置在一起",
                    "使用空白空间提高可读性",
                    "保持面板大小的一致性",
                    "考虑不同屏幕尺寸的适配"
                ]
            },
            "query_optimization": {
                "performance_tips": [
                    "使用记录规则预计算复杂查询",
                    "避免使用过长的时间范围",
                    "合理使用聚合函数减少数据量",
                    "使用标签过滤减少查询范围",
                    "定期清理不必要的历史数据"
                ],
                "caching_strategies": [
                    "启用查询结果缓存",
                    "设置合适的缓存过期时间",
                    "使用 CDN 加速静态资源",
                    "实施数据预聚合策略"
                ]
            },
            "alerting_best_practices": [
                "基于 SLI/SLO 设计告警",
                "避免告警疲劳,设置合理阈值",
                "使用告警分组减少噪音",
                "提供清晰的告警描述和处理步骤",
                "定期审查和优化告警规则"
            ],
            "security_considerations": [
                "实施基于角色的访问控制",
                "使用 HTTPS 加密数据传输",
                "定期更新 Grafana 和插件",
                "审计用户访问和操作日志",
                "保护敏感数据和凭据"
            ],
            "maintenance_checklist": [
                "定期备份仪表板配置",
                "监控 Grafana 自身的性能",
                "清理未使用的仪表板和数据源",
                "更新文档和使用指南",
                "培训团队成员使用最佳实践"
            ]
        }

    def generate_implementation_plan(self, scenario: str) -> Dict[str, Any]:
        """Return a four-phase rollout plan tailored to ``scenario``.

        Args:
            scenario: ``"microservices"`` or ``"ecommerce"`` adds three
                scenario-specific tasks to phase 2; any other value returns
                the base plan unchanged.

        Returns:
            A dict keyed ``phase_1`` to ``phase_4``; each phase has ``name``,
            ``duration`` and ``tasks``. The plan is rebuilt on every call, so
            the added tasks never accumulate between calls.
        """
        base_plan = {
            "phase_1": {
                "name": "基础设施准备",
                "duration": "1-2周",
                "tasks": [
                    "安装和配置 Prometheus",
                    "安装和配置 Grafana",
                    "设置基础监控指标采集",
                    "配置数据源连接"
                ]
            },
            "phase_2": {
                "name": "仪表板开发",
                "duration": "2-3周",
                "tasks": [
                    "设计仪表板布局",
                    "创建核心监控面板",
                    "配置查询和可视化",
                    "测试和优化性能"
                ]
            },
            "phase_3": {
                "name": "告警配置",
                "duration": "1周",
                "tasks": [
                    "定义告警规则",
                    "配置通知渠道",
                    "测试告警功能",
                    "文档化告警处理流程"
                ]
            },
            "phase_4": {
                "name": "部署和培训",
                "duration": "1周",
                "tasks": [
                    "生产环境部署",
                    "用户培训和文档",
                    "建立运维流程",
                    "持续优化和改进"
                ]
            }
        }

        # Adjust the dashboard-development phase for the chosen scenario.
        if scenario == "microservices":
            base_plan["phase_2"]["tasks"].extend([
                "配置服务发现",
                "实施分布式追踪",
                "创建服务依赖图"
            ])
        elif scenario == "ecommerce":
            base_plan["phase_2"]["tasks"].extend([
                "集成业务指标",
                "配置实时数据流",
                "创建转化漏斗分析"
            ])
        return base_plan
# Usage example: build every case study and print a short summary.
case_manager = CaseStudyManager()

# Build the three case studies and the best-practices guide.
ecommerce_case = case_manager.create_ecommerce_monitoring()
microservices_case = case_manager.create_microservices_monitoring()
infrastructure_case = case_manager.create_infrastructure_monitoring()
best_practices = case_manager.create_best_practices_guide()

# Generate rollout plans for two of the scenarios.
ecommerce_plan = case_manager.generate_implementation_plan("ecommerce")
microservices_plan = case_manager.generate_implementation_plan("microservices")

# A single print call with a newline separator emits the same six lines.
print(
    "实战案例与最佳实践",
    f"电商监控案例面板数: {len(ecommerce_case['dashboard_config']['dashboard']['panels'])}",
    f"微服务监控案例面板数: {len(microservices_case['dashboard_config']['dashboard']['panels'])}",
    f"基础设施监控案例面板数: {len(infrastructure_case['dashboard_config']['dashboard']['panels'])}",
    f"最佳实践指南分类数: {len(best_practices)}",
    f"实施计划阶段数: {len(ecommerce_plan)}",
    sep="\n",
)
总结
本教程全面介绍了 Prometheus 数据可视化与 Grafana 集成的各个方面,从基础配置到高级优化,从理论知识到实战案例。
核心要点
Grafana 基础配置
- 数据源配置和管理
- 基础设置和安全配置
- Docker 容器化部署
仪表板设计与创建
- 基础设施监控仪表板
- 应用程序性能仪表板
- 业务指标仪表板
- 自定义面板和可视化
高级查询与数据分析
- SLI/SLO 查询设计
- 容量规划和趋势预测
- 异常检测和多维度分析
- 记录规则和查询优化
可视化优化与性能调优
- 查询性能优化
- 仪表板性能监控
- 缓存策略和连接池配置
- 性能评估和报告生成
实战案例与最佳实践
- 电商平台监控实施
- 微服务架构监控
- 基础设施监控方案
- 设计原则和维护指南
最佳实践总结
- 设计原则:遵循5秒规则,保持简洁明了
- 性能优化:使用记录规则,合理设置缓存
- 查询优化:避免复杂查询,使用适当的时间范围
- 告警配置:基于 SLI/SLO,避免告警疲劳
- 安全考虑:实施访问控制,保护敏感数据
下一步学习建议
- 深入学习 PromQL:掌握更多高级查询技巧
- 探索 Grafana 插件:扩展可视化能力
- 实施自动化:使用 Infrastructure as Code
- 集成其他工具:结合 Jaeger、ELK Stack 等
- 持续优化:定期审查和改进监控策略
通过本教程的学习,您应该能够:
- 独立配置和管理 Grafana 实例
- 设计和创建专业的监控仪表板
- 编写高效的 PromQL 查询
- 优化可视化性能和用户体验
- 实施企业级监控解决方案
恭喜您完成了 Prometheus 数据可视化与 Grafana 集成的学习!