概述

Exporter是Prometheus生态系统中的重要组件,负责从各种系统和服务中收集指标数据,并以Prometheus可以理解的格式暴露这些指标。本章将详细介绍各种Exporter的使用方法和自定义Exporter的开发。

学习目标

通过本章学习,你将能够:

- 理解Exporter的工作原理和分类
- 掌握常用Exporter的配置和使用
- 学会开发自定义Exporter
- 了解服务发现和动态配置
- 掌握数据收集的最佳实践

Exporter基础概念

1. Exporter分类和原理

from enum import Enum
from dataclasses import dataclass
from typing import Dict, List, Optional, Any, Union
from datetime import datetime
import json
import time
import random

class ExporterType(Enum):
    """Exporter类型"""
    OFFICIAL = "official"  # 官方Exporter
    COMMUNITY = "community"  # 社区Exporter
    CUSTOM = "custom"  # 自定义Exporter

class MetricFormat(Enum):
    """指标格式"""
    PROMETHEUS = "prometheus"
    OPENMETRICS = "openmetrics"
    JSON = "json"

class CollectionMethod(Enum):
    """收集方法"""
    PULL = "pull"  # 拉取模式
    PUSH = "push"  # 推送模式
    HYBRID = "hybrid"  # 混合模式

@dataclass
class ExporterInfo:
    """Exporter信息"""
    name: str
    type: ExporterType
    port: int
    description: str
    metrics_path: str
    collection_method: CollectionMethod
    supported_versions: List[str]
    dependencies: List[str]
    configuration_file: Optional[str] = None
    docker_image: Optional[str] = None

@dataclass
class MetricDefinition:
    """指标定义"""
    name: str
    type: str  # counter, gauge, histogram, summary
    help: str
    labels: List[str]
    unit: Optional[str] = None

class ExporterCatalog:
    """Exporter目录"""
    
    def __init__(self):
        self.exporters = {}
        self._initialize_catalog()
    
    def _initialize_catalog(self):
        """初始化Exporter目录"""
        self.exporters = {
            "node_exporter": ExporterInfo(
                name="Node Exporter",
                type=ExporterType.OFFICIAL,
                port=9100,
                description="收集系统级指标(CPU、内存、磁盘、网络等)",
                metrics_path="/metrics",
                collection_method=CollectionMethod.PULL,
                supported_versions=["1.6.1", "1.5.0", "1.4.0"],
                dependencies=[],
                docker_image="prom/node-exporter:v1.6.1"
            ),
            "mysql_exporter": ExporterInfo(
                name="MySQL Exporter",
                type=ExporterType.OFFICIAL,
                port=9104,
                description="收集MySQL数据库指标",
                metrics_path="/metrics",
                collection_method=CollectionMethod.PULL,
                supported_versions=["0.15.0", "0.14.0"],
                dependencies=["MySQL 5.6+"],
                configuration_file="my.cnf",
                docker_image="prom/mysqld-exporter:v0.15.0"
            ),
            "redis_exporter": ExporterInfo(
                name="Redis Exporter",
                type=ExporterType.COMMUNITY,
                port=9121,
                description="收集Redis数据库指标",
                metrics_path="/metrics",
                collection_method=CollectionMethod.PULL,
                supported_versions=["1.52.0", "1.51.0"],
                dependencies=["Redis 3.0+"],
                docker_image="oliver006/redis_exporter:v1.52.0"
            ),
            "nginx_exporter": ExporterInfo(
                name="Nginx Exporter",
                type=ExporterType.COMMUNITY,
                port=9113,
                description="收集Nginx Web服务器指标",
                metrics_path="/metrics",
                collection_method=CollectionMethod.PULL,
                supported_versions=["0.11.0", "0.10.0"],
                dependencies=["Nginx with stub_status"],
                docker_image="nginx/nginx-prometheus-exporter:0.11.0"
            ),
            "blackbox_exporter": ExporterInfo(
                name="Blackbox Exporter",
                type=ExporterType.OFFICIAL,
                port=9115,
                description="黑盒监控(HTTP、HTTPS、DNS、TCP、ICMP)",
                metrics_path="/probe",
                collection_method=CollectionMethod.PULL,
                supported_versions=["0.24.0", "0.23.0"],
                dependencies=[],
                configuration_file="blackbox.yml",
                docker_image="prom/blackbox-exporter:v0.24.0"
            ),
            "postgres_exporter": ExporterInfo(
                name="PostgreSQL Exporter",
                type=ExporterType.COMMUNITY,
                port=9187,
                description="收集PostgreSQL数据库指标",
                metrics_path="/metrics",
                collection_method=CollectionMethod.PULL,
                supported_versions=["0.13.2", "0.12.0"],
                dependencies=["PostgreSQL 9.4+"],
                docker_image="prometheuscommunity/postgres-exporter:v0.13.2"
            )
        }
    
    def get_exporter_info(self, name: str) -> Optional[ExporterInfo]:
        """获取Exporter信息"""
        return self.exporters.get(name)
    
    def list_exporters_by_type(self, exporter_type: ExporterType) -> List[ExporterInfo]:
        """按类型列出Exporter"""
        return [exp for exp in self.exporters.values() if exp.type == exporter_type]
    
    def generate_deployment_guide(self, exporter_name: str) -> str:
        """生成部署指南"""
        exporter = self.get_exporter_info(exporter_name)
        if not exporter:
            return f"Exporter '{exporter_name}' not found"
        
        return f"""
{exporter.name} 部署指南

基本信息:
- 类型:{exporter.type.value}
- 端口:{exporter.port}
- 指标路径:{exporter.metrics_path}
- 描述:{exporter.description}

Docker部署:
docker run -d \\
  --name {exporter_name} \\
  -p {exporter.port}:{exporter.port} \\
  {exporter.docker_image or 'N/A'}

Prometheus配置:
scrape_configs:
  - job_name: '{exporter_name}'
    static_configs:
      - targets: ['localhost:{exporter.port}']
    scrape_interval: 15s
    metrics_path: {exporter.metrics_path}

依赖要求:
{chr(10).join(f'- {dep}' for dep in exporter.dependencies) if exporter.dependencies else '- 无特殊依赖'}

支持版本:
{chr(10).join(f'- {ver}' for ver in exporter.supported_versions)}
"""

# 使用示例
catalog = ExporterCatalog()

# 列出所有官方Exporter
official_exporters = catalog.list_exporters_by_type(ExporterType.OFFICIAL)
print(f"官方Exporter数量: {len(official_exporters)}")

# 获取Node Exporter信息
node_exporter = catalog.get_exporter_info("node_exporter")
print(f"\nNode Exporter端口: {node_exporter.port}")

# 生成MySQL Exporter部署指南
mysql_guide = catalog.generate_deployment_guide("mysql_exporter")
print(f"\n{mysql_guide}")

常用Exporter详解

1. Node Exporter

class NodeExporterManager:
    """Node Exporter管理器"""
    
    def __init__(self):
        self.version = "1.6.1"
        self.port = 9100
        self.collectors = []
    
    def generate_installation_script(self) -> str:
        """生成安装脚本"""
        return f"""
#!/bin/bash
# Node Exporter 安装脚本

set -e

VERSION="{self.version}"
PORT="{self.port}"
USER="node_exporter"
GROUP="node_exporter"
INSTALL_DIR="/opt/node_exporter"

echo "安装 Node Exporter $VERSION..."

# 1. 创建用户
sudo groupadd --system $GROUP 2>/dev/null || true
sudo useradd --system --gid $GROUP --no-create-home --shell /bin/false $USER 2>/dev/null || true

# 2. 下载和安装
cd /tmp
wget https://github.com/prometheus/node_exporter/releases/download/v$VERSION/node_exporter-$VERSION.linux-amd64.tar.gz
tar xzf node_exporter-$VERSION.linux-amd64.tar.gz

# 3. 安装二进制文件
sudo mkdir -p $INSTALL_DIR
sudo cp node_exporter-$VERSION.linux-amd64/node_exporter $INSTALL_DIR/
sudo chown $USER:$GROUP $INSTALL_DIR/node_exporter
sudo chmod +x $INSTALL_DIR/node_exporter

# 4. 创建符号链接
sudo ln -sf $INSTALL_DIR/node_exporter /usr/local/bin/node_exporter

# 5. 清理
rm -rf node_exporter-$VERSION.linux-amd64*

echo "Node Exporter 安装完成!"
echo "端口: $PORT"
echo "指标路径: /metrics"
"""
    
    def generate_systemd_service(self) -> str:
        """生成systemd服务文件"""
        return f"""
[Unit]
Description=Node Exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/opt/node_exporter/node_exporter \\
    --web.listen-address=0.0.0.0:{self.port} \\
    --path.procfs=/proc \\
    --path.sysfs=/sys \\
    --path.rootfs=/ \\
    --collector.filesystem.mount-points-exclude='^/(sys|proc|dev|host|etc)($$|/)' \\
    --collector.systemd \\
    --collector.processes

SyslogIdentifier=node_exporter
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
"""
    
    def generate_docker_compose(self) -> str:
        """生成Docker Compose配置"""
        return f"""
version: '3.8'

services:
  node-exporter:
    image: prom/node-exporter:v{self.version}
    container_name: node-exporter
    ports:
      - "{self.port}:{self.port}"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.systemd'
      - '--collector.processes'
      - '--web.listen-address=0.0.0.0:{self.port}'
    restart: unless-stopped
    networks:
      - monitoring
    pid: host

networks:
  monitoring:
    external: true
"""
    
    def get_key_metrics(self) -> Dict[str, List[MetricDefinition]]:
        """获取关键指标定义"""
        return {
            "CPU指标": [
                MetricDefinition(
                    name="node_cpu_seconds_total",
                    type="counter",
                    help="Seconds the CPUs spent in each mode",
                    labels=["cpu", "mode"],
                    unit="seconds"
                ),
                MetricDefinition(
                    name="node_load1",
                    type="gauge",
                    help="1m load average",
                    labels=[]
                ),
                MetricDefinition(
                    name="node_load5",
                    type="gauge",
                    help="5m load average",
                    labels=[]
                ),
                MetricDefinition(
                    name="node_load15",
                    type="gauge",
                    help="15m load average",
                    labels=[]
                )
            ],
            "内存指标": [
                MetricDefinition(
                    name="node_memory_MemTotal_bytes",
                    type="gauge",
                    help="Memory information field MemTotal_bytes",
                    labels=[],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_memory_MemFree_bytes",
                    type="gauge",
                    help="Memory information field MemFree_bytes",
                    labels=[],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_memory_MemAvailable_bytes",
                    type="gauge",
                    help="Memory information field MemAvailable_bytes",
                    labels=[],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_memory_Buffers_bytes",
                    type="gauge",
                    help="Memory information field Buffers_bytes",
                    labels=[],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_memory_Cached_bytes",
                    type="gauge",
                    help="Memory information field Cached_bytes",
                    labels=[],
                    unit="bytes"
                )
            ],
            "磁盘指标": [
                MetricDefinition(
                    name="node_filesystem_size_bytes",
                    type="gauge",
                    help="Filesystem size in bytes",
                    labels=["device", "fstype", "mountpoint"],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_filesystem_free_bytes",
                    type="gauge",
                    help="Filesystem free space in bytes",
                    labels=["device", "fstype", "mountpoint"],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_disk_read_bytes_total",
                    type="counter",
                    help="The total number of bytes read successfully",
                    labels=["device"],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_disk_written_bytes_total",
                    type="counter",
                    help="The total number of bytes written successfully",
                    labels=["device"],
                    unit="bytes"
                )
            ],
            "网络指标": [
                MetricDefinition(
                    name="node_network_receive_bytes_total",
                    type="counter",
                    help="Network device statistic receive_bytes",
                    labels=["device"],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_network_transmit_bytes_total",
                    type="counter",
                    help="Network device statistic transmit_bytes",
                    labels=["device"],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="node_network_receive_packets_total",
                    type="counter",
                    help="Network device statistic receive_packets",
                    labels=["device"]
                ),
                MetricDefinition(
                    name="node_network_transmit_packets_total",
                    type="counter",
                    help="Network device statistic transmit_packets",
                    labels=["device"]
                )
            ]
        }
    
    def create_monitoring_queries(self) -> Dict[str, str]:
        """创建监控查询"""
        return {
            "CPU使用率": """
# CPU使用率(百分比)
100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100)
""",
            "内存使用率": """
# 内存使用率(百分比)
(
  node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
) / node_memory_MemTotal_bytes * 100
""",
            "磁盘使用率": """
# 磁盘使用率(百分比)
(
  node_filesystem_size_bytes - node_filesystem_free_bytes
) / node_filesystem_size_bytes * 100
""",
            "磁盘IO": """
# 磁盘读取速率(字节/秒)
rate(node_disk_read_bytes_total[5m])

# 磁盘写入速率(字节/秒)
rate(node_disk_written_bytes_total[5m])
""",
            "网络流量": """
# 网络接收速率(字节/秒)
rate(node_network_receive_bytes_total[5m])

# 网络发送速率(字节/秒)
rate(node_network_transmit_bytes_total[5m])
""",
            "系统负载": """
# 1分钟负载平均值
node_load1

# 5分钟负载平均值
node_load5

# 15分钟负载平均值
node_load15
"""
        }

# 使用示例
node_manager = NodeExporterManager()

# 生成安装脚本
install_script = node_manager.generate_installation_script()
print("Node Exporter安装脚本已生成")

# 生成systemd服务
systemd_service = node_manager.generate_systemd_service()
print("systemd服务文件已生成")

# 获取关键指标
key_metrics = node_manager.get_key_metrics()
print(f"\n关键指标类别数: {len(key_metrics)}")

for category, metrics in key_metrics.items():
    print(f"\n{category}:")
    for metric in metrics:
        print(f"  - {metric.name}: {metric.help}")

# 获取监控查询
queries = node_manager.create_monitoring_queries()
print("\n监控查询示例:")
for name, query in queries.items():
    print(f"\n{name}:")
    print(query)
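部署完成后,可以先用一个小脚本验证Node Exporter是否正常暴露指标,再将其接入Prometheus。以下是仅依赖标准库的最小示意(假设Exporter监听localhost:9100):

import urllib.request

def check_node_exporter(host: str = "localhost", port: int = 9100) -> bool:
    """抓取/metrics并确认关键指标存在"""
    url = f"http://{host}:{port}/metrics"
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            body = resp.read().decode("utf-8", errors="replace")
        return "node_cpu_seconds_total" in body
    except OSError as e:
        print(f"抓取失败: {e}")
        return False

print("Node Exporter正常" if check_node_exporter() else "Node Exporter异常")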

2. MySQL Exporter

class MySQLExporterManager:
    """MySQL Exporter管理器"""
    
    def __init__(self):
        self.version = "0.15.0"
        self.port = 9104
        self.config_file = "/etc/mysql/my.cnf"
    
    def generate_mysql_user_setup(self) -> str:
        """生成MySQL用户设置脚本"""
        return """
-- MySQL Exporter 用户设置
-- 在MySQL中执行以下SQL语句

-- 1. 创建监控用户
CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'your_password_here' WITH MAX_USER_CONNECTIONS 3;

-- 2. 授予必要权限
GRANT PROCESS ON *.* TO 'exporter'@'localhost';
GRANT REPLICATION CLIENT ON *.* TO 'exporter'@'localhost';
GRANT SELECT ON performance_schema.* TO 'exporter'@'localhost';
GRANT SELECT ON information_schema.* TO 'exporter'@'localhost';

-- 3. 可选:授予额外权限以获取更多指标
GRANT SELECT ON mysql.user TO 'exporter'@'localhost';
GRANT SELECT ON mysql.* TO 'exporter'@'localhost';

-- 4. 刷新权限
FLUSH PRIVILEGES;

-- 5. 验证用户创建
SELECT User, Host FROM mysql.user WHERE User = 'exporter';
"""
    
    def generate_config_file(self) -> str:
        """生成配置文件"""
        return """
# MySQL Exporter 配置文件 (.my.cnf)
[client]
user=exporter
password=your_password_here
host=localhost
port=3306

# 可选配置
[mysql]
default-character-set=utf8mb4

# SSL配置(如果需要)
# ssl-ca=/path/to/ca.pem
# ssl-cert=/path/to/client-cert.pem
# ssl-key=/path/to/client-key.pem
"""
    
    def generate_docker_compose(self) -> str:
        """生成Docker Compose配置"""
        return f"""
version: '3.8'

services:
  mysql-exporter:
    image: prom/mysqld-exporter:v{self.version}
    container_name: mysql-exporter
    ports:
      - "{self.port}:{self.port}"
    volumes:
      - ./my.cnf:/etc/mysql/.my.cnf:ro
    command:
      # 注意:mysqld-exporter 自 v0.15.0 起移除了 DATA_SOURCE_NAME 环境变量,
      # 连接信息需通过配置文件提供
      - '--config.my-cnf=/etc/mysql/.my.cnf'
      - '--web.listen-address=0.0.0.0:{self.port}'
      - '--collect.info_schema.processlist'
      - '--collect.info_schema.innodb_metrics'
      - '--collect.info_schema.innodb_tablespaces'
      - '--collect.info_schema.innodb_cmp'
      - '--collect.info_schema.innodb_cmpmem'
      - '--collect.engine_innodb_status'
      - '--collect.binlog_size'
      - '--collect.info_schema.clientstats'
      - '--collect.info_schema.tablestats'
      - '--collect.info_schema.schemastats'
      - '--collect.perf_schema.eventswaits'
      - '--collect.perf_schema.file_events'
      - '--collect.perf_schema.indexiowaits'
      - '--collect.perf_schema.tableiowaits'
      - '--collect.perf_schema.tablelocks'
    restart: unless-stopped
    networks:
      - monitoring
    depends_on:
      - mysql

  mysql:
    image: mysql:8.0
    container_name: mysql
    environment:
      - MYSQL_ROOT_PASSWORD=rootpassword
      - MYSQL_DATABASE=testdb
      - MYSQL_USER=testuser
      - MYSQL_PASSWORD=testpass
    ports:
      - "3306:3306"
    volumes:
      - mysql_data:/var/lib/mysql
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
    networks:
      - monitoring

volumes:
  mysql_data:

networks:
  monitoring:
    driver: bridge
"""
    
    def get_key_metrics(self) -> Dict[str, List[MetricDefinition]]:
        """获取关键指标定义"""
        return {
            "连接指标": [
                MetricDefinition(
                    name="mysql_global_status_threads_connected",
                    type="gauge",
                    help="The number of currently open connections",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_status_max_used_connections",
                    type="gauge",
                    help="The maximum number of connections that have been in use simultaneously",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_variables_max_connections",
                    type="gauge",
                    help="The maximum permitted number of simultaneous client connections",
                    labels=[]
                )
            ],
            "查询指标": [
                MetricDefinition(
                    name="mysql_global_status_queries",
                    type="counter",
                    help="The number of statements executed by the server",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_status_slow_queries",
                    type="counter",
                    help="The number of queries that have taken more than long_query_time seconds",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_status_questions",
                    type="counter",
                    help="The number of statements executed by the server",
                    labels=[]
                )
            ],
            "InnoDB指标": [
                MetricDefinition(
                    name="mysql_global_status_innodb_buffer_pool_read_requests",
                    type="counter",
                    help="The number of logical read requests",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_status_innodb_buffer_pool_reads",
                    type="counter",
                    help="The number of logical reads that InnoDB could not satisfy from the buffer pool",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_status_innodb_buffer_pool_pages_total",
                    type="gauge",
                    help="The total size of the buffer pool, in pages",
                    labels=[]
                ),
                MetricDefinition(
                    name="mysql_global_status_innodb_buffer_pool_pages_free",
                    type="gauge",
                    help="The number of free pages in the buffer pool",
                    labels=[]
                )
            ],
            "复制指标": [
                MetricDefinition(
                    name="mysql_slave_lag_seconds",
                    type="gauge",
                    help="Lag behind master in seconds",
                    labels=["channel_name"]
                ),
                MetricDefinition(
                    name="mysql_slave_sql_running",
                    type="gauge",
                    help="Whether the SQL thread is running",
                    labels=["channel_name"]
                ),
                MetricDefinition(
                    name="mysql_slave_io_running",
                    type="gauge",
                    help="Whether the IO thread is running",
                    labels=["channel_name"]
                )
            ]
        }
    
    def create_monitoring_queries(self) -> Dict[str, str]:
        """创建监控查询"""
        return {
            "QPS(每秒查询数)": """
# MySQL QPS
rate(mysql_global_status_queries[5m])
""",
            "连接使用率": """
# 连接使用率(百分比)
mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100
""",
            "慢查询率": """
# 慢查询率(百分比)
rate(mysql_global_status_slow_queries[5m]) / rate(mysql_global_status_queries[5m]) * 100
""",
            "InnoDB缓冲池命中率": """
# InnoDB缓冲池命中率(百分比)
(
  mysql_global_status_innodb_buffer_pool_read_requests - 
  mysql_global_status_innodb_buffer_pool_reads
) / mysql_global_status_innodb_buffer_pool_read_requests * 100
""",
            "InnoDB缓冲池使用率": """
# InnoDB缓冲池使用率(百分比)
(
  mysql_global_status_innodb_buffer_pool_pages_total - 
  mysql_global_status_innodb_buffer_pool_pages_free
) / mysql_global_status_innodb_buffer_pool_pages_total * 100
""",
            "复制延迟": """
# 主从复制延迟(秒)
mysql_slave_lag_seconds
""",
            "复制状态": """
# SQL线程状态
mysql_slave_sql_running

# IO线程状态
mysql_slave_io_running
""",
            "表锁等待": """
# 表锁等待时间
rate(mysql_global_status_table_locks_waited[5m])
"""
        }
    
    def create_alerting_rules(self) -> str:
        """创建告警规则"""
        return """
# MySQL Exporter 告警规则
groups:
  - name: mysql_alerts
    rules:
      # MySQL服务不可用
      - alert: MySQLDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "MySQL instance is down"
          description: "MySQL database is down on {{ $labels.instance }}"
      
      # 连接数过高
      - alert: MySQLHighConnections
        expr: mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "MySQL high connections"
          description: "MySQL connections usage is {{ $value }}% on {{ $labels.instance }}"
      
      # 慢查询率过高
      - alert: MySQLHighSlowQueries
        expr: rate(mysql_global_status_slow_queries[5m]) / rate(mysql_global_status_queries[5m]) * 100 > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "MySQL high slow queries rate"
          description: "MySQL slow queries rate is {{ $value }}% on {{ $labels.instance }}"
      
      # InnoDB缓冲池命中率过低
      - alert: MySQLLowBufferPoolHitRate
        expr: |
          (
            mysql_global_status_innodb_buffer_pool_read_requests - 
            mysql_global_status_innodb_buffer_pool_reads
          ) / mysql_global_status_innodb_buffer_pool_read_requests * 100 < 95
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "MySQL low InnoDB buffer pool hit rate"
          description: "MySQL InnoDB buffer pool hit rate is {{ $value }}% on {{ $labels.instance }}"
      
      # 主从复制延迟
      - alert: MySQLReplicationLag
        expr: mysql_slave_lag_seconds > 30
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "MySQL replication lag"
          description: "MySQL replication lag is {{ $value }} seconds on {{ $labels.instance }}"
      
      # 主从复制中断
      - alert: MySQLReplicationStopped
        expr: mysql_slave_sql_running == 0 or mysql_slave_io_running == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "MySQL replication stopped"
          description: "MySQL replication has stopped on {{ $labels.instance }}"
"""

# 使用示例
mysql_manager = MySQLExporterManager()

# 生成MySQL用户设置
user_setup = mysql_manager.generate_mysql_user_setup()
print("MySQL用户设置脚本:")
print(user_setup)

# 生成配置文件
config_file = mysql_manager.generate_config_file()
print("\n配置文件内容:")
print(config_file)

# 获取关键指标
key_metrics = mysql_manager.get_key_metrics()
print(f"\n关键指标类别数: {len(key_metrics)}")

# 获取监控查询
queries = mysql_manager.create_monitoring_queries()
print("\n监控查询示例:")
for name, query in queries.items():
    print(f"\n{name}:")
    print(query)

# 生成告警规则
alert_rules = mysql_manager.create_alerting_rules()
print("\n告警规则:")
print(alert_rules)
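上面InnoDB缓冲池命中率的PromQL公式,也可以用一个数值例子直观验证。下面的函数按同一公式计算(数值为假设值,仅作演示):

def buffer_pool_hit_rate(read_requests: float, reads: float) -> float:
    """按上文PromQL公式计算InnoDB缓冲池命中率(百分比)"""
    if read_requests <= 0:
        return 0.0
    return (read_requests - reads) / read_requests * 100

# 假设100万次逻辑读中有1.2万次未命中缓冲池、需要读磁盘
print(f"{buffer_pool_hit_rate(1_000_000, 12_000):.2f}%")  # 输出: 98.80%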

3. Blackbox Exporter

class BlackboxExporterManager:
    """Blackbox Exporter管理器"""
    
    def __init__(self):
        self.version = "0.24.0"
        self.port = 9115
        self.config_file = "/etc/blackbox_exporter/config.yml"
    
    def generate_config_file(self) -> str:
        """生成配置文件"""
        return """
# Blackbox Exporter 配置文件
modules:
  # HTTP 2xx检查
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
      valid_status_codes: []  # 默认为2xx
      method: GET
      headers:
        Host: example.com
        Accept-Language: en-US
      follow_redirects: true
      fail_if_ssl: false
      fail_if_not_ssl: false
      tls_config:
        insecure_skip_verify: false
      preferred_ip_protocol: "ip4"
  
  # HTTP POST检查
  http_post_2xx:
    prober: http
    timeout: 5s
    http:
      method: POST
      headers:
        Content-Type: application/json
      body: '{"key": "value"}'
      valid_status_codes: [200, 201]
  
  # HTTPS检查
  http_2xx_ssl:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
      method: GET
      fail_if_not_ssl: true
      tls_config:
        insecure_skip_verify: false
  
  # TCP连接检查
  tcp_connect:
    prober: tcp
    timeout: 5s
    tcp:
      preferred_ip_protocol: "ip4"
  
  # ICMP检查
  icmp:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"
      source_ip_address: "0.0.0.0"
  
  # DNS检查
  dns_udp:
    prober: dns
    timeout: 5s
    dns:
      query_name: "example.com"
      query_type: "A"
      valid_rcodes:
        - NOERROR
      validate_answer_rrs:
        fail_if_matches_regexp:
          - ".*127.0.0.1"
        fail_if_not_matches_regexp:
          - "example.com.\t300\tIN\tA\t.*"
      preferred_ip_protocol: "ip4"
      transport_protocol: "udp"
  
  # SSH检查
  ssh_banner:
    prober: tcp
    timeout: 5s
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
      preferred_ip_protocol: "ip4"
  
  # MySQL检查
  mysql_connect:
    prober: tcp
    timeout: 5s
    tcp:
      query_response:
        - send: "\x00\x00\x00\x01"
        - expect: "mysql_native_password"
      preferred_ip_protocol: "ip4"
  
  # Redis检查
  redis_connect:
    prober: tcp
    timeout: 5s
    tcp:
      query_response:
        - send: "PING\r\n"
        - expect: "PONG"
      preferred_ip_protocol: "ip4"
"""
    
    def generate_docker_compose(self) -> str:
        """生成Docker Compose配置"""
        return f"""
version: '3.8'

services:
  blackbox-exporter:
    image: prom/blackbox-exporter:v{self.version}
    container_name: blackbox-exporter
    ports:
      - "{self.port}:{self.port}"
    volumes:
      - ./blackbox.yml:/etc/blackbox_exporter/config.yml:ro
    command:
      - '--config.file=/etc/blackbox_exporter/config.yml'
      - '--web.listen-address=0.0.0.0:{self.port}'
      - '--log.level=info'
    restart: unless-stopped
    networks:
      - monitoring

networks:
  monitoring:
    external: true
"""
    
    def generate_prometheus_config(self) -> str:
        """生成Prometheus配置"""
        return f"""
# Prometheus配置中的Blackbox Exporter抓取配置
scrape_configs:
  # Blackbox Exporter自身监控
  - job_name: 'blackbox-exporter'
    static_configs:
      - targets: ['blackbox-exporter:{self.port}']
  
  # HTTP监控
  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
        - https://example.com
        - https://api.example.com
        - https://www.google.com
        - https://prometheus.io
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:{self.port}
  
  # HTTPS SSL证书监控
  - job_name: 'blackbox-ssl'
    metrics_path: /probe
    params:
      module: [http_2xx_ssl]
    static_configs:
      - targets:
        - https://example.com
        - https://api.example.com
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:{self.port}
  
  # TCP端口监控
  - job_name: 'blackbox-tcp'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
        - example.com:80
        - example.com:443
        - example.com:22
        - database.example.com:3306
        - redis.example.com:6379
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:{self.port}
  
  # ICMP监控
  - job_name: 'blackbox-icmp'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets:
        - 8.8.8.8
        - 8.8.4.4
        - 1.1.1.1
        - example.com
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:{self.port}
  
  # DNS监控
  - job_name: 'blackbox-dns'
    metrics_path: /probe
    params:
      module: [dns_udp]
    static_configs:
      - targets:
        - 8.8.8.8:53
        - 8.8.4.4:53
        - 1.1.1.1:53
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:{self.port}
"""
    
    def get_key_metrics(self) -> Dict[str, List[MetricDefinition]]:
        """获取关键指标定义"""
        return {
            "探测结果": [
                MetricDefinition(
                    name="probe_success",
                    type="gauge",
                    help="Displays whether or not the probe was a success",
                    labels=["instance", "job"]
                ),
                MetricDefinition(
                    name="probe_duration_seconds",
                    type="gauge",
                    help="Returns how long the probe took to complete in seconds",
                    labels=["instance", "job"],
                    unit="seconds"
                )
            ],
            "HTTP指标": [
                MetricDefinition(
                    name="probe_http_status_code",
                    type="gauge",
                    help="Response HTTP status code",
                    labels=["instance", "job"]
                ),
                MetricDefinition(
                    name="probe_http_duration_seconds",
                    type="gauge",
                    help="Duration of http request by phase",
                    labels=["instance", "job", "phase"],
                    unit="seconds"
                ),
                MetricDefinition(
                    name="probe_http_content_length",
                    type="gauge",
                    help="Length of http content response",
                    labels=["instance", "job"],
                    unit="bytes"
                ),
                MetricDefinition(
                    name="probe_http_ssl",
                    type="gauge",
                    help="Indicates if SSL was used for the final redirect",
                    labels=["instance", "job"]
                )
            ],
            "SSL证书指标": [
                MetricDefinition(
                    name="probe_ssl_earliest_cert_expiry",
                    type="gauge",
                    help="Returns earliest SSL cert expiry date",
                    labels=["instance", "job"],
                    unit="seconds"
                ),
                MetricDefinition(
                    name="probe_tls_version_info",
                    type="gauge",
                    help="Contains the TLS version used",
                    labels=["instance", "job", "version"]
                )
            ],
            "DNS指标": [
                MetricDefinition(
                    name="probe_dns_lookup_time_seconds",
                    type="gauge",
                    help="Time taken for DNS lookup",
                    labels=["instance", "job"],
                    unit="seconds"
                ),
                MetricDefinition(
                    name="probe_dns_answer_rrs",
                    type="gauge",
                    help="Number of answer resource records",
                    labels=["instance", "job"]
                )
            ],
            "ICMP指标": [
                MetricDefinition(
                    name="probe_icmp_duration_seconds",
                    type="gauge",
                    help="Duration of ICMP request",
                    labels=["instance", "job"],
                    unit="seconds"
                ),
                MetricDefinition(
                    name="probe_icmp_reply_hop_limit",
                    type="gauge",
                    help="Hop limit of the ICMP reply packet",
                    labels=["instance", "job"]
                )
            ]
        }
    
    def create_monitoring_queries(self) -> Dict[str, str]:
        """创建监控查询"""
        return {
            "服务可用性": """
# 服务可用性(过去24小时平均,百分比)
avg_over_time(probe_success[1d]) * 100
""",
            "响应时间": """
# 平均响应时间
avg(probe_duration_seconds) by (instance)

# P95响应时间(probe_duration_seconds是gauge,没有histogram bucket,
# 改用时间窗口分位数)
quantile_over_time(0.95, probe_duration_seconds[1h])
""",
            "HTTP状态码分布": """
# HTTP状态码分布(probe_http_status_code的值即状态码,用count_values按值聚合)
count_values("code", probe_http_status_code)
""",
            "SSL证书过期时间": """
# SSL证书剩余天数
(probe_ssl_earliest_cert_expiry - time()) / 86400
""",
            "DNS解析时间": """
# DNS解析时间
probe_dns_lookup_time_seconds
""",
            "网络延迟": """
# ICMP延迟
probe_icmp_duration_seconds * 1000  # 转换为毫秒
""",
            "服务异常检测": """
# 检测服务异常(连续失败)
probe_success == 0
""",
            "性能趋势分析": """
# 响应时间趋势(1小时移动平均)
avg_over_time(probe_duration_seconds[1h])
"""
        }
    
    def create_alerting_rules(self) -> str:
        """创建告警规则"""
        return """
# Blackbox Exporter 告警规则
groups:
  - name: blackbox_alerts
    rules:
      # 服务不可用
      - alert: ServiceDown
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 1 minute"
      
      # 响应时间过长
      - alert: SlowResponse
        expr: probe_duration_seconds > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Slow response from {{ $labels.instance }}"
          description: "{{ $labels.instance }} response time is {{ $value }} seconds"
      
      # HTTP状态码异常
      - alert: HttpStatusCodeError
        expr: probe_http_status_code >= 400
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "HTTP error status code"
          description: "{{ $labels.instance }} returned status code {{ $value }}"
      
      # SSL证书即将过期
      - alert: SSLCertExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate for {{ $labels.instance }} expires in {{ $value }} days"
      
      # SSL证书已过期
      - alert: SSLCertExpired
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SSL certificate expired"
          description: "SSL certificate for {{ $labels.instance }} has expired"
      
      # DNS解析失败
      - alert: DNSResolutionFailure
        expr: probe_dns_lookup_time_seconds == 0 and probe_success == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "DNS resolution failure"
          description: "DNS resolution failed for {{ $labels.instance }}"
      
      # 网络延迟过高
      - alert: HighNetworkLatency
        expr: probe_icmp_duration_seconds > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network latency"
          description: "Network latency to {{ $labels.instance }} is {{ $value }} seconds"
"""

# 使用示例
blackbox_manager = BlackboxExporterManager()

# 生成配置文件
config_file = blackbox_manager.generate_config_file()
print("Blackbox Exporter配置文件:")
print(config_file[:500] + "...")

# 生成Prometheus配置
prometheus_config = blackbox_manager.generate_prometheus_config()
print("\nPrometheus配置示例:")
print(prometheus_config[:500] + "...")

# 获取关键指标
key_metrics = blackbox_manager.get_key_metrics()
print(f"\n关键指标类别数: {len(key_metrics)}")

# 获取监控查询
queries = blackbox_manager.create_monitoring_queries()
print("\n监控查询示例:")
for name, query in list(queries.items())[:3]:
    print(f"\n{name}:")
    print(query)

# 生成告警规则
alert_rules = blackbox_manager.create_alerting_rules()
print("\n告警规则示例:")
print(alert_rules[:500] + "...")
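排查探测问题时,可以绕过Prometheus,直接请求Blackbox Exporter的/probe端点查看原始指标输出。以下是一个最小示意(假设Exporter监听localhost:9115,且已加载http_2xx模块):

import urllib.parse
import urllib.request

def probe(target: str, module: str = "http_2xx",
          exporter: str = "localhost:9115") -> bool:
    """手动调用Blackbox Exporter的/probe端点探测目标"""
    qs = urllib.parse.urlencode({"target": target, "module": module})
    with urllib.request.urlopen(f"http://{exporter}/probe?{qs}", timeout=10) as resp:
        body = resp.read().decode()
    # 返回的文本指标中,"probe_success 1"表示探测成功
    return "probe_success 1" in body

print(probe("https://prometheus.io"))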

自定义Exporter开发

1. Python Exporter开发

class CustomExporterDeveloper:
    """自定义Exporter开发器"""
    
    def __init__(self):
        self.port = 8000
        self.metrics_path = "/metrics"
    
    def generate_simple_exporter(self) -> str:
        """生成简单的Python Exporter"""
        return f"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
简单的自定义Prometheus Exporter
"""

import time
import random
import psutil
from prometheus_client import start_http_server, Gauge, Counter, Histogram, Info
from prometheus_client.core import CollectorRegistry
import logging
from concurrent.futures import ThreadPoolExecutor
from flask import Flask, Response
import argparse

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

上面的方法返回一段可直接运行的简单Exporter代码。更复杂的高级Exporter通常还支持配置文件、多线程采集和健康检查等功能,下面的ConfigManager和MetricsCollector就是这类Exporter的两个核心构件(注意补充其依赖的导入):

import logging
from pathlib import Path

import yaml
from prometheus_client.core import CollectorRegistry

logger = logging.getLogger(__name__)

class ConfigManager:
    """配置管理器"""
    
    def __init__(self, config_file: str = 'config.yaml'):
        self.config_file = config_file
        self.config = self.load_config()
    
    def load_config(self) -> Dict[str, Any]:
        """加载配置文件"""
        try:
            if Path(self.config_file).exists():
                with open(self.config_file, 'r', encoding='utf-8') as f:
                    if self.config_file.endswith('.yaml') or self.config_file.endswith('.yml'):
                        return yaml.safe_load(f)
                    else:
                        return json.load(f)
            else:
                return self.get_default_config()
        except Exception as e:
            logger.error(f"Error loading config: {e}")
            return self.get_default_config()
    
    def get_default_config(self) -> Dict[str, Any]:
        """获取默认配置"""
        return {
            'server': {
                'port': 8000,
                'host': '0.0.0.0',
                'metrics_path': '/metrics'
            },
            'collection': {
                'interval': 15,
                'timeout': 10,
                'enabled_collectors': ['system', 'application', 'custom']
            },
            'targets': {
                'api_endpoints': [
                    'http://localhost:8080/api/health',
                    'http://localhost:8081/api/status'
                ],
                'databases': [
                    {
                        'type': 'mysql',
                        'host': 'localhost',
                        'port': 3306,
                        'database': 'test'
                    }
                ]
            },
            'logging': {
                'level': 'INFO',
                'file': 'exporter.log'
            }
        }

class MetricsCollector:
    """指标收集器基类"""
    
    def __init__(self, name: str, config: Dict[str, Any]):
        self.name = name
        self.config = config
        self.enabled = config.get('enabled', True)
        self.registry = CollectorRegistry()
        self.setup_metrics()
    
    def setup_metrics(self):
        """设置指标(子类实现)"""
        pass
    
    def collect(self):
        """收集指标(子类实现)"""
        pass

# 使用示例
developer = CustomExporterDeveloper()

# 生成简单Exporter
simple_exporter = developer.generate_simple_exporter()
print("简单Exporter代码已生成")

# 预览生成的Exporter代码
print("\n生成代码预览:")
print(simple_exporter[:500] + "...")
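除了上面这种后台循环set/inc指标的写法,prometheus_client还支持注册自定义Collector:collect()方法在每次/metrics被抓取时才执行,更贴近Prometheus的拉取语义。一个最小示意(demo_queue_depth为假设的业务指标):

from prometheus_client import start_http_server
from prometheus_client.core import GaugeMetricFamily, REGISTRY

class QueueDepthCollector:
    """自定义Collector:每次抓取时实时采集"""

    def collect(self):
        g = GaugeMetricFamily('demo_queue_depth', 'Current queue depth', labels=['queue'])
        g.add_metric(['orders'], 42)  # 演示用固定值,实际应查询业务系统
        yield g

REGISTRY.register(QueueDepthCollector())
# start_http_server(8000)  # 启动后访问/metrics即可看到demo_queue_depth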

服务发现与动态配置

1. 服务发现机制

class ServiceDiscoveryManager:
    """服务发现管理器"""
    
    def __init__(self):
        self.discovery_methods = {}
        self._initialize_discovery_methods()
    
    def _initialize_discovery_methods(self):
        """初始化服务发现方法"""
        self.discovery_methods = {
            "static": self.generate_static_config,
            "consul": self.generate_consul_config,
            "kubernetes": self.generate_kubernetes_config,
            "dns": self.generate_dns_config,
            "file": self.generate_file_config,
            "ec2": self.generate_ec2_config
        }
    
    def generate_static_config(self) -> str:
        """生成静态配置"""
        return """
# 静态服务发现配置
scrape_configs:
  - job_name: 'static-targets'
    static_configs:
      - targets:
        - 'localhost:9100'  # Node Exporter
        - 'localhost:9104'  # MySQL Exporter
        - 'localhost:9121'  # Redis Exporter
        labels:
          environment: 'production'
          datacenter: 'dc1'
      
      - targets:
        - 'web1.example.com:8080'
        - 'web2.example.com:8080'
        labels:
          service: 'web'
          environment: 'production'
"""
    
    def generate_consul_config(self) -> str:
        """生成Consul服务发现配置"""
        return """
# Consul服务发现配置
scrape_configs:
  - job_name: 'consul-services'
    consul_sd_configs:
      - server: 'consul.example.com:8500'
        datacenter: 'dc1'
        services:
          - 'web'
          - 'api'
          - 'database'
        tags:
          - 'prometheus'
          - 'monitoring'
        scheme: 'http'
        allow_stale: true
        refresh_interval: 30s
    
    relabel_configs:
      # 使用服务名作为job标签
      - source_labels: [__meta_consul_service]
        target_label: job
      
      # 使用Consul标签
      - source_labels: [__meta_consul_tags]
        regex: '.*,prometheus,.*'
        action: keep
      
      # 设置实例标签
      - source_labels: [__meta_consul_service_address, __meta_consul_service_port]
        separator: ':'
        target_label: instance
      
      # 添加环境标签
      - source_labels: [__meta_consul_datacenter]
        target_label: datacenter
"""
    
    def generate_kubernetes_config(self) -> str:
        """生成Kubernetes服务发现配置"""
        return """
# Kubernetes服务发现配置
scrape_configs:
  # Pod发现
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - default
            - monitoring
            - production
    
    relabel_configs:
      # 只抓取有prometheus.io/scrape=true注解的Pod
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      
      # 使用自定义路径
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      
      # 使用自定义端口
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\\d+)?;(\\d+)
        replacement: $1:$2
        target_label: __address__
      
      # 添加Pod标签
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      
      # 添加命名空间和Pod名称
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
  
  # Service发现
  - job_name: 'kubernetes-services'
    kubernetes_sd_configs:
      - role: service
    
    relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\\d+)?;(\\d+)
        replacement: $1:$2
      
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_service_name
  
  # Ingress发现
  - job_name: 'kubernetes-ingresses'
    kubernetes_sd_configs:
      - role: ingress
    
    relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __address__
      
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
"""
    
    def generate_dns_config(self) -> str:
        """生成DNS服务发现配置"""
        return """
# DNS服务发现配置
scrape_configs:
  - job_name: 'dns-discovery'
    dns_sd_configs:
      - names:
        - '_prometheus._tcp.example.com'
        - '_node-exporter._tcp.example.com'
        - '_mysql-exporter._tcp.example.com'
        type: 'SRV'  # SRV记录本身携带端口;只有A/AAAA查询才需要显式指定port
        refresh_interval: 30s
    
    relabel_configs:
      # 使用DNS记录名称作为job
      - source_labels: [__meta_dns_name]
        regex: '_([^.]+)._tcp.(.+)'
        replacement: '${1}'
        target_label: job
      
      # 添加域名标签
      - source_labels: [__meta_dns_name]
        regex: '_[^.]+._tcp.(.+)'
        replacement: '${1}'
        target_label: domain
"""
    
    def generate_file_config(self) -> str:
        """生成文件服务发现配置"""
        return """
# 文件服务发现配置
scrape_configs:
  - job_name: 'file-discovery'
    file_sd_configs:
      - files:
        - '/etc/prometheus/targets/*.json'
        - '/etc/prometheus/targets/*.yml'
        refresh_interval: 10s
    
    relabel_configs:
      # 使用文件中的标签
      - source_labels: [__meta_filepath]
        regex: '.*/([^/]+)\\.(json|yml)'
        replacement: '${1}'
        target_label: file_source
"""
    
    def generate_target_files(self) -> Dict[str, str]:
        """生成目标文件示例"""
        return {
            "web_servers.json": """
[
  {
    "targets": ["web1.example.com:8080", "web2.example.com:8080"],
    "labels": {
      "job": "web",
      "environment": "production",
      "datacenter": "dc1"
    }
  },
  {
    "targets": ["api1.example.com:8080", "api2.example.com:8080"],
    "labels": {
      "job": "api",
      "environment": "production",
      "datacenter": "dc1"
    }
  }
]
""",
            "databases.yml": """
- targets:
  - mysql1.example.com:9104
  - mysql2.example.com:9104
  labels:
    job: mysql
    environment: production
    datacenter: dc1
    
- targets:
  - redis1.example.com:9121
  - redis2.example.com:9121
  labels:
    job: redis
    environment: production
    datacenter: dc1
"""
        }
    
    def generate_ec2_config(self) -> str:
        """生成EC2服务发现配置"""
        return """
# EC2服务发现配置
scrape_configs:
  - job_name: 'ec2-discovery'
    ec2_sd_configs:
      - region: us-west-2
        access_key: 'your-access-key'
        secret_key: 'your-secret-key'
        port: 9100
        filters:
          - name: 'tag:Environment'
            values: ['production', 'staging']
          - name: 'tag:Monitoring'
            values: ['enabled']
          - name: 'instance-state-name'
            values: ['running']
        refresh_interval: 60s
    
    relabel_configs:
      # 使用实例ID作为实例标签
      - source_labels: [__meta_ec2_instance_id]
        target_label: instance_id
      
      # 使用私有IP
      - source_labels: [__meta_ec2_private_ip]
        target_label: __address__
        replacement: '${1}:9100'
      
      # 添加EC2标签
      - action: labelmap
        regex: __meta_ec2_tag_(.+)
        replacement: ec2_tag_${1}
      
      # 添加可用区
      - source_labels: [__meta_ec2_availability_zone]
        target_label: availability_zone
      
      # 添加实例类型
      - source_labels: [__meta_ec2_instance_type]
        target_label: instance_type
"""

# 使用示例
sd_manager = ServiceDiscoveryManager()

# 生成各种服务发现配置
static_config = sd_manager.generate_static_config()
consul_config = sd_manager.generate_consul_config()
kubernetes_config = sd_manager.generate_kubernetes_config()

print("服务发现配置已生成")
print(f"\n静态配置示例:")
print(static_config[:300] + "...")

print(f"\nConsul配置示例:")
print(consul_config[:300] + "...")

# 生成目标文件
target_files = sd_manager.generate_target_files()
print(f"\n目标文件数量: {len(target_files)}")

数据收集最佳实践

1. 性能优化

class DataCollectionOptimizer:
    """数据收集优化器"""
    
    def __init__(self):
        self.optimization_strategies = {}
        self._initialize_strategies()
    
    def _initialize_strategies(self):
        """初始化优化策略"""
        self.optimization_strategies = {
            "config": self.generate_optimized_config,
            "metric_filtering": self.generate_metric_filtering_rules,
            "recording_rules": self.generate_recording_rules,
            "checklist": self.generate_optimization_checklist
        }
    
    def generate_optimized_config(self) -> str:
        """生成优化配置"""
        return """
# 优化的Prometheus配置
global:
  # 全局抓取间隔(根据需求调整)
  scrape_interval: 15s
  # 全局评估间隔
  evaluation_interval: 15s
  # 外部标签(用于联邦和远程存储)
  external_labels:
    cluster: 'production'
    region: 'us-west-2'

# 规则文件
rule_files:
  - "rules/*.yml"

# 抓取配置
scrape_configs:
  # 高频监控(关键服务)
  - job_name: 'critical-services'
    scrape_interval: 5s
    scrape_timeout: 3s
    static_configs:
      - targets: ['api.example.com:8080']
    metric_relabel_configs:
      # 只保留关键指标
      - source_labels: [__name__]
        regex: '(http_requests_total|http_request_duration_seconds|up)'
        action: keep
  
  # 中频监控(一般服务)
  - job_name: 'standard-services'
    scrape_interval: 15s
    scrape_timeout: 10s
    static_configs:
      - targets: ['web1.example.com:8080', 'web2.example.com:8080']
    metric_relabel_configs:
      # 过滤掉高基数标签
      - source_labels: [__name__]
        regex: '.*_bucket'
        action: drop
      # 限制标签值
      - source_labels: [status_code]
        regex: '[45]..'
        replacement: '4xx_5xx'
        target_label: status_code
  
  # 低频监控(基础设施)
  - job_name: 'infrastructure'
    scrape_interval: 60s
    scrape_timeout: 30s
    static_configs:
      - targets: ['node1:9100', 'node2:9100']
    metric_relabel_configs:
      # 删除不需要的指标
      - source_labels: [__name__]
        regex: 'node_scrape_collector_.*'
        action: drop
      # 删除高基数标签(注意:labeldrop按标签名匹配,会作用于该job的所有指标;
      # 若查询仍需按device区分磁盘/网络设备,请勿直接丢弃该标签)
      - regex: 'device'
        action: labeldrop

# 存储相关参数通过命令行标志设置,而不是写在prometheus.yml中,例如:
#   --storage.tsdb.retention.time=30d    # 数据保留时间
#   --storage.tsdb.retention.size=100GB  # 数据保留大小上限
#   --storage.tsdb.wal-compression       # 启用WAL压缩
#   --storage.tsdb.min-block-duration=2h
#   --storage.tsdb.max-block-duration=25h

# 远程写入配置(可选)
remote_write:
  - url: 'https://remote-storage.example.com/api/v1/write'
    queue_config:
      capacity: 10000
      max_samples_per_send: 2000
      batch_send_deadline: 5s
      min_backoff: 30ms
      max_backoff: 100ms
    write_relabel_configs:
      # 只发送关键指标到远程存储
      - source_labels: [__name__]
        regex: '(up|http_requests_total|node_cpu_seconds_total)'
        action: keep
"""
    
    def generate_metric_filtering_rules(self) -> str:
        """生成指标过滤规则"""
        return """
# 指标过滤规则示例
metric_relabel_configs:
  # 1. 删除不需要的指标
  - source_labels: [__name__]
    regex: '(go_.*|process_.*|promhttp_.*)'
    action: drop
  
  # 2. 重命名指标
  - source_labels: [__name__]
    regex: 'http_request_duration_seconds'
    replacement: 'http_response_time_seconds'
    target_label: __name__
  
  # 3. 标签值标准化
  - source_labels: [method]
    regex: '(GET|POST|PUT|DELETE)'
    action: keep
  
  - source_labels: [status_code]
    regex: '2..'
    replacement: '2xx'
    target_label: status_code
  
  - source_labels: [status_code]
    regex: '4..'
    replacement: '4xx'
    target_label: status_code
  
  - source_labels: [status_code]
    regex: '5..'
    replacement: '5xx'
    target_label: status_code
  
  # 4. 删除高基数标签
  - regex: '(user_id|session_id|request_id)'
    action: labeldrop
  
  # 5. 限制标签长度
  - source_labels: [path]
    regex: '(.{50}).*'
    replacement: '${1}...'
    target_label: path
  
  # 6. 合并相似标签
  - source_labels: [instance]
    regex: '([^:]+):.*'
    replacement: '${1}'
    target_label: host
"""
    
    def generate_recording_rules(self) -> str:
        """生成记录规则"""
        return """
# 记录规则 - 预计算常用查询
groups:
  - name: performance_rules
    interval: 30s
    rules:
      # HTTP请求速率
      - record: http:request_rate_5m
        expr: |
          sum(rate(http_requests_total[5m])) by (job, instance, method, status_code)
      
      # HTTP错误率
      - record: http:error_rate_5m
        expr: |
          sum(rate(http_requests_total{status_code=~"[45].."}[5m])) by (job, instance)
          /
          sum(rate(http_requests_total[5m])) by (job, instance)
      
      # HTTP P95响应时间
      - record: http:response_time_p95_5m
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (job, instance, le)
          )
      
      # CPU使用率
      - record: node:cpu_utilization_5m
        expr: |
          100 - (
            avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100
          )
      
      # 内存使用率
      - record: node:memory_utilization
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes * 100
      
      # 磁盘使用率
      - record: node:disk_utilization
        expr: |
          (
            node_filesystem_size_bytes - node_filesystem_free_bytes
          ) / node_filesystem_size_bytes * 100
  
  - name: business_rules
    interval: 60s
    rules:
      # 业务指标聚合
      - record: business:orders_per_minute
        expr: |
          sum(rate(orders_total[1m])) by (region, product_type)
      
      - record: business:revenue_per_hour
        expr: |
          sum(increase(revenue_total[1h])) by (currency, region)
      
      - record: business:active_users_5m
        expr: |
          sum(active_users) by (region, platform)
"""
    
    def generate_optimization_checklist(self) -> List[str]:
        """生成优化检查清单"""
        return [
            "✓ 根据服务重要性设置不同的抓取间隔",
            "✓ 使用metric_relabel_configs过滤不需要的指标",
            "✓ 删除或合并高基数标签",
            "✓ 使用记录规则预计算常用查询",
            "✓ 配置适当的数据保留策略",
            "✓ 启用WAL压缩减少磁盘使用",
            "✓ 使用远程存储进行长期数据保存",
            "✓ 监控Prometheus自身的性能指标",
            "✓ 定期清理不再使用的指标和标签",
            "✓ 使用联邦机制分散负载",
            "✓ 配置适当的查询超时时间",
            "✓ 使用标签标准化减少基数",
            "✓ 实施指标命名规范",
            "✓ 定期审查和优化抓取配置",
            "✓ 监控存储使用情况和增长趋势"
        ]

# 使用示例
optimizer = DataCollectionOptimizer()

# 生成优化配置
optimized_config = optimizer.generate_optimized_config()
print("优化配置已生成")

# 生成过滤规则
filtering_rules = optimizer.generate_metric_filtering_rules()
print("\n指标过滤规则:")
print(filtering_rules[:300] + "...")

# 生成记录规则
recording_rules = optimizer.generate_recording_rules()
print("\n记录规则:")
print(recording_rules[:300] + "...")

# 获取优化检查清单
checklist = optimizer.generate_optimization_checklist()
print("\n优化检查清单:")
for item in checklist[:5]:
    print(item)
print(f"... 共{len(checklist)}项")

总结

通过本章学习,我们深入了解了Prometheus生态系统中Exporter的重要作用和使用方法:

关键要点

  1. Exporter分类理解

    • 官方Exporter:Node、MySQL、Blackbox等
    • 社区Exporter:Redis、Nginx、PostgreSQL等
    • 自定义Exporter:根据业务需求开发
  2. 常用Exporter掌握

    • Node Exporter:系统级监控的基础
    • MySQL Exporter:数据库性能监控
    • Blackbox Exporter:黑盒监控和可用性检测
  3. 自定义开发能力

    • Python客户端库使用
    • 指标类型选择和设计
    • 配置管理和错误处理

最佳实践

  1. 性能优化

    • 合理设置抓取间隔
    • 过滤不必要的指标
    • 控制标签基数
  2. 服务发现

    • 选择合适的发现机制
    • 配置标签重写规则
    • 实现动态配置更新
  3. 数据质量

    • 标准化指标命名
    • 实施标签规范
    • 定期清理和优化

下一步学习建议

  1. 深入实践:在实际环境中部署和配置各种Exporter
  2. 自定义开发:根据业务需求开发专用Exporter
  3. 性能调优:学习Prometheus性能优化技巧
  4. 集成扩展:探索与其他监控工具的集成方案

掌握了Exporter和数据收集的核心技能后,你就可以构建完整的监控体系,为下一章的告警和可视化打下坚实基础。

import time import json import yaml import threading import requests from pathlib import Path from typing import Dict, List, Any from prometheus_client import start_http_server, Gauge, Counter, Histogram, Info, Summary from prometheus_client.core import CollectorRegistry