1. 日志系统概述
1.1 日志的重要性
在OpenResty应用中,日志系统是运维和调试的重要工具: - 问题诊断:快速定位和解决问题 - 性能监控:分析系统性能瓶颈 - 安全审计:记录安全相关事件 - 业务分析:了解用户行为和业务趋势 - 合规要求:满足法规和审计要求
1.2 OpenResty日志架构
┌─────────────────┐
│ 应用请求 │
└─────────┬───────┘
│
┌─────────▼───────┐
│ Nginx日志 │ ← access.log, error.log
├─────────────────┤
│ Lua应用日志 │ ← 自定义业务日志
├─────────────────┤
│ 结构化日志 │ ← JSON格式日志
├─────────────────┤
│ 日志收集器 │ ← Filebeat, Fluentd
├─────────────────┤
│ 日志存储 │ ← Elasticsearch, ClickHouse
├─────────────────┤
│ 日志分析 │ ← Kibana, Grafana
└─────────────────┘
1.3 日志级别
OpenResty支持多种日志级别: - emerg:紧急情况,系统不可用 - alert:需要立即采取行动 - crit:严重错误 - error:错误信息 - warn:警告信息 - notice:正常但重要的信息 - info:一般信息 - debug:调试信息
2. Nginx日志配置
2.1 访问日志配置
# nginx.conf
http {
# 定义日志格式
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
# JSON格式日志
log_format json_combined escape=json
'{
"timestamp": "$time_iso8601",
"remote_addr": "$remote_addr",
"remote_user": "$remote_user",
"request": "$request",
"status": $status,
"body_bytes_sent": $body_bytes_sent,
"http_referer": "$http_referer",
"http_user_agent": "$http_user_agent",
"http_x_forwarded_for": "$http_x_forwarded_for",
"request_time": $request_time,
"upstream_response_time": "$upstream_response_time",
"upstream_addr": "$upstream_addr",
"request_id": "$request_id"
}';
# 性能监控日志格式
log_format performance '$time_iso8601\t$remote_addr\t$request_method\t'
'$uri\t$status\t$request_time\t$upstream_response_time\t'
'$body_bytes_sent\t$http_user_agent';
# 安全日志格式
log_format security '$time_iso8601\t$remote_addr\t$request\t$status\t'
'$http_user_agent\t$http_x_forwarded_for\t'
'$request_length\t$request_time';
server {
listen 80;
server_name example.com;
# 访问日志
access_log /var/log/nginx/access.log main;
access_log /var/log/nginx/access_json.log json_combined;
# 错误日志
error_log /var/log/nginx/error.log warn;
location /api/ {
# API专用日志
access_log /var/log/nginx/api_access.log json_combined;
proxy_pass http://backend;
}
location /admin/ {
# 管理员访问日志
access_log /var/log/nginx/admin_access.log security;
proxy_pass http://admin_backend;
}
}
}
2.2 条件日志记录
# 根据条件记录日志
server {
# 定义变量
set $log_flag 1;
# 排除健康检查
if ($request_uri = "/health") {
set $log_flag 0;
}
# 排除静态资源
if ($request_uri ~* "\.(css|js|png|jpg|gif|ico)$") {
set $log_flag 0;
}
# 条件日志记录
access_log /var/log/nginx/access.log main if=$log_flag;
# 错误日志条件记录
location / {
access_log /var/log/nginx/app.log json_combined if=$log_flag;
# 记录慢请求:注意 nginx 的 if 指令不支持数值比较("$request_time > 1" 不是合法条件),
# 且 if 块在 rewrite 阶段求值,此时 $request_time 尚未是最终值。
# 正确做法是在 http 块中用 map 定义标志变量,再配合 access_log 的 if= 参数(log 阶段求值):
#   map $request_time $slow_flag { "~^[1-9]" 1; default 0; }
access_log /var/log/nginx/slow.log performance if=$slow_flag;
proxy_pass http://backend;
}
}
3. Lua日志处理
3.1 基础日志模块
-- Logging module: level-filtered, JSON- or text-formatted application
-- logging built on ngx.log.  Per-request fields (request id, client
-- address, URI, method, authenticated user) are attached automatically.
local logger = {}
local cjson = require "cjson"
-- NOTE(review): the original also required "resty.lock" here but never
-- used it; the unused dependency has been dropped.

-- Symbolic level name -> OpenResty numeric constant.
-- In OpenResty a SMALLER constant is MORE severe (ngx.EMERG < ngx.DEBUG).
local log_levels = {
    DEBUG = ngx.DEBUG,
    INFO = ngx.INFO,
    NOTICE = ngx.NOTICE,
    WARN = ngx.WARN,
    ERR = ngx.ERR,
    CRIT = ngx.CRIT,
    ALERT = ngx.ALERT,
    EMERG = ngx.EMERG
}

-- Module-wide configuration.
local log_config = {
    level = "INFO",            -- minimum severity that is emitted
    format = "json",           -- "json" or "text"
    include_trace = true,      -- attach caller source/line to each entry
    max_line_length = 4096,    -- longer formatted lines are truncated
    buffer_size = 1024 * 1024, -- 1MB (reserved for buffered back ends)
    flush_interval = 5         -- seconds (reserved for buffered back ends)
}

-- Return { source, line } describing the user call site, or nil when
-- tracing is disabled or the stack is shallower than expected.
-- Stack levels seen from here: 1 = get_trace_info, 2 = format_message,
-- 3 = log, 4 = public wrapper (logger.info etc.), 5 = actual caller.
-- The original used level 4, which always reported the wrapper inside
-- this module instead of the real call site.
local function get_trace_info()
    if not log_config.include_trace then
        return nil
    end
    local info = debug.getinfo(5, "Sl")
    if info then
        return {
            source = info.source,
            line = info.currentline
        }
    end
    return nil
end

-- Build the final log line (JSON or plain text) for one entry.
local function format_message(level, message, context)
    local log_entry = {
        timestamp = ngx.utctime(),
        level = level,
        message = message,
        request_id = ngx.var.request_id,
        remote_addr = ngx.var.remote_addr,
        uri = ngx.var.uri,
        method = ngx.var.request_method
    }
    if context then
        log_entry.context = context
    end
    -- Attach the authenticated user when an earlier phase stored it.
    if ngx.ctx.user then
        log_entry.user = {
            user_id = ngx.ctx.user.user_id,
            username = ngx.ctx.user.username
        }
    end
    local trace = get_trace_info()
    if trace then
        log_entry.trace = trace
    end
    if log_config.format == "json" then
        return cjson.encode(log_entry)
    else
        return string.format("[%s] %s %s - %s",
            log_entry.timestamp, level, log_entry.request_id or "-", message)
    end
end

-- A message is emitted when it is at least as severe as the configured
-- threshold.  Severity grows as the numeric constant SHRINKS, so the
-- comparison must be "<=".  (The original used ">=", which suppressed
-- ERR/WARN messages while letting DEBUG through when level = "INFO".)
local function should_log(level)
    local current_level = log_levels[log_config.level] or ngx.INFO
    local target_level = log_levels[level] or ngx.INFO
    return target_level <= current_level
end

-- Core routine shared by the public per-level helpers below.
local function log(level, message, context)
    if not should_log(level) then
        return
    end
    local formatted_message = format_message(level, message, context)
    -- Keep lines bounded so one oversized entry cannot flood the log.
    if #formatted_message > log_config.max_line_length then
        formatted_message = string.sub(formatted_message, 1, log_config.max_line_length - 3) .. "..."
    end
    ngx.log(log_levels[level], formatted_message)
end

-- Public per-level helpers.
function logger.debug(message, context)
    log("DEBUG", message, context)
end

function logger.info(message, context)
    log("INFO", message, context)
end

function logger.warn(message, context)
    log("WARN", message, context)
end

function logger.error(message, context)
    log("ERR", message, context)
end

function logger.critical(message, context)
    log("CRIT", message, context)
end

-- Emit a structured (always-JSON) event, bypassing format_message.
function logger.structured(level, event_type, data)
    if not should_log(level) then
        return
    end
    local log_entry = {
        timestamp = ngx.utctime(),
        level = level,
        event_type = event_type,
        request_id = ngx.var.request_id,
        data = data or {}
    }
    ngx.log(log_levels[level], cjson.encode(log_entry))
end

-- Record a timed operation (duration in milliseconds).
function logger.performance(operation, duration, details)
    logger.structured("INFO", "performance", {
        operation = operation,
        duration_ms = duration,
        details = details
    })
end

-- Record a business-domain event.
function logger.business(event, data)
    logger.structured("INFO", "business", {
        event = event,
        data = data
    })
end

-- Record a security-relevant event together with client identity.
function logger.security(event, details)
    logger.structured("WARN", "security", {
        event = event,
        ip = ngx.var.remote_addr,
        user_agent = ngx.var.http_user_agent,
        details = details
    })
end

-- Record a caught error with a full stack trace.
function logger.exception(err, context)
    logger.structured("ERR", "exception", {
        error = tostring(err),
        context = context,
        stack_trace = debug.traceback()
    })
end

return logger
3.2 异步日志处理
-- Asynchronous logging: entries are buffered in memory and flushed to a
-- file by a recurring timer, keeping file I/O off the request path.
-- NOTE(review): the buffer is per worker process; each worker appends
-- its own entries to the shared file.
local async_logger = {}
local cjson = require "cjson"
local resty_lock = require "resty.lock"

-- In-memory buffer (per worker).
local log_buffer = {}
local buffer_size = 0
local max_buffer_size = 1024 * 1024 -- 1MB
local flush_interval = 5            -- seconds
local last_flush_time = ngx.time()

-- Start a self-rescheduling timer that flushes the buffer periodically.
local function init_flush_timer()
    local function flush_logs(premature)
        if premature then
            -- Worker is shutting down; do not reschedule.
            return
        end
        async_logger.flush_buffer()
        local ok, err = ngx.timer.at(flush_interval, flush_logs)
        if not ok then
            ngx.log(ngx.ERR, "Failed to create flush timer: ", err)
        end
    end
    local ok, err = ngx.timer.at(flush_interval, flush_logs)
    if not ok then
        ngx.log(ngx.ERR, "Failed to create initial flush timer: ", err)
    end
end

-- Append one entry (encoded as a JSON line) to the buffer; flush
-- immediately once the size limit is reached.
function async_logger.add_to_buffer(log_entry)
    local log_line = cjson.encode(log_entry) .. "\n"
    log_buffer[#log_buffer + 1] = log_line
    buffer_size = buffer_size + #log_line
    if buffer_size >= max_buffer_size then
        async_logger.flush_buffer()
    end
end

-- Swap out the current buffer under a lock and write it from a 0-delay
-- timer so the caller is never blocked on file I/O.
function async_logger.flush_buffer()
    if #log_buffer == 0 then
        return
    end
    -- The lock prevents concurrent flushes (requires a "log_locks"
    -- lua_shared_dict to be configured in nginx.conf).
    local lock = resty_lock:new("log_locks")
    local elapsed, err = lock:lock("flush_buffer")
    if not elapsed then
        ngx.log(ngx.ERR, "Failed to acquire flush lock: ", err)
        return
    end
    -- Detach the buffer instead of copying it element by element
    -- (the original looped over the table to build a copy).
    local logs_to_flush = log_buffer
    log_buffer = {}
    buffer_size = 0
    last_flush_time = ngx.time()
    local ok, unlock_err = lock:unlock()
    if not ok then
        -- The original discarded unlock failures silently.
        ngx.log(ngx.ERR, "Failed to release flush lock: ", unlock_err)
    end
    ngx.timer.at(0, function(premature)
        if premature then
            return
        end
        async_logger.write_logs(logs_to_flush)
    end)
end

-- Append the collected lines to the log file.
function async_logger.write_logs(logs)
    local log_file = "/var/log/openresty/app.log"
    local file, err = io.open(log_file, "a")
    if not file then
        ngx.log(ngx.ERR, "Failed to open log file: ", err)
        return
    end
    -- One concatenated write instead of a write call per line.
    file:write(table.concat(logs))
    file:close()
end

-- Public entry point: queue a log entry for asynchronous writing.
function async_logger.log(level, message, context)
    local log_entry = {
        timestamp = ngx.utctime(),
        level = level,
        message = message,
        context = context,
        request_id = ngx.var.request_id,
        worker_pid = ngx.worker.pid()
    }
    async_logger.add_to_buffer(log_entry)
end

-- Call once per worker (e.g. from init_worker_by_lua) to start flushing.
function async_logger.init()
    init_flush_timer()
end

return async_logger
3.3 日志轮转处理
-- Log rotation: size- and age-based rotation of log files, with optional
-- gzip compression and retention of a bounded number of rotated files.
-- SECURITY NOTE(review): file paths are interpolated into shell commands.
-- They are single-quoted, but a path containing a single quote would
-- still break out; only pass trusted, operator-controlled paths.
local log_rotation = {}
local os_date = os.date
local os_time = os.time

-- Rotation policy.
local rotation_config = {
    max_size = 100 * 1024 * 1024, -- rotate once the file exceeds 100MB
    max_files = 10,               -- rotated files to keep per log
    rotation_time = "daily",      -- "daily", "hourly" or "weekly"
    compress = true               -- gzip rotated files
}

-- Run a shell command and report success.  os.execute returns a number
-- on Lua 5.1/LuaJIT and true/nil on Lua 5.2+, so accept both forms.
-- (The original used io.popen and only checked that a handle was
-- returned, which says nothing about whether the command succeeded.)
local function run_command(cmd)
    local ok = os.execute(cmd)
    return ok == true or ok == 0
end

-- Size of a file in bytes (0 when it cannot be opened).
local function get_file_size(filepath)
    local file = io.open(filepath, "r")
    if not file then
        return 0
    end
    local size = file:seek("end")
    file:close()
    return size or 0
end

-- Build "<dir>/<name>_<YYYYMMDD_HHMMSS>.<ext>" for the rotated copy.
local function generate_rotated_filename(original_path, timestamp)
    local dir, filename = string.match(original_path, "(.+)/([^/]+)$")
    local name, ext = string.match(filename, "(.+)%.(.+)$")
    if not name or not ext then
        name = filename
        ext = "log"
    end
    local date_str = os_date("%Y%m%d_%H%M%S", timestamp)
    local rotated_name = string.format("%s_%s.%s", name, date_str, ext)
    return dir .. "/" .. rotated_name
end

-- gzip the file when compression is enabled; return the resulting path.
-- Falls back to the uncompressed path when gzip fails (the original
-- returned the ".gz" path without checking the command's exit status).
local function compress_file(filepath)
    if not rotation_config.compress then
        return filepath
    end
    local compressed_path = filepath .. ".gz"
    if run_command(string.format("gzip '%s'", filepath)) then
        return compressed_path
    end
    return filepath
end

-- Delete rotated files beyond max_files, keeping the newest ones.
local function cleanup_old_files(log_dir, base_name, max_files)
    local cmd = string.format("ls -t '%s'/%s_* 2>/dev/null | tail -n +%d | xargs rm -f",
        log_dir, base_name, max_files + 1)
    run_command(cmd)
end

-- Decide whether log_path should be rotated.
-- Returns (true, "size"|"time") or false.
function log_rotation.should_rotate(log_path)
    local file_size = get_file_size(log_path)
    if file_size >= rotation_config.max_size then
        return true, "size"
    end
    -- Age check based on the file's mtime.
    -- NOTE(review): "stat -c %Y" is the GNU coreutils form — verify on
    -- non-Linux targets (BSD stat uses different flags).
    local current_time = os_time()
    local file_stat = io.popen(string.format("stat -c %%Y '%s' 2>/dev/null", log_path))
    if file_stat then
        local mtime = tonumber(file_stat:read("*a"))
        file_stat:close()
        if mtime then
            local time_diff = current_time - mtime
            if rotation_config.rotation_time == "daily" and time_diff >= 86400 then
                return true, "time"
            elseif rotation_config.rotation_time == "hourly" and time_diff >= 3600 then
                return true, "time"
            elseif rotation_config.rotation_time == "weekly" and time_diff >= 604800 then
                return true, "time"
            end
        end
    end
    return false
end

-- Rotate log_path when needed: move it aside, ask Nginx to reopen its
-- logs, then compress and prune old files asynchronously.
-- Returns true when a rotation was performed.
function log_rotation.rotate_log(log_path)
    local should_rotate, reason = log_rotation.should_rotate(log_path)
    if not should_rotate then
        return false
    end
    ngx.log(ngx.INFO, "Rotating log file ", log_path, " (reason: ", reason, ")")
    local timestamp = os_time()
    local rotated_path = generate_rotated_filename(log_path, timestamp)
    -- Verify the move actually succeeded before signalling Nginx
    -- (the original proceeded even when "mv" failed).
    if not run_command(string.format("mv '%s' '%s'", log_path, rotated_path)) then
        ngx.log(ngx.ERR, "Failed to rotate log file: ", log_path)
        return false
    end
    -- Reopen log files (equivalent to sending SIGUSR1 to the master).
    if not run_command("nginx -s reopen") then
        ngx.log(ngx.ERR, "Failed to signal nginx to reopen log files")
    end
    -- Compress and clean up off the current code path.
    ngx.timer.at(0, function(premature)
        if premature then
            return
        end
        local compressed_path = compress_file(rotated_path)
        local dir, filename = string.match(log_path, "(.+)/([^/]+)$")
        local base_name = string.match(filename, "(.+)%.")
        if dir and base_name then
            cleanup_old_files(dir, base_name, rotation_config.max_files)
        end
        ngx.log(ngx.INFO, "Log rotation completed: ", compressed_path)
    end)
    return true
end

-- Check all given log paths hourly and rotate those that need it.
function log_rotation.start_rotation_timer(log_paths)
    local function check_rotation(premature)
        if premature then
            return
        end
        for _, log_path in ipairs(log_paths) do
            log_rotation.rotate_log(log_path)
        end
        local ok, err = ngx.timer.at(3600, check_rotation)
        if not ok then
            ngx.log(ngx.ERR, "Failed to create rotation timer: ", err)
        end
    end
    local ok, err = ngx.timer.at(3600, check_rotation)
    if not ok then
        ngx.log(ngx.ERR, "Failed to create initial rotation timer: ", err)
    end
end

return log_rotation
4. 性能监控
4.1 请求性能监控
-- Per-worker request/system performance monitoring with threshold alerts.
-- NOTE(review): the counters in `metrics` are per worker process, not
-- cluster-wide; aggregate externally if global numbers are needed.
local performance_monitor = {}
local cjson = require "cjson"
local logger = require "logger"

-- Rolling counters, zeroed by reset_metrics() every report interval.
local metrics = {
    request_count = 0,
    total_response_time = 0, -- milliseconds, summed across requests
    slow_requests = 0,
    error_count = 0
}

-- Alert thresholds.
local perf_config = {
    slow_request_threshold = 1000, -- ms
    error_rate_threshold = 0.05,   -- 5%
    memory_threshold = 80,         -- %
    cpu_threshold = 80             -- %
}

-- Call at request start (access/rewrite phase): stamp the start time and
-- make sure the request has an id.
function performance_monitor.start_request()
    ngx.ctx.request_start_time = ngx.now()
    -- NOTE(review): math.random is not explicitly seeded here, so the
    -- fallback id is best-effort only; prefer the $request_id variable.
    ngx.ctx.request_id = ngx.var.request_id or string.format("%d-%d", ngx.time(), math.random(10000, 99999))
end

-- Call at request end (log phase): update counters and emit logs.
function performance_monitor.end_request()
    local start_time = ngx.ctx.request_start_time
    if not start_time then
        -- start_request was never called for this request.
        return
    end
    local end_time = ngx.now()
    local response_time = (end_time - start_time) * 1000 -- ms
    local status = ngx.status
    metrics.request_count = metrics.request_count + 1
    metrics.total_response_time = metrics.total_response_time + response_time
    if status >= 400 then
        metrics.error_count = metrics.error_count + 1
    end
    if response_time > perf_config.slow_request_threshold then
        metrics.slow_requests = metrics.slow_requests + 1
        -- Dedicated event for slow requests.
        logger.performance("slow_request", response_time, {
            uri = ngx.var.uri,
            method = ngx.var.request_method,
            status = status,
            request_id = ngx.ctx.request_id
        })
    end
    -- Per-request performance record.
    logger.structured("INFO", "request_performance", {
        request_id = ngx.ctx.request_id,
        uri = ngx.var.uri,
        method = ngx.var.request_method,
        status = status,
        response_time_ms = response_time,
        bytes_sent = ngx.var.bytes_sent,
        upstream_response_time = ngx.var.upstream_response_time
    })
end

-- Sample host-level metrics by shelling out to free/top/df.
-- NOTE(review): io.popen blocks the worker and the parsed output formats
-- are distribution-dependent (GNU procps/coreutils assumed) — verify on
-- the target OS.  The original stored results in a local also named
-- `metrics`, shadowing the module counter table; renamed to `sys`.
function performance_monitor.get_system_metrics()
    local sys = {}
    -- Memory: "Mem: <total> <used> ..." line from `free`.
    local memory_info = io.popen("free | grep Mem")
    if memory_info then
        local mem_line = memory_info:read("*l")
        memory_info:close()
        if mem_line then
            local total, used = string.match(mem_line, "Mem:%s+(%d+)%s+(%d+)")
            if total and used then
                sys.memory_usage = math.floor((tonumber(used) / tonumber(total)) * 100)
            end
        end
    end
    -- CPU: user-time percentage parsed from `top -bn1`.
    local cpu_info = io.popen("top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1")
    if cpu_info then
        local cpu_usage = cpu_info:read("*l")
        cpu_info:close()
        if cpu_usage then
            sys.cpu_usage = tonumber(cpu_usage) or 0
        end
    end
    -- Disk: root filesystem usage percentage from `df`.
    local disk_info = io.popen("df / | tail -1 | awk '{print $5}' | cut -d'%' -f1")
    if disk_info then
        local disk_usage = disk_info:read("*l")
        disk_info:close()
        if disk_usage then
            sys.disk_usage = tonumber(disk_usage) or 0
        end
    end
    return sys
end

-- Compare current counters and system metrics against thresholds,
-- emitting structured WARN events for each breach.
function performance_monitor.check_alerts()
    local error_rate = metrics.request_count > 0 and (metrics.error_count / metrics.request_count) or 0
    local avg_response_time = metrics.request_count > 0 and (metrics.total_response_time / metrics.request_count) or 0
    if error_rate > perf_config.error_rate_threshold then
        logger.structured("WARN", "performance_alert", {
            alert_type = "high_error_rate",
            error_rate = error_rate,
            threshold = perf_config.error_rate_threshold,
            request_count = metrics.request_count,
            error_count = metrics.error_count
        })
    end
    if avg_response_time > perf_config.slow_request_threshold then
        logger.structured("WARN", "performance_alert", {
            alert_type = "high_response_time",
            avg_response_time = avg_response_time,
            threshold = perf_config.slow_request_threshold,
            slow_requests = metrics.slow_requests
        })
    end
    local system_metrics = performance_monitor.get_system_metrics()
    if system_metrics.memory_usage and system_metrics.memory_usage > perf_config.memory_threshold then
        logger.structured("WARN", "system_alert", {
            alert_type = "high_memory_usage",
            memory_usage = system_metrics.memory_usage,
            threshold = perf_config.memory_threshold
        })
    end
    if system_metrics.cpu_usage and system_metrics.cpu_usage > perf_config.cpu_threshold then
        logger.structured("WARN", "system_alert", {
            alert_type = "high_cpu_usage",
            cpu_usage = system_metrics.cpu_usage,
            threshold = perf_config.cpu_threshold
        })
    end
end

-- Zero the rolling counters (called after each report interval).
function performance_monitor.reset_metrics()
    metrics.request_count = 0
    metrics.total_response_time = 0
    metrics.slow_requests = 0
    metrics.error_count = 0
end

-- Snapshot of counters plus derived rates and system metrics.
function performance_monitor.get_performance_report()
    local error_rate = metrics.request_count > 0 and (metrics.error_count / metrics.request_count) or 0
    local avg_response_time = metrics.request_count > 0 and (metrics.total_response_time / metrics.request_count) or 0
    return {
        timestamp = ngx.utctime(),
        request_count = metrics.request_count,
        error_count = metrics.error_count,
        error_rate = error_rate,
        slow_requests = metrics.slow_requests,
        avg_response_time_ms = avg_response_time,
        system_metrics = performance_monitor.get_system_metrics()
    }
end

-- Start the 5-minute check -> report -> reset cycle (self-rescheduling).
function performance_monitor.start_monitoring()
    local function monitor_performance(premature)
        if premature then
            return
        end
        performance_monitor.check_alerts()
        local report = performance_monitor.get_performance_report()
        logger.structured("INFO", "performance_report", report)
        performance_monitor.reset_metrics()
        local ok, err = ngx.timer.at(300, monitor_performance)
        if not ok then
            ngx.log(ngx.ERR, "Failed to create performance monitor timer: ", err)
        end
    end
    local ok, err = ngx.timer.at(300, monitor_performance)
    if not ok then
        ngx.log(ngx.ERR, "Failed to create initial performance monitor timer: ", err)
    end
end

return performance_monitor
4.2 业务指标监控
-- Business event tracking: events are pushed to Redis (a per-day raw
-- event list plus per-hour counters) and mirrored to the business log.
local business_monitor = {}
local cjson = require "cjson"
local redis = require "resty.redis"
local logger = require "logger"

-- Open a Redis connection (1s timeout); returns nil on failure.
local function get_redis()
    local red = redis:new()
    red:set_timeout(1000)
    local ok, err = red:connect("127.0.0.1", 6379)
    if not ok then
        ngx.log(ngx.ERR, "Failed to connect to Redis: ", err)
        return nil
    end
    return red
end

-- Record one business event together with request/user context.
-- Best-effort: failures are logged but never raised to the caller.
function business_monitor.track_event(event_name, properties)
    local red = get_redis()
    if not red then
        return
    end
    local event_data = {
        timestamp = ngx.time(),
        event = event_name,
        properties = properties or {},
        user_id = ngx.ctx.user and ngx.ctx.user.user_id,
        session_id = ngx.var.cookie_session_id,
        ip = ngx.var.remote_addr,
        user_agent = ngx.var.http_user_agent
    }
    -- Raw event stream: one list per day, kept for 7 days.
    local key = "business_events:" .. os.date("%Y%m%d")
    local ok, err = red:lpush(key, cjson.encode(event_data))
    if not ok then
        -- The original silently discarded write failures; log them so
        -- missing events are diagnosable.
        ngx.log(ngx.ERR, "Failed to push business event: ", err)
    end
    red:expire(key, 86400 * 7)
    -- Hourly counter per event type, kept for 30 days.
    local counter_key = "event_counter:" .. event_name .. ":" .. os.date("%Y%m%d%H")
    local incr_ok, incr_err = red:incr(counter_key)
    if not incr_ok then
        ngx.log(ngx.ERR, "Failed to increment event counter: ", incr_err)
    end
    red:expire(counter_key, 86400 * 30)
    red:set_keepalive(10000, 100)
    -- Mirror to the structured business log.
    logger.business(event_name, event_data)
end

-- Record a user action.
function business_monitor.track_user_action(action, details)
    business_monitor.track_event("user_action", {
        action = action,
        details = details
    })
end

-- Record an API call with outcome and latency.
function business_monitor.track_api_call(endpoint, method, status, response_time)
    business_monitor.track_event("api_call", {
        endpoint = endpoint,
        method = method,
        status = status,
        response_time_ms = response_time
    })
end

-- Record a funnel/conversion step.
function business_monitor.track_conversion(funnel_step, conversion_data)
    business_monitor.track_event("conversion", {
        funnel_step = funnel_step,
        conversion_data = conversion_data
    })
end

-- Read counters back: per-hour counts for one event type, or per-event
-- daily totals when event_type is nil.  Returns nil on connect failure.
-- NOTE(review): KEYS scans the entire keyspace and blocks Redis; prefer
-- SCAN for production datasets of any size.
function business_monitor.get_metrics(date, event_type)
    local red = get_redis()
    if not red then
        return nil
    end
    local metrics = {}
    if event_type then
        -- Hour -> count for a single event type.
        local pattern = "event_counter:" .. event_type .. ":" .. date .. "*"
        local keys = red:keys(pattern)
        if keys and #keys > 0 then
            local values = red:mget(unpack(keys))
            for i, key in ipairs(keys) do
                local hour = string.match(key, ":(%d%d)$")
                if hour and values[i] then
                    metrics[hour] = tonumber(values[i]) or 0
                end
            end
        end
    else
        -- Event name -> total count over the day.
        local pattern = "event_counter:*:" .. date .. "*"
        local keys = red:keys(pattern)
        if keys and #keys > 0 then
            for _, key in ipairs(keys) do
                local event_name = string.match(key, "event_counter:([^:]+):")
                if event_name then
                    if not metrics[event_name] then
                        metrics[event_name] = 0
                    end
                    local count = red:get(key)
                    if count then
                        metrics[event_name] = metrics[event_name] + (tonumber(count) or 0)
                    end
                end
            end
        end
    end
    red:set_keepalive(10000, 100)
    return metrics
end

return business_monitor
5. 日志分析与可视化
5.1 ELK集成配置
# filebeat.yml
filebeat.inputs:
- type: log
enabled: true
paths:
- /var/log/nginx/access_json.log
json.keys_under_root: true
json.add_error_key: true
fields:
log_type: nginx_access
fields_under_root: true
- type: log
enabled: true
paths:
- /var/log/nginx/error.log
multiline.pattern: '^\d{4}/\d{2}/\d{2}'
multiline.negate: true
multiline.match: after
fields:
log_type: nginx_error
fields_under_root: true
- type: log
enabled: true
paths:
- /var/log/openresty/app.log
json.keys_under_root: true
json.add_error_key: true
fields:
log_type: openresty_app
fields_under_root: true
output.elasticsearch:
hosts: ["elasticsearch:9200"]
index: "openresty-logs-%{+yyyy.MM.dd}"
template.name: "openresty"
template.pattern: "openresty-*"
template.settings:
index.number_of_shards: 1
index.number_of_replicas: 1
processors:
- add_host_metadata:
when.not.contains.tags: forwarded
- add_docker_metadata: ~
- add_kubernetes_metadata: ~
5.2 Grafana仪表板配置
{
"dashboard": {
"title": "OpenResty监控仪表板",
"panels": [
{
"title": "请求量趋势",
"type": "graph",
"targets": [
{
"expr": "rate(nginx_http_requests_total[5m])",
"legendFormat": "{{method}} {{status}}"
}
]
},
{
"title": "响应时间分布",
"type": "heatmap",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(nginx_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
}
]
},
{
"title": "错误率",
"type": "stat",
"targets": [
{
"expr": "rate(nginx_http_requests_total{status=~\"4..|5..\"}[5m]) / rate(nginx_http_requests_total[5m])",
"legendFormat": "Error Rate"
}
]
},
{
"title": "系统资源使用",
"type": "graph",
"targets": [
{
"expr": "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100",
"legendFormat": "Memory Usage %"
},
{
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU Usage %"
}
]
}
]
}
}
6. 告警配置
6.1 Prometheus告警规则
# alert_rules.yml
groups:
- name: openresty_alerts
rules:
- alert: HighErrorRate
expr: rate(nginx_http_requests_total{status=~"4..|5.."}[5m]) / rate(nginx_http_requests_total[5m]) > 0.05
for: 2m
labels:
severity: warning
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(nginx_http_request_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High response time detected"
description: "95th percentile response time is {{ $value }}s"
- alert: HighMemoryUsage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.8
for: 5m
labels:
severity: critical
annotations:
summary: "High memory usage"
description: "Memory usage is {{ $value | humanizePercentage }}"
- alert: HighCPUUsage
expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage"
description: "CPU usage is {{ $value }}%"
6.2 告警通知配置
-- Alert notification: Slack webhook + SendGrid email delivery with a
-- shared-dict-based cooldown to suppress duplicate alerts.
-- Requires an "alerts_cache" lua_shared_dict for the cooldown to work.
local alerting = {}
local http = require "resty.http"
local cjson = require "cjson"

-- Delivery configuration.
-- SECURITY NOTE(review): never commit real webhook URLs or API keys;
-- load them from the environment or a secrets store instead.
local alert_config = {
    webhook_url = "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
    email_api_url = "https://api.sendgrid.com/v3/mail/send",
    email_api_key = "YOUR_SENDGRID_API_KEY",
    alert_cooldown = 300 -- seconds between repeats of the same alert
}

-- Human-readable failure description for a resty.http result.
-- (The original logged `err or res.status`, which indexes a nil `res`
-- when both `res` and `err` are nil.)
local function describe_failure(res, err)
    if err then
        return err
    end
    if res then
        return res.status
    end
    return "unknown error"
end

-- Post alert_data to the Slack webhook; returns true on HTTP 200.
function alerting.send_slack_alert(alert_data)
    local httpc = http.new()
    httpc:set_timeout(10000)
    local message = {
        text = string.format("🚨 OpenResty Alert: %s", alert_data.title),
        attachments = {
            {
                color = alert_data.severity == "critical" and "danger" or "warning",
                fields = {
                    {
                        title = "Severity",
                        value = alert_data.severity,
                        short = true
                    },
                    {
                        title = "Instance",
                        value = alert_data.instance or "unknown",
                        short = true
                    },
                    {
                        title = "Description",
                        value = alert_data.description,
                        short = false
                    },
                    {
                        title = "Timestamp",
                        value = os.date("%Y-%m-%d %H:%M:%S", alert_data.timestamp),
                        short = true
                    }
                }
            }
        }
    }
    local res, err = httpc:request_uri(alert_config.webhook_url, {
        method = "POST",
        headers = {
            ["Content-Type"] = "application/json"
        },
        body = cjson.encode(message)
    })
    if not res or res.status ~= 200 then
        ngx.log(ngx.ERR, "Failed to send Slack alert: ", describe_failure(res, err))
        return false
    end
    return true
end

-- Send alert_data by email via the SendGrid v3 API.
-- recipients: array of { email = ..., name = ... } tables.
-- Returns true on HTTP 202 (SendGrid's accepted status).
function alerting.send_email_alert(alert_data, recipients)
    local httpc = http.new()
    httpc:set_timeout(10000)
    local email_data = {
        personalizations = {
            {
                to = recipients,
                subject = string.format("[OpenResty Alert] %s", alert_data.title)
            }
        },
        from = {
            email = "alerts@yourcompany.com",
            name = "OpenResty Monitoring"
        },
        content = {
            {
                type = "text/html",
                value = string.format([[
<h2>OpenResty Alert</h2>
<p><strong>Severity:</strong> %s</p>
<p><strong>Instance:</strong> %s</p>
<p><strong>Description:</strong> %s</p>
<p><strong>Timestamp:</strong> %s</p>
]], alert_data.severity, alert_data.instance or "unknown",
                    alert_data.description, os.date("%Y-%m-%d %H:%M:%S", alert_data.timestamp))
            }
        }
    }
    local res, err = httpc:request_uri(alert_config.email_api_url, {
        method = "POST",
        headers = {
            ["Authorization"] = "Bearer " .. alert_config.email_api_key,
            ["Content-Type"] = "application/json"
        },
        body = cjson.encode(email_data)
    })
    if not res or res.status ~= 202 then
        ngx.log(ngx.ERR, "Failed to send email alert: ", describe_failure(res, err))
        return false
    end
    return true
end

-- True while alert_key is still inside its cooldown window.
local function is_in_cooldown(alert_key)
    local cache = ngx.shared.alerts_cache
    if not cache then
        -- No shared dict configured: never suppress.
        return false
    end
    local last_alert_time = cache:get(alert_key)
    if last_alert_time then
        local current_time = ngx.time()
        return (current_time - last_alert_time) < alert_config.alert_cooldown
    end
    return false
end

-- Start a cooldown window for alert_key (entry expires with the window).
local function set_cooldown(alert_key)
    local cache = ngx.shared.alerts_cache
    if cache then
        cache:set(alert_key, ngx.time(), alert_config.alert_cooldown)
    end
end

-- Main entry point: deliver an alert unless it is in cooldown.
-- Email is sent only for critical severity; Slack for everything.
function alerting.trigger_alert(alert_data)
    local alert_key = string.format("%s:%s", alert_data.type, alert_data.instance or "global")
    if is_in_cooldown(alert_key) then
        ngx.log(ngx.INFO, "Alert in cooldown period: ", alert_key)
        return
    end
    alerting.send_slack_alert(alert_data)
    if alert_data.severity == "critical" then
        local recipients = {
            {email = "admin@yourcompany.com", name = "Admin"}
        }
        alerting.send_email_alert(alert_data, recipients)
    end
    set_cooldown(alert_key)
    ngx.log(ngx.WARN, "Alert triggered: ", cjson.encode(alert_data))
end

return alerting
总结
日志处理与监控是OpenResty应用运维的重要组成部分。通过合理的日志配置、性能监控和告警机制,可以:
- 及时发现问题:通过实时监控快速识别异常
- 优化性能:基于性能数据进行系统调优
- 保障安全:通过安全日志监控威胁
- 支持决策:基于业务数据做出决策
- 满足合规:记录审计所需的日志信息
关键最佳实践: - 结构化日志格式 - 异步日志处理 - 合理的日志轮转策略 - 多维度性能监控 - 智能告警机制