1. 日志系统概述
1.1 日志的重要性
在OpenResty应用中,日志系统是运维和调试的重要工具: - 问题诊断:快速定位和解决问题 - 性能监控:分析系统性能瓶颈 - 安全审计:记录安全相关事件 - 业务分析:了解用户行为和业务趋势 - 合规要求:满足法规和审计要求
1.2 OpenResty日志架构
┌─────────────────┐
│ 应用请求 │
└─────────┬───────┘
│
┌─────────▼───────┐
│ Nginx日志 │ ← access.log, error.log
├─────────────────┤
│ Lua应用日志 │ ← 自定义业务日志
├─────────────────┤
│ 结构化日志 │ ← JSON格式日志
├─────────────────┤
│ 日志收集器 │ ← Filebeat, Fluentd
├─────────────────┤
│ 日志存储 │ ← Elasticsearch, ClickHouse
├─────────────────┤
│ 日志分析 │ ← Kibana, Grafana
└─────────────────┘
1.3 日志级别
OpenResty支持多种日志级别: - emerg:紧急情况,系统不可用 - alert:需要立即采取行动 - crit:严重错误 - error:错误信息 - warn:警告信息 - notice:正常但重要的信息 - info:一般信息 - debug:调试信息
2. Nginx日志配置
2.1 访问日志配置
# nginx.conf
http {
# 定义日志格式
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
# JSON格式日志
log_format json_combined escape=json
'{
"timestamp": "$time_iso8601",
"remote_addr": "$remote_addr",
"remote_user": "$remote_user",
"request": "$request",
"status": $status,
"body_bytes_sent": $body_bytes_sent,
"http_referer": "$http_referer",
"http_user_agent": "$http_user_agent",
"http_x_forwarded_for": "$http_x_forwarded_for",
"request_time": $request_time,
"upstream_response_time": "$upstream_response_time",
"upstream_addr": "$upstream_addr",
"request_id": "$request_id"
}';
# 性能监控日志格式
log_format performance '$time_iso8601\t$remote_addr\t$request_method\t'
'$uri\t$status\t$request_time\t$upstream_response_time\t'
'$body_bytes_sent\t$http_user_agent';
# 安全日志格式
log_format security '$time_iso8601\t$remote_addr\t$request\t$status\t'
'$http_user_agent\t$http_x_forwarded_for\t'
'$request_length\t$request_time';
server {
listen 80;
server_name example.com;
# 访问日志
access_log /var/log/nginx/access.log main;
access_log /var/log/nginx/access_json.log json_combined;
# 错误日志
error_log /var/log/nginx/error.log warn;
location /api/ {
# API专用日志
access_log /var/log/nginx/api_access.log json_combined;
proxy_pass http://backend;
}
location /admin/ {
# 管理员访问日志
access_log /var/log/nginx/admin_access.log security;
proxy_pass http://admin_backend;
}
}
}
2.2 条件日志记录
# 根据条件记录日志
server {
# 定义变量
set $log_flag 1;
# 排除健康检查
if ($request_uri = "/health") {
set $log_flag 0;
}
# 排除静态资源
if ($request_uri ~* "\.(css|js|png|jpg|gif|ico)$") {
set $log_flag 0;
}
# 条件日志记录
access_log /var/log/nginx/access.log main if=$log_flag;
# 错误日志条件记录
location / {
access_log /var/log/nginx/app.log json_combined if=$log_flag;
# 记录慢请求:注意 nginx 的 if 指令不支持数值比较("$request_time > 1" 不是合法条件),
# 且 if 块在 rewrite 阶段求值,此时 $request_time 尚未是最终值。
# 正确做法是在 http 块中用 map 定义标志变量,再配合 access_log 的 if= 参数(log 阶段求值):
#   map $request_time $slow_flag { "~^[1-9]" 1; default 0; }
access_log /var/log/nginx/slow.log performance if=$slow_flag;
proxy_pass http://backend;
}
}
3. Lua日志处理
3.1 基础日志模块
-- Logging module: level-filtered, JSON- or text-formatted application
-- logging built on ngx.log.  Per-request fields (request id, client
-- address, URI, method, authenticated user) are attached automatically.
local logger = {}
local cjson = require "cjson"
-- NOTE(review): the original also required "resty.lock" here but never
-- used it; the unused dependency has been dropped.

-- Symbolic level name -> OpenResty numeric constant.
-- In OpenResty a SMALLER constant is MORE severe (ngx.EMERG < ngx.DEBUG).
local log_levels = {
    DEBUG = ngx.DEBUG,
    INFO = ngx.INFO,
    NOTICE = ngx.NOTICE,
    WARN = ngx.WARN,
    ERR = ngx.ERR,
    CRIT = ngx.CRIT,
    ALERT = ngx.ALERT,
    EMERG = ngx.EMERG
}

-- Module-wide configuration.
local log_config = {
    level = "INFO",            -- minimum severity that is emitted
    format = "json",           -- "json" or "text"
    include_trace = true,      -- attach caller source/line to each entry
    max_line_length = 4096,    -- longer formatted lines are truncated
    buffer_size = 1024 * 1024, -- 1MB (reserved for buffered back ends)
    flush_interval = 5         -- seconds (reserved for buffered back ends)
}

-- Return { source, line } describing the user call site, or nil when
-- tracing is disabled or the stack is shallower than expected.
-- Stack levels seen from here: 1 = get_trace_info, 2 = format_message,
-- 3 = log, 4 = public wrapper (logger.info etc.), 5 = actual caller.
-- The original used level 4, which always reported the wrapper inside
-- this module instead of the real call site.
local function get_trace_info()
    if not log_config.include_trace then
        return nil
    end
    local info = debug.getinfo(5, "Sl")
    if info then
        return {
            source = info.source,
            line = info.currentline
        }
    end
    return nil
end

-- Build the final log line (JSON or plain text) for one entry.
local function format_message(level, message, context)
    local log_entry = {
        timestamp = ngx.utctime(),
        level = level,
        message = message,
        request_id = ngx.var.request_id,
        remote_addr = ngx.var.remote_addr,
        uri = ngx.var.uri,
        method = ngx.var.request_method
    }
    if context then
        log_entry.context = context
    end
    -- Attach the authenticated user when an earlier phase stored it.
    if ngx.ctx.user then
        log_entry.user = {
            user_id = ngx.ctx.user.user_id,
            username = ngx.ctx.user.username
        }
    end
    local trace = get_trace_info()
    if trace then
        log_entry.trace = trace
    end
    if log_config.format == "json" then
        return cjson.encode(log_entry)
    else
        return string.format("[%s] %s %s - %s",
            log_entry.timestamp, level, log_entry.request_id or "-", message)
    end
end

-- A message is emitted when it is at least as severe as the configured
-- threshold.  Severity grows as the numeric constant SHRINKS, so the
-- comparison must be "<=".  (The original used ">=", which suppressed
-- ERR/WARN messages while letting DEBUG through when level = "INFO".)
local function should_log(level)
    local current_level = log_levels[log_config.level] or ngx.INFO
    local target_level = log_levels[level] or ngx.INFO
    return target_level <= current_level
end

-- Core routine shared by the public per-level helpers below.
local function log(level, message, context)
    if not should_log(level) then
        return
    end
    local formatted_message = format_message(level, message, context)
    -- Keep lines bounded so one oversized entry cannot flood the log.
    if #formatted_message > log_config.max_line_length then
        formatted_message = string.sub(formatted_message, 1, log_config.max_line_length - 3) .. "..."
    end
    ngx.log(log_levels[level], formatted_message)
end

-- Public per-level helpers.
function logger.debug(message, context)
    log("DEBUG", message, context)
end

function logger.info(message, context)
    log("INFO", message, context)
end

function logger.warn(message, context)
    log("WARN", message, context)
end

function logger.error(message, context)
    log("ERR", message, context)
end

function logger.critical(message, context)
    log("CRIT", message, context)
end

-- Emit a structured (always-JSON) event, bypassing format_message.
function logger.structured(level, event_type, data)
    if not should_log(level) then
        return
    end
    local log_entry = {
        timestamp = ngx.utctime(),
        level = level,
        event_type = event_type,
        request_id = ngx.var.request_id,
        data = data or {}
    }
    ngx.log(log_levels[level], cjson.encode(log_entry))
end

-- Record a timed operation (duration in milliseconds).
function logger.performance(operation, duration, details)
    logger.structured("INFO", "performance", {
        operation = operation,
        duration_ms = duration,
        details = details
    })
end

-- Record a business-domain event.
function logger.business(event, data)
    logger.structured("INFO", "business", {
        event = event,
        data = data
    })
end

-- Record a security-relevant event together with client identity.
function logger.security(event, details)
    logger.structured("WARN", "security", {
        event = event,
        ip = ngx.var.remote_addr,
        user_agent = ngx.var.http_user_agent,
        details = details
    })
end

-- Record a caught error with a full stack trace.
function logger.exception(err, context)
    logger.structured("ERR", "exception", {
        error = tostring(err),
        context = context,
        stack_trace = debug.traceback()
    })
end

return logger
3.2 异步日志处理
-- Asynchronous logging: entries are buffered in memory and flushed to a
-- file by a recurring timer, keeping file I/O off the request path.
-- NOTE(review): the buffer is per worker process; each worker appends
-- its own entries to the shared file.
local async_logger = {}
local cjson = require "cjson"
local resty_lock = require "resty.lock"

-- In-memory buffer (per worker).
local log_buffer = {}
local buffer_size = 0
local max_buffer_size = 1024 * 1024 -- 1MB
local flush_interval = 5            -- seconds
local last_flush_time = ngx.time()

-- Start a self-rescheduling timer that flushes the buffer periodically.
local function init_flush_timer()
    local function flush_logs(premature)
        if premature then
            -- Worker is shutting down; do not reschedule.
            return
        end
        async_logger.flush_buffer()
        local ok, err = ngx.timer.at(flush_interval, flush_logs)
        if not ok then
            ngx.log(ngx.ERR, "Failed to create flush timer: ", err)
        end
    end
    local ok, err = ngx.timer.at(flush_interval, flush_logs)
    if not ok then
        ngx.log(ngx.ERR, "Failed to create initial flush timer: ", err)
    end
end

-- Append one entry (encoded as a JSON line) to the buffer; flush
-- immediately once the size limit is reached.
function async_logger.add_to_buffer(log_entry)
    local log_line = cjson.encode(log_entry) .. "\n"
    log_buffer[#log_buffer + 1] = log_line
    buffer_size = buffer_size + #log_line
    if buffer_size >= max_buffer_size then
        async_logger.flush_buffer()
    end
end

-- Swap out the current buffer under a lock and write it from a 0-delay
-- timer so the caller is never blocked on file I/O.
function async_logger.flush_buffer()
    if #log_buffer == 0 then
        return
    end
    -- The lock prevents concurrent flushes (requires a "log_locks"
    -- lua_shared_dict to be configured in nginx.conf).
    local lock = resty_lock:new("log_locks")
    local elapsed, err = lock:lock("flush_buffer")
    if not elapsed then
        ngx.log(ngx.ERR, "Failed to acquire flush lock: ", err)
        return
    end
    -- Detach the buffer instead of copying it element by element
    -- (the original looped over the table to build a copy).
    local logs_to_flush = log_buffer
    log_buffer = {}
    buffer_size = 0
    last_flush_time = ngx.time()
    local ok, unlock_err = lock:unlock()
    if not ok then
        -- The original discarded unlock failures silently.
        ngx.log(ngx.ERR, "Failed to release flush lock: ", unlock_err)
    end
    ngx.timer.at(0, function(premature)
        if premature then
            return
        end
        async_logger.write_logs(logs_to_flush)
    end)
end

-- Append the collected lines to the log file.
function async_logger.write_logs(logs)
    local log_file = "/var/log/openresty/app.log"
    local file, err = io.open(log_file, "a")
    if not file then
        ngx.log(ngx.ERR, "Failed to open log file: ", err)
        return
    end
    -- One concatenated write instead of a write call per line.
    file:write(table.concat(logs))
    file:close()
end

-- Public entry point: queue a log entry for asynchronous writing.
function async_logger.log(level, message, context)
    local log_entry = {
        timestamp = ngx.utctime(),
        level = level,
        message = message,
        context = context,
        request_id = ngx.var.request_id,
        worker_pid = ngx.worker.pid()
    }
    async_logger.add_to_buffer(log_entry)
end

-- Call once per worker (e.g. from init_worker_by_lua) to start flushing.
function async_logger.init()
    init_flush_timer()
end

return async_logger
3.3 日志轮转处理
-- Log rotation: size- and age-based rotation of log files, with optional
-- gzip compression and retention of a bounded number of rotated files.
-- SECURITY NOTE(review): file paths are interpolated into shell commands.
-- They are single-quoted, but a path containing a single quote would
-- still break out; only pass trusted, operator-controlled paths.
local log_rotation = {}
local os_date = os.date
local os_time = os.time

-- Rotation policy.
local rotation_config = {
    max_size = 100 * 1024 * 1024, -- rotate once the file exceeds 100MB
    max_files = 10,               -- rotated files to keep per log
    rotation_time = "daily",      -- "daily", "hourly" or "weekly"
    compress = true               -- gzip rotated files
}

-- Run a shell command and report success.  os.execute returns a number
-- on Lua 5.1/LuaJIT and true/nil on Lua 5.2+, so accept both forms.
-- (The original used io.popen and only checked that a handle was
-- returned, which says nothing about whether the command succeeded.)
local function run_command(cmd)
    local ok = os.execute(cmd)
    return ok == true or ok == 0
end

-- Size of a file in bytes (0 when it cannot be opened).
local function get_file_size(filepath)
    local file = io.open(filepath, "r")
    if not file then
        return 0
    end
    local size = file:seek("end")
    file:close()
    return size or 0
end

-- Build "<dir>/<name>_<YYYYMMDD_HHMMSS>.<ext>" for the rotated copy.
local function generate_rotated_filename(original_path, timestamp)
    local dir, filename = string.match(original_path, "(.+)/([^/]+)$")
    local name, ext = string.match(filename, "(.+)%.(.+)$")
    if not name or not ext then
        name = filename
        ext = "log"
    end
    local date_str = os_date("%Y%m%d_%H%M%S", timestamp)
    local rotated_name = string.format("%s_%s.%s", name, date_str, ext)
    return dir .. "/" .. rotated_name
end

-- gzip the file when compression is enabled; return the resulting path.
-- Falls back to the uncompressed path when gzip fails (the original
-- returned the ".gz" path without checking the command's exit status).
local function compress_file(filepath)
    if not rotation_config.compress then
        return filepath
    end
    local compressed_path = filepath .. ".gz"
    if run_command(string.format("gzip '%s'", filepath)) then
        return compressed_path
    end
    return filepath
end

-- Delete rotated files beyond max_files, keeping the newest ones.
local function cleanup_old_files(log_dir, base_name, max_files)
    local cmd = string.format("ls -t '%s'/%s_* 2>/dev/null | tail -n +%d | xargs rm -f",
        log_dir, base_name, max_files + 1)
    run_command(cmd)
end

-- Decide whether log_path should be rotated.
-- Returns (true, "size"|"time") or false.
function log_rotation.should_rotate(log_path)
    local file_size = get_file_size(log_path)
    if file_size >= rotation_config.max_size then
        return true, "size"
    end
    -- Age check based on the file's mtime.
    -- NOTE(review): "stat -c %Y" is the GNU coreutils form — verify on
    -- non-Linux targets (BSD stat uses different flags).
    local current_time = os_time()
    local file_stat = io.popen(string.format("stat -c %%Y '%s' 2>/dev/null", log_path))
    if file_stat then
        local mtime = tonumber(file_stat:read("*a"))
        file_stat:close()
        if mtime then
            local time_diff = current_time - mtime
            if rotation_config.rotation_time == "daily" and time_diff >= 86400 then
                return true, "time"
            elseif rotation_config.rotation_time == "hourly" and time_diff >= 3600 then
                return true, "time"
            elseif rotation_config.rotation_time == "weekly" and time_diff >= 604800 then
                return true, "time"
            end
        end
    end
    return false
end

-- Rotate log_path when needed: move it aside, ask Nginx to reopen its
-- logs, then compress and prune old files asynchronously.
-- Returns true when a rotation was performed.
function log_rotation.rotate_log(log_path)
    local should_rotate, reason = log_rotation.should_rotate(log_path)
    if not should_rotate then
        return false
    end
    ngx.log(ngx.INFO, "Rotating log file ", log_path, " (reason: ", reason, ")")
    local timestamp = os_time()
    local rotated_path = generate_rotated_filename(log_path, timestamp)
    -- Verify the move actually succeeded before signalling Nginx
    -- (the original proceeded even when "mv" failed).
    if not run_command(string.format("mv '%s' '%s'", log_path, rotated_path)) then
        ngx.log(ngx.ERR, "Failed to rotate log file: ", log_path)
        return false
    end
    -- Reopen log files (equivalent to sending SIGUSR1 to the master).
    if not run_command("nginx -s reopen") then
        ngx.log(ngx.ERR, "Failed to signal nginx to reopen log files")
    end
    -- Compress and clean up off the current code path.
    ngx.timer.at(0, function(premature)
        if premature then
            return
        end
        local compressed_path = compress_file(rotated_path)
        local dir, filename = string.match(log_path, "(.+)/([^/]+)$")
        local base_name = string.match(filename, "(.+)%.")
        if dir and base_name then
            cleanup_old_files(dir, base_name, rotation_config.max_files)
        end
        ngx.log(ngx.INFO, "Log rotation completed: ", compressed_path)
    end)
    return true
end

-- Check all given log paths hourly and rotate those that need it.
function log_rotation.start_rotation_timer(log_paths)
    local function check_rotation(premature)
        if premature then
            return
        end
        for _, log_path in ipairs(log_paths) do
            log_rotation.rotate_log(log_path)
        end
        local ok, err = ngx.timer.at(3600, check_rotation)
        if not ok then
            ngx.log(ngx.ERR, "Failed to create rotation timer: ", err)
        end
    end
    local ok, err = ngx.timer.at(3600, check_rotation)
    if not ok then
        ngx.log(ngx.ERR, "Failed to create initial rotation timer: ", err)
    end
end

return log_rotation
4. 性能监控
4.1 请求性能监控
-- Per-worker request/system performance monitoring with threshold alerts.
-- NOTE(review): the counters in `metrics` are per worker process, not
-- cluster-wide; aggregate externally if global numbers are needed.
local performance_monitor = {}
local cjson = require "cjson"
local logger = require "logger"

-- Rolling counters, zeroed by reset_metrics() every report interval.
local metrics = {
    request_count = 0,
    total_response_time = 0, -- milliseconds, summed across requests
    slow_requests = 0,
    error_count = 0
}

-- Alert thresholds.
local perf_config = {
    slow_request_threshold = 1000, -- ms
    error_rate_threshold = 0.05,   -- 5%
    memory_threshold = 80,         -- %
    cpu_threshold = 80             -- %
}

-- Call at request start (access/rewrite phase): stamp the start time and
-- make sure the request has an id.
function performance_monitor.start_request()
    ngx.ctx.request_start_time = ngx.now()
    -- NOTE(review): math.random is not explicitly seeded here, so the
    -- fallback id is best-effort only; prefer the $request_id variable.
    ngx.ctx.request_id = ngx.var.request_id or string.format("%d-%d", ngx.time(), math.random(10000, 99999))
end

-- Call at request end (log phase): update counters and emit logs.
function performance_monitor.end_request()
    local start_time = ngx.ctx.request_start_time
    if not start_time then
        -- start_request was never called for this request.
        return
    end
    local end_time = ngx.now()
    local response_time = (end_time - start_time) * 1000 -- ms
    local status = ngx.status
    metrics.request_count = metrics.request_count + 1
    metrics.total_response_time = metrics.total_response_time + response_time
    if status >= 400 then
        metrics.error_count = metrics.error_count + 1
    end
    if response_time > perf_config.slow_request_threshold then
        metrics.slow_requests = metrics.slow_requests + 1
        -- Dedicated event for slow requests.
        logger.performance("slow_request", response_time, {
            uri = ngx.var.uri,
            method = ngx.var.request_method,
            status = status,
            request_id = ngx.ctx.request_id
        })
    end
    -- Per-request performance record.
    logger.structured("INFO", "request_performance", {
        request_id = ngx.ctx.request_id,
        uri = ngx.var.uri,
        method = ngx.var.request_method,
        status = status,
        response_time_ms = response_time,
        bytes_sent = ngx.var.bytes_sent,
        upstream_response_time = ngx.var.upstream_response_time
    })
end

-- Sample host-level metrics by shelling out to free/top/df.
-- NOTE(review): io.popen blocks the worker and the parsed output formats
-- are distribution-dependent (GNU procps/coreutils assumed) — verify on
-- the target OS.  The original stored results in a local also named
-- `metrics`, shadowing the module counter table; renamed to `sys`.
function performance_monitor.get_system_metrics()
    local sys = {}
    -- Memory: "Mem: <total> <used> ..." line from `free`.
    local memory_info = io.popen("free | grep Mem")
    if memory_info then
        local mem_line = memory_info:read("*l")
        memory_info:close()
        if mem_line then
            local total, used = string.match(mem_line, "Mem:%s+(%d+)%s+(%d+)")
            if total and used then
                sys.memory_usage = math.floor((tonumber(used) / tonumber(total)) * 100)
            end
        end
    end
    -- CPU: user-time percentage parsed from `top -bn1`.
    local cpu_info = io.popen("top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1")
    if cpu_info then
        local cpu_usage = cpu_info:read("*l")
        cpu_info:close()
        if cpu_usage then
            sys.cpu_usage = tonumber(cpu_usage) or 0
        end
    end
    -- Disk: root filesystem usage percentage from `df`.
    local disk_info = io.popen("df / | tail -1 | awk '{print $5}' | cut -d'%' -f1")
    if disk_info then
        local disk_usage = disk_info:read("*l")
        disk_info:close()
        if disk_usage then
            sys.disk_usage = tonumber(disk_usage) or 0
        end
    end
    return sys
end

-- Compare current counters and system metrics against thresholds,
-- emitting structured WARN events for each breach.
function performance_monitor.check_alerts()
    local error_rate = metrics.request_count > 0 and (metrics.error_count / metrics.request_count) or 0
    local avg_response_time = metrics.request_count > 0 and (metrics.total_response_time / metrics.request_count) or 0
    if error_rate > perf_config.error_rate_threshold then
        logger.structured("WARN", "performance_alert", {
            alert_type = "high_error_rate",
            error_rate = error_rate,
            threshold = perf_config.error_rate_threshold,
            request_count = metrics.request_count,
            error_count = metrics.error_count
        })
    end
    if avg_response_time > perf_config.slow_request_threshold then
        logger.structured("WARN", "performance_alert", {
            alert_type = "high_response_time",
            avg_response_time = avg_response_time,
            threshold = perf_config.slow_request_threshold,
            slow_requests = metrics.slow_requests
        })
    end
    local system_metrics = performance_monitor.get_system_metrics()
    if system_metrics.memory_usage and system_metrics.memory_usage > perf_config.memory_threshold then
        logger.structured("WARN", "system_alert", {
            alert_type = "high_memory_usage",
            memory_usage = system_metrics.memory_usage,
            threshold = perf_config.memory_threshold
        })
    end
    if system_metrics.cpu_usage and system_metrics.cpu_usage > perf_config.cpu_threshold then
        logger.structured("WARN", "system_alert", {
            alert_type = "high_cpu_usage",
            cpu_usage = system_metrics.cpu_usage,
            threshold = perf_config.cpu_threshold
        })
    end
end

-- Zero the rolling counters (called after each report interval).
function performance_monitor.reset_metrics()
    metrics.request_count = 0
    metrics.total_response_time = 0
    metrics.slow_requests = 0
    metrics.error_count = 0
end

-- Snapshot of counters plus derived rates and system metrics.
function performance_monitor.get_performance_report()
    local error_rate = metrics.request_count > 0 and (metrics.error_count / metrics.request_count) or 0
    local avg_response_time = metrics.request_count > 0 and (metrics.total_response_time / metrics.request_count) or 0
    return {
        timestamp = ngx.utctime(),
        request_count = metrics.request_count,
        error_count = metrics.error_count,
        error_rate = error_rate,
        slow_requests = metrics.slow_requests,
        avg_response_time_ms = avg_response_time,
        system_metrics = performance_monitor.get_system_metrics()
    }
end

-- Start the 5-minute check -> report -> reset cycle (self-rescheduling).
function performance_monitor.start_monitoring()
    local function monitor_performance(premature)
        if premature then
            return
        end
        performance_monitor.check_alerts()
        local report = performance_monitor.get_performance_report()
        logger.structured("INFO", "performance_report", report)
        performance_monitor.reset_metrics()
        local ok, err = ngx.timer.at(300, monitor_performance)
        if not ok then
            ngx.log(ngx.ERR, "Failed to create performance monitor timer: ", err)
        end
    end
    local ok, err = ngx.timer.at(300, monitor_performance)
    if not ok then
        ngx.log(ngx.ERR, "Failed to create initial performance monitor timer: ", err)
    end
end

return performance_monitor
4.2 业务指标监控
-- Business event tracking: events are pushed to Redis (a per-day raw
-- event list plus per-hour counters) and mirrored to the business log.
local business_monitor = {}
local cjson = require "cjson"
local redis = require "resty.redis"
local logger = require "logger"

-- Open a Redis connection (1s timeout); returns nil on failure.
local function get_redis()
    local red = redis:new()
    red:set_timeout(1000)
    local ok, err = red:connect("127.0.0.1", 6379)
    if not ok then
        ngx.log(ngx.ERR, "Failed to connect to Redis: ", err)
        return nil
    end
    return red
end

-- Record one business event together with request/user context.
-- Best-effort: failures are logged but never raised to the caller.
function business_monitor.track_event(event_name, properties)
    local red = get_redis()
    if not red then
        return
    end
    local event_data = {
        timestamp = ngx.time(),
        event = event_name,
        properties = properties or {},
        user_id = ngx.ctx.user and ngx.ctx.user.user_id,
        session_id = ngx.var.cookie_session_id,
        ip = ngx.var.remote_addr,
        user_agent = ngx.var.http_user_agent
    }
    -- Raw event stream: one list per day, kept for 7 days.
    local key = "business_events:" .. os.date("%Y%m%d")
    local ok, err = red:lpush(key, cjson.encode(event_data))
    if not ok then
        -- The original silently discarded write failures; log them so
        -- missing events are diagnosable.
        ngx.log(ngx.ERR, "Failed to push business event: ", err)
    end
    red:expire(key, 86400 * 7)
    -- Hourly counter per event type, kept for 30 days.
    local counter_key = "event_counter:" .. event_name .. ":" .. os.date("%Y%m%d%H")
    local incr_ok, incr_err = red:incr(counter_key)
    if not incr_ok then
        ngx.log(ngx.ERR, "Failed to increment event counter: ", incr_err)
    end
    red:expire(counter_key, 86400 * 30)
    red:set_keepalive(10000, 100)
    -- Mirror to the structured business log.
    logger.business(event_name, event_data)
end

-- Record a user action.
function business_monitor.track_user_action(action, details)
    business_monitor.track_event("user_action", {
        action = action,
        details = details
    })
end

-- Record an API call with outcome and latency.
function business_monitor.track_api_call(endpoint, method, status, response_time)
    business_monitor.track_event("api_call", {
        endpoint = endpoint,
        method = method,
        status = status,
        response_time_ms = response_time
    })
end

-- Record a funnel/conversion step.
function business_monitor.track_conversion(funnel_step, conversion_data)
    business_monitor.track_event("conversion", {
        funnel_step = funnel_step,
        conversion_data = conversion_data
    })
end

-- Read counters back: per-hour counts for one event type, or per-event
-- daily totals when event_type is nil.  Returns nil on connect failure.
-- NOTE(review): KEYS scans the entire keyspace and blocks Redis; prefer
-- SCAN for production datasets of any size.
function business_monitor.get_metrics(date, event_type)
    local red = get_redis()
    if not red then
        return nil
    end
    local metrics = {}
    if event_type then
        -- Hour -> count for a single event type.
        local pattern = "event_counter:" .. event_type .. ":" .. date .. "*"
        local keys = red:keys(pattern)
        if keys and #keys > 0 then
            local values = red:mget(unpack(keys))
            for i, key in ipairs(keys) do
                local hour = string.match(key, ":(%d%d)$")
                if hour and values[i] then
                    metrics[hour] = tonumber(values[i]) or 0
                end
            end
        end
    else
        -- Event name -> total count over the day.
        local pattern = "event_counter:*:" .. date .. "*"
        local keys = red:keys(pattern)
        if keys and #keys > 0 then
            for _, key in ipairs(keys) do
                local event_name = string.match(key, "event_counter:([^:]+):")
                if event_name then
                    if not metrics[event_name] then
                        metrics[event_name] = 0
                    end
                    local count = red:get(key)
                    if count then
                        metrics[event_name] = metrics[event_name] + (tonumber(count) or 0)
                    end
                end
            end
        end
    end
    red:set_keepalive(10000, 100)
    return metrics
end

return business_monitor
5. 日志分析与可视化
5.1 ELK集成配置
# filebeat.yml
filebeat.inputs:
- type: log
enabled: true
paths:
- /var/log/nginx/access_json.log
json.keys_under_root: true
json.add_error_key: true
fields:
log_type: nginx_access
fields_under_root: true
- type: log
enabled: true
paths:
- /var/log/nginx/error.log
multiline.pattern: '^\d{4}/\d{2}/\d{2}'
multiline.negate: true
multiline.match: after
fields:
log_type: nginx_error
fields_under_root: true
- type: log
enabled: true
paths:
- /var/log/openresty/app.log
json.keys_under_root: true
json.add_error_key: true
fields:
log_type: openresty_app
fields_under_root: true
output.elasticsearch:
hosts: ["elasticsearch:9200"]
index: "openresty-logs-%{+yyyy.MM.dd}"
template.name: "openresty"
template.pattern: "openresty-*"
template.settings:
index.number_of_shards: 1
index.number_of_replicas: 1
processors:
- add_host_metadata:
when.not.contains.tags: forwarded
- add_docker_metadata: ~
- add_kubernetes_metadata: ~
5.2 Grafana仪表板配置
{
"dashboard": {
"title": "OpenResty监控仪表板",
"panels": [
{
"title": "请求量趋势",
"type": "graph",
"targets": [
{
"expr": "rate(nginx_http_requests_total[5m])",
"legendFormat": "{{method}} {{status}}"
}
]
},
{
"title": "响应时间分布",
"type": "heatmap",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(nginx_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
}
]
},
{
"title": "错误率",
"type": "stat",
"targets": [
{
"expr": "rate(nginx_http_requests_total{status=~\"4..|5..\"}[5m]) / rate(nginx_http_requests_total[5m])",
"legendFormat": "Error Rate"
}
]
},
{
"title": "系统资源使用",
"type": "graph",
"targets": [
{
"expr": "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100",
"legendFormat": "Memory Usage %"
},
{
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU Usage %"
}
]
}
]
}
}
6. 告警配置
6.1 Prometheus告警规则
# alert_rules.yml
groups:
- name: openresty_alerts
rules:
- alert: HighErrorRate
expr: rate(nginx_http_requests_total{status=~"4..|5.."}[5m]) / rate(nginx_http_requests_total[5m]) > 0.05
for: 2m
labels:
severity: warning
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(nginx_http_request_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High response time detected"
description: "95th percentile response time is {{ $value }}s"
- alert: HighMemoryUsage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.8
for: 5m
labels:
severity: critical
annotations:
summary: "High memory usage"
description: "Memory usage is {{ $value | humanizePercentage }}"
- alert: HighCPUUsage
expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage"
description: "CPU usage is {{ $value }}%"
6.2 告警通知配置
-- Alert notification: Slack webhook + SendGrid email delivery with a
-- shared-dict-based cooldown to suppress duplicate alerts.
-- Requires an "alerts_cache" lua_shared_dict for the cooldown to work.
local alerting = {}
local http = require "resty.http"
local cjson = require "cjson"

-- Delivery configuration.
-- SECURITY NOTE(review): never commit real webhook URLs or API keys;
-- load them from the environment or a secrets store instead.
local alert_config = {
    webhook_url = "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
    email_api_url = "https://api.sendgrid.com/v3/mail/send",
    email_api_key = "YOUR_SENDGRID_API_KEY",
    alert_cooldown = 300 -- seconds between repeats of the same alert
}

-- Human-readable failure description for a resty.http result.
-- (The original logged `err or res.status`, which indexes a nil `res`
-- when both `res` and `err` are nil.)
local function describe_failure(res, err)
    if err then
        return err
    end
    if res then
        return res.status
    end
    return "unknown error"
end

-- Post alert_data to the Slack webhook; returns true on HTTP 200.
function alerting.send_slack_alert(alert_data)
    local httpc = http.new()
    httpc:set_timeout(10000)
    local message = {
        text = string.format("🚨 OpenResty Alert: %s", alert_data.title),
        attachments = {
            {
                color = alert_data.severity == "critical" and "danger" or "warning",
                fields = {
                    {
                        title = "Severity",
                        value = alert_data.severity,
                        short = true
                    },
                    {
                        title = "Instance",
                        value = alert_data.instance or "unknown",
                        short = true
                    },
                    {
                        title = "Description",
                        value = alert_data.description,
                        short = false
                    },
                    {
                        title = "Timestamp",
                        value = os.date("%Y-%m-%d %H:%M:%S", alert_data.timestamp),
                        short = true
                    }
                }
            }
        }
    }
    local res, err = httpc:request_uri(alert_config.webhook_url, {
        method = "POST",
        headers = {
            ["Content-Type"] = "application/json"
        },
        body = cjson.encode(message)
    })
    if not res or res.status ~= 200 then
        ngx.log(ngx.ERR, "Failed to send Slack alert: ", describe_failure(res, err))
        return false
    end
    return true
end

-- Send alert_data by email via the SendGrid v3 API.
-- recipients: array of { email = ..., name = ... } tables.
-- Returns true on HTTP 202 (SendGrid's accepted status).
function alerting.send_email_alert(alert_data, recipients)
    local httpc = http.new()
    httpc:set_timeout(10000)
    local email_data = {
        personalizations = {
            {
                to = recipients,
                subject = string.format("[OpenResty Alert] %s", alert_data.title)
            }
        },
        from = {
            email = "alerts@yourcompany.com",
            name = "OpenResty Monitoring"
        },
        content = {
            {
                type = "text/html",
                value = string.format([[
<h2>OpenResty Alert</h2>
<p><strong>Severity:</strong> %s</p>
<p><strong>Instance:</strong> %s</p>
<p><strong>Description:</strong> %s</p>
<p><strong>Timestamp:</strong> %s</p>
]], alert_data.severity, alert_data.instance or "unknown",
                    alert_data.description, os.date("%Y-%m-%d %H:%M:%S", alert_data.timestamp))
            }
        }
    }
    local res, err = httpc:request_uri(alert_config.email_api_url, {
        method = "POST",
        headers = {
            ["Authorization"] = "Bearer " .. alert_config.email_api_key,
            ["Content-Type"] = "application/json"
        },
        body = cjson.encode(email_data)
    })
    if not res or res.status ~= 202 then
        ngx.log(ngx.ERR, "Failed to send email alert: ", describe_failure(res, err))
        return false
    end
    return true
end

-- True while alert_key is still inside its cooldown window.
local function is_in_cooldown(alert_key)
    local cache = ngx.shared.alerts_cache
    if not cache then
        -- No shared dict configured: never suppress.
        return false
    end
    local last_alert_time = cache:get(alert_key)
    if last_alert_time then
        local current_time = ngx.time()
        return (current_time - last_alert_time) < alert_config.alert_cooldown
    end
    return false
end

-- Start a cooldown window for alert_key (entry expires with the window).
local function set_cooldown(alert_key)
    local cache = ngx.shared.alerts_cache
    if cache then
        cache:set(alert_key, ngx.time(), alert_config.alert_cooldown)
    end
end

-- Main entry point: deliver an alert unless it is in cooldown.
-- Email is sent only for critical severity; Slack for everything.
function alerting.trigger_alert(alert_data)
    local alert_key = string.format("%s:%s", alert_data.type, alert_data.instance or "global")
    if is_in_cooldown(alert_key) then
        ngx.log(ngx.INFO, "Alert in cooldown period: ", alert_key)
        return
    end
    alerting.send_slack_alert(alert_data)
    if alert_data.severity == "critical" then
        local recipients = {
            {email = "admin@yourcompany.com", name = "Admin"}
        }
        alerting.send_email_alert(alert_data, recipients)
    end
    set_cooldown(alert_key)
    ngx.log(ngx.WARN, "Alert triggered: ", cjson.encode(alert_data))
end

return alerting
总结
日志处理与监控是OpenResty应用运维的重要组成部分。通过合理的日志配置、性能监控和告警机制,可以:
- 及时发现问题:通过实时监控快速识别异常
- 优化性能:基于性能数据进行系统调优
- 保障安全:通过安全日志监控威胁
- 支持决策:基于业务数据做出决策
- 满足合规:记录审计所需的日志信息
关键最佳实践: - 结构化日志格式 - 异步日志处理 - 合理的日志轮转策略 - 多维度性能监控 - 智能告警机制