1. Deployment Architecture Design

1.1 Typical Deployment Architecture

                    ┌─────────────────┐
                    │   Load Balancer │
                    │   (HAProxy/F5)  │
                    └─────────┬───────┘
                              │
              ┌───────────────┼───────────────┐
              │               │               │
    ┌─────────▼─────────┐ ┌───▼────┐ ┌───────▼─────────┐
    │   OpenResty-1     │ │   ...  │ │   OpenResty-N   │
    │   (API Gateway)   │ │        │ │   (API Gateway) │
    └─────────┬─────────┘ └────────┘ └─────────┬───────┘
              │                                │
    ┌─────────▼─────────┐                ┌─────▼─────────┐
    │   Backend Apps    │                │ Backend Apps  │
    │   (Microservices) │                │(Microservices)│
    └─────────┬─────────┘                └─────┬─────────┘
              │                                │
    ┌─────────▼─────────┐                ┌─────▼─────────┐
    │    Database       │                │   Database    │
    │   (MySQL/Redis)   │                │ (MySQL/Redis) │
    └───────────────────┘                └───────────────┘

1.2 Choosing a Deployment Mode

1.2.1 Single-Node Deployment

# Suitable for: development and testing, small applications
# Pros: simple, low cost
# Cons: single point of failure, limited capacity

# Deployment script
#!/bin/bash
# deploy_single.sh

set -e

APP_NAME="openresty-app"
APP_DIR="/opt/${APP_NAME}"
NGINX_CONF="/etc/openresty/nginx.conf"
LUA_DIR="${APP_DIR}/lua"

echo "Starting single node deployment..."

# 1. Stop the existing service
sudo systemctl stop openresty || true

# 2. Back up the current configuration
sudo cp ${NGINX_CONF} ${NGINX_CONF}.backup.$(date +%Y%m%d_%H%M%S)

# 3. Deploy the application code
sudo mkdir -p ${APP_DIR}
sudo cp -r ./src/* ${APP_DIR}/
sudo chown -R openresty:openresty ${APP_DIR}

# 4. Install and test the new configuration
sudo cp ./config/nginx.conf ${NGINX_CONF}
sudo openresty -t -c ${NGINX_CONF}

# 5. Start the service
sudo systemctl start openresty
sudo systemctl enable openresty

# 6. Health check
for i in {1..30}; do
    if curl -fs http://localhost/health > /dev/null; then
        echo "Deployment successful!"
        exit 0
    fi
    echo "Waiting for service to start... ($i/30)"
    sleep 2
done

echo "Deployment failed - service not responding"
exit 1
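
The deploy scripts in this chapter assume the gateway exposes a /health endpoint. A minimal sketch of one (purely illustrative; the richer health_check module in section 4.2 can replace it later):

# nginx.conf (fragment) - lightweight liveness endpoint for the deploy scripts
location = /health {
    access_log off;
    content_by_lua_block {
        ngx.header.content_type = "application/json"
        ngx.say('{"status":"healthy"}')
    }
}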

1.2.2 Cluster Deployment

#!/bin/bash
# deploy_cluster.sh

set -e

# Cluster node list
NODES=("10.0.1.10" "10.0.1.11" "10.0.1.12")
USER="deploy"
APP_NAME="openresty-app"

echo "Starting cluster deployment..."

# Deploy to a single node (invoked in parallel below)
deploy_to_node() {
    local node=$1
    echo "Deploying to node: $node"
    
    # Sync application code
    rsync -avz --delete ./src/ ${USER}@${node}:/opt/${APP_NAME}/
    
    # Upload the configuration
    scp ./config/nginx.conf ${USER}@${node}:/etc/openresty/
    
    # Run the remote deployment steps
    ssh ${USER}@${node} << 'EOF'
        set -e
        # Test the configuration
        sudo openresty -t
        
        # Graceful reload
        sudo systemctl reload openresty
        
        # Health check
        sleep 5
        curl -fs http://localhost/health > /dev/null || exit 1
EOF
    
    echo "Node $node deployed successfully"
}

# Deploy to all nodes in parallel, tracking each job's PID
pids=()
for node in "${NODES[@]}"; do
    deploy_to_node "$node" &
    pids+=($!)
done

# Wait for every deployment; with set -e, a failed node aborts the script
for pid in "${pids[@]}"; do
    wait "$pid"
done

echo "Cluster deployment completed!"

# Verify cluster health
echo "Verifying cluster health..."
for node in "${NODES[@]}"; do
    if curl -fs http://${node}/health > /dev/null; then
        echo "✓ Node $node is healthy"
    else
        echo "✗ Node $node is unhealthy"
    fi
done

1.2.3 Containerized Deployment

# Dockerfile
FROM openresty/openresty:1.21.4.1-alpine

# Install dependencies
RUN apk add --no-cache \
    curl \
    bash \
    tzdata

# Set the timezone
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Create the application directory
WORKDIR /app

# Copy Lua libraries
COPY lua/ /usr/local/openresty/lualib/app/

# Copy the configuration file
COPY config/nginx.conf /usr/local/openresty/nginx/conf/nginx.conf

# Copy the entrypoint script
COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Create the log directory
RUN mkdir -p /var/log/nginx

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost/health || exit 1

# Expose ports
EXPOSE 80 443

# Entrypoint
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/usr/local/openresty/bin/openresty", "-g", "daemon off;"]

#!/bin/bash
# scripts/entrypoint.sh

set -e

# Defaults for environment variables
WORKER_PROCESSES=${WORKER_PROCESSES:-auto}
WORKER_CONNECTIONS=${WORKER_CONNECTIONS:-1024}
LOG_LEVEL=${LOG_LEVEL:-info}

# Generate 'env' directives: nginx clears the inherited environment, so only
# variables declared with 'env' are visible to os.getenv() in Lua
cat > /tmp/env.conf << EOF
env WORKER_PROCESSES;
env WORKER_CONNECTIONS;
env LOG_LEVEL;
env DATABASE_URL;
env REDIS_URL;
env JWT_SECRET;
EOF

# Insert the env directives near the top of nginx.conf ('env' is only valid in the main context)
sed -i '1r /tmp/env.conf' /usr/local/openresty/nginx/conf/nginx.conf

# Test the configuration
/usr/local/openresty/bin/openresty -t

# Exec the command passed in (the Dockerfile CMD by default)
exec "$@"

# docker-compose.yml
version: '3.8'

services:
  openresty:
    build: .
    ports:
      - "80:80"
      - "443:443"
    environment:
      - WORKER_PROCESSES=auto
      - WORKER_CONNECTIONS=1024
      - LOG_LEVEL=info
      - DATABASE_URL=mysql://user:pass@db:3306/app
      - REDIS_URL=redis://redis:6379/0
      - JWT_SECRET=your-secret-key
    volumes:
      - ./logs:/var/log/nginx
      - ./ssl:/etc/ssl/certs
    depends_on:
      - db
      - redis
    restart: unless-stopped
    deploy:
      # NOTE: replicas > 1 conflicts with the fixed host-port bindings above;
      # front the replicas with the load balancer or drop the published ports
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M

  db:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=rootpass
      - MYSQL_DATABASE=app
      - MYSQL_USER=user
      - MYSQL_PASSWORD=pass
    volumes:
      - db_data:/var/lib/mysql
      - ./sql:/docker-entrypoint-initdb.d
    restart: unless-stopped

  redis:
    image: redis:7-alpine
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    restart: unless-stopped

  nginx:
    image: nginx:alpine
    ports:
      - "8080:80"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
    depends_on:
      - openresty
    restart: unless-stopped

volumes:
  db_data:
  redis_data:

networks:
  default:
    driver: bridge
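
With these files in place, docker compose up -d --build builds the image and starts the stack, and docker compose logs -f openresty tails the gateway logs during a rollout.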

2. Configuration Management

2.1 Separating Environment Configuration

-- config/env.lua - environment configuration management
local env = {}

-- Environment type
local ENV_TYPE = os.getenv("ENV_TYPE") or "development"

-- Base configuration shared by all environments
local base_config = {
    app_name = "openresty-app",
    version = "1.0.0",
    
    -- Server settings
    server = {
        worker_processes = "auto",
        worker_connections = 1024,
        keepalive_timeout = 65,
        client_max_body_size = "10m"
    },
    
    -- Logging settings
    logging = {
        level = "info",
        access_log = "/var/log/nginx/access.log",
        error_log = "/var/log/nginx/error.log"
    },
    
    -- Security settings
    security = {
        -- fallback for local use only; always set JWT_SECRET in production
        jwt_secret = os.getenv("JWT_SECRET") or "default-secret",
        session_timeout = 3600,
        rate_limit = {
            requests_per_minute = 60,
            burst = 10
        }
    }
}

-- Environment-specific overrides
local env_configs = {
    development = {
        debug = true,
        
        database = {
            host = "localhost",
            port = 3306,
            database = "app_dev",
            user = "dev_user",
            password = "dev_pass",
            pool_size = 5
        },
        
        redis = {
            host = "localhost",
            port = 6379,
            database = 0,
            pool_size = 10
        },
        
        logging = {
            level = "debug"
        }
    },
    
    testing = {
        debug = true,
        
        database = {
            host = "test-db",
            port = 3306,
            database = "app_test",
            user = "test_user",
            password = "test_pass",
            pool_size = 3
        },
        
        redis = {
            host = "test-redis",
            port = 6379,
            database = 1,
            pool_size = 5
        }
    },
    
    production = {
        debug = false,
        
        database = {
            host = os.getenv("DB_HOST") or "prod-db",
            port = tonumber(os.getenv("DB_PORT")) or 3306,
            database = os.getenv("DB_NAME") or "app_prod",
            user = os.getenv("DB_USER") or "prod_user",
            password = os.getenv("DB_PASSWORD") or "prod_pass",
            pool_size = tonumber(os.getenv("DB_POOL_SIZE")) or 20
        },
        
        redis = {
            host = os.getenv("REDIS_HOST") or "prod-redis",
            port = tonumber(os.getenv("REDIS_PORT")) or 6379,
            database = tonumber(os.getenv("REDIS_DB")) or 0,
            pool_size = tonumber(os.getenv("REDIS_POOL_SIZE")) or 50
        },
        
        logging = {
            level = "warn"
        },
        
        server = {
            worker_processes = tonumber(os.getenv("WORKER_PROCESSES")) or "auto",
            worker_connections = tonumber(os.getenv("WORKER_CONNECTIONS")) or 2048
        }
    }
}

-- Recursively merge two config tables (override values win)
function env.deep_merge(base, override)
    local result = {}
    
    -- Copy the base configuration
    for k, v in pairs(base) do
        if type(v) == "table" then
            result[k] = env.deep_merge(v, {})
        else
            result[k] = v
        end
    end
    
    -- Apply the overrides
    for k, v in pairs(override) do
        if type(v) == "table" and type(result[k]) == "table" then
            result[k] = env.deep_merge(result[k], v)
        else
            result[k] = v
        end
    end
    
    return result
end

-- Get the merged configuration for the current environment
function env.get_config()
    local env_config = env_configs[ENV_TYPE] or env_configs.development
    return env.deep_merge(base_config, env_config)
end

-- Get the environment type
function env.get_env_type()
    return ENV_TYPE
end

-- True when running in production
function env.is_production()
    return ENV_TYPE == "production"
end

-- True when running in development
function env.is_development()
    return ENV_TYPE == "development"
end

return env
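
A minimal usage sketch (a hypothetical init.lua loaded via init_by_lua_file; assumes config/env.lua is reachable through lua_package_path):

-- init.lua - load and sanity-check the merged configuration at startup
local env = require "config.env"
local config = env.get_config()

ngx.log(ngx.NOTICE, "starting ", config.app_name, " v", config.version,
        " in ", env.get_env_type(), " mode")

-- deep_merge is recursive: production only replaces the keys it sets, so
-- logging.level becomes "warn" while logging.access_log keeps the base value.
if env.is_production() and config.security.jwt_secret == "default-secret" then
    ngx.log(ngx.WARN, "JWT_SECRET is not set; using the insecure default")
end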

2.2 Hot Configuration Reload

-- config/hot_reload.lua - hot configuration reload
local hot_reload = {}
local cjson = require "cjson"
-- LuaFileSystem is not bundled with OpenResty; install it separately
-- (e.g. luarocks install luafilesystem)
local lfs = require "lfs"

-- Config files to watch
local config_files = {
    "/etc/openresty/app.conf",
    "/etc/openresty/routes.conf",
    "/etc/openresty/upstream.conf"
}

-- Cache of file modification times
local file_mtimes = {}

-- Parsed config cache (per worker)
local config_cache = {}

-- Check whether a file changed since the last poll
function hot_reload.check_file_modified(filepath)
    local attr = lfs.attributes(filepath)
    if not attr then
        return false
    end
    
    local current_mtime = attr.modification
    local cached_mtime = file_mtimes[filepath]
    
    if not cached_mtime or current_mtime > cached_mtime then
        file_mtimes[filepath] = current_mtime
        return true
    end
    
    return false
end

-- Load and parse a JSON config file
function hot_reload.load_config_file(filepath)
    local file = io.open(filepath, "r")
    if not file then
        ngx.log(ngx.ERR, "Failed to open config file: ", filepath)
        return nil
    end
    
    local content = file:read("*all")
    file:close()
    
    local success, config = pcall(cjson.decode, content)
    if not success then
        ngx.log(ngx.ERR, "Failed to parse config file: ", filepath, ", error: ", config)
        return nil
    end
    
    return config
end

-- Reload any modified config files
function hot_reload.reload_config()
    local updated = false
    
    for _, filepath in ipairs(config_files) do
        if hot_reload.check_file_modified(filepath) then
            ngx.log(ngx.INFO, "Config file modified, reloading: ", filepath)
            
            local new_config = hot_reload.load_config_file(filepath)
            if new_config then
                config_cache[filepath] = new_config
                updated = true
                
                -- Fire the config-updated hook
                hot_reload.on_config_updated(filepath, new_config)
            end
        end
    end
    
    return updated
end

-- Config-updated callback
function hot_reload.on_config_updated(filepath, config)
    -- Dispatch to the right updater based on the file name
    if filepath:match("routes%.conf$") then
        hot_reload.update_routes(config)
    elseif filepath:match("upstream%.conf$") then
        hot_reload.update_upstream(config)
    elseif filepath:match("app%.conf$") then
        hot_reload.update_app_config(config)
    end
end

-- Update routes (requires a "routes" lua_shared_dict in the http block)
function hot_reload.update_routes(routes_config)
    local shared_dict = ngx.shared.routes
    if not shared_dict then
        ngx.log(ngx.ERR, "Routes shared dict not found")
        return
    end
    
    -- Drop the existing routes
    shared_dict:flush_all()
    
    -- Load the new routes
    for path, config in pairs(routes_config) do
        local success, err = shared_dict:set(path, cjson.encode(config))
        if not success then
            ngx.log(ngx.ERR, "Failed to update route: ", path, ", error: ", err)
        end
    end
    
    ngx.log(ngx.INFO, "Routes updated successfully")
end

-- Update upstreams (requires an "upstream" lua_shared_dict)
function hot_reload.update_upstream(upstream_config)
    local shared_dict = ngx.shared.upstream
    if not shared_dict then
        ngx.log(ngx.ERR, "Upstream shared dict not found")
        return
    end
    
    for name, servers in pairs(upstream_config) do
        local success, err = shared_dict:set(name, cjson.encode(servers))
        if not success then
            ngx.log(ngx.ERR, "Failed to update upstream: ", name, ", error: ", err)
        end
    end
    
    ngx.log(ngx.INFO, "Upstream configuration updated successfully")
end

-- Update application settings (requires an "app_config" lua_shared_dict)
function hot_reload.update_app_config(app_config)
    local shared_dict = ngx.shared.app_config
    if not shared_dict then
        ngx.log(ngx.ERR, "App config shared dict not found")
        return
    end
    
    for key, value in pairs(app_config) do
        local encoded_value = type(value) == "table" and cjson.encode(value) or tostring(value)
        local success, err = shared_dict:set(key, encoded_value)
        if not success then
            ngx.log(ngx.ERR, "Failed to update app config: ", key, ", error: ", err)
        end
    end
    
    ngx.log(ngx.INFO, "App configuration updated successfully")
end

-- Get a cached config table by file path
function hot_reload.get_config(filepath)
    return config_cache[filepath]
end

-- Initialization: load every config file once
function hot_reload.init()
    -- Initial load of all config files
    for _, filepath in ipairs(config_files) do
        local config = hot_reload.load_config_file(filepath)
        if config then
            config_cache[filepath] = config
            hot_reload.on_config_updated(filepath, config)
        end
    end
end

-- Poll for config changes (start this from init_worker_by_lua*)
function hot_reload.start_monitor()
    local function check_config()
        hot_reload.reload_config()
    end
    
    -- Check the config files every 5 seconds
    local ok, err = ngx.timer.every(5, check_config)
    if not ok then
        ngx.log(ngx.ERR, "Failed to start config monitor: ", err)
    else
        ngx.log(ngx.INFO, "Config monitor started")
    end
end

return hot_reload
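
A wiring sketch, assuming the shared dicts used above (routes, upstream, app_config) are declared in the http block and this file is loaded via init_worker_by_lua_file:

-- init_worker.lua - start the config monitor
local hot_reload = require "config.hot_reload"

-- Every worker warms its local cache
hot_reload.init()

-- Polling and shared-dict updates only need one worker; the others
-- read the updated values from the shared dicts.
if ngx.worker.id() == 0 then
    hot_reload.start_monitor()
end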

3. Service Management

3.1 Systemd Service Configuration

# /etc/systemd/system/openresty.service
[Unit]
Description=OpenResty Web Server
After=network.target remote-fs.target nss-lookup.target
Wants=network.target

[Service]
Type=forking
# PIDFile must match the `pid` directive in nginx.conf
PIDFile=/var/run/openresty.pid
ExecStartPre=/usr/local/openresty/bin/openresty -t -q -g 'daemon on; master_process on;'
ExecStart=/usr/local/openresty/bin/openresty -g 'daemon on; master_process on;'
ExecReload=/bin/kill -s HUP $MAINPID
ExecStop=/bin/kill -s TERM $MAINPID
KillSignal=SIGTERM
KillMode=mixed
PrivateTmp=true
LimitNOFILE=65536
LimitNPROC=32768
LimitCORE=infinity
Restart=on-failure
RestartSec=5
User=openresty
Group=openresty
# Required to bind ports 80/443 when the master does not run as root
AmbientCapabilities=CAP_NET_BIND_SERVICE

# Environment variables
Environment=ENV_TYPE=production
Environment=WORKER_PROCESSES=auto
Environment=WORKER_CONNECTIONS=2048

# Security hardening
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/log/nginx /var/cache/nginx /var/run

[Install]
WantedBy=multi-user.target
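
After installing or editing the unit file, run systemctl daemon-reload so systemd picks up the changes, then systemctl enable --now openresty. If startup fails, systemctl status openresty and journalctl -u openresty are the first places to look.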

3.2 Process Management Script

#!/bin/bash
# scripts/openresty-manager.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
APP_NAME="openresty-app"
PID_FILE="/var/run/openresty.pid"
LOG_DIR="/var/log/nginx"
CONF_FILE="/etc/openresty/nginx.conf"

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Logging helpers
log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Check process status
check_status() {
    if [ -f "$PID_FILE" ]; then
        local pid=$(cat "$PID_FILE")
        if kill -0 "$pid" 2>/dev/null; then
            echo "running"
            return 0
        else
            echo "dead"
            return 1
        fi
    else
        echo "stopped"
        return 1
    fi
}

# Start the service
start_service() {
    log_info "Starting OpenResty..."
    
    local status=$(check_status)
    if [ "$status" = "running" ]; then
        log_warn "OpenResty is already running"
        return 0
    fi
    
    # Test the configuration
    if ! /usr/local/openresty/bin/openresty -t -c "$CONF_FILE"; then
        log_error "Configuration test failed"
        return 1
    fi
    
    # Launch the master process
    /usr/local/openresty/bin/openresty -c "$CONF_FILE"
    
    # Wait for startup
    sleep 2
    
    local status=$(check_status)
    if [ "$status" = "running" ]; then
        log_info "OpenResty started successfully"
        return 0
    else
        log_error "Failed to start OpenResty"
        return 1
    fi
}

# Stop the service
stop_service() {
    log_info "Stopping OpenResty..."
    
    local status=$(check_status)
    if [ "$status" = "stopped" ]; then
        log_warn "OpenResty is not running"
        return 0
    fi
    
    if [ -f "$PID_FILE" ]; then
        local pid=$(cat "$PID_FILE")
        
        # Send TERM for a graceful shutdown
        kill -TERM "$pid" 2>/dev/null || true
        
        # Wait for the process to exit
        local count=0
        while kill -0 "$pid" 2>/dev/null && [ $count -lt 30 ]; do
            sleep 1
            count=$((count + 1))
        done
        
        # Force kill if it is still running
        if kill -0 "$pid" 2>/dev/null; then
            log_warn "Force killing OpenResty process"
            kill -KILL "$pid" 2>/dev/null || true
        fi
        
        rm -f "$PID_FILE"
    fi
    
    log_info "OpenResty stopped"
}

# Restart the service
restart_service() {
    log_info "Restarting OpenResty..."
    stop_service
    sleep 2
    start_service
}

# Reload the configuration
reload_service() {
    log_info "Reloading OpenResty configuration..."
    
    local status=$(check_status)
    if [ "$status" != "running" ]; then
        log_error "OpenResty is not running"
        return 1
    fi
    
    # Test the configuration
    if ! /usr/local/openresty/bin/openresty -t -c "$CONF_FILE"; then
        log_error "Configuration test failed"
        return 1
    fi
    
    # Send HUP to reload the configuration
    local pid=$(cat "$PID_FILE")
    kill -HUP "$pid"
    
    log_info "Configuration reloaded successfully"
}

# Show status
show_status() {
    local status=$(check_status)
    
    echo "OpenResty Status: $status"
    
    if [ "$status" = "running" ]; then
        local pid=$(cat "$PID_FILE")
        echo "PID: $pid"
        
        # Show process info
        ps -p "$pid" -o pid,ppid,user,%cpu,%mem,vsz,rss,tty,stat,start,time,command
        
        # Show listening ports (ss output tags sockets with pid=<PID>)
        echo ""
        echo "Listening ports:"
        ss -tlnp 2>/dev/null | grep "pid=$pid," || true
    fi
}

# Show logs
show_logs() {
    local log_type=${1:-access}
    local lines=${2:-50}
    
    case $log_type in
        access)
            tail -n "$lines" "$LOG_DIR/access.log"
            ;;
        error)
            tail -n "$lines" "$LOG_DIR/error.log"
            ;;
        *)
            log_error "Unknown log type: $log_type (use 'access' or 'error')"
            return 1
            ;;
    esac
}

# Health check
health_check() {
    local endpoint=${1:-"http://localhost/health"}
    
    log_info "Performing health check on $endpoint"
    
    if curl -f -s "$endpoint" > /dev/null; then
        log_info "Health check passed"
        return 0
    else
        log_error "Health check failed"
        return 1
    fi
}

# Show help
show_help() {
    cat << EOF
Usage: $0 {start|stop|restart|reload|status|logs|health|help}

Commands:
    start       Start OpenResty service
    stop        Stop OpenResty service
    restart     Restart OpenResty service
    reload      Reload configuration without stopping
    status      Show service status
    logs        Show logs (access|error) [lines]
    health      Perform health check [endpoint]
    help        Show this help message

Examples:
    $0 start
    $0 logs error 100
    $0 health http://localhost:8080/health
EOF
}

# Main entry point
main() {
    case "${1:-}" in
        start)
            start_service
            ;;
        stop)
            stop_service
            ;;
        restart)
            restart_service
            ;;
        reload)
            reload_service
            ;;
        status)
            show_status
            ;;
        logs)
            show_logs "$2" "$3"
            ;;
        health)
            health_check "$2"
            ;;
        help|--help|-h)
            show_help
            ;;
        *)
            log_error "Unknown command: ${1:-}"
            show_help
            exit 1
            ;;
    esac
}

# Require root for state-changing operations
if [ "$EUID" -ne 0 ] && [ "${1:-}" != "status" ] && [ "${1:-}" != "logs" ] && [ "${1:-}" != "health" ] && [ "${1:-}" != "help" ]; then
    log_error "This script must be run as root for most operations"
    exit 1
fi

main "$@"

4. Monitoring and Alerting

4.1 Prometheus Monitoring Integration

-- lua/prometheus_metrics.lua - Prometheus metrics collection
local prometheus_metrics = {}
-- nginx-lua-prometheus; requires a `lua_shared_dict prometheus_metrics 10M;`
-- declaration in the http block
local prometheus = require "prometheus"

-- Initialize Prometheus ("nginx_" is the metric name prefix; the library
-- expects init() to run during init_worker_by_lua*)
local prom = prometheus.init("prometheus_metrics", "nginx_")

-- Metric definitions
local metrics = {
    -- Request counter
    requests_total = prom:counter(
        "http_requests_total",
        "Total number of HTTP requests",
        {"method", "status", "endpoint"}
    ),
    
    -- Request latency histogram
    request_duration = prom:histogram(
        "http_request_duration_seconds",
        "HTTP request latency",
        {"method", "endpoint"},
        {0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}
    ),
    
    -- Active connections (exported as nginx_connections_active,
    -- since the "nginx_" prefix is added by prometheus.init)
    active_connections = prom:gauge(
        "connections_active",
        "Number of active connections"
    ),
    
    -- Database connection pool
    db_connections = prom:gauge(
        "database_connections",
        "Database connection pool status",
        {"pool", "status"}
    ),
    
    -- Cache operations (hit-rate numerator/denominator)
    cache_operations = prom:counter(
        "cache_operations_total",
        "Cache operations",
        {"operation", "result"}
    ),
    
    -- Error counter
    errors_total = prom:counter(
        "errors_total",
        "Total number of errors",
        {"type", "severity"}
    ),
    
    -- Lua memory usage
    memory_usage = prom:gauge(
        "lua_memory_usage_bytes",
        "Lua memory usage in bytes"
    )
}

-- Record request metrics
function prometheus_metrics.record_request(method, status, endpoint, duration)
    metrics.requests_total:inc(1, {method, tostring(status), endpoint})
    metrics.request_duration:observe(duration, {method, endpoint})
end

-- Update the active-connections gauge
function prometheus_metrics.update_connections(active)
    metrics.active_connections:set(active)
end

-- Record database connection pool status
function prometheus_metrics.record_db_connection(pool, status, count)
    metrics.db_connections:set(count, {pool, status})
end

-- Record a cache operation
function prometheus_metrics.record_cache_operation(operation, result)
    metrics.cache_operations:inc(1, {operation, result})
end

-- Record an error
function prometheus_metrics.record_error(error_type, severity)
    metrics.errors_total:inc(1, {error_type, severity})
end

-- Update Lua memory usage (collectgarbage("count") reports KB)
function prometheus_metrics.update_memory_usage()
    local memory_kb = collectgarbage("count")
    metrics.memory_usage:set(memory_kb * 1024)
end

-- Collect all metrics; prom:collect() writes the exposition payload
-- directly to the response via ngx.print
function prometheus_metrics.collect()
    -- Refresh gauges that are sampled at scrape time
    prometheus_metrics.update_memory_usage()
    
    -- Sample nginx connection state ($connections_active needs stub_status)
    local active = tonumber(ngx.var.connections_active)
    if active then
        prometheus_metrics.update_connections(active)
    end
    
    prom:collect()
end

return prometheus_metrics
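
A sketch of feeding the request metrics from the log phase (a hypothetical log.lua behind log_by_lua_file; using the raw ngx.var.uri as the endpoint label is only for illustration, since a normalized route name keeps label cardinality bounded):

-- log.lua - record per-request metrics after the response is sent
local prometheus_metrics = require "prometheus_metrics"

prometheus_metrics.record_request(
    ngx.req.get_method(),
    ngx.status,
    ngx.var.uri,
    tonumber(ngx.var.request_time) or 0
)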

# Metrics endpoint in the nginx configuration
location /metrics {
    access_log off;
    allow 127.0.0.1;
    allow 10.0.0.0/8;
    deny all;
    
    content_by_lua_block {
        local prometheus_metrics = require "prometheus_metrics"
        ngx.header.content_type = "text/plain"
        -- collect() prints the payload itself, so no ngx.say() here
        prometheus_metrics.collect()
    }
}
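
For completeness, a minimal Prometheus scrape configuration matching this endpoint (a sketch; the targets reuse the cluster IPs assumed earlier):

# prometheus/prometheus.yml (fragment)
scrape_configs:
  - job_name: 'openresty'
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ['10.0.1.10:80', '10.0.1.11:80', '10.0.1.12:80']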

# Status endpoint
location /status {
    access_log off;
    allow 127.0.0.1;
    allow 10.0.0.0/8;
    deny all;
    
    content_by_lua_block {
        local cjson = require "cjson"
        
        -- requires a `lua_shared_dict app_start_time` populated at startup
        local start_time = ngx.shared.app_start_time
            and ngx.shared.app_start_time:get("start_time")
        
        local status = {
            timestamp = ngx.time(),
            version = "1.0.0",
            uptime = start_time and (ngx.time() - start_time) or nil,
            connections = {
                active = ngx.var.connections_active,
                reading = ngx.var.connections_reading,
                writing = ngx.var.connections_writing,
                waiting = ngx.var.connections_waiting
            },
            memory = {
                lua_memory_kb = collectgarbage("count"),
                shared_dicts = {}
            }
        }
        
        -- Collect shared dict usage
        local shared_dicts = {"cache", "sessions", "rate_limit"}
        for _, dict_name in ipairs(shared_dicts) do
            local dict = ngx.shared[dict_name]
            if dict then
                status.memory.shared_dicts[dict_name] = {
                    capacity = dict:capacity(),
                    free_space = dict:free_space()
                }
            end
        end
        
        ngx.header.content_type = "application/json"
        ngx.say(cjson.encode(status))
    }
}

4.2 Health Checks

-- lua/health_check.lua - health check module
local health_check = {}
local cjson = require "cjson"

-- Health check definitions
local checks = {
    database = {
        name = "Database Connection",
        timeout = 5,
        critical = true
    },
    redis = {
        name = "Redis Connection",
        timeout = 3,
        critical = true
    },
    external_api = {
        name = "External API",
        timeout = 10,
        critical = false
    },
    disk_space = {
        name = "Disk Space",
        timeout = 1,
        critical = true
    }
}

-- Check database connectivity (connection parameters are placeholders)
function health_check.check_database()
    local mysql = require "resty.mysql"
    local db = mysql:new()
    
    db:set_timeout(5000)
    
    local ok, err = db:connect({
        host = "127.0.0.1",
        port = 3306,
        database = "app",
        user = "app_user",
        password = "app_pass"
    })
    
    if not ok then
        return false, "Connection failed: " .. (err or "unknown error")
    end
    
    local res, err = db:query("SELECT 1 as health_check")
    db:set_keepalive(10000, 100)
    
    if not res then
        return false, "Query failed: " .. (err or "unknown error")
    end
    
    return true, "OK"
end

-- Check Redis connectivity
function health_check.check_redis()
    local redis = require "resty.redis"
    local red = redis:new()
    
    red:set_timeout(3000)
    
    local ok, err = red:connect("127.0.0.1", 6379)
    if not ok then
        return false, "Connection failed: " .. (err or "unknown error")
    end
    
    local res, err = red:ping()
    red:set_keepalive(10000, 100)
    
    if not res then
        return false, "Ping failed: " .. (err or "unknown error")
    end
    
    return true, "OK"
end

-- Check an external API dependency
function health_check.check_external_api()
    local http = require "resty.http"
    local httpc = http.new()
    
    httpc:set_timeout(10000)
    
    local res, err = httpc:request_uri("https://api.example.com/health", {
        method = "GET",
        headers = {
            ["User-Agent"] = "OpenResty-HealthCheck/1.0"
        }
    })
    
    if not res then
        return false, "Request failed: " .. (err or "unknown error")
    end
    
    if res.status ~= 200 then
        return false, "HTTP " .. res.status .. ": " .. (res.body or "")
    end
    
    return true, "OK"
end

-- Check disk space (io.popen blocks the worker; keep this off hot paths)
function health_check.check_disk_space()
    local handle = io.popen("df / | tail -1 | awk '{print $5}' | sed 's/%//'")
    if not handle then
        return false, "Unable to run df"
    end
    local usage = handle:read("*a")
    handle:close()
    
    local usage_percent = tonumber(usage)
    if not usage_percent then
        return false, "Unable to get disk usage"
    end
    
    if usage_percent > 90 then
        return false, "Disk usage too high: " .. usage_percent .. "%"
    elseif usage_percent > 80 then
        return true, "Warning: Disk usage is " .. usage_percent .. "%"
    end
    
    return true, "Disk usage: " .. usage_percent .. "%"
end

-- Run every health check and aggregate the results
function health_check.run_all_checks()
    local results = {
        timestamp = ngx.time(),
        overall_status = "healthy",
        checks = {}
    }
    
    local check_functions = {
        database = health_check.check_database,
        redis = health_check.check_redis,
        external_api = health_check.check_external_api,
        disk_space = health_check.check_disk_space
    }
    
    for check_name, check_config in pairs(checks) do
        local check_func = check_functions[check_name]
        if check_func then
            local start_time = ngx.now()
            local success, message = check_func()
            local duration = ngx.now() - start_time
            
            local check_result = {
                name = check_config.name,
                status = success and "healthy" or "unhealthy",
                message = message,
                duration_ms = duration * 1000,
                critical = check_config.critical
            }
            
            results.checks[check_name] = check_result
            
            -- A failed critical check marks the whole service unhealthy
            if not success and check_config.critical then
                results.overall_status = "unhealthy"
            end
        end
    end
    
    return results
end

-- Simple health check (fast response, no dependency probes)
function health_check.simple_check()
    return {
        status = "healthy",
        timestamp = ngx.time(),
        version = "1.0.0"
    }
end

return health_check
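
A handler sketch tying the module to the /health endpoint (a hypothetical health.lua behind content_by_lua_file; the ?deep=1 switch is an assumption, not part of the module):

-- health.lua - fast liveness by default, full dependency checks on ?deep=1
local cjson = require "cjson"
local health_check = require "health_check"

local result
if ngx.var.arg_deep == "1" then
    result = health_check.run_all_checks()
    if result.overall_status ~= "healthy" then
        ngx.status = 503  -- lets load balancers take the node out of rotation
    end
else
    result = health_check.simple_check()
end

ngx.header.content_type = "application/json"
ngx.say(cjson.encode(result))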

4.3 Alerting Configuration

# prometheus/alert_rules.yml - Prometheus alert rules
groups:
- name: openresty
  rules:
  # Service availability
  - alert: OpenRestyDown
    expr: up{job="openresty"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "OpenResty instance is down"
      description: "OpenResty instance {{ $labels.instance }} has been down for more than 1 minute."
  
  # High error rate (aggregated so the numerator and denominator label sets match)
  - alert: HighErrorRate
    expr: |
      (
        sum by (instance) (rate(nginx_http_requests_total{status=~"5.."}[5m])) /
        sum by (instance) (rate(nginx_http_requests_total[5m]))
      ) * 100 > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value }}% for instance {{ $labels.instance }}"
  
  # High latency
  - alert: HighLatency
    expr: |
      histogram_quantile(0.95,
        rate(nginx_http_request_duration_seconds_bucket[5m])
      ) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High latency detected"
      description: "95th percentile latency is {{ $value }}s for instance {{ $labels.instance }}"
  
  # Memory usage
  - alert: HighMemoryUsage
    expr: nginx_lua_memory_usage_bytes / (1024 * 1024) > 512
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High Lua memory usage"
      description: "Lua memory usage is {{ $value }}MB for instance {{ $labels.instance }}"
  
  # Connection count
  - alert: HighConnectionCount
    expr: nginx_connections_active > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High connection count"
      description: "Active connections: {{ $value }} for instance {{ $labels.instance }}"
  
  # Database connections (metric names carry the nginx_ prefix set in prometheus.init)
  - alert: DatabaseConnectionIssue
    expr: nginx_database_connections{status="error"} > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Database connection issues"
      description: "Database connection errors detected for pool {{ $labels.pool }}"
  
  # Cache hit rate
  - alert: LowCacheHitRate
    expr: |
      (
        sum by (instance) (rate(nginx_cache_operations_total{operation="get",result="hit"}[10m])) /
        sum by (instance) (rate(nginx_cache_operations_total{operation="get"}[10m]))
      ) * 100 < 80
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Low cache hit rate"
      description: "Cache hit rate is {{ $value }}% for instance {{ $labels.instance }}"
  
  # Disk space (requires node_exporter metrics)
  - alert: HighDiskUsage
    expr: |
      (
        (node_filesystem_size_bytes - node_filesystem_avail_bytes) /
        node_filesystem_size_bytes
      ) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High disk usage"
      description: "Disk usage is {{ $value }}% for instance {{ $labels.instance }}"

# alertmanager/alertmanager.yml - Alertmanager configuration
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
  - match:
      severity: critical
    receiver: 'critical-alerts'
  - match:
      severity: warning
    receiver: 'warning-alerts'

receivers:
- name: 'default'
  email_configs:
  - to: 'ops@example.com'
    subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
    body: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
      {{ end }}

- name: 'critical-alerts'
  email_configs:
  - to: 'ops@example.com,oncall@example.com'
    subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
    body: |
      CRITICAL ALERT!
      
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Instance: {{ .Labels.instance }}
      Time: {{ .StartsAt }}
      {{ end }}
  slack_configs:
  - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#alerts'
    title: 'Critical Alert: {{ .GroupLabels.alertname }}'
    text: |
      {{ range .Alerts }}
      {{ .Annotations.summary }}
      {{ .Annotations.description }}
      {{ end }}

- name: 'warning-alerts'
  email_configs:
  - to: 'ops@example.com'
    subject: '[WARNING] {{ .GroupLabels.alertname }}'
    body: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Instance: {{ .Labels.instance }}
      {{ end }}

inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'cluster', 'service']

This chapter has walked through a complete production deployment story for OpenResty: architecture design, configuration management, service management, and monitoring and alerting. Together, these building blocks are what it takes to run a stable, reliable, and scalable OpenResty environment.