11.1 常见问题诊断
11.1.1 连接问题诊断
1. 连接无法建立
#!/bin/bash
# connection_diagnosis.sh
# 连接问题诊断脚本
# Diagnose why a connection to a host/port cannot be established.
#
# Arguments:
#   $1 - target host (name or address)
#   $2 - target port
#   $3 - protocol, "tcp" (default) or "udp"; only affects the nc probe
# Outputs:
#   A human-readable, numbered report on stdout.
diagnose_connection() {
  local target_host="$1"
  local target_port="$2"
  local protocol="${3:-tcp}"
  local conntrack_file="/proc/net/nf_conntrack"
  local icmp_rules chain chain_rules policies
  local conn_count conn_max specific_conn
  # ":<port>" followed by a non-digit or end of line. The listing usually
  # ends a rule with "dpt:<port>" and no trailing blank, and this also keeps
  # port 80 from matching 8080.
  local port_pattern=":${target_port}([^0-9]|\$)"
  local -a nc_opts=(-z -w 5)

  echo "=== Connection Diagnosis for $target_host:$target_port ($protocol) ==="

  # 1. Basic reachability test
  echo "1. Testing basic connectivity..."
  if ping -c 3 "$target_host" > /dev/null 2>&1; then
    echo " ✓ Host is reachable via ICMP"
  else
    echo " ✗ Host is NOT reachable via ICMP"
    echo " Checking if ICMP is blocked..."
    # Show local ICMP rules, which may explain the failed ping.
    icmp_rules=$(iptables -L -n | grep icmp)
    if [ -n "$icmp_rules" ]; then
      echo " ICMP rules found:"
      echo "$icmp_rules" | sed 's/^/ /'
    fi
  fi

  # 2. Port reachability test
  echo "2. Testing port connectivity..."
  if [ "$protocol" = "udp" ]; then
    # NOTE: UDP probing is best effort; an unanswered probe is reported as open
    # by most nc implementations.
    nc_opts+=(-u)
  fi
  if command -v nc > /dev/null; then
    if nc "${nc_opts[@]}" "$target_host" "$target_port" 2>/dev/null; then
      echo " ✓ Port $target_port is open"
    else
      echo " ✗ Port $target_port is closed or filtered"
    fi
  elif command -v telnet > /dev/null; then
    # telnet can only probe TCP. An explicit if/else avoids the
    # "a && b || c" pitfall where c also runs when b fails.
    if timeout 5 telnet "$target_host" "$target_port" 2>/dev/null |
      grep -q "Connected"; then
      echo " ✓ Port $target_port is open"
    else
      echo " ✗ Port $target_port is closed or filtered"
    fi
  else
    echo " ⚠ Neither nc nor telnet is available; port test skipped"
  fi

  # 3. iptables rules that mention the port
  echo "3. Checking iptables rules..."
  for chain in INPUT OUTPUT; do
    echo " $chain chain rules for port $target_port:"
    chain_rules=$(iptables -L "$chain" -n --line-numbers |
      grep -E -- "$port_pattern")
    if [ -n "$chain_rules" ]; then
      echo "$chain_rules" | sed 's/^/ /'
    else
      echo " No specific rules found for port $target_port"
    fi
  done

  # 4. Default chain policies (-n avoids slow reverse DNS lookups)
  echo "4. Checking default policies..."
  policies=$(iptables -L -n | grep "Chain.*policy")
  echo "$policies" | sed 's/^/ /'

  # 5. Connection tracking
  echo "5. Checking connection tracking..."
  if [ -f "$conntrack_file" ]; then
    conn_count=$(wc -l < "$conntrack_file")
    conn_max=$(cat /proc/sys/net/netfilter/nf_conntrack_max)
    echo " Current connections: $conn_count / $conn_max"
    # Entries are written as "dst=<addr> ... dport=<port>", so match whole
    # fields. This only finds entries when target_host is the literal address
    # recorded by conntrack (a host name will not match).
    specific_conn=$(awk -v host="dst=$target_host" -v port="dport=$target_port" '
      {
        h = 0; p = 0
        for (i = 1; i <= NF; i++) {
          if ($i == host) h = 1
          if ($i == port) p = 1
        }
        if (h && p) print
      }' "$conntrack_file" 2>/dev/null)
    if [ -n "$specific_conn" ]; then
      echo " Found connection tracking entry:"
      echo "$specific_conn" | sed 's/^/ /'
    else
      echo " No connection tracking entry found"
    fi
  fi

  # 6. Suggestions
  echo "6. Troubleshooting suggestions:"
  echo " - Check if service is running on target port"
  echo " - Verify iptables rules allow the connection"
  echo " - Check if connection tracking table is full"
  echo " - Review application logs for errors"
  echo " - Test from different source addresses"
}
# 网络接口诊断
# Report status, addresses, routes, firewall rules and traffic counters of
# one network interface.
#
# Arguments:
#   $1 - interface name (e.g. eth0)
# Outputs:
#   A human-readable, numbered report on stdout.
# Returns:
#   1 if "ip link show" fails for the interface; otherwise the status of the
#   last command executed.
diagnose_interface() {
  local interface="$1"
  local stats_dir="/sys/class/net/$interface/statistics"
  local link_info ip_info route_info interface_rules
  local rx_bytes tx_bytes rx_packets tx_packets

  echo "=== Interface Diagnosis for $interface ==="

  # 1. Interface status (query once, reuse the output)
  echo "1. Interface status:"
  if ! link_info=$(ip link show "$interface" 2>/dev/null); then
    echo " ✗ Interface $interface does not exist"
    return 1
  fi
  echo "$link_info" | sed 's/^/ /'
  if echo "$link_info" | grep -q "state UP"; then
    echo " ✓ Interface is UP"
  else
    echo " ✗ Interface is DOWN"
  fi

  # 2. IP address configuration
  echo "2. IP address configuration:"
  ip_info=$(ip addr show "$interface" | grep "inet ")
  if [ -n "$ip_info" ]; then
    echo "$ip_info" | sed 's/^/ /'
  else
    echo " No IP address configured"
  fi

  # 3. Routes through this interface. -F -w matches the name as a whole
  # literal word so that eth1 does not also select eth10.
  echo "3. Routing information:"
  route_info=$(ip route | grep -F -w -- "$interface")
  if [ -n "$route_info" ]; then
    echo "$route_info" | sed 's/^/ /'
  else
    echo " No routes found for this interface"
  fi

  # 4. Rules bound to this interface. The in/out interface columns are only
  # printed in verbose (-v) listings, so -v is required for a match.
  echo "4. Interface-specific iptables rules:"
  interface_rules=$(iptables -L -n -v | grep -F -w -- "$interface")
  if [ -n "$interface_rules" ]; then
    echo "$interface_rules" | sed 's/^/ /'
  else
    echo " No interface-specific rules found"
  fi

  # 5. Traffic counters from sysfs
  echo "5. Traffic statistics:"
  if [ -f "$stats_dir/rx_bytes" ]; then
    rx_bytes=$(cat "$stats_dir/rx_bytes")
    tx_bytes=$(cat "$stats_dir/tx_bytes")
    rx_packets=$(cat "$stats_dir/rx_packets")
    tx_packets=$(cat "$stats_dir/tx_packets")
    echo " RX: $rx_packets packets, $rx_bytes bytes"
    echo " TX: $tx_packets packets, $tx_bytes bytes"
  fi
}
# 使用示例
# Command-line entry point. With no arguments, print the general usage and
# exit 1. Otherwise route the sub-command in $1 to its diagnosis function
# after checking that enough arguments were supplied.
if (( $# == 0 )); then
  printf '%s\n' \
    "Usage: $0 <command> [options]" \
    "Commands:" \
    " connection <host> <port> [protocol] - Diagnose connection issues" \
    " interface <interface> - Diagnose interface issues"
  exit 1
fi

if [[ "$1" == "connection" ]]; then
  # Needs at least <host> and <port>; the protocol is optional.
  if (( $# < 3 )); then
    printf '%s\n' "Usage: $0 connection <host> <port> [protocol]"
    exit 1
  fi
  diagnose_connection "$2" "$3" "$4"
elif [[ "$1" == "interface" ]]; then
  if (( $# < 2 )); then
    printf '%s\n' "Usage: $0 interface <interface>"
    exit 1
  fi
  diagnose_interface "$2"
else
  printf '%s\n' "Unknown command: $1"
  exit 1
fi
11.4 自动化故障排除
11.4.1 自动化诊断脚本
1. 综合故障诊断脚本
#!/bin/bash
# iptables_auto_diagnosis.sh
# 自动化 iptables 故障诊断脚本
# Print one report line to stdout and append it to the report file.
# Relies on bash dynamic scoping: it reads the caller's local $log_file.
_auto_diagnosis_log() {
  printf '%s\n' "$*" | tee -a "$log_file"
}

# Run a full automated diagnosis (interfaces, routing, iptables rules,
# connection tracking, CPU/memory, logs) and write a timestamped report.
#
# Arguments:
#   $1 - directory for the report file (optional, default /var/log)
# Outputs:
#   The report on stdout and in
#   <dir>/iptables_diagnosis_<YYYYmmdd_HHMMSS>.log
# Returns:
#   The number of issues found, capped at 255 so that the exit status cannot
#   wrap around to 0.
auto_diagnosis() {
  local log_dir="${1:-/var/log}"
  local log_file
  log_file="$log_dir/iptables_diagnosis_$(date +%Y%m%d_%H%M%S).log"
  local conntrack_file="/proc/net/nf_conntrack"
  local issues_found=0 usage_percent=0
  local interface status default_route chain policy allow_rules total_rules
  local current_conn max_conn count state
  local load_avg cpu_cores load_per_core memory_usage
  local recent_blocks kernel_msgs network_errors

  # The first write creates/truncates the report; everything else appends.
  echo "=== Automated iptables Diagnosis Report ===" | tee "$log_file"
  _auto_diagnosis_log "Timestamp: $(date)"
  _auto_diagnosis_log "Hostname: $(hostname)"
  _auto_diagnosis_log "Kernel: $(uname -r)"
  _auto_diagnosis_log ""

  # 1. Basic connectivity
  _auto_diagnosis_log "1. Basic Connectivity Check:"
  _auto_diagnosis_log " Network Interfaces:"
  # The loop reads from a process substitution rather than a pipe so that it
  # runs in the current shell and its issues_found increments are kept.
  while read -r interface status; do
    _auto_diagnosis_log " $interface: $status"
    if [ "$status" != "UP" ] && [ "$interface" != "lo" ]; then
      _auto_diagnosis_log " ⚠️ Interface $interface is down"
      issues_found=$((issues_found + 1))
    fi
  done < <(ip link show | awk -F': ' '/^[0-9]+:/ {
    state = ""
    if (match($0, /state [A-Z]+/)) state = substr($0, RSTART + 6, RLENGTH - 6)
    split($2, name, " ")
    print name[1], state
  }')

  _auto_diagnosis_log " Default Route:"
  default_route=$(ip route | grep default)
  if [ -n "$default_route" ]; then
    _auto_diagnosis_log " ✓ $default_route"
  else
    _auto_diagnosis_log " ❌ No default route found"
    issues_found=$((issues_found + 1))
  fi

  # 2. iptables rules
  _auto_diagnosis_log ""
  _auto_diagnosis_log "2. iptables Rules Check:"
  _auto_diagnosis_log " Default Policies:"
  for chain in INPUT OUTPUT FORWARD; do
    policy=$(iptables -L "$chain" -n | head -1 | awk '{print $4}' | tr -d '()')
    _auto_diagnosis_log " $chain: $policy"
    if [ "$policy" = "DROP" ] || [ "$policy" = "REJECT" ]; then
      # A restrictive policy with no ACCEPT rule blocks everything.
      allow_rules=$(iptables -L "$chain" -n | grep -c ACCEPT)
      if [ "${allow_rules:-0}" -eq 0 ]; then
        _auto_diagnosis_log " ⚠️ $chain chain has restrictive policy but no ACCEPT rules"
        issues_found=$((issues_found + 1))
      fi
    fi
  done

  total_rules=$(iptables-save | grep -c "^-A")
  total_rules=${total_rules:-0}
  _auto_diagnosis_log " Total Rules: $total_rules"
  if [ "$total_rules" -gt 1000 ]; then
    _auto_diagnosis_log " ⚠️ Very high rule count may impact performance"
    issues_found=$((issues_found + 1))
  fi

  # 3. Connection tracking
  _auto_diagnosis_log ""
  _auto_diagnosis_log "3. Connection Tracking Check:"
  if [ -f "$conntrack_file" ]; then
    current_conn=$(wc -l < "$conntrack_file")
    max_conn=$(cat /proc/sys/net/netfilter/nf_conntrack_max)
    # Guard against an empty or zero maximum (would divide by zero).
    if [ "${max_conn:-0}" -gt 0 ] 2>/dev/null; then
      usage_percent=$((${current_conn:-0} * 100 / max_conn))
    fi
    _auto_diagnosis_log " Connection Table Usage: $usage_percent% ($current_conn/$max_conn)"
    if [ "$usage_percent" -gt 90 ]; then
      _auto_diagnosis_log " ❌ Connection table near capacity"
      issues_found=$((issues_found + 1))
    elif [ "$usage_percent" -gt 80 ]; then
      _auto_diagnosis_log " ⚠️ Connection table usage is high"
      issues_found=$((issues_found + 1))
    else
      _auto_diagnosis_log " ✓ Connection table usage is normal"
    fi
    # Only TCP entries carry a state; it is the 6th field. Field 4 is the
    # numeric protocol id, not a state.
    _auto_diagnosis_log " Connection States:"
    awk '$3 == "tcp" { print $6 }' "$conntrack_file" |
      sort | uniq -c | sort -nr | head -5 |
      while read -r count state; do
        _auto_diagnosis_log " $state: $count"
      done
  else
    _auto_diagnosis_log " ❌ Connection tracking not available"
    issues_found=$((issues_found + 1))
  fi

  # 4. System resources. awk does the floating-point math so the checks still
  # work on hosts without bc.
  _auto_diagnosis_log ""
  _auto_diagnosis_log "4. System Resource Check:"
  load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',')
  cpu_cores=$(nproc)
  load_per_core=$(awk -v l="$load_avg" -v c="$cpu_cores" \
    'BEGIN { if (c > 0) printf "%.2f", l / c; else printf "0" }')
  _auto_diagnosis_log " CPU Load: $load_avg (${load_per_core} per core)"
  if awk -v v="$load_per_core" 'BEGIN { exit !(v > 2) }'; then
    _auto_diagnosis_log " ❌ Very high CPU load"
    issues_found=$((issues_found + 1))
  elif awk -v v="$load_per_core" 'BEGIN { exit !(v > 1) }'; then
    _auto_diagnosis_log " ⚠️ High CPU load"
    issues_found=$((issues_found + 1))
  fi

  memory_usage=$(free | grep Mem | awk '{printf "%.1f", $3*100/$2}')
  _auto_diagnosis_log " Memory Usage: ${memory_usage}%"
  if awk -v v="$memory_usage" 'BEGIN { exit !(v > 90) }'; then
    _auto_diagnosis_log " ❌ Very high memory usage"
    issues_found=$((issues_found + 1))
  elif awk -v v="$memory_usage" 'BEGIN { exit !(v > 80) }'; then
    _auto_diagnosis_log " ⚠️ High memory usage"
    issues_found=$((issues_found + 1))
  fi

  # 5. Logs
  _auto_diagnosis_log ""
  _auto_diagnosis_log "5. Log Analysis:"
  if [ -f /var/log/messages ]; then
    recent_blocks=$(grep -i "iptables" /var/log/messages | tail -10 | wc -l)
    if [ "$recent_blocks" -gt 0 ]; then
      _auto_diagnosis_log " Recent iptables log entries: $recent_blocks"
      _auto_diagnosis_log " Last 3 entries:"
      grep -i "iptables" /var/log/messages | tail -3 | sed 's/^/ /' | tee -a "$log_file"
    else
      _auto_diagnosis_log " No recent iptables log entries found"
    fi
  fi

  # Network-related kernel messages (dmesg is read once and reused).
  kernel_msgs=$(dmesg 2>/dev/null | grep -i -E "network|iptables|netfilter" | tail -5)
  if [ -n "$kernel_msgs" ]; then
    network_errors=$(printf '%s\n' "$kernel_msgs" | wc -l)
    _auto_diagnosis_log " Recent network-related kernel messages: $network_errors"
    printf '%s\n' "$kernel_msgs" | tail -3 | sed 's/^/ /' | tee -a "$log_file"
  fi

  # 6. Summary and recommendations
  _auto_diagnosis_log ""
  _auto_diagnosis_log "6. Summary and Recommendations:"
  if [ "$issues_found" -eq 0 ]; then
    _auto_diagnosis_log " ✓ No critical issues detected"
    _auto_diagnosis_log " System appears to be functioning normally"
  else
    _auto_diagnosis_log " Found $issues_found potential issues"
    _auto_diagnosis_log " Recommendations:"
    if [ "$total_rules" -gt 500 ]; then
      _auto_diagnosis_log " - Consider optimizing iptables rules"
      _auto_diagnosis_log " - Use ipset for large IP lists"
    fi
    if awk -v v="$load_per_core" 'BEGIN { exit !(v > 1) }'; then
      _auto_diagnosis_log " - Monitor CPU usage and consider hardware upgrade"
    fi
    if awk -v v="$memory_usage" 'BEGIN { exit !(v > 80) }'; then
      _auto_diagnosis_log " - Monitor memory usage and consider adding RAM"
    fi
    # usage_percent stays 0 when connection tracking is unavailable.
    if [ "$usage_percent" -gt 80 ]; then
      _auto_diagnosis_log " - Increase nf_conntrack_max value"
      _auto_diagnosis_log " - Reduce connection timeout values"
    fi
  fi

  _auto_diagnosis_log ""
  _auto_diagnosis_log "Diagnosis report saved to: $log_file"
  _auto_diagnosis_log "Issues found: $issues_found"
  return $((issues_found > 255 ? 255 : issues_found))
}
# 快速健康检查
# Raise the health level stored in the caller's local $status (bash dynamic
# scoping). The level only moves up: HEALTHY -> WARNING -> CRITICAL.
#   $1 - WARNING or CRITICAL
_quick_health_raise() {
  case "$1" in
    CRITICAL) status="CRITICAL" ;;
    WARNING)
      if [ "$status" != "CRITICAL" ]; then
        status="WARNING"
      fi
      ;;
  esac
}

# Fast health check of the firewall host: iptables service and rules,
# loopback and gateway reachability, CPU, memory and conntrack usage.
#
# Outputs:
#   A short report on stdout.
# Returns:
#   0 when HEALTHY, 1 when WARNING, 2 when CRITICAL.
quick_health_check() {
  local status="HEALTHY"
  local conntrack_file="/proc/net/nf_conntrack"
  local rule_count gateway load_avg cpu_cores load_per_core memory_usage
  local current_conn max_conn usage_percent=0

  echo "=== Quick Health Check ==="

  # 1. Basic service check
  echo "1. Basic Service Check:"
  if command -v systemctl >/dev/null 2>&1; then
    if systemctl is-active iptables >/dev/null 2>&1; then
      echo " ✓ iptables service is active"
    else
      echo " ⚠️ iptables service status unknown"
    fi
  fi

  # An empty ruleset lists only a handful of header lines (-n avoids DNS).
  rule_count=$(iptables -L -n | wc -l)
  if [ "$rule_count" -gt 10 ]; then
    echo " ✓ iptables rules are loaded"
  else
    echo " ⚠️ Very few iptables rules detected"
    _quick_health_raise WARNING
  fi

  # 2. Connectivity. Plain echo does not interpret "\n", so blank lines are
  # printed with a separate echo.
  echo
  echo "2. Connectivity Check:"
  if ping -c 1 127.0.0.1 >/dev/null 2>&1; then
    echo " ✓ Localhost connectivity OK"
  else
    echo " ❌ Localhost connectivity failed"
    _quick_health_raise CRITICAL
  fi

  gateway=$(ip route | grep default | awk '{print $3}' | head -1)
  if [ -n "$gateway" ]; then
    if ping -c 1 -W 3 "$gateway" >/dev/null 2>&1; then
      echo " ✓ Gateway ($gateway) reachable"
    else
      echo " ⚠️ Gateway ($gateway) unreachable"
      # Must not downgrade an earlier CRITICAL result.
      _quick_health_raise WARNING
    fi
  else
    echo " ❌ No default gateway found"
    _quick_health_raise CRITICAL
  fi

  # 3. Resources. awk does the floating-point math so the checks still work
  # on hosts without bc.
  echo
  echo "3. Resource Check:"
  load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',')
  cpu_cores=$(nproc)
  load_per_core=$(awk -v l="$load_avg" -v c="$cpu_cores" \
    'BEGIN { if (c > 0) printf "%.2f", l / c; else printf "0" }')
  if awk -v v="$load_per_core" 'BEGIN { exit !(v > 2) }'; then
    echo " ❌ High CPU load: $load_per_core per core"
    _quick_health_raise CRITICAL
  elif awk -v v="$load_per_core" 'BEGIN { exit !(v > 1) }'; then
    echo " ⚠️ Moderate CPU load: $load_per_core per core"
    _quick_health_raise WARNING
  else
    echo " ✓ CPU load normal: $load_per_core per core"
  fi

  memory_usage=$(free | grep Mem | awk '{printf "%.1f", $3*100/$2}')
  if awk -v v="$memory_usage" 'BEGIN { exit !(v > 90) }'; then
    echo " ❌ High memory usage: ${memory_usage}%"
    _quick_health_raise CRITICAL
  elif awk -v v="$memory_usage" 'BEGIN { exit !(v > 80) }'; then
    echo " ⚠️ Moderate memory usage: ${memory_usage}%"
    _quick_health_raise WARNING
  else
    echo " ✓ Memory usage normal: ${memory_usage}%"
  fi

  if [ -f "$conntrack_file" ]; then
    current_conn=$(wc -l < "$conntrack_file")
    max_conn=$(cat /proc/sys/net/netfilter/nf_conntrack_max)
    # Guard against an empty or zero maximum (would divide by zero).
    if [ "${max_conn:-0}" -gt 0 ] 2>/dev/null; then
      usage_percent=$((${current_conn:-0} * 100 / max_conn))
    fi
    if [ "$usage_percent" -gt 90 ]; then
      echo " ❌ Connection table near capacity: ${usage_percent}%"
      _quick_health_raise CRITICAL
    elif [ "$usage_percent" -gt 80 ]; then
      echo " ⚠️ High connection table usage: ${usage_percent}%"
      _quick_health_raise WARNING
    else
      echo " ✓ Connection table usage normal: ${usage_percent}%"
    fi
  fi

  # 4. Summary
  echo
  echo "4. Overall Status: $status"
  case "$status" in
    "HEALTHY")
      echo " ✓ All systems operating normally"
      return 0
      ;;
    "WARNING")
      echo " ⚠️ Some issues detected, monitoring recommended"
      return 1
      ;;
    "CRITICAL")
      echo " ❌ Critical issues detected, immediate attention required"
      return 2
      ;;
  esac
}
# 使用示例
# Command-line entry point: run the sub-command named by the first argument.
# A missing or empty first argument is treated as "help".
if [[ "${1:-help}" == "diagnosis" ]]; then
  auto_diagnosis
elif [[ "${1:-help}" == "health" ]]; then
  quick_health_check
elif [[ "${1:-help}" == "help" ]]; then
  printf '%s\n' \
    "Usage: $0 <command>" \
    "Commands:" \
    " diagnosis - Run comprehensive diagnosis" \
    " health - Quick health check"
else
  printf '%s\n' "Unknown command: $1"
  exit 1
fi
11.5 最佳实践和建议
11.5.1 故障排除最佳实践
1. 系统化的故障排除方法
# Troubleshooting checklist: print the five groups of manual checks
# (basic, rules, connection tracking, performance, logs) as static text.
cat <<'EOF'
=== iptables Troubleshooting Checklist ===

1. 基础检查 (Basic Checks):
 □ 检查网络接口状态
 □ 验证 IP 地址配置
 □ 确认路由表正确
 □ 测试基本连通性

2. iptables 规则检查 (Rule Checks):
 □ 查看默认策略
 □ 检查规则顺序
 □ 验证规则语法
 □ 测试规则匹配

3. 连接跟踪检查 (Connection Tracking):
 □ 检查连接表使用情况
 □ 验证连接状态
 □ 检查超时设置
 □ 监控连接建立速率

4. 性能检查 (Performance Checks):
 □ 监控 CPU 使用率
 □ 检查内存使用情况
 □ 分析规则复杂度
 □ 测试处理速度

5. 日志分析 (Log Analysis):
 □ 检查系统日志
 □ 分析 iptables 日志
 □ 查看内核消息
 □ 监控错误模式
EOF
2. 常见问题解决方案
#!/bin/bash
# common_issues_solutions.sh
# 常见问题解决方案
# Print a static cheat sheet of six common iptables problems, each with the
# commands to check and the usual remedies. Takes no arguments.
solve_common_issues() {
  cat <<'EOF'
=== Common iptables Issues and Solutions ===

1. 连接被拒绝 (Connection Refused):
 问题: 无法连接到服务
 检查:
 - iptables -L INPUT -n -v --line-numbers
 - netstat -tlnp | grep <port>
 - ss -tlnp | grep <port>
 解决:
 - 添加允许规则: iptables -I INPUT -p tcp --dport <port> -j ACCEPT
 - 检查服务是否运行
 - 验证监听地址

2. 连接超时 (Connection Timeout):
 问题: 连接建立缓慢或超时
 检查:
 - ping <target_ip>
 - traceroute <target_ip>
 - iptables -L FORWARD -n -v
 解决:
 - 检查 FORWARD 链规则
 - 验证路由配置
 - 检查 MTU 设置

3. NAT 不工作 (NAT Not Working):
 问题: 网络地址转换失败
 检查:
 - iptables -t nat -L -n -v
 - cat /proc/sys/net/ipv4/ip_forward
 - ip route show
 解决:
 - 启用 IP 转发: echo 1 > /proc/sys/net/ipv4/ip_forward
 - 添加 MASQUERADE 规则
 - 检查源/目标地址

4. 规则不匹配 (Rules Not Matching):
 问题: 规则没有按预期工作
 检查:
 - iptables -L -n -v --line-numbers
 - iptables -Z (重置计数器)
 - tcpdump 抓包分析
 解决:
 - 检查规则顺序
 - 验证匹配条件
 - 使用 LOG 目标调试

5. 性能问题 (Performance Issues):
 问题: 网络性能下降
 检查:
 - iptables-save | wc -l
 - cat /proc/net/nf_conntrack | wc -l
 - top (查看 CPU 使用)
 解决:
 - 优化规则顺序
 - 使用 ipset
 - 增加连接跟踪表大小

6. 连接跟踪表满 (Connection Table Full):
 问题: nf_conntrack: table full
 检查:
 - cat /proc/sys/net/netfilter/nf_conntrack_max
 - cat /proc/net/nf_conntrack | wc -l
 解决:
 - echo 65536 > /proc/sys/net/netfilter/nf_conntrack_max
 - 减少超时时间
 - 使用 NOTRACK 跳过跟踪

EOF
}
# 故障排除工具箱
# Print a static reference list of troubleshooting commands, grouped by
# purpose (connectivity, network state, iptables, packet capture, system
# monitoring, logs). Takes no arguments.
troubleshooting_toolkit() {
  cat <<'EOF'
=== Troubleshooting Toolkit ===

1. 网络连通性测试:
 ping -c 4 <target> # 基本连通性
 traceroute <target> # 路由跟踪
 mtr <target> # 实时路由跟踪
 nc -zv <host> <port> # 端口连通性
 telnet <host> <port> # 交互式连接测试

2. 网络状态查看:
 ss -tuln # 监听端口
 ss -tuap # 所有连接
 netstat -rn # 路由表
 ip route show # 路由信息
 ip addr show # 接口地址

3. iptables 调试:
 iptables -L -n -v # 查看规则和计数
 iptables -t nat -L -n -v # NAT 表规则
 iptables -Z # 重置计数器
 iptables-save # 导出规则
 iptables-restore # 导入规则

4. 数据包分析:
 tcpdump -i any host <ip> # 抓取特定主机
 tcpdump -i any port <port> # 抓取特定端口
 wireshark # 图形化分析
 tshark # 命令行分析

5. 系统监控:
 top # CPU 和内存
 htop # 增强版 top
 iotop # I/O 监控
 iftop # 网络流量
 nload # 网络负载

6. 日志分析:
 tail -f /var/log/messages # 实时日志
 journalctl -f # systemd 日志
 dmesg | tail # 内核消息
 grep iptables /var/log/* # iptables 日志

EOF
}
# 性能优化建议
# Print a static list of performance tuning advice in four groups (rules,
# connection tracking, system, monitoring/maintenance). Takes no arguments.
performance_optimization_tips() {
  cat <<'EOF'
=== Performance Optimization Tips ===

1. 规则优化:
 • 将最常匹配的规则放在前面
 • 使用具体的匹配条件而不是通用条件
 • 避免使用过多的扩展模块
 • 合并相似的规则
 • 删除不必要的规则

2. 连接跟踪优化:
 • 增加 nf_conntrack_max 值
 • 减少连接超时时间
 • 对不需要跟踪的流量使用 NOTRACK
 • 调整哈希表大小

3. 系统优化:
 • 启用网卡多队列
 • 调整中断亲和性
 • 优化内核参数
 • 使用高性能网卡

4. 监控和维护:
 • 定期检查规则使用情况
 • 监控系统资源使用
 • 定期清理日志文件
 • 备份重要配置

EOF
}
# 使用示例
# Command-line entry point: print the reference text selected by the first
# argument. A missing or empty first argument is treated as "help".
if [[ "${1:-help}" == "solutions" ]]; then
  solve_common_issues
elif [[ "${1:-help}" == "toolkit" ]]; then
  troubleshooting_toolkit
elif [[ "${1:-help}" == "optimization" ]]; then
  performance_optimization_tips
elif [[ "${1:-help}" == "help" ]]; then
  printf '%s\n' \
    "Usage: $0 <command>" \
    "Commands:" \
    " solutions - Show common issues and solutions" \
    " toolkit - Display troubleshooting tools" \
    " optimization - Performance optimization tips"
else
  printf '%s\n' "Unknown command: $1"
  exit 1
fi
11.5.2 预防性维护
1. 定期检查脚本
#!/bin/bash
# preventive_maintenance.sh
# 预防性维护脚本
# Print one report line to stdout and append it to the report file.
# Relies on bash dynamic scoping: it reads the caller's local $log_file.
_maintenance_log() {
  printf '%s\n' "$*" | tee -a "$log_file"
}

# Periodic maintenance pass: rule usage and complexity, connection tracking,
# system resources, log housekeeping, ruleset backup (keeping the 10 newest)
# and a basic security review.
#
# Arguments:
#   $1 - directory for the report file (optional, default /var/log)
#   $2 - directory for ruleset backups (optional, default
#        /etc/iptables/backups); created if missing
# Outputs:
#   The report on stdout and in <log dir>/iptables_maintenance_<YYYYmmdd>.log
preventive_maintenance() {
  local log_dir="${1:-/var/log}"
  local backup_dir="${2:-/etc/iptables/backups}"
  local log_file
  log_file="$log_dir/iptables_maintenance_$(date +%Y%m%d).log"
  local conntrack_file="/proc/net/nf_conntrack"
  local chain chain_unused unused_rules=0
  # complexity_ratio starts at 0 so the recommendation step still has a
  # number to compare when the ruleset is empty.
  local complex_rules total_rules complexity_ratio=0
  local current_conn max_conn usage_percent=0 count state time_wait_count
  local memory_usage load_avg disk_usage log_size old_logs
  local backup_file backup_count old_backups policy dangerous_rules
  local -a backups

  # The first write creates/truncates the report; everything else appends.
  echo "=== iptables Preventive Maintenance ===" | tee "$log_file"
  _maintenance_log "Date: $(date)"
  _maintenance_log ""

  # 1. Rule health: rules whose packet counter is still 0
  _maintenance_log "1. Rules Health Check:"
  for chain in INPUT OUTPUT FORWARD; do
    # Skip the two header lines; column 2 is the packet counter.
    chain_unused=$(iptables -L "$chain" -n -v --line-numbers |
      awk 'NR > 2 && $2 == "0" { n++ } END { print n + 0 }')
    unused_rules=$((unused_rules + chain_unused))
    if [ "$chain_unused" -gt 0 ]; then
      _maintenance_log " $chain chain: $chain_unused unused rules"
    fi
  done
  if [ "$unused_rules" -eq 0 ]; then
    _maintenance_log " ✓ All rules are being used"
  else
    _maintenance_log " ⚠️ Found $unused_rules unused rules"
    _maintenance_log " Consider reviewing and removing unused rules"
  fi

  # Rules using three or more "-m" matches count as complex.
  complex_rules=$(iptables-save | grep -c -E "(\-m.*){3,}")
  total_rules=$(iptables-save | grep -c "^-A")
  complex_rules=${complex_rules:-0}
  total_rules=${total_rules:-0}
  if [ "$total_rules" -gt 0 ]; then
    complexity_ratio=$((complex_rules * 100 / total_rules))
    _maintenance_log " Rule complexity: $complexity_ratio% ($complex_rules/$total_rules complex rules)"
    if [ "$complexity_ratio" -gt 30 ]; then
      _maintenance_log " ⚠️ High rule complexity detected"
    fi
  fi

  # 2. Connection tracking
  _maintenance_log ""
  _maintenance_log "2. Connection Tracking Maintenance:"
  if [ -f "$conntrack_file" ]; then
    current_conn=$(wc -l < "$conntrack_file")
    max_conn=$(cat /proc/sys/net/netfilter/nf_conntrack_max)
    # Guard against an empty or zero maximum (would divide by zero).
    if [ "${max_conn:-0}" -gt 0 ] 2>/dev/null; then
      usage_percent=$((${current_conn:-0} * 100 / max_conn))
    fi
    _maintenance_log " Current usage: $usage_percent% ($current_conn/$max_conn)"
    # Only TCP entries carry a state; it is the 6th field. Field 4 is the
    # numeric protocol id, not a state.
    _maintenance_log " Connection state distribution:"
    awk '$3 == "tcp" { print $6 }' "$conntrack_file" |
      sort | uniq -c | sort -nr | head -5 |
      while read -r count state; do
        _maintenance_log " $state: $count connections"
      done
    time_wait_count=$(grep -c TIME_WAIT "$conntrack_file")
    if [ "${time_wait_count:-0}" -gt 1000 ]; then
      _maintenance_log " ⚠️ High TIME_WAIT connections: $time_wait_count"
    fi
  fi

  # 3. System resources
  _maintenance_log ""
  _maintenance_log "3. System Resource Check:"
  memory_usage=$(free | grep Mem | awk '{printf "%.1f", $3*100/$2}')
  _maintenance_log " Memory usage: ${memory_usage}%"
  load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',')
  _maintenance_log " CPU load average: $load_avg"
  # -P keeps each filesystem on one line even with long device names.
  disk_usage=$(df -P / | tail -1 | awk '{print $5}' | tr -d '%')
  _maintenance_log " Root disk usage: ${disk_usage}%"
  if [ "${disk_usage:-0}" -gt 80 ] 2>/dev/null; then
    _maintenance_log " ⚠️ High disk usage detected"
  fi

  # 4. Log housekeeping (report only; nothing is deleted here)
  _maintenance_log ""
  _maintenance_log "4. Log Maintenance:"
  if [ -f /var/log/messages ]; then
    log_size=$(du -h /var/log/messages | awk '{print $1}')
    _maintenance_log " System log size: $log_size"
  fi
  old_logs=$(find "$log_dir" -name "iptables_*.log" -mtime +30 | wc -l)
  if [ "$old_logs" -gt 0 ]; then
    _maintenance_log " Found $old_logs old log files (>30 days)"
    _maintenance_log " Consider cleaning up old logs"
  fi

  # 5. Configuration backup
  _maintenance_log ""
  _maintenance_log "5. Configuration Backup:"
  mkdir -p "$backup_dir"
  backup_file="$backup_dir/iptables_backup_$(date +%Y%m%d_%H%M%S).rules"
  # The redirection creates the file even when iptables-save fails, so
  # success means: the command succeeded AND the file has content.
  if iptables-save > "$backup_file" && [ -s "$backup_file" ]; then
    _maintenance_log " ✓ Configuration backed up to: $backup_file"
    # Keep the 10 newest backups. The timestamp in the name makes the glob's
    # lexical order chronological, so the oldest files come first.
    backups=("$backup_dir"/iptables_backup_*.rules)
    backup_count=${#backups[@]}
    if [ "$backup_count" -gt 10 ]; then
      old_backups=$((backup_count - 10))
      rm -f -- "${backups[@]:0:old_backups}"
      _maintenance_log " Cleaned up $old_backups old backup files"
    fi
  else
    # Do not leave an empty or partial backup behind.
    rm -f -- "$backup_file"
    _maintenance_log " ❌ Failed to create backup"
  fi

  # 6. Security review
  _maintenance_log ""
  _maintenance_log "6. Security Check:"
  for chain in INPUT OUTPUT FORWARD; do
    policy=$(iptables -L "$chain" -n | head -1 | awk '{print $4}' | tr -d '()')
    _maintenance_log " $chain default policy: $policy"
    if [ "$policy" = "ACCEPT" ]; then
      _maintenance_log " ⚠️ $chain chain has permissive default policy"
    fi
  done
  # NOTE(review): iptables-save normally omits the default 0.0.0.0/0
  # addresses, so this pattern may rarely match — confirm on target systems.
  dangerous_rules=$(iptables-save |
    grep -c -E "(0\.0\.0\.0/0.*ACCEPT|ACCEPT.*0\.0\.0\.0/0)")
  dangerous_rules=${dangerous_rules:-0}
  if [ "$dangerous_rules" -gt 0 ]; then
    _maintenance_log " ⚠️ Found $dangerous_rules potentially dangerous rules (ACCEPT from/to anywhere)"
  fi

  # 7. Summary
  _maintenance_log ""
  _maintenance_log "7. Maintenance Summary:"
  _maintenance_log " Maintenance completed successfully"
  _maintenance_log " Report saved to: $log_file"

  # 8. Recommendations
  _maintenance_log ""
  _maintenance_log "8. Recommendations:"
  if [ "$unused_rules" -gt 0 ]; then
    _maintenance_log " • Review and remove $unused_rules unused rules"
  fi
  if [ "$complexity_ratio" -gt 30 ]; then
    _maintenance_log " • Consider simplifying complex rules"
  fi
  # awk does the floating-point comparison, so no bc is needed.
  if awk -v v="$memory_usage" 'BEGIN { exit !(v > 80) }'; then
    _maintenance_log " • Monitor memory usage closely"
  fi
  if [ "$dangerous_rules" -gt 0 ]; then
    _maintenance_log " • Review potentially dangerous rules"
  fi
  _maintenance_log " • Schedule next maintenance check"
}
# 使用示例
# Command-line entry point: "run" starts the maintenance pass. A missing or
# empty first argument is treated as "help".
if [[ "${1:-help}" == "run" ]]; then
  preventive_maintenance
elif [[ "${1:-help}" == "help" ]]; then
  printf '%s\n' \
    "Usage: $0 <command>" \
    "Commands:" \
    " run - Run preventive maintenance"
else
  printf '%s\n' "Unknown command: $1"
  exit 1
fi
11.6 本章小结
11.6.1 关键要点回顾
本章详细介绍了 iptables 故障排除和调试的各种技巧和方法:
1. 连接问题诊断 - 系统化的连接问题排查方法 - 网络接口和路由诊断技巧 - ICMP 和端口连通性测试 - iptables 规则检查和验证
2. 规则匹配问题 - 规则匹配调试技术 - 规则冲突分析方法 - 性能影响评估 - 规则优化建议
3. NAT 问题诊断 - SNAT 和 DNAT 故障排除 - 连接跟踪问题解决 - 路由和地址转换验证
4. 调试工具使用 - tcpdump 数据包分析 - iptables 日志配置和分析 - 连接跟踪调试技巧 - 系统资源监控
5. 自动化故障排除 - 自动化诊断脚本 - 实时监控和告警 - 性能趋势分析 - 预防性维护
11.6.2 最佳实践总结
故障排除原则: 1. 系统化方法:按层次逐步排查 2. 工具结合:多种工具交叉验证 3. 日志分析:重视日志信息 4. 性能监控:关注系统资源 5. 预防为主:定期维护检查
调试技巧: 1. 使用 LOG 目标记录关键信息 2. 重置计数器观察规则匹配 3. 临时规则快速测试 4. 数据包抓取分析流量 5. 连接跟踪状态监控
性能优化: 1. 规则顺序优化 2. 复杂规则简化 3. 连接跟踪调优 4. 系统参数优化 5. 硬件资源升级
11.6.3 故障排除检查清单
基础检查: - [ ] 网络接口状态 - [ ] IP 地址配置 - [ ] 路由表设置 - [ ] 基本连通性
iptables 检查: - [ ] 默认策略设置 - [ ] 规则语法正确性 - [ ] 规则匹配顺序 - [ ] 计数器统计
性能检查: - [ ] CPU 使用率 - [ ] 内存使用情况 - [ ] 连接跟踪使用率 - [ ] 规则处理效率
安全检查: - [ ] 危险规则识别 - [ ] 日志记录配置 - [ ] 访问控制验证 - [ ] 配置备份状态
11.6.4 下一章预告
下一章我们将学习 iptables 与其他工具的集成,包括:
与系统服务集成
- systemd 服务配置
- 开机自启动设置
- 服务依赖管理
与监控工具集成
- Nagios 监控集成
- Zabbix 监控配置
- Prometheus 指标收集
与自动化工具集成
- Ansible 自动化部署
- Puppet 配置管理
- Chef 基础设施代码
与容器技术集成
- Docker 网络集成
- Kubernetes 网络策略
- 容器安全配置
与云平台集成
- AWS 安全组集成
- Azure 网络安全组
- 混合云网络配置
11.7 练习与思考
11.7.1 理论练习
故障排除流程设计
- 设计一个完整的 iptables 故障排除流程
- 包括检查步骤、工具使用、问题分类
性能问题分析
- 分析可能导致 iptables 性能问题的因素
- 提出相应的优化方案
监控指标设计
- 设计 iptables 监控指标体系
- 包括关键指标、告警阈值、响应策略
11.7.2 实践练习
1. **故障模拟和排除**
```bash
# 练习1:模拟连接问题
# 创建阻止 SSH 连接的规则,然后排除故障
# 练习2:模拟 NAT 问题
# 配置错误的 NAT 规则,分析和修复问题
# 练习3:模拟性能问题
# 创建大量复杂规则,分析性能影响
```
2. **调试脚本开发**
```bash
# 开发自定义的故障诊断脚本
# 包括自动检测、问题分类、解决建议
```
3. **监控系统搭建**
```bash
# 搭建 iptables 监控系统
# 包括数据收集、可视化、告警
```
11.7.3 思考题
如何设计一个高效的 iptables 故障排除流程?
在大规模环境中,如何实现 iptables 的自动化监控和故障处理?
如何平衡 iptables 的安全性和性能?
在云环境中,iptables 故障排除有哪些特殊考虑?
如何设计 iptables 的灾难恢复方案?
11.1.2 规则匹配问题
1. 规则匹配调试
#!/bin/bash
# rule_matching_debug.sh
# Rule matching debug script: inspect rule counters, insert a logging test
# rule, look for duplicate/conflicting rules and spot slow match patterns.

# Show all rules with their counters, the rules that never matched and the
# ten busiest rules. Takes no arguments.
debug_rule_matching() {
  echo "=== Rule Matching Debug ==="

  # 1. All rules with their counters
  echo "1. Current rules with packet/byte counters:"
  iptables -L -n -v --line-numbers

  # Rule lines start with a numeric line number; this skips the "Chain ..."
  # and column-header lines of every chain, not just the first one.
  echo -e "\n2. Rules with zero packet count (potentially unused):"
  iptables -L -n -v --line-numbers |
    awk '$1 ~ /^[0-9]+$/ && $2 == "0" {print " Line " $1 ": " $0}'

  # -x prints exact counters; abbreviated values such as "12K" would sort
  # incorrectly with a numeric sort.
  echo -e "\n3. Most active rules (top 10):"
  iptables -L -n -v -x --line-numbers |
    awk '$1 ~ /^[0-9]+$/ {print $2 " " $0}' |
    sort -nr | head -10 | sed 's/^/ /'
}

# Insert a LOG rule at the top of INPUT to see whether given traffic reaches
# the firewall.
#
# Arguments:
#   $1 - source IP
#   $2 - destination IP
#   $3 - destination port
#   $4 - protocol (optional, default tcp)
# Returns:
#   1 if the rule could not be added.
test_rule_matching() {
  local src_ip="$1"
  local dst_ip="$2"
  local dst_port="$3"
  local protocol="${4:-tcp}"
  # An array keeps the log prefix (which contains a space) as one argument.
  # Expanding a flat string unquoted would split it and pass literal quote
  # characters to iptables.
  local -a test_rule=(
    -s "$src_ip" -d "$dst_ip" -p "$protocol" --dport "$dst_port"
    -j LOG --log-prefix "TEST_RULE: "
  )

  echo "=== Testing Rule Matching ==="
  echo "Source: $src_ip"
  echo "Destination: $dst_ip:$dst_port"
  echo "Protocol: $protocol"
  echo

  echo "Adding test rule: iptables -I INPUT 1 -s $src_ip -d $dst_ip -p $protocol --dport $dst_port -j LOG --log-prefix 'TEST_RULE: '"
  if ! iptables -I INPUT 1 "${test_rule[@]}"; then
    echo "Failed to add test rule" >&2
    return 1
  fi

  echo "Test rule added. Generate some traffic and check logs:"
  echo " tail -f /var/log/messages | grep 'TEST_RULE'"
  echo
  echo "To remove test rule: iptables -D INPUT 1"
}

# Look for duplicate rules, ACCEPT/DROP pairs with the same conditions and
# overly broad rules in INPUT. Takes no arguments.
analyze_rule_conflicts() {
  local duplicates input_rules rule condition drop_rule broad_rules

  echo "=== Analyzing Rule Conflicts ==="

  # Only "-A" lines are rules. Each is tagged with its table so that the
  # per-table COMMIT lines, or equal rules in different tables, are not
  # reported as duplicates. No temporary file is needed.
  echo "1. Checking for duplicate rules:"
  duplicates=$(iptables-save |
    awk '/^\*/ { table = substr($0, 2) } /^-A/ { print table ": " $0 }' |
    sort | uniq -d)
  if [ -n "$duplicates" ]; then
    echo " Found duplicate rules:"
    echo "$duplicates" | sed 's/^/ /'
  else
    echo " No duplicate rules found"
  fi

  echo -e "\n2. Checking for conflicting ACCEPT/DROP rules:"
  input_rules=$(iptables -S INPUT | grep -v "^-P")
  while IFS= read -r rule; do
    [[ "$rule" == *"-j ACCEPT"* ]] || continue
    # Strip the chain and target to obtain the bare match conditions.
    condition=${rule/-j ACCEPT/}
    condition=${condition/-A INPUT/}
    # -F treats the conditions literally (dots in addresses are not regex
    # wildcards); "--" protects patterns that begin with a dash.
    drop_rule=$(grep -F -- "$condition" <<< "$input_rules" | grep -- "-j DROP")
    if [ -n "$drop_rule" ]; then
      echo " Potential conflict found:"
      echo " ACCEPT: $rule"
      echo " DROP: $drop_rule"
    fi
  done <<< "$input_rules"

  # Rules without any source, destination or protocol restriction.
  echo -e "\n3. Checking rule order issues:"
  broad_rules=$(iptables -S INPUT | grep -E -- "-j (ACCEPT|DROP)$" |
    grep -v -e "-s" -e "-d" -e "-p")
  if [ -n "$broad_rules" ]; then
    echo " Found broad rules that might block more specific rules:"
    echo "$broad_rules" | sed 's/^/ /'
  fi
}

# Summarise rule counts and point out match types that are expensive to
# evaluate. Takes no arguments.
analyze_rule_performance() {
  local chain count string_rules regex_rules

  echo "=== Rule Performance Analysis ==="

  # 1. Rule count per chain (-n avoids reverse DNS lookups)
  echo "1. Rule count by chain:"
  for chain in INPUT OUTPUT FORWARD; do
    count=$(iptables -L "$chain" -n --line-numbers | tail -n +3 | wc -l)
    echo " $chain: $count rules"
  done

  # 2. Rules with three or more "-m" matches
  echo -e "\n2. Complex rules (multiple conditions):"
  iptables -S | grep -E "(\-m.*){3,}" | sed 's/^/ /'

  # 3. Potentially slow match types
  echo -e "\n3. Potentially inefficient patterns:"
  string_rules=$(iptables -S | grep -- "-m string")
  if [ -n "$string_rules" ]; then
    echo " String matching rules (can be slow):"
    echo "$string_rules" | sed 's/^/ /'
  fi
  regex_rules=$(iptables -S | grep -- "-m regexp")
  if [ -n "$regex_rules" ]; then
    echo " Regular expression rules (can be slow):"
    echo "$regex_rules" | sed 's/^/ /'
  fi

  # 4. Suggestions
  echo -e "\n4. Optimization suggestions:"
  echo " - Move frequently matched rules to the top"
  echo " - Use ipset for large IP lists"
  echo " - Combine similar rules where possible"
  echo " - Avoid string matching in high-traffic rules"
  echo " - Use stateful connection tracking"
}

# Command-line entry point. A missing first argument is treated as "help".
case "${1:-help}" in
  "debug")
    debug_rule_matching
    ;;
  "test")
    if [ $# -lt 4 ]; then
      echo "Usage: $0 test <src_ip> <dst_ip> <dst_port> [protocol]"
      exit 1
    fi
    test_rule_matching "$2" "$3" "$4" "$5"
    ;;
  "conflicts")
    analyze_rule_conflicts
    ;;
  "performance")
    analyze_rule_performance
    ;;
  "help")
    echo "Usage: $0 <command>"
    echo "Commands:"
    echo " debug - Show rule matching debug info"
    echo " test - Test specific rule matching"
    echo " conflicts - Analyze rule conflicts"
    echo " performance - Analyze rule performance"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
11.1.3 NAT 问题诊断
1. NAT 故障排除
#!/bin/bash
# nat_troubleshooting.sh
# NAT 故障排除脚本
# General NAT diagnosis: nat table rules, IP forwarding, NAT-ed connections
# in the conntrack table, interfaces and routes.
#
# Arguments:
#   $1 - conntrack table file (optional, default /proc/net/nf_conntrack)
# Outputs:
#   A human-readable, numbered report on stdout.
diagnose_nat() {
  local conntrack_file="${1:-/proc/net/nf_conntrack}"
  local chain ip_forward nat_entries nat_connections total_connections

  echo "=== NAT Diagnosis ==="

  # 1. NAT table rules
  echo "1. NAT table rules:"
  for chain in PREROUTING POSTROUTING OUTPUT; do
    echo " $chain chain:"
    iptables -t nat -L "$chain" -n -v --line-numbers | sed 's/^/ /'
  done

  # 2. IP forwarding
  echo
  echo "2. IP forwarding status:"
  ip_forward=$(cat /proc/sys/net/ipv4/ip_forward 2>/dev/null)
  if [ "$ip_forward" = "1" ]; then
    echo " ✓ IP forwarding is enabled"
  else
    echo " ✗ IP forwarding is disabled"
    echo " To enable: echo 1 > /proc/sys/net/ipv4/ip_forward"
  fi

  # 3. Connection tracking
  echo
  echo "3. Connection tracking for NAT:"
  if [ -f "$conntrack_file" ]; then
    # Entries carry no "nat" marker. Each entry lists the original tuple and
    # then the reply tuple; without translation the reply is the exact
    # mirror of the original, so any difference means the flow was NAT-ed.
    nat_entries=$(awk '
      {
        ns = nd = nsp = ndp = 0
        for (i = 1; i <= NF; i++) {
          if ($i ~ /^src=/) src[++ns] = substr($i, 5)
          else if ($i ~ /^dst=/) dst[++nd] = substr($i, 5)
          else if ($i ~ /^sport=/) sport[++nsp] = substr($i, 7)
          else if ($i ~ /^dport=/) dport[++ndp] = substr($i, 7)
        }
        if (ns < 2 || nd < 2) next
        if (src[1] != dst[2] || dst[1] != src[2]) { print; next }
        if (nsp >= 2 && ndp >= 2 &&
            (sport[1] != dport[2] || dport[1] != sport[2])) print
      }' "$conntrack_file" 2>/dev/null)
    if [ -n "$nat_entries" ]; then
      nat_connections=$(($(printf '%s\n' "$nat_entries" | wc -l)))
    else
      nat_connections=0
    fi
    # Arithmetic expansion strips any padding some wc versions print.
    total_connections=$(($(wc -l < "$conntrack_file")))
    echo " NAT connections: $nat_connections / $total_connections"
    echo " Sample NAT connections:"
    if [ -n "$nat_entries" ]; then
      printf '%s\n' "$nat_entries" | head -5 | sed 's/^/ /'
    fi
  else
    echo " Connection tracking not available"
  fi

  # 4. Network interfaces
  echo
  echo "4. Network interfaces:"
  ip addr show | grep -E "^[0-9]+:|inet " | sed 's/^/ /'

  # 5. Routing table
  echo
  echo "5. Routing table:"
  ip route | sed 's/^/ /'
}
# SNAT 问题诊断
# Diagnose source NAT for an internal network leaving through an external
# interface.
#
# Arguments:
#   $1 - internal network as it appears in the rule listing
#        (e.g. 192.168.1.0/24)
#   $2 - external interface name (e.g. eth0)
# Outputs:
#   A human-readable, numbered report on stdout.
diagnose_snat() {
  local internal_network="$1"
  local external_interface="$2"
  local postrouting snat_rules="" masq_rules external_ip nat_with_ip
  local -a patterns=()

  echo "=== SNAT Diagnosis ==="
  echo "Internal network: $internal_network"
  echo "External interface: $external_interface"
  echo

  # The POSTROUTING listing is fetched once and reused for both searches.
  postrouting=$(iptables -t nat -L POSTROUTING -n -v)

  # 1. SNAT rules. Match the network/interface as whole literal words so
  # that eth1 does not select eth10 and dots are not regex wildcards. Empty
  # arguments are skipped because an empty pattern would match every line.
  echo "1. SNAT rules in POSTROUTING:"
  if [ -n "$internal_network" ]; then
    patterns+=(-e "$internal_network")
  fi
  if [ -n "$external_interface" ]; then
    patterns+=(-e "$external_interface")
  fi
  if [ "${#patterns[@]}" -gt 0 ]; then
    snat_rules=$(printf '%s\n' "$postrouting" | grep -F -w "${patterns[@]}")
  fi
  if [ -n "$snat_rules" ]; then
    echo "$snat_rules" | sed 's/^/ /'
  else
    echo " No SNAT rules found for specified network/interface"
  fi

  # 2. MASQUERADE rules
  echo
  echo "2. MASQUERADE rules:"
  masq_rules=$(printf '%s\n' "$postrouting" | grep MASQUERADE)
  if [ -n "$masq_rules" ]; then
    echo "$masq_rules" | sed 's/^/ /'
  else
    echo " No MASQUERADE rules found"
  fi

  # 3. SNAT functionality
  echo
  echo "3. Testing SNAT functionality:"
  external_ip=$(ip addr show "$external_interface" 2>/dev/null |
    grep "inet " | awk '{print $2}' | cut -d'/' -f1)
  if [ -n "$external_ip" ]; then
    echo " External interface IP: $external_ip"
    # Whole-word literal match: 10.0.0.1 must not also count 10.0.0.10.
    nat_with_ip=$(grep -c -F -w -- "$external_ip" /proc/net/nf_conntrack 2>/dev/null)
    echo " Active NAT connections using this IP: ${nat_with_ip:-0}"
  else
    echo " ✗ External interface has no IP address"
  fi

  # 4. Suggestions
  echo
  echo "4. Troubleshooting suggestions:"
  echo " - Ensure IP forwarding is enabled"
  echo " - Check POSTROUTING rules for source network"
  echo " - Verify external interface has valid IP"
  echo " - Test connectivity from internal hosts"
  echo " - Check for conflicting rules"
}
# Diagnose a destination NAT (port forward).
# Arguments: $1 - external (published) port
#            $2 - internal target IP
#            $3 - internal target port
# Outputs:   report on stdout
diagnose_dnat() {
  local external_port="$1"
  local internal_ip="$2"
  local internal_port="$3"
  local dnat_rules forward_rules forward_policy dnat_connections

  echo "=== DNAT Diagnosis ==="
  echo "External port: $external_port"
  echo "Internal target: $internal_ip:$internal_port"
  echo

  # 1. DNAT rules: match the "dpt:<port>" column, also at end of line
  echo "1. DNAT rules in PREROUTING:"
  dnat_rules=$(iptables -t nat -L PREROUTING -n -v |
    grep -E "dpt:${external_port}( |\$)")
  if [ -n "$dnat_rules" ]; then
    echo "$dnat_rules" | sed 's/^/ /'
  else
    echo " No DNAT rules found for port $external_port"
  fi

  # 2. FORWARD rules naming the internal host (exact column) and its port
  echo -e "\n2. FORWARD rules for internal target:"
  forward_rules=$(iptables -L FORWARD -n -v |
    awk -v ip="$internal_ip" -v port="$internal_port" '
      {
        hit = 0
        for (i = 1; i <= NF; i++) if ($i == ip) hit = 1
        if (hit && $0 ~ ("dpt:" port "( |$)")) print
      }')
  if [ -n "$forward_rules" ]; then
    echo "$forward_rules" | sed 's/^/ /'
  else
    echo " No specific FORWARD rules found"
    echo " Checking general FORWARD policy:"
    # The header reads "Chain FORWARD (policy DROP)"; strip the parentheses
    # so the policy is not printed as "DROP)".
    forward_policy=$(iptables -L FORWARD -n |
      awk 'NR == 1 { gsub(/[()]/, "", $4); print $4 }')
    echo " FORWARD policy: $forward_policy"
  fi

  # 3. Reachability of the internal host
  echo -e "\n3. Testing internal host connectivity:"
  if ping -c 1 -W 2 "$internal_ip" > /dev/null 2>&1; then
    echo " ✓ Internal host $internal_ip is reachable"
    # Port probe; a missing nc must not be reported as a closed port
    if ! command -v nc > /dev/null; then
      echo " ? nc not available, cannot test port $internal_port"
    elif nc -z -w 2 "$internal_ip" "$internal_port" 2>/dev/null; then
      echo " ✓ Port $internal_port is open on internal host"
    else
      echo " ✗ Port $internal_port is closed on internal host"
    fi
  else
    echo " ✗ Internal host $internal_ip is not reachable"
  fi

  # 4. Connection tracking. Conntrack prints src=/dst=/sport=/dport= fields,
  #    never "ip:port". A DNAT'ed flow is recognised by its reply tuple
  #    (the second src= group) coming from the internal ip and port.
  echo -e "\n4. Connection tracking for DNAT:"
  dnat_connections=$(awk -v ip="$internal_ip" -v port="$internal_port" '
    {
      tuple = 0; s = 0; p = 0
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^src=/) tuple++
        if (tuple == 2) {
          if ($i == "src=" ip) s = 1
          else if ($i == "sport=" port) p = 1
        }
      }
      if (s && p) print
    }' /proc/net/nf_conntrack 2>/dev/null)
  if [ -n "$dnat_connections" ]; then
    echo " Found DNAT connections:"
    echo "$dnat_connections" | sed 's/^/ /'
  else
    echo " No active DNAT connections found"
  fi

  # 5. Suggestions
  echo -e "\n5. Troubleshooting suggestions:"
  echo " - Verify DNAT rule in PREROUTING chain"
  echo " - Check FORWARD rules allow traffic to internal host"
  echo " - Ensure internal service is running and accessible"
  echo " - Test DNAT from external source"
  echo " - Check for firewall rules on internal host"
}
# Command-line entry point: the first argument selects the diagnosis to run
# (defaults to "help").
case "${1:-help}" in
  general)
    diagnose_nat
    ;;
  snat)
    (( $# >= 3 )) || {
      echo "Usage: $0 snat <internal_network> <external_interface>"
      echo "Example: $0 snat 192.168.1.0/24 eth0"
      exit 1
    }
    diagnose_snat "$2" "$3"
    ;;
  dnat)
    (( $# >= 4 )) || {
      echo "Usage: $0 dnat <external_port> <internal_ip> <internal_port>"
      echo "Example: $0 dnat 80 192.168.1.100 8080"
      exit 1
    }
    diagnose_dnat "$2" "$3" "$4"
    ;;
  help)
    printf '%s\n' \
      "Usage: $0 <command> [options]" \
      "Commands:" \
      " general - General NAT diagnosis" \
      " snat <internal_network> <external_if> - SNAT diagnosis" \
      " dnat <ext_port> <int_ip> <int_port> - DNAT diagnosis"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
11.2 调试工具使用
11.2.1 数据包跟踪工具
1. tcpdump 调试
#!/bin/bash
# tcpdump_debug.sh
# tcpdump 调试脚本
# Capture packets on an interface into a pcap file until interrupted.
# Arguments: $1 - interface to capture on
#            $2 - capture filter expression, passed to tcpdump as one argument
#            $3 - output pcap file (default: /tmp/capture.pcap)
# Returns:   exit status of tcpdump
tcpdump_basic_capture() {
  local iface="$1" capture_filter="$2"
  local pcap_path="${3:-/tmp/capture.pcap}"

  # Banner, followed by a blank separator line
  printf '%s\n' \
    "Starting packet capture on $iface" \
    "Filter: $capture_filter" \
    "Output: $pcap_path" \
    "Press Ctrl+C to stop" \
    ""

  tcpdump -i "$iface" -w "$pcap_path" "$capture_filter"
}
# Live one-line-per-packet view of the traffic of one host.
# Arguments: $1 - interface to listen on
#            $2 - IP address of the host to watch
# Outputs:   "[timestamp] src -> dst" per packet, plus a note for TCP SYN
#            and RST packets
tcpdump_realtime_analysis() {
  local interface="$1"
  local target_ip="$2"
  local line timestamp src dst flags
  # "<src> > <dst>:" as printed by tcpdump; anchoring on " > " keeps the
  # parsing independent of how many fields precede the addresses.
  local packet_re='([^ ]+) > ([^ ]+):'
  # Flag letters live inside "Flags [...]" only
  local flags_re='Flags \[([^]]*)\]'

  echo "Real-time packet analysis for $target_ip on $interface"
  echo "Press Ctrl+C to stop"
  echo

  # -l: line-buffered output so packets show up immediately in the pipe.
  # No -v: verbose mode splits a packet over two lines, which breaks the
  # per-line parsing below.
  tcpdump -i "$interface" -n -l host "$target_ip" |
    while IFS= read -r line; do
      [[ $line =~ $packet_re ]] || continue
      src=${BASH_REMATCH[1]}
      dst=${BASH_REMATCH[2]}
      timestamp=${line%% *}
      echo "[$timestamp] $src -> $dst"

      # Only look at the bracketed flag field: matching "Flags.*S" against
      # the whole line also hits the "TS val" option of ordinary packets.
      if [[ $line =~ $flags_re ]]; then
        flags=${BASH_REMATCH[1]}
        if [[ $flags == *S* ]]; then
          echo " SYN packet detected"
        elif [[ $flags == *R* ]]; then
          echo " RST packet detected (connection rejected?)"
        fi
      fi
    done
}
# Follow the TCP three-way handshake between two hosts and report each step.
# Arguments: $1 - client (initiator) IP
#            $2 - server IP
#            $3 - server port
# Outputs:   one line per handshake step; stops after the client's ACK or a
#            reset
trace_connection_establishment() {
  local src_ip="$1"
  local dst_ip="$2"
  local dst_port="$3"
  local line from flags
  # Captures the sender ("ip.port") and the bracketed TCP flag letters
  local packet_re='([^ ]+) > [^ ]+: Flags \[([^]]*)\]'

  echo "Tracing connection establishment: $src_ip -> $dst_ip:$dst_port"
  echo "Monitoring TCP handshake..."
  echo

  # -l keeps tcpdump line-buffered so each step is reported as it happens
  tcpdump -i any -n -l "host $src_ip and host $dst_ip and port $dst_port" |
    while IFS= read -r line; do
      [[ $line =~ $packet_re ]] || continue
      from=${BASH_REMATCH[1]}
      flags=${BASH_REMATCH[2]}

      # tcpdump prints S = SYN, R = RST and "." = ACK (there is no "A"
      # letter), so a bare ACK shows up as "Flags [.]".
      if [[ $flags == *S* ]]; then
        if [[ $from == "$src_ip".* ]]; then
          echo "1. SYN: $src_ip -> $dst_ip:$dst_port"
        elif [[ $from == "$dst_ip".* ]]; then
          echo "2. SYN-ACK: $dst_ip:$dst_port -> $src_ip"
        fi
      elif [[ $flags == *R* ]]; then
        echo "Connection reset detected!"
        break
      elif [[ $flags == *.* ]]; then
        if [[ $from == "$src_ip".* ]]; then
          echo "3. ACK: $src_ip -> $dst_ip:$dst_port (Connection established)"
          break
        fi
      fi
    done
}
# Command-line entry point: the first argument selects the capture mode
# (defaults to "help").
case "${1:-help}" in
  capture)
    (( $# >= 3 )) || {
      echo "Usage: $0 capture <interface> <filter> [output_file]"
      echo "Example: $0 capture eth0 'port 80' /tmp/web_traffic.pcap"
      exit 1
    }
    tcpdump_basic_capture "$2" "$3" "$4"
    ;;
  realtime)
    (( $# >= 3 )) || {
      echo "Usage: $0 realtime <interface> <target_ip>"
      exit 1
    }
    tcpdump_realtime_analysis "$2" "$3"
    ;;
  trace)
    (( $# >= 4 )) || {
      echo "Usage: $0 trace <src_ip> <dst_ip> <dst_port>"
      exit 1
    }
    trace_connection_establishment "$2" "$3" "$4"
    ;;
  help)
    printf '%s\n' \
      "Usage: $0 <command> [options]" \
      "Commands:" \
      " capture - Basic packet capture" \
      " realtime - Real-time packet analysis" \
      " trace - Trace connection establishment"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
2. iptables 日志分析
#!/bin/bash
# iptables_log_analysis.sh
# iptables 日志分析脚本
# Print the iptables (kernel "IN=/OUT=") lines of a syslog file that fall
# inside a time window ending now.
# Arguments: $1 - log file with traditional "Mon DD HH:MM:SS" timestamps
#            $2 - window: <n>h, <n>m or <n>d; anything else means 1 hour.
#                 <n>d starts at 00:00 of that day.
# Outputs:   matching log lines on stdout
# Notes:     Timestamps are compared as month/day/time keys rather than as
#            raw text, so "Feb" sorts after "Jan" and the space-padded day
#            of syslog ("Jan  5") is handled. A window that crosses New Year
#            is supported; lines with another timestamp format are skipped.
_iptables_logs_in_window() {
  local log_file="$1"
  local time_range="$2"
  local amount unit since_format since_key now_key

  since_format='+%m%d%H%M%S'
  case "$time_range" in
    *h) amount=${time_range%h}; unit=hours ;;
    *m) amount=${time_range%m}; unit=minutes ;;
    *d) amount=${time_range%d}; unit=days; since_format='+%m%d000000' ;;
    *) amount=1; unit=hours ;;
  esac
  # A non-numeric amount falls back to the default window of one hour
  case "$amount" in
    '' | *[!0-9]*) amount=1; unit=hours; since_format='+%m%d%H%M%S' ;;
  esac

  since_key=$(date -d "$amount $unit ago" "$since_format") || return 1
  now_key=$(date '+%m%d%H%M%S')

  awk -v since="$since_key" -v now="$now_key" '
    BEGIN {
      n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
      for (i = 1; i <= n; i++) month[names[i]] = i
      since = since ""   # force string comparison of the fixed-width keys
      now = now ""
    }
    # The alternation is grouped: an ungrouped "a|b" would accept every line
    # that merely contains OUT=.
    /kernel:.*(IN=|OUT=)/ {
      if (!($1 in month)) next
      hms = $3
      gsub(/:/, "", hms)
      key = sprintf("%02d%02d%s", month[$1], $2, substr(hms, 1, 6))
      if (since <= now) {
        if (key >= since && key <= now) print
      } else if (key >= since || key <= now) {
        print   # window wraps around New Year
      }
    }' "$log_file"
}

# Summarise iptables log entries: totals, log prefixes, top sources and
# destination ports, protocol mix and per-interface counts.
# Arguments: $1 - log file (default: /var/log/messages)
#            $2 - time window, e.g. 2h, 30m, 1d (default: 1h)
# Outputs:   report on stdout
# Returns:   1 if the log file cannot be read, 0 otherwise
analyze_iptables_logs() {
  local log_file="${1:-/var/log/messages}"
  local time_range="${2:-1h}"
  local iptables_logs total_entries count prefix ip port proto iface

  echo "=== IPTables Log Analysis ==="
  echo "Log file: $log_file"
  echo "Time range: last $time_range"
  echo

  if [ ! -r "$log_file" ]; then
    echo "Error: cannot read log file: $log_file" >&2
    return 1
  fi

  # 1. Basic statistics
  echo "1. Basic statistics:"
  iptables_logs=$(_iptables_logs_in_window "$log_file" "$time_range")
  if [ -z "$iptables_logs" ]; then
    echo " No iptables logs found in the specified time range"
    return
  fi
  total_entries=$(printf '%s\n' "$iptables_logs" | wc -l)
  echo " Total log entries: $total_entries"

  # 2. Entries grouped by log prefix. The prefix is the text between
  #    "kernel:" (plus an optional "[uptime]" stamp) and the first "IN=";
  #    a greedy "kernel:.*:" would run into the colons of the MAC field.
  echo -e "\n2. Log entries by prefix:"
  printf '%s\n' "$iptables_logs" |
    awk '{
      k = index($0, "kernel:")
      if (k == 0) next
      rest = substr($0, k + 7)
      p = index(rest, "IN=")
      if (p == 0) next
      pre = substr(rest, 1, p - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", pre)
      sub(/^\[[^]]*\][ \t]*/, "", pre)
      if (pre == "") pre = "(no prefix)"
      print pre
    }' | sort | uniq -c | sort -nr | head -10 |
    while read -r count prefix; do
      echo " $count entries: $prefix"
    done

  # 3. Most active source addresses
  echo -e "\n3. Top source IPs:"
  printf '%s\n' "$iptables_logs" | grep -o 'SRC=[^ ]*' | cut -d'=' -f2 |
    sort | uniq -c | sort -nr | head -10 |
    while read -r count ip; do
      echo " $count packets from $ip"
    done

  # 4. Most common destination ports
  echo -e "\n4. Top destination ports:"
  printf '%s\n' "$iptables_logs" | grep -o 'DPT=[0-9]*' | cut -d'=' -f2 |
    sort | uniq -c | sort -nr | head -10 |
    while read -r count port; do
      echo " $count packets to port $port"
    done

  # 5. Protocol distribution (unknown protocols are logged as numbers)
  echo -e "\n5. Protocol distribution:"
  printf '%s\n' "$iptables_logs" | grep -o 'PROTO=[A-Za-z0-9]*' |
    cut -d'=' -f2 | sort | uniq -c | sort -nr |
    while read -r count proto; do
      echo " $count packets: $proto"
    done

  # 6. Per-interface counts. The leading non-letter keeps PHYSIN=/PHYSOUT=
  #    out; "[^ ]+" keeps names such as br-lan or eth0.100 intact and skips
  #    the empty "OUT=" of locally delivered packets.
  echo -e "\n6. Interface statistics:"
  echo " Incoming interfaces:"
  printf '%s\n' "$iptables_logs" | grep -oE '(^|[^A-Za-z])IN=[^ ]+' |
    sed 's/^.*IN=//' | sort | uniq -c | sort -nr |
    while read -r count iface; do
      echo " $count packets on $iface"
    done
  echo " Outgoing interfaces:"
  printf '%s\n' "$iptables_logs" | grep -oE '(^|[^A-Za-z])OUT=[^ ]+' |
    sed 's/^.*OUT=//' | sort | uniq -c | sort -nr |
    while read -r count iface; do
      echo " $count packets on $iface"
    done
}
# Follow a log file and print a compact one-line summary of every iptables
# entry as it is written.
# Arguments: $1 - log file (default: /var/log/messages)
#            $2 - extra grep pattern the raw line must match (default: .*)
# Outputs:   "[time] src:sport -> dst:dport (proto) [in->out]" per entry
monitor_iptables_logs() {
  local log_file="${1:-/var/log/messages}"
  local filter="${2:-.*}"

  echo "Monitoring iptables logs in real-time"
  echo "Log file: $log_file"
  echo "Filter: $filter"
  echo "Press Ctrl+C to stop"
  echo

  # The IN=/OUT= alternation is grouped; ungrouped, every line containing
  # "OUT=" passed regardless of the "kernel:" part. All fields are pulled
  # out by a single awk process instead of several greps per line.
  tail -f "$log_file" |
    grep --line-buffered -E 'kernel:.*(IN=|OUT=)' |
    grep --line-buffered -- "$filter" |
    awk '
      # Value of KEY=value; the preceding non-letter keeps PHYSIN= and
      # PHYSOUT= from being taken for IN= / OUT=.
      function field(name,    s) {
        if (match($0, "(^|[^A-Za-z])" name "=[^ ]*")) {
          s = substr($0, RSTART, RLENGTH)
          sub("^.*" name "=", "", s)
          return s
        }
        return ""
      }
      {
        printf "[%s %s %s] %s:%s -> %s:%s (%s) [%s->%s]\n",
          $1, $2, $3,
          field("SRC"), field("SPT"), field("DST"), field("DPT"),
          field("PROTO"), field("IN"), field("OUT")
        fflush()   # keep the output live when it is piped further
      }'
}
# Print the iptables (kernel "IN=/OUT=") lines of a syslog file that fall
# inside a time window ending now.
# Arguments: $1 - log file with traditional "Mon DD HH:MM:SS" timestamps
#            $2 - window: <n>h, <n>m or <n>d; anything else means 1 hour.
#                 <n>d starts at 00:00 of that day.
# Outputs:   matching log lines on stdout
# Notes:     Timestamps are compared as month/day/time keys rather than as
#            raw text, so "Feb" sorts after "Jan" and the space-padded day
#            of syslog ("Jan  5") is handled. A window that crosses New Year
#            is supported; lines with another timestamp format are skipped.
_iptables_logs_in_window() {
  local log_file="$1"
  local time_range="$2"
  local amount unit since_format since_key now_key

  since_format='+%m%d%H%M%S'
  case "$time_range" in
    *h) amount=${time_range%h}; unit=hours ;;
    *m) amount=${time_range%m}; unit=minutes ;;
    *d) amount=${time_range%d}; unit=days; since_format='+%m%d000000' ;;
    *) amount=1; unit=hours ;;
  esac
  # A non-numeric amount falls back to the default window of one hour
  case "$amount" in
    '' | *[!0-9]*) amount=1; unit=hours; since_format='+%m%d%H%M%S' ;;
  esac

  since_key=$(date -d "$amount $unit ago" "$since_format") || return 1
  now_key=$(date '+%m%d%H%M%S')

  awk -v since="$since_key" -v now="$now_key" '
    BEGIN {
      n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
      for (i = 1; i <= n; i++) month[names[i]] = i
      since = since ""   # force string comparison of the fixed-width keys
      now = now ""
    }
    # The alternation is grouped: an ungrouped "a|b" would accept every line
    # that merely contains OUT=.
    /kernel:.*(IN=|OUT=)/ {
      if (!($1 in month)) next
      hms = $3
      gsub(/:/, "", hms)
      key = sprintf("%02d%02d%s", month[$1], $2, substr(hms, 1, 6))
      if (since <= now) {
        if (key >= since && key <= now) print
      } else if (key >= since || key <= now) {
        print   # window wraps around New Year
      }
    }' "$log_file"
}

# Scan iptables log entries for simple attack indicators.
# Arguments: $1 - log file (default: /var/log/messages)
#            $2 - time window, e.g. 2h, 30m, 1d (default: 1h)
# Outputs:   report on stdout. Thresholds: more than 50 packets from one
#            source (scan), more than 10 to a login port (brute force),
#            more than 100 (high volume), more than 5 packets of an
#            uncommon protocol.
detect_security_events() {
  local log_file="${1:-/var/log/messages}"
  local time_range="${2:-1h}"
  local iptables_logs port attacks high_volume_ips unusual_protocols
  local count ip proto

  echo "=== Security Event Detection ==="
  echo

  iptables_logs=$(_iptables_logs_in_window "$log_file" "$time_range")

  # 1. Port scan detection (by packet count per source)
  echo "1. Port scan detection:"
  printf '%s\n' "$iptables_logs" | grep -o 'SRC=[^ ]*' | cut -d'=' -f2 |
    sort | uniq -c |
    awk '$1 > 50 {print " Potential port scan from " $2 " (" $1 " attempts)"}'

  # 2. Brute force detection on common login ports
  echo -e "\n2. Brute force detection:"
  for port in 22 21 23 3389; do
    # Match the whole DPT value; a bare "DPT=22" also matches DPT=2222
    attacks=$(printf '%s\n' "$iptables_logs" |
      grep -E "DPT=${port}( |\$)" | grep -o 'SRC=[^ ]*' | cut -d'=' -f2 |
      sort | uniq -c | awk '$1 > 10')
    if [ -n "$attacks" ]; then
      echo " Port $port attacks:"
      echo "$attacks" | while read -r count ip; do
        echo " $ip: $count attempts"
      done
    fi
  done

  # 3. DDoS detection
  echo -e "\n3. DDoS detection:"
  high_volume_ips=$(printf '%s\n' "$iptables_logs" | grep -o 'SRC=[^ ]*' |
    cut -d'=' -f2 | sort | uniq -c | awk '$1 > 100')
  if [ -n "$high_volume_ips" ]; then
    echo " High volume sources:"
    echo "$high_volume_ips" | while read -r count ip; do
      echo " $ip: $count packets"
    done
  else
    echo " No high volume sources detected"
  fi

  # 4. Unusual protocols. Protocols without a name are logged as numbers
  #    (PROTO=47), so digits must be part of the match.
  echo -e "\n4. Unusual protocol detection:"
  unusual_protocols=$(printf '%s\n' "$iptables_logs" |
    grep -o 'PROTO=[A-Za-z0-9]*' | cut -d'=' -f2 | sort | uniq -c |
    awk '$2 !~ /^(TCP|UDP|ICMP|ICMPv6)$/ && $1 > 5')
  if [ -n "$unusual_protocols" ]; then
    echo " Unusual protocols:"
    echo "$unusual_protocols" | while read -r count proto; do
      echo " $proto: $count packets"
    done
  else
    echo " No unusual protocols detected"
  fi
}
# Command-line entry point: the first argument selects the log operation
# (defaults to "help").
case "${1:-help}" in
  "analyze")
    analyze_iptables_logs "$2" "$3"
    ;;
  "monitor")
    monitor_iptables_logs "$2" "$3"
    ;;
  "security")
    detect_security_events "$2" "$3"
    ;;
  "help")
    echo "Usage: $0 <command> [options]"
    echo "Commands:"
    echo " analyze [log_file] [time_range] - Analyze iptables logs"
    echo " monitor [log_file] [filter] - Monitor logs in real-time"
    echo " security [log_file] [time_range] - Detect security events"
    echo
    echo "Examples:"
    echo " $0 analyze /var/log/messages 2h"
    # The monitor filter is grepped against raw log lines, which carry the
    # port as "DPT=22"; the text "port 22" never occurs in them.
    echo " $0 monitor /var/log/messages 'DPT=22'"
    echo " $0 security /var/log/messages 1d"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
11.2.2 连接跟踪调试
1. conntrack 工具使用
#!/bin/bash
# conntrack_debug.sh
# conntrack 调试脚本
# Report the state of the connection tracking table: fill level, entries
# per protocol and per TCP state, and the main timeout settings.
# Arguments: none
# Outputs:   report on stdout
# Returns:   1 if /proc/net/nf_conntrack is not available, 0 otherwise
conntrack_status() {
  local conntrack_file=/proc/net/nf_conntrack
  local sysctl_dir=/proc/sys/net/netfilter
  local current_connections max_connections usage_percent
  local count proto state spec label value

  echo "=== Connection Tracking Status ==="

  # 1. Table fill level
  echo "1. Connection tracking statistics:"
  if [ ! -f "$conntrack_file" ]; then
    echo " Connection tracking not available"
    return 1
  fi
  current_connections=$(cat "$conntrack_file" 2>/dev/null | wc -l)
  max_connections=$(cat "$sysctl_dir/nf_conntrack_max" 2>/dev/null)
  # An unreadable or zero limit must not end in a division error
  case "$max_connections" in
    '' | *[!0-9]*) max_connections=0 ;;
  esac
  if [ "$max_connections" -gt 0 ]; then
    usage_percent=$((current_connections * 100 / max_connections))
  else
    usage_percent=0
  fi
  echo " Current connections: $current_connections"
  echo " Maximum connections: $max_connections"
  echo " Usage: $usage_percent%"
  if [ "$usage_percent" -gt 80 ]; then
    echo " ⚠️ WARNING: Connection table usage is high!"
  fi

  # 2. Entries per protocol. An nf_conntrack line starts with
  #    "<l3 name> <l3 num> <l4 name> <l4 num> <timeout> ...", so the
  #    protocol (tcp/udp/icmp) is field 3; field 1 is only ipv4/ipv6.
  echo -e "\n2. Connections by protocol:"
  awk '{print $3}' "$conntrack_file" | sort | uniq -c | sort -nr |
    while read -r count proto; do
      echo " $proto: $count connections"
    done

  # 3. Entries per TCP state. The state is a bare word in field 6 of tcp
  #    entries; there is no "state=" token to search for.
  echo -e "\n3. Connections by state:"
  awk '$3 == "tcp" {print $6}' "$conntrack_file" | sort | uniq -c | sort -nr |
    while read -r count state; do
      echo " $state: $count connections"
    done

  # 4. Timeout settings
  echo -e "\n4. Timeout settings:"
  if [ -d "$sysctl_dir" ]; then
    for spec in \
      "TCP established:nf_conntrack_tcp_timeout_established" \
      "TCP close wait:nf_conntrack_tcp_timeout_close_wait" \
      "UDP timeout:nf_conntrack_udp_timeout" \
      "ICMP timeout:nf_conntrack_icmp_timeout"; do
      label=${spec%%:*}
      value=$(cat "$sysctl_dir/${spec#*:}" 2>/dev/null || echo 'N/A')
      echo " $label: $value seconds"
    done
  fi
}
# Poll the conntrack table every two seconds for one connection and print
# its state and remaining timeout. Runs until interrupted.
# Arguments: $1 - source IP
#            $2 - destination IP
#            $3 - destination port
# Outputs:   a timestamped block per poll on stdout
monitor_connection() {
  local src_ip="$1"
  local dst_ip="$2"
  local port="$3"
  local connections conn l4proto timeout state

  echo "=== Monitoring Connection: $src_ip -> $dst_ip:$port ==="
  echo "Press Ctrl+C to stop"
  echo

  while true; do
    # Conntrack prints "src=A dst=B sport=X dport=P ..." and never an
    # "ip:port" pair, so the entry is selected by exact field tokens.
    connections=$(awk -v s="src=$src_ip" -v d="dst=$dst_ip" -v p="dport=$port" '
      {
        hs = 0; hd = 0; hp = 0
        for (i = 1; i <= NF; i++) {
          if ($i == s) hs = 1
          else if ($i == d) hd = 1
          else if ($i == p) hp = 1
        }
        if (hs && hd && hp) print
      }' /proc/net/nf_conntrack 2>/dev/null)

    if [ -n "$connections" ]; then
      echo "[$(date '+%H:%M:%S')] Active connections found:"
      printf '%s\n' "$connections" | while IFS= read -r conn; do
        # Layout: <l3> <l3num> <l4> <l4num> <timeout> [<tcp state>] src=...
        # The timeout is field 5; only tcp entries carry a state (field 6).
        read -r _ _ l4proto _ timeout state _ <<< "$conn"
        [ "$l4proto" = "tcp" ] || state="N/A"
        echo " State: $state, Timeout: $timeout seconds"
        echo " Full entry: $conn"
      done
    else
      echo "[$(date '+%H:%M:%S')] No active connections found"
    fi
    echo
    sleep 2
  done
}
# Remove TIME_WAIT and CLOSE_WAIT entries from the conntrack table and report
# how many entries disappeared. Without the conntrack tool, the two matching
# kernel timeouts are set to 60 seconds instead.
# Arguments: none
# Outputs:   progress and before/after counts on stdout
cleanup_conntrack() {
  echo "=== Connection Tracking Cleanup ==="

  # 1. Table size before the cleanup
  before_count=$(cat /proc/net/nf_conntrack | wc -l)
  echo "Connections before cleanup: $before_count"

  # 2. Drop entries of closed connections
  echo "Cleaning up closed connections..."
  if ! command -v conntrack > /dev/null; then
    echo "conntrack tool not available, using alternative method"
    # Lower the timeouts so the kernel expires these entries sooner
    for tcp_state in time_wait close_wait; do
      echo 60 > "/proc/sys/net/netfilter/nf_conntrack_tcp_timeout_${tcp_state}" 2>/dev/null
    done
  else
    for tcp_state in TIME_WAIT CLOSE_WAIT; do
      state_count=$(conntrack -L | grep -c "$tcp_state")
      if (( state_count > 0 )); then
        echo "Found $state_count $tcp_state connections"
        if ! conntrack -D -p tcp --state "$tcp_state" 2>/dev/null; then
          echo "Failed to clean $tcp_state connections"
        fi
      fi
    done
  fi

  # 3. Table size after the cleanup
  sleep 2
  after_count=$(cat /proc/net/nf_conntrack | wc -l)
  cleaned_count=$((before_count - after_count))
  echo "Connections after cleanup: $after_count"
  echo "Cleaned up: $cleaned_count connections"
}
# Print the unsigned integer stored in a file, or a default when the file is
# missing, unreadable or does not hold a plain number.
# Arguments: $1 - file, $2 - default value
_conntrack_read_uint() {
  local value
  value=$(cat "$1" 2>/dev/null)
  case "$value" in
    '' | *[!0-9]*) value=$2 ;;
  esac
  echo "$value"
}

# Print the number of entries in the conntrack table (0 if unavailable).
_conntrack_entry_count() {
  cat /proc/net/nf_conntrack 2>/dev/null | wc -l
}

# Analyse conntrack performance: slab memory, hash bucket load, the rate of
# table growth over five seconds, and matching tuning advice.
# Arguments: none
# Outputs:   report on stdout
analyze_conntrack_performance() {
  local sysctl_dir=/proc/sys/net/netfilter
  local conntrack_mem buckets current_connections max_connections
  local initial_count final_count usage_percent
  # Initialised up front: the recommendations below read these even when
  # the corresponding measurement could not be taken.
  local avg_per_bucket=0 rate=0

  echo "=== Connection Tracking Performance Analysis ==="

  # 1. Memory usage
  echo "1. Memory usage:"
  if [ -f /proc/slabinfo ]; then
    conntrack_mem=$(grep nf_conntrack /proc/slabinfo 2>/dev/null)
    if [ -n "$conntrack_mem" ]; then
      echo " $conntrack_mem"
    else
      echo " Connection tracking memory info not available"
    fi
  fi

  # 2. Hash table statistics
  echo -e "\n2. Hash table statistics:"
  buckets=$(_conntrack_read_uint "$sysctl_dir/nf_conntrack_buckets" 0)
  if [ "$buckets" -gt 0 ]; then
    current_connections=$(_conntrack_entry_count)
    avg_per_bucket=$((current_connections / buckets))
    echo " Hash buckets: $buckets"
    echo " Current connections: $current_connections"
    echo " Average per bucket: $avg_per_bucket"
    if [ "$avg_per_bucket" -gt 5 ]; then
      echo " ⚠️ WARNING: High collision rate, consider increasing buckets"
    fi
  fi

  # 3. Connection establishment rate (net growth of the table per second)
  echo -e "\n3. Connection establishment rate:"
  initial_count=$(_conntrack_entry_count)
  sleep 5
  final_count=$(_conntrack_entry_count)
  rate=$(((final_count - initial_count) / 5))
  echo " New connections per second: $rate"
  if [ "$rate" -gt 100 ]; then
    echo " ⚠️ WARNING: High connection rate detected"
  fi

  # 4. Recommendations
  echo -e "\n4. Performance recommendations:"
  max_connections=$(_conntrack_read_uint "$sysctl_dir/nf_conntrack_max" 0)
  current_connections=$(_conntrack_entry_count)
  if [ "$max_connections" -gt 0 ]; then
    usage_percent=$((current_connections * 100 / max_connections))
  else
    usage_percent=0
  fi
  if [ "$usage_percent" -gt 80 ]; then
    echo " - Increase nf_conntrack_max value"
    echo " - Consider reducing timeout values"
    echo " - Implement connection limiting"
  fi
  if [ "$avg_per_bucket" -gt 5 ]; then
    echo " - Increase nf_conntrack_buckets value"
  fi
  if [ "$rate" -gt 100 ]; then
    echo " - Monitor for DDoS attacks"
    echo " - Implement rate limiting"
  fi
}
# Command-line entry point: the first argument selects the conntrack
# operation (defaults to "help").
case "${1:-help}" in
  status)
    conntrack_status
    ;;
  monitor)
    (( $# >= 4 )) || {
      echo "Usage: $0 monitor <src_ip> <dst_ip> <port>"
      exit 1
    }
    monitor_connection "$2" "$3" "$4"
    ;;
  cleanup)
    cleanup_conntrack
    ;;
  performance)
    analyze_conntrack_performance
    ;;
  help)
    printf '%s\n' \
      "Usage: $0 <command> [options]" \
      "Commands:" \
      " status - Show connection tracking status" \
      " monitor <src_ip> <dst_ip> <port> - Monitor specific connection" \
      " cleanup - Clean up connection tracking table" \
      " performance - Analyze performance"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
11.3 性能问题排查
11.3.1 规则性能分析
1. 规则匹配效率测试
#!/bin/bash
# rule_performance_test.sh
# 规则性能测试脚本
# Measure the latency cost of the current rule set by comparing the result
# of test_network_performance with the filter table flushed against the
# result with the rules restored.
# WARNING: the host runs with an open filter table while the baseline is
# being measured.
# Arguments: none
# Outputs:   both measurements and the relative difference on stdout
# Returns:   1 if the rules cannot be saved or restored, 0 otherwise
test_rule_performance() {
  local backup baseline_result current_result impact

  echo "=== Rule Performance Test ==="

  # 1. Baseline without rules
  echo "1. Baseline test (no rules):"

  # Save the rules to an unpredictable file; a fixed name in /tmp can be
  # pre-created or symlinked by another user.
  backup=$(mktemp /tmp/current_rules.XXXXXX) || {
    echo "Error: cannot create rule backup file" >&2
    return 1
  }
  if ! iptables-save > "$backup"; then
    echo "Error: iptables-save failed, rules left untouched" >&2
    rm -f -- "$backup"
    return 1
  fi

  # Put the rules back if the test is interrupted while the firewall is
  # open. Note: this replaces INT/TERM traps and resets them to the default
  # afterwards.
  trap "iptables-restore < $(printf '%q' "$backup"); exit 130" INT TERM

  # Flush the filter table and open the policies for the baseline
  iptables -F
  iptables -P INPUT ACCEPT
  iptables -P OUTPUT ACCEPT
  iptables -P FORWARD ACCEPT

  baseline_result=$(test_network_performance)
  echo " Baseline performance: $baseline_result"

  # 2. Restore the rules and measure again
  echo "2. Performance with current rules:"
  if ! iptables-restore < "$backup"; then
    trap - INT TERM
    # Keep the backup: it is the only copy of the rule set
    echo "Error: failed to restore rules, backup kept at $backup" >&2
    return 1
  fi
  trap - INT TERM
  current_result=$(test_network_performance)
  echo " Current performance: $current_result"

  # 3. Relative difference, (baseline - current) / baseline. The values are
  #    latencies, so a negative figure means the rules made things slower.
  #    The multiplication comes before the division to keep precision.
  if [ -n "$baseline_result" ] && [ -n "$current_result" ]; then
    impact=$(awk -v b="$baseline_result" -v c="$current_result" '
      BEGIN {
        if (b + 0 == 0) print "N/A"
        else printf "%.2f", (b - c) * 100 / b
      }' 2>/dev/null)
    echo " Performance impact: ${impact:-N/A}%"
  fi

  rm -f -- "$backup"
}
# Measure network latency with ping.
# Arguments: $1 - host to ping (default: 8.8.8.8)
#            $2 - number of probes (default: 10)
# Outputs:   the "avg" value of ping's min/avg/max summary line on stdout;
#            an empty line if ping produced no summary
test_network_performance() {
  local test_host="${1:-8.8.8.8}"
  local probe_count="${2:-10}"
  local ping_result

  # The summary reads "rtt min/avg/max/mdev = a/b/c/d ms"; split on "/",
  # the average is the fifth piece.
  ping_result=$(ping -c "$probe_count" -q "$test_host" 2>/dev/null |
    grep "avg" | awk -F'/' '{print $5}')
  echo "$ping_result"
}
# Report how many rules each built-in chain holds and how many rules use
# several, or expensive, match modules.
# Arguments: none
# Outputs:   report on stdout
analyze_rule_complexity() {
  local table chains chain count saved_rules
  local complex_rules string_rules regex_rules state_rules total_rules

  echo "=== Rule Complexity Analysis ==="

  # 1. Rule count per chain
  echo "1. Rule count by chain:"
  for table in filter nat mangle; do
    echo " Table: $table"
    case "$table" in
      filter) chains="INPUT OUTPUT FORWARD" ;;
      nat) chains="PREROUTING POSTROUTING OUTPUT" ;;
      mangle) chains="PREROUTING INPUT FORWARD OUTPUT POSTROUTING" ;;
    esac
    for chain in $chains; do
      # -n: no reverse DNS lookups while listing; the first two lines of
      # the listing are headers.
      count=$(iptables -t "$table" -n -L "$chain" --line-numbers 2>/dev/null |
        tail -n +3 | wc -l)
      echo " $chain: $count rules"
    done
  done

  # 2. Complex rules, all computed from one iptables-save snapshot
  echo -e "\n2. Complex rules analysis:"
  saved_rules=$(iptables-save 2>/dev/null)

  # Count standalone "-m" options. A substring search for "-m" also hits
  # "--mark", "--mac-source" or words inside a comment.
  complex_rules=$(printf '%s\n' "$saved_rules" | awk '
    /^-A/ {
      m = 0
      for (i = 1; i <= NF; i++) if ($i == "-m") m++
      if (m >= 3) n++
    }
    END { print n + 0 }')
  echo " Rules with 3+ match conditions: $complex_rules"

  # String matching rules
  string_rules=$(printf '%s\n' "$saved_rules" | grep -c -e '-m string')
  echo " String matching rules: $string_rules"

  # Regular expression rules
  regex_rules=$(printf '%s\n' "$saved_rules" | grep -c -e '-m regexp')
  echo " Regular expression rules: $regex_rules"

  # Stateful rules (state or conntrack match)
  state_rules=$(printf '%s\n' "$saved_rules" |
    grep -c -e '-m state' -e '-m conntrack')
  echo " Stateful rules: $state_rules"

  # 3. Recommendations
  echo -e "\n3. Performance recommendations:"
  total_rules=$(printf '%s\n' "$saved_rules" | grep -c '^-A')
  if [ "$total_rules" -gt 100 ]; then
    echo " - Consider using ipset for large IP lists"
    echo " - Group similar rules together"
    echo " - Review rule necessity"
  fi
  if [ "$string_rules" -gt 5 ]; then
    echo " - Minimize string matching rules"
    echo " - Consider application-level filtering"
  fi
  if [ "$complex_rules" -gt 20 ]; then
    echo " - Simplify complex rules where possible"
    echo " - Split complex rules into multiple simpler ones"
  fi
}
# Reset the rule counters, wait 60 seconds and report which INPUT/OUTPUT
# rules matched traffic and how many rules stayed unused.
# Side effect: zeroes all packet/byte counters (iptables -Z).
# Arguments: none
# Outputs:   report on stdout
rule_matching_stats() {
  local chain unused_in_chain unused_count=0

  echo "=== Rule Matching Statistics ==="

  # 1. Reset the counters
  echo "1. Resetting rule counters..."
  iptables -Z
  echo " Counters reset. Waiting 60 seconds for traffic..."
  sleep 60

  # 2. Matching statistics. Columns of "-L -v --line-numbers" are
  #    num, pkts, bytes, target, ... so the target is field 4 (field 3 is
  #    the byte counter). -x prints exact counters instead of "10K".
  echo "2. Rule matching statistics (last 60 seconds):"
  for chain in INPUT OUTPUT; do
    echo " $chain chain:"
    iptables -L "$chain" -n -v -x --line-numbers | tail -n +3 |
      awk 'NF >= 4 && $2 != "0" {
        print " Rule " $1 ": " $2 " packets -> " $4
      }'
  done

  # 3. Rules that matched nothing
  echo -e "\n3. Unused rules (zero packet count):"
  for chain in INPUT OUTPUT FORWARD; do
    unused_in_chain=$(iptables -L "$chain" -n -v -x --line-numbers |
      tail -n +3 | awk 'NF >= 4 && $2 == "0" { n++ } END { print n + 0 }')
    if [ "$unused_in_chain" -gt 0 ]; then
      echo " $chain chain: $unused_in_chain unused rules"
      unused_count=$((unused_count + unused_in_chain))
    fi
  done
  if [ "$unused_count" -eq 0 ]; then
    echo " All rules have been matched"
  else
    echo " Total unused rules: $unused_count"
    echo " Consider reviewing and removing unused rules"
  fi
}
# Reset the rule counters, collect traffic for five minutes and list the ten
# rules with the highest packet counts across INPUT, OUTPUT and FORWARD.
# Side effect: zeroes all packet/byte counters (iptables -Z).
# Arguments: none
# Outputs:   report on stdout
identify_hotspot_rules() {
  local chain line_num packets bytes target rest

  echo "=== Hotspot Rules Identification ==="

  # Reset the counters
  iptables -Z
  echo "Collecting data for 5 minutes..."
  sleep 300

  echo "Top 10 most active rules:"
  # Gather every chain, prefixed with its name. -x prints exact counters:
  # abbreviated values such as "15K" would sort numerically as 15.
  for chain in INPUT OUTPUT FORWARD; do
    iptables -L "$chain" -n -v -x --line-numbers | tail -n +3 |
      sed "s/^/$chain /"
  done | sort -k3,3nr | head -10 |
    while read -r chain line_num packets bytes target rest; do
      echo " ${chain}[${line_num}]: $packets packets, $bytes bytes -> $target"
    done
}
# Command-line entry point: the first argument selects the rule performance
# check (defaults to "help").
case "${1:-help}" in
  test) test_rule_performance ;;
  complexity) analyze_rule_complexity ;;
  stats) rule_matching_stats ;;
  hotspot) identify_hotspot_rules ;;
  help)
    printf '%s\n' \
      "Usage: $0 <command>" \
      "Commands:" \
      " test - Test rule performance impact" \
      " complexity - Analyze rule complexity" \
      " stats - Show rule matching statistics" \
      " hotspot - Identify most active rules"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
11.3.2 系统资源监控
1. 系统性能监控
#!/bin/bash
# system_performance_monitor.sh
# 系统性能监控脚本
# Sample system and firewall load at a fixed interval and print it as CSV.
# Arguments: $1 - total duration in seconds (default: 60)
#            $2 - sampling interval in seconds (default: 5)
#            $3 - interface whose byte counters are reported (default: eth0)
# Outputs:   CSV on stdout with the columns
#            Time,CPU%,Memory%,ConnTrack,Rules,Network_RX,Network_TX;
#            ConnTrack is the table fill level in percent, or N/A
monitor_system_performance() {
  local duration="${1:-60}"
  local interval="${2:-5}"
  local interface="${3:-eth0}"
  local stats_dir="/sys/class/net/$interface/statistics"
  local end_time timestamp cpu_usage memory_usage conntrack_usage
  local current_conn max_conn rule_count rx_bytes tx_bytes

  echo "=== System Performance Monitor ==="
  echo "Duration: ${duration}s, Interval: ${interval}s"
  echo "Time,CPU%,Memory%,ConnTrack,Rules,Network_RX,Network_TX"

  end_time=$(($(date +%s) + duration))
  while [ "$(date +%s)" -lt "$end_time" ]; do
    timestamp=$(date '+%H:%M:%S')

    # CPU: second field of top's "Cpu(s)" summary line, without a "%..."
    # suffix that some top versions attach to it
    cpu_usage=$(top -bn1 | awk '/Cpu\(s\)/ { sub(/%.*/, "", $2); print $2; exit }')

    # Memory: used / total of the "Mem:" line, computed in awk so that a
    # missing bc does not silently turn the value into 0
    memory_usage=$(free | awk '/^Mem:/ {
      if ($2 > 0) printf "%.1f", $3 * 100 / $2; else printf "0"
      exit }')

    # Connection tracking fill level
    if [ -f /proc/net/nf_conntrack ]; then
      current_conn=$(cat /proc/net/nf_conntrack 2>/dev/null | wc -l)
      max_conn=$(cat /proc/sys/net/netfilter/nf_conntrack_max 2>/dev/null)
      conntrack_usage=$(awk -v c="$current_conn" -v m="$max_conn" 'BEGIN {
        if (m + 0 > 0) printf "%.1f", c * 100 / m; else printf "0" }')
    else
      conntrack_usage="N/A"
    fi

    # Number of iptables rules
    rule_count=$(iptables-save 2>/dev/null | grep -c '^-A')

    # Byte counters of exactly the requested interface. A grep for "eth0"
    # in /proc/net/dev also matches veth0 and yields several lines.
    rx_bytes=$(cat "$stats_dir/rx_bytes" 2>/dev/null)
    tx_bytes=$(cat "$stats_dir/tx_bytes" 2>/dev/null)

    echo "$timestamp,$cpu_usage,$memory_usage,$conntrack_usage,$rule_count,${rx_bytes:-0},${tx_bytes:-0}"
    sleep "$interval"
  done
}
# Succeed when $1 is numerically greater than $2 (decimals allowed).
_float_gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 > b + 0) }'
}

# Report CPU, memory and per-interface resource usage relevant to packet
# filtering, with warnings for high softirq load, high system load and high
# interface error rates.
# Arguments: none
# Outputs:   report on stdout
analyze_resource_usage() {
  local ksoftirqd_cpu load_avg cpu_cores load_per_core memory_info
  local name active_objs num_objs objsize objperslab pagesperslab rest memory_kb
  local stats_dir interface direction packets errors dropped error_rate

  echo "=== Resource Usage Analysis ==="

  # 1. CPU
  echo "1. CPU Analysis:"
  # Summed %CPU of the ksoftirqd threads (network interrupt handling); the
  # bracket in the pattern keeps awk's own command line out of the match
  ksoftirqd_cpu=$(ps aux | awk '/[k]softirqd/ {sum += $3} END {print sum + 0}')
  echo " ksoftirqd CPU usage: ${ksoftirqd_cpu}%"
  # Comparisons are done in awk: with bc they silently never fired on
  # systems without bc installed.
  if _float_gt "$ksoftirqd_cpu" 10; then
    echo " ⚠️ High softirq CPU usage detected"
    echo " This may indicate high network interrupt load"
  fi

  # System load, normalised per core
  load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',')
  cpu_cores=$(nproc)
  load_per_core=$(awk -v l="$load_avg" -v c="$cpu_cores" 'BEGIN {
    if (c + 0 > 0) printf "%.2f", l / c; else printf "0" }')
  echo " Load average: $load_avg (${load_per_core} per core)"
  if _float_gt "$load_per_core" 1; then
    echo " ⚠️ High system load detected"
  fi

  # 2. Memory
  echo -e "\n2. Memory Analysis:"
  memory_info=$(free -h)
  echo "$memory_info" | sed 's/^/ /'

  # Slab caches used by netfilter: active objects * object size
  if [ -f /proc/slabinfo ]; then
    echo " iptables memory usage:"
    grep -E "ip_tables|xt_|nf_" /proc/slabinfo 2>/dev/null |
      while read -r name active_objs num_objs objsize objperslab pagesperslab rest; do
        if [ "$active_objs" != "0" ]; then
          memory_kb=$((active_objs * objsize / 1024))
          echo " $name: $active_objs objects, ${memory_kb}KB"
        fi
      done
  fi

  # 3. Network interfaces. The directory is globbed and only the interface
  #    named exactly "lo" is skipped; "ls | grep -v lo" also dropped names
  #    that merely contain "lo", such as wlo1.
  echo -e "\n3. Network Interface Analysis:"
  for stats_dir in /sys/class/net/*/statistics; do
    [ -f "$stats_dir/rx_packets" ] || continue
    interface=${stats_dir%/statistics}
    interface=${interface##*/}
    [ "$interface" = "lo" ] && continue

    echo " $interface:"
    for direction in rx tx; do
      packets=$(cat "$stats_dir/${direction}_packets")
      errors=$(cat "$stats_dir/${direction}_errors")
      dropped=$(cat "$stats_dir/${direction}_dropped")
      if [ "$direction" = "rx" ]; then
        echo " RX: $packets packets, $errors errors, $dropped dropped"
      else
        echo " TX: $packets packets, $errors errors, $dropped dropped"
      fi
    done

    # Error rates in percent; warn above 0.1%
    for direction in RX TX; do
      if [ "$direction" = "RX" ]; then
        packets=$(cat "$stats_dir/rx_packets")
        errors=$(cat "$stats_dir/rx_errors")
      else
        packets=$(cat "$stats_dir/tx_packets")
        errors=$(cat "$stats_dir/tx_errors")
      fi
      if [ "$packets" -gt 0 ]; then
        error_rate=$(awk -v e="$errors" -v p="$packets" 'BEGIN { printf "%.4f", e * 100 / p }')
        if _float_gt "$error_rate" 0.1; then
          echo " ⚠️ High $direction error rate: ${error_rate}%"
        fi
      fi
    done
  done
}
# Succeed when $1 is numerically greater than $2 (decimals allowed).
_float_gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 > b + 0) }'
}

# Check conntrack fill level and growth, rule count and complexity, CPU load
# and memory usage against fixed thresholds and summarise the findings.
# Arguments: none
# Outputs:   report on stdout
detect_performance_bottlenecks() {
  local issues_found=0
  local current_conn max_conn usage_percent initial_count final_count conn_rate
  local saved_rules total_rules complex_rules
  local load_avg cpu_cores load_per_core memory_usage

  echo "=== Performance Bottleneck Detection ==="

  # 1. Connection tracking
  echo "1. Connection Tracking Bottlenecks:"
  if [ -f /proc/net/nf_conntrack ]; then
    current_conn=$(cat /proc/net/nf_conntrack 2>/dev/null | wc -l)
    max_conn=$(cat /proc/sys/net/netfilter/nf_conntrack_max 2>/dev/null)
    # An unreadable or zero limit must not end in a division error
    case "$max_conn" in
      '' | *[!0-9]*) max_conn=0 ;;
    esac
    if [ "$max_conn" -gt 0 ]; then
      usage_percent=$((current_conn * 100 / max_conn))
    else
      usage_percent=0
    fi
    echo " Connection table usage: $usage_percent%"
    if [ "$usage_percent" -gt 80 ]; then
      echo " ❌ Connection table near capacity"
      echo " Recommendation: Increase nf_conntrack_max or reduce timeouts"
      issues_found=$((issues_found + 1))
    elif [ "$usage_percent" -gt 60 ]; then
      echo " ⚠️ Connection table usage is high"
      issues_found=$((issues_found + 1))
    else
      echo " ✓ Connection table usage is normal"
    fi

    # Net growth of the table over five seconds
    initial_count=$(cat /proc/net/nf_conntrack 2>/dev/null | wc -l)
    sleep 5
    final_count=$(cat /proc/net/nf_conntrack 2>/dev/null | wc -l)
    conn_rate=$(((final_count - initial_count) / 5))
    echo " Connection establishment rate: $conn_rate/sec"
    if [ "$conn_rate" -gt 1000 ]; then
      echo " ❌ Very high connection rate detected"
      echo " Recommendation: Implement rate limiting or check for attacks"
      issues_found=$((issues_found + 1))
    elif [ "$conn_rate" -gt 500 ]; then
      echo " ⚠️ High connection rate detected"
      issues_found=$((issues_found + 1))
    fi
  fi

  # 2. Rule processing, from a single iptables-save snapshot
  echo -e "\n2. Rule Processing Bottlenecks:"
  saved_rules=$(iptables-save 2>/dev/null)
  total_rules=$(printf '%s\n' "$saved_rules" | grep -c '^-A')
  echo " Total rules: $total_rules"
  if [ "$total_rules" -gt 1000 ]; then
    echo " ❌ Very high rule count"
    echo " Recommendation: Optimize rules, use ipset, or implement rule grouping"
    issues_found=$((issues_found + 1))
  elif [ "$total_rules" -gt 500 ]; then
    echo " ⚠️ High rule count"
    issues_found=$((issues_found + 1))
  else
    echo " ✓ Rule count is reasonable"
  fi

  # Rules with three or more standalone "-m" options. A substring search
  # for "-m" also hits "--mark", "--mac-source" or words inside a comment.
  complex_rules=$(printf '%s\n' "$saved_rules" | awk '
    /^-A/ {
      m = 0
      for (i = 1; i <= NF; i++) if ($i == "-m") m++
      if (m >= 3) n++
    }
    END { print n + 0 }')
  echo " Complex rules (3+ conditions): $complex_rules"
  if [ "$complex_rules" -gt 50 ]; then
    echo " ❌ Too many complex rules"
    echo " Recommendation: Simplify rules or split into multiple simpler rules"
    issues_found=$((issues_found + 1))
  elif [ "$complex_rules" -gt 20 ]; then
    echo " ⚠️ Many complex rules detected"
    issues_found=$((issues_found + 1))
  fi

  # 3. System resources. Decimal comparisons are done in awk: with bc they
  #    silently never fired on systems without bc installed.
  echo -e "\n3. System Resource Bottlenecks:"
  # CPU load per core
  load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',')
  cpu_cores=$(nproc)
  load_per_core=$(awk -v l="$load_avg" -v c="$cpu_cores" 'BEGIN {
    if (c + 0 > 0) printf "%.2f", l / c; else printf "0" }')
  echo " CPU load per core: $load_per_core"
  if _float_gt "$load_per_core" 2; then
    echo " ❌ Very high CPU load"
    issues_found=$((issues_found + 1))
  elif _float_gt "$load_per_core" 1; then
    echo " ⚠️ High CPU load"
    issues_found=$((issues_found + 1))
  else
    echo " ✓ CPU load is normal"
  fi

  # Memory usage in percent
  memory_usage=$(free | awk '/^Mem:/ {
    if ($2 > 0) printf "%.1f", $3 * 100 / $2; else printf "0"
    exit }')
  echo " Memory usage: ${memory_usage}%"
  if _float_gt "$memory_usage" 90; then
    echo " ❌ Very high memory usage"
    issues_found=$((issues_found + 1))
  elif _float_gt "$memory_usage" 80; then
    echo " ⚠️ High memory usage"
    issues_found=$((issues_found + 1))
  else
    echo " ✓ Memory usage is normal"
  fi

  # 4. Summary
  echo -e "\n4. Summary:"
  if [ "$issues_found" -eq 0 ]; then
    echo " ✓ No performance bottlenecks detected"
  else
    echo " Found $issues_found potential performance issues"
    echo " Review the recommendations above for optimization"
  fi
}
# Command-line entry point: the first argument selects the monitoring mode
# (defaults to "help").
case "${1:-help}" in
  monitor) monitor_system_performance "$2" "$3" ;;
  analyze) analyze_resource_usage ;;
  bottleneck) detect_performance_bottlenecks ;;
  help)
    printf '%s\n' \
      "Usage: $0 <command> [options]" \
      "Commands:" \
      " monitor [duration] [interval] - Monitor system performance" \
      " analyze - Analyze resource usage" \
      " bottleneck - Detect performance bottlenecks" \
      "" \
      "Examples:" \
      " $0 monitor 300 10 - Monitor for 5 minutes, 10s interval" \
      " $0 analyze - Analyze current resource usage" \
      " $0 bottleneck - Check for performance issues"
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac