AWK & SED 生产力教程 / 第 8 章:数据提取
第 8 章:数据提取
数据提取是将"信息海洋"转化为"可用数据"的关键步骤。
8.1 日志解析基础
多种日志格式
# Apache Combined Log Format
# 192.168.1.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326 "http://www.example.com" "Mozilla/4.08"
# Nginx 默认格式(类似 Apache)
# 192.168.1.1 - - [15/Jan/2024:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 1234
# Syslog 格式
# Jan 15 10:23:45 hostname sshd[12345]: Accepted password for user from 192.168.1.1
# JSON 日志
# {"ts":"2024-01-15T10:23:45Z","level":"error","msg":"Something failed","code":500}
通用解析框架
# Generic access-log analyzer.
# Writes parse_log.awk, which reads blank-separated access-log lines
# (Apache/Nginx combined layout) and prints: total requests, 4xx/5xx count,
# bytes transferred, a status-code histogram and the five busiest client IPs.
cat > parse_log.awk << 'EOF'
BEGIN {
    total = 0
    errors = 0
    total_bytes = 0
}
{
    total++
    # Positional fields of one log line (default blank splitting).
    client_ip = $1
    timestamp = $4 " " $5     # still wrapped in [ ]
    method    = $6            # still carries the opening double quote
    path      = $7
    status    = $9
    bytes     = $10
    # Tallies
    status_count[status]++
    ip_count[client_ip]++
    path_count[path]++
    # The size column is "-" when no body was sent; only add real numbers.
    if (bytes ~ /^[0-9]+$/) total_bytes += bytes
    if (status ~ /^[45]/) errors++
}
END {
    # An empty log would otherwise abort with a division by zero.
    error_pct = (total > 0) ? (errors / total) * 100 : 0
    printf "========== 日志分析报告 ==========\n\n"
    printf "总请求数: %d\n", total
    printf "错误请求数: %d (%.1f%%)\n", errors, error_pct
    printf "总传输量: %.2f MB\n\n", total_bytes/1048576
    printf "--- 状态码分布 ---\n"
    for (s in status_count)
        printf " %s: %d\n", s, status_count[s]
    printf "\n--- Top 5 IP ---\n"
    # Flush awk's own buffered output so the heading is written before
    # anything the sort/head pipeline prints on the shared stdout.
    fflush()
    # Rank by hit count: the rows must be piped through sort and head;
    # close() then waits for the pipeline to finish.
    top_cmd = "sort -rn | head -5"
    for (ip in ip_count)
        printf "%6d %s\n", ip_count[ip], ip | top_cmd
    close(top_cmd)
}
EOF
# Run the parse_log.awk analyzer with access.log as its input file.
$ awk -f parse_log.awk access.log
8.2 指标采集
系统指标采集脚本
#!/bin/bash
# collect_metrics.sh — sample system metrics once a minute and emit CSV rows.
# Runs until interrupted; the header row is printed once before the loop.
printf '%s\n' "timestamp,cpu_usage_pct,mem_usage_pct,disk_usage_pct,load_1m,load_5m,load_15m"
while :; do
  now=$(date '+%Y-%m-%d %H:%M:%S')
  # CPU usage: 100 minus field 8 of top's %Cpu line
  # (presumably the idle percentage — verify against your top version).
  cpu_pct=$(top -bn1 | awk '/^%Cpu/ {printf "%.1f", 100-$8}')
  # Memory usage: column 3 over column 2 of the Mem: row, as a percentage.
  mem_pct=$(free | awk '/^Mem:/ {printf "%.1f", $3/$2*100}')
  # Disk usage of the root filesystem, with the % sign stripped.
  disk_pct=$(df / | awk 'NR==2 {gsub(/%/,""); print $5}')
  # First three fields of /proc/loadavg, already comma-joined for the CSV row.
  load_avg=$(awk '{printf "%s,%s,%s", $1, $2, $3}' /proc/loadavg)
  printf '%s,%s,%s,%s,%s\n' "$now" "$cpu_pct" "$mem_pct" "$disk_pct" "$load_avg"
  sleep 60
done
从输出中提取指标
# Summarize memory and swap usage from `free -m`.
$ free -m | awk '
  /Mem:/ {
    # $2 = total, $3 = used, $7 = available
    printf "内存使用率: %.1f%%\n", $3 / $2 * 100
    printf "可用内存: %d MB\n", $7
  }
  /Swap:/ {
    # Report 0 when no swap is configured instead of dividing by zero.
    swap_pct = 0
    if ($2 > 0) swap_pct = $3 / $2 * 100
    printf "Swap 使用率: %.1f%%\n", swap_pct
  }'
# Extract CPU metrics from vmstat (5 samples, 1 second apart).
# NR>2 skips the first two lines of output (presumably vmstat's header lines).
# The format string labels $15 as idle, $16 as IO wait and $13 as user time.
# NOTE(review): column positions depend on the vmstat version — confirm locally.
$ vmstat 1 5 | awk 'NR>2 {
printf "CPU 空闲: %s%%, 等待IO: %s%%, 用户态: %s%%\n", $15, $16, $13
}'
# Extract per-disk metrics from iostat extended output (2 reports, 1 second apart).
# Only lines starting with "sd" or "vd" after line 10 are printed; the NR>10 cut-off
# presumably skips the first report — TODO confirm, it depends on the device count.
# NOTE(review): the format string labels $2/$3 as read/write IOPS and $10 as read
# latency; these column positions vary between sysstat versions — verify.
$ iostat -x 1 2 | awk '/^[sv]d/ && NR>10 {
printf "磁盘: %-8s 读IOPS: %6s 写IOPS: %6s 读延迟: %sms\n", $1, $2, $3, $10
}'
🏢 场景:实时监控面板
#!/bin/bash
# monitor.sh — live terminal dashboard for CPU, memory and root-disk usage.
# watch re-runs the single-quoted script below every 2 seconds (-n 2).
# Inside that script each awk program is double-quoted, which is why every
# awk field reference and string quote is backslash-escaped (\$8, \"...\").
# Each section builds a 20-cell bar: int(pct / 5) filled cells, the rest empty.
watch -n 2 '
echo "=== 系统资源监控 ==="
echo ""
# CPU
echo "--- CPU ---"
top -bn1 | awk "/^%Cpu/ {
usage = 100 - \$8
bar = \"\"
filled = int(usage / 5)
for (i=0; i<filled; i++) bar = bar \"█\"
for (i=filled; i<20; i++) bar = bar \"░\"
printf \"[%s] %.1f%%\n\", bar, usage
}"
# 内存
echo "--- 内存 ---"
free -m | awk "/^Mem:/ {
pct = \$3/\$2 * 100
bar = \"\"
filled = int(pct / 5)
for (i=0; i<filled; i++) bar = bar \"█\"
for (i=filled; i<20; i++) bar = bar \"░\"
printf \"[%s] %.1f%% (%d/%d MB)\n\", bar, pct, \$3, \$2
}"
# 磁盘
echo "--- 磁盘 ---"
df -h / | awk "NR==2 {
gsub(/%/, \"\", \$5)
pct = \$5+0
bar = \"\"
filled = int(pct / 5)
for (i=0; i<filled; i++) bar = bar \"█\"
for (i=filled; i<20; i++) bar = bar \"░\"
printf \"[%s] %s%% (%s/%s)\n\", bar, \$5, \$3, \$2
}"
'
8.3 结构化数据提取
从命令输出中提取
# Flag processes from `ps aux` whose CPU column exceeds 50.
$ ps aux | awk 'NR > 1 && $3 + 0 > 50 {
  # $2 = pid, $3 = cpu, $11 = cmd; NR > 1 skips the header row.
  printf "⚠️ 高CPU: PID=%s CPU=%s%% CMD=%s\n", $2, $3, $11
}'
# Count `netstat -an` lines per connection state.
$ netstat -an | awk '
  {
    # Each test looks at the whole line, so one line may bump several counters.
    if (/ESTABLISHED/) n_established++
    if (/TIME_WAIT/)   n_time_wait++
    if (/LISTEN/)      n_listen++
    if (/CLOSE_WAIT/)  n_close_wait++
  }
  END {
    # %d prints 0 for a counter that was never incremented.
    printf "ESTABLISHED: %d\nTIME_WAIT: %d\nLISTEN: %d\nCLOSE_WAIT: %d\n",
           n_established, n_time_wait, n_listen, n_close_wait
  }'
# Print container name, status and ports from `docker ps` as aligned columns.
# The --format template emits tab-separated fields, which awk splits with -F'\t'.
$ docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" |
    awk -F'\t' '{ printf "%-20s %-20s %s\n", $1, $2, $3 }'
🏢 场景:服务健康检查
#!/bin/bash
# health_check.sh — 检查多个服务状态
#######################################
# Probe a TCP port and print one aligned status line.
# Arguments:
#   $1 - display name of the service
#   $2 - host to probe
#   $3 - TCP port to probe
#   $4 - expected state; accepted so existing call sites keep working,
#        but it is not consulted by the check
# Outputs:
#   "<name> <status> <host>:<port>" on stdout
#######################################
check_service() {
  local name=$1
  local host=$2
  local port=$3
  # Keep the result local so a caller's own $status is not overwritten.
  local status
  # nc -z only tests whether the port accepts a connection; -w3 caps the wait at 3 s.
  if nc -z -w3 "$host" "$port" 2>/dev/null; then
    status="✅ UP"
  else
    status="🔴 DOWN"
  fi
  printf "%-20s %-10s %s:%s\n" "$name" "$status" "$host" "$port"
}
# Probe every service of interest on localhost; each entry is "name:port".
echo "=== 服务健康检查 ==="
for svc in "Nginx:80" "MySQL:3306" "Redis:6379" "App:8080"; do
  check_service "${svc%%:*}" "localhost" "${svc##*:}" "UP"
done
8.4 多格式数据提取
提取并转换格式
# Export fields 1, 3, 5 and 7 of /etc/passwd (login, UID, GECOS, shell) as CSV.
# Text fields are double-quoted, with embedded quotes doubled, because the
# GECOS field commonly contains commas that would otherwise shift the columns.
$ awk -F: '
  function csv(s) { gsub(/"/, "\"\"", s); return "\"" s "\"" }
  { printf "%s,%s,%s,%s\n", csv($1), $3, csv($5), csv($7) }' /etc/passwd
# Print the first four columns of every usable /etc/fstab entry, aligned.
$ awk '
  /^#/ || /^$/ || NF < 3 { next }   # skip comments, empty lines and short rows
  { printf "%-25s %-20s %-10s %s\n", $1, $2, $3, $4 }' /etc/fstab
# Show device, filesystem type and mount point for every mounted filesystem.
# Assumes the Linux `mount` line layout "<dev> on <dir> type <fstype> (<opts>)":
# splitting on blank or "(" gives $1=dev, $2="on", $3=dir, $4="type", $5=fstype.
# The third column therefore has to be $5; $2 is always the literal word "on".
$ mount | awk -F'[ (]' '{
  printf "%-25s %-15s %s\n", $1, $5, $3
}'
网页内容提取
# Extract link targets from HTML: -o prints only the matched text, and \K drops
# the leading href=" from the reported match (requires grep -P).
$ curl -s https://example.com | grep -oP 'href="\K[^"]+'
# Extract the page title; sed works line by line, so this only matches when
# <title> and </title> are on the same line.
$ curl -s https://example.com | sed -n 's/<title>\(.*\)<\/title>/\1/p'
# Convert HTML table rows to CSV lines.
# Assumes <tr>, each <td>...</td> cell and </tr> sit on separate lines.
$ curl -s https://example.com/table | awk '
  /<tr>/ { row = ""; cells = 0 }
  /<td>/ {
    gsub(/<[^>]*>/, "")     # strip every tag, leaving only the cell text
    # Decide on the comma by cell count, not by whether row is non-empty,
    # so an empty first cell still gets its separator.
    row = row (cells++ ? "," : "") $0
  }
  /<\/tr>/ { print row }
'
⚠️ 注意:网页内容提取建议使用专门的工具,如 pup、xq 或 Python 的 BeautifulSoup。
8.5 高级提取技巧
条件提取
# Print the lines between a BEGIN line and an END line; the inner test drops
# the two marker lines themselves.
$ awk '/^BEGIN$/,/^END$/{ if (!/^BEGIN$/ && !/^END$/) print }' file
# Print everything after the first matching line. The match itself is not
# printed because the `found` test runs before the flag is set.
$ awk 'found; /pattern/ { found=1 }' file
# Paragraph mode: RS="" makes blank-line-separated blocks the records, so
# every paragraph containing the keyword is printed whole.
$ awk 'BEGIN{RS=""} /keyword/' file
带上下文的提取
# Context extraction similar to grep -C 2: print each matching line together
# with the two lines before and after it, followed by a "---" separator.
# The whole file is buffered and the windows are printed in END, because the
# lines that follow a match have not been read yet when the match is seen.
$ awk '
  { lines[NR] = $0 }
  /error/ { hits[++nhits] = NR }
  END {
    for (h = 1; h <= nhits; h++) {
      for (i = hits[h] - 2; i <= hits[h] + 2; i++) {
        # "i in lines" also rejects positions before line 1 or past the last line.
        if (i in lines)
          printf "%d: %s\n", i, lines[i]
      }
      print "---"
    }
  }' logfile
# Streaming alternative: keep only the last three non-matching lines in a
# small buffer, and print two lines of trailing context after each match.
$ awk '
  /error/ {
    # Walk the buffer by line number: "for (i in context)" has no defined
    # order. Delete what was printed so it cannot reappear at a later match.
    for (i = NR - 3; i < NR; i++) {
      if (i in context) {
        print context[i]
        delete context[i]
      }
    }
    print NR": "$0
    after = 2
    next
  }
  # Trailing context: already printed here, so do not buffer it again.
  after > 0 { print NR": "$0; after--; next }
  { context[NR] = NR": "$0; delete context[NR-3] }
' logfile
8.6 报告生成基础
文本报告
#!/bin/bash
# generate_report.sh — write a dated plain-text system status report.
REPORT_FILE="report_$(date +%Y%m%d).txt"

# Print a blank separator line followed by a section title.
section() {
  printf '\n=== %s ===\n' "$1"
}

# Emit the whole report on stdout; the call site redirects it into the file.
write_report() {
  local rule="=========================================="
  printf '%s\n' "$rule"
  printf '%s\n' " 系统状态报告"
  printf ' 生成时间: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')"
  printf ' 主机名: %s\n' "$(hostname)"
  printf '%s\n' "$rule"

  section "系统信息"
  uname -a

  section "CPU 信息"
  lscpu | grep -E "^(Architecture|CPU\(s\)|Model name|Thread)"

  section "内存使用"
  free -h

  section "磁盘使用"
  # Keep the header row plus filesystems whose device path starts with "/".
  df -h | awk 'NR==1 || /^[\/]/'

  section "网络连接"
  netstat -an | awk '
    /ESTABLISHED/ { established++ }
    /LISTEN/ { listen++ }
    END {
      printf "活跃连接: %d\n监听端口: %d\n", established, listen
    }'

  section "最近登录"
  # Fall back to a placeholder line when `last` fails.
  last -5 2>/dev/null || echo "无法获取"
}

write_report > "$REPORT_FILE"
echo "报告已生成: $REPORT_FILE"
带表格的报告
# Render `df -h` rows whose device path starts with "/" as a box-drawn table.
$ df -h | awk '
  BEGIN {
    # One shared row layout for the header and every data row.
    row_fmt = "│ %-19s │ %6s │ %6s │ %6s │ %4s │ %-10s │\n"
    print "┌─────────────────────┬────────┬────────┬────────┬──────┬────────────┐"
    printf row_fmt, "文件系统", "总计", "已用", "可用", "使用", "挂载点"
    print "├─────────────────────┼────────┼────────┼────────┼──────┼────────────┤"
  }
  NR > 1 && $1 ~ /^\// { printf row_fmt, $1, $2, $3, $4, $5, $6 }
  END {
    print "└─────────────────────┴────────┴────────┴────────┴──────┴────────────┘"
  }'
8.7 实战:数据管道
🏢 场景一:API 响应分析
# Sample API response log used by the analysis that follows.
# Columns: ISO timestamp, HTTP method, path, status code, latency with an "ms" suffix.
cat > api.log << 'EOF'
2024-01-15T10:00:00 GET /api/users 200 45ms
2024-01-15T10:00:01 POST /api/login 401 120ms
2024-01-15T10:00:02 GET /api/users/123 200 32ms
2024-01-15T10:00:03 PUT /api/users/123 200 67ms
2024-01-15T10:00:04 DELETE /api/users/456 404 15ms
2024-01-15T10:00:05 GET /api/products 200 89ms
2024-01-15T10:00:06 POST /api/orders 201 234ms
2024-01-15T10:00:07 GET /api/users 500 5002ms
EOF
# Analyze API response times: per-endpoint call count, mean and total latency,
# the number of slow (>500 ms) requests, and a status-code histogram.
$ awk '{
  gsub(/ms/, "", $5)          # "45ms" -> "45"
  method = $2
  path = $3
  status = $4
  time = $5 + 0
  # Per-endpoint totals, keyed by "METHOD path"
  key = method " " path
  api[key]++
  api_time[key] += time
  # Slow-request detection
  if (time > 500) slow++
  # Status-code histogram
  status_count[status]++
}
END {
  printf "%-6s %-20s %6s %8s %8s\n", "方法", "路径", "次数", "平均(ms)", "总耗时"
  printf "%-6s %-20s %6s %8s %8s\n", "------", "--------------------", "------", "--------", "--------"
  for (a in api) {
    # Split the key back into method and path so data rows use the same
    # column widths as the header; a single %-25s left them two columns short.
    split(a, part, " ")
    printf "%-6s %-20s %6d %8.0f %8d\n", part[1], part[2], api[a], api_time[a]/api[a], api_time[a]
  }
  print ""
  if (slow > 0) printf "⚠️ 慢请求 (>500ms): %d 个\n", slow
  print "\n--- 状态码分布 ---"
  for (s in status_count)
    printf " %s: %d\n", s, status_count[s]
}' api.log
🏢 场景二:资源使用报告
# Write generate_html_report.awk, which turns "resource total used pct" input
# rows into a standalone HTML page with one table row per input line.
cat > generate_html_report.awk << 'EOF'
# Page head, inline stylesheet, title, generation time and table header.
BEGIN {
    print "<!DOCTYPE html>"
    print "<html><head><style>"
    print "table { border-collapse: collapse; width: 80%; margin: 20px auto; }"
    print "th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }"
    print "th { background-color: #4CAF50; color: white; }"
    print "tr:nth-child(even) { background-color: #f2f2f2; }"
    print ".warning { color: orange; }"
    print ".critical { color: red; font-weight: bold; }"
    print "</style></head><body>"
    print "<h1 style='text-align:center'>系统资源报告</h1>"
    print "<p style='text-align:center'>生成时间: " strftime("%Y-%m-%d %H:%M:%S") "</p>"
    print "<table>"
    print "<tr><th>资源</th><th>总量</th><th>已用</th><th>使用率</th><th>状态</th></tr>"
}
# One table row per input line; the CSS class and icon follow the usage level.
{
    pct = $4 + 0
    if (pct >= 90) {
        cls = "critical"; icon = "🔴"
    } else if (pct >= 70) {
        cls = "warning"; icon = "⚠️"
    } else {
        cls = ""; icon = "✅"
    }
    printf "<tr><td>%s</td><td>%s</td><td>%s</td>", $1, $2, $3
    printf "<td class='%s'>%.1f%%</td><td class='%s'>%s</td></tr>\n", cls, pct, cls, icon
}
# Close the table and the document.
END {
    print "</table>"
    print "</body></html>"
}
EOF
8.8 提取技巧速查
# Common extraction patterns
# Print column N (N is a placeholder: write a literal number, e.g. $3)
awk '{print $N}' file
# Print the last field of every line
awk '{print $NF}' file
# Print a specific field of matching lines only
awk '/pattern/ {print $2}' file
# Print the text between two markers; the second sed drops the first and
# last line of the selected range (the marker lines)
sed -n '/START/,/END/p' file | sed '1d;$d'
# Extract runs of digits, one match per output line
grep -oE '[0-9]+' file
# Extract text following a double quote, up to the next double quote
grep -oP '"\K[^"]+' file
# Extract text inside parentheses
grep -oP '\(\K[^)]+' file
# Extract and compute: sum of column 3
awk '{sum+=$3} END{print sum}' file
# Extract and rank: values of column 2 ordered by how often they occur
awk '{print $2}' file | sort | uniq -c | sort -rn
# Extract and format: left-aligned text column, right-aligned integer column
awk '{printf "%-20s %10d\n", $1, $2}' file
扩展阅读
- GNU AWK — Reading Files
- jq cookbook
- ripgrep — 更快的 grep 替代
下一章:第 9 章:管道组合 — 数据清洗与多工具协作。