强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

AWK & SED 生产力教程 / 第 8 章:数据提取

第 8 章:数据提取

数据提取是将"信息海洋"转化为"可用数据"的关键步骤。

8.1 日志解析基础

多种日志格式

# Apache Combined Log Format
# 192.168.1.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326 "http://www.example.com" "Mozilla/4.08"

# Nginx 默认格式(类似 Apache)
# 192.168.1.1 - - [15/Jan/2024:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 1234

# Syslog 格式
# Jan 15 10:23:45 hostname sshd[12345]: Accepted password for user from 192.168.1.1

# JSON 日志
# {"ts":"2024-01-15T10:23:45Z","level":"error","msg":"Something failed","code":500}

通用解析框架

# 通用日志解析器
cat > parse_log.awk << 'EOF'
# Summarise an access log (Apache/Nginx combined-style, space-separated):
# totals, error rate, transfer volume, status-code histogram, top 5 IPs.
BEGIN {
    total = 0
    errors = 0
    total_bytes = 0
}

{
    total++

    # Combined-log field layout:
    #   $1 client IP, $4-$5 [timestamp tz], $6 "METHOD, $7 path,
    #   $9 status code, $10 response bytes.
    client_ip = $1
    timestamp = $4 " " $5
    method = $6
    path = $7
    status = $9
    bytes = $10

    # Aggregate counters.
    status_count[status]++
    ip_count[client_ip]++
    path_count[path]++

    # bytes may be "-" for body-less responses; only add real numbers.
    if (bytes ~ /^[0-9]+$/) total_bytes += bytes
    # 4xx and 5xx responses count as errors.
    if (status ~ /^[45]/) errors++
}

END {
    # Guard: an empty input would divide by zero below.
    if (total == 0) {
        print "no log lines parsed"
        exit 1
    }

    printf "========== 日志分析报告 ==========\n\n"
    printf "总请求数: %d\n", total
    printf "错误请求数: %d (%.1f%%)\n", errors, (errors/total)*100
    printf "总传输量: %.2f MB\n\n", total_bytes/1048576

    printf "--- 状态码分布 ---\n"
    for (s in status_count)
        printf "  %s: %d\n", s, status_count[s]

    printf "\n--- Top 5 IP ---\n"
    # Pipe the per-IP counts through sort/head.  The original only called
    # close() on this command without ever writing to it, so the list was
    # printed unsorted and unlimited.
    sort_cmd = "sort -rn | head -5"
    for (ip in ip_count)
        printf "%6d  %s\n", ip_count[ip], ip | sort_cmd
    close(sort_cmd)
}
EOF

$ awk -f parse_log.awk access.log

8.2 指标采集

系统指标采集脚本

#!/bin/bash
# collect_metrics.sh — sample system metrics once a minute and emit CSV rows.

# CSV header, written once before the sampling loop starts.
printf '%s\n' "timestamp,cpu_usage_pct,mem_usage_pct,disk_usage_pct,load_1m,load_5m,load_15m"

while :; do
    stamp=$(date '+%Y-%m-%d %H:%M:%S')

    # CPU busy % = 100 - idle (procps top's "%Cpu" line; field 8 assumed idle).
    cpu_pct=$(top -bn1 | awk '/^%Cpu/ {printf "%.1f", 100-$8}')

    # Memory usage %: used ($3) over total ($2) from free.
    mem_pct=$(free | awk '/^Mem:/ {printf "%.1f", $3/$2*100}')

    # Root filesystem usage % (strip the trailing '%').
    disk_pct=$(df / | awk 'NR==2 {gsub(/%/,""); print $5}')

    # 1/5/15-minute load averages, comma-joined.
    load_avg=$(awk '{printf "%s,%s,%s", $1, $2, $3}' /proc/loadavg)

    printf '%s\n' "$stamp,$cpu_pct,$mem_pct,$disk_pct,$load_avg"
    sleep 60
done

从输出中提取指标

# 从 free 命令提取内存指标
$ free -m | awk '
/Mem:/ {
    total = $2
    used = $3
    free = $4
    available = $7
    printf "内存使用率: %.1f%%\n", used/total*100
    printf "可用内存: %d MB\n", available
}
/Swap:/ {
    total = $2
    used = $3
    printf "Swap 使用率: %.1f%%\n", (total>0 ? used/total*100 : 0)
}'

# 从 vmstat 提取 CPU 指标
$ vmstat 1 5 | awk 'NR>2 {
    printf "CPU 空闲: %s%%, 等待IO: %s%%, 用户态: %s%%\n", $15, $16, $13
}'

# Extract disk metrics from iostat.
# NOTE(review): iostat -x column positions differ between sysstat versions
# ($2/$3 may be r/s & w/s or rrqm/s & wrqm/s, and $10 is r_await only on
# some releases); `NR>10` is a rough heuristic for skipping the first
# (since-boot) report — confirm against the local iostat output.
$ iostat -x 1 2 | awk '/^[sv]d/ && NR>10 {
    printf "磁盘: %-8s 读IOPS: %6s 写IOPS: %6s 读延迟: %sms\n", $1, $2, $3, $10
}'

🏢 场景:实时监控面板

#!/bin/bash
# monitor.sh — live terminal resource dashboard.
#
# The whole dashboard is a single watch(1) invocation: watch re-runs the
# quoted script every 2 seconds.  Inside the single-quoted argument, the awk
# programs are double-quoted, so every awk field reference is written as \$N
# to keep the shell that watch spawns from expanding it too early; the
# backslashes are consumed when that inner shell parses the string.
# Each section draws a 20-cell bar (█ filled, ░ empty), one cell per 5%.
# NOTE(review): field positions assume procps 'top', GNU 'free -m' and
# GNU 'df -h' output formats — confirm on the target system.

watch -n 2 '
echo "=== 系统资源监控 ==="
echo ""

# CPU
echo "--- CPU ---"
top -bn1 | awk "/^%Cpu/ {
    usage = 100 - \$8
    bar = \"\"
    filled = int(usage / 5)
    for (i=0; i<filled; i++) bar = bar \"█\"
    for (i=filled; i<20; i++) bar = bar \"░\"
    printf \"[%s] %.1f%%\n\", bar, usage
}"

# 内存
echo "--- 内存 ---"
free -m | awk "/^Mem:/ {
    pct = \$3/\$2 * 100
    bar = \"\"
    filled = int(pct / 5)
    for (i=0; i<filled; i++) bar = bar \"█\"
    for (i=filled; i<20; i++) bar = bar \"░\"
    printf \"[%s] %.1f%% (%d/%d MB)\n\", bar, pct, \$3, \$2
}"

# 磁盘
echo "--- 磁盘 ---"
df -h / | awk "NR==2 {
    gsub(/%/, \"\", \$5)
    pct = \$5+0
    bar = \"\"
    filled = int(pct / 5)
    for (i=0; i<filled; i++) bar = bar \"█\"
    for (i=filled; i<20; i++) bar = bar \"░\"
    printf \"[%s] %s%% (%s/%s)\n\", bar, \$5, \$3, \$2
}"
'

8.3 结构化数据提取

从命令输出中提取

# 从 ps 命令提取进程信息
$ ps aux | awk 'NR>1 {
    user=$1; pid=$2; cpu=$3; mem=$4; cmd=$11
    if (cpu+0 > 50) printf "⚠️  高CPU: PID=%s CPU=%s%% CMD=%s\n", pid, cpu, cmd
}'

# 从 netstat 提取连接统计
$ netstat -an | awk '
    /ESTABLISHED/ { established++ }
    /TIME_WAIT/   { timewait++ }
    /LISTEN/      { listen++ }
    /CLOSE_WAIT/  { closewait++ }
    END {
        printf "ESTABLISHED: %d\n", established
        printf "TIME_WAIT:   %d\n", timewait
        printf "LISTEN:      %d\n", listen
        printf "CLOSE_WAIT:  %d\n", closewait
    }'

# 从 docker ps 提取容器信息
$ docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | awk -F'\t' '{
    name = $1; status = $2; ports = $3
    printf "%-20s %-20s %s\n", name, status, ports
}'

🏢 场景:服务健康检查

#!/bin/bash
# health_check.sh — probe a list of TCP services and print their status.

#######################################
# Probe one service by attempting a TCP connection.
# Arguments:
#   $1 - display name
#   $2 - host
#   $3 - port
#   $4 - expected state (currently unused; kept for interface compatibility)
# Outputs:
#   one formatted status line on stdout
#######################################
check_service() {
    local name=$1
    local host=$2
    local port=$3
    local expected=$4   # TODO: compare the probed state against this value
    local status        # was a global leak in the original version

    # nc -z: connect-scan only, -w3: 3-second timeout.
    if nc -z -w3 "$host" "$port" 2>/dev/null; then
        status="✅ UP"
    else
        status="🔴 DOWN"
    fi

    printf "%-20s %-10s %s:%s\n" "$name" "$status" "$host" "$port"
}

echo "=== 服务健康检查 ==="
check_service "Nginx" "localhost" "80" "UP"
check_service "MySQL" "localhost" "3306" "UP"
check_service "Redis" "localhost" "6379" "UP"
check_service "App" "localhost" "8080" "UP"

8.4 多格式数据提取

提取并转换格式

# 从 /etc/passwd 提取并输出为 CSV
$ awk -F: '{printf "%s,%s,%s,%s\n", $1, $3, $5, $7}' /etc/passwd

# 从 /etc/fstab 提取并格式化
$ awk '!/^#/ && !/^$/ && NF>=3 {
    printf "%-25s %-20s %-10s %s\n", $1, $2, $3, $4
}' /etc/fstab

# Extract mount info (device, mount point, filesystem type).
# A Linux mount line looks like "/dev/sda1 on / type ext4 (rw,...)"; split
# on space or '(' so $1=device, $3=mount point, $5=fs type.  The original
# printed $2, which is always the literal word "on".
$ mount | awk -F'[ (]' '{
    printf "%-25s %-15s %s\n", $1, $3, $5
}'

网页内容提取

# 提取 HTML 中的链接
$ curl -s https://example.com | grep -oP 'href="\K[^"]+'

# 提取 HTML 中的标题
$ curl -s https://example.com | sed -n 's/<title>\(.*\)<\/title>/\1/p'

# 提取 HTML 表格数据
$ curl -s https://example.com/table | awk '
    /<tr>/ { row = "" }
    /<td>/ {
        gsub(/<[^>]*>/, "")
        row = row (row ? "," : "") $0
    }
    /<\/tr>/ { print row }
'

⚠️ 注意:网页内容提取建议使用专门的工具如 pup、xq 或 Python 的 BeautifulSoup。

8.5 高级提取技巧

条件提取

# 提取两行之间的内容
$ awk '/^BEGIN$/,/^END$/{ if (!/^BEGIN$/ && !/^END$/) print }' file

# 提取第一个匹配之后的内容
$ awk 'found; /pattern/ { found=1 }' file

# 提取段落(空行分隔)
$ awk 'BEGIN{RS=""} /keyword/' file

带上下文的提取

# 类似 grep -A -B -C 的效果
$ awk '/error/ {
    for (i=NR-2; i<=NR+2; i++) {
        if (i > 0 && i in lines)
            printf "%d: %s\n", i, lines[i]
    }
    print "---"
}
{ lines[NR] = $0 }' logfile

# 更好的方式(使用环形缓冲区)
$ awk '
/error/ {
    for (i in context) print context[i]
    print NR": "$0
    after = 2
    next
}
after > 0 { print NR": "$0; after-- }
{ context[NR] = NR": "$0; delete context[NR-3] }
' logfile

8.6 报告生成基础

文本报告

#!/bin/bash
# generate_report.sh — collect system information into a dated text report.

# One report per day, named after today's date.
report_path="report_$(date +%Y%m%d).txt"
readonly report_path

# All section output is grouped and redirected into the report in one pass.
{
    printf '%s\n' "=========================================="
    printf '%s\n' "  系统状态报告"
    printf '%s\n' "  生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s\n' "  主机名: $(hostname)"
    printf '%s\n' "=========================================="
    printf '\n'

    printf '%s\n' "=== 系统信息 ==="
    uname -a
    printf '\n'

    printf '%s\n' "=== CPU 信息 ==="
    # Keep only the architecture / core-count / model / threading lines.
    lscpu | grep -E "^(Architecture|CPU\(s\)|Model name|Thread)"
    printf '\n'

    printf '%s\n' "=== 内存使用 ==="
    free -h
    printf '\n'

    printf '%s\n' "=== 磁盘使用 ==="
    # Header row plus filesystems backed by an absolute device path.
    df -h | awk 'NR==1 || /^[\/]/'
    printf '\n'

    printf '%s\n' "=== 网络连接 ==="
    netstat -an | awk '
        /ESTABLISHED/ { established++ }
        /LISTEN/ { listen++ }
        END {
            printf "活跃连接: %d\n监听端口: %d\n", established, listen
        }'

    printf '\n'
    printf '%s\n' "=== 最近登录 ==="
    last -5 2>/dev/null || echo "无法获取"

} > "$report_path"

echo "报告已生成: $report_path"

带表格的报告

# 生成格式化的磁盘使用报告
$ df -h | awk '
BEGIN {
    print "┌─────────────────────┬────────┬────────┬────────┬──────┬────────────┐"
    printf "│ %-19s │ %6s │ %6s │ %6s │ %4s │ %-10s │\n", "文件系统", "总计", "已用", "可用", "使用", "挂载点"
    print "├─────────────────────┼────────┼────────┼────────┼──────┼────────────┤"
}
NR>1 && $1 ~ /^\// {
    printf "│ %-19s │ %6s │ %6s │ %6s │ %4s │ %-10s │\n", $1, $2, $3, $4, $5, $6
}
END {
    print "└─────────────────────┴────────┴────────┴────────┴──────┴────────────┘"
}'

8.7 实战:数据管道

🏢 场景一:API 响应分析

# 假设 API 响应日志格式
cat > api.log << 'EOF'
2024-01-15T10:00:00 GET /api/users 200 45ms
2024-01-15T10:00:01 POST /api/login 401 120ms
2024-01-15T10:00:02 GET /api/users/123 200 32ms
2024-01-15T10:00:03 PUT /api/users/123 200 67ms
2024-01-15T10:00:04 DELETE /api/users/456 404 15ms
2024-01-15T10:00:05 GET /api/products 200 89ms
2024-01-15T10:00:06 POST /api/orders 201 234ms
2024-01-15T10:00:07 GET /api/users 500 5002ms
EOF

# Analyse API response times.
# Input lines look like: "<timestamp> <method> <path> <status> <NNNms>".
$ awk '{
    gsub(/ms/, "", $5)
    method = $2
    path = $3
    status = $4
    time = $5 + 0
    
    # Per-endpoint request count and cumulative latency.
    api[method " " path]++
    api_time[method " " path] += time
    
    # Slow-request detection (threshold: 500 ms).
    if (time > 500) slow++
    
    # Status-code histogram.
    status_count[status]++
}
END {
    # NOTE(review): the header rows use widths 6+1+20 = 27 chars while the
    # data rows use %-25s for the combined "method path" key, so the numeric
    # columns drift by two characters — align the widths if it matters.
    printf "%-6s %-20s %6s %8s %8s\n", "方法", "路径", "次数", "平均(ms)", "总耗时"
    printf "%-6s %-20s %6s %8s %8s\n", "------", "--------------------", "------", "--------", "--------"
    for (a in api)
        printf "%-25s %6d %8.0f %8d\n", a, api[a], api_time[a]/api[a], api_time[a]
    
    print ""
    if (slow > 0) printf "⚠️  慢请求 (>500ms): %d 个\n", slow
    
    print "\n--- 状态码分布 ---"
    for (s in status_count)
        printf "  %s: %d\n", s, status_count[s]
}' api.log

🏢 场景二:资源使用报告

# 生成 HTML 资源报告
cat > generate_html_report.awk << 'EOF'
# Render resource usage as an HTML table.
# Input lines: "<resource> <total> <used> <usage_pct>".
BEGIN {
    print "<!DOCTYPE html>"
    print "<html><head><style>"
    print "table { border-collapse: collapse; width: 80%; margin: 20px auto; }"
    print "th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }"
    print "th { background-color: #4CAF50; color: white; }"
    print "tr:nth-child(even) { background-color: #f2f2f2; }"
    print ".warning { color: orange; }"
    print ".critical { color: red; font-weight: bold; }"
    print "</style></head><body>"
    print "<h1 style='text-align:center'>系统资源报告</h1>"
    # strftime() is a gawk extension; read the timestamp from date(1)
    # instead so the script also runs under mawk / POSIX awk.
    date_cmd = "date '+%Y-%m-%d %H:%M:%S'"
    date_cmd | getline now
    close(date_cmd)
    print "<p style='text-align:center'>生成时间: " now "</p>"
    print "<table>"
    print "<tr><th>资源</th><th>总量</th><th>已用</th><th>使用率</th><th>状态</th></tr>"
}
{
    resource = $1
    total = $2
    used = $3
    pct = $4 + 0
    
    # Colour thresholds: >=90% critical, >=70% warning, otherwise normal.
    if (pct >= 90) cls = "critical"
    else if (pct >= 70) cls = "warning"
    else cls = ""
    
    printf "<tr><td>%s</td><td>%s</td><td>%s</td><td class='%s'>%.1f%%</td><td class='%s'>%s</td></tr>\n",
        resource, total, used, cls, pct, cls, (pct >= 90 ? "🔴" : pct >= 70 ? "⚠️" : "✅")
}
END {
    print "</table>"
    print "</body></html>"
}
EOF

8.8 提取技巧速查

# 常用提取模式
# 提取第 N 列
awk '{print $N}' file

# 提取最后一个字段
awk '{print $NF}' file

# 提取匹配行的特定字段
awk '/pattern/ {print $2}' file

# 提取两个标记之间的内容
sed -n '/START/,/END/p' file | sed '1d;$d'

# 提取数字
grep -oE '[0-9]+' file

# 提取引号中的内容
grep -oP '"\K[^"]+' file

# 提取括号中的内容
grep -oP '\(\K[^)]+' file

# 提取并计算
awk '{sum+=$3} END{print sum}' file

# 提取并排序
awk '{print $2}' file | sort | uniq -c | sort -rn

# 提取并格式化
awk '{printf "%-20s %10d\n", $1, $2}' file

扩展阅读


下一章:第 9 章:管道组合 — 数据清洗与多工具协作。