AWK & SED 生产力教程 / 第 10 章:系统管理
第 10 章:系统管理
作为系统管理员,你无法手动管理上百台服务器。自动化脚本是你的超能力,AWK 和 SED 是你的武器。
10.1 配置文件管理
常见配置文件格式
# INI 格式
[database]
host = localhost
port = 5432
# Key=Value 格式
DATABASE_HOST=localhost
DATABASE_PORT=5432
# YAML 格式
database:
  host: localhost
  port: 5432
# Space 分隔
server_name localhost
listen 80
安全修改配置文件
# Edit a key=value style configuration file.
# Assumes the file contains a line such as: DB_HOST=localhost
# -i.bak edits in place and keeps the previous content in config.env.bak.
$ sed -i.bak 's/^DB_HOST=.*/DB_HOST=192.168.1.100/' /etc/app/config.env
# Edit, then verify. grep runs only when sed succeeded, and the anchored
# pattern shows the edited key instead of every line that mentions DB_HOST.
$ sed -i.bak 's/^DB_HOST=.*/DB_HOST=192.168.1.100/' /etc/app/config.env \
    && grep '^DB_HOST=' /etc/app/config.env
# Update the key when it exists, append it when it does not.
# Step 1 adds an empty key only if it is missing; step 2 sets the value.
# This avoids the `A && B || C` chain, where a failing sed (B) would also
# run the append (C) and leave a duplicate key behind.
$ grep -q '^DB_HOST=' config.env || echo 'DB_HOST=' >> config.env
$ sed -i 's/^DB_HOST=.*/DB_HOST=new_value/' config.env
🏢 场景一:批量修改 SSH 配置
#!/bin/bash
# harden_ssh.sh — apply a baseline of hardening options to sshd_config.
#
# For every option in `settings` the script rewrites the active line,
# enables a commented-out line, or appends a new line. The edited file is
# checked with `sshd -t`; when the check fails the backup is restored and
# sshd is NOT reloaded, so a broken file never goes live.
# NOTE(review): options appended at the end of the file land inside a
# trailing `Match` block if the file has one — confirm for your config.
SSHD_CONFIG="/etc/ssh/sshd_config"
# Timestamp down to the second so a second run on the same day does not
# overwrite the pristine backup with an already-modified file.
BACKUP="${SSHD_CONFIG}.bak.$(date +%Y%m%d-%H%M%S)"

# Back up first; without a backup there is nothing to roll back to.
if ! cp -p "$SSHD_CONFIG" "$BACKUP"; then
  echo "❌ 备份失败: ${BACKUP}" >&2
  exit 1
fi

# Desired option values.
declare -A settings=(
  ["PermitRootLogin"]="no"
  ["PasswordAuthentication"]="no"
  ["X11Forwarding"]="no"
  ["MaxAuthTries"]="3"
  ["ClientAliveInterval"]="300"
  ["ClientAliveCountMax"]="2"
  ["Protocol"]="2"
)

for key in "${!settings[@]}"; do
  value="${settings[$key]}"
  # The keyword must be followed by whitespace, "=" or end of line, so an
  # option whose name merely starts with ${key} is left untouched.
  active_re="^[[:space:]]*${key}([[:space:]=].*)?\$"
  commented_re="^[[:space:]]*#[[:space:]]*${key}([[:space:]=].*)?\$"

  if grep -Eq "$active_re" "$SSHD_CONFIG"; then
    # Already active: rewrite the value.
    sed -i -E "s/${active_re}/${key} ${value}/" "$SSHD_CONFIG"
    echo "修改: ${key} = ${value}"
  elif grep -Eq "$commented_re" "$SSHD_CONFIG"; then
    # Commented out: enable the first occurrence only (GNU sed 0,/re/).
    sed -i -E "0,/${commented_re}/s//${key} ${value}/" "$SSHD_CONFIG"
    echo "启用: ${key} = ${value}"
  else
    # Not present at all: append.
    echo "${key} ${value}" >> "$SSHD_CONFIG"
    echo "新增: ${key} = ${value}"
  fi
done

# Validate before reloading; roll back and stop on any error.
if sshd -t -f "$SSHD_CONFIG"; then
  echo "✅ 配置语法正确"
else
  echo "❌ 配置语法错误"
  cp -p "$BACKUP" "$SSHD_CONFIG"
  echo "已从备份恢复: ${BACKUP}" >&2
  exit 1
fi

# Reload only a configuration that passed validation.
if systemctl reload sshd; then
  echo "配置已重载"
else
  echo "❌ 重载失败" >&2
  exit 1
fi
🏢 场景二:Nginx 虚拟主机配置
#!/bin/bash
# add_vhost.sh — create and enable an Nginx virtual host for one domain.
# Usage: add_vhost.sh <domain>
#
# Writes /etc/nginx/sites-available/<domain>, links it into sites-enabled,
# creates /var/www/<domain>, then tests and reloads Nginx. When the test or
# the reload fails the site is disabled again and the script exits non-zero.
DOMAIN=${1:-}
if [ -z "$DOMAIN" ]; then
  echo "用法: $0 <域名>"
  exit 1
fi

# The value ends up in file paths and inside the Nginx config, so accept
# only plain host names (letters, digits, dots, hyphens).
domain_re='^[A-Za-z0-9]([A-Za-z0-9.-]*[A-Za-z0-9])?$'
if [[ ! "$DOMAIN" =~ $domain_re ]]; then
  echo "无效的域名: ${DOMAIN}" >&2
  exit 1
fi

CONFIG_FILE="/etc/nginx/sites-available/${DOMAIN}"
ENABLED_LINK="/etc/nginx/sites-enabled/${DOMAIN}"

# Unquoted heredoc: ${DOMAIN} expands now, \$uri stays literal for Nginx.
if ! cat > "$CONFIG_FILE" << EOF
server {
    listen 80;
    server_name ${DOMAIN} www.${DOMAIN};
    root /var/www/${DOMAIN};
    index index.html index.htm;
    access_log /var/log/nginx/${DOMAIN}.access.log;
    error_log /var/log/nginx/${DOMAIN}.error.log;
    location / {
        try_files \$uri \$uri/ =404;
    }
    location ~ /\.ht {
        deny all;
    }
}
EOF
then
  echo "❌ 无法写入 ${CONFIG_FILE}" >&2
  exit 1
fi

# Enable the site.
ln -sf "$CONFIG_FILE" "$ENABLED_LINK"
# Create the document root.
mkdir -p "/var/www/${DOMAIN}"

# Report success only when both the config test and the reload succeed;
# otherwise remove the link so the broken site cannot block later reloads.
if nginx -t && systemctl reload nginx; then
  echo "✅ 虚拟主机 ${DOMAIN} 已创建"
else
  rm -f -- "$ENABLED_LINK"
  echo "❌ Nginx 配置测试或重载失败,已禁用站点 ${DOMAIN}" >&2
  exit 1
fi
10.2 批量操作
批量用户管理
#!/bin/bash
# batch_users.sh — create user accounts in bulk.
# Input file format, one account per line: username:password:group
# Existing accounts are skipped, so their passwords are never reset.
cat > users.txt << 'EOF'
alice:Pass123:developers
bob:Pass456:developers
carol:Pass789:managers
EOF
# The list holds clear-text passwords: restrict it to the owner.
chmod 600 users.txt

# `|| [[ -n "$username" ]]` also processes a last line without a newline.
while IFS=: read -r username password group || [[ -n "$username" ]]; do
  # Skip blank lines and comment lines.
  [[ -z "$username" || "$username" == \#* ]] && continue

  if [[ -z "$password" || -z "$group" ]]; then
    echo "跳过格式错误的行: ${username}" >&2
    continue
  fi

  # Never touch an account that already exists.
  if id -u "$username" > /dev/null 2>&1; then
    echo "用户已存在,跳过: ${username}" >&2
    continue
  fi

  # Create the group; -f makes this a no-op when it already exists.
  groupadd -f "$group"

  # Create the user; set the password only for an account created here.
  if ! useradd -m -g "$group" -s /bin/bash "$username"; then
    echo "创建用户失败: ${username}" >&2
    continue
  fi

  # chpasswd reads "user:password" pairs from stdin.
  printf '%s:%s\n' "$username" "$password" | chpasswd
  echo "创建用户: ${username} (组: ${group})"
done < users.txt
批量修改文件
# Replace a string in every file that contains it.
# grep -Z / xargs -0 pass NUL-terminated names, so paths with spaces stay
# intact; xargs -r does not run sed at all when nothing matched;
# grep -F treats the search text as a literal string.
$ find /etc/nginx/sites-available/ -type f \
    -exec grep -lZF 'old-domain.com' {} + \
    | xargs -0 -r sed -i.bak 's/old-domain\.com/new-domain.com/g'
# Set permissions in bulk: 644 for files, 755 for directories.
$ find /var/www/ -type f -exec chmod 644 {} +
$ find /var/www/ -type d -exec chmod 755 {} +
# Rename *.txt to *.md in the current directory.
# The names are handed to a small sh loop as arguments, so nothing is parsed
# from `ls` and no generated text is piped into a shell. ${f%.txt} strips
# only the trailing extension, so the source name is never rewritten.
$ find . -maxdepth 1 -type f -name '*.txt' ! -name '.*' \
    -exec sh -c 'for f; do mv -- "$f" "${f%.txt}.md"; done' sh {} +
# Delete log files last modified more than 30 days ago.
$ find /var/log/ -type f -name "*.log" -mtime +30 -exec rm -f -- {} +
🏢 场景:服务器配置同步
#!/bin/bash
# sync_config.sh — push one configuration file to several servers.
#
# Per server: back up the remote file, copy the new one, restart the app.
# A server is skipped as soon as one step fails, so the service is never
# restarted on a missing or half-copied file. Exits 1 if any server failed.
SERVERS="server1 server2 server3 server4"
CONFIG_FILE="/etc/app/config.yml"

# Split the space-separated list once; iterate over the array afterwards.
read -r -a server_list <<< "$SERVERS"
failed=0

for server in "${server_list[@]}"; do
  echo "同步到 ${server}..."

  # Back up the remote file; the path is quoted for the remote shell.
  if ! ssh "$server" "cp '${CONFIG_FILE}' '${CONFIG_FILE}.bak'"; then
    echo "❌ ${server} 备份失败,跳过" >&2
    failed=1
    continue
  fi

  # Transfer the new configuration.
  if ! scp "$CONFIG_FILE" "${server}:${CONFIG_FILE}"; then
    echo "❌ ${server} 传输失败,跳过" >&2
    failed=1
    continue
  fi

  # Restart the service and report the outcome locally.
  if ssh "$server" "systemctl restart app"; then
    echo "✅ ${server} 完成"
  else
    echo "❌ ${server} 失败"
    failed=1
  fi
done

exit "$failed"
10.3 系统监控脚本
磁盘空间监控
#!/bin/bash
# disk_monitor.sh — warn about file systems at or above a usage threshold.
#
# Environment overrides (all optional):
#   THRESHOLD  usage percentage that triggers a warning (default 80)
#   ALERT_LOG  file the warnings are appended to
THRESHOLD="${THRESHOLD:-80}"
# Placeholder address; not used below, kept for a mail hook in the loop.
ALERT_EMAIL="admin@example.com"
ALERT_LOG="${ALERT_LOG:-/var/log/disk_alert.log}"

# find_full_filesystems LIMIT
# Reads `df` output on stdin and prints one warning line for every file
# system whose Use% (column 5) is >= LIMIT. The header line is skipped.
find_full_filesystems() {
  awk -v limit="$1" 'NR > 1 {
    pct = $5
    gsub(/%/, "", pct)
    if (pct + 0 >= limit + 0) {
      printf "⚠️ 警告: %s 使用率 %s%% (挂载点: %s)\n", $1, pct, $6
    }
  }'
}

# -P keeps each file system on one line even when the device name is long.
df -hP | find_full_filesystems "$THRESHOLD" | while IFS= read -r line; do
  echo "$line"
  # A mail notification could be sent here; for now the line is logged.
  echo "$line" >> "$ALERT_LOG"
done
内存监控
#!/bin/bash
# mem_monitor.sh — report memory usage and classify it.
# Exit status: 1 when usage is above 90%, otherwise 0.

# mem_report
# Reads `free -m` output on stdin, prints usage figures taken from the
# "Mem:" line, and returns 1 when usage exceeds 90%.
mem_report() {
  awk '/^Mem:/ {
    total = $2
    used = $3
    available = $7
    # Guard the division: skip a line that reports no memory at all.
    if (total + 0 <= 0) next
    pct = used / total * 100
    printf "内存使用率: %.1f%%\n", pct
    printf "已用: %d MB / 总计: %d MB\n", used, total
    printf "可用: %d MB\n", available
    if (pct > 90) {
      print "🔴 严重: 内存使用率超过 90%!"
      exit 1
    } else if (pct > 80) {
      print "⚠️ 警告: 内存使用率超过 80%"
      exit 0
    } else {
      print "✅ 正常"
      exit 0
    }
  }'
}

# LC_ALL=C forces the untranslated "Mem:" label the awk pattern expects.
LC_ALL=C free -m | mem_report
进程监控
#!/bin/bash
# process_monitor.sh — report whether key services have running processes,
# then list processes using more than 50% CPU.
PROCESSES="nginx mysql redis-server sshd"

# Split the space-separated list into an array once, then walk it.
read -r -a watched <<< "$PROCESSES"
for proc in "${watched[@]}"; do
  # pgrep -c prints how many processes match the name pattern.
  count=$(pgrep -c "$proc" 2>/dev/null)
  if [ "$count" -gt 0 ]; then
    printf '%s\n' "✅ ${proc}: 运行中 (${count} 个进程)"
  else
    printf '%s\n' "🔴 ${proc}: 未运行!"
    # An automatic restart could be hooked in here, for example:
    # systemctl restart "$proc"
  fi
done

# High-CPU processes: skip the ps header, keep rows whose %CPU exceeds 50.
printf '\n'
printf '%s\n' "=== 高 CPU 进程 (>50%) ==="
ps aux | awk '
  NR == 1 { next }
  $3 > 50 { printf "%-10s %5s%% %5s%% %s\n", $1, $3, $4, $11 }
'
综合系统健康检查
#!/bin/bash
# health_check.sh — one-shot CPU / memory / disk / load report.
# Each section is a small filter that reads tool output on stdin, so the
# parsing can be exercised with canned input.

# cpu_report
# Reads `top -bn1` output on stdin and prints CPU usage as 100 - idle.
# The idle value is located by its "id" label instead of a fixed field
# number: top prints "ni,100.0 id" without a space when idle is 100, which
# shifts the fields and would otherwise report a busy CPU as fully idle's
# opposite (100% usage).
cpu_report() {
  awk '/^%?Cpu/ {
    found = 0
    n = split($0, part, ",")
    for (i = 1; i <= n; i++) {
      if (part[i] ~ /id[ \t]*$/) {
        idle = part[i]
        sub(/^[^0-9]*/, "", idle)
        idle += 0
        found = 1
      }
    }
    if (!found) next
    usage = 100 - idle
    if (usage > 90) level = "🔴 严重"
    else if (usage > 70) level = "⚠️ 警告"
    else level = "✅ 正常"
    printf "使用率: %.1f%% %s\n", usage, level
  }'
}

# mem_report
# Reads `free -m` output on stdin and prints used/total from the Mem: line.
mem_report() {
  awk '/^Mem:/ && $2 + 0 > 0 {
    pct = $3 / $2 * 100
    if (pct > 90) level = "🔴 严重"
    else if (pct > 70) level = "⚠️ 警告"
    else level = "✅ 正常"
    printf "使用率: %.1f%% (%d/%d MB) %s\n", pct, $3, $2, level
  }'
}

# disk_report
# Reads `df -hP` output on stdin and prints one line per device-backed
# file system (first column starts with "/").
disk_report() {
  awk 'NR > 1 && $1 ~ /^\// {
    pct = $5
    gsub(/%/, "", pct)
    pct += 0
    if (pct > 90) level = "🔴 严重"
    else if (pct > 70) level = "⚠️ 警告"
    else level = "✅ 正常"
    printf "%-20s %3d%% (%s/%s) %s\n", $6, pct, $3, $2, level
  }'
}

# load_report CPUS
# Reads a /proc/loadavg line on stdin and rates the 1-minute load per core.
# A CPUS value below 1 is treated as 1 so the division is always defined.
load_report() {
  awk -v cpus="$1" '{
    cpus += 0
    if (cpus < 1) cpus = 1
    ratio = $1 / cpus
    if (ratio > 2) level = "🔴 严重"
    else if (ratio > 1) level = "⚠️ 警告"
    else level = "✅ 正常"
    printf "负载: %s %s (CPU核心: %d)\n", $0, level, cpus
  }'
}

echo "========================================"
echo " 系统健康检查报告"
echo " 时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo " 主机: $(hostname)"
echo "========================================"

# CPU — LC_ALL=C keeps the decimal point and the labels untranslated.
echo ""
echo "--- CPU 使用率 ---"
LC_ALL=C top -bn1 | cpu_report

# Memory — LC_ALL=C keeps the "Mem:" label untranslated.
echo ""
echo "--- 内存使用 ---"
LC_ALL=C free -m | mem_report

# Disk — -P keeps every file system on a single line.
echo ""
echo "--- 磁盘使用 ---"
df -hP | disk_report

# Load
echo ""
echo "--- 系统负载 ---"
load_report "$(nproc 2>/dev/null || echo 1)" < /proc/loadavg
10.4 日志轮转与清理
#!/bin/bash
# log_cleanup.sh — compress ageing logs and delete expired ones.
LOG_DIR="/var/log/app"
KEEP_DAYS=30
COMPRESS_DAYS=7

# cleanup_logs DIR KEEP_DAYS COMPRESS_DAYS
# Deletes *.log and *.log.gz files under DIR last modified more than
# KEEP_DAYS days ago, gzips remaining *.log files older than COMPRESS_DAYS
# days, then prints the current sizes. Returns 1 when DIR does not exist.
# NOTE(review): gzip replaces the file; a log that a running service still
# has open should be rotated by that service first — confirm per app.
cleanup_logs() {
  local dir=$1 keep_days=$2 compress_days=$3

  if [[ ! -d "$dir" ]]; then
    echo "日志目录不存在: ${dir}" >&2
    return 1
  fi

  # Expire plain AND compressed logs; matching only *.log would let the
  # *.log.gz files produced below pile up forever.
  find "$dir" -type f \( -name '*.log' -o -name '*.log.gz' \) \
    -mtime +"$keep_days" -delete
  echo "已删除 ${keep_days} 天前的日志"

  # Compress what is left and older than the compression age. gzip keeps
  # the original modification time, so expiry still counts from there.
  find "$dir" -type f -name '*.log' -mtime +"$compress_days" \
    -exec gzip -- {} +
  echo "已压缩 ${compress_days} 天前的日志"

  # Per-entry sizes; fall back to the directory total when it is empty.
  echo ""
  echo "当前日志大小:"
  local entries=("$dir"/*)
  if [[ -e "${entries[0]}" ]]; then
    du -sh -- "${entries[@]}"
  else
    du -sh -- "$dir"
  fi
}

cleanup_logs "$LOG_DIR" "$KEEP_DAYS" "$COMPRESS_DAYS"
10.5 用户和权限审计
# Accounts with UID 0 (superuser).
$ awk -F: '$3 == 0 {print $1}' /etc/passwd
# Accounts without a usable password: an empty field means no password is
# required, "!" means no password has been set (password login disabled).
$ awk -F: '($2 == "" || $2 == "!") {print $1}' /etc/shadow 2>/dev/null
# Accounts whose password has not been changed for more than 90 days.
# Field 3 of /etc/shadow is the last change date in days since 1970-01-01,
# so it is compared with today's day number. Entries with an empty, locked
# ("!...") or disabled ("*") password are skipped.
$ awk -F: -v today="$(( $(date +%s) / 86400 ))" '
    $2 != "" && $2 !~ /^[!*]/ && $3 != "" && today - $3 > 90 {
      print $1, "密码超过 90 天未修改"
    }' /etc/shadow 2>/dev/null
# Accounts whose password policy is weak: the maximum password age
# (field 5) is above 90 days or not set at all.
$ awk -F: '{
    if ($5 > 90 || $5 == "") print $1, "密码有效期超过 90 天或未设置过期策略"
  }' /etc/shadow 2>/dev/null
# Accounts with a login shell.
$ awk -F: '$7 !~ /(nologin|false|sync|shutdown|halt)/ {print $1, $7}' /etc/passwd
# Members of the sudo / wheel groups. The group name is compared exactly,
# so a group that merely starts with "sudo" is not listed; groups without
# members print nothing.
$ awk -F: '($1 == "sudo" || $1 == "wheel") && $4 != "" {print $4}' /etc/group | tr ',' '\n'
# Configuration files modified within the last 7 days. find prints the
# paths itself, so names containing spaces stay intact.
$ find /etc -type f -mtime -7 2>/dev/null
10.6 网络管理
# List listening TCP ports with the owning process.
# Column 4 is Local Address:Port; the port is the text after the last ":".
# Column 6 holds users:(("name",pid=...)); the name sits between the first
# pair of double quotes (empty when ss runs without enough privileges).
$ ss -tlnp | awk 'NR>1 {
    split($4, a, ":")
    port = a[length(a)]
    split($6, b, "\"")
    proc = b[2]
    printf "端口: %-8s 进程: %s\n", port, proc
  }' | sort -t: -k2 -n
# Count TCP sockets per local port (top 10).
# -t restricts the output to TCP, so there is no leading Netid column and
# Local Address:Port really is column 4. With plain `ss -an` that column
# is Send-Q and the counts are meaningless.
$ ss -tan | awk 'NR>1 {
    split($4, a, ":")
    port = a[length(a)]
    count[port]++
  } END {
    for (p in count) printf "%6d 端口 %s\n", count[p], p
  }' | sort -rn | head -10
# Firewall rule audit.
# Rule lines are: target prot opt source destination. Matching the target
# exactly in column 1 skips the "Chain ... (policy ACCEPT)" header lines,
# and the protocol is column 2 (column 3 is the option field).
$ iptables -L -n 2>/dev/null | awk '$1 ~ /^(ACCEPT|DROP|REJECT)$/ {
    action = $1
    protocol = $2
    source = $4
    dest = $5
    printf "%-8s %-6s %-20s %-20s\n", action, protocol, source, dest
  }'
10.7 自动化部署辅助
#!/bin/bash
# deploy_check.sh — pre-deployment sanity checks.
# Prints one status line per check; nothing is modified.
APP_DIR="/opt/app"
CONFIG_FILE="${APP_DIR}/config.yml"

# read_port FILE
# Prints the first numeric value of a key named exactly `port` in FILE and
# nothing when there is none (or FILE is unreadable). Keys that merely
# contain "port" (admin_port, support, ...) and trailing comments are
# ignored. This is a line-based shortcut, not a YAML parser.
read_port() {
  awk -F: '$1 ~ /^[ \t]*port[ \t]*$/ {
    val = $2
    sub(/#.*$/, "", val)
    gsub(/[ \t"]/, "", val)
    if (val ~ /^[0-9]+$/) { print val; exit }
  }' "$1" 2>/dev/null
}

echo "=== 部署前检查 ==="

# 1. Directory permissions.
if [ -w "$APP_DIR" ]; then
  echo "✅ 目录可写"
else
  echo "❌ 目录不可写"
fi

# 2. Configuration syntax (only when yq is installed).
if command -v yq > /dev/null 2>&1; then
  if yq eval '.' "$CONFIG_FILE" > /dev/null 2>&1; then
    echo "✅ 配置文件语法正确"
  else
    echo "❌ 配置文件语法错误"
  fi
fi

# 3. Is the configured port already in use?
PORT=$(read_port "$CONFIG_FILE")
if [ -z "$PORT" ]; then
  echo "⚠️ 未能从 ${CONFIG_FILE} 读取端口"
elif ss -tln | grep -qE ":${PORT}[[:space:]]"; then
  echo "⚠️ 端口 ${PORT} 已被占用"
else
  echo "✅ 端口 ${PORT} 可用"
fi

# 4. Free disk space in MB; -P keeps the data on line 2 of the output.
AVAIL=$(df -Pm "$APP_DIR" 2>/dev/null | awk 'NR==2{print $4}')
if [[ ! "$AVAIL" =~ ^[0-9]+$ ]]; then
  echo "⚠️ 无法获取磁盘可用空间"
elif [ "$AVAIL" -gt 100 ]; then
  echo "✅ 磁盘可用空间: ${AVAIL}MB"
else
  echo "⚠️ 磁盘可用空间不足: ${AVAIL}MB"
fi

# 5. Dependent services.
for svc in nginx mysql redis; do
  if systemctl is-active "$svc" > /dev/null 2>&1; then
    echo "✅ ${svc} 运行中"
  else
    echo "❌ ${svc} 未运行"
  fi
done
扩展阅读
下一章:第 11 章:日志分析 — 访问日志、错误统计、趋势分析、告警。