Nagios 监控运维完整教程 / 第12章:Docker 部署
第12章:Docker 部署
容器化部署可以简化 Nagios 的安装和管理。本章介绍 Docker 环境下的 Nagios 部署、配置管理、插件扩展以及监控容器本身的方法。
一、Docker 部署概述
1.1 容器化优势
| 优势 | 说明 |
|---|---|
| 快速部署 | 一条命令启动完整监控环境 |
| 环境隔离 | 不影响宿主机环境 |
| 易于迁移 | 容器可在任意 Docker 主机运行 |
| 版本管理 | 镜像版本化,轻松回滚 |
| 资源限制 | CPU/内存可精确控制 |
1.2 架构
┌─────────────────────────────────────────────────┐
│                   Docker Host                   │
│                                                 │
│  ┌──────────────────────────────────────────┐   │
│  │               nagios 容器                │   │
│  │  ┌──────────┐ ┌──────────┐ ┌──────────┐  │   │
│  │  │  Nagios  │ │  Apache  │ │PNP4Nagios│  │   │
│  │  │   Core   │ │  (CGI)   │ │  (图表)  │  │   │
│  │  └──────────┘ └──────────┘ └──────────┘  │   │
│  └──────────────────────────────────────────┘   │
│             │                      │            │
│    ┌────────▼───────┐     ┌────────▼───────┐    │
│    │   config 卷    │     │   plugins 卷   │    │
│    │   (配置文件)   │     │  (自定义插件)  │    │
│    └────────────────┘     └────────────────┘    │
│                                                 │
└─────────────────────────────────────────────────┘
二、Docker 快速部署
2.1 使用预构建镜像
# Pull the prebuilt image.
docker pull jasonrivers/nagios:latest

# Start the container:
#   - host port 8080 is forwarded to port 80 inside the container (web UI)
#   - the bind mounts keep configuration, custom plugins and runtime data on the host
#   - NAGIOSADMIN_PASS is the variable the jasonrivers/nagios image reads for the
#     web UI password (NAGIOSADMIN_PASSWD is not one of its documented variables)
docker run -d \
  --name nagios \
  -p 8080:80 \
  -v /opt/nagios/config:/opt/nagios/etc \
  -v /opt/nagios/plugins:/opt/nagios/libexec/custom \
  -v /opt/nagios/var:/opt/nagios/var \
  -e NAGIOSADMIN_PASS=nagiosadmin \
  jasonrivers/nagios:latest

# Web UI:   http://localhost:8080/nagios/
# User:     nagiosadmin
# Password: nagiosadmin (the value passed in NAGIOSADMIN_PASS above)
2.2 Docker Compose 部署
# docker-compose.yml
# Monitoring stack: Nagios (checks + web UI), Grafana (dashboards) and
# InfluxDB (time-series storage), all attached to one bridge network.
version: '3.8'

services:
  nagios:
    image: jasonrivers/nagios:latest
    container_name: nagios
    restart: unless-stopped
    ports:
      - "8080:80"
    environment:
      # The jasonrivers/nagios image reads the web UI password from NAGIOSADMIN_PASS.
      - NAGIOSADMIN_PASS=YourSecurePassword
      # NOTE(review): this entry was mangled by e-mail obfuscation in the source
      # ("[email protected]"); the variable name is a reconstruction - confirm it.
      - NAGIOSADMIN_EMAIL=admin@example.com
    volumes:
      # Single-file bind mounts: the host files must already exist, otherwise
      # Docker creates directories with these names.
      - ./config/nagios.cfg:/opt/nagios/etc/nagios.cfg
      - ./config/objects:/opt/nagios/etc/objects
      - ./config/conf.d:/opt/nagios/etc/conf.d
      - ./config/resource.cfg:/opt/nagios/etc/resource.cfg
      - ./plugins:/opt/nagios/libexec/custom
      - ./var/nagios:/opt/nagios/var
      - ./var/perfdata:/opt/nagios/var/perfdata
    networks:
      - monitoring
    healthcheck:
      # CMD-SHELL plus "$$" makes the variable expand inside the container at run
      # time. A single "$" is interpolated by Compose from the host environment,
      # and the exec form ("CMD") has no shell to expand it afterwards.
      test: ["CMD-SHELL", "curl -f -u \"nagiosadmin:$${NAGIOSADMIN_PASS}\" http://localhost/nagios/cgi-bin/status.cgi"]
      interval: 30s
      timeout: 10s
      retries: 3

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring

  influxdb:
    image: influxdb:2.7
    container_name: influxdb
    restart: unless-stopped
    ports:
      - "8086:8086"
    volumes:
      - influxdb-data:/var/lib/influxdb2
    environment:
      # First-start bootstrap of the InfluxDB 2.x instance.
      - DOCKER_INFLUXDB_INIT_MODE=setup
      - DOCKER_INFLUXDB_INIT_USERNAME=admin
      - DOCKER_INFLUXDB_INIT_PASSWORD=YourInfluxPassword
      - DOCKER_INFLUXDB_INIT_ORG=monitoring
      - DOCKER_INFLUXDB_INIT_BUCKET=nagios
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge

volumes:
  grafana-data:
  influxdb-data:
# Bring the whole stack up in the background.
docker-compose up --detach

# Stream the Nagios service logs.
docker-compose logs --follow nagios

# Stop and remove the stack's containers and network.
docker-compose down

# Restart only the Nagios service.
docker-compose restart nagios
三、配置管理
3.1 目录结构
/opt/nagios-docker/
├── docker-compose.yml
├── config/
│   ├── nagios.cfg              # 主配置
│   ├── resource.cfg            # 资源宏
│   ├── cgi.cfg                 # CGI 配置
│   ├── objects/
│   │   ├── commands.cfg        # 命令定义
│   │   ├── contacts.cfg        # 联系人
│   │   ├── templates.cfg       # 模板
│   │   ├── timeperiods.cfg     # 时间段
│   │   └── localhost.cfg       # 本机监控
│   └── conf.d/
│       ├── hosts/              # 主机配置
│       │   ├── webservers.cfg
│       │   └── dbservers.cfg
│       ├── services/           # 服务配置
│       └── hostgroups/         # 主机组
├── plugins/                    # 自定义插件
│   ├── check_custom.sh
│   └── check_rest_api.py
└── var/
    ├── nagios/                 # 运行时数据
    └── perfdata/               # 性能数据
3.2 配置文件示例
# config/nagios.cfg - key settings
# Log, object cache, status and retention files are all kept under /opt/nagios/var.
log_file=/opt/nagios/var/nagios.log
object_cache_file=/opt/nagios/var/objects.cache
status_file=/opt/nagios/var/status.dat
# Path of the external command file.
command_file=/opt/nagios/var/rw/nagios.cmd
state_retention_file=/opt/nagios/var/retention.dat
# Directories that hold the object definition files.
cfg_dir=/opt/nagios/etc/objects/
cfg_dir=/opt/nagios/etc/conf.d/
# config/objects/commands.cfg

# Run a remote check through NRPE: $ARG1$ is the remote command name,
# $ARG2$ the arguments handed to it.
define command {
    command_name    check_nrpe
    command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ -a $ARG2$
}

# Run a script from the custom plugin directory: $ARG1$ is the script file
# name, $ARG2$ its arguments. The directory is mounted at
# /opt/nagios/libexec/custom, which is $USER1$/custom.
# NOTE(review): assumes resource.cfg keeps $USER1$=/opt/nagios/libexec - confirm.
define command {
    command_name    check_custom
    command_line    $USER1$/custom/$ARG1$ $ARG2$
}
3.3 配置热重载
# Option 1: send RESTART_NAGIOS through the external command file.
# The write happens inside the container, so the path is valid regardless of
# where the var directory is mounted on the host.
docker exec nagios sh -c 'echo "[$(date +%s)] RESTART_NAGIOS" >> /opt/nagios/var/rw/nagios.cmd'

# Option 2: validate the configuration, and restart only when validation passes
# so a broken configuration never takes the running instance down.
docker exec nagios /opt/nagios/bin/nagios -v /opt/nagios/etc/nagios.cfg \
  && docker restart nagios

# Option 3: restart the service through docker-compose.
docker-compose restart nagios
四、插件扩展
4.1 添加自定义插件
# docker-compose.yml fragment: mount the host plugin directory into the container.
volumes:
  - ./plugins:/opt/nagios/libexec/custom

# Plugins must be executable.
chmod +x ./plugins/*.sh
chmod +x ./plugins/*.py

# Smoke-test a plugin inside the container as the nagios user rather than root,
# so permission problems show up during the test.
# NOTE(review): presumably nagios is the account the checks run under - confirm.
docker exec -u nagios nagios /opt/nagios/libexec/custom/check_custom.sh -H localhost -w 80 -c 95
4.2 Dockerfile 扩展
# Dockerfile.custom - extends the stock image with extra tools, custom plugins
# and custom configuration.
FROM jasonrivers/nagios:latest

# Extra OS packages needed by the custom plugins; the apt lists are removed in
# the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
        python3 \
        python3-pip \
        fping \
        snmp \
    && rm -rf /var/lib/apt/lists/*

# Python libraries used by the plugins; skip pip's download cache so it does
# not end up in the layer.
RUN pip3 install --no-cache-dir requests psutil

# Custom plugins, copied with nagios ownership and made executable.
COPY --chown=nagios:nagios plugins/ /opt/nagios/libexec/custom/
RUN chmod +x /opt/nagios/libexec/custom/*

# Custom configuration. Ownership is set at copy time instead of running a
# recursive chown over all of /opt/nagios, which would copy the whole tree
# into a new layer.
COPY --chown=nagios:nagios config/ /opt/nagios/etc/

EXPOSE 80

# No CMD on purpose: the start command is inherited from the base image.
# NOTE(review): hard-coding supervisord here assumes a binary and config file
# the base image may not ship - confirm before re-adding an override.
# Build the custom image from Dockerfile.custom in the current directory.
docker build --tag my-nagios:latest --file Dockerfile.custom .

# Start the stack. For the custom image to be used, the nagios service in
# docker-compose.yml must reference my-nagios:latest as its image.
docker-compose up --detach
五、监控容器
5.1 使用 Docker API 检查容器状态
#!/bin/bash
# check_docker_container.sh - Nagios plugin that reports a container's state
# using the Docker Engine API over a unix socket.
#
# Usage: check_docker_container.sh <container> [docker_host]
#   container    name or ID of the container to inspect
#   docker_host  Docker endpoint; defaults to unix:///var/run/docker.sock
#                (only unix:// endpoints are supported)
#
# Exit codes:
#   0 OK        container is running and not reported unhealthy
#   2 CRITICAL  container not found, not running, or unhealthy
#   3 UNKNOWN   bad usage, unsupported endpoint, or Docker API unreachable

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

CONTAINER=$1
DOCKER_HOST=${2:-unix:///var/run/docker.sock}

if [ -z "$CONTAINER" ]; then
  echo "UNKNOWN: Usage: $0 <container> [docker_host]"
  exit $STATE_UNKNOWN
fi

# Derive the socket path from the endpoint so the optional second argument is
# actually honored.
case "$DOCKER_HOST" in
  unix://*)
    SOCKET=${DOCKER_HOST#unix://}
    ;;
  *)
    echo "UNKNOWN: Unsupported Docker host '${DOCKER_HOST}' (expected unix://<path>)"
    exit $STATE_UNKNOWN
    ;;
esac

# Inspect the container once; the same JSON document carries both State.Status
# and State.Health.
if ! INSPECT=$(curl -s --unix-socket "$SOCKET" "http://localhost/containers/${CONTAINER}/json"); then
  # curl itself failed (socket missing, permission denied, ...). That says
  # nothing about the container, so report UNKNOWN rather than CRITICAL.
  echo "UNKNOWN: Cannot reach Docker API at '${DOCKER_HOST}'"
  exit $STATE_UNKNOWN
fi

# The parser prints "<status> <health>"; health is "none" when the container
# has no health information. If parsing fails (for example the API answered
# with an error body), both variables stay empty.
read -r STATUS HEALTH <<<"$(printf '%s' "$INSPECT" | python3 -c '
import sys, json
state = json.load(sys.stdin)["State"]
health = state.get("Health") or {}
print(state["Status"], health.get("Status", "none"))
' 2>/dev/null)"

if [ -z "$STATUS" ]; then
  echo "CRITICAL: Container '${CONTAINER}' not found"
  exit $STATE_CRITICAL
fi

if [ "$STATUS" != "running" ]; then
  echo "CRITICAL: Container '${CONTAINER}' is ${STATUS} | status=0"
  exit $STATE_CRITICAL
fi

if [ "$HEALTH" = "unhealthy" ]; then
  echo "CRITICAL: Container '${CONTAINER}' is unhealthy"
  exit $STATE_CRITICAL
fi

echo "OK: Container '${CONTAINER}' is running | status=1"
exit $STATE_OK
5.2 容器资源监控
#!/bin/bash
# check_docker_stats.sh - Nagios plugin that compares a container's CPU and
# memory usage, as reported by `docker stats`, against thresholds.
#
# Usage: check_docker_stats.sh <container> <cpu_warn> <cpu_crit> <mem_warn> <mem_crit>
#   Thresholds are percentages without the % sign (integer or decimal).
#
# Exit codes: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN (bad usage)

CONTAINER=$1
CPU_WARN=$2
CPU_CRIT=$3
MEM_WARN=$4
MEM_CRIT=$5

NUMBER_RE='^[0-9]+([.][0-9]+)?$'

# Succeeds when $1 is strictly greater than $2. awk performs the floating-point
# comparison, so the plugin does not need bc.
gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 > b + 0) }'
}

# Prints the usage line in plugin output format and exits UNKNOWN.
usage() {
  echo "UNKNOWN: Usage: $0 <container> <cpu_warn> <cpu_crit> <mem_warn> <mem_crit>"
  exit 3
}

[ -n "$CONTAINER" ] || usage
for THRESHOLD in "$CPU_WARN" "$CPU_CRIT" "$MEM_WARN" "$MEM_CRIT"; do
  # Reject missing or non-numeric thresholds up front so the check cannot
  # silently fall through to OK.
  [[ $THRESHOLD =~ $NUMBER_RE ]] || usage
done

# Take one sample of CPU% and memory%. Neither value contains a space, so a
# single space is a safe field separator for read.
STATS=$(docker stats --no-stream --format '{{.CPUPerc}} {{.MemPerc}}' "$CONTAINER" 2>/dev/null)
read -r CPU MEM <<<"$STATS"
CPU=${CPU%\%}
MEM=${MEM%\%}

# Empty or non-numeric output means no usable sample was returned.
if ! [[ $CPU =~ $NUMBER_RE && $MEM =~ $NUMBER_RE ]]; then
  echo "CRITICAL: Unable to get stats for container '${CONTAINER}'"
  exit 2
fi

SUMMARY="CPU=${CPU}%, MEM=${MEM}%"
PERFDATA="cpu=${CPU}%;${CPU_WARN};${CPU_CRIT} mem=${MEM}%;${MEM_WARN};${MEM_CRIT}"

# Critical thresholds are evaluated before warning thresholds.
if gt "$CPU" "$CPU_CRIT" || gt "$MEM" "$MEM_CRIT"; then
  echo "CRITICAL: ${SUMMARY} | ${PERFDATA}"
  exit 2
elif gt "$CPU" "$CPU_WARN" || gt "$MEM" "$MEM_WARN"; then
  echo "WARNING: ${SUMMARY} | ${PERFDATA}"
  exit 1
else
  echo "OK: ${SUMMARY} | ${PERFDATA}"
  exit 0
fi
5.3 Nagios 配置
# Command definitions. Both scripts live in the custom plugin directory, which
# is mounted at /opt/nagios/libexec/custom, i.e. $USER1$/custom.
# NOTE(review): assumes resource.cfg keeps $USER1$=/opt/nagios/libexec - confirm.

# $ARG1$: container name or ID.
define command {
    command_name    check_docker_container
    command_line    $USER1$/custom/check_docker_container.sh $ARG1$
}

# $ARG1$: container, $ARG2$/$ARG3$: CPU warning/critical %,
# $ARG4$/$ARG5$: memory warning/critical %.
define command {
    command_name    check_docker_stats
    command_line    $USER1$/custom/check_docker_stats.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$
}

# Service definitions.

# State of the nagios container.
define service {
    use                     generic-service
    host_name               docker-host-01
    service_description     Nagios Container
    check_command           check_docker_container!nagios
}

# CPU and memory of the nagios container: warning at 80 %, critical at 95 %.
define service {
    use                     generic-service
    host_name               docker-host-01
    service_description     Nagios Container Resources
    check_command           check_docker_stats!nagios!80!95!80!95
}

# State of the grafana container.
define service {
    use                     generic-service
    host_name               docker-host-01
    service_description     Grafana Container
    check_command           check_docker_container!grafana
}
六、注意事项
| 注意事项 | 说明 |
|---|---|
| 数据持久化 | 必须挂载卷保存配置和数据 |
| 网络模式 | 监控外部主机使用 host 网络模式 |
| 权限问题 | 容器检查脚本通过 /var/run/docker.sock 访问 Docker,需将该 socket 挂载进 Nagios 容器,并确保运行检查的用户对其有访问权限 |
| 资源限制 | 设置容器 CPU/内存限制 |
| 日志管理 | 配置日志轮转避免磁盘占满 |
| 镜像更新 | 定期更新基础镜像修复安全漏洞 |
| 安全审计 | 自定义镜像需要安全审查 |
七、本章小结
- Docker 部署简化了 Nagios 的安装和管理
- Docker Compose 编排完整的监控栈
- 配置管理通过卷挂载实现外部管理
- 插件扩展通过挂载目录或自定义镜像
- 容器监控需要使用 Docker API 或 socket
下一章:第13章:故障排查 - 学习 Nagios 常见问题的诊断和解决方法。