系统:Centos 6.4
服务器:hostname:Nagios-Server kernel:2.6.32-358.el6.x86_64 IP:1.1.1.26
客户端:hostname:Nagios-Client kernel:2.6.32-358.el6.x86_64 IP:1.1.1.27
升级内核:
[root@Nagios-Server ~]# yum install ntpdate -y #安装ntpdate,时间同步
[root@Nagios-Server ~]# /usr/sbin/ntpdate time.nist.gov #时间同步
[root@Nagios-Server ~]# yum install kernel kernel-devel gcc gcc-c++ wget vim -y #升级内核
[root@Nagios-Client ~]# yum install ntpdate -y
[root@Nagios-Client ~]# /usr/sbin/ntpdate time.nist.gov
[root@Nagios-Client ~]# yum install kernel kernel-devel gcc gcc-c++ wget vim -y
升级后内核:
服务器:2.6.32-504.1.3.el6.x86_64
客户端:2.6.32-504.1.3.el6.x86_64
日志分析:
一、服务
①:check_users服务监控,第一次soft、第二次soft、第三次hard,再然后发送邮件
[1417348396] Warning: Return code of 255 for check of service 'check_users' on host '1.1.1.27' was out of bounds.
[1417348396] SERVICE ALERT: 1.1.1.27;check_users;CRITICAL;SOFT;1;(Return code of 255 is out of bounds)
[1417348456] Warning: Return code of 255 for check of service 'check_users' on host '1.1.1.27' was out of bounds.
[1417348456] SERVICE ALERT: 1.1.1.27;check_users;CRITICAL;SOFT;2;(Return code of 255 is out of bounds)
[1417348516] Warning: Return code of 255 for check of service 'check_users' on host '1.1.1.27' was out of bounds.
[1417348516] SERVICE ALERT: 1.1.1.27;check_users;CRITICAL;HARD;3;(Return code of 255 is out of bounds)
[1417348516] SERVICE NOTIFICATION: nagiosadmin;1.1.1.27;check_users;CRITICAL;notify-service-by-email;(Return code of 255 is out of bounds)
②:check_zombie_procs服务监控,第一次soft、第二次soft、第三次hard,再然后发送邮件
[1417348426] Warning: Return code of 255 for check of service 'check_zombie_procs' on host '1.1.1.27' was out of bounds.
[1417348426] SERVICE ALERT: 1.1.1.27;check_zombie_procs;CRITICAL;SOFT;1;(Return code of 255 is out of bounds)
[1417348486] Warning: Return code of 255 for check of service 'check_zombie_procs' on host '1.1.1.27' was out of bounds.
[1417348486] SERVICE ALERT: 1.1.1.27;check_zombie_procs;CRITICAL;SOFT;2;(Return code of 255 is out of bounds)
[1417348546] Warning: Return code of 255 for check of service 'check_zombie_procs' on host '1.1.1.27' was out of bounds.
[1417348546] SERVICE ALERT: 1.1.1.27;check_zombie_procs;CRITICAL;HARD;3;(Return code of 255 is out of bounds)
[1417348546] SERVICE NOTIFICATION: nagiosadmin;1.1.1.27;check_zombie_procs;CRITICAL;notify-service-by-email;(Return code of 255 is out of bounds)
③:check_total_procs服务监控,第一次soft、第二次soft、第三次hard,再然后发送邮件
[1417348436] Warning: Return code of 255 for check of service 'check_total_procs' on host '1.1.1.27' was out of bounds.
[1417348436] SERVICE ALERT: 1.1.1.27;check_total_procs;CRITICAL;SOFT;1;(Return code of 255 is out of bounds)
[1417348496] Warning: Return code of 255 for check of service 'check_total_procs' on host '1.1.1.27' was out of bounds.
[1417348496] SERVICE ALERT: 1.1.1.27;check_total_procs;CRITICAL;SOFT;2;(Return code of 255 is out of bounds)
[1417348556] Warning: Return code of 255 for check of service 'check_total_procs' on host '1.1.1.27' was out of bounds.
[1417348556] SERVICE ALERT: 1.1.1.27;check_total_procs;CRITICAL;HARD;3;(Return code of 255 is out of bounds)
[1417348556] SERVICE NOTIFICATION: nagiosadmin;1.1.1.27;check_total_procs;CRITICAL;notify-service-by-email;(Return code of 255 is out of bounds)
二、主机
[1417349046] HOST ALERT: 1.1.1.27;DOWN;SOFT;1;CRITICAL - Host Unreachable (1.1.1.27)
[1417349116] HOST ALERT: 1.1.1.27;DOWN;SOFT;2;CRITICAL - Host Unreachable (1.1.1.27)
[1417349186] HOST ALERT: 1.1.1.27;DOWN;HARD;3;CRITICAL - Host Unreachable (1.1.1.27)
[1417349186] HOST NOTIFICATION: nagiosadmin;1.1.1.27;DOWN;notify-host-by-email;CRITICAL - Host Unreachable (1.1.1.27)
配置:
①:配置报警邮箱
# Change the address that alert e-mails are sent to.
# The pattern accepts any run of spaces/tabs between "email" and the old address
# (the directive and its value may be column-aligned) and keeps that spacing intact.
sed -i 's#\(email[[:space:]][[:space:]]*\)nagios@localhost#\1byrd_monitor@163.com#g' /usr/local/nagios/etc/objects/contacts.cfg
②:配置主机报警频次(备注:可以自定义,也可以修改/usr/local/nagios/etc/objects/templates.cfg)
# Host template "linux-server": check and notification cadence for Linux hosts.
# Trailing comments use ';' because Nagios only treats '#' as a comment marker
# at the start of a line; an inline '#' would become part of the directive value.
# "Time unit" below means interval_length seconds (one minute with the default of 60).
define host{
    name                    linux-server        ; template name, referenced by other objects via "use"
    use                     generic-host        ; inherit all remaining values from the generic-host template
    check_period            24x7                ; checks run around the clock
    check_interval          2                   ; check every 2 time units
    retry_interval          1                   ; after a failed check, retry every 1 time unit
    max_check_attempts      3                   ; the 3rd consecutive failed check makes the state HARD and triggers notification
    check_command           check-host-alive    ; command used to test whether the host is alive
    notification_period     24x7                ; notifications may be sent at any time (not only working hours)
    notification_interval   2                   ; while the problem persists, re-notify every 2 time units
    notification_options    d,u,r               ; notify on DOWN, UNREACHABLE and RECOVERY
    contact_groups          admins              ; contact group that receives the notifications
    register                0                   ; template only: do not register this as a real host
}
③:配置服务报警频次
# Service template "generic-service": default check and notification settings for services.
# Trailing comments use ';' because Nagios only treats '#' as a comment marker
# at the start of a line; an inline '#' would become part of the directive value.
# "Time unit" below means interval_length seconds (one minute with the default of 60).
define service{
    name                            generic-service ; template name, referenced by other objects via "use"
    active_checks_enabled           1               ; active service checks are enabled
    passive_checks_enabled          1               ; passive service checks are accepted
    parallelize_check               1               ; run active checks in parallel
    obsess_over_service             1               ; used for distributed monitoring (1 = on, 0 = off)
    check_freshness                 0               ; do not check result "freshness"
    notifications_enabled           1               ; service notifications are enabled
    event_handler_enabled           1               ; service event handler is enabled
    flap_detection_enabled          1               ; flap detection is enabled
    failure_prediction_enabled      1               ; failure prediction is enabled (NOTE(review): may be deprecated in newer Nagios Core releases - confirm against the installed version)
    process_perf_data               1               ; process performance data
    retain_status_information       1               ; keep status information across program restarts
    retain_nonstatus_information    1               ; keep non-status information across program restarts
    is_volatile                     0               ; the service is not volatile
    check_period                    24x7            ; checks run around the clock
    max_check_attempts              3               ; re-check up to 3 times before the state is considered HARD
    normal_check_interval           1               ; under normal conditions, check every 1 time unit
    retry_check_interval            1               ; after a failure, re-check every 1 time unit until the HARD state is known
    contact_groups                  admins          ; contact group that receives the notifications
    notification_options            w,u,c,r         ; notify on WARNING, UNKNOWN, CRITICAL and RECOVERY
    notification_interval           2               ; while the problem persists, re-notify every 2 time units
    notification_period             24x7            ; notifications may be sent at any time
    register                        0               ; template only: do not register this as a real service
}