10 月 31 2014
 

查看系统日志,发现大量xinetd下nrpe日志

[root@localhost ~]# less /var/log/messages
 Oct 31 11:49:30 localhost xinetd[9646]: START: nrpe pid=10372 from=::ffff:192.168.153.110
 Oct 31 11:49:30 localhost xinetd[9646]: EXIT: nrpe status=0 pid=10372 duration=0(sec)
 Oct 31 11:51:15 localhost xinetd[9646]: START: nrpe pid=10642 from=::ffff:192.168.153.110
 Oct 31 11:51:15 localhost xinetd[9646]: EXIT: nrpe status=0 pid=10642 duration=0(sec)

修改配置文件,禁用成功状态下的日志

[root@localhost ~]# vi /etc/xinetd.conf
 # Define general logging characteristics.
                log_type        = SYSLOG daemon info
                log_on_failure  = HOST
 #              log_on_success  = PID HOST DURATION EXIT

nrpe-xinetd-log-disable

重新启动xinetd服务后再次查看日志,不再出现nrpe相关日志

Oct 31 11:52:05 localhost xinetd[9646]: Exiting...
Oct 31 11:52:05 localhost xinetd[10785]: xinetd Version 2.3.14 started with libwrap loadavg labeled-networking options compiled in.
Oct 31 11:52:05 localhost xinetd[10785]: Started working: 1 available service
4 月 18 2013
 

创建主机组配置文件

define hostgroup{
 hostgroup_name  IDC
 alias           IDC Servers
 members         mail-server
 }

定义主机和服务

define host{
 use                     linux-server
 host_name               mail-server
 alias                   postfix mail server
 address                 101.78.×××.××
 }
define service{
 use                             generic-service
 host_name                       mail-server
 service_description             Webmail
 check_command                   check_http
 notifications_enabled           0
 }
define service{
 use                             generic-service
 host_name                       mail-server
 service_description             POP3 Service
 check_command                   check_pop!100.0,20%!500.0,60%
 notifications_enabled           0
 }
define service{
 use                             generic-service
 host_name                       mail-server
 service_description             SMTP Service
 check_command                   check_smtp!100.0,20%!500.0,60%
 notifications_enabled           0
 }

完整检测命令的实现

[root@monitor libexec]# ./check_pop -H 101.78.×××.×× -p 110 -w 100.0 -c 200.0
POP OK – 0.051 second response time on port 110 [+OK Dovecot ready.]|time=0.051335s;;500.000000;0.000000;10.000000
[root@monitor libexec]#

[root@monitor libexec]# ./check_smtp -H 101.78.×××.×× -p 25 -4 -w 100 -c 500
SMTP OK – 0.096 sec. response time|time=0.095827s;100.000000;500.000000;0.000000
[root@monitor libexec]#

在nagios web管理界面中看到的新增监控项目

nagios-config-02-01

查看新增的主机组

nagios-config-02-02

主机状态通告显示新增主机为Down状态
原因是目的主机前端防火墙禁止PING响应nagios-config-02-03

 

 

 

nagios-config-02-04

查看主机状态信息并关闭主机活动检测nagios-config-02-05

nagios-config-02-06

成功执行命令

nagios-config-02-07

确认主机活动检测已关闭nagios-config-02-08

nagios-config-02-09

4 月 17 2013
 

……

在监控主机端安装check_nrpe插件

[root@monitor ~]# tar xzvf nrpe-2.14.tar.gz
[root@monitor ~]# cd nrpe-2.14
[root@monitor nrpe-2.14]# ./configure
[root@monitor nrpe-2.14]# make all
[root@monitor nrpe-2.14]# make install-plugin
[root@monitor nrpe-2.14]# ls /usr/local/nagios/libexec/check_nrpe
 /usr/local/nagios/libexec/check_nrpe
[root@monitor nrpe-2.14]#

在被监控主机端安装nagios plugin与nrpe daemon

[root@localhost nrpe-2.14]# useradd nagios -s /sbin/nologin

安装nagios plugin

[root@localhost nagios-plugins-1.4.16]# ./configure
[root@localhost nagios-plugins-1.4.16]# make
[root@localhost nagios-plugins-1.4.16]# make install
[root@localhost nagios-plugins-1.4.16]# chown -R nagios.nagios /usr/local/nagios/

[root@localhost nrpe-2.14]# yum install gcc make openssl-devel

[root@localhost ~]# yum install xinetd

[root@localhost nrpe-2.14]# ./configure
[root@localhost nrpe-2.14]# make
[root@localhost nrpe-2.14]# make install
[root@localhost nrpe-2.14]# make install-daemon
[root@localhost nrpe-2.14]# make install-daemon-config
[root@localhost nrpe-2.14]# make install-xinetd

修改xinetd配置文件中允许连接NRPE的监控主机IP地址(only_from,多个地址以空格分隔)
[root@localhost nrpe-2.14]# vi /etc/xinetd.d/nrpe
only_from       = 127.0.0.1

[root@localhost nrpe-2.14]# vi /etc/services
nrpe            5666/tcp                # NRPE Daemon

[root@localhost nrpe-2.14]# service xinetd start
Starting xinetd:                                           [  OK  ]
[root@localhost nrpe-2.14]#

[root@localhost nrpe-2.14]# netstat -at |grep nrpe
tcp        0      0 *:nrpe                      *:*                         LISTEN
[root@localhost nrpe-2.14]#

[root@localhost nrpe-2.14]# /usr/local/nagios/libexec/check_nrpe -H localhost
NRPE v2.14
[root@localhost nrpe-2.14]#

在监控主机使用命令检测被监控主机当前登录用户数量

[root@monitor libexec]# ./check_nrpe -H 192.168.1.90 -c check_users
USERS OK – 1 users currently logged in |users=1;5;10;0
[root@monitor libexec]#

使用NRPE需要定义被监控主机所使用的命令
[root@localhost nrpe-2.14]# vi /usr/local/nagios/etc/nrpe.cfg
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/hda1
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200

在监控主机创建check_nrpe命令定义

# ‘check_nrpe’ command definition
define command{
command_name    check_nrpe
command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}

增加被监控主机的监控服务定义

[root@monitor etc]# vi servers/szvs-v01.cfg
define service{
use                             generic-service
host_name                       szvs-v01
service_description             Current Users
check_command                   check_nrpe!check_users
}

define service{
use                             generic-service
host_name                       szvs-v01
service_description             CPU Load
check_command                   check_nrpe!check_load
}

nagios-nrpe-install-01 nagios-nrpe-install-02

相关下载:
(1) NRPE 2.14

4 月 17 2013
 

确定将要使用的在templates.cfg已定义的主机模板

在主配置文件中取消注释以下内容

 #cfg_dir=/usr/local/nagios/etc/servers

创建目录

[root@monitor objects]# mkdir ../servers
 [root@monitor objects]# chown -R nagios.nagios ../servers/
 [root@monitor objects]# chmod -R 775 ../servers/

创建主机组配置文件szvs.cfg并添加主机

define hostgroup{
 hostgroup_name  v-servers
 alias           Virtualization Servers
 members         szvs-v01
 }

创建主机配置文件szvs-v01.cfg

定义主机部分,使用linux-server主机模板

define host{
 use                     linux-server
 host_name               szvs-v01
 alias                   app
 address                 192.168.1.90
 }

定义服务部分

引用的服务模板generic-service来自模板配置文件

define service{
 use                             generic-service
 host_name                       szvs-v01
 service_description             PING
 check_command                   check_ping!100.0,20%!500.0,60%
 notifications_enabled           0
 }
define service{
 use                             generic-service
 host_name                       szvs-v01
 service_description             HTTP
 check_command                   check_http
 notifications_enabled           0
 }

检测当前配置文件正确性

[root@monitor objects]# ../../bin/nagios -v ../nagios.cfg

Nagios Core 3.5.0
Copyright (c) 2009-2011 Nagios Core Development Team and Community Contributors
Copyright (c) 1999-2009 Ethan Galstad
Last Modified: 03-15-2013
License: GPL

Website: http://www.nagios.org
Reading configuration data…
Read main config file okay…
Processing object config file ‘/usr/local/nagios/etc/objects/commands.cfg’…
Processing object config file ‘/usr/local/nagios/etc/objects/contacts.cfg’…
Processing object config file ‘/usr/local/nagios/etc/objects/timeperiods.cfg’…
Processing object config file ‘/usr/local/nagios/etc/objects/templates.cfg’…
Processing object config file ‘/usr/local/nagios/etc/objects/localhost.cfg’…
Processing object config directory ‘/usr/local/nagios/etc/servers’…
Processing object config file ‘/usr/local/nagios/etc/servers/szvs.cfg’…
Processing object config file ‘/usr/local/nagios/etc/servers/szvs-v01.cfg’…
Read object config files okay…

Running pre-flight check on configuration data…

Checking services…
Checked 9 services.
Checking hosts…
Checked 2 hosts.
Checking host groups…
Checked 2 host groups.
Checking service groups…
Checked 0 service groups.
Checking contacts…
Checked 1 contacts.
Checking contact groups…
Checked 1 contact groups.
Checking service escalations…
Checked 0 service escalations.
Checking service dependencies…
Checked 0 service dependencies.
Checking host escalations…
Checked 0 host escalations.
Checking host dependencies…
Checked 0 host dependencies.
Checking commands…
Checked 24 commands.
Checking time periods…
Checked 5 time periods.
Checking for circular paths between hosts…
Checking for circular host and service dependencies…
Checking global event handlers…
Checking obsessive compulsive processor commands…
Checking misc settings…

Total Warnings: 0
Total Errors:   0

Things look okay – No serious problems were detected during the pre-flight check
[root@monitor objects]#

重启nagios服务

[root@monitor objects]# service nagios restart
 Running configuration check...done.
 Stopping nagios: .done.
 Starting nagios:This account is currently not available.
 done.
 [root@monitor objects]#

查看重启以后的nagios日志信息

nagios-config-01-01

 

查看主机地图

nagios-config-01-02

查看主机列表

nagios-config-01-03

查看服务列表

nagios-config-01-04

 

 

查看主机组列表

nagios-config-01-05

检测项目的任务队列

首次添加的任务显示为PENDING状态,等待检测

nagios-config-01-06

查看任务队列中的检测时间排列

nagios-config-01-07

成功执行新添加的HTTP服务检测nagios-config-01-08

4 月 17 2013
 

……

nagios-monitor-01

主配置文件nagios.cfg默认引用的对象配置文件

cfg_file=/usr/local/nagios/etc/objects/commands.cfg
cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/objects/templates.cfg
cfg_file=/usr/local/nagios/etc/objects/localhost.cfg

Localhost监控对象配置文件分析

定义主机

define host{
        use                     linux-server
        host_name               localhost
        alias                   localhost
        address                 127.0.0.1
        }

use 定义当前主机使用的主机模板,引用在templates.cfg中已定义的linux-server主机模板
host_name 定义显示在nagios web管理界面中的主机名称
alias 别名,主机名的完整描述
address 定义当前主机IP地址

定义主机组

 define hostgroup{
 hostgroup_name  linux-servers
 alias           Linux Servers
 members         localhost
 }

主机组用来定义相似服务类型或处于同一地域的一组主机
hostgroup_name 定义当前主机组名称
alias 主机组别名,主机组名称的完整描述
members 定义当前主机组中包含的主机,使用已定义主机名称并使用逗号分隔

定义服务(定义具体的监控项目)

监控已定义主机localhost的ping响应
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             PING
check_command                   check_ping!100.0,20%!500.0,60%
}

use 引用在templates.cfg中已定义的服务模板
host_name 指定启用此监控项目的已定义的主机名
service_description 显示在nagios web界面的服务名称
check_command 指定执行此监控项目所使用的检测命令,命令名称与各参数之间使用!分隔

配置文件中检测命令的完整工作过程

查看

[root@monitor objects]# pwd
 /usr/local/nagios/etc/objects
[root@monitor objects]# ls ../../libexec/check_ping
 ../../libexec/check_ping
[root@monitor objects]#
[root@monitor objects]# ../../libexec/check_ping
 check_ping: Could not parse arguments
 Usage:
 check_ping -H <host_address> -w <wrta>,<wpl>% -c <crta>,<cpl>%
[-p packets] [-t timeout] [-4|-6]
[root@monitor objects]#

-w 指定警告数值和百分比
-c 指定临界数值和百分比

nagios检测命令的完整格式及返回结果

[root@monitor objects]# ../../libexec/check_ping -H localhost -w 100.0,20% -c 500.0,60%
 PING OK - Packet loss = 0%, RTA = 0.05 ms|rta=0.046000ms;100.000000;500.000000;0.000000 pl=0%;20;60;0
 [root@monitor objects]#

响应时间达到或超过100毫秒进入警告状态,响应时间达到或超过500毫秒进入临界状态
检测得到的实际响应时间为0.05毫秒

监控已定义主机的localhost的根分区(/)可用磁盘容量

define service{
 use                             local-service         ; Name of service template to use
 host_name                       localhost
 service_description             Root Partition
 check_command                   check_local_disk!20%!10%!/
 }

check_command中的check_local_disk实际引用的是命令定义文件commands.cfg中已定义的命令名称
# ‘check_local_disk’ command definition
define command{
command_name    check_local_disk
command_line    $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}

nagios检测命令的完整格式及返回结果
[root@monitor libexec]# ./check_disk
check_disk: Could not parse arguments
Usage:
check_disk -w limit -c limit [-W limit] [-K limit] {-p path | -x device}
[-C] [-E] [-e] [-g group ] [-k] [-l] [-M] [-m] [-R path ] [-r path ]
[-t timeout] [-u unit] [-v] [-X type]
[root@monitor libexec]# ./check_disk -w 20% -c 10% -p /
DISK OK – free space: / 45970 MB (96% inode=98%);| /=1866MB;40316;45356;0;50396
[root@monitor libexec]#

监控已定义主机的localhost的当前登录用户
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Current Users
check_command                   check_local_users!20!50
}

监控已定义主机的localhost的进程数量
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Total Processes
check_command                   check_local_procs!250!400!RSZDT
}

监控已定义主机的localhost的负载状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Current Load
check_command                   check_local_load!5.0,4.0,3.0!10.0,6.0,4.0
}

监控已定义主机的localhost的磁盘交换空间使用状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Swap Usage
check_command                   check_local_swap!20!10
}

监控已定义主机的localhost的ssh服务或端口开启状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             SSH
check_command                   check_ssh
notifications_enabled           0
}

监控已定义主机的localhost的web服务或80端口状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             HTTP
check_command                   check_http
notifications_enabled           0
}

4 月 16 2013
 

……44

dsdf

[root@localhost nagios]# useradd nagios -s /sbin/nologin
[root@localhost nagios]# groupadd nagcmd
[root@localhost nagios]#
[root@localhost nagios]# usermod -a -G nagcmd nagios
[root@localhost nagios]# usermod -a -G nagcmd apache

安装编译工具

yum install gcc make

编译Nagios时缺少GD相关软件开发包

*** GD, PNG, and/or JPEG libraries could not be located... *********

安装GD及相关软件开发包

[root@localhost nagios]# yum install gd-devel libpng-devel libjpeg-devel

nagios-install-01

编译安装

[root@localhost nagios]# ./configure --prefix=/usr/local/nagios \
> --with-nagios-user=nagios --with-nagios-group=nagios \
> --with-command-user=apache --with-command-group=nagcmd
[root@localhost nagios]# make all
[root@localhost nagios]# make install
[root@localhost nagios]# make install-init
[root@localhost nagios]# make install-config
[root@localhost nagios]# make install-commandmode
[root@localhost nagios]# make install-webconf

生成登录用户及密码

[root@localhost nagios]# htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
 New password:
 Re-type new password:
 Adding password for user nagiosadmin
[root@localhost nagios]#

 

service httpd restart

错误分析

make[2]: *** [check_http.o] Error 1
make[2]: Leaving directory `/root/nagios-plugins-1.4.16/plugins'
make[1]: *** [all-recursive] Error 1
make[1]: Leaving directory `/root/nagios-plugins-1.4.16'
make: *** [all] Error 2

yum install openssl-devel

[root@localhost nagios-plugins-1.4.16]# ./configure --with-nagios-user=nagios --with-nagios-group=nagios
[root@localhost ~]# /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
[root@monitor ~]# chkconfig --add nagios
[root@monitor ~]# service nagios start
Starting nagios:This account is currently not available.
done.
[root@monitor ~]#

nagios-install-02

nagios-install-03

 

nagios-install-04

 

nagios-install-05

 

nagios-install-06

相关下载:
(1) Nagios Core 3.5.0 (2) Nagios Plugin 1.4.16