nagios4/test/test-downtime.pl

1760 lines
55 KiB
Perl
Raw Normal View History

2017-05-19 23:37:19 +02:00
#!/usr/bin/perl -w
# Script to test downtime in Nagios with and without restarts
use English;
use Getopt::Long;
use Algorithm::Permute;
my $testPaused = 0;
my @SLEEPTICKS = ( "-", "\\", "|", "/", "-", "\\", "|", "/");
my $USESLEEPTICKS = 0;
my $maxStatusLevel = 3;
sub statusString {
my $level = shift;
my $message = shift;
return if( $level > $maxStatusLevel);
return sprintf( " %s %s", humanTime( time), $message);
}
# Read the contents of a .dat file such as status.dat or retention.dat
sub readDatFile {
my $file = shift;
# Read the entire file into a variable
my $oldrs = $RS;
$RS = undef;
open( FILE, "$file") || die "Unable to open $file for reading";
my $contents = <FILE>;
close( FILE) || die "Unable to close $file";
$RS = $oldrs;
# Strip comment and blank lines
$contents =~ s/#.*\n//g;
$contents =~ s/^\s*\n//g;
$contents =~ s/\n\s*\n/\n/g;
# Break into sections
my @sections = split( /}\n/, $contents);
my %contents;
# Parse each section
for( my $x = 0; $x < @sections; $x++) {
my @lines = split( /\n/, $sections[ $x]);
# Get the section type
chomp $lines[ 0];
$lines[ 0] =~ s/^\s*//g;
$lines[ 0] =~ s/\s+{\s*$//g;
# Create a hash of the remaining lines
my $href = {};
for( my $y = 1; $y < @lines; $y++) {
next if( $lines[ $y] =~ /^\s*$/);
chomp( $lines[ $y]);
$lines[ $y] =~ s/^\s*//;
$lines[ $y] =~ s/\s*$//;
die "Unknown format for line: " . $lines[ $y] unless( $lines[ $y] =~ /^([^=]+)=(.*)$/);
my( $key, $value) = ( $1, $2);
$href->{ $key} = $value;
}
# Enter stuff into the contents hash
if( $lines[ 0] =~ /^(info|program|programstatus)$/) {
$contents{ $lines[ 0]} = $href;
}
elsif( $lines[ 0] =~ /^(contact|contactstatus|host|hoststatus|service|servicecomment|servicedowntime|servicestatus)$/) {
$contents{ $lines[ 0]} = [] unless( exists( $contents{ $lines[ 0]}));
push( @{ $contents{ $lines[ 0]}}, $href);
}
else {
die "Unanticipated section encountered: " . $lines[ 0];
}
}
return \%contents;
}
# Read the contents of the objects.cache file
sub readObjectCache {
my $file = shift;
# Read the entire file into a variable
my $oldrs = $RS;
$RS = undef;
open( FILE, "$file") || die "Unable to open $file for reading";
my $contents = <FILE>;
close( FILE) || die "Unable to close $file";
$RS = $oldrs;
# Strip comment and blank lines
$contents =~ s/#.*\n//g;
$contents =~ s/^\s*\n//g;
$contents =~ s/\n\s*\n/\n/g;
# Break into sections
my @sections = split( /}\n/, $contents);
my %contents;
# Parse each definition
for( my $x = 0; $x < @sections; $x++) {
my @lines = split( /\n/, $sections[ $x]);
# Get the definition type
chomp $lines[ 0];
my $definitionType = undef;
if( $lines[ 0] =~ /define\s+(\S+)\s*{/) {
$definitionType = $1;
}
die "Unknown definition line: " . $lines[ 0] if( !defined( $definitionType));
# Create a hash of the remaining lines
my $href = {};
for( my $y = 1; $y < @lines; $y++) {
chomp( $lines[ $y]);
next if( $lines[ $y] =~ /^\s*$/);
die "Unknown format for line: " . $lines[ $y] unless( $lines[ $y] =~ /^\s*(\S+)\s+(.*)\s*$/);
my( $key, $value) = ( $1, $2);
$href->{ $key} = $value;
}
# Enter stuff into the contents hash
$contents{ $definitionType} = [] unless( exists( $contents{ $definitionType}));
push( @{ $contents{ $definitionType}}, $href);
}
return \%contents;
}
# Read the contents of the nagios.cfg file.
# NOTE: This function does not recursively read the contents of configuration
# files specified in the cfg_file and cfg_dir directives.
sub readCfgFile {
my $file = shift;
my %cfg;
open( FILE, "$file") || die "Unable to open $file for reading";
while( <FILE>) {
chomp;
next if( /^\s*#/);
next if( /^\s*$/);
if( /^([^=]+)\s*=\s*(.*)$/) {
my( $key, $value) = ( $1, $2);
if( $key =~ /cfg_file|cfg_dir/) {
$cfg{ $key} = [] unless( exists( $cfg{ $key}));
push( @{ $cfg{ $key}}, $value);
}
else {
die "Parameter $key already exists" if( exists( $cfg{ $key}));
$cfg{ $key} = $value;
}
}
else {
die "Unknown formatting for line: $_";
}
}
return \%cfg;
}
# Send a command to Nagios to schedule downtime
sub scheduleDowntime {
my $service = shift;
my $start = shift;
my $end = shift;
my $fixed = shift;
my $duration = shift;
my $comment = shift;
my $cmd = sprintf( "%s;%s;%s;%lu;%lu;%d;%u;%u;%s;%s",
"SCHEDULE_SVC_DOWNTIME", $service->{ "host_name"},
$service->{ "service_description"}, $start, $end, $fixed,
0, $duration, "Downtime Testing", $comment);
my $cmdfile = "/usr/local/nagios/var/rw/nagios.cmd";
open( CMD, ">$cmdfile") || die "Cannot open $cmdfile for writing";
print CMD sprintf( "[%lu] %s\n", time, $cmd);
close( CMD) || die "Unable to close $cmdfile";
sleep 1;
}
# Send a command to Nagios to delete downtime
sub deleteDowntime {
my $id = shift;
my $cfg = shift;
my $service = shift;
# There is a bug in Nagios that currently causes Nagios to crash when
# deleting a downtime. Instead of deleting the downtime, we'll just
# have to wait until it ends.
my $datfile = readDatFile( $cfg->{ "status_file"});
my $downtimes = findServiceDowntimes( $datfile, $service);
while( scalar( @$downtimes) > 0) {
sleep 1;
$datfile = readDatFile( $cfg->{ "status_file"});
$downtimes = findServiceDowntimes( $datfile, $service);
}
return;
my $cmd = sprintf( "%s;%u", "DEL_SVC_DOWNTIME", $id);
my $cmdfile = "/usr/local/nagios/var/rw/nagios.cmd";
open( CMD, ">$cmdfile") || die "Cannot open $cmdfile for writing";
print CMD sprintf( "[%lu] %s\n", time, $cmd);
close( CMD) || die "Unable to close $cmdfile";
}
# Send a command to Nagios to process passive service check results
sub sendPassiveResults {
my $service = shift;
my $returnCode = shift;
my $pluginOutput = shift;
my $cmd = sprintf( "%s;%s;%s;%u;%s", "PROCESS_SERVICE_CHECK_RESULT",
$service->{ "host_name"}, $service->{ "service_description"},
$returnCode, $pluginOutput);
my $cmdfile = "/usr/local/nagios/var/rw/nagios.cmd";
open( CMD, ">$cmdfile") || die "Cannot open $cmdfile for writing";
print CMD sprintf( "[%lu] %s\n", time, $cmd);
close( CMD) || die "Unable to close $cmdfile";
}
# Determine the amount of time to discover a service failure above
# the normal check results reaper frequency
sub maxDiscoveryTime {
my $cfg = shift;
my $service = shift;
my $versionMajor = shift;
my $maxDiscoveryTime = 0;
if( $service->{ "active_checks_enabled"}) {
$maxDiscoveryTime += ( $service->{ "check_interval"} *
$cfg->{ "interval_length"});
$maxDiscoveryTime += ((( $service->{ "retry_interval"} +
( $versionMajor < 4 ?
$cfg->{ "check_result_reaper_frequency"} : 0)) *
$cfg->{ "interval_length"}) *
( $service->{ "max_check_attempts"} - 1));
}
return $maxDiscoveryTime;
}
# Return information about a particular service recorded in a .dat file
sub findServiceStatus {
my $datfile = shift;
my $service = shift;
my $found;
if( exists( $datfile->{ "servicestatus"})) {
for( my $x = 0; $x < @{ $datfile->{ "servicestatus"}}; $x++) {
if(( $datfile->{ "servicestatus"}->[ $x]->{ "host_name"} eq
$service->{ "host_name"}) &&
( $datfile->{ "servicestatus"}->[ $x]->{ "service_description"} eq
$service->{ "service_description"})) {
return $datfile->{ "servicestatus"}->[ $x];
}
}
}
return \$found;
}
sub findServiceNextCheck {
my $cfg = shift;
my $service = shift;
my $datfile = readDatFile( $cfg->{ "status_file"});
my $serviceStatus = findServiceStatus( $datfile, $service);
return $serviceStatus->{ "next_check"};
}
# Return information about downtimes for a particular service recorded in a
# .dat file
sub findServiceDowntimes {
my $datfile = shift;
my $service = shift;
my @found;
if( exists( $datfile->{ "servicedowntime"})) {
for( my $x = 0; $x < @{ $datfile->{ "servicedowntime"}}; $x++) {
if(( $datfile->{ "servicedowntime"}->[ $x]->{ "host_name"} eq
$service->{ "host_name"}) &&
( $datfile->{ "servicedowntime"}->[ $x]->{ "service_description"} eq
$service->{ "service_description"})) {
push( @found, $datfile->{ "servicedowntime"}->[ $x]);
}
}
}
return \@found;
}
# Fine a particular service in a config (.cfg) file or the objects cache.
sub findService {
my $cfgfile = shift;
my $host = shift;
my $service = shift;
if( exists( $cfgfile->{ "service"})) {
for( my $x = 0; $x < @{ $cfgfile->{ "service"}}; $x++) {
if(( $cfgfile->{ "service"}->[ $x]->{ "host_name"} eq $host) &&
( $cfgfile->{ "service"}->[ $x]->{ "service_description"} eq $service)) {
return $cfgfile->{ "service"}->[ $x];
}
}
}
return undef;
}
# Sleep for a given amount of time displaying a message while sleeping
sub noisySleep {
my $length = shift;
my $message = shift;
print $message;
print " " if( $USESLEEPTICKS);
for( my $x = 0; $x < $length; $x++) {
if( $USESLEEPTICKS) {
printf( "\b%s", $SLEEPTICKS[ $x % scalar( @SLEEPTICKS)]);
}
else {
print ".";
}
sleep 1;
}
print "\b " if( $USESLEEPTICKS);
print "\n";
}
# Sleep until a given time displaying a message while sleeping
sub noisySleepUntilTime {
my $time = shift;
my $message = shift;
print $message;
print " " if( $USESLEEPTICKS);
while( time < $time) {
if( $USESLEEPTICKS) {
printf( "\b%s", $SLEEPTICKS[ time % scalar( @SLEEPTICKS)]);
}
else {
print ".";
}
sleep 1;
}
print "\b " if( $USESLEEPTICKS);
print "\n";
}
# Sleep until a specified file is updated, displaying a message while sleeping
sub noisySleepUntilFileUpdate {
my $file = shift; # File whose mtime should be watched
my $after = shift; # Time after which file must be updated
my $maxtime = shift; # Maximum number of seconds after $after to wait
my $message = shift; # Message to display
print $message;
print " " if( $USESLEEPTICKS);
while(( ! -f $file) && ( time < ( $after + $maxtime))) {
if( $USESLEEPTICKS) {
printf( "\b%s", $SLEEPTICKS[ time % scalar( @SLEEPTICKS)]);
}
else {
print ".";
}
sleep 1;
}
my ( $dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime,
$ctime, $blksize, $blocks) = stat( $file);
while(( $mtime <= $after) && ( time < ( $after + $maxtime))) {
if( $USESLEEPTICKS) {
printf( "\b%s", $SLEEPTICKS[ time % scalar( @SLEEPTICKS)]);
}
else {
print ".";
}
sleep 1;
( $dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime,
$ctime, $blksize, $blocks) = stat( $file);
}
print "\b " if( $USESLEEPTICKS);
print "\n";
if( time >= ( $after + $maxtime)) {
die "Maximum time exceeded waiting for $file to update\n";
}
}
# Verify that a downtime is recorded correctly in a .dat file
sub verifyDowntime {
my $service = shift;
my $cfg = shift;
my $downtimeFixed = shift;
my $downtimeStart = shift;
my $downtimeEnd = shift;
my $downtimeDuration = shift;
# Grab the status data
print statusString( 2, "Reading status data.\n");
my $datfile = readDatFile( $cfg->{ "status_file"});
# Find the downtimes
print statusString( 2, "Looking for downtimes.\n");
my $downtimes = findServiceDowntimes( $datfile, $service);
# Check to see if we found the correct number of downtimes (1)
print statusString( 2, "Verifying the downtime.\n");
if( scalar( @$downtimes) != 1) {
print STDERR "Incorrect number of downtimes found: " .
scalar( @downtimes) . ".\n";
return 0;
}
# Verify that the aspects of the downtime match
if( $downtimes->[ 0]->{ "fixed"} != $downtimeFixed) {
print STDERR sprintf( "Downtime type incorrect: should be %s, is %s.\n",
( $downtimes->[ 0]->{ "fixed"} ? "Fixed" : "Flexible"),
( $downtimeFixed ? "Fixed" : "Flexible"));
return 0;
}
if( $downtimes->[ 0]->{ "start_time"} != $downtimeStart) {
print STDERR sprintf( "Downtime start time incorrect: should be %lu, is %lu.\n",
$downtimes->[ 0]->{ "start_time"}, $downtimeStart);
return 0;
}
if( $downtimes->[ 0]->{ "end_time"} != $downtimeEnd) {
print STDERR sprintf( "Downtime end time incorrect: should be %lu, is %lu.\n",
$downtimes->[ 0]->{ "end_time"}, $downtimeEnd);
return 0;
}
if( ! $downtimeFixed && ( $downtimes->[ 0]->{ "duration"} !=
$downtimeDuration)) {
print STDERR sprintf( "Downtime duration incorrect: should be %lu, is %lu.\n",
$downtimes->[ 0]->{ "duration"}, $downtimeDuration);
return 0;
}
return 1;
}
sub recoverService {
my $service = shift;
my $cfg = shift;
my $message = shift;
my $nagiosRunning = shift;
my $versionMajor = shift;
print statusString( 1, sprintf( "%s\n", $message));
if( $service->{ "active_checks_enabled"}) {
if(( $service->{ "host_name"} eq "localhost") &&
( $service->{ "service_description"} eq "HTTP")) {
system( "/sbin/service httpd start");
}
elsif(( $service->{ "host_name"} eq "localhost") &&
( $service->{ "service_description"} eq "SSH")) {
system( "/sbin/service sshd start");
}
else {
die sprintf( "Don't know how to start service %s:%s",
$service->{ "host_name"},
$service->{ "service_description"});
}
}
else {
die "Nagios is not running" unless( $nagiosRunning);
sendPassiveResults( $service, 0, "OK - Everything Okey Dokey");
}
# If Nagios is running, sleep so an undetected recovery doesn't
# interfere with future operations
if( $nagiosRunning) {
if( $service->{ "active_checks_enabled"}) {
my $nextCheck = findServiceNextCheck( $cfg, $service);
noisySleepUntilTime( $nextCheck, statusString( 2,
sprintf( "Waiting for next check to run at %s",
humanTime( $nextCheck))));
}
if( $versionMajor < 4) {
noisySleep( $cfg->{ "check_result_reaper_frequency"} + 2,
statusString( 2, "Waiting for check results reaper to run"));
}
noisySleepUntilFileUpdate( $cfg->{ "status_file"}, time,
$cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"},
statusString( 2, "Waiting for status to update"));
}
return 1;
}
sub failService {
my $service = shift;
my $cfg = shift;
my $schedule = shift;
my $downtimeStart = shift;
my $nagiosRunning = shift;
my $versionMajor = shift;
# downtimeEndCheckIncrement is how much time we're going to add before the
# DowntimeEndCheck event due to a service check being delayed after the
# service fail command is sent.
my $downtimeEndCheckIncrement = 0;
print statusString( 1, "Sending service failure.\n");
if( $service->{ "active_checks_enabled"}) {
if(( $service->{ "host_name"} eq "localhost") &&
( $service->{ "service_description"} eq "HTTP")) {
system( "/sbin/service httpd stop");
}
elsif(( $service->{ "host_name"} eq "localhost") &&
( $service->{ "service_description"} eq "SSH")) {
system( "/sbin/service sshd stop");
}
else {
die sprintf( "Don't know how to stop service %s:%s",
$service->{ "host_name"},
$service->{ "service_description"});
}
}
else {
die "Nagios is not running" unless( $nagiosRunning);
sendPassiveResults( $service, 2, "CRITICAL - Danger Will Robinson");
}
if( $nagiosRunning) {
if( $service->{ "active_checks_enabled"}) {
my $nextCheck = findServiceNextCheck( $cfg, $service);
$downtimeEndCheckIncrement = $nextCheck - time -
maxDiscoveryTime( $cfg, $service, $versionMajor);
$downtimeEndCheckIncrement = 0 if( $downtimeEndCheckIncrement < 0);
noisySleepUntilTime( $nextCheck, statusString( 2,
sprintf( "Waiting for next check to run at %s",
humanTime( $nextCheck))));
}
if( $versionMajor < 4) {
noisySleep( $cfg->{ "check_result_reaper_frequency"} + 2,
statusString( 2, "Waiting for check results reaper to run"));
}
noisySleepUntilFileUpdate( $cfg->{ "status_file"}, time,
$cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"},
statusString( 2, "Waiting for status to update"));
if( time < $downtimeStart + $schedule->{ "DowntimeEnd"}) {
# We have not passed the end of the downtime waiting for
# a check that would trigger a downtime
print statusString( 2,
"Verifying that flexible downtime is in effect.\n");
my $datfile = readDatFile( $cfg->{ "status_file"});
my $downtimes = findServiceDowntimes( $datfile, $service);
if( scalar( @$downtimes) != 1) {
print STDERR "Incorrect number of downtimes found: " .
scalar( @$downtimes) . "\n";
return ( -1, undef);
}
if( ! $downtimes->[ 0]->{ "is_in_effect"}) {
print STDERR "Downtime is not in effect\n";
return ( -1, undef);
}
print statusString( 2,
sprintf( "The flexible downtime was triggered at %s.\n",
humanTime( $downtimes->[ 0]->{ "flex_downtime_start"})));
return ( $downtimeEndCheckIncrement, $downtimes->[ 0]);
}
else {
return ( $downtimeEndCheckIncrement, undef);
}
}
else {
return ( 0, undef);
}
}
sub checkStateAfterNagiosRestart {
my $cfg = shift;
my $service = shift;
my $schedule = shift;
my $downtimeStart = shift;
my $downtimeFixed = shift;
my $downtimeAfterServiceFail = shift;
my $downtimesAtNagiosStop = shift;
# If a flexible downtime was triggered and completed before the
# Nagios restart, we have nothing to do.
my $durationEndBeforeNagiosStop = ( ! $downtimeFixed &&
( $schedule->{ "ServiceFail"} > 0) &&
( $schedule->{ "NagiosStop"} > 0) &&
( ref( $downtimeAfterServiceFail) eq "HASH") &&
( $downtimeAfterServiceFail->{ "flex_downtime_start"} > 0) &&
( $downtimeAfterServiceFail->{ "flex_downtime_start"} +
$schedule->{ "DowntimeDuration"} < $schedule->{ "NagiosStart"} +
$downtimeStart));
print statusString( 3,
sprintf( "Downtime duration %s before Nagios was restarted\n",
( $durationEndBeforeNagiosStop ? "ended" : "did not end")));
return 0 if ( $durationEndBeforeNagiosStop);
# If the service failed AND recovered while Nagios was down, a flexible
# downtime would never be trigged.
my $serviceCycleWhileNagiosDown = ( ! $downtimeFixed &&
( $schedule->{ "ServiceFail"} > 0) &&
( $schedule->{ "NagiosStop"} > 0) &&
( $schedule->{ "NagiosStop"} < $schedule->{ "ServiceFail"}) &&
( $schedule->{ "ServiceRecover"} < $schedule->{ "NagiosStart"}));
print statusString( 3, sprintf( "Service %s while Nagios was down\n",
( $serviceCycleWhileNagiosDown ? "cycled" : "did not cycle")));
return 0 if( $serviceCycleWhileNagiosDown);
# downtimeEndCheckIncrement is how much time we're going to add before the
# DowntimeEndCheck event due to the downtime not being triggered until
# after a Nagios restart, if that even happens
my $downtimeEndCheckIncrement = 0;
# Determine whether the Nagios restart occurred after the end of the
# downtime.
my $nagiosRestartAfterDowntimeEnd = ( $schedule->{ "NagiosStart"} >
$schedule->{ "DowntimeEnd"});
# Use the above information and the schedule to determine whether a
# flexible downtime should have been triggered or should yet be triggered.
my $downtimeTriggeredBeforeNagiosStop = 0;
my $downtimeTriggeredAfterNagiosStart = 0;
if( ! $downtimeFixed) {
if(( $schedule->{ "ServiceFail"} > 0) &&
( $schedule->{ "ServiceFail"} < $schedule->{ "NagiosStop"})) {
$downtimeTriggeredBeforeNagiosStop = 1;
print statusString( 3, "Downtime should have been triggered " .
"before Nagios stopped\n");
}
else {
print statusString( 3, "Downtime would not have been triggered " .
"before Nagios stopped\n");
}
if( ! $nagiosRestartAfterDowntimeEnd &&
( $schedule->{ "ServiceFail"} > 0 ) &&
( $schedule->{ "NagiosStop"} < $schedule->{ "ServiceFail"}) &&
( $schedule->{ "ServiceFail"} < $schedule->{ "NagiosStart"})) {
$downtimeTriggeredAfterNagiosStart = 1;
print statusString( 3, "Downtime should be triggered after " .
"Nagios start\n");
}
else {
print statusString( 3, "Downtime would not be triggered after " .
"Nagios start\n");
}
}
# If the downtime should have been triggered, but was not detected prior
# to the Nagios stop, wait for it to be detected.
if( $downtimeTriggeredAfterNagiosStart) {
if( $service->{ "active_checks_enabled"}) {
my $nextCheck = findServiceNextCheck( $cfg, $service);
$downtimeEndCheckIncrement = ( $nextCheck - time) +
( $schedule->{ "NagiosStart"} -
$schedule->{ "ServiceFail"});
noisySleepUntilTime( $nextCheck, statusString( 1,
sprintf( "Waiting for next check to run at %s",
humanTime( $nextCheck))));
}
if( time > $downtimeStart + $schedule->{ "DowntimeEnd"}) {
# We passed the end of the downtime waiting for a check that
# would trigger a downtime
print statusString( 2,
"Downtime ended while waiting for next check.\n");
return 0;
}
}
# Next determine whether the downtime should still exist
my $downtimeShouldExist = 0;
if( $downtimeFixed) {
if( time < $downtimeStart + $schedule->{ "DowntimeEnd"}) {
# If the downtime is fixed and we have not reached the
# downtime end, it should still exists so check it below
$downtimeShouldExist = 1;
}
}
else {
if( ! $downtimeTriggeredBeforeNagiosStop &&
! $downtimeTriggeredAfterNagiosStart) {
if( time < $downtimeStart + $schedule->{ "DowntimeEnd"}) {
# If the downtime is flexible, it was never triggered
# and we have not reached the downtime end, it should
# still exist so check it below
print statusString( 2, "We have not reached the downtime " .
"end, so it should still exist.\n");
$downtimeShouldExist = 1;
}
}
else {
# In this case the downtime is flexible and should have been
# triggered.
print statusString( 3,
sprintf( "There were %d downtime(s) when Nagios stopped.\n",
scalar( @$downtimesAtNagiosStop)));
if( scalar( @$downtimesAtNagiosStop) == 1) {
print statusString( 3,
sprintf( "The flex downtime start for the downtime " .
"existing when\nNagios stopped was %s\n",
humanTime( $downtimesAtNagiosStop->[ 0]->{ "flex_downtime_start"})));
}
if(( scalar( @$downtimesAtNagiosStop) == 1) &&
( $downtimesAtNagiosStop->[ 0]->{ "flex_downtime_start"} > 0)) {
print statusString( 2,
"The downtime was triggered before Nagios stopped.\n");
if( time <
( $downtimesAtNagiosStop->[ 0]->{ "flex_downtime_start"} +
$downtimesAtNagiosStop->[ 0]->{ "duration"})) {
# If the trigger was detected before Nagios was stopped and
# we are still in the flexible downtime, it should exist
print statusString( 2, "We have not reached the duration " .
"end; it should still exist.\n");
$downtimeShouldExist = 1;
}
else {
print statusString( 2, "We have reached the duration " .
"end; it should not exist.\n");
}
}
elsif( time < ( $downtimeStart + $schedule->{ "NagiosStart"} +
$schedule->{ "DowntimeDuration"})) {
# If the downtime was not triggered until after Nagios restarted
# and we have not reached the duration end, it should still
# exist so check it below
print statusString( 2, "We have not reached the duration " .
"end; it should still exist.\n");
$downtimeShouldExist = 1;
}
}
}
# If the downtime should exist, verify the downtime is back. If it should
# not exist, verification that the downtime is gone happens at the
# DowntimeEndCheck event.
if( $downtimeShouldExist) {
print statusString( 1, "Verifying downtime after Nagios restart.\n");
if( !verifyDowntime( $service, $cfg, $downtimeFixed,
$downtimeStart,
$downtimeStart + ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"}),
$schedule->{ "DowntimeDuration"})) {
return -1;
}
}
else {
print statusString( 1,
"The downtime should have ended; it will be checked later.\n");
}
return $downtimeEndCheckIncrement;
}
# Perform the check of a downtime sequence
sub checkDowntime {
my $testComment = shift;
my $service = shift;
my $cfg = shift;
my $downtimeFixed = shift;
my $schedule = shift;
my $versionMajor = shift;
my $startTime = time;
my $datfile;
my $nagiosRunning = 1;
my $downtimesAtNagiosStop;
my $downtimeAfterServiceFail;
my $downtimeEndCheckIncrement = 0;
print "$testComment\n";
# Verify parameters
die "No downtime start specified" unless( exists( $schedule->{ "DowntimeStart"}));
die "No downtime end specified" unless( $schedule->{ "DowntimeEnd"} >= 0);
die "No downtime duration specified" unless( $downtimeFixed ||
$schedule->{ "DowntimeDuration"} >= 0);
# Determine the amount of time to discover a service failure above
# the normal check results reaper frequency
my $maxDiscoveryTime = maxDiscoveryTime( $cfg, $service, $versionMajor);
# Create an events list
my @events;
my $eventTime;
# Figure out the time for the check at the end of the downtime
if( ! $downtimeFixed && $schedule->{ "ServiceFail"} > 0) {
if( $schedule->{ "NagiosStart"} > $schedule->{ "ServiceFail"} +
$schedule->{ "DowntimeDuration"} + $maxDiscoveryTime) {
$eventTime = $schedule->{ "NagiosStart"} + 1;
}
else {
$eventTime = $schedule->{ "ServiceFail"} +
$schedule->{ "DowntimeDuration"};
if( $service->{ "active_checks_enabled"}) {
$eventTime += $maxDiscoveryTime;
}
}
}
else {
if( $schedule->{ "NagiosStart"} > ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"})) {
$eventTime = $schedule->{ "NagiosStart"} + 1;
}
else {
$eventTime = ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"});
}
}
push( @events, { "time" => $eventTime + $cfg->{ "status_update_interval"},
"type" => "DowntimeEndCheck"});
# Add the Nagios stop and start events if they're in the permutation
if( $schedule->{ "NagiosStop"} > 0) {
push( @events, { "time" => $schedule->{ "NagiosStop"},
"type" => "NagiosStop"});
if( $schedule->{ "NagiosStart"} > 0) {
push( @events, { "time" => $schedule->{ "NagiosStart"},
"type" => "NagiosStart"});
}
else {
die "No Nagios restart time specified";
}
}
# Add the service failure and recover events if they're in the permutation
if( $schedule->{ "ServiceFail"} > 0) {
push( @events, { "time" => $schedule->{ "ServiceFail"},
"type" => "ServiceFail"});
if( $schedule->{ "ServiceRecover"} > 0) {
push( @events, { "time" => $schedule->{ "ServiceRecover"},
"type" => "ServiceRecover"});
}
else {
die "No service recovery time specified";
}
}
# Sort the events in chronological order
my @sortedEvents = sort { $a->{ "time"} <=> $b->{ "time"}} @events;
# Make sure the service starts in a good state
if( ! recoverService( $service, $cfg,
"Making sure the service is in a good state.", $nagiosRunning,
$versionMajor)) {
return -1;
}
# Cancel all current downtimes
print statusString( 1, "Deleting current downtimes: ");
$datfile = readDatFile( $cfg->{ "status_file"});
$downtimes = findServiceDowntimes( $datfile, $service);
for( my $x = 0; $x < @$downtimes; $x++) {
print ", " if( $x > 0);
print $downtimes->[ $x]->{ "downtime_id"};
deleteDowntime( $downtimes->[ $x]->{ "downtime_id"}, $cfg, $service);
}
print "\n";
# Schedule the downtime
my $downtimeStart = time + $schedule->{ "DowntimeStart"};
if( $downtimeFixed) {
print statusString( 1,
sprintf( "Scheduling fixed downtime\n\t\tfrom %s to %s.\n",
humanTime( $downtimeStart),
humanTime( $downtimeStart + ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"}))));
}
else {
print statusString( 1, sprintf( "Scheduling flexible downtime of " .
"%d seconds\n\t\tfrom %s to %s.\n",
$schedule->{ "DowntimeDuration"}, humanTime( $downtimeStart),
humanTime( $downtimeStart + ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"}))));
}
scheduleDowntime( $service, $downtimeStart,
$downtimeStart + ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"}), $downtimeFixed,
$schedule->{ "DowntimeDuration"}, $testComment);
# Wait for the status file to update
noisySleepUntilFileUpdate( $cfg->{ "status_file"}, time,
$cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"},
statusString( 2, "Waiting for status to update"));
# Verify the downtime
if( !verifyDowntime( $service, $cfg, $downtimeFixed, $downtimeStart,
$downtimeStart + ( $schedule->{ "DowntimeEnd"} -
$schedule->{ "DowntimeStart"}),
$schedule->{ "DowntimeDuration"})) {
return -1;
}
# Enter the event loop
my $eventIndex = 0;
while( $eventIndex < @sortedEvents) {
# Sleep until the next event is to to be executed
if( time < ( $downtimeStart + $sortedEvents[ $eventIndex]->{ "time"})) {
my $message = statusString( 1,
sprintf( "Waiting for %s event at %s",
$sortedEvents[ $eventIndex]->{ "type"},
humanTime( $downtimeStart +
$sortedEvents[ $eventIndex]->{ "time"})));
noisySleepUntilTime(( $downtimeStart +
$sortedEvents[ $eventIndex]->{ "time"}), $message);
}
# Verify the downtime is gone at the end of the check
if( $sortedEvents[ $eventIndex]->{ "type"} eq "DowntimeEndCheck") {
noisySleepUntilFileUpdate( $cfg->{ "status_file"}, time,
$cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"},
statusString( 2, "Waiting for status to update"));
# Grab the status data again
print statusString( 2, "Rereading status data.\n");
$datfile = readDatFile( $cfg->{ "status_file"});
# Find the downtimes
print statusString( 2, "Looking for downtimes again.\n");
$downtimes = findServiceDowntimes( $datfile, $service);
print statusString( 1, "Verifying the downtime has ended.\n");
if( scalar( @$downtimes) > 0) {
for( my $y = 0; $y < @$downtimes; $y++) {
if( $downtimes->[ $y]->{ "is_in_effect"}) {
print STDERR "Downtime still exists.\n";
return -1;
}
}
}
}
# Restart Nagios
elsif( $sortedEvents[ $eventIndex]->{ "type"} eq "NagiosStart") {
print statusString( 1, "Starting Nagios.\n");
system( "/sbin/service nagios start");
system( "/bin/chgrp nagcmd /usr/local/nagios/var/rw/nagios.cmd");
# Wait for the status file to update
noisySleepUntilFileUpdate( $cfg->{ "status_file"}, time,
$cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"},
statusString( 2, "Waiting for status to update"));
# Check the status of things after the Nagios restart
$downtimeEndCheckIncrement = checkStateAfterNagiosRestart( $cfg,
$service, $schedule, $downtimeStart, $downtimeFixed,
$downtimeAfterServiceFail, $downtimesAtNagiosStop);
if( $downtimeEndCheckIncrement < 0) {
return -1;
}
elsif( $downtimeEndCheckIncrement > 0) {
print statusString( 3,
sprintf( "Extending the DowntimeEndCheck time by %d " .
"seconds\n", $downtimeEndCheckIncrement));
$sortedEvents[ @sortedEvents - 1]->{ "time"} +=
$downtimeEndCheckIncrement;
}
$nagiosRunning = 1;
}
# Stop Nagios
elsif( $sortedEvents[ $eventIndex]->{ "type"} eq "NagiosStop") {
$datfile = readDatFile( $cfg->{ "status_file"});
$downtimesAtNagiosStop = findServiceDowntimes( $datfile, $service);
print statusString( 3, sprintf(
"There are %d downtime(s) just before stopping Nagios.\n",
scalar( @$downtimesAtNagiosStop)));
print statusString( 1, "Stopping Nagios.\n");
system( "/sbin/service nagios stop");
$nagiosRunning = 0;
}
# Cause the service to fail
elsif( $sortedEvents[ $eventIndex]->{ "type"} eq "ServiceFail") {
( $downtimeEndCheckIncrement, $downtimeAfterServiceFail) =
failService( $service, $cfg, $schedule, $downtimeStart,
$nagiosRunning, $versionMajor);
if( $downtimeEndCheckIncrement < 0) {
return -1;
}
elsif( $downtimeEndCheckIncrement > 0) {
print statusString( 3,
sprintf( "Extending the DowntimeEndCheck time by %d " .
"seconds\n", $downtimeEndCheckIncrement));
$sortedEvents[ @sortedEvents - 1]->{ "time"} +=
$downtimeEndCheckIncrement;
}
}
# Cause the service to recover
elsif( $sortedEvents[ $eventIndex]->{ "type"} eq "ServiceRecover") {
if( !recoverService( $service, $cfg, "Sending service recovery.",
$nagiosRunning, $versionMajor)) {
return -1;
}
}
# Something is dreadfully wrong
else {
die "Unhandled event type: " . $sortedEvents[ $eventIndex]->{ "type"};
}
$eventIndex++;
}
return( time - $startTime);
}
# Determine whether a event permutation is valid
sub isValidEventPermutation {
my $permutation = shift;
my $activeChecks = shift;
# Create a hash of the events with the event name as the key and the order
# as the value.
my %permutation;
for( my $x = 0; $x < @$permutation; $x++) {
$permutation{ $permutation->[ $x]} = $x;
}
# The downtime cannot be created after it ends
if( exists( $permutation{ "DowntimeEnd"}) &&
exists( $permutation{ "EntryTime"}) &&
( $permutation{ "DowntimeEnd"} < $permutation{ "EntryTime"})) {
return 0;
}
# The downtime cannot end before it starts
if( exists( $permutation{ "DowntimeEnd"}) &&
exists( $permutation{ "DowntimeStart"}) &&
( $permutation{ "DowntimeEnd"} < $permutation{ "DowntimeStart"})) {
return 0;
}
# The service cannot recover before it fails
if( exists( $permutation{ "ServiceFail"}) &&
exists( $permutation{ "ServiceRecover"}) &&
( $permutation{ "ServiceRecover"} < $permutation{ "ServiceFail"})) {
return 0;
}
# Nagios cannot be restarted before it is initially stopped
if( exists( $permutation{ "NagiosStop"}) &&
exists( $permutation{ "NagiosStart"}) &&
( $permutation{ "NagiosStart"} < $permutation{ "NagiosStop"})) {
return 0;
}
# It doesn't make sense to test when a service failure occurs after the
# end of the downtime
if( exists( $permutation{ "ServiceFail"}) &&
exists( $permutation{ "DowntimeEnd"}) &&
( $permutation{ "ServiceFail"} > $permutation{ "DowntimeEnd"})) {
return 0;
}
# It doesn't make sense to test when a service failure occurs before the
# downtime is created
if( exists( $permutation{ "ServiceFail"}) &&
exists( $permutation{ "EntryTime"}) &&
( $permutation{ "ServiceFail"} < $permutation{ "EntryTime"})) {
return 0;
}
# It doesn't make sense to test when a service failure occurs before the
# downtime starts
if( exists( $permutation{ "ServiceFail"}) &&
exists( $permutation{ "DowntimeStart"}) &&
( $permutation{ "ServiceFail"} < $permutation{ "DowntimeStart"})) {
return 0;
}
# It doesn't make sense to test when a Nagios shutdown occurs after the
# end of the downtime
if( exists( $permutation{ "NagiosStop"}) &&
exists( $permutation{ "DowntimeEnd"}) &&
( $permutation{ "NagiosStop"} > $permutation{ "DowntimeEnd"})) {
return 0;
}
# A downtime cannot be created when Nagios is shutdown and it doesn't
# make sense to test when Nagios is shutdown and restarted before the
# downtime is created
if( exists( $permutation{ "NagiosStop"}) &&
exists( $permutation{ "EntryTime"}) &&
( $permutation{ "NagiosStop"} < $permutation{ "EntryTime"})) {
return 0;
}
# It doesn't make sense for the duration to be so short it ends before
# any real testing occurs
if( exists( $permutation{ "DurationEnd"}) &&
( $permutation{ "DurationEnd"} < 1)) {
return 0;
}
# The following only applies to passive checks
unless( $activeChecks) {
# A passive service failure cannot be sent to Nagios when it is not
# running
if( exists( $permutation{ "ServiceFail"}) &&
exists( $permutation{ "NagiosStop"}) &&
exists( $permutation{ "NagiosStart"}) &&
( $permutation{ "ServiceFail"} > $permutation{ "NagiosStop"}) &&
( $permutation{ "ServiceFail"} < $permutation{ "NagiosStart"})) {
return 0;
}
# A passive service recovery cannot be sent to Nagios when it is not
# running
if( exists( $permutation{ "ServiceRecover"}) &&
exists( $permutation{ "NagiosStop"}) &&
exists( $permutation{ "NagiosStart"}) &&
( $permutation{ "ServiceRecover"} > $permutation{ "NagiosStop"}) &&
( $permutation{ "ServiceRecover"} < $permutation{ "NagiosStart"})) {
return 0;
}
}
return 1;
}
# Display a time in human-readable format
sub humanTime {
my $epoch = shift;
my $longFormat = 0;
$longFormat = shift if( @_);
my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) =
localtime( $epoch);
if( $longFormat) {
return sprintf( "%04d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1,
$mday, $hour, $min, $sec);
}
else {
return sprintf( "%02d:%02d:%02d", $hour, $min, $sec);
}
}
# Display a time duration in human-readable format
sub humanDuration {
my $duration = shift;
my $wordyFormat = 0;
$wordyFormat = shift if( @_);
my ( $durationLeft, $hours, $minutes, $seconds);
$durationLeft = $duration;
$hours = int( $durationLeft / ( 60 * 60));
$durationLeft -= $hours * 60 * 60;
$minutes = int( $durationLeft / 60);
$durationLeft -= $minutes * 60;
$seconds = $durationLeft;
if( $wordyFormat) {
if( $hours > 0) {
return sprintf( "%d hour%s, %d minute%s, %d second%s",
$hours, ( $hours == 1 ? "" : "s"),
$minutes, ( $minutes == 1 ? "" : "s"),
$seconds, ( $seconds == 1 ? "" : "s"));
}
elsif( $minutes > 0) {
return sprintf( "%d minute%s, %d second%s",
$minutes, ( $minutes == 1 ? "" : "s"),
$seconds, ( $seconds == 1 ? "" : "s"));
}
else {
return sprintf( "%d second%s", $duration,
( $duration == 1 ? "" : "s"));
}
}
else {
return sprintf( "%02d:%02d:%02d (%ds)", $hours, $minutes, $seconds,
$duration);
}
}
# Schedule a downtime series' event times
sub scheduleDowntimePermutation {
my $service = shift;
my $cfg = shift;
my $permutation = shift;
my $versionMajor = shift;
# Initialize times
my %schedule;
$schedule{ "DowntimeStart"} = -1;
$schedule{ "DowntimeEnd"} = -1;
$schedule{ "DowntimeDuration"} = -1;
$schedule{ "NagiosStop"} = -1;
$schedule{ "NagiosStart"} = -1;
$schedule{ "ServiceFail"} = -1;
$schedule{ "ServiceRecover"} = -1;
# Determine the amount of time to discover a service failure above
# the normal check results reaper frequency
my $maxDiscoveryTime = maxDiscoveryTime( $cfg, $service, $versionMajor);
# Get the current time, to be used for all future times
my $currentTime = time;
# Last event time
my $lastEventTime;
if( $permutation->[ 0] eq "EntryTime") {
$schedule{ "DowntimeStart"} = 10;
$lastEventTime = $schedule{ "DowntimeStart"};
}
elsif( $permutation->[ 0] eq "DowntimeStart") {
$schedule{ "DowntimeStart"} = -10;
$lastEventTime = $schedule{ "DowntimeStart"} +
( $versionMajor < 4 ?
$cfg->{ "check_result_reaper_frequency"} : 0) + 2 ;
}
else {
die "Unsupported permutation order: " . $permutation->[ 0] . " => " .
$permutation->[ 1];
}
print join( " => ", @$permutation) . "\n";
printf( " Downtime Start: %s\n", $schedule{ "DowntimeStart"});
# Iterate through the remaining events, determining the time for each
for( my $x = 0; $x < @$permutation; $x++) {
if( $permutation->[ $x] eq "DowntimeEnd") {
$schedule{ "DowntimeEnd"} = $lastEventTime +
( $cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"});
$lastEventTime = $schedule{ "DowntimeEnd"};
printf( " Downtime End: %s\n", $schedule{ "DowntimeEnd"});
}
elsif( $permutation->[ $x] eq "DowntimeStart") {
; # Do nothing - Downtime Start was determined earlier
}
elsif( $permutation->[ $x] eq "DurationEnd") {
$schedule{ "DowntimeDuration"} = ( $lastEventTime -
$schedule{ "DowntimeStart"}) +
( $cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"} +
$maxDiscoveryTime);
$lastEventTime = $schedule{ "DowntimeDuration"} +
$schedule{ "DowntimeStart"};
printf( " Duration: %d\n", $schedule{ "DowntimeDuration"});
}
elsif( $permutation->[ $x] eq "EntryTime") {
; # Do nothing
}
elsif( $permutation->[ $x] eq "NagiosStop") {
$schedule{ "NagiosStop"} = $lastEventTime +
( $cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"});
$lastEventTime = $schedule{ "NagiosStop"};
printf( " Nagios Stop: %s\n", $schedule{ "NagiosStop"});
}
elsif( $permutation->[ $x] eq "NagiosStart") {
$schedule{ "NagiosStart"} = $lastEventTime +
( $cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"});
$lastEventTime = $schedule{ "NagiosStart"};
printf( " Nagios Start: %s\n", $schedule{ "NagiosStart"});
}
elsif( $permutation->[ $x] eq "ServiceFail") {
$schedule{ "ServiceFail"} = $lastEventTime +
( $cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"});
$lastEventTime = $schedule{ "ServiceFail"} +
( $versionMajor < 4 ?
$cfg->{ "check_result_reaper_frequency"} : 0) +
$maxDiscoveryTime;
printf( " Service Fail: %s\n", $schedule{ "ServiceFail"});
}
elsif( $permutation->[ $x] eq "ServiceRecover") {
$schedule{ "ServiceRecover"} = $lastEventTime +
( $cfg->{ "status_update_interval"} *
$cfg->{ "StatusUpdateIntervalMultiplier"});
$lastEventTime = $schedule{ "ServiceRecover"} +
( $versionMajor < 4 ?
$cfg->{ "check_result_reaper_frequency"} : 0) +
$maxDiscoveryTime;
printf( " Service Recover: %s\n", $schedule{ "ServiceRecover"});
}
else {
die "Unhandled event type: " . $permutation->[ $x];
}
}
return \%schedule;
}
# Generate a list of valid permutations for fixed downtime events
sub fixedDowntimeTestPermutations {
my $service = shift;
my @allPermutations;
my @noRestart = ( "EntryTime", "DowntimeStart", "DowntimeEnd");
my $iteratorNoRestart= Algorithm::Permute->new ( \@noRestart);
while (my @perm1 = $iteratorNoRestart->next) {
if( isValidEventPermutation( \@perm1,
$service->{ "active_checks_enabled"})) {
# print "(@perm1)\n";
push( @allPermutations, \@perm1);
}
}
my @withRestart = ( "EntryTime", "DowntimeStart", "NagiosStop",
"NagiosStart", "DowntimeEnd");
my $iteratorWithRestart =
Algorithm::Permute->new ( \@withRestart);
while (my @perm2 = $iteratorWithRestart->next) {
if( isValidEventPermutation( \@perm2,
$service->{ "active_checks_enabled"})) {
# print "(@perm2)\n";
push( @allPermutations, \@perm2);
}
}
return \@allPermutations;
}
# Generate a list of valid permutations for flexible downtime events
sub flexibleDowntimeTestPermutations {
my $service = shift;
my @allPermutations;
my @noRestartOrTrigger = ( "EntryTime", "DowntimeStart", "DurationEnd",
"DowntimeEnd");
my $iteratorNoRestartOrTrigger =
Algorithm::Permute->new ( \@noRestartOrTrigger );
while (my @perm1 = $iteratorNoRestartOrTrigger->next) {
if( isValidEventPermutation( \@perm1,
$service->{ "active_checks_enabled"})) {
push( @allPermutations, \@perm1);
}
}
my @noRestart = ( "EntryTime", "DowntimeStart", "DurationEnd",
"DowntimeEnd", "ServiceFail", "ServiceRecover");
my $iteratorNoRestart = Algorithm::Permute->new ( \@noRestart );
while (my @perm2 = $iteratorNoRestart->next) {
if( isValidEventPermutation( \@perm2,
$service->{ "active_checks_enabled"})) {
push( @allPermutations, \@perm2);
}
}
my @noTrigger = ( "EntryTime", "DowntimeStart", "DurationEnd",
"DowntimeEnd", "NagiosStop", "NagiosStart");
my $iteratorNoTrigger = Algorithm::Permute->new ( \@noTrigger );
while (my @perm3 = $iteratorNoTrigger->next) {
if( isValidEventPermutation( \@perm3,
$service->{ "active_checks_enabled"})) {
push( @allPermutations, \@perm3);
}
}
my @allEvents = ( "EntryTime", "DowntimeStart", "DurationEnd",
"DowntimeEnd", "NagiosStop", "NagiosStart", "ServiceFail",
"ServiceRecover");
my $iteratorAllEventns = Algorithm::Permute->new ( \@allEvents );
while (my @perm4 = $iteratorAllEventns->next) {
if( isValidEventPermutation( \@perm4,
$service->{ "active_checks_enabled"})) {
push( @allPermutations, \@perm4);
}
}
return \@allPermutations;
}
# This function verifies the global Nagios configuration
sub verifyConfig {
my $cfg = shift;
my $versionMajor = shift;
# Verify the status update interval
if( $cfg->{ "status_update_interval"} < 5) {
print STDERR sprintf( "status_update_interval (%d) is too short\n",
$cfg->{ "status_update_interval"});
return 0;
}
if( $cfg->{ "status_update_interval"} >= 10) {
print sprintf( "The status_update_interval (%d) is long. " .
"It should optimally be 5 seconds.\n",
$cfg->{ "status_update_interval"});
print "Press ENTER to continue...";
<STDIN>;
}
if( $versionMajor < 4) {
# Verify the check_result_reaper_frequency
if( $cfg->{ "check_result_reaper_frequency"} < 5) {
print STDERR sprintf( "check_result_reaper_frequency (%d) is too " .
"short\n", $cfg->{ "check_result_reaper_frequency"});
return 0;
}
if( $cfg->{ "check_result_reaper_frequency"} >= 10) {
print sprintf( "The check_result_reaper_frequency (%d) is long. " .
"It should optimally be 5 seconds.\n",
$cfg->{ "check_result_reaper_frequency"});
print "Press ENTER to continue...";
<STDIN>;
}
}
return 1;
}
sub verifyService {
my $cfg = shift;
my $hostName = shift;
my $serviceDescription = shift;
# Make sure there is an appropriately configured service before we
# start testing.
my $objects = readObjectCache( $cfg->{ "object_cache_file"});
my $service = findService( $objects, $hostName, $serviceDescription);
if( !defined( $service)) {
print STDERR sprintf( "No service found for host '%s' and " .
"service description '%s'.\n", $hostName, $serviceDescription);
return undef;
}
if( !( $service->{ "active_checks_enabled"} ||
$service->{ "passive_checks_enabled"})) {
print STDERR sprintf( "Either active or passive checks must be " .
"enabled for %s:%s\n", $hostName, $serviceDescription);
return undef;
}
if( $service->{ "notifications_enabled"}) {
print STDERR sprintf( "Notifications must not be enabled for %s:%s\n",
$hostName, $serviceDescription);
return undef;
}
if( $service->{ "flap_detection_enabled"}) {
print STDERR sprintf( "Flap detection must not be enabled for %s:%s\n",
$hostName, $serviceDescription);
return undef;
}
if( $service->{ "process_perf_data"}) {
print STDERR sprintf( "Performance data processing must not be " .
"enabled for %s:%s\n", $hostName, $serviceDescription);
return undef;
}
if( $service->{ "active_checks_enabled"}) {
if( $service->{ "max_check_attempts"} > 1) {
print sprintf( "The max_check_attempts (%d) is high. " .
"It should optimally be 1.\n",
$cfg->{ "max_check_attempts"});
print "Press ENTER to continue...";
<STDIN>;
}
if( $service->{ "check_interval"} > 1) {
print sprintf( "The normal_check_interval (%d) is long. " .
"It should optimally be 1 minute.\n",
$cfg->{ "normal_interval"});
print "Press ENTER to continue...";
<STDIN>;
}
if(( $service->{ "max_check_attempts"} > 1) &&
( $service->{ "retry_interval"} > 1)) {
print sprintf( "The retry_check_interval (%d) is long. " .
"It should optimally be 1 minute.\n",
$cfg->{ "retry_interval"});
print "Press ENTER to continue...";
<STDIN>;
}
}
else {
if( $service->{ "check_freshness"}) {
print STDERR sprintf( "check_freshness must not be enabled " .
"for %s:%s\n", $hostName, $serviceDescription);
return undef;
}
if( $service->{ "max_check_attempts"} != 1) {
print STDERR sprintf( "max_check_attempts must be 1 for %s:%s\n",
$hostName, $serviceDescription);
return undef;
}
}
return $service;
}
sub getVersion {
my $cfg = shift;
my $status = readDatFile( $cfg->{ "status_file"});
my @versionparts = split( /\./, $status->{ "info"}->{ "version"});
my %version;
$version{ "major"} = $versionparts[ 0];
$version{ "minor"} = $versionparts[ 1];
$version{ "micro"} = $versionparts[ 2];
return \%version;
}
sub isInArray {
my $string = shift;
my $array = shift;
for( my $x = 0; $x < @$array; $x++) {
return 1 if( $array->[ $x] eq $string);
}
return 0;
}
# This is the SIGUSR1 handler - it sets a flag that causes the test to
# pause.
sub pauseTest {
$testPaused = 1;
}
sub usage {
my $cmd = shift;
print <<EOU;
This command runs one or more downtime checks. With only the required
options, the test will run all appropriate permutations of downtime
event sequences (see --single-run below for a list of events).
The service checked must be an existing service and can be either an
actively or passively checked service. Note that this test can run a
very long time, over 24 hours for active checks.
Sending the program the USR1 single will cause it to pause after the
current permutation has completed testing.
Usage: $cmd --host-name <host> --service-description <service>
[--single-run <test>] [--no-fixed-tests] [--no-flexible-tests]
[--skip-logged <logfile>] [--help|-?]
Where:
--host-name <host> specifies the name of the host whose service
is to be used for the downtime check.
--service-description <service> specifies the name of the service
to be used for the downtime check.
--single-run <test> indicates that the single specified test is
to be run. <test> must be of the form "(Fixed|Flexible)
downtime: <eventlist>" <eventlist> is a comma and space
separated list of the following events: EntryTime,
DowntimeStart, [NagiosStop, NagiosStart,] [ServiceFail,
ServiceRecover,] DurationEnd, DowntimeEnd. The events within
the brackets are optional, but if one of the pair is used,
both must be used. Hint: the format of <test> is the same as
is displayed at the beginning of each downtime check when
--single-run is not specified.
--no-fixed-tests indicates that no fixed tests should be run.
This option is ignored is --single-run is specified.
--no-flexible-tests indicates that no flexible tests should be
run. This option is ignored is --single-run is specified.
--skip-logged <logfile> indicates that any tests logged in
<logfile> should be skipped. This is useful for start over
where you left off if the test had been aborted prematurely.
--help|-? displays this help message.
EOU
}
# Host and service to be used for testing
my $hostName = "localhost";
my $serviceDescription = "Downtime Test";
my $singleRun = undef;
my $logFile = undef;
my $noFixedTests = 0;
my $noFlexibleTests = 0;
my $showUsage = 0;
$result = GetOptions ( "host-name=s" => \$hostName,
"service-description=s" => \$serviceDescription,
"single-run=s" => \$singleRun,
"no-fixed-tests" => \$noFixedTests,
"no-flexible-tests" => \$noFlexibleTests,
"skip-logged=s" => \$logFile,
"help|?" => \$showUsage);
die "Error processing command line options" unless( $result);
if( $showUsage) {
usage( $PROGRAM_NAME);
exit( 0);
}
# Install signal handle to enable test pausing
$SIG{ "USR1"} = "pauseTest";
# Make sure we're running as root
if( $EUID != 0) {
print STDERR "This script must be run as root.\n";
exit 1;
}
# Make sure Nagios is running
my $status = `/sbin/service nagios status`;
if( $status !~ /nagios \(pid( \d+)+\) is running.../) {
print STDERR "Nagios must be running in order to run this test.\n";
exit 1;
}
# Read the Nagios configuration
my $cfg = readCfgFile( "/usr/local/nagios/etc/nagios.cfg");
exit 1 unless( defined( $cfg));
# Get the version
my $version = getVersion( $cfg);
printf( "Nagios version is %d.%d.%d\n", $version->{ "major"},
$version->{ "minor"}, $version->{ "micro"});
# Verify the global configuration
unless( verifyConfig( $cfg, $version->{ "major"}) > 0) {
exit 1;
}
$cfg->{ "StatusUpdateIntervalMultiplier"} = 4;
# Verify the service configuration
my $service = verifyService( $cfg, $hostName, $serviceDescription);
exit 1 unless( defined( $service));
# Enable autoflush on STDOUT so updates occur
my $old_fh = select(STDOUT);
$| = 1;
select($old_fh);
my $activeChecks = 0;
my $permutationTime = 0;
my $elapsedTime = 0;
my $schedule;
my $comment;
#my @testPerm = ( "DowntimeStart", "DurationEnd", "EntryTime", "ServiceFail", "NagiosStop", "DowntimeEnd", "NagiosStart", "ServiceRecover");
#my $testSchedule = scheduleDowntimePermutation( $service, $cfg, \@testPerm,
# $version->{ "major"});
#my $testComment = join( ", ", @testPerm);
#checkDowntime( "Flexible downtime: $testComment", $service, $cfg, 0,
# $testSchedule, $version->{ "major"});
#exit( 0);
if( defined( $singleRun)) {
if( $singleRun =~ /^(Fixed|Flexible) downtime: (.*)/) {
my $downtimeType = $1;
my @singleRunPerm = split( /, /, $2);
my $fixed = (( $singleRun =~ /^Fixed/) ? 1 : 0);
my $testSchedule = scheduleDowntimePermutation( $service, $cfg,
\@singleRunPerm, $version->{ "major"});
my $testComment = join( ", ", @singleRunPerm);
checkDowntime( "$downtimeType downtime: $testComment", $service, $cfg,
$fixed, $testSchedule, $version->{ "major"});
exit 0;
}
else {
die "Unrecognized single run format: $singleRun";
}
}
else {
my @skipTests = ();
if( defined( $logFile)) {
open( LOG, "$logFile") ||
die "Unable to open log file $logFile for reading";
while( <LOG>) {
chomp;
push( @skipTests, $_) if( /^(Fixed|Flexible) downtime: /);
}
close( LOG) || die "Unable to close $logFile";
}
my $lastTriedPermutation = -1;
unless( $noFixedTests) {
my $fixedDowntimeTests = fixedDowntimeTestPermutations( $service);
for( my $x = 0; $x < @$fixedDowntimeTests; $x++) {
if( $testPaused) {
print "Testing paused as requested.\n";
print "Press ENTER to continue...";
<STDIN>;
$testPaused = 0;
}
$comment = join( ", ", @{ $fixedDowntimeTests->[ $x]});
unless( isInArray( "Fixed downtime: $comment", \@skipTests)) {
printf( "Fixed Test %d of %d\n", $x + 1,
scalar( @$fixedDowntimeTests));
$schedule = scheduleDowntimePermutation( $service, $cfg,
$fixedDowntimeTests->[ $x], $version->{ "major"});
$permutationTime = checkDowntime( "Fixed downtime: $comment",
$service, $cfg, 1, $schedule, $version->{ "major"});
if( $permutationTime == -1) {
if( $lastTriedPermutation != $x) {
print "Downtime test failed: retrying\n";
$lastTriedPermutation = $x;
$x--;
}
else {
die "Downtime test failed after retry\n";
}
}
else {
$lastTriedPermutation = $x;
$elapsedTime += $permutationTime;
printf( "Last test time: %s\n",
humanDuration( $permutationTime));
printf( "Elapsed time thus far: %s\n",
humanDuration( $elapsedTime));
}
}
else {
$lastTriedPermutation = $x;
printf( "Skipping fixed downtime: %s\n", $comment);
}
}
}
$lastTriedPermutation = -1;
unless( $noFlexibleTests) {
my $flexibleDowntimeTests = flexibleDowntimeTestPermutations( $service);
for( my $y = 0; $y < @$flexibleDowntimeTests; $y++) {
if( $testPaused) {
print "Testing paused as requested.\n";
print "Press ENTER to continue...";
<STDIN>;
$testPaused = 0;
}
$comment = join( ", ", @{ $flexibleDowntimeTests->[ $y]});
unless( isInArray( "Flexible downtime: $comment", \@skipTests)) {
printf( "Flexible Test %d of %d\n", $y + 1,
scalar( @$flexibleDowntimeTests));
$schedule = scheduleDowntimePermutation( $service, $cfg,
$flexibleDowntimeTests->[ $y], $version->{ "major"});
$permutationTime = checkDowntime( "Flexible downtime: $comment",
$service, $cfg, 0, $schedule, $version->{ "major"});
if( $permutationTime == -1) {
if( $lastTriedPermutation != $y) {
print "Downtime test failed: retrying\n";
$lastTriedPermutation = $y;
$y--;
}
else {
die "Downtime test failed after retry\n";
}
}
else {
$lastTriedPermutation = $y;
$elapsedTime += $permutationTime;
printf( "Last test time: %s\n",
humanDuration( $permutationTime));
printf( "Elapsed time thus far: %s\n",
humanDuration( $elapsedTime));
}
}
else {
$lastTriedPermutation = $y;
printf( "Skipping flexible downtime: %s\n", $comment);
}
}
}
}