/*****************************************************************************
 *
 * CHECKS.C - Service and host check functions for Nagios
 *
 *
 * License:
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *****************************************************************************/

#include "../include/config.h"
#include "../include/comments.h"
#include "../include/common.h"
#include "../include/statusdata.h"
#include "../include/downtime.h"
#include "../include/macros.h"
#include "../include/nagios.h"
#include "../include/broker.h"
#include "../include/perfdata.h"
#include "../include/workers.h"

/*#define DEBUG_CHECKS*/
/*#define DEBUG_HOST_CHECKS 1*/

#ifdef USE_EVENT_BROKER
#include "../include/neberrors.h"
#endif


/******************************************************************/
/********************** CHECK REAPER FUNCTIONS ********************/
/******************************************************************/

/*
 * Reaps host and service check results.
 *
 * Hands check_result_path to process_check_result_queue() and writes the
 * number of reaped results to the debug log.
 *
 * Returns OK unconditionally; the reaped count is only reported through
 * the debug log, not through the return value.
 */
int reap_check_results(void) {
	int reaped_checks = 0;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "reap_check_results() start\n");
	log_debug_info(DEBUGL_CHECKS, 0, "Starting to reap check results.\n");

	/* process files in the check result queue */
	reaped_checks = process_check_result_queue(check_result_path);

	log_debug_info(DEBUGL_CHECKS, 0, "Finished reaping %d check results\n", reaped_checks);
	log_debug_info(DEBUGL_FUNCTIONS, 0, "reap_check_results() end\n");

	return OK;
}


/******************************************************************/
/****************** SERVICE MONITORING FUNCTIONS ******************/
/******************************************************************/

/*
 * Executes a scheduled service check.
 *
 * svc           - service to check; NULL yields ERROR immediately
 * check_options - CHECK_OPTION_* flags, passed on to the async check and
 *                 re-used unchanged if the check has to be rescheduled
 * latency       - latency value passed on to run_async_service_check()
 *
 * Returns OK when run_async_service_check() did not report ERROR.
 * Otherwise the function computes a new svc->next_check (only if
 * svc->should_be_scheduled is TRUE), re-queues the check, updates the
 * service status and returns ERROR.
 */
int run_scheduled_service_check(service *svc, int check_options, double latency) {
	int result = OK;
	time_t current_time = 0L;
	time_t preferred_time = 0L;
	time_t next_valid_time = 0L;
	int time_is_valid = TRUE;

	if(svc == NULL)
		return ERROR;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_service_check() start\n");
	log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of service '%s' on host '%s': check options=%d, latency=%lf\n", svc->description, svc->host_name, check_options, latency);

	/*
	 * reset the next_check_event so we know it's
	 * no longer in the scheduling queue
	 */
	svc->next_check_event = NULL;

	/* attempt to run the check */
	result = run_async_service_check(svc, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time);

	/* nothing more to do unless the check could not be started */
	if(result != ERROR)
		return OK;

	/* an error occurred, so reschedule the check */
	log_debug_info(DEBUGL_CHECKS, 1, "Unable to run scheduled service check at this time\n");

	/* only attempt to (re)schedule checks that should get checked... */
	if(svc->should_be_scheduled == TRUE) {

		/* get current time */
		time(&current_time);

		/* determine next time we should check the service if needed */
		/* if service has no check interval, schedule it again for 5 minutes from now */
		if(current_time >= preferred_time)
			preferred_time = current_time + ((svc->check_interval <= 0) ? 300 : (svc->check_interval * interval_length));

		/* make sure we rescheduled the next service check at a valid time */
		get_next_valid_time(preferred_time, &next_valid_time, svc->check_period_ptr);

		/*
		 * If we really can't reschedule the service properly, we
		 * just push the check to preferred_time plus some reasonable
		 * random value and try again then.
		 */
		if(time_is_valid == FALSE && check_time_against_period(next_valid_time, svc->check_period_ptr) == ERROR) {

			svc->next_check = preferred_time + ranged_urand(0, check_window(svc));

			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of service '%s' on host '%s' could not be rescheduled properly. Scheduling check for %s...\n", svc->description, svc->host_name, ctime(&preferred_time));

			log_debug_info(DEBUGL_CHECKS, 1, "Unable to find any valid times to reschedule the next service check!\n");
		}

		/* this service could be rescheduled... */
		else {
			svc->next_check = next_valid_time;
			if(next_valid_time > preferred_time) {
				/* Next valid time is further in the future because of
				 * timeperiod constraints. Add a random amount so we
				 * don't get all checks subject to that timeperiod
				 * constraint scheduled at the same time
				 */
				svc->next_check += ranged_urand(0, check_window(svc));
			}
			svc->should_be_scheduled = TRUE;

			log_debug_info(DEBUGL_CHECKS, 1, "Rescheduled next service check for %s", ctime(&next_valid_time));
		}
	}

	/*
	 * reschedule the next service check - unless we couldn't
	 * find a valid next check time, but keep original options
	 */
	if(svc->should_be_scheduled == TRUE)
		schedule_service_check(svc, svc->next_check, check_options);

	/* update the status log */
	update_service_status(svc, FALSE);

	return ERROR;
}


/*
 * Hands a service check to a worker process and returns without waiting
 * for the result.
 *
 * svc              - service to check; NULL yields ERROR
 * check_options    - CHECK_OPTION_* flags stored in the check result
 * latency          - latency recorded in the check result (and temporarily
 *                    in svc->latency while macros / broker callbacks run)
 * scheduled_check  - TRUE for scheduled checks; clears svc->check_options
 *                    and selects which statistics bucket is updated
 * reschedule_check - stored in the check result
 * time_is_valid    - passed through to check_service_check_viability()
 * preferred_time   - passed through to check_service_check_viability();
 *                    when non-NULL it is additionally advanced by one check
 *                    interval if a broker module cancels the check or the
 *                    command line cannot be built
 *
 * Returns ERROR if the check is not viable, the service has no host, a
 * broker module cancelled it, the command line could not be built, or
 * memory ran out; OK if a broker module overrode the check; otherwise the
 * result of wproc_run_check().
 */
int run_async_service_check(service *svc, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time) {
	nagios_macros mac;
	char *raw_command = NULL;
	char *processed_command = NULL;
	struct timeval start_time, end_time;
	host *temp_host = NULL;
	double old_latency = 0.0;
	check_result *cr;
	int runchk_result = OK;
	int macro_options = STRIP_ILLEGAL_MACRO_CHARS | ESCAPE_MACRO_CHARS;
#ifdef USE_EVENT_BROKER
	int neb_result = OK;
#endif

	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_async_service_check()\n");

	/* make sure we have something */
	if(svc == NULL)
		return ERROR;

	/* is the service check viable at this time? */
	if(check_service_check_viability(svc, check_options, time_is_valid, preferred_time) == ERROR)
		return ERROR;

	/* find the host associated with this service */
	if((temp_host = svc->host_ptr) == NULL)
		return ERROR;

	/******** GOOD TO GO FOR A REAL SERVICE CHECK AT THIS POINT ********/
#ifdef USE_EVENT_BROKER
	/* initialize start/end times */
	start_time.tv_sec = 0L;
	start_time.tv_usec = 0L;
	end_time.tv_sec = 0L;
	end_time.tv_usec = 0L;

	/* send data to event broker */
	neb_result = broker_service_check(NEBTYPE_SERVICECHECK_ASYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, svc, CHECK_TYPE_ACTIVE, start_time, end_time, svc->check_command, svc->latency, 0.0, 0, FALSE, 0, NULL, NULL, NULL);

	if (neb_result == NEBERROR_CALLBACKCANCEL || neb_result == NEBERROR_CALLBACKOVERRIDE) {
		log_debug_info(DEBUGL_CHECKS, 0, "Check of service '%s' on host '%s' (id=%u) was %s by a module\n",
		               svc->description, svc->host_name, svc->id,
		               neb_result == NEBERROR_CALLBACKCANCEL ? "cancelled" : "overridden");
	}

	/* neb module wants to cancel the service check - the check will be rescheduled for a later time by the scheduling logic */
	if(neb_result == NEBERROR_CALLBACKCANCEL) {
		if(preferred_time)
			*preferred_time += (svc->check_interval * interval_length);
		return ERROR;
	}

	/* neb module wants to override (or cancel) the service check - perhaps it will check the service itself */
	/* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
	/* NOTE: if would be easier for modules to override checks when the NEBTYPE_SERVICECHECK_INITIATE event is called (later) */
	if(neb_result == NEBERROR_CALLBACKOVERRIDE)
		return OK;
#endif

	log_debug_info(DEBUGL_CHECKS, 0, "Checking service '%s' on host '%s'...\n", svc->description, svc->host_name);

	/* clear check options - we don't want old check options retained */
	/* only clear check options for scheduled checks - ondemand checks shouldn't affected retained check options */
	if(scheduled_check == TRUE)
		svc->check_options = CHECK_OPTION_NONE;

	/* update latency for macros, event broker, save old value for later */
	old_latency = svc->latency;
	svc->latency = latency;

	/* grab the host and service macro variables */
	memset(&mac, 0, sizeof(mac));
	grab_host_macros_r(&mac, temp_host);
	grab_service_macros_r(&mac, svc);

	/* get the raw command line */
	get_raw_command_line_r(&mac, svc->check_command_ptr, svc->check_command, &raw_command, macro_options);
	if(raw_command == NULL) {
		clear_volatile_macros_r(&mac);
		log_debug_info(DEBUGL_CHECKS, 0, "Raw check command for service '%s' on host '%s' was NULL - aborting.\n", svc->description, svc->host_name);
		if(preferred_time)
			*preferred_time += (svc->check_interval * interval_length);
		svc->latency = old_latency;
		return ERROR;
	}

	/* process any macros contained in the argument */
	process_macros_r(&mac, raw_command, &processed_command, macro_options);
	my_free(raw_command);
	if(processed_command == NULL) {
		clear_volatile_macros_r(&mac);
		log_debug_info(DEBUGL_CHECKS, 0, "Processed check command for service '%s' on host '%s' was NULL - aborting.\n", svc->description, svc->host_name);
		if(preferred_time)
			*preferred_time += (svc->check_interval * interval_length);
		svc->latency = old_latency;
		return ERROR;
	}

	/* get the command start time */
	gettimeofday(&start_time, NULL);

	cr = calloc(1, sizeof(*cr));
	if (!cr) {
		clear_volatile_macros_r(&mac);
		svc->latency = old_latency;
		my_free(processed_command);
		return ERROR;
	}
	init_check_result(cr);

	/* save check info */
	cr->object_check_type = SERVICE_CHECK;
	cr->check_type = CHECK_TYPE_ACTIVE;
	cr->check_options = check_options;
	cr->scheduled_check = scheduled_check;
	cr->reschedule_check = reschedule_check;
	cr->latency = latency;
	cr->start_time = start_time;
	cr->finish_time = start_time;
	cr->early_timeout = FALSE;
	cr->exited_ok = TRUE;
	cr->return_code = STATE_OK;
	cr->output = NULL;
	/* NOTE(review): neither strdup() result is checked; on allocation
	 * failure a NULL name travels with the check result - confirm the
	 * consumers tolerate that. */
	cr->host_name = (char *)strdup(svc->host_name);
	cr->service_description = (char *)strdup(svc->description);

#ifdef USE_EVENT_BROKER
	/* send data to event broker */
	neb_result = broker_service_check(NEBTYPE_SERVICECHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, svc, CHECK_TYPE_ACTIVE, start_time, end_time, svc->check_command, svc->latency, 0.0, service_check_timeout, FALSE, 0, processed_command, NULL, cr);

	/* neb module wants to override the service check - perhaps it will check the service itself */
	if(neb_result == NEBERROR_CALLBACKOVERRIDE) {
		clear_volatile_macros_r(&mac);
		svc->latency = old_latency;
		/* NOTE(review): whether free_check_result() also releases the
		 * calloc()ed struct itself is not visible here - verify, or
		 * `cr` leaks on this path. */
		free_check_result(cr);
		my_free(processed_command);
		return OK;
	}
#endif

	/* reset latency (permanent value will be set later) */
	svc->latency = old_latency;

	/* paw off the check to a worker to run */
	runchk_result = wproc_run_check(cr, processed_command, &mac);
	if (runchk_result == ERROR) {
		/* NOTE(review): `cr` is not released here; presumably
		 * wproc_run_check() owns it even on failure - confirm. */
		logit(NSLOG_RUNTIME_ERROR, TRUE, "Unable to run check for service '%s' on host '%s'\n", svc->description, svc->host_name);
	}
	else {
		/* do the book-keeping */
		currently_running_service_checks++;
		svc->is_executing = TRUE;
		update_check_stats((scheduled_check == TRUE) ? ACTIVE_SCHEDULED_SERVICE_CHECK_STATS : ACTIVE_ONDEMAND_SERVICE_CHECK_STATS, start_time.tv_sec);
	}

	/* free memory */
	my_free(processed_command);
	clear_volatile_macros_r(&mac);

	return runchk_result;
}


/*
 * Derives the service state from a queued check result.
 *
 * temp_service        - service the result belongs to; for active checks
 *                       that timed out, did not exit properly, or returned
 *                       a code outside 0..3, its plugin_output is replaced
 *                       with an explanatory message and long_plugin_output
 *                       and perf_data are released
 * queued_check_result - the result being processed
 *
 * Returns STATE_UNKNOWN if either argument is NULL. Passive results return
 * queued_check_result->return_code untouched. Active results return
 * service_check_timeout_state on early timeout, STATE_CRITICAL on abnormal
 * exit or out-of-range return code, and the plugin's return code otherwise.
 *
 * If building a replacement message fails, plugin_output is left NULL
 * rather than indeterminate.
 */
static int get_service_check_return_code(service *temp_service, check_result *queued_check_result) {
	int rc;
	char *temp_plugin_output = NULL;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "get_service_check_return_code()\n");

	if(NULL == temp_service || NULL == queued_check_result) {
		return STATE_UNKNOWN;
	}

	/* grab the return code */
	rc = queued_check_result->return_code;

	/* adjust return code (active checks only) */
	if(queued_check_result->check_type != CHECK_TYPE_ACTIVE)
		return rc;

	if(queued_check_result->early_timeout == TRUE) {
		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of service '%s' on host '%s' timed out after %.3fs!\n", temp_service->description, temp_service->host_name, temp_service->execution_time);
		my_free(temp_service->plugin_output);
		my_free(temp_service->long_plugin_output);
		my_free(temp_service->perf_data);
		/* asprintf() leaves its target unspecified on failure */
		if(asprintf(&temp_service->plugin_output, "(Service check timed out after %.2lf seconds)", temp_service->execution_time) < 0)
			temp_service->plugin_output = NULL;
		rc = service_check_timeout_state;
	}

	/* if there was some error running the command, just skip it (this shouldn't be happening) */
	else if(queued_check_result->exited_ok == FALSE) {

		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of service '%s' on host '%s' did not exit properly!\n", temp_service->description, temp_service->host_name);

		my_free(temp_service->plugin_output);
		my_free(temp_service->long_plugin_output);
		my_free(temp_service->perf_data);

		temp_service->plugin_output = (char *)strdup("(Service check did not exit properly)");

		rc = STATE_CRITICAL;
	}

	/* make sure the return code is within bounds */
	else if(queued_check_result->return_code < 0 || queued_check_result->return_code > 3) {

		/* both hints start with a space so they follow "out of bounds." cleanly */
		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Return code of %d for check of service '%s' on host '%s' was out of bounds.%s\n",
		      queued_check_result->return_code,
		      temp_service->description,
		      temp_service->host_name,
		      (queued_check_result->return_code == 126 ? " Make sure the plugin you're trying to run is executable." :
		       (queued_check_result->return_code == 127 ? " Make sure the plugin you're trying to run actually exists." : "")));

		/* wrap the plugin's own output; the format already closes the parenthesis */
		if(asprintf(&temp_plugin_output, "(Return code of %d is out of bounds%s : %s)",
		            queued_check_result->return_code,
		            (queued_check_result->return_code == 126 ? " - plugin may not be executable" :
		             (queued_check_result->return_code == 127 ? " - plugin may be missing" : "")),
		            (temp_service->plugin_output != NULL) ? temp_service->plugin_output : "") < 0)
			temp_plugin_output = NULL;

		/* the service takes ownership of the new buffer */
		my_free(temp_service->plugin_output);
		temp_service->plugin_output = temp_plugin_output;
		temp_plugin_output = NULL;

		my_free(temp_service->long_plugin_output);
		my_free(temp_service->perf_data);

		rc = STATE_CRITICAL;
	}

	return rc;
}


/* handles asynchronous service check results */
int handle_async_service_check_result(service *temp_service, check_result *queued_check_result) {
	host *temp_host = NULL;
	time_t next_service_check = 0L;
	time_t preferred_time = 0L;
	time_t next_valid_time = 0L;
	int reschedule_check = FALSE;
	int state_change = FALSE;
	int hard_state_change = FALSE;
	int first_host_check_initiated = FALSE;
	int route_result = HOST_UP;
	time_t current_time = 0L;
	int state_was_logged = FALSE;
	char *old_plugin_output = NULL;
	char *temp_plugin_output = NULL;
	char *temp_ptr = NULL;
	servicedependency *temp_dependency = NULL;
	service *master_service = NULL;
	int flapping_check_done = FALSE;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_service_check_result()\n");

	/* make sure we have what we need */
	if(temp_service == NULL || queued_check_result == NULL)
		return ERROR;

	/* get the current time */
	time(&current_time);

	if (current_time < temp_service->next_check)
		next_service_check = temp_service->next_check + check_window(temp_service);
	else
		next_service_check = current_time + check_window(temp_service);

	log_debug_info(DEBUGL_CHECKS, 0, "** Handling check result for 
service '%s' on host '%s' from '%s'...\n", temp_service->description, temp_service->host_name, check_result_source(queued_check_result)); log_debug_info(DEBUGL_CHECKS, 1, "HOST: %s, SERVICE: %s, CHECK TYPE: %s, OPTIONS: %d, SCHEDULED: %s, RESCHEDULE: %s, EXITED OK: %s, RETURN CODE: %d, OUTPUT: %s\n", temp_service->host_name, temp_service->description, (queued_check_result->check_type == CHECK_TYPE_ACTIVE) ? "Active" : "Passive", queued_check_result->check_options, (queued_check_result->scheduled_check == TRUE) ? "Yes" : "No", (queued_check_result->reschedule_check == TRUE) ? "Yes" : "No", (queued_check_result->exited_ok == TRUE) ? "Yes" : "No", queued_check_result->return_code, queued_check_result->output); /* decrement the number of service checks still out there... */ if(queued_check_result->check_type == CHECK_TYPE_ACTIVE && currently_running_service_checks > 0) currently_running_service_checks--; /* skip this service check results if its passive and we aren't accepting passive check results */ if(queued_check_result->check_type == CHECK_TYPE_PASSIVE) { if(accept_passive_service_checks == FALSE) { log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive service check result because passive service checks are disabled globally.\n"); return ERROR; } if(temp_service->accept_passive_checks == FALSE) { log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive service check result because passive checks are disabled for this service.\n"); return ERROR; } } /* clear the freshening flag (it would have been set if this service was determined to be stale) */ if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) temp_service->is_being_freshened = FALSE; /* clear the execution flag if this was an active check */ if(queued_check_result->check_type == CHECK_TYPE_ACTIVE) temp_service->is_executing = FALSE; /* DISCARD INVALID FRESHNESS CHECK RESULTS */ /* If a services goes stale, Nagios will initiate a forced check in order to freshen it. 
There is a race condition whereby a passive check could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here. This would make the service fresh again, so we do a quick check to make sure the service is still stale before we accept the check result. */ if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_service_result_fresh(temp_service, current_time, FALSE) == TRUE) { log_debug_info(DEBUGL_CHECKS, 0, "Discarding service freshness check result because the service is currently fresh (race condition avoided).\n"); return OK; } /* check latency is passed to us */ temp_service->latency = queued_check_result->latency; /* update the execution time for this check (millisecond resolution) */ temp_service->execution_time = (double)((double)(queued_check_result->finish_time.tv_sec - queued_check_result->start_time.tv_sec) + (double)((queued_check_result->finish_time.tv_usec - queued_check_result->start_time.tv_usec) / 1000.0) / 1000.0); if(temp_service->execution_time < 0.0) temp_service->execution_time = 0.0; /* get the last check time */ temp_service->last_check = queued_check_result->start_time.tv_sec; /* was this check passive or active? */ temp_service->check_type = (queued_check_result->check_type == CHECK_TYPE_ACTIVE) ? CHECK_TYPE_ACTIVE : CHECK_TYPE_PASSIVE; /* update check statistics for passive checks */ if(queued_check_result->check_type == CHECK_TYPE_PASSIVE) update_check_stats(PASSIVE_SERVICE_CHECK_STATS, queued_check_result->start_time.tv_sec); /* should we reschedule the next service check? NOTE: This may be overridden later... 
*/ reschedule_check = queued_check_result->reschedule_check; /* save the old service status info */ temp_service->last_state = temp_service->current_state; /* save old plugin output */ if(temp_service->plugin_output) old_plugin_output = (char *)strdup(temp_service->plugin_output); /* clear the old plugin output and perf data buffers */ my_free(temp_service->plugin_output); my_free(temp_service->long_plugin_output); my_free(temp_service->perf_data); /* parse check output to get: (1) short output, (2) long output, (3) perf data */ parse_check_output(queued_check_result->output, &temp_service->plugin_output, &temp_service->long_plugin_output, &temp_service->perf_data, TRUE, FALSE); /* make sure the plugin output isn't null */ if(temp_service->plugin_output == NULL) temp_service->plugin_output = (char *)strdup("(No output returned from plugin)"); /* replace semicolons in plugin output (but not performance data) with colons */ else if((temp_ptr = temp_service->plugin_output)) { while((temp_ptr = strchr(temp_ptr, ';'))) * temp_ptr = ':'; } /* grab the return code */ temp_service->current_state = get_service_check_return_code(temp_service, queued_check_result); log_debug_info(DEBUGL_CHECKS, 2, "Parsing check output...\n"); log_debug_info(DEBUGL_CHECKS, 2, "Short Output: %s\n", (temp_service->plugin_output == NULL) ? "NULL" : temp_service->plugin_output); log_debug_info(DEBUGL_CHECKS, 2, "Long Output: %s\n", (temp_service->long_plugin_output == NULL) ? "NULL" : temp_service->long_plugin_output); log_debug_info(DEBUGL_CHECKS, 2, "Perf Data: %s\n", (temp_service->perf_data == NULL) ? 
"NULL" : temp_service->perf_data); /* record the time the last state ended */ switch(temp_service->last_state) { case STATE_OK: temp_service->last_time_ok = temp_service->last_check; break; case STATE_WARNING: temp_service->last_time_warning = temp_service->last_check; break; case STATE_UNKNOWN: temp_service->last_time_unknown = temp_service->last_check; break; case STATE_CRITICAL: temp_service->last_time_critical = temp_service->last_check; break; default: break; } /* log passive checks - we need to do this here, as some my bypass external commands by getting dropped in checkresults dir */ if(temp_service->check_type == CHECK_TYPE_PASSIVE) { if(log_passive_checks == TRUE) logit(NSLOG_PASSIVE_CHECK, FALSE, "PASSIVE SERVICE CHECK: %s;%s;%d;%s\n", temp_service->host_name, temp_service->description, temp_service->current_state, temp_service->plugin_output); } /* get the host that this service runs on */ temp_host = (host *)temp_service->host_ptr; /* if the service check was okay... */ if(temp_service->current_state == STATE_OK) { /* if the host has never been checked before, verify its status */ /* only do this if 1) the initial state was set to non-UP or 2) the host is not scheduled to be checked soon (next 5 minutes) */ if(temp_host->has_been_checked == FALSE && (temp_host->initial_state != HOST_UP || (unsigned long)temp_host->next_check == 0L || (unsigned long)(temp_host->next_check - current_time) > 300)) { /* set a flag to remember that we launched a check */ first_host_check_initiated = TRUE; schedule_host_check(temp_host, current_time, CHECK_OPTION_DEPENDENCY_CHECK); } } /* increment the current attempt number if this is a soft state (service was rechecked) */ if(temp_service->state_type == SOFT_STATE && (temp_service->current_attempt < temp_service->max_attempts)) temp_service->current_attempt = temp_service->current_attempt + 1; log_debug_info(DEBUGL_CHECKS, 2, "ST: %s CA: %d MA: %d CS: %d LS: %d LHS: %d\n", (temp_service->state_type == SOFT_STATE) ? 
"SOFT" : "HARD", temp_service->current_attempt, temp_service->max_attempts, temp_service->current_state, temp_service->last_state, temp_service->last_hard_state); /* check for a state change (either soft or hard) */ if(temp_service->current_state != temp_service->last_state) { log_debug_info(DEBUGL_CHECKS, 2, "Service has changed state since last check!\n"); state_change = TRUE; } /* checks for a hard state change where host was down at last service check */ /* this occurs in the case where host goes down and service current attempt gets reset to 1 */ /* if this check is not made, the service recovery looks like a soft recovery instead of a hard one */ if(temp_service->host_problem_at_last_check == TRUE && temp_service->current_state == STATE_OK) { log_debug_info(DEBUGL_CHECKS, 2, "Service had a HARD STATE CHANGE!!\n"); hard_state_change = TRUE; } /* check for a "normal" hard state change where max check attempts is reached */ if(temp_service->current_attempt >= temp_service->max_attempts && temp_service->current_state != temp_service->last_hard_state) { log_debug_info(DEBUGL_CHECKS, 2, "Service had a HARD STATE CHANGE!!\n"); hard_state_change = TRUE; } /* a state change occurred... 
*/ /* reset last and next notification times and acknowledgement flag if necessary, misc other stuff */ if(state_change == TRUE || hard_state_change == TRUE) { /* reschedule the service check */ reschedule_check = TRUE; /* reset notification times */ temp_service->last_notification = (time_t)0; temp_service->next_notification = (time_t)0; /* reset notification suppression option */ temp_service->no_more_notifications = FALSE; if(temp_service->acknowledgement_type == ACKNOWLEDGEMENT_NORMAL && (state_change == TRUE || hard_state_change == FALSE)) { temp_service->problem_has_been_acknowledged = FALSE; temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE; /* remove any non-persistant comments associated with the ack */ delete_service_acknowledgement_comments(temp_service); } else if(temp_service->acknowledgement_type == ACKNOWLEDGEMENT_STICKY && temp_service->current_state == STATE_OK) { temp_service->problem_has_been_acknowledged = FALSE; temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE; /* remove any non-persistant comments associated with the ack */ delete_service_acknowledgement_comments(temp_service); } /* * hard changes between non-OK states should continue * to be escalated, so don't reset current notification number */ } /* initialize the last host and service state change times if necessary */ if(temp_service->last_state_change == (time_t)0) temp_service->last_state_change = temp_service->last_check; if(temp_service->last_hard_state_change == (time_t)0) temp_service->last_hard_state_change = temp_service->last_check; if(temp_host->last_state_change == (time_t)0) temp_host->last_state_change = temp_service->last_check; if(temp_host->last_hard_state_change == (time_t)0) temp_host->last_hard_state_change = temp_service->last_check; /* update last service state change times */ if(state_change == TRUE) temp_service->last_state_change = temp_service->last_check; if(hard_state_change == TRUE) temp_service->last_hard_state_change = 
temp_service->last_check; /* update the event and problem ids */ if(state_change == TRUE) { /* always update the event id on a state change */ temp_service->last_event_id = temp_service->current_event_id; temp_service->current_event_id = next_event_id; next_event_id++; /* update the problem id when transitioning to a problem state */ if(temp_service->last_state == STATE_OK) { /* don't reset last problem id, or it will be zero the next time a problem is encountered */ temp_service->current_problem_id = next_problem_id; next_problem_id++; } /* clear the problem id when transitioning from a problem state to an OK state */ if(temp_service->current_state == STATE_OK) { temp_service->last_problem_id = temp_service->current_problem_id; temp_service->current_problem_id = 0L; } } /**************************************/ /******* SERVICE CHECK OK LOGIC *******/ /**************************************/ /* if the service is up and running OK... */ if(temp_service->current_state == STATE_OK) { log_debug_info(DEBUGL_CHECKS, 1, "Service is OK.\n"); /* reset the acknowledgement flag (this should already have been done, but just in case...) */ temp_service->problem_has_been_acknowledged = FALSE; temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE; /* verify the route to the host and send out host recovery notifications */ if(temp_host->current_state != HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Host is NOT UP, so we'll check it to see if it recovered...\n"); if(first_host_check_initiated == TRUE) log_debug_info(DEBUGL_CHECKS, 1, "First host check was already initiated, so we'll skip a new host check.\n"); else { /* can we use the last cached host state? 
*/ /* usually only use cached host state if no service state change has occurred */ if(state_change == FALSE && temp_host->has_been_checked == TRUE && ((current_time - temp_host->last_check) <= cached_host_check_horizon)) { log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", temp_host->current_state); update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time); update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time); } /* else launch an async (parallel) check of the host */ else schedule_host_check(temp_host, current_time, CHECK_OPTION_DEPENDENCY_CHECK); } } /* if a hard service recovery has occurred... */ if(hard_state_change == TRUE) { log_debug_info(DEBUGL_CHECKS, 1, "Service experienced a HARD RECOVERY.\n"); /* set the state type macro */ temp_service->state_type = HARD_STATE; /* log the service recovery */ log_service_event(temp_service); state_was_logged = TRUE; /* 10/04/07 check to see if the service and/or associate host is flapping */ /* this should be done before a notification is sent out to ensure the host didn't just start flapping */ check_for_service_flapping(temp_service, TRUE, TRUE); check_for_host_flapping(temp_host, TRUE, FALSE, TRUE); flapping_check_done = TRUE; /* notify contacts about the service recovery */ service_notification(temp_service, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE); /* run the service event handler to handle the hard state change */ handle_service_event(temp_service); } /* else if a soft service recovery has occurred... */ else if(state_change == TRUE) { log_debug_info(DEBUGL_CHECKS, 1, "Service experienced a SOFT RECOVERY.\n"); /* this is a soft recovery */ temp_service->state_type = SOFT_STATE; /* log the soft recovery */ log_service_event(temp_service); state_was_logged = TRUE; /* run the service event handler to handle the soft state change */ handle_service_event(temp_service); } /* else no service state change has occurred... 
*/ else { log_debug_info(DEBUGL_CHECKS, 1, "Service did not change state.\n"); } /* should we obsessive over service checks? */ if(obsess_over_services == TRUE) obsessive_compulsive_service_check_processor(temp_service); /* reset all service variables because its okay now... */ temp_service->host_problem_at_last_check = FALSE; temp_service->current_attempt = 1; temp_service->state_type = HARD_STATE; temp_service->last_hard_state = STATE_OK; temp_service->last_notification = (time_t)0; temp_service->next_notification = (time_t)0; temp_service->current_notification_number = 0; temp_service->problem_has_been_acknowledged = FALSE; temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE; temp_service->notified_on = 0; if(reschedule_check == TRUE) next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length)); } /*******************************************/ /******* SERVICE CHECK PROBLEM LOGIC *******/ /*******************************************/ /* hey, something's not working quite like it should... */ else { log_debug_info(DEBUGL_CHECKS, 1, "Service is in a non-OK state!\n"); /* check the route to the host if its up right now... 
*/ if(temp_host->current_state == HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Host is currently UP, so we'll recheck its state to make sure...\n"); /* only run a new check if we can and have to */ if(execute_host_checks && state_change == TRUE && temp_host->last_check + cached_host_check_horizon < current_time) { schedule_host_check(temp_host, current_time, CHECK_OPTION_DEPENDENCY_CHECK); } else { log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", temp_host->current_state); route_result = temp_host->current_state; update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time); update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time); } } /* else the host is either down or unreachable, so recheck it if necessary */ else { log_debug_info(DEBUGL_CHECKS, 1, "Host is currently %s.\n", host_state_name(temp_host->current_state)); if(execute_host_checks && state_change == TRUE) { schedule_host_check(temp_host, current_time, CHECK_OPTION_NONE); } /* else fake the host check, but (possibly) resend host notifications to contacts... */ else { log_debug_info(DEBUGL_CHECKS, 1, "Assuming host is in same state as before...\n"); /* if the host has never been checked before, set the checked flag and last check time */ /* This probably never evaluates to FALSE, present for historical reasons only, can probably be removed in the future */ if(temp_host->has_been_checked == FALSE) { temp_host->has_been_checked = TRUE; temp_host->last_check = temp_service->last_check; } /* fake the route check result */ route_result = temp_host->current_state; /* possibly re-send host notifications... */ host_notification(temp_host, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE); } } /* if the host is down or unreachable ... 
*/ if(route_result != HOST_UP) { if (temp_host->state_type == HARD_STATE) { log_debug_info(DEBUGL_CHECKS, 2, "Host is not UP, so we mark state changes if appropriate\n"); /* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earlier... */ if(temp_service->last_hard_state != temp_service->current_state) hard_state_change = TRUE; /* update last state change times */ if(state_change == TRUE || hard_state_change == TRUE) temp_service->last_state_change = temp_service->last_check; if(hard_state_change == TRUE) { temp_service->last_hard_state_change = temp_service->last_check; temp_service->state_type = HARD_STATE; temp_service->last_hard_state = temp_service->current_state; } /* put service into a hard state without attempting check retries and don't send out notifications about it */ temp_service->host_problem_at_last_check = TRUE; } else if (temp_service->last_state == STATE_OK) temp_service->state_type = SOFT_STATE; } /* the host is up - it recovered since the last time the service was checked... */ else if(temp_service->host_problem_at_last_check == TRUE) { /* next time the service is checked we shouldn't get into this same case... */ temp_service->host_problem_at_last_check = FALSE; /* reset the current check counter, so we give the service a chance */ /* this helps prevent the case where service has N max check attempts, N-1 of which have already occurred. */ /* if we didn't do this, the next check might fail and result in a hard problem - we should really give it more time */ /* ADDED IF STATEMENT 01-17-05 EG */ /* 01-17-05: Services in hard problem states before hosts went down would sometimes come back as soft problem states after */ /* the hosts recovered. 
This caused problems, so hopefully this will fix it */ if(temp_service->state_type == SOFT_STATE) temp_service->current_attempt = 1; } log_debug_info(DEBUGL_CHECKS, 1, "Current/Max Attempt(s): %d/%d\n", temp_service->current_attempt, temp_service->max_attempts); /* if we should retry the service check, do so (except if the host is down or unreachable!) */ if(temp_service->current_attempt < temp_service->max_attempts) { /* the host is down or unreachable, so don't attempt to retry the service check */ if(route_result != HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Host isn't UP, so we won't retry the service check...\n"); /* the host is not up, so reschedule the next service check at regular interval */ if(reschedule_check == TRUE) next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length)); /* log the problem as a hard state if the host just went down */ if(hard_state_change == TRUE) { log_service_event(temp_service); state_was_logged = TRUE; /* run the service event handler to handle the hard state */ handle_service_event(temp_service); } } /* the host is up, so continue to retry the service check */ else { log_debug_info(DEBUGL_CHECKS, 1, "Host is UP, so we'll retry the service check...\n"); /* this is a soft state */ temp_service->state_type = SOFT_STATE; /* log the service check retry */ log_service_event(temp_service); state_was_logged = TRUE; /* run the service event handler to handle the soft state */ handle_service_event(temp_service); if(reschedule_check == TRUE) next_service_check = (time_t)(temp_service->last_check + (temp_service->retry_interval * interval_length)); } /* perform dependency checks on the second to last check of the service */ if(execute_service_checks && enable_predictive_service_dependency_checks == TRUE && temp_service->current_attempt == (temp_service->max_attempts - 1)) { objectlist *list; log_debug_info(DEBUGL_CHECKS, 1, "Looking for services to check for predictive dependency 
checks...\n"); /* check services that THIS ONE depends on for notification AND execution */ /* we do this because we might be sending out a notification soon and we want the dependency logic to be accurate */ for(list = temp_service->exec_deps; list; list = list->next) { temp_dependency = (servicedependency *)list->object_ptr; if(temp_dependency->dependent_service_ptr == temp_service && temp_dependency->master_service_ptr != NULL) { master_service = (service *)temp_dependency->master_service_ptr; log_debug_info(DEBUGL_CHECKS, 2, "Predictive check of service '%s' on host '%s' queued.\n", master_service->description, master_service->host_name); schedule_service_check(master_service, current_time, CHECK_OPTION_DEPENDENCY_CHECK); } } for(list = temp_service->notify_deps; list; list = list->next) { temp_dependency = (servicedependency *)list->object_ptr; if(temp_dependency->dependent_service_ptr == temp_service && temp_dependency->master_service_ptr != NULL) { master_service = (service *)temp_dependency->master_service_ptr; log_debug_info(DEBUGL_CHECKS, 2, "Predictive check of service '%s' on host '%s' queued.\n", master_service->description, master_service->host_name); schedule_service_check(master_service, current_time, CHECK_OPTION_DEPENDENCY_CHECK); } } } } /* we've reached the maximum number of service rechecks, so handle the error */ else { log_debug_info(DEBUGL_CHECKS, 1, "Service has reached max number of rechecks, so we'll handle the error...\n"); /* this is a hard state */ temp_service->state_type = HARD_STATE; /* check for start of flexible (non-fixed) scheduled downtime if we just had a hard error */ /* we need to check for both, state_change (SOFT) and hard_state_change (HARD) values */ if((hard_state_change == TRUE || state_change == TRUE) && temp_service->pending_flex_downtime > 0) check_pending_flex_service_downtime(temp_service); /* if we've hard a hard state change... 
*/ if(hard_state_change == TRUE) { /* log the service problem (even if host is not up, which is new in 0.0.5) */ log_service_event(temp_service); state_was_logged = TRUE; } /* else log the problem (again) if this service is flagged as being volatile */ else if(temp_service->is_volatile == TRUE) { log_service_event(temp_service); state_was_logged = TRUE; } /* 10/04/07 check to see if the service and/or associate host is flapping */ /* this should be done before a notification is sent out to ensure the host didn't just start flapping */ check_for_service_flapping(temp_service, TRUE, TRUE); check_for_host_flapping(temp_host, TRUE, FALSE, TRUE); flapping_check_done = TRUE; /* (re)send notifications out about this service problem if the host is up (and was at last check also) and the dependencies were okay... */ service_notification(temp_service, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE); /* run the service event handler if we changed state from the last hard state or if this service is flagged as being volatile */ if(hard_state_change == TRUE || temp_service->is_volatile == TRUE) handle_service_event(temp_service); /* save the last hard state */ temp_service->last_hard_state = temp_service->current_state; /* reschedule the next check at the regular interval */ if(reschedule_check == TRUE) next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length)); } /* should we obsessive over service checks? */ if(obsess_over_services == TRUE) obsessive_compulsive_service_check_processor(temp_service); } /* reschedule the next service check ONLY for active, scheduled checks */ if(reschedule_check == TRUE) { log_debug_info(DEBUGL_CHECKS, 1, "Rescheduling next check of service at %s", ctime(&next_service_check)); /* default is to reschedule service check unless a test below fails... 
*/ temp_service->should_be_scheduled = TRUE; /* next check time was calculated above */ temp_service->next_check = next_service_check; /* make sure we don't get ourselves into too much trouble... */ if(current_time > temp_service->next_check) temp_service->next_check = current_time; /* make sure we rescheduled the next service check at a valid time */ preferred_time = temp_service->next_check; get_next_valid_time(preferred_time, &next_valid_time, temp_service->check_period_ptr); temp_service->next_check = next_valid_time; if(next_valid_time > preferred_time) { /* Next valid time is further in the future because of timeperiod * constraints. Add a random amount so we don't get all checks * subject to that timeperiod constraint scheduled at the same time */ temp_service->next_check += ranged_urand(0, check_window(temp_service)); } /* services with non-recurring intervals do not get rescheduled */ if(temp_service->check_interval == 0) temp_service->should_be_scheduled = FALSE; /* services with active checks disabled do not get rescheduled */ if(temp_service->checks_enabled == FALSE) temp_service->should_be_scheduled = FALSE; /* schedule a non-forced check if we can */ if(temp_service->should_be_scheduled == TRUE) schedule_service_check(temp_service, temp_service->next_check, CHECK_OPTION_NONE); } /* if we're stalking this state type and state was not already logged AND the plugin output changed since last check, log it now.. 
*/ if(temp_service->state_type == HARD_STATE && state_change == FALSE && state_was_logged == FALSE && compare_strings(old_plugin_output, temp_service->plugin_output)) { if(should_stalk(temp_service)) log_service_event(temp_service); } #ifdef USE_EVENT_BROKER /* send data to event broker */ broker_service_check(NEBTYPE_SERVICECHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, temp_service, temp_service->check_type, queued_check_result->start_time, queued_check_result->finish_time, NULL, temp_service->latency, temp_service->execution_time, service_check_timeout, queued_check_result->early_timeout, queued_check_result->return_code, NULL, NULL, queued_check_result); #endif /* set the checked flag */ temp_service->has_been_checked = TRUE; /* update the current service status log */ update_service_status(temp_service, FALSE); /* check to see if the service and/or associate host is flapping */ if(flapping_check_done == FALSE) { check_for_service_flapping(temp_service, TRUE, TRUE); check_for_host_flapping(temp_host, TRUE, FALSE, TRUE); } /* update service performance info */ update_service_performance_data(temp_service); /* free allocated memory */ my_free(temp_plugin_output); my_free(old_plugin_output); return OK; } /* schedules an immediate or delayed service check */ void schedule_service_check(service *svc, time_t check_time, int options) { timed_event *temp_event = NULL; int use_original_event = TRUE; log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_service_check()\n"); if(svc == NULL) return; log_debug_info(DEBUGL_CHECKS, 0, "Scheduling a %s, active check of service '%s' on host '%s' @ %s", (options & CHECK_OPTION_FORCE_EXECUTION) ? 
"forced" : "non-forced", svc->description, svc->host_name, ctime(&check_time)); /* don't schedule a check if active checks of this service are disabled */ if(svc->checks_enabled == FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)) { log_debug_info(DEBUGL_CHECKS, 0, "Active checks of this service are disabled.\n"); return; } /* we may have to nudge this check a bit */ if (options == CHECK_OPTION_DEPENDENCY_CHECK) { if (svc->last_check + cached_service_check_horizon > check_time) { log_debug_info(DEBUGL_CHECKS, 0, "Last check result is recent enough (%s)", ctime(&svc->last_check)); return; } } /* default is to use the new event */ use_original_event = FALSE; temp_event = (timed_event *)svc->next_check_event; /* * If the service already has a check scheduled, * we need to decide which of the events to use */ if(temp_event != NULL) { log_debug_info(DEBUGL_CHECKS, 2, "Found another service check event for this service @ %s", ctime(&temp_event->run_time)); /* use the originally scheduled check unless we decide otherwise */ use_original_event = TRUE; /* the original event is a forced check... */ if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)) { /* the new event is also forced and its execution time is earlier than the original, so use it instead */ if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)) { use_original_event = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "New service check event is forced and occurs before the existing event, so the new event will be used instead.\n"); } } /* the original event is not a forced check... 
*/ else { /* the new event is a forced check, so use it instead */ if((options & CHECK_OPTION_FORCE_EXECUTION)) { use_original_event = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "New service check event is forced, so it will be used instead of the existing event.\n"); } /* the new event is not forced either and its execution time is earlier than the original, so use it instead */ else if(check_time < temp_event->run_time) { use_original_event = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "New service check event occurs before the existing (older) event, so it will be used instead.\n"); } /* the new event is older, so override the existing one */ else { log_debug_info(DEBUGL_CHECKS, 2, "New service check event occurs after the existing event, so we'll ignore it.\n"); } } } /* schedule a new event */ if(use_original_event == FALSE) { /* make sure we remove the old event from the queue */ if(temp_event) { remove_event(nagios_squeue, temp_event); } else { /* allocate memory for a new event item */ temp_event = (timed_event *)calloc(1, sizeof(timed_event)); if(temp_event == NULL) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Could not reschedule check of service '%s' on host '%s'!\n", svc->description, svc->host_name); return; } } log_debug_info(DEBUGL_CHECKS, 2, "Scheduling new service check event.\n"); /* set the next service check event and time */ svc->next_check_event = temp_event; svc->next_check = check_time; /* save check options for retention purposes */ svc->check_options = options; /* place the new event in the event queue */ temp_event->event_type = EVENT_SERVICE_CHECK; temp_event->event_data = (void *)svc; temp_event->event_args = (void *)NULL; temp_event->event_options = options; temp_event->run_time = svc->next_check; temp_event->recurring = FALSE; temp_event->event_interval = 0L; temp_event->timing_func = NULL; temp_event->compensate_for_time_change = TRUE; add_event(nagios_squeue, temp_event); } else { /* reset the next check time (it may be out of sync) */ 
if(temp_event != NULL) svc->next_check = temp_event->run_time; log_debug_info(DEBUGL_CHECKS, 2, "Keeping original service check event (ignoring the new one).\n"); } /* update the status log */ update_service_status(svc, FALSE); return; } /* checks viability of performing a service check */ int check_service_check_viability(service *svc, int check_options, int *time_is_valid, time_t *new_time) { int result = OK; int perform_check = TRUE; time_t current_time = 0L; time_t preferred_time = 0L; int check_interval = 0; host *temp_host = NULL; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_check_viability()\n"); /* make sure we have a service */ if(svc == NULL) return ERROR; /* get the check interval to use if we need to reschedule the check */ if(svc->state_type == SOFT_STATE && svc->current_state != STATE_OK) check_interval = (svc->retry_interval * interval_length); else check_interval = (svc->check_interval * interval_length); /* get the current time */ time(¤t_time); /* initialize the next preferred check time */ preferred_time = current_time; /* can we check the host right now? */ if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)) { /* if checks of the service are currently disabled... 
*/ if(svc->checks_enabled == FALSE) { preferred_time = current_time + check_interval; perform_check = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "Active checks of the service are currently disabled.\n"); } /* make sure this is a valid time to check the service */ if(check_time_against_period((unsigned long)current_time, svc->check_period_ptr) == ERROR) { preferred_time = current_time; if(time_is_valid) *time_is_valid = FALSE; perform_check = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "This is not a valid time for this service to be actively checked.\n"); } /* check service dependencies for execution */ if(check_service_dependencies(svc, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) { preferred_time = current_time + check_interval; perform_check = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "Execution dependencies for this service failed, so it will not be actively checked.\n"); } } /* check if parent service is OK */ if(check_service_parents(svc) == DEPENDENCIES_FAILED) { preferred_time = current_time + check_interval; perform_check = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "Execution parents for this service failed, so it will not be actively checked.\n"); } /* check if host is up - if not, do not perform check */ if(host_down_disable_service_checks) { if((temp_host = svc->host_ptr) == NULL) { log_debug_info(DEBUGL_CHECKS, 2, "Host pointer NULL in check_service_check_viability().\n"); return ERROR; } else { if(temp_host->current_state != HOST_UP) { log_debug_info(DEBUGL_CHECKS, 2, "Host state not UP, so service check will not be performed - will be rescheduled as normal.\n"); perform_check = FALSE; } } } /* pass back the next viable check time */ if(new_time) *new_time = preferred_time; result = (perform_check == TRUE) ? OK : ERROR; return result; } /* checks service parents */ int check_service_parents(service *svc) { servicesmember *temp_servicesmember = NULL; int state = STATE_OK; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_parents()\n"); /* check all parents... 
*/ for(temp_servicesmember = svc->parents; temp_servicesmember; temp_servicesmember = temp_servicesmember->next) { service *parent_service; /* find the service we depend on... */ if((parent_service = temp_servicesmember->service_ptr) == NULL) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: service '%s' on host '%s' is NULL ptr\n", temp_servicesmember->service_description, temp_servicesmember->host_name); continue; } state = parent_service->last_hard_state; /* is the service we depend on in a state that fails the dependency tests? */ if((state == STATE_CRITICAL) || (state == STATE_UNKNOWN)) return DEPENDENCIES_FAILED; if(check_service_parents(parent_service) != DEPENDENCIES_OK) return DEPENDENCIES_FAILED; } return DEPENDENCIES_OK; } /* checks service dependencies */ int check_service_dependencies(service *svc, int dependency_type) { objectlist *list; int state = STATE_OK; time_t current_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_dependencies()\n"); /* only check dependencies of the desired type */ if(dependency_type == NOTIFICATION_DEPENDENCY) list = svc->notify_deps; else list = svc->exec_deps; /* check all dependencies of the desired type... */ for(; list; list = list->next) { service *temp_service; servicedependency *temp_dependency = (servicedependency *)list->object_ptr; /* find the service we depend on... */ if((temp_service = temp_dependency->master_service_ptr) == NULL) continue; /* skip this dependency if it has a timeperiod and the current time isn't valid */ time(¤t_time); if(temp_dependency->dependency_period != NULL && check_time_against_period(current_time, temp_dependency->dependency_period_ptr) == ERROR) return FALSE; /* get the status to use (use last hard state if its currently in a soft state) */ if(temp_service->state_type == SOFT_STATE && soft_state_dependencies == FALSE) state = temp_service->last_hard_state; else state = temp_service->current_state; /* is the service we depend on in state that fails the dependency tests? 
*/ if(flag_isset(temp_dependency->failure_options, 1 << state)) return DEPENDENCIES_FAILED; /* immediate dependencies ok at this point - check parent dependencies if necessary */ if(temp_dependency->inherits_parent == TRUE) { if(check_service_dependencies(temp_service, dependency_type) != DEPENDENCIES_OK) return DEPENDENCIES_FAILED; } } return DEPENDENCIES_OK; } /* check for services that never returned from a check... */ void check_for_orphaned_services(void) { service *temp_service = NULL; time_t current_time = 0L; time_t expected_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_for_orphaned_services()\n"); /* get the current time */ time(¤t_time); /* check all services... */ for(temp_service = service_list; temp_service != NULL; temp_service = temp_service->next) { /* skip services that are not currently executing */ if(temp_service->is_executing == FALSE) continue; /* determine the time at which the check results should have come in (allow 10 minutes slack time) */ expected_time = (time_t)(temp_service->next_check + temp_service->latency + service_check_timeout + check_reaper_interval + 600); /* this service was supposed to have executed a while ago, but for some reason the results haven't come back in... */ if(expected_time < current_time) { /* log a warning */ logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of service '%s' on host '%s' looks like it was orphaned (results never came back; last_check=%lu; next_check=%lu). 
I'm scheduling an immediate check of the service...\n", temp_service->description, temp_service->host_name, temp_service->last_check, temp_service->next_check); log_debug_info(DEBUGL_CHECKS, 1, "Service '%s' on host '%s' was orphaned, so we're scheduling an immediate check...\n", temp_service->description, temp_service->host_name); log_debug_info(DEBUGL_CHECKS, 1, " next_check=%lu (%s); last_check=%lu (%s);\n", temp_service->next_check, ctime(&temp_service->next_check), temp_service->last_check, ctime(&temp_service->last_check)); /* decrement the number of running service checks */ if(currently_running_service_checks > 0) currently_running_service_checks--; /* disable the executing flag */ temp_service->is_executing = FALSE; /* schedule an immediate check of the service */ schedule_service_check(temp_service, current_time, CHECK_OPTION_ORPHAN_CHECK); } } return; } /* check freshness of service results */ void check_service_result_freshness(void) { service *temp_service = NULL; time_t current_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_result_freshness()\n"); log_debug_info(DEBUGL_CHECKS, 1, "Checking the freshness of service check results...\n"); /* bail out if we're not supposed to be checking freshness */ if(check_service_freshness == FALSE) { log_debug_info(DEBUGL_CHECKS, 1, "Service freshness checking is disabled.\n"); return; } /* get the current time */ time(¤t_time); /* check all services... 
*/ for(temp_service = service_list; temp_service != NULL; temp_service = temp_service->next) { /* skip services we shouldn't be checking for freshness */ if(temp_service->check_freshness == FALSE) continue; /* skip services that are currently executing (problems here will be caught by orphaned service check) */ if(temp_service->is_executing == TRUE) continue; /* skip services that have both active and passive checks disabled */ if(temp_service->checks_enabled == FALSE && temp_service->accept_passive_checks == FALSE) continue; /* skip services that are already being freshened */ if(temp_service->is_being_freshened == TRUE) continue; /* see if the time is right... */ if(check_time_against_period(current_time, temp_service->check_period_ptr) == ERROR) continue; /* EXCEPTION */ /* don't check freshness of services without regular check intervals if we're using auto-freshness threshold */ if(temp_service->check_interval == 0 && temp_service->freshness_threshold == 0) continue; /* the results for the last check of this service are stale! */ if(is_service_result_fresh(temp_service, current_time, TRUE) == FALSE) { /* set the freshen flag */ temp_service->is_being_freshened = TRUE; /* schedule an immediate forced check of the service */ schedule_service_check(temp_service, current_time, CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK); } } return; } /* tests whether or not a service's check results are fresh */ int is_service_result_fresh(service *temp_service, time_t current_time, int log_this) { int freshness_threshold = 0; time_t expiration_time = 0L; int days = 0; int hours = 0; int minutes = 0; int seconds = 0; int tdays = 0; int thours = 0; int tminutes = 0; int tseconds = 0; log_debug_info(DEBUGL_CHECKS, 2, "Checking freshness of service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name); /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? 
*/ if(temp_service->freshness_threshold == 0) { if(temp_service->state_type == HARD_STATE || temp_service->current_state == STATE_OK) freshness_threshold = (temp_service->check_interval * interval_length) + temp_service->latency + additional_freshness_latency; else freshness_threshold = (temp_service->retry_interval * interval_length) + temp_service->latency + additional_freshness_latency; } else freshness_threshold = temp_service->freshness_threshold; log_debug_info(DEBUGL_CHECKS, 2, "Freshness thresholds: service=%d, use=%d\n", temp_service->freshness_threshold, freshness_threshold); /* calculate expiration time */ /* * CHANGED 11/10/05 EG - * program start is only used in expiration time calculation * if > last check AND active checks are enabled, so active checks * can become stale immediately upon program startup */ /* * CHANGED 02/25/06 SG - * passive checks also become stale, so remove dependence on active * check logic */ if(temp_service->has_been_checked == FALSE) expiration_time = (time_t)(event_start + freshness_threshold); /* * CHANGED 06/19/07 EG - * Per Ton's suggestion (and user requests), only use program start * time over last check if no specific threshold has been set by user. * Problems can occur if Nagios is restarted more frequently that * freshness threshold intervals (services never go stale). */ /* * CHANGED 10/07/07 EG: * Only match next condition for services that * have active checks enabled... 
*/ /* * CHANGED 10/07/07 EG: * Added max_service_check_spread to expiration time as suggested * by Altinity */ else if(temp_service->checks_enabled == TRUE && event_start > temp_service->last_check && temp_service->freshness_threshold == 0) expiration_time = (time_t)(event_start + freshness_threshold + (max_service_check_spread * interval_length)); else expiration_time = (time_t)(temp_service->last_check + freshness_threshold); /* * If the check was last done passively, we assume it's going * to continue that way and we need to handle the fact that * Nagios might have been shut off for quite a long time. If so, * we mustn't spam freshness notifications but use event_start * instead of last_check to determine freshness expiration time. * The threshold for "long time" is determined as 61.8% of the normal * freshness threshold based on vast heuristical research (ie, "some * guy once told me the golden ratio is good for loads of stuff"). */ if (temp_service->check_type == CHECK_TYPE_PASSIVE) { if (temp_service->last_check < event_start && event_start - last_program_stop > freshness_threshold * 0.618) { expiration_time = event_start + freshness_threshold; } } log_debug_info(DEBUGL_CHECKS, 2, "HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n", temp_service->has_been_checked, (unsigned long)program_start, (unsigned long)event_start, (unsigned long)temp_service->last_check, (unsigned long)current_time, (unsigned long)expiration_time); /* the results for the last check of this service are stale */ if(expiration_time < current_time) { get_time_breakdown((current_time - expiration_time), &days, &hours, &minutes, &seconds); get_time_breakdown(freshness_threshold, &tdays, &thours, &tminutes, &tseconds); /* log a warning */ if(log_this == TRUE) logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The results of service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). 
I'm forcing an immediate check of the service.\n", temp_service->description, temp_service->host_name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds); log_debug_info(DEBUGL_CHECKS, 1, "Check results for service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). Forcing an immediate check of the service...\n", temp_service->description, temp_service->host_name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds); return FALSE; } log_debug_info(DEBUGL_CHECKS, 1, "Check results for service '%s' on host '%s' are fresh.\n", temp_service->description, temp_service->host_name); return TRUE; } /******************************************************************/ /*************** COMMON ROUTE/HOST CHECK FUNCTIONS ****************/ /******************************************************************/ /* schedules an immediate or delayed host check */ void schedule_host_check(host *hst, time_t check_time, int options) { timed_event *temp_event = NULL; int use_original_event = TRUE; log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_host_check()\n"); if(hst == NULL) return; log_debug_info(DEBUGL_CHECKS, 0, "Scheduling a %s, active check of host '%s' @ %s", (options & CHECK_OPTION_FORCE_EXECUTION) ? 
"forced" : "non-forced", hst->name, ctime(&check_time)); /* don't schedule a check if active checks of this host are disabled */ if(hst->checks_enabled == FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)) { log_debug_info(DEBUGL_CHECKS, 0, "Active checks are disabled for this host.\n"); return; } if (options == CHECK_OPTION_DEPENDENCY_CHECK) { if (hst->last_check + cached_host_check_horizon > check_time) { log_debug_info(DEBUGL_CHECKS, 0, "Last check result is recent enough (%s)\n", ctime(&hst->last_check)); return; } } /* default is to use the new event */ use_original_event = FALSE; temp_event = (timed_event *)hst->next_check_event; /* * If the host already had a check scheduled we need * to decide which check event to use */ if(temp_event != NULL) { log_debug_info(DEBUGL_CHECKS, 2, "Found another host check event for this host @ %s", ctime(&temp_event->run_time)); /* use the originally scheduled check unless we decide otherwise */ use_original_event = TRUE; /* the original event is a forced check... */ if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)) { /* the new event is also forced and its execution time is earlier than the original, so use it instead */ if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)) { log_debug_info(DEBUGL_CHECKS, 2, "New host check event is forced and occurs before the existing event, so the new event be used instead.\n"); use_original_event = FALSE; } } /* the original event is not a forced check... 
*/ else { /* the new event is a forced check, so use it instead */ if((options & CHECK_OPTION_FORCE_EXECUTION)) { use_original_event = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "New host check event is forced, so it will be used instead of the existing event.\n"); } /* the new event is not forced either and its execution time is earlier than the original, so use it instead */ else if(check_time < temp_event->run_time) { use_original_event = FALSE; log_debug_info(DEBUGL_CHECKS, 2, "New host check event occurs before the existing (older) event, so it will be used instead.\n"); } /* the new event is older, so override the existing one */ else { log_debug_info(DEBUGL_CHECKS, 2, "New host check event occurs after the existing event, so we'll ignore it.\n"); } } } /* use the new event */ if(use_original_event == FALSE) { log_debug_info(DEBUGL_CHECKS, 2, "Scheduling new host check event.\n"); /* possibly allocate memory for a new event item */ if (temp_event) { remove_event(nagios_squeue, temp_event); } else if((temp_event = (timed_event *)calloc(1, sizeof(timed_event))) == NULL) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Could not reschedule check of host '%s'!\n", hst->name); return; } /* set the next host check event and time */ hst->next_check_event = temp_event; hst->next_check = check_time; /* save check options for retention purposes */ hst->check_options = options; /* place the new event in the event queue */ temp_event->event_type = EVENT_HOST_CHECK; temp_event->event_data = (void *)hst; temp_event->event_args = (void *)NULL; temp_event->event_options = options; temp_event->run_time = hst->next_check; temp_event->recurring = FALSE; temp_event->event_interval = 0L; temp_event->timing_func = NULL; temp_event->compensate_for_time_change = TRUE; add_event(nagios_squeue, temp_event); } else { /* reset the next check time (it may be out of sync) */ if(temp_event != NULL) hst->next_check = temp_event->run_time; log_debug_info(DEBUGL_CHECKS, 2, "Keeping original host 
check event (ignoring the new one).\n"); } /* update the status log */ update_host_status(hst, FALSE); return; } /* checks host dependencies */ int check_host_dependencies(host *hst, int dependency_type) { hostdependency *temp_dependency = NULL; objectlist *list; host *temp_host = NULL; int state = HOST_UP; time_t current_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_dependencies()\n"); if (dependency_type == NOTIFICATION_DEPENDENCY) { list = hst->notify_deps; } else { list = hst->exec_deps; } /* check all dependencies... */ for(; list; list = list->next) { temp_dependency = (hostdependency *)list->object_ptr; /* find the host we depend on... */ if((temp_host = temp_dependency->master_host_ptr) == NULL) continue; /* skip this dependency if it has a timeperiod and the current time isn't valid */ time(¤t_time); if(temp_dependency->dependency_period != NULL && check_time_against_period(current_time, temp_dependency->dependency_period_ptr) == ERROR) return FALSE; /* get the status to use (use last hard state if its currently in a soft state) */ if(temp_host->state_type == SOFT_STATE && soft_state_dependencies == FALSE) state = temp_host->last_hard_state; else state = temp_host->current_state; /* is the host we depend on in state that fails the dependency tests? */ if(flag_isset(temp_dependency->failure_options, 1 << state)) return DEPENDENCIES_FAILED; /* immediate dependencies ok at this point - check parent dependencies if necessary */ if(temp_dependency->inherits_parent == TRUE) { if(check_host_dependencies(temp_host, dependency_type) != DEPENDENCIES_OK) return DEPENDENCIES_FAILED; } } return DEPENDENCIES_OK; } /* check for hosts that never returned from a check... */ void check_for_orphaned_hosts(void) { host *temp_host = NULL; time_t current_time = 0L; time_t expected_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_for_orphaned_hosts()\n"); /* get the current time */ time(¤t_time); /* check all hosts... 
*/ for(temp_host = host_list; temp_host != NULL; temp_host = temp_host->next) { /* skip hosts that don't have a set check interval (on-demand checks are missed by the orphan logic) */ if(temp_host->next_check == (time_t)0L) continue; /* skip hosts that are not currently executing */ if(temp_host->is_executing == FALSE) continue; /* determine the time at which the check results should have come in (allow 10 minutes slack time) */ expected_time = (time_t)(temp_host->next_check + temp_host->latency + host_check_timeout + check_reaper_interval + 600); /* this host was supposed to have executed a while ago, but for some reason the results haven't come back in... */ if(expected_time < current_time) { /* log a warning */ logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of host '%s' looks like it was orphaned (results never came back). I'm scheduling an immediate check of the host...\n", temp_host->name); log_debug_info(DEBUGL_CHECKS, 1, "Host '%s' was orphaned, so we're scheduling an immediate check...\n", temp_host->name); /* decrement the number of running host checks */ if(currently_running_host_checks > 0) currently_running_host_checks--; /* disable the executing flag */ temp_host->is_executing = FALSE; /* schedule an immediate check of the host */ schedule_host_check(temp_host, current_time, CHECK_OPTION_ORPHAN_CHECK); } } return; } /* check freshness of host results */ void check_host_result_freshness(void) { host *temp_host = NULL; time_t current_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_result_freshness()\n"); log_debug_info(DEBUGL_CHECKS, 2, "Attempting to check the freshness of host check results...\n"); /* bail out if we're not supposed to be checking freshness */ if(check_host_freshness == FALSE) { log_debug_info(DEBUGL_CHECKS, 2, "Host freshness checking is disabled.\n"); return; } /* get the current time */ time(¤t_time); /* check all hosts... 
*/ for(temp_host = host_list; temp_host != NULL; temp_host = temp_host->next) { /* skip hosts we shouldn't be checking for freshness */ if(temp_host->check_freshness == FALSE) continue; /* skip hosts that have both active and passive checks disabled */ if(temp_host->checks_enabled == FALSE && temp_host->accept_passive_checks == FALSE) continue; /* skip hosts that are currently executing (problems here will be caught by orphaned host check) */ if(temp_host->is_executing == TRUE) continue; /* skip hosts that are already being freshened */ if(temp_host->is_being_freshened == TRUE) continue; /* see if the time is right... */ if(check_time_against_period(current_time, temp_host->check_period_ptr) == ERROR) continue; /* the results for the last check of this host are stale */ if(is_host_result_fresh(temp_host, current_time, TRUE) == FALSE) { /* set the freshen flag */ temp_host->is_being_freshened = TRUE; /* schedule an immediate forced check of the host */ schedule_host_check(temp_host, current_time, CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK); } } return; } /* checks to see if a hosts's check results are fresh */ int is_host_result_fresh(host *temp_host, time_t current_time, int log_this) { time_t expiration_time = 0L; int freshness_threshold = 0; int days = 0; int hours = 0; int minutes = 0; int seconds = 0; int tdays = 0; int thours = 0; int tminutes = 0; int tseconds = 0; double interval = 0; log_debug_info(DEBUGL_CHECKS, 2, "Checking freshness of host '%s'...\n", temp_host->name); /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? 
*/ if(temp_host->freshness_threshold == 0) { if(temp_host->state_type == HARD_STATE || temp_host->current_state == STATE_OK) { interval = temp_host->check_interval; } else { interval = temp_host->retry_interval; } freshness_threshold = (interval * interval_length) + temp_host->latency + additional_freshness_latency; } else freshness_threshold = temp_host->freshness_threshold; log_debug_info(DEBUGL_CHECKS, 2, "Freshness thresholds: host=%d, use=%d\n", temp_host->freshness_threshold, freshness_threshold); /* calculate expiration time */ /* * CHANGED 11/10/05 EG: * program start is only used in expiration time calculation * if > last check AND active checks are enabled, so active checks * can become stale immediately upon program startup */ if(temp_host->has_been_checked == FALSE) expiration_time = (time_t)(event_start + freshness_threshold); /* * CHANGED 06/19/07 EG: * Per Ton's suggestion (and user requests), only use program start * time over last check if no specific threshold has been set by user. * Problems can occur if Nagios is restarted more frequently that * freshness threshold intervals (hosts never go stale). */ /* * CHANGED 10/07/07 EG: * Added max_host_check_spread to expiration time as suggested by * Altinity */ else if(temp_host->checks_enabled == TRUE && event_start > temp_host->last_check && temp_host->freshness_threshold == 0) expiration_time = (time_t)(event_start + freshness_threshold + (max_host_check_spread * interval_length)); else expiration_time = (time_t)(temp_host->last_check + freshness_threshold); /* * If the check was last done passively, we assume it's going * to continue that way and we need to handle the fact that * Nagios might have been shut off for quite a long time. If so, * we mustn't spam freshness notifications but use event_start * instead of last_check to determine freshness expiration time. 
* The threshold for "long time" is determined as 61.8% of the normal * freshness threshold based on vast heuristical research (ie, "some * guy once told me the golden ratio is good for loads of stuff"). */ if (temp_host->check_type == CHECK_TYPE_PASSIVE) { if (temp_host->last_check < event_start && event_start - last_program_stop > freshness_threshold * 0.618) { expiration_time = event_start + freshness_threshold; } } log_debug_info(DEBUGL_CHECKS, 2, "HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n", temp_host->has_been_checked, (unsigned long)program_start, (unsigned long)event_start, (unsigned long)temp_host->last_check, (unsigned long)current_time, (unsigned long)expiration_time); /* the results for the last check of this host are stale */ if(expiration_time < current_time) { get_time_breakdown((current_time - expiration_time), &days, &hours, &minutes, &seconds); get_time_breakdown(freshness_threshold, &tdays, &thours, &tminutes, &tseconds); /* log a warning */ if(log_this == TRUE) logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The results of host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). I'm forcing an immediate check of the host.\n", temp_host->name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds); log_debug_info(DEBUGL_CHECKS, 1, "Check results for host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). 
Forcing an immediate check of the host...\n", temp_host->name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds); return FALSE; } else log_debug_info(DEBUGL_CHECKS, 1, "Check results for host '%s' are fresh.\n", temp_host->name); return TRUE; } /* run a scheduled host check asynchronously */ int run_scheduled_host_check(host *hst, int check_options, double latency) { int result = OK; time_t current_time = 0L; time_t preferred_time = 0L; time_t next_valid_time = 0L; int time_is_valid = TRUE; log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_host_check()\n"); if(hst == NULL) return ERROR; log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of host '%s': check options=%d, latency=%lf\n", hst->name, check_options, latency); /* * reset the next_check_event so we know this host * check is no longer in the scheduling queue */ hst->next_check_event = NULL; /* attempt to run the check */ result = run_async_host_check(hst, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time); /* an error occurred, so reschedule the check */ if(result == ERROR) { log_debug_info(DEBUGL_CHECKS, 1, "Unable to run scheduled host check at this time\n"); /* only attempt to (re)schedule checks that should get checked... */ if(hst->should_be_scheduled == TRUE) { /* get current time */ time(¤t_time); /* determine next time we should check the host if needed */ /* if host has no check interval, schedule it again for 5 minutes from now */ if(current_time >= preferred_time) preferred_time = current_time + ((hst->check_interval <= 0) ? 
300 : (hst->check_interval * interval_length)); /* make sure we rescheduled the next host check at a valid time */ get_next_valid_time(preferred_time, &next_valid_time, hst->check_period_ptr); /* * If the host really can't be rescheduled properly we * set next check time to preferred_time and try again then */ if(time_is_valid == FALSE && check_time_against_period(next_valid_time, hst->check_period_ptr) == ERROR) { hst->next_check = preferred_time + ranged_urand(0, check_window(hst)); logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of host '%s' could not be rescheduled properly. Scheduling check for %s...\n", hst->name, ctime(&preferred_time)); log_debug_info(DEBUGL_CHECKS, 1, "Unable to find any valid times to reschedule the next host check!\n"); } /* this service could be rescheduled... */ else { hst->next_check = next_valid_time; if(next_valid_time > preferred_time) { /* Next valid time is further in the future because of * timeperiod constraints. Add a random amount so we * don't get all checks subject to that timeperiod * constraint scheduled at the same time */ hst->next_check += ranged_urand(0, check_window(hst)); } hst->should_be_scheduled = TRUE; log_debug_info(DEBUGL_CHECKS, 1, "Rescheduled next host check for %s", ctime(&next_valid_time)); } } /* update the status log */ update_host_status(hst, FALSE); /* reschedule the next host check - unless we couldn't find a valid next check time */ /* 10/19/07 EG - keep original check options */ if(hst->should_be_scheduled == TRUE) schedule_host_check(hst, hst->next_check, check_options); return ERROR; } return OK; } /* perform an asynchronous check of a host */ /* scheduled host checks will use this, as will some checks that result from on-demand checks... 
*/ int run_async_host_check(host *hst, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time) { nagios_macros mac; char *raw_command = NULL; char *processed_command = NULL; struct timeval start_time, end_time; double old_latency = 0.0; check_result *cr; int runchk_result = OK; int macro_options = STRIP_ILLEGAL_MACRO_CHARS | ESCAPE_MACRO_CHARS; #ifdef USE_EVENT_BROKER int neb_result = OK; #endif log_debug_info(DEBUGL_FUNCTIONS, 0, "run_async_host_check(%s ...)\n", hst ? hst->name : "(NULL host!)"); /* make sure we have a host */ if(hst == NULL) return ERROR; log_debug_info(DEBUGL_CHECKS, 0, "** Running async check of host '%s'...\n", hst->name); /* abort if check is already running or was recently completed */ if (!(check_options & CHECK_OPTION_FORCE_EXECUTION)) { if(hst->is_executing == TRUE) { log_debug_info(DEBUGL_CHECKS, 1, "A check of this host is already being executed, so we'll pass for the moment...\n"); return ERROR; } if (hst->last_check + cached_host_check_horizon > time(NULL)) { log_debug_info(DEBUGL_CHECKS, 0, "Host '%s' was last checked within its cache horizon. Aborting check\n", hst->name); return ERROR; } } log_debug_info(DEBUGL_CHECKS, 0, "Host '%s' passed first hurdle (caching/execution)\n", hst->name); /* is the host check viable at this time? 
*/ if(check_host_check_viability(hst, check_options, time_is_valid, preferred_time) == ERROR) { log_debug_info(DEBUGL_CHECKS, 0, "Host check isn't viable at this point.\n"); return ERROR; } /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/ #ifdef USE_EVENT_BROKER /* initialize start/end times */ start_time.tv_sec = 0L; start_time.tv_usec = 0L; end_time.tv_sec = 0L; end_time.tv_usec = 0L; /* send data to event broker */ neb_result = broker_host_check(NEBTYPE_HOSTCHECK_ASYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, hst, CHECK_TYPE_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL, NULL); if(neb_result == NEBERROR_CALLBACKCANCEL || neb_result == NEBERROR_CALLBACKOVERRIDE) { log_debug_info(DEBUGL_CHECKS, 0, "Check of host '%s' (id=%u) was %s by a module\n", hst->name, hst->id, neb_result == NEBERROR_CALLBACKCANCEL ? "cancelled" : "overridden"); } /* neb module wants to cancel the host check - the check will be rescheduled for a later time by the scheduling logic */ if(neb_result == NEBERROR_CALLBACKCANCEL) { if(preferred_time) *preferred_time += check_window(hst); return ERROR; } /* neb module wants to override the host check - perhaps it will check the host itself */ /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! 
*/ /* NOTE: if would be easier for modules to override checks when the NEBTYPE_SERVICECHECK_INITIATE event is called (later) */ if(neb_result == NEBERROR_CALLBACKOVERRIDE) return OK; #endif log_debug_info(DEBUGL_CHECKS, 0, "Checking host '%s'...\n", hst->name); /* clear check options - we don't want old check options retained */ /* only clear options if this was a scheduled check - on demand check options shouldn't affect retained info */ if(scheduled_check == TRUE) hst->check_options = CHECK_OPTION_NONE; /* adjust host check attempt */ adjust_host_check_attempt(hst, TRUE); /* set latency (temporarily) for macros and event broker */ old_latency = hst->latency; hst->latency = latency; /* grab the host macro variables */ memset(&mac, 0, sizeof(mac)); grab_host_macros_r(&mac, hst); /* get the raw command line */ get_raw_command_line_r(&mac, hst->check_command_ptr, hst->check_command, &raw_command, macro_options); if(raw_command == NULL) { clear_volatile_macros_r(&mac); log_debug_info(DEBUGL_CHECKS, 0, "Raw check command for host '%s' was NULL - aborting.\n", hst->name); return ERROR; } /* process any macros contained in the argument */ process_macros_r(&mac, raw_command, &processed_command, macro_options); my_free(raw_command); if(processed_command == NULL) { clear_volatile_macros_r(&mac); log_debug_info(DEBUGL_CHECKS, 0, "Processed check command for host '%s' was NULL - aborting.\n", hst->name); return ERROR; } /* get the command start time */ gettimeofday(&start_time, NULL); cr = calloc(1, sizeof(*cr)); if (!cr) { log_debug_info(DEBUGL_CHECKS, 0, "Failed to allocate checkresult struct\n"); clear_volatile_macros_r(&mac); clear_host_macros_r(&mac); return ERROR; } init_check_result(cr); /* save check info */ cr->object_check_type = HOST_CHECK; cr->host_name = (char *)strdup(hst->name); cr->service_description = NULL; cr->check_type = CHECK_TYPE_ACTIVE; cr->check_options = check_options; cr->scheduled_check = scheduled_check; cr->reschedule_check = reschedule_check; 
cr->latency = latency; cr->start_time = start_time; cr->finish_time = start_time; cr->early_timeout = FALSE; cr->exited_ok = TRUE; cr->return_code = STATE_OK; cr->output = NULL; #ifdef USE_EVENT_BROKER /* send data to event broker */ neb_result = broker_host_check(NEBTYPE_HOSTCHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, hst, CHECK_TYPE_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, processed_command, NULL, NULL, NULL, NULL, cr); /* neb module wants to override the service check - perhaps it will check the service itself */ if (neb_result == NEBERROR_CALLBACKOVERRIDE) { clear_volatile_macros_r(&mac); hst->latency = old_latency; free_check_result(cr); my_free(processed_command); return OK; } #endif /* reset latency (permanent value for this check will get set later) */ hst->latency = old_latency; runchk_result = wproc_run_check(cr, processed_command, &mac); if (runchk_result == ERROR) { logit(NSLOG_RUNTIME_ERROR, TRUE, "Unable to send check for host '%s' to worker (ret=%d)\n", hst->name, runchk_result); } else { /* do the book-keeping */ currently_running_host_checks++; hst->is_executing = TRUE; update_check_stats((scheduled_check == TRUE) ? 
ACTIVE_SCHEDULED_HOST_CHECK_STATS : ACTIVE_ONDEMAND_HOST_CHECK_STATS, start_time.tv_sec); update_check_stats(PARALLEL_HOST_CHECK_STATS, start_time.tv_sec); } /* free memory */ clear_volatile_macros_r(&mac); my_free(processed_command); return runchk_result; } static int get_host_check_return_code(host *temp_host, check_result *queued_check_result) { int rc; char *temp_plugin_output = NULL; log_debug_info(DEBUGL_FUNCTIONS, 0, "get_host_check_return_code()\n"); /* get the unprocessed return code */ /* NOTE: for passive checks, this is the final/processed state */ rc = queued_check_result->return_code; /* adjust return code (active checks only) */ if(queued_check_result->check_type == CHECK_TYPE_ACTIVE) { if(queued_check_result->early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of host '%s' timed out after %.2lf seconds\n", temp_host->name, temp_host->execution_time); my_free(temp_host->plugin_output); my_free(temp_host->long_plugin_output); my_free(temp_host->perf_data); asprintf(&temp_host->plugin_output, "(Host check timed out after %.2lf seconds)", temp_host->execution_time); rc = STATE_UNKNOWN; } /* if there was some error running the command, just skip it (this shouldn't be happening) */ else if(queued_check_result->exited_ok == FALSE) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of host '%s' did not exit properly!\n", temp_host->name); my_free(temp_host->plugin_output); my_free(temp_host->long_plugin_output); my_free(temp_host->perf_data); temp_host->plugin_output = (char *)strdup("(Host check did not exit properly)"); rc = STATE_CRITICAL; } /* make sure the return code is within bounds */ else if(queued_check_result->return_code < 0 || queued_check_result->return_code > 3) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Return code of %d for check of host '%s' was out of bounds.%s\n", queued_check_result->return_code, temp_host->name, (queued_check_result->return_code == 126 || queued_check_result->return_code == 127) ? 
" Make sure the plugin you're trying to run actually exists." : ""); asprintf(&temp_plugin_output, "(Return code of %d is out of bounds%s : %s)", queued_check_result->return_code, (queued_check_result->return_code == 126 || queued_check_result->return_code == 127) ? " - plugin may be missing" : "", temp_host->plugin_output); my_free(temp_host->plugin_output); asprintf(&temp_host->plugin_output, "%s)", temp_plugin_output); my_free(temp_plugin_output); my_free(temp_host->long_plugin_output); my_free(temp_host->perf_data); rc = STATE_CRITICAL; } /* a NULL host check command means we should assume the host is UP */ if(temp_host->check_command == NULL) { my_free(temp_host->plugin_output); temp_host->plugin_output = (char *)strdup("(Host assumed to be UP)"); rc = STATE_OK; } } return rc; } /* process results of an asynchronous host check */ int handle_async_host_check_result(host *temp_host, check_result *queued_check_result) { time_t current_time; int result = STATE_OK; int reschedule_check = FALSE; char *old_plugin_output = NULL; char *temp_ptr = NULL; struct timeval start_time_hires; struct timeval end_time_hires; log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_host_check_result(%s ...)\n", temp_host ? temp_host->name : "(NULL host!)"); /* make sure we have what we need */ if(temp_host == NULL || queued_check_result == NULL) return ERROR; time(¤t_time); log_debug_info(DEBUGL_CHECKS, 1, "** Handling async check result for host '%s' from '%s'...\n", temp_host->name, check_result_source(queued_check_result)); log_debug_info(DEBUGL_CHECKS, 2, "\tCheck Type: %s\n", (queued_check_result->check_type == CHECK_TYPE_ACTIVE) ? "Active" : "Passive"); log_debug_info(DEBUGL_CHECKS, 2, "\tCheck Options: %d\n", queued_check_result->check_options); log_debug_info(DEBUGL_CHECKS, 2, "\tScheduled Check?: %s\n", (queued_check_result->scheduled_check == TRUE) ? "Yes" : "No"); log_debug_info(DEBUGL_CHECKS, 2, "\tReschedule Check?: %s\n", (queued_check_result->reschedule_check == TRUE) ? 
"Yes" : "No"); log_debug_info(DEBUGL_CHECKS, 2, "\tExited OK?: %s\n", (queued_check_result->exited_ok == TRUE) ? "Yes" : "No"); log_debug_info(DEBUGL_CHECKS, 2, "\tExec Time: %.3f\n", temp_host->execution_time); log_debug_info(DEBUGL_CHECKS, 2, "\tLatency: %.3f\n", temp_host->latency); log_debug_info(DEBUGL_CHECKS, 2, "\tReturn Status: %d\n", queued_check_result->return_code); log_debug_info(DEBUGL_CHECKS, 2, "\tOutput: %s\n", (queued_check_result == NULL) ? "NULL" : queued_check_result->output); /* decrement the number of host checks still out there... */ if(queued_check_result->check_type == CHECK_TYPE_ACTIVE && currently_running_host_checks > 0) currently_running_host_checks--; /* skip this host check results if its passive and we aren't accepting passive check results */ if(queued_check_result->check_type == CHECK_TYPE_PASSIVE) { if(accept_passive_host_checks == FALSE) { log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive host check result because passive host checks are disabled globally.\n"); return ERROR; } if(temp_host->accept_passive_checks == FALSE) { log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive host check result because passive checks are disabled for this host.\n"); return ERROR; } } /* clear the freshening flag (it would have been set if this host was determined to be stale) */ if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) temp_host->is_being_freshened = FALSE; /* DISCARD INVALID FRESHNESS CHECK RESULTS */ /* If a host goes stale, Nagios will initiate a forced check in order to freshen it. There is a race condition whereby a passive check could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here. This would make the host fresh again, so we do a quick check to make sure the host is still stale before we accept the check result. 
*/ if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_host_result_fresh(temp_host, current_time, FALSE) == TRUE) { log_debug_info(DEBUGL_CHECKS, 0, "Discarding host freshness check result because the host is currently fresh (race condition avoided).\n"); return OK; } /* was this check passive or active? */ temp_host->check_type = (queued_check_result->check_type == CHECK_TYPE_ACTIVE) ? CHECK_TYPE_ACTIVE : CHECK_TYPE_PASSIVE; /* update check statistics for passive results */ if(queued_check_result->check_type == CHECK_TYPE_PASSIVE) update_check_stats(PASSIVE_HOST_CHECK_STATS, queued_check_result->start_time.tv_sec); /* should we reschedule the next check of the host? NOTE: this might be overridden later... */ reschedule_check = queued_check_result->reschedule_check; /* check latency is passed to us for both active and passive checks */ temp_host->latency = queued_check_result->latency; /* update the execution time for this check (millisecond resolution) */ temp_host->execution_time = (double)((double)(queued_check_result->finish_time.tv_sec - queued_check_result->start_time.tv_sec) + (double)((queued_check_result->finish_time.tv_usec - queued_check_result->start_time.tv_usec) / 1000.0) / 1000.0); if(temp_host->execution_time < 0.0) temp_host->execution_time = 0.0; /* set the checked flag */ temp_host->has_been_checked = TRUE; /* clear the execution flag if this was an active check */ if(queued_check_result->check_type == CHECK_TYPE_ACTIVE) temp_host->is_executing = FALSE; /* get the last check time */ temp_host->last_check = queued_check_result->start_time.tv_sec; /* was this check passive or active? */ temp_host->check_type = (queued_check_result->check_type == CHECK_TYPE_ACTIVE) ? 
CHECK_TYPE_ACTIVE : CHECK_TYPE_PASSIVE; /* save the old host state */ temp_host->last_state = temp_host->current_state; if(temp_host->state_type == HARD_STATE) temp_host->last_hard_state = temp_host->current_state; /* save old plugin output */ if(temp_host->plugin_output) old_plugin_output = (char *)strdup(temp_host->plugin_output); /* clear the old plugin output and perf data buffers */ my_free(temp_host->plugin_output); my_free(temp_host->long_plugin_output); my_free(temp_host->perf_data); /* parse check output to get: (1) short output, (2) long output, (3) perf data */ parse_check_output(queued_check_result->output, &temp_host->plugin_output, &temp_host->long_plugin_output, &temp_host->perf_data, TRUE, FALSE); /* make sure we have some data */ if(temp_host->plugin_output == NULL || !strcmp(temp_host->plugin_output, "")) { my_free(temp_host->plugin_output); temp_host->plugin_output = (char *)strdup("(No output returned from host check)"); } /* replace semicolons in plugin output (but not performance data) with colons */ if((temp_ptr = temp_host->plugin_output)) { while((temp_ptr = strchr(temp_ptr, ';'))) * temp_ptr = ':'; } log_debug_info(DEBUGL_CHECKS, 2, "Parsing check output...\n"); log_debug_info(DEBUGL_CHECKS, 2, "Short Output: %s\n", (temp_host->plugin_output == NULL) ? "NULL" : temp_host->plugin_output); log_debug_info(DEBUGL_CHECKS, 2, "Long Output: %s\n", (temp_host->long_plugin_output == NULL) ? "NULL" : temp_host->long_plugin_output); log_debug_info(DEBUGL_CHECKS, 2, "Perf Data: %s\n", (temp_host->perf_data == NULL) ? 
"NULL" : temp_host->perf_data); /* get the check return code */ result = get_host_check_return_code(temp_host, queued_check_result); /* translate return code to basic UP/DOWN state - the DOWN/UNREACHABLE state determination is made later */ /* NOTE: only do this for active checks - passive check results already have the final state */ if(queued_check_result->check_type == CHECK_TYPE_ACTIVE) { /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */ if(use_aggressive_host_checking == FALSE && result == STATE_WARNING) result = STATE_OK; /* OK states means the host is UP */ if(result == STATE_OK) result = HOST_UP; /* any problem state indicates the host is not UP */ else result = HOST_DOWN; } /******************* PROCESS THE CHECK RESULTS ******************/ /* process the host check result */ process_host_check_result(temp_host, result, old_plugin_output, CHECK_OPTION_NONE, reschedule_check, TRUE, cached_host_check_horizon); /* free memory */ my_free(old_plugin_output); log_debug_info(DEBUGL_CHECKS, 1, "** Async check result for host '%s' handled: new state=%d\n", temp_host->name, temp_host->current_state); /* high resolution start time for event broker */ start_time_hires = queued_check_result->start_time; /* high resolution end time for event broker */ gettimeofday(&end_time_hires, NULL); #ifdef USE_EVENT_BROKER /* send data to event broker */ broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, temp_host, temp_host->check_type, temp_host->current_state, temp_host->state_type, start_time_hires, end_time_hires, temp_host->check_command, temp_host->latency, temp_host->execution_time, host_check_timeout, queued_check_result->early_timeout, queued_check_result->return_code, NULL, temp_host->plugin_output, temp_host->long_plugin_output, temp_host->perf_data, NULL, queued_check_result); #endif return OK; } /* processes the result of a synchronous or asynchronous host check */ int 
process_host_check_result(host *hst, int new_state, char *old_plugin_output, int check_options, int reschedule_check, int use_cached_result, unsigned long check_timestamp_horizon) { hostsmember *temp_hostsmember = NULL; host *child_host = NULL; host *parent_host = NULL; host *master_host = NULL; time_t current_time = 0L; time_t next_check = 0L; time_t preferred_time = 0L; time_t next_valid_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "process_host_check_result()\n"); log_debug_info(DEBUGL_CHECKS, 1, "HOST: %s, ATTEMPT=%d/%d, CHECK TYPE=%s, STATE TYPE=%s, OLD STATE=%d, NEW STATE=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->check_type == CHECK_TYPE_ACTIVE) ? "ACTIVE" : "PASSIVE", (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state, new_state); /* get the current time */ time(¤t_time); /* default next check time */ next_check = current_time + normal_check_window(hst); /* we have to adjust current attempt # for passive checks, as it isn't done elsewhere */ if(hst->check_type == CHECK_TYPE_PASSIVE && passive_host_checks_are_soft == TRUE) adjust_host_check_attempt(hst, FALSE); /* log passive checks - we need to do this here, as some my bypass external commands by getting dropped in checkresults dir */ if(hst->check_type == CHECK_TYPE_PASSIVE) { if(log_passive_checks == TRUE) logit(NSLOG_PASSIVE_CHECK, FALSE, "PASSIVE HOST CHECK: %s;%d;%s\n", hst->name, new_state, hst->plugin_output); } /******* HOST WAS DOWN/UNREACHABLE INITIALLY *******/ if(hst->current_state != HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Host was %s.\n", host_state_name(hst->current_state)); /***** HOST IS NOW UP *****/ /* the host just recovered! 
*/ if(new_state == HOST_UP) { /* set the current state */ hst->current_state = HOST_UP; /* set the state type */ /* set state type to HARD for passive checks and active checks that were previously in a HARD STATE */ if(hst->state_type == HARD_STATE || (hst->check_type == CHECK_TYPE_PASSIVE && passive_host_checks_are_soft == FALSE)) hst->state_type = HARD_STATE; else hst->state_type = SOFT_STATE; log_debug_info(DEBUGL_CHECKS, 1, "Host experienced a %s recovery (it's now UP).\n", (hst->state_type == HARD_STATE) ? "HARD" : "SOFT"); /* reschedule the next check of the host at the normal interval */ reschedule_check = TRUE; next_check = (unsigned long)(current_time + (hst->check_interval * interval_length)); /* propagate checks to immediate parents if they are not already UP */ /* we do this because a parent host (or grandparent) may have recovered somewhere and we should catch the recovery as soon as possible */ log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to parent host(s)...\n"); for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) { parent_host = temp_hostsmember->host_ptr; if(parent_host->current_state != HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Check of parent host '%s' queued.\n", parent_host->name); schedule_host_check(parent_host, current_time, CHECK_OPTION_DEPENDENCY_CHECK); } } /* propagate checks to immediate children if they are not already UP */ /* we do this because children may currently be UNREACHABLE, but may (as a result of this recovery) switch to UP or DOWN states */ log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to child host(s)...\n"); for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) { child_host = temp_hostsmember->host_ptr; if(child_host->current_state != HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name); schedule_host_check(child_host, current_time, 
CHECK_OPTION_DEPENDENCY_CHECK); } } } /***** HOST IS STILL DOWN/UNREACHABLE *****/ /* we're still in a problem state... */ else { log_debug_info(DEBUGL_CHECKS, 1, "Host is still %s.\n", host_state_name(hst->current_state)); /* passive checks are treated as HARD states by default... */ if(hst->check_type == CHECK_TYPE_PASSIVE && passive_host_checks_are_soft == FALSE) { /* set the state type */ hst->state_type = HARD_STATE; /* reset the current attempt */ hst->current_attempt = 1; } /* active checks and passive checks (treated as SOFT states) */ else { /* set the state type */ /* we've maxed out on the retries */ if(hst->current_attempt == hst->max_attempts) hst->state_type = HARD_STATE; /* the host was in a hard problem state before, so it still is now */ /* 2015-07-23 with the change adjust_host_check_attempt, this can no longer happen else if(hst->current_attempt == 1) hst->state_type = HARD_STATE; */ /* the host is in a soft state and the check will be retried */ else hst->state_type = SOFT_STATE; } /* make a determination of the host's state */ /* translate host state between DOWN/UNREACHABLE (only for passive checks if enabled) */ hst->current_state = new_state; if(hst->check_type == CHECK_TYPE_ACTIVE || translate_passive_host_checks == TRUE) hst->current_state = determine_host_reachability(hst); /* reschedule the next check if the host state changed */ if(hst->last_state != hst->current_state || hst->last_hard_state != hst->current_state) { reschedule_check = TRUE; /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... 
*/ if(hst->state_type == SOFT_STATE) next_check = (unsigned long)(current_time + (hst->retry_interval * interval_length)); /* host has maxed out on retries (or was previously in a hard problem state), so reschedule the next check at the normal interval */ else next_check = (unsigned long)(current_time + (hst->check_interval * interval_length)); } } } /******* HOST WAS UP INITIALLY *******/ else { log_debug_info(DEBUGL_CHECKS, 1, "Host was UP.\n"); /***** HOST IS STILL UP *****/ /* either the host never went down since last check */ if(new_state == HOST_UP) { log_debug_info(DEBUGL_CHECKS, 1, "Host is still UP.\n"); /* set the current state */ hst->current_state = HOST_UP; /* set the state type */ hst->state_type = HARD_STATE; /* reschedule the next check at the normal interval */ if(reschedule_check == TRUE) next_check = (unsigned long)(current_time + (hst->check_interval * interval_length)); } /***** HOST IS NOW DOWN/UNREACHABLE *****/ else { log_debug_info(DEBUGL_CHECKS, 1, "Host is now %s.\n", host_state_name(hst->current_state)); /* active and (in some cases) passive check results are treated as SOFT states */ if(hst->check_type == CHECK_TYPE_ACTIVE || passive_host_checks_are_soft == TRUE) { /* set the state type */ hst->state_type = SOFT_STATE; } /* by default, passive check results are treated as HARD states */ else { /* set the state type */ hst->state_type = HARD_STATE; /* reset the current attempt */ hst->current_attempt = 1; } /* make a (in some cases) preliminary determination of the host's state */ /* translate host state between DOWN/UNREACHABLE (for passive checks only if enabled) */ hst->current_state = new_state; if(hst->check_type == CHECK_TYPE_ACTIVE || translate_passive_host_checks == TRUE) hst->current_state = determine_host_reachability(hst); /* reschedule a check of the host */ reschedule_check = TRUE; /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... 
*/ if(hst->check_type == CHECK_TYPE_ACTIVE || passive_host_checks_are_soft == TRUE) next_check = (unsigned long)(current_time + (hst->retry_interval * interval_length)); /* schedule a re-check of the host at the normal interval */ else next_check = (unsigned long)(current_time + (hst->check_interval * interval_length)); /* propagate checks to immediate parents if they are UP */ /* we do this because a parent host (or grandparent) may have gone down and blocked our route */ /* checking the parents ASAP will allow us to better determine the final state (DOWN/UNREACHABLE) of this host later */ log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to immediate parent hosts that are UP...\n"); for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) { parent_host = temp_hostsmember->host_ptr; if(parent_host->current_state == HOST_UP) { schedule_host_check(parent_host, current_time, CHECK_OPTION_DEPENDENCY_CHECK); log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", parent_host->name); } } /* propagate checks to immediate children if they are not UNREACHABLE */ /* we do this because we may now be blocking the route to child hosts */ log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to immediate non-UNREACHABLE child hosts...\n"); for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) { child_host = temp_hostsmember->host_ptr; if(child_host->current_state != HOST_UNREACHABLE) { log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name); schedule_host_check(child_host, current_time, CHECK_OPTION_NONE); } } /* check dependencies on second to last host check */ if(enable_predictive_host_dependency_checks == TRUE && hst->current_attempt == (hst->max_attempts - 1)) { objectlist *list; /* propagate checks to hosts that THIS ONE depends on for notifications AND execution */ /* we do to help ensure that the dependency checks are 
accurate before it comes time to notify */ log_debug_info(DEBUGL_CHECKS, 1, "Propagating predictive dependency checks to hosts this one depends on...\n"); for(list = hst->notify_deps; list; list = list->next) { hostdependency *dep = (hostdependency *)list->object_ptr; if(dep->dependent_host_ptr == hst && dep->master_host_ptr != NULL) { master_host = (host *)dep->master_host_ptr; log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", master_host->name); schedule_host_check(master_host, current_time, CHECK_OPTION_NONE); } } for(list = hst->exec_deps; list; list = list->next) { hostdependency *dep = (hostdependency *)list->object_ptr; if(dep->dependent_host_ptr == hst && dep->master_host_ptr != NULL) { master_host = (host *)dep->master_host_ptr; log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", master_host->name); schedule_host_check(master_host, current_time, CHECK_OPTION_NONE); } } } } } log_debug_info(DEBUGL_CHECKS, 1, "Pre-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d (%s)\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state, host_state_name(hst->current_state)); /* handle the host state */ handle_host_state(hst); log_debug_info(DEBUGL_CHECKS, 1, "Post-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d (%s)\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->state_type == HARD_STATE) ? 
"HARD" : "SOFT", hst->current_state, host_state_name(hst->current_state)); /******************** POST-PROCESSING STUFF *********************/ /* if the plugin output differs from previous check and no state change, log the current state/output if state stalking is enabled */ if(hst->last_state == hst->current_state && should_stalk(hst) && compare_strings(old_plugin_output, hst->plugin_output)) { log_host_event(hst); } /* check to see if the associated host is flapping */ check_for_host_flapping(hst, TRUE, TRUE, TRUE); /* reschedule the next check of the host (usually ONLY for scheduled, active checks, unless overridden above) */ if(reschedule_check == TRUE) { log_debug_info(DEBUGL_CHECKS, 1, "Rescheduling next check of host at %s", ctime(&next_check)); /* default is to reschedule host check unless a test below fails... */ hst->should_be_scheduled = TRUE; /* get the new current time */ time(¤t_time); /* make sure we don't get ourselves into too much trouble... */ if(current_time > next_check) hst->next_check = current_time; else hst->next_check = next_check; /* make sure we rescheduled the next service check at a valid time */ preferred_time = hst->next_check; get_next_valid_time(preferred_time, &next_valid_time, hst->check_period_ptr); hst->next_check = next_valid_time; if(next_valid_time > preferred_time) { /* Next valid time is further in the future because of timeperiod * constraints. 
Add a random amount so we don't get all checks * subject to that timeperiod constraint scheduled at the same time */ hst->next_check += ranged_urand(0, check_window(hst)); } /* hosts with non-recurring intervals do not get rescheduled if we're in a HARD or UP state */ if(hst->check_interval == 0 && (hst->state_type == HARD_STATE || hst->current_state == HOST_UP)) hst->should_be_scheduled = FALSE; /* host with active checks disabled do not get rescheduled */ if(hst->checks_enabled == FALSE) hst->should_be_scheduled = FALSE; /* schedule a non-forced check if we can */ if(hst->should_be_scheduled == TRUE) { schedule_host_check(hst, hst->next_check, CHECK_OPTION_NONE); } } /* update host status - for both active (scheduled) and passive (non-scheduled) hosts */ update_host_status(hst, FALSE); return OK; } /* checks viability of performing a host check */ int check_host_check_viability(host *hst, int check_options, int *time_is_valid, time_t *new_time) { int result = OK; int perform_check = TRUE; time_t current_time = 0L; time_t preferred_time = 0L; int check_interval = 0; log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_check_viability()\n"); /* make sure we have a host */ if(hst == NULL) return ERROR; /* get the check interval to use if we need to reschedule the check */ if(hst->state_type == SOFT_STATE && hst->current_state != HOST_UP) check_interval = (hst->retry_interval * interval_length); else check_interval = (hst->check_interval * interval_length); /* make sure check interval is positive - otherwise use 5 minutes out for next check */ if(check_interval <= 0) check_interval = 300; /* get the current time */ time(¤t_time); /* initialize the next preferred check time */ preferred_time = current_time; /* can we check the host right now? */ if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)) { /* if checks of the host are currently disabled... 
*/ if(hst->checks_enabled == FALSE) { preferred_time = current_time + check_interval; perform_check = FALSE; } /* make sure this is a valid time to check the host */ if(check_time_against_period((unsigned long)current_time, hst->check_period_ptr) == ERROR) { log_debug_info(DEBUGL_CHECKS, 0, "Timeperiod check failed\n"); preferred_time = current_time; if(time_is_valid) *time_is_valid = FALSE; perform_check = FALSE; } /* check host dependencies for execution */ if(check_host_dependencies(hst, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) { log_debug_info(DEBUGL_CHECKS, 0, "Host check dependencies failed\n"); preferred_time = current_time + check_interval; perform_check = FALSE; } } /* pass back the next viable check time */ if(new_time) *new_time = preferred_time; result = (perform_check == TRUE) ? OK : ERROR; return result; } /* adjusts current host check attempt before a new check is performed */ int adjust_host_check_attempt(host *hst, int is_active) { log_debug_info(DEBUGL_FUNCTIONS, 0, "adjust_host_check_attempt()\n"); if(hst == NULL) return ERROR; log_debug_info(DEBUGL_CHECKS, 2, "Adjusting check attempt number for host '%s': current attempt=%d/%d, state=%d, state type=%d\n", hst->name, hst->current_attempt, hst->max_attempts, hst->current_state, hst->state_type); /* if host is in a hard state, reset current attempt number */ /* 2015-07-23 only reset current_attempt if host is up */ if(hst->state_type == HARD_STATE && hst->current_state == HOST_UP) hst->current_attempt = 1; /* if host is in a soft UP state, reset current attempt number (active checks only) */ else if(is_active == TRUE && hst->state_type == SOFT_STATE && hst->current_state == HOST_UP) hst->current_attempt = 1; /* increment current attempt number */ else if(hst->current_attempt < hst->max_attempts) hst->current_attempt++; log_debug_info(DEBUGL_CHECKS, 2, "New check attempt number = %d\n", hst->current_attempt); return OK; } /* determination of the host's state based on route availability*/ /* 
used only to determine difference between DOWN and UNREACHABLE states */
/*
 * Translates a non-UP host state into DOWN or UNREACHABLE.
 *
 *   hst  host to classify; a NULL host is reported as HOST_DOWN
 *
 * Returns:
 *   HOST_UP           the host's current state is already UP
 *   HOST_UNREACHABLE  the last check was passive and already UNREACHABLE,
 *                     or the host has parents and none of them is UP
 *   HOST_DOWN         the host has no parents, or at least one parent is UP
 *
 * The host itself is not modified; callers assign the returned state.
 */
int determine_host_reachability(host *hst) {
	hostsmember *member = NULL;
	host *parent = NULL;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "determine_host_reachability(host=%s)\n", hst ? hst->name : "(NULL host!)");

	/* without a host there is nothing to inspect */
	if(hst == NULL)
		return HOST_DOWN;

	log_debug_info(DEBUGL_CHECKS, 2, "Determining state of host '%s': current state=%d (%s)\n", hst->name, hst->current_state, host_state_name(hst->current_state));

	/* an UP host needs no translation */
	if(hst->current_state == HOST_UP) {
		log_debug_info(DEBUGL_CHECKS, 2, "Host is UP, no state translation needed.\n");
		return HOST_UP;
	}

	/* a passive result that already reports UNREACHABLE is taken at face value */
	if(hst->check_type == CHECK_TYPE_PASSIVE && hst->current_state == HOST_UNREACHABLE) {
		log_debug_info(DEBUGL_CHECKS, 2, "Passive check so keep it UNREACHABLE.\n");
		return HOST_UNREACHABLE;
	}

	/* a host without parents cannot be blocked by anything, so it is DOWN */
	if(hst->parent_hosts == NULL) {
		log_debug_info(DEBUGL_CHECKS, 2, "Host has no parents, so it is DOWN.\n");
		return HOST_DOWN;
	}

	/* walk the parents: a single UP parent means the route is open and the host itself is DOWN */
	member = hst->parent_hosts;
	while(member != NULL) {

		parent = member->host_ptr;
		log_debug_info(DEBUGL_CHECKS, 2, " Parent '%s' is %s\n", parent->name, host_state_name(parent->current_state));

		if(parent->current_state == HOST_UP) {
			log_debug_info(DEBUGL_CHECKS, 2, "At least one parent (%s) is up, so host is DOWN.\n", parent->name);
			return HOST_DOWN;
		}

		member = member->next;
	}

	/* every parent is in a problem state, so the host cannot be reached */
	log_debug_info(DEBUGL_CHECKS, 2, "No parents were up, so host is UNREACHABLE.\n");
	return HOST_UNREACHABLE;
}



/******************************************************************/
/****************** HOST STATE HANDLER FUNCTIONS ******************/
/******************************************************************/


/* top level host state handler - occurs
after every host check (soft/hard and active/passive) */ int handle_host_state(host *hst) { int state_change = FALSE; int hard_state_change = FALSE; time_t current_time = 0L; log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_host_state()\n"); /* get current time */ time(¤t_time); /* obsess over this host check */ obsessive_compulsive_host_check_processor(hst); /* update performance data */ update_host_performance_data(hst); /* record the time the last state ended */ switch(hst->last_state) { case HOST_UP: hst->last_time_up = current_time; break; case HOST_DOWN: hst->last_time_down = current_time; break; case HOST_UNREACHABLE: hst->last_time_unreachable = current_time; break; default: break; } /* has the host state changed? */ if(hst->last_state != hst->current_state || (hst->current_state == HOST_UP && hst->state_type == SOFT_STATE)) state_change = TRUE; if(hst->current_attempt >= hst->max_attempts && hst->last_hard_state != hst->current_state) hard_state_change = TRUE; /* if the host state has changed... 
*/ if(state_change == TRUE || hard_state_change == TRUE) { /* reset the next and last notification times */ hst->last_notification = (time_t)0; hst->next_notification = (time_t)0; /* reset notification suppression option */ hst->no_more_notifications = FALSE; /* reset the acknowledgement flag if necessary */ if(hst->acknowledgement_type == ACKNOWLEDGEMENT_NORMAL && (state_change == TRUE || hard_state_change == FALSE)) { hst->problem_has_been_acknowledged = FALSE; hst->acknowledgement_type = ACKNOWLEDGEMENT_NONE; /* remove any non-persistant comments associated with the ack */ delete_host_acknowledgement_comments(hst); } else if(hst->acknowledgement_type == ACKNOWLEDGEMENT_STICKY && hst->current_state == HOST_UP) { hst->problem_has_been_acknowledged = FALSE; hst->acknowledgement_type = ACKNOWLEDGEMENT_NONE; /* remove any non-persistant comments associated with the ack */ delete_host_acknowledgement_comments(hst); } } /* Not sure about this, but is old behaviour */ if(hst->last_hard_state != hst->current_state) hard_state_change = TRUE; if(state_change == TRUE || hard_state_change == TRUE) { /* update last state change times */ hst->last_state_change = current_time; if(hst->state_type == HARD_STATE) hst->last_hard_state_change = current_time; /* update the event id */ hst->last_event_id = hst->current_event_id; hst->current_event_id = next_event_id; next_event_id++; /* update the problem id when transitioning to a problem state */ if(hst->last_state == HOST_UP) { /* don't reset last problem id, or it will be zero the next time a problem is encountered */ hst->current_problem_id = next_problem_id; next_problem_id++; } /* clear the problem id when transitioning from a problem state to an UP state */ if(hst->current_state == HOST_UP) { hst->last_problem_id = hst->current_problem_id; hst->current_problem_id = 0L; } /* write the host state change to the main log file */ if(hst->state_type == HARD_STATE || (hst->state_type == SOFT_STATE && log_host_retries == TRUE)) 
log_host_event(hst); /* check for start of flexible (non-fixed) scheduled downtime */ /* It can start on soft states */ check_pending_flex_host_downtime(hst); /* notify contacts about the recovery or problem if its a "hard" state */ if(hst->state_type == HARD_STATE) host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE); /* handle the host state change */ handle_host_event(hst); /* the host just recovered, so reset the current host attempt */ if(hst->current_state == HOST_UP) hst->current_attempt = 1; /* the host recovered, so reset the current notification number and state flags (after the recovery notification has gone out) */ if(hst->current_state == HOST_UP) { hst->current_notification_number = 0; hst->notified_on = 0; } } /* else the host state has not changed */ else { /* notify contacts if host is still down or unreachable */ if(hst->current_state != HOST_UP && hst->state_type == HARD_STATE) host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE); /* if we're in a soft state and we should log host retries, do so now... */ if(hst->state_type == SOFT_STATE && log_host_retries == TRUE) log_host_event(hst); } return OK; } /* Parses raw plugin output and returns: short and long output, perf data. */ int parse_check_output(char *buf, char **short_output, char **long_output, char **perf_data, int escape_newlines_please, int newlines_are_escaped) { int current_line = 0; int eof = FALSE; int in_perf_data = FALSE; const int dbuf_chunk = 1024; dbuf long_text; dbuf perf_text; char *ptr = NULL; int x = 0; int y = 0; /* Initialize output values. */ if (short_output) *short_output = NULL; if (long_output) *long_output = NULL; if (perf_data) *perf_data = NULL; /* No input provided or no output requested, nothing to do. */ if (!buf || !*buf || (!short_output && !long_output && !perf_data)) return OK; /* Initialize dynamic buffers (1KB chunk size). 
*/ dbuf_init(&long_text, dbuf_chunk); dbuf_init(&perf_text, dbuf_chunk); /* We should never need to worry about unescaping here again. We assume a * common internal plugin output format that is newline delimited. */ if (newlines_are_escaped) { for (x = 0, y = 0; buf[x]; x++) { if (buf[x] == '\\' && buf[x + 1] == '\\') { x++; buf[y++] = buf[x]; } else if (buf[x] == '\\' && buf[x + 1] == 'n') { x++; buf[y++] = '\n'; } else buf[y++] = buf[x]; } buf[y] = '\0'; } /* Process each line of input. */ for (x = 0; !eof && buf[0]; x++) { /* Continue on until we reach the end of a line (or input). */ if (buf[x] == '\n') buf[x] = '\0'; else if (buf[x] == '\0') eof = TRUE; else continue; /* Handle this line of input. */ current_line++; /* The first line contains short plugin output and optional perf data. */ if (current_line == 1) { /* Get the short plugin output. If buf[0] is '|', strtok() will * return buf+1 or NULL if buf[1] is '\0'. We use my_strtok() * instead which returns a pointer to '\0' in this case. */ if ((ptr = my_strtok(buf, "|"))) { if (short_output) { strip(ptr); /* Remove leading and trailing whitespace. */ *short_output = strdup(ptr); } /* Get the optional perf data. */ if ((ptr = my_strtok(NULL, "\n"))) dbuf_strcat(&perf_text, ptr); } } /* Additional lines contain long plugin output and optional perf data. * Once we've hit perf data, the rest of the output is perf data. */ else if (in_perf_data) { if (perf_text.buf && *perf_text.buf) dbuf_strcat(&perf_text, " "); dbuf_strcat(&perf_text, buf); } /* Look for the perf data separator. */ else if (strchr(buf, '|')) { in_perf_data = TRUE; if ((ptr = my_strtok(buf, "|"))) { /* Get the remaining long plugin output. */ if (current_line > 2) dbuf_strcat(&long_text, "\n"); dbuf_strcat(&long_text, ptr); /* Get the perf data. */ if ((ptr = my_strtok(NULL, "\n"))) { if (perf_text.buf && *perf_text.buf) dbuf_strcat(&perf_text, " "); dbuf_strcat(&perf_text, ptr); } } } /* Otherwise it's still just long output. 
*/ else { if (current_line > 2) dbuf_strcat(&long_text, "\n"); dbuf_strcat(&long_text, buf); } /* Point buf to the start of the next line. *(buf+x+1) will be a valid * memory reference on our next iteration or we are at the end of input * (eof == TRUE) and *(buf+x+1) will never be referenced. */ buf += x + 1; x = -1; /* x will be incremented to 0 by the loop update. */ } /* Save long output. */ if (long_output && long_text.buf && *long_text.buf) { /* Escape newlines (and backslashes) in long output if requested. */ if (escape_newlines_please) *long_output = escape_newlines(long_text.buf); else *long_output = strdup(long_text.buf); } /* Save perf data. */ if (perf_data && perf_text.buf && *perf_text.buf) { strip(perf_text.buf); /* Remove leading and trailing whitespace. */ *perf_data = strdup(perf_text.buf); } /* free dynamic buffers */ dbuf_free(&long_text); dbuf_free(&perf_text); return OK; }