beegfs/beeond/scripts/lib/beegfs-ondemand-stoplocal
2025-08-10 01:34:16 +02:00

401 lines
12 KiB
Bash

#!/bin/bash
# beegfs-ondemand-stoplocal
# This file contains helper functions to stop BeeOND services locally on one node.
# This is meant to be sourced from another script (i.e. beeond)
# Records a failure of the command that was executed last.
# A return code of 0 is a no-op. Any other value prints an error message naming the current host
# and raises the error flag. The function itself always returns 0.
# Parameters:
#    $1  The return code of the last command
#    $2  A short phrase describing what was being done; it is inserted into the error message
#        ("There was a problem <phrase> on host <hostname>")
# Modifies:
#    ERROR: Is set to "true" when the return code indicates an error.
sl_checkerror()
{
   local rc="${1}"
   local action="${2}"

   # The code is compared as a string, so only the literal "0" counts as success.
   [ "${rc}" != 0 ] || return 0

   echo "ERROR: There was a problem ${action} on host $(hostname)"
   ERROR="true"
}
# Emits an informational message on stdout unless quiet mode is active.
# Parameter:
#    $1  The message text; "INFO: " is prepended when it is printed.
# Checks:
#    QUIET: Only the exact value "true" suppresses the output.
sl_print_info()
{
   case "${QUIET}" in
      true)
         # quiet mode: print nothing
         ;;
      *)
         echo "INFO: ${1}"
         ;;
   esac
}
# Unmounts the tmpfs mounts listed in the status file.
# The status file is read as comma-separated lines; the second field is the service name and the
# third field is the mount point. Only lines whose service is "tmpfs" are handled.
# Checks:
#    STATUSFILE: Path of the status file to read.
#    CLEANUP:    If "true", error output of fuser/umount is suppressed and unmount failures are
#                not reported through sl_checkerror.
# Modifies:
#    ERROR: Set to "true" (via sl_checkerror) when an unmount fails outside of cleanup mode.
# Returns:
#    Always 0.
sl_unmount_tmpfs()
{
   local SERVICE MOUNTPOINT _

   # IFS is set for the read command only. Assigning it globally and unsetting it afterwards
   # would throw away whatever IFS the sourcing script had configured.
   while IFS=, read -r _ SERVICE MOUNTPOINT _ _; do
      if [ "${SERVICE}" != "tmpfs" ]; then
         continue
      fi

      sl_print_info "Unmounting tmpfs at ${MOUNTPOINT}"
      if [ "${CLEANUP}" != "true" ]; then
         # no sl_checkerror after fuser: its result is not an indicator for a failed unmount
         fuser -k "${MOUNTPOINT}"
         umount -l "${MOUNTPOINT}"
         sl_checkerror "$?" "unmounting tmpfs"
      else
         # cleanup run: failures are expected and deliberately ignored
         fuser -k "${MOUNTPOINT}" 2>/dev/null
         umount -l "${MOUNTPOINT}" 2>/dev/null
      fi
   done < "${STATUSFILE}"

   return 0
}
# Unmounts all local client mounts listed in the status file and then tries to unload the
# beegfs kernel module.
# The status file is read as comma-separated lines; the second field is the service name and the
# third field is the mount point. Only lines whose service equals CLIENTSERVICE are handled.
# Checks:
#    STATUSFILE:    Path of the status file to read.
#    CLIENTSERVICE: Service name that identifies client mount entries.
#    CLEANUP:       If "true", error output of fuser/umount is suppressed and unmount failures
#                   are not reported through sl_checkerror.
# Modifies:
#    ERROR: Set to "true" (via sl_checkerror) when an unmount fails outside of cleanup mode.
# Returns:
#    Always 0.
sl_unmount_local_mounts()
{
   local SERVICE MOUNTPOINT _

   # IFS is set for the read command only. Assigning it globally and unsetting it afterwards
   # would throw away whatever IFS the sourcing script had configured.
   while IFS=, read -r _ SERVICE MOUNTPOINT _ _; do
      if [ "${SERVICE}" != "${CLIENTSERVICE}" ]; then
         continue
      fi

      sl_print_info "Unmounting ${MOUNTPOINT}"
      if [ "${CLEANUP}" != "true" ]; then
         # no sl_checkerror after fuser, because fuser also returns non-zero when there are no
         # processes accessing the file system
         fuser -k "${MOUNTPOINT}"
         umount -l "${MOUNTPOINT}"
         sl_checkerror "$?" "unmounting the ondemand file system"
      else
         # cleanup run: failures are expected and deliberately ignored
         fuser -k "${MOUNTPOINT}" 2>/dev/null
         umount -l "${MOUNTPOINT}" 2>/dev/null
      fi
   done < "${STATUSFILE}"

   # Try to remove the client module - this is allowed to fail, because we might have a "normal"
   # beegfs mount somewhere in the system.
   rmmod beegfs 2>/dev/null || true
}
# Sends a SIGTERM to a process, then waits until the process is gone or approximately 10 seconds
# have passed (at most 100 liveness checks with a 0.1 second sleep after each unsuccessful one).
# Parameter:
#    $1  The PID of the process
# Returns:
#    0 if the process was stopped within 10 seconds, 1 if it wasn't, 255 if the initial kill
#    returned an error.
sl_kill_check()
{
   local PID=$1
   local i   # keep the loop counter local so that a caller's "i" is not overwritten

   if ! kill "$PID"; then
      return 255
   fi

   for ((i = 0; i < 100; i++)); do
      # kill -0 sends no signal; it only tests whether the process can still be signalled
      if ! kill -0 "$PID" 2>/dev/null; then
         return 0
      fi
      sleep 0.1
   done

   return 1
}
# Stops all services listed in the status file except for clients, optionally deletes their data
# and finally unmounts tmpfs mounts through sl_unmount_tmpfs.
# The status file is read as comma-separated lines with five fields; the second field is the
# service name, the third the data path and the fifth the PID file. Lines whose PID file is "-"
# (beegfs-client and tmpfs, which are not processes) are skipped.
# Checks:
#    STATUSFILE:            Path of the status file to read.
#    CLEANUP:               If "true", a missing PID file is not treated as an error.
#    DELETE_DATA:           If "true", the data path of each service is removed (unless it is "-").
#    PREFERRED_MDS_FILE:    File removed for every stopped service entry.
#    PREFERRED_TARGET_FILE: File removed for every stopped service entry.
# Modifies:
#    ERROR: Set to "true" when a service does not stop in time, a PID file is missing (outside of
#           cleanup mode) or a deletion fails.
sl_stop_services()
{
   local SERVICE DATAPATH PIDFILE _
   local PID RES   # local, so that they do not leak into the sourcing script

   # IFS is set for the read command only. Assigning it globally and unsetting it afterwards
   # would throw away whatever IFS the sourcing script had configured.
   while IFS=, read -r _ SERVICE DATAPATH _ PIDFILE; do
      # pidfile is "-" for beegfs-client and tmpfs, because it is not a process
      if [ "${PIDFILE}" = "-" ]; then
         continue
      fi

      if [ -e "${PIDFILE}" ]; then
         PID=$(cat "${PIDFILE}")
         sl_kill_check "${PID}"
         RES=$?
         if [ "${RES}" -eq 1 ]; then
            echo "ERROR: ${SERVICE} did not stop within 10 seconds (PID ${PID})."
            ERROR="true"
         elif [ "${RES}" -eq 255 ]; then
            # reported, but not flagged in ERROR: the service is gone, which is the desired state
            echo "ERROR: ${SERVICE} does not seem to be running any more (PID ${PID})."
         fi
      elif [ "${CLEANUP}" != "true" ]; then
         echo "ERROR: PID file ${PIDFILE} does not exist on host $(hostname)"
         ERROR="true"
      fi

      # delete data...
      if [ "${DELETE_DATA}" = "true" ] && [ "${DATAPATH}" != "-" ]; then
         sl_print_info "Deleting stored data; Data path: ${DATAPATH}"
         rm -rf "${DATAPATH}"
         sl_checkerror "$?" "deleting ${DATAPATH}"
      fi

      # delete preferredMds and preferredTarget files
      rm -f "${PREFERRED_MDS_FILE}"
      sl_checkerror "$?" "deleting ${PREFERRED_MDS_FILE}"
      rm -f "${PREFERRED_TARGET_FILE}"
      sl_checkerror "$?" "deleting ${PREFERRED_TARGET_FILE}"
   done < "${STATUSFILE}"

   # unmount tmpfs if it was used
   sl_unmount_tmpfs
}
# Deletes the log files listed in the status file, provided that ERROR is not "true".
# If the directory of the last deleted log file is empty afterwards, it is removed as well.
# The status file is read as comma-separated lines; the second field is the service name and the
# fourth field is the log file ("-" if the entry has none).
# Checks:
#    STATUSFILE:       Path of the status file to read.
#    ERROR:            If "true", nothing is deleted and only an info message is printed.
#    ONLY_UNMOUNT:     If "true", only log files of CLIENTSERVICE entries are deleted.
#    ONLY_STOP_SERVER: If "true", log files of CLIENTSERVICE entries are kept.
#    CLIENTSERVICE:    Service name that identifies client entries.
# Modifies:
#    ERROR: Set to "true" (via sl_checkerror) if removing the empty log directory fails.
# Returns:
#    Always 0.
sl_delete_logfiles()
{
   local SERVICE LOGFILE _
   local LAST_LOGFILE=""
   local LOG_DIR

   if [ "${ERROR}" = "true" ]; then
      sl_print_info "Not deleting log files because of a previous error."
      return 0
   fi

   # IFS is set for the read command only, so the IFS of the sourcing script is left alone.
   while IFS=, read -r _ SERVICE _ LOGFILE _; do
      if [ "${ONLY_UNMOUNT}" = "true" ] && [ "${SERVICE}" != "${CLIENTSERVICE}" ]; then
         continue
      fi
      if [ "${ONLY_STOP_SERVER}" = "true" ] && [ "${SERVICE}" = "${CLIENTSERVICE}" ]; then
         continue
      fi

      if [ "${LOGFILE}" != "-" ]; then
         sl_print_info "Deleting log file ${LOGFILE}"
         # beegfs-client does not (always) generate a logfile. In this case rm gives an error
         # message, but we don't want to see it - for the same reason no sl_checkerror here.
         rm -f "${LOGFILE}" 2>/dev/null

         # Remember the path in a separate variable: the final read that hits end-of-file resets
         # LOGFILE to an empty string, so LOGFILE itself is useless after the loop.
         if [ -n "${LOGFILE}" ]; then
            LAST_LOGFILE="${LOGFILE}"
         fi
      fi
   done < "${STATUSFILE}"

   # delete log directory if empty
   if [ -z "${LAST_LOGFILE}" ]; then
      return 0
   fi
   LOG_DIR=$(dirname "${LAST_LOGFILE}")
   if [ "${LOG_DIR}" != "." ] && [ -d "${LOG_DIR}" ] && [ -z "$(ls -A "${LOG_DIR}")" ]; then
      # printed through sl_print_info so that quiet mode also silences this message
      sl_print_info "Deleting log directory ${LOG_DIR}"
      rmdir "${LOG_DIR}"
      sl_checkerror "$?" "deleting ${LOG_DIR}"
   fi

   return 0
}
# The "main" stoplocal function. It runs the shutdown steps in a fixed order: unmount the file
# system, stop the services, delete the log files, remove the status file. Each step is subject
# to the switches listed below.
# Checks the following variables:
#    STATUSFILE        The location of the status file
#    ONLY_STOP_SERVER  If "true", the sl_unmount_local_mounts step is skipped, and the status
#                      file is not removed.
#    ONLY_UNMOUNT      If "true", the sl_stop_services step is skipped, and the status file is
#                      not removed.
#    DELETE_LOGS       Log files are only deleted if this is "true" and ERROR is not "true".
# Modifies:
#    ERROR             Is set to "true" by the called steps (and an error message is printed)
#                      if an error is encountered in any of them.
stoplocal()
{
   sl_print_info "Using status file ${STATUSFILE}"

   # Step 1: unmount the file system, unless only the servers are to be stopped.
   if [ "${ONLY_STOP_SERVER}" = "true" ]; then
      :
   else
      sl_unmount_local_mounts
   fi

   # Step 2: stop the services, unless only an unmount was requested.
   if [ "${ONLY_UNMOUNT}" = "true" ]; then
      :
   else
      sl_stop_services
   fi

   # Step 3: delete the log files - only when requested and nothing has gone wrong so far.
   if [ "${DELETE_LOGS}" = "true" ]; then
      if [ "${ERROR}" != "true" ]; then
         sl_delete_logfiles
      fi
   fi

   # Step 4: remove the status file, but only after a full shutdown (neither partial mode set).
   if [ "${ONLY_UNMOUNT}" != "true" ]; then
      if [ "${ONLY_STOP_SERVER}" != "true" ]; then
         rm -f "${STATUSFILE}"
         sl_checkerror $? "deleting the status file"
      fi
   fi
}
# The user interface / main entry point to stoplocal. Parses the options, verifies that the
# status file exists and then runs stoplocal.
# Options:
#    -i FILENAME => Status information filename
#                   (DEFAULT: ${DEFAULT_STATUSFILE})
#    -d          => Delete BeeGFS data on disks
#    -L          => Delete log files after successful shutdown
#    -q          => Suppress "INFO" messages, only print "ERROR"s
#    -c          => "Cleanup": Remove remaining processes and directories of a
#                   potentially unsuccessful shutdown of an earlier beeond
#                   instance. This switch silences the error message when a status
#                   information file is not found or an unmount command fails;
#                   instead, a message is printed (if "INFO" messages are not
#                   suppressed) when a status file DOES exist, because this means
#                   there actually was an instance before that is now being
#                   cleaned up.
#    -u          => ONLY unmount the file systems(*)
#    -s          => ONLY stop non-client services(*)
#
#    (*) Options -u and -s are mutually exclusive
#        If -u or -s are given, the status file is not deleted.
# Checks:
#    CLEANUP: A value already set by the caller is used as the initial cleanup mode; -c turns
#             it on for this invocation only.
# Returns:
#    0 on success (also when the status file is missing during a cleanup run), 1 on invalid
#    options, a missing status file, or when any shutdown step set ERROR to "true".
# Note:
#    If a function print_usage_and_exit is defined by the sourcing script, it is called on
#    usage errors before returning.
do_stoplocal()
{
   local DEFAULT_STATUSFILE=/tmp/beeond.tmp
   local CLIENTSERVICE=beegfs-client
   local DELETE_DATA="false"
   local DELETE_LOGS="false"
   local ONLY_UNMOUNT="false"
   local ONLY_STOP_SERVER="false"
   local PREFERRED_MDS_FILE=/tmp/preferredMds.fod
   local PREFERRED_TARGET_FILE=/tmp/preferredTarget.fod
   local QUIET="false"
   local ERROR="false"
   local STATUSFILE="${DEFAULT_STATUSFILE}"
   # Local copy, so that -c does not leak into the sourcing script or into later invocations.
   # A value provided by the caller is still honoured.
   local CLEANUP="${CLEANUP:-false}"
   local opt
   local OPTIND=1
   local OPTARG=""

   while getopts ":i:dLusqc" opt "$@"; do
      case "${opt}" in
         i)
            STATUSFILE=${OPTARG}
            ;;
         d)
            DELETE_DATA="true"
            ;;
         L)
            DELETE_LOGS="true"
            ;;
         u)
            if [ "${ONLY_STOP_SERVER}" = "true" ]; then
               # OPTARG is empty for options without an argument; the option letter is in opt
               echo "ERROR: Options -s and -${opt} are mutually exclusive" >&2
               if declare -f -F print_usage_and_exit >/dev/null; then
                  print_usage_and_exit
               fi
               return 1
            fi
            ONLY_UNMOUNT="true"
            ;;
         s)
            if [ "${ONLY_UNMOUNT}" = "true" ]; then
               echo "ERROR: Options -u and -${opt} are mutually exclusive" >&2
               if declare -f -F print_usage_and_exit >/dev/null; then
                  print_usage_and_exit
               fi
               return 1
            fi
            ONLY_STOP_SERVER="true"
            ;;
         q)
            QUIET="true"
            ;;
         c)
            CLEANUP="true"
            ;;
         \?)
            # with a leading ":" in the optstring, getopts stores the offending letter in OPTARG
            echo "ERROR: invalid option -${OPTARG}" >&2
            if declare -f -F print_usage_and_exit >/dev/null; then
               print_usage_and_exit
            fi
            return 1
            ;;
         :)
            echo "ERROR: Option -${OPTARG} requires an argument" >&2
            if declare -f -F print_usage_and_exit >/dev/null; then
               print_usage_and_exit
            fi
            return 1
            ;;
      esac
   done

   # If the status file can't be found, print a message and return. The path is quoted so that
   # an empty value or a path containing blanks is tested correctly.
   if [ ! -f "${STATUSFILE}" ]; then
      # a missing status file is fine during a cleanup run; return 0 so that pdsh doesn't
      # complain
      if [ "${CLEANUP}" = "true" ]; then
         return 0
      fi

      echo "ERROR: Status file ${STATUSFILE} not found." >&2
      # If the user has specified a status file, just give a brief error message and return.
      # If the user has not specified a status file, give the full usage info - maybe the user
      # didn't know how to specify a status file.
      if [ "${STATUSFILE}" = "${DEFAULT_STATUSFILE}" ]; then
         if declare -f -F print_usage_and_exit >/dev/null; then
            print_usage_and_exit
         fi
      fi
      return 1
   fi

   # if we're doing a cleanup run, inform the user that a status file was found.
   if [ "${CLEANUP}" = "true" ]; then
      sl_print_info "Status file found."
   fi

   stoplocal

   if [ "${ERROR}" = "true" ]; then
      return 1
   fi
   return 0
}