2025-08-10 01:34:16 +02:00

1607 lines
53 KiB
Bash
Executable File

#!/bin/bash
# Global configuration for the BeeOND helper script. Everything below is evaluated once
# at script start, in order; later definitions are derived from earlier ones.

# Timestamp taken once at startup; it is embedded in the log file names below.
CURRENTTIME=$(date +%Y%m%d-%H%M%S)
# Service names; config file names and log file names below are derived from them.
MGMTD_BIN=beegfs-mgmtd
META_BIN=beegfs-meta
STORAGE_BIN=beegfs-storage
CLIENT_BIN=beegfs-client # not really a binary, but name of config, init, etc.
CTL_BIN=beegfs
# Log directory (LOG_PATH starts out as the default) and per-run log file names.
DEFAULT_LOG_PATH=/var/log
LOG_PATH=${DEFAULT_LOG_PATH}
STORAGE_LOG=${STORAGE_BIN}_${CURRENTTIME}.log
META_LOG=${META_BIN}_${CURRENTTIME}.log
CLIENT_LOG=${CLIENT_BIN}_${CURRENTTIME}.log
# Names of the optional per-service config files (note: mgmtd uses a .toml file name).
STORAGE_CFG_NAME=${STORAGE_BIN}.conf
META_CFG_NAME=${META_BIN}.conf
MGMTD_CFG_NAME=${MGMTD_BIN}.toml
CLIENT_CFG_NAME=${CLIENT_BIN}.conf
# File names of the numeric ID files inside the meta / storage target directories.
META_NUMID_FILE=nodeNumID
TARGET_NUMID_FILE=targetNumID
# Locations of the "preferred" files created when local nodes are preferred.
PREFERRED_MDS_FILE=/tmp/preferredMds.fod
PREFERRED_TARGET_FILE=/tmp/preferredTarget.fod
# Status file (STATUSFILE starts out as the default).
DEFAULT_STATUSFILE=/var/tmp/beeond.tmp
STATUSFILE=${DEFAULT_STATUSFILE}
# Default number of metadata / storage servers.
NUM_META_SERVER=1
NUM_STORAGE_SERVER=0
BEEGFS_BIN_PATH=/opt/beegfs/sbin
DEFAULT_MGMTD_GRPC_PORT=8010
DEFAULT_PORT_SHIFT=1000
# Remote execution: ssh command plus its fixed option list (kept as an array so each
# option stays a separate word).
SSH="ssh"
SSH_PARAMS=( -qq -oNumberOfPasswordPrompts=0 -oStrictHostKeyChecking=no -n )
# Auto-detected pdsh location (empty if pdsh is not in PATH) and the pdsh rcmd module.
DEFAULT_PDSH_PATH=$(which pdsh 2>/dev/null)
PDSH_RCMD="ssh"
# source helper script
ABSOLUTE_PATH=$(dirname "$(readlink -e "$0")") # using readlink, because someone might be calling
# this script using a symlink
# Prefer ../lib relative to this script; otherwise fall back to ../scripts/lib.
if [ -e "${ABSOLUTE_PATH}/../lib/beegfs-ondemand-stoplocal" ]
then
BEEOND_STOPLOCAL="${ABSOLUTE_PATH}/../lib/beegfs-ondemand-stoplocal"
else
BEEOND_STOPLOCAL="${ABSOLUTE_PATH}/../scripts/lib/beegfs-ondemand-stoplocal"
fi
#shellcheck source=scripts/lib/beegfs-ondemand-stoplocal
source "${BEEOND_STOPLOCAL}"
# Print the complete usage/help text to stdout and terminate the script.
# Globals read: DEFAULT_STATUSFILE, DEFAULT_PORT_SHIFT, DEFAULT_LOG_PATH, DEFAULT_PDSH_PATH
# Exits:        always with status 1 (never returns to the caller)
# Note: the description of -E previously misspelled the management daemon as "mgmtmd".
print_usage_and_exit()
{
echo ""
echo "BeeOND - BeeGFS OnDemand (http://www.beegfs.com)"
echo ""
echo "DESCRIPTION:"
echo " Script to set up or shut down a BeeGFS setup on the fly."
echo ""
echo " Creates a new BeeGFS file system on a set of hosts. All necessary services"
echo " are automatically started and the file system is mounted. In the same way,"
echo " the file system can be unmounted again and the services will be shut down."
echo " Optionally, the contents of the file system can be deleted."
echo ""
echo " This script can be used e.g. to automatically create a temporary scratch file"
echo " system for cluster nodes during a compute job, and to remove it after the job"
echo " is finished."
echo ""
echo "USAGE: $(basename "$0") <action> <options>"
echo ""
echo "ACTIONS:"
echo " The first argument to $(basename "$0") is considered to be an action that the"
echo " script should perform."
echo ""
echo " The following actions are available:"
echo ""
echo " start:"
echo " Start the file system on a number of nodes, specified by the node file."
echo " The necessary services will be started and the newly created file system"
echo " will be mounted at the specified mount point. Information about the"
echo " running file system are stored in a status file on each node."
echo ""
echo " Mandatory arguments:"
echo " -n FILENAME => Node file with line-separated hostnames."
echo " -d PATH => Path for BeeGFS data on servers."
echo " -c PATH => Mount point for BeeGFS clients."
echo ""
echo " Optional arguments:"
echo " -i FILENAME => Status information file name."
echo " Default: ${DEFAULT_STATUSFILE}"
echo " -F => Remove contents of data path before starting services."
echo " This is useful if the processes and status file of a"
echo " previous beeond session are gone, but the"
echo " data is still there."
echo " -m NUM => Number of metadata servers to start. Default: 1"
echo " -s NUM => Number of storage servers to start."
echo " Default: Number of hosts."
echo " -p NUM => Network port shift. The standard BeeGFS network port"
echo " numbers are shifted by this number. Useful in order to"
echo " have several BeeGFS instances running on the same node."
echo " Default: ${DEFAULT_PORT_SHIFT}"
echo " -f PATH => Directory containing additional beegfs config files."
echo " There can be one file for each service as well as the client."
echo " They must be named in the form beegfs-<service>.conf, where "
echo " <service> can be meta, storage, mgmtd or client."
echo " Only the options specified within the files are"
echo " set/overwritten, the rest of the defaults will not be"
echo " touched and still be applied. The directory and the "
echo " files need to be present on every node."
echo " -L PATH => Log file directory. If necessary, the directory will be"
echo " created. Default: ${DEFAULT_LOG_PATH}"
echo " -l => Prefer local storage nodes."
echo " -P => Use pdsh for parallel startup. If this option is not"
echo " given, ssh is used to start up the services on the nodes"
echo " sequentially."
echo " -b PATH => Path to the pdsh binary. Default: <auto-detected>"
echo " -r => Use tmpfs for beegfs storage and metadata."
echo " Note: On older Linux versions, tmpfs does not support"
echo " extended attributes. If you get an error message"
echo " from beegfs_meta reading \"Failed to store root"
echo " directory\" you have to provide an additional config"
echo " file beegfs-meta.conf containing the line"
echo " storeUseExtendedAttribs = false"
echo " -k => enable storage target mirroring"
echo " Note: Needs an even number of storage servers (-s)."
echo " -j => enable metadata server mirroring"
echo " Note: Needs an even number of metadata servers (-m)."
echo " -q => Suppress INFO messages, only print ERRORs."
echo " -t FILE => Use FILE to define multiple storage targets and assign"
echo " them to storage pools. The file needs to be in the"
echo " following format:"
echo ""
echo " pool_1:/path/to/target_1,/path/to/target_2,..."
echo " pool_2:/path/to/target_3,/path/to/target_4,..."
echo " ..."
echo ""
echo " pool_n is the name of the storage pool, the comma separated"
echo " list after the colon are the paths to the target directories"
echo " that shall be part of this pool."
echo " The lines can't contain whitespaces. BeeOND will look for"
echo " these directories and add them as a storage target on all"
echo " nodes where they exist. To avoid having unwanted targets"
echo " in a pool, make sure each of the specified paths only"
echo " exists on nodes where they are actually mounted on the"
echo " desired storage medium."
echo " BeeOND will then assign the targets to the corresponding"
echo " storage pools and create a directory for each pool"
echo " on the root level of the BeeGFS mount."
echo " This option can only be used together with -F."
echo " -T => Don't create and assign the pool directories when using -t."
echo " -G => The base gRPC port (before port shifting) that the mgmtd"
echo " uses in this BeeOND instance. Defaults to 8010 and only needs"
echo " to be supplied if mgmtd is configured via configuration file"
echo " (see -f) to use a base gRPC port other than 8010."
echo ""
echo " Arguments that require a configuration directory (option -f) that is available"
echo " on all nodes and contains the required files (see option descriptions):"
echo " -C => Enable connection authentication. Requires a \"conn.auth\""
echo " file in the configuration directory."
echo " -E => Enable TLS encryption between the mgmtd and the command"
echo " line configuration tool. Requires \"cert.pem\" and"
echo " \"key.pem\" files in the configuration directory."
echo " -H => Enable enterprise features. This mode is required by all"
echo " other modes that enable enterprise features and requires a"
echo " \"license.pem\" file in the configuration directory."
echo ""
echo " stop:"
echo " Stop the file system on a number of nodes, specified by the node file."
echo " Use the information from the status file to unmount a file system on a"
echo " number of nodes specified by the node file, and shut down the services."
echo ""
echo " Mandatory arguments:"
echo " -n FILENAME => Node file."
echo ""
echo " Optional arguments:"
echo " -i FILENAME => Status information file name."
echo " Default: ${DEFAULT_STATUSFILE}"
echo " -d => Delete BeeGFS data on disks."
echo " -L => Delete log files after successful shutdown."
echo " -c => \"Cleanup\": Remove remaining processes and directories"
echo " of a potentially unsuccessful shutdown of an earlier"
echo " beeond instance. This switch silences the error"
echo " message when a status information file is not found on a"
echo " node or an unmount command fails; instead, a message is"
echo " printed (if \"INFO\" messages are not suppressed) when a"
echo " status file DOES exist, because this means there"
echo " actually was an instance before that is now being"
echo " cleaned up."
echo " -P => Use pdsh for parallel shutdown. If this option is not"
echo " given, ssh is used to unmount the file system and stop"
echo " the services on all nodes sequentially."
echo " -b PATH => Path to the pdsh binary. Default: ${DEFAULT_PDSH_PATH}"
echo " -q => Suppress INFO messages, only print ERRORs."
echo ""
echo " stoplocal:"
echo " Stop the file system on the local host only. This is recommended only as"
echo " an emergency measure, e.g. after a host encountered an error during the"
echo " distributed shutdown procedure. Uses the information from the status file"
echo " to unmount the file system and stop the services on the local host only."
echo ""
echo " Optional arguments:"
echo " -i FILENAME => Status information file."
echo " Default: ${DEFAULT_STATUSFILE}"
echo " -d => Delete BeeGFS data on disks."
echo " -L => Delete log files after successful shutdown. If the log"
echo " directory is empty afterwards, it will also be removed."
echo " -c => \"Cleanup\": Remove remaining processes and directories"
echo " of a potentially unsuccessful shutdown of an earlier"
echo " beeond instance. This switch silences the error"
echo " message when the status information file is not found or"
echo " the unmount command fails; instead, a message is printed"
echo " (if \"INFO\" messages are not suppressed) when a status"
echo " file DOES exist, because this means there actually was"
echo " an instance before that is now being cleaned up."
echo " -q => Suppress INFO messages, only print ERRORs."
echo " -u => ONLY unmount the file system."
echo " (Cannot be used in combination with \"-s\".)"
echo " -s => ONLY stop non-client services. (*)"
echo " (Cannot be used in combination with \"-u\".)"
echo ""
echo "EXAMPLES:"
echo " Start a beeond instance on the nodes given in nodefile, using the data"
echo " directory /data/beeond and the client mountpoint /mnt/beeond via pdsh"
echo " for parallel startup:"
echo " $(basename "$0") start -n nodefile -d /data/beeond -c /mnt/beeond -P"
echo ""
echo " Stop the file system:"
echo " $(basename "$0") stop -n nodefile -P -L -d"
echo ""
exit 1
}
### internal functions for general usage ###
print_error()
{
   # The message itself goes to stderr; the blank separator line that follows it is
   # written to stdout.
   # $1 - error text (printed after the "ERROR: " prefix)
   printf '%s\n' "ERROR: ${1}" >&2
   printf '\n'
}
print_error_and_exit()
{
   # Report a fatal error through print_error, then terminate the script with status 1.
   # $1 - error text
   local fatal_message="${1}"

   print_error "${fatal_message}"
   exit 1
}
print_info()
{
   # Print an informational message to stdout unless quiet mode is active.
   # $1 - message text (printed after the "INFO: " prefix)
   # Globals read: QUIET ("true" suppresses all output)
   [ "${QUIET}" = "true" ] && return 0

   echo "INFO: ${1}"
}
check_pdsh()
{
   # Verify that pdsh can be used on all hosts; otherwise fall back to plain ssh.
   # $1 - comma-separated host list (handed to pdsh via -w)
   # Globals read:    PDSH, PDSH_RCMD, BEEOND_STOPLOCAL
   # Globals written: RES, USE_PDSH (set to false on fallback), HOST (loop variable), IFS
   local HOSTS=$1

   print_info "Checking PDSH availability on the following hosts: ${HOSTS}"

   # Probe every host through pdsh. The remote test exits with 2 when the remote
   # ${SHELL} is not /bin/bash. If the pdsh binary does not exist, the probe is
   # skipped and the status is non-zero.
   test -e "${PDSH}" && \
      ${PDSH} -R ${PDSH_RCMD} -S -w "${HOSTS}" "test \${SHELL} = '/bin/bash' || exit 2"
   RES=$?

   case ${RES} in
      0)
         ;;
      2)
         print_error_and_exit "One or more hosts don't use /bin/bash as default shell."
         ;;
      *)
         print_info "pdsh does not seem to work on all nodes. Disabling pdsh and using ssh instead"
         USE_PDSH=false
         # The reachability check has to be repeated host by host using conventional ssh.
         IFS=,
         for HOST in ${HOSTS}
         do
            check_reachability "${HOST}"
         done
         unset IFS
         return
         ;;
   esac

   # pdsh works: make sure the helper script exists on every node.
   if ! ${PDSH} -R ${PDSH_RCMD} -S -w "${HOSTS}" \
      "if [ -e ${BEEOND_STOPLOCAL} ]; then true; else exit 2; fi"
   then
      print_error_and_exit "Unable to find BeeOND helper program on one or more nodes.
Please make sure BeeOND is installed on all machines."
   fi
}
execute_ssh_cmd()
{
   # Run one command on one host via ssh and hand back the exit status of ssh.
   # $1 - host name
   # $2 - command line to execute remotely
   # Globals read: SSH, SSH_PARAMS
   local target_host="$1"
   local remote_cmd="$2"

   # both parameters are mandatory
   if [ -z "${target_host}" ] || [ -z "${remote_cmd}" ]
   then
      print_error_and_exit "Internal function 'execute_ssh_cmd' was called without a host or without a command"
   fi

   ${SSH} "${SSH_PARAMS[@]}" "${target_host}" "${remote_cmd}"
}
execute_pdsh_cmd()
{
   # Run one command on a set of hosts in parallel through pdsh.
   # $1 - comma-separated host list
   # $2 - command line to execute remotely
   # $3 - "true": on failure only report the error and set ERROR="true";
   #      anything else: on failure abort the script
   # Globals read:    PDSH, PDSH_RCMD
   # Globals written: ERROR (only on failure with $3 = "true")
   local host_list="$1"
   local remote_cmd="$2"
   local keep_going="$3"
   local stamp
   stamp=$(date +%Y%m%d-%H%M%S)
   local fail_marker="/tmp/beegfs.pdsh_fail.${stamp}"

   # both the host list and the command are mandatory
   if [ -z "${host_list}" ] || [ -z "${remote_cmd}" ]
   then
      print_error_and_exit "Internal function 'execute_pdsh_cmd' was called without a host or without a command"
   fi

   # A failing node touches the marker file locally and reports failure.
   if ${PDSH} -R ${PDSH_RCMD} -S -w "${host_list}" "${remote_cmd} || (touch ${fail_marker} && false)"
   then
      return 0
   fi

   # pdsh returned non-zero, so the command failed on at least one node (-S returns
   # the greatest return value of all nodes). The marker file on the failing node is
   # not evaluated here; investigating the pdsh output is left to the user.
   if [ "${keep_going}" != "true" ]
   then
      print_error_and_exit "Execution of a command failed. Please see pdsh output for more information."
   else
      print_error "Execution of a command failed. Please see pdsh output for more information."
      ERROR="true"
   fi
}
check_reachability()
{
   # Make sure a single host can be used: it must be reachable via ssh, report
   # /bin/bash as ${SHELL}, and have the BeeOND helper script installed. Any failure
   # aborts the script.
   # $1 - host name
   # Globals read:    BEEOND_STOPLOCAL
   # Globals written: RES
   local node="$1"

   if [ -z "${node}" ]
   then
      print_error_and_exit "Internal function 'check_reachability' was called without a hostname"
   fi

   print_info "Checking reachability of host ${node}"

   execute_ssh_cmd "${node}" "test \${SHELL} = '/bin/bash'"
   RES=$?
   case ${RES} in
      0)
         ;;
      255)
         print_error_and_exit "Host is unreachable via ssh: ${node}"
         ;;
      1)
         print_error_and_exit "Host doesn't use /bin/bash as default shell: ${node}"
         ;;
      *)
         print_error_and_exit "Error contacting host: ${node}"
         ;;
   esac

   if ! execute_ssh_cmd "${node}" "test -e ${BEEOND_STOPLOCAL}"
   then
      print_error_and_exit "Could not find BeeOND helper program on host: ${node}
Please make sure BeeOND is installed on all machines."
   fi
}
check_hostfile()
{
   # Abort the script unless HOSTFILE is set and names an existing regular file.
   # Globals read: HOSTFILE
   [ -n "${HOSTFILE}" ] || print_error_and_exit "Node file undefined"

   [ -f "${HOSTFILE}" ] || print_error_and_exit "Node file does not exist: ${HOSTFILE}"
}
check_datapath()
{
   # Abort the script when no data path has been configured.
   # Globals read: DATA_PATH
   [ -n "${DATA_PATH}" ] || print_error_and_exit "Path for BeeGFS data undefined"
}
check_mountpoint()
{
   # Abort the script when no client mount point has been configured.
   # Globals read: MOUNTPOINT
   [ -n "${MOUNTPOINT}" ] || print_error_and_exit "Path for client mountpoint undefined"
}
check_statusfile()
{
   # For every node, verify that
   #  - the status file does not exist yet (otherwise a session may still be running)
   #  - the status file can be created (otherwise we cannot continue)
   # $1 - comma-separated host list
   # Globals read:    STATUSFILE, USE_PDSH, PDSH, PDSH_RCMD, SSH, SSH_PARAMS
   # Globals written: HOST (loop variable), IFS (set to ',' and unset again in ssh mode)
   local HOSTS=$1
   if [ -z "${HOSTS}" ]
   then
      print_error_and_exit "Internal function 'check_statusfile' was called without a hostname"
   fi

   local CHECK_CMD="[ ! -e \"${STATUSFILE}\" ]"
   local TOUCH_CMD="touch \"${STATUSFILE}\""

   if [ "${USE_PDSH}" != "true" ]
   then
      # ssh mode: first make sure no host has a status file, then create it everywhere
      IFS=,
      for HOST in ${HOSTS}
      do
         ${SSH} "${SSH_PARAMS[@]}" "${HOST}" "${CHECK_CMD}" || \
            print_error_and_exit "Status file ${STATUSFILE} on host ${HOST} already exists. Maybe a session is already running or the previous session was not properly shut down."
      done
      for HOST in ${HOSTS}
      do
         ${SSH} "${SSH_PARAMS[@]}" "${HOST}" "${TOUCH_CMD}" || \
            print_error_and_exit "Could not create status file ${STATUSFILE} on host ${HOST}"
      done
      unset IFS
      return
   fi

   # pdsh mode: the same two steps, each on all hosts at once
   ${PDSH} -R ${PDSH_RCMD} -S -w "${HOSTS}" "${CHECK_CMD} || (echo \"Statusfile already exists.\" && false)" || \
      print_error_and_exit "Statusfile ${STATUSFILE} on one ore more hosts already exists. Maybe a session is already running or the previous session was not properly shut down."

   # touching the file on every host makes sure it can be accessed
   ${PDSH} -R ${PDSH_RCMD} -S -w "${HOSTS}" "${TOUCH_CMD}" || \
      print_error_and_exit "Could not create status file ${STATUSFILE} on one ore more hosts."
}
create_log_path()
{
   # Create the log directory on the given hosts; an existing directory is left alone.
   # $@ - host names, one per argument
   # Globals read:    LOG_PATH, USE_PDSH
   # Globals written: CMD, HOST (loop variable), IFS (set to ',' and unset again in ssh mode)
   local HOSTS
   # join all arguments with ',' (the list format pdsh expects)
   HOSTS=$(IFS=,; echo "$*")
   if [ -z "${HOSTS}" ]
   then
      print_error_and_exit "Internal function 'create_log_path' was called without a host."
   fi

   CMD="mkdir -p \"${LOG_PATH}\""

   if [ "${USE_PDSH}" != "true" ]
   then
      # no pdsh: visit the hosts one after the other
      IFS=,
      for HOST in ${HOSTS}
      do
         execute_ssh_cmd "${HOST}" "${CMD}" || \
            print_error_and_exit "Could not create log path ${LOG_PATH} on host ${HOST}"
      done
      unset IFS
      return
   fi

   execute_pdsh_cmd "${HOSTS}" "${CMD}" "false"
}
### internal functions for beegfs-ondemand start ###
start_tmpfs()
{
   # Create a directory and mount a tmpfs on it on every host, then record one
   # "tmpfs" entry per host in the status file.
   # $1 - comma-separated host list
   # $2 - directory to create and mount the tmpfs on
   # Globals read:    USE_PDSH
   # Globals written: CMD, HOST (loop variable), IFS (set to ',' and unset again)
   local host_list=$1
   local tmpfs_dir=$2

   if [ -z "${host_list}" ] || [ -z "${tmpfs_dir}" ]
   then
      print_error_and_exit "Internal function 'start_tmpfs' called without all needes parameters"
   fi

   CMD="mkdir -p ${tmpfs_dir} && mount -t tmpfs tmpfs ${tmpfs_dir}"

   if [ "${USE_PDSH}" != "true" ]
   then
      # no pdsh => mount on one host after the other via ssh
      print_info "Starting tmpfs mounts"
      IFS=,
      for HOST in ${host_list}
      do
         print_info "Starting tmpfs on host: ${HOST}"
         if execute_ssh_cmd "${HOST}" "${CMD}"
         then
            add_to_status_file "${HOST}" tmpfs "${tmpfs_dir}" "-" "-"
         else
            print_error_and_exit "Unable to start tmpfs on host: ${HOST}"
         fi
      done
      unset IFS
      return
   fi

   # pdsh: mount everywhere in parallel, then write the status entries host by host
   print_info "Starting tempfs on the following hosts: ${host_list}"
   execute_pdsh_cmd "${host_list}" "${CMD}" "false"
   IFS=','
   for HOST in ${host_list}
   do
      # skip empty list elements
      [ "${HOST}" = "" ] && continue
      add_to_status_file "${HOST}" tmpfs "${tmpfs_dir}" - -
   done
   unset IFS
}
# Start the metadata service on every host of a comma-separated list and record it in
# the status file of each host.
# Arguments:
#   $1 - comma-separated host list
#   $2 - base data path; the service directory becomes <$2>/${META_BIN}
#   $3 - management host (passed as sysMgmtdHost)
#   $4 - port shift; may be empty, then connPortShift is not passed
#   $5 - directory with additional config files; may be empty
# Globals read:    META_CFG_NAME, LOG_PATH, META_LOG, META_BIN, CURRENTTIME, CONNAUTH_LEGACY,
#                  CLEAR_DATA, BEEGFS_BIN_PATH, USE_PDSH, PREFER_LOCAL, PREFERRED_MDS_FILE,
#                  META_NUMID_FILE, STATUSFILE, QUIET
# Globals written: PARAMS, CMD, HOST (loop variable); IFS is set to ',' and unset again
#                  in ssh mode
# Aborts the script (via print_error_and_exit) on missing parameters or, in ssh mode,
# when the service cannot be started on a host.
start_meta_servers()
{
local HOSTS=$1 # comma seperated
local DATAPATH=$2
local MGMTD=$3
local PORT_SHIFT=$4 # port shift can be empty!
local CFG_PATH=$5 # may be empty
local CFG_FILE=${CFG_PATH}/${META_CFG_NAME}
local LOGFILE=${LOG_PATH}/${META_LOG}
local PIDFILE=/var/run/${META_BIN}-${CURRENTTIME}.pid
# error checks
# NOTE(review): the message below names 'start_meta_servers_ssh', which is not the
# name of this function.
if [ "${HOSTS}" = "" ] || [ "${MGMTD}" = "" ] || [ "${DATAPATH}" = "" ]
then
print_error_and_exit "Internal function 'start_meta_servers_ssh' was called without all \
needed parameters"
fi
# from here on DATAPATH is the service-specific sub-directory
DATAPATH=${DATAPATH}/${META_BIN}
PARAMS="sysMgmtdHost=${MGMTD} storeMetaDirectory=${DATAPATH} logStdFile=${LOGFILE} \
${CONNAUTH_LEGACY} runDaemonized=true pidFile=${PIDFILE}"
if [ "${PORT_SHIFT}" != "" ]
then
PARAMS="${PARAMS} connPortShift=${PORT_SHIFT}"
fi
# Remote command line. Unescaped ${...} is expanded here on the calling host; escaped
# \${PARAMS} is expanded by the remote shell. The remote side appends cfgFile only if
# a config path was given and the file exists there, and removes the data directory
# first when CLEAR_DATA is "true".
CMD="PARAMS=\"${PARAMS}\"; \
if [ -n \"${CFG_PATH}\" ] && [ -e \"${CFG_FILE}\" ]; then \
PARAMS=\"\${PARAMS} cfgFile=${CFG_FILE}\"; fi; \
if [ \"${CLEAR_DATA}\" = \"true\" ]; then \
rm -rf ${DATAPATH}; fi; \
${BEEGFS_BIN_PATH}/${META_BIN} \${PARAMS}"
if [ "${USE_PDSH}" = "true" ]
then
print_info "Starting ${META_BIN} processes on the following hosts: ${HOSTS}"
print_info "Metadata server log: ${LOGFILE}"
execute_pdsh_cmd "${HOSTS}" "${CMD}" "false"
if [ "${PREFER_LOCAL}" = "true" ]
then
# create the preferred MDS file (actually just a symlink to the node ID file)
execute_pdsh_cmd "${HOSTS}" "rm -f ${PREFERRED_MDS_FILE}; \
ln -s ${DATAPATH}/${META_NUMID_FILE} ${PREFERRED_MDS_FILE}" "false"
fi
# each node appends its own status line; %h is presumably replaced by pdsh with the
# remote host name - TODO confirm against the pdsh documentation
execute_pdsh_cmd "${HOSTS}" "echo %h,${META_BIN},${DATAPATH},${LOGFILE},${PIDFILE} >> ${STATUSFILE}" "false"
else
# no pdsh => do it manually with ssh loop
print_info "Starting ${META_BIN} processes"
print_info "Metadata server log: ${LOGFILE}"
# for each host, start server
IFS=,
for HOST in ${HOSTS}
do
print_info "Starting ${META_BIN} on host: ${HOST}"
if ! execute_ssh_cmd "${HOST}" "${CMD}"
then
print_error_and_exit "Unable to start ${META_BIN} on host: ${HOST}"
else
add_to_status_file "${HOST}" "${META_BIN}" "${DATAPATH}" "${LOGFILE}" "${PIDFILE}"
if [ "${PREFER_LOCAL}" = "true" ]
then
# create the preferred MDS file (actually just a symlink to the node ID file)
execute_ssh_cmd "${HOST}" "rm -f ${PREFERRED_MDS_FILE}; \
ln -s ${DATAPATH}/${META_NUMID_FILE} ${PREFERRED_MDS_FILE}"
fi
fi
done
unset IFS
fi
# blank separator line after this step's INFO output
if [ "${QUIET}" != "true" ]
then
echo ""
fi
}
# Start the storage service on every host of a comma-separated list, record it in the
# status file of each host and, when a target file is in use, create the storage pools.
# Arguments:
#   $1 - comma-separated host list
#   $2 - base data path; the service directory becomes <$2>/${STORAGE_BIN}
#   $3 - management host (passed as sysMgmtdHost)
#   $4 - port shift; may be empty, then connPortShift is not passed
#   $5 - directory with additional config files; may be empty
# Globals read:    STORAGE_CFG_NAME, LOG_PATH, STORAGE_LOG, STORAGE_BIN, CURRENTTIME,
#                  CONNAUTH_LEGACY, TARGETFILE, CLEAR_DATA, BEEGFS_BIN_PATH, USE_PDSH,
#                  PREFER_LOCAL, PREFERRED_TARGET_FILE, TARGET_NUMID_FILE, STATUSFILE, QUIET
# Globals written: PARAMS, CMD, HOST (loop variable); IFS is set to ',' and unset again
#                  in ssh mode
# Aborts the script (via print_error_and_exit) on missing parameters or, in ssh mode,
# when the service cannot be started on a host.
start_storage_servers()
{
local HOSTS=$1
local DATAPATH=$2
local MGMTD=$3
local PORT_SHIFT=$4 # port shift can be empty!
local CFG_PATH=$5 # may be empty
local CFG_FILE=${CFG_PATH}/${STORAGE_CFG_NAME}
local LOGFILE=${LOG_PATH}/${STORAGE_LOG}
local PIDFILE=/var/run/${STORAGE_BIN}-${CURRENTTIME}.pid
# error checks
if [ "${HOSTS}" = "" ] || [ "${MGMTD}" = "" ] || [ "${DATAPATH}" = "" ]
then
print_error_and_exit "Internal function 'start_storage_servers' was called without all \
needed parameters"
fi
# from here on DATAPATH is the service-specific sub-directory
DATAPATH=${DATAPATH}/${STORAGE_BIN}
PARAMS="sysMgmtdHost=${MGMTD} logStdFile=${LOGFILE} runDaemonized=true pidFile=${PIDFILE} ${CONNAUTH_LEGACY}"
if [ "${PORT_SHIFT}" != "" ]
then
PARAMS="${PARAMS} connPortShift=${PORT_SHIFT}"
fi
if [ "${TARGETFILE}" != "" ]
then
local ALL_TARGETS
ALL_TARGETS=$(get_all_targets_from_targetfile)
# Remote command line for target-file mode: every listed directory that exists on the
# node becomes a storage target. Variables written as \${...} are expanded by the
# remote shell. The loop variable T in particular must be escaped in the rm command:
# unescaped it would be expanded (to nothing) on the calling host and the target
# contents would never be cleared. The glob stays outside the quotes so the remote
# shell expands it, and ${T:?} guards against an empty path.
CMD="while read T; do \
if [ -d \"\${T}\" ] ; then EXISTING_TARGETS=\"\${EXISTING_TARGETS}\${T},\"; \
if [ \"${CLEAR_DATA}\" = \"true\" ]; then \
rm -rf \"\${T:?}\"/*; \
fi;
fi ; \
done < <(echo \"${ALL_TARGETS}\" | tr ',' '\n' ); \
PARAMS=\"${PARAMS} storeStorageDirectory=\${EXISTING_TARGETS}\"; \
if [ -n \"${CFG_PATH}\" ] && [ -e \"${CFG_FILE}\" ]; then \
PARAMS=\"\${PARAMS} cfgFile=${CFG_FILE}\"; fi; \
${BEEGFS_BIN_PATH}/${STORAGE_BIN} \${PARAMS}"
else
# Remote command line for the single-target default: the service directory is the
# only target and is removed first when CLEAR_DATA is "true".
CMD="PARAMS=\"${PARAMS} storeStorageDirectory=${DATAPATH}\"; \
if [ -n \"${CFG_PATH}\" ] && [ -e \"${CFG_FILE}\" ]; then \
PARAMS=\"\${PARAMS} cfgFile=${CFG_FILE}\"; fi; \
if [ \"${CLEAR_DATA}\" = \"true\" ]; then \
rm -rf ${DATAPATH}; fi; \
${BEEGFS_BIN_PATH}/${STORAGE_BIN} \${PARAMS}"
fi
if [ "${USE_PDSH}" = "true" ]
then
print_info "Starting ${STORAGE_BIN} processes on the following hosts: ${HOSTS}"
# trailing ',' removed
print_info "Storage server log: ${LOGFILE}"
execute_pdsh_cmd "${HOSTS}" "${CMD}" "false"
if [ "${PREFER_LOCAL}" = "true" ]
then
# create the preferred target file (actually just a symlink to the target ID file)
execute_pdsh_cmd "${HOSTS}" "rm -f ${PREFERRED_TARGET_FILE}; \
ln -s ${DATAPATH}/${TARGET_NUMID_FILE} ${PREFERRED_TARGET_FILE}" "false"
fi
# each node appends its own status line; %h is presumably replaced by pdsh with the
# remote host name - TODO confirm against the pdsh documentation
execute_pdsh_cmd "${HOSTS}" "echo %h,${STORAGE_BIN},${DATAPATH},${LOGFILE},${PIDFILE} >> ${STATUSFILE}" "false"
else
# no pdsh => do it manually with ssh loop
print_info "Starting ${STORAGE_BIN} processes"
print_info "Storage server log: ${LOGFILE}"
# for each host, start server
IFS=,
for HOST in ${HOSTS}
do
print_info "Starting ${STORAGE_BIN} on host: ${HOST}"
if ! execute_ssh_cmd "${HOST}" "${CMD}"
then
print_error_and_exit "Unable to start ${STORAGE_BIN} on host: ${HOST}"
else
add_to_status_file "${HOST}" "${STORAGE_BIN}" "${DATAPATH}" "${LOGFILE}" "${PIDFILE}"
if [ "${PREFER_LOCAL}" = "true" ]
then
# create the preferred target file (actually just a symlink to the target ID file)
execute_ssh_cmd "${HOST}" "rm -f ${PREFERRED_TARGET_FILE}; \
ln -s ${DATAPATH}/${TARGET_NUMID_FILE} ${PREFERRED_TARGET_FILE}"
fi
fi
done
unset IFS
fi
# with a target file, the pools are created once all storage services were started
if [ "${TARGETFILE}" != "" ]
then
create_storage_pools "${HOSTS}"
fi
# blank separator line after this step's INFO output
if [ "${QUIET}" != "true" ]
then
echo ""
fi
}
create_storage_pools()
{
   # Create one storage pool per non-empty, non-comment line of the target file.
   # For each pool, every host is asked (via ssh) for the numeric IDs of those listed
   # target directories that contain a target ID file; the collected IDs are handed to
   # the control tool. A pool named "default"/"Default" is not created; instead the
   # pool storage:1 gets that alias.
   # $1 - comma-separated host list
   # Globals read:    TARGETFILE, TARGET_NUMID_FILE, CTL_BIN, CTL_GLOBAL_PARAMS
   # Globals written: LINE, POOL, TARGETS, TARGET_IDS, HOST, CMD, HOST_TARGETS, TARGET_ID
   local HOSTS=$1

   # nothing to do without a target file
   [ "${TARGETFILE}" != "" ] || return 0

   while read LINE
   do
      # "<pool>:<target>,<target>,..." - everything after the first colon is the target list
      IFS=: read POOL TARGETS <<< "${LINE}"
      TARGETS=$(echo "${TARGETS}" | tr -d "[:space:]")
      TARGET_IDS=

      while read HOST
      do
         # remote side: print the numeric ID of every listed target that exists there
         CMD="echo \"${TARGETS}\" | tr ',' '\n' | while read T; do if [ -f \"\${T}/${TARGET_NUMID_FILE}\" ]; then echo -n \"\$(cat \"\${T}/targetNumID\") \"; fi; done"
         HOST_TARGETS=$(execute_ssh_cmd "${HOST}" "${CMD}")
         for TARGET_ID in $HOST_TARGETS
         do
            # comma-separated list of "storage:<id>" entries
            TARGET_IDS="${TARGET_IDS:+$TARGET_IDS,}storage:$TARGET_ID"
         done
      done < <(echo "${HOSTS}" | tr ',' '\n')

      case "${POOL}" in
         default|Default)
            "${CTL_BIN}" ${CTL_GLOBAL_PARAMS} pool set-alias storage:1 "${POOL}" > /dev/null
            ;;
         *)
            # create pool with collected ids
            "${CTL_BIN}" ${CTL_GLOBAL_PARAMS} pool create "${POOL}" --targets "${TARGET_IDS}" > /dev/null
            ;;
      esac
   done < <(grep -v "^$" "${TARGETFILE}" | grep -v "^\s*\#")
}
assign_storage_pool_dirs()
{
   # For every pool alias reported by the control tool, create a directory of the same
   # name at the root level and assign that pool to it.
   # Globals read:    CTL_BIN, CTL_GLOBAL_PARAMS
   # Globals written: LINE, ALIAS
   local POOLS
   local pool_dir

   # alias column only; blank lines and the first remaining line (header) are dropped
   POOLS=$("${CTL_BIN}" ${CTL_GLOBAL_PARAMS} pool list --columns alias \
      | grep -v '^\s*$' | tail -n+2)

   while read LINE
   do
      read ALIAS <<< "${LINE}"
      pool_dir="/${ALIAS}"
      "${CTL_BIN}" ${CTL_GLOBAL_PARAMS} entry create dir --mount=none "${pool_dir}" > /dev/null
      "${CTL_BIN}" ${CTL_GLOBAL_PARAMS} entry set --mount=none --pool "${ALIAS}" "${pool_dir}" &> /dev/null
   done < <(echo "${POOLS}")
}
check_targetfile()
{
   # Validate the target file and the options it depends on; abort the script on the
   # first violated rule. Checks, in this order:
   #  1. at least one line matches "<pool>:<path>[,<path>...]" and every other line is
   #     empty or a '#' comment
   #  2. pool names are unique (case-insensitive)
   #  3. target paths are unique (case-insensitive)
   #  4. CLEAR_DATA is "true" (option -F)
   #  5. STORAGE_MIRROR is not "true" (option -k)
   # Globals read: TARGETFILE, CLEAR_DATA, STORAGE_MIRROR
   local LINE_REGEX='^\w+:([\w/_.-]+,?)+\s*$'
   local well_formed
   local malformed
   local repeated

   well_formed=$(grep -i -P "${LINE_REGEX}" "${TARGETFILE}")
   malformed=$(grep -i -P -v "${LINE_REGEX}" "${TARGETFILE}" | grep -v "^\s*\#" | grep -v "^$")
   if [ -z "${well_formed}" ] || [ -n "${malformed}" ]
   then
      print_error_and_exit "${TARGETFILE} contains invalid entries or is empty."
   fi

   # pool name = text before the first ':' with blanks removed
   repeated=$(grep -v "^$" "${TARGETFILE}" | grep -v "^\s*\#" \
      | tr -d ' ' | awk -F ':' '{print $1}' | sort | uniq -i -d)
   [ -z "${repeated}" ] || print_error_and_exit "${TARGETFILE} contains non-unique pool names."

   repeated=$(get_all_targets_from_targetfile | tr ',' '\n' | sort | uniq -i -d)
   [ -z "${repeated}" ] || print_error_and_exit "${TARGETFILE} contains non-unique target paths."

   [ "${CLEAR_DATA}" = "true" ] || \
      print_error_and_exit "Using storage pools requires the -F option to make sure no old data is left."

   [ "${STORAGE_MIRROR}" != "true" ] || \
      print_error_and_exit "Using storage pools doesn't support storage mirroring (-k)."
}
get_all_targets_from_targetfile()
{
   # Print all target paths of the target file as one line, each path followed by a
   # comma (so the output ends with a trailing comma). Empty lines and '#' comment
   # lines are skipped; whitespace inside a path is removed.
   # Globals read:    TARGETFILE
   # Globals written: LINE, POOL, TARGETS, T
   local ALL_TARGETS

   while read LINE
   do
      # split "<pool>:<targets>" at the first colon
      IFS=: read POOL TARGETS <<< "${LINE}"
      while read T
      do
         T=$(echo "${T}" | tr -d "[:space:]")
         ALL_TARGETS+="${T},"
      done < <(echo "${TARGETS}" | tr ',' '\n')
   done < <(grep -v "^$" "${TARGETFILE}" | grep -v "^\s*\#")

   echo "$ALL_TARGETS"
}
# Initialise and start the management service on a single host via ssh and record it
# in that host's status file.
# Arguments:
#   $1 - host name
#   $2 - base data path; the service directory becomes <$2>/${MGMTD_BIN}
#   $3 - port shift; may be empty, then --port-shift is not passed
#   $4 - directory with additional config files; may be empty
# Globals read:    MGMTD_CFG_NAME, MGMTD_BIN, CURRENTTIME, TLS_DISABLE, TLS_CERT_FILE,
#                  TLS_KEY_FILE, CONNAUTH_FLAG, LICENSE_FILE, CLEAR_DATA, BEEGFS_BIN_PATH, QUIET
# Globals written: DBPATH, PARAMS, CMD
# Aborts the script (via print_error_and_exit) on missing parameters or when the remote
# command fails.
start_mgmtd()
{
local HOST=$1
local DATAPATH=$2
local PORT_SHIFT=$3 # port shift can be empty!
local CFG_PATH=$4 # may be empty
local CFG_FILE=${CFG_PATH}/${MGMTD_CFG_NAME}
local PIDFILE=/var/run/${MGMTD_BIN}-${CURRENTTIME}.pid
# error checks
if [ "${HOST}" = "" ] || [ "${DATAPATH}" = "" ]
then
print_error_and_exit "Internal function 'start_mgmtd' was called without all needed \
parameters"
fi
# from here on DATAPATH is the service-specific sub-directory holding the database file
DATAPATH=${DATAPATH}/${MGMTD_BIN}
DBPATH=${DATAPATH}/beegfs-mgmtd.sqlite3
# start server
print_info "Starting ${MGMTD_BIN} processes"
print_info "Starting ${MGMTD_BIN} on host: ${HOST}"
PARAMS="--db-file ${DBPATH} --daemonize true --daemonize-pid-file ${PIDFILE} ${TLS_DISABLE} ${TLS_CERT_FILE} ${TLS_KEY_FILE} ${CONNAUTH_FLAG} ${LICENSE_FILE}"
if [ "${PORT_SHIFT}" != "" ]
then
PARAMS="${PARAMS} --port-shift ${PORT_SHIFT}"
fi
# Remote command line. Unescaped ${...} is expanded here on the calling host; escaped
# \${PARAMS} is expanded by the remote shell. The remote side appends --config-file
# only if a config path was given and the file exists there, removes the data directory
# first when CLEAR_DATA is "true", then runs the binary once with --init and once with
# the collected parameters. The two runs are joined with ';', so the second one is
# attempted regardless of the exit status of the --init run.
CMD="PARAMS=\"${PARAMS}\"; \
if [ -n \"${CFG_PATH}\" ] && [ -e \"${CFG_FILE}\" ]; then \
PARAMS=\"\${PARAMS} --config-file ${CFG_FILE}\"; fi; \
if [ \"${CLEAR_DATA}\" = \"true\" ]; then \
rm -rf ${DATAPATH}; fi; \
${BEEGFS_BIN_PATH}/${MGMTD_BIN} --init --db-file ${DBPATH}; \
${BEEGFS_BIN_PATH}/${MGMTD_BIN} \${PARAMS}"
if ! execute_ssh_cmd "${HOST}" "${CMD}"
then
print_error_and_exit "Unable to start ${MGMTD_BIN} on host: ${HOST}"
else
# no log file is recorded for this service ("-")
add_to_status_file "${HOST}" "${MGMTD_BIN}" "${DATAPATH}" "-" "${PIDFILE}"
fi
# blank separator line after this step's INFO output
if [ "${QUIET}" != "true" ]
then
echo ""
fi
}
# Load the client kernel module and mount the file system on every host of a
# comma-separated list, and record a client entry in the status file of each host.
# Arguments:
#   $1 - comma-separated host list
#   $2 - management host (passed as sysMgmtdHost mount option)
#   $3 - mount point (created with mkdir -p on the remote side)
#   $4 - port shift; may be empty, then connPortShift is not passed
#   $5 - directory with additional config files; may be empty
# Globals read:    CLIENT_CFG_NAME, LOG_PATH, CLIENT_LOG, CLIENT_BIN, CONNAUTH_LEGACY,
#                  PREFER_LOCAL, PREFERRED_MDS_FILE, PREFERRED_TARGET_FILE, USE_PDSH,
#                  STATUSFILE, CTL_BIN, CTL_GLOBAL_PARAMS, QUIET
# Globals written: MODPROBE_CMD, REBUILD_CMD, MOUNT_PARAMS, MOUNT_CMD, CTL_CMD,
#                  HOST (loop variable); IFS is set to ',' and unset again in ssh mode
# Aborts the script (via print_error_and_exit) on missing parameters or, in ssh mode,
# when mounting fails on a host.
start_clients()
{
local HOSTS=$1
local MGMTD=$2
local MOUNTPOINT=$3
local PORT_SHIFT=$4 # port shift can be empty!
local CFG_PATH=$5 # may be empty
local CLIENT_CFG_FILE=${CFG_PATH}/${CLIENT_CFG_NAME}
local LOGFILE=${LOG_PATH}/${CLIENT_LOG}
# error checks
# NOTE(review): the message below names 'start_clients_ssh', which is not the name of
# this function.
if [ "${HOSTS}" = "" ] || [ "${MGMTD}" = "" ] || [ "${MOUNTPOINT}" = "" ]
then
print_error_and_exit "Internal function 'start_clients_ssh' was called without all \
needed parameters"
fi
MODPROBE_CMD="modprobe beegfs"
REBUILD_CMD="/etc/init.d/${CLIENT_BIN} rebuild"
# mount options are comma-separated, so blanks inside CONNAUTH_LEGACY become commas
MOUNT_PARAMS="-osysMgmtdHost=${MGMTD},${CONNAUTH_LEGACY// /,}"
if [ "${PORT_SHIFT}" != "" ]
then
MOUNT_PARAMS="${MOUNT_PARAMS},connPortShift=${PORT_SHIFT}"
fi
# Remote mount command. Unescaped ${...} is expanded here on the calling host; escaped
# \${PARAMS} is expanded by the remote shell. The remote side appends cfgFile and the
# two tunePreferred*File options only when the respective file exists there.
MOUNT_CMD="PARAMS=\"${MOUNT_PARAMS}\"; if [ -n \"${CFG_PATH}\" ] && \
[ -e \"${CLIENT_CFG_FILE}\" ]; then PARAMS=\"\${PARAMS},cfgFile=${CLIENT_CFG_FILE}\"; fi; \
if [ \"${PREFER_LOCAL}\" = \"true\" ] && [ -e \"${PREFERRED_MDS_FILE}\" ]; \
then PARAMS=\"\${PARAMS},tunePreferredMetaFile=${PREFERRED_MDS_FILE}\"; fi; \
if [ \"${PREFER_LOCAL}\" = \"true\" ] && [ -e \"${PREFERRED_TARGET_FILE}\" ]; \
then PARAMS=\"\${PARAMS},tunePreferredStorageFile=${PREFERRED_TARGET_FILE}\"; fi; \
mkdir -p ${MOUNTPOINT} && ${MODPROBE_CMD} && mount -t beegfs beegfs_ondemand ${MOUNTPOINT} \${PARAMS}"
if [ "${USE_PDSH}" = "true" ]
then
# trailing ',' removed in output
print_info "Starting ${CLIENT_BIN} processes on the following hosts: ${HOSTS}"
print_info "Client log: ${LOGFILE}"
# In pdsh mode the status entry is written before the module is loaded and the file
# system is mounted. %h is presumably replaced by pdsh with the remote host name -
# TODO confirm against the pdsh documentation
execute_pdsh_cmd "${HOSTS}" "echo %h,${CLIENT_BIN},${MOUNTPOINT},${LOGFILE},- >> ${STATUSFILE}" "false"
# if the module cannot be loaded, try the init script's rebuild action instead
execute_pdsh_cmd "${HOSTS}" "${MODPROBE_CMD} || ${REBUILD_CMD}" "false"
execute_pdsh_cmd "${HOSTS}" "${MOUNT_CMD}" "false"
if [ "${PREFER_LOCAL}" = "true" ] #set target count to 1
then
CTL_CMD="${CTL_BIN} ${CTL_GLOBAL_PARAMS} \
entry set --num-targets 1 --chunk-size 512ki ${MOUNTPOINT} > /dev/null"
execute_pdsh_cmd "${HOSTS}" "${CTL_CMD}" "false"
fi
else
# no pdsh => do it manually with ssh loop
print_info "Starting ${CLIENT_BIN} processes"
print_info "Client log: ${LOGFILE}"
# for each host, start client
IFS=,
for HOST in ${HOSTS}
do
print_info "Starting ${CLIENT_BIN} on host: ${HOST}"
if ! execute_ssh_cmd "${HOST}" "${MODPROBE_CMD}"
then
print_info "Module beegfs could not be loaded on host: ${HOST}. Trying to recompile \
from source."
# the result of the rebuild is not checked here; the mount command below runs
# modprobe again and fails if the module is still unavailable
execute_ssh_cmd "${HOST}" "${REBUILD_CMD}"
fi
if ! execute_ssh_cmd "${HOST}" "${MOUNT_CMD}"
then
print_error_and_exit "Unable to start BeeGFS client on host: ${HOST}"
else
# NOTE : mountpoint as data path
add_to_status_file "${HOST}" "${CLIENT_BIN}" "${MOUNTPOINT}" "${LOGFILE}" "-"
if [ "${PREFER_LOCAL}" = "true" ] #set target count to 1
then
CTL_CMD="${CTL_BIN} ${CTL_GLOBAL_PARAMS} \
entry set --num-targets 1 --chunk-size 512ki ${MOUNTPOINT} > /dev/null"
execute_ssh_cmd "${HOST}" "${CTL_CMD}"
fi
fi
done
unset IFS
fi
# blank separator line after this step's INFO output
if [ "${QUIET}" != "true" ]
then
echo ""
fi
}
add_to_status_file()
{
   # Append one line "host,service,datapath,logfile,pidfile" to the status file on the
   # given host (via ssh).
   # $1 - host name
   # $2 - service name
   # $3 - data path (the only field that is not checked for being non-empty)
   # $4 - log file
   # $5 - PID file
   # Globals read:    STATUSFILE
   # Globals written: INFO
   local node=$1
   local service_name=$2
   local data_dir=$3
   local log_file=$4
   local pid_file=$5
   local mandatory

   # every field except the data path is mandatory
   for mandatory in "${node}" "${service_name}" "${log_file}" "${pid_file}"
   do
      if [ -z "${mandatory}" ]
      then
         print_error_and_exit "Internal function 'add_to_status_file' was called without all needed parameters"
      fi
   done

   INFO="${node},${service_name},${data_dir},${log_file},${pid_file}"
   execute_ssh_cmd "${node}" "echo ${INFO} >> ${STATUSFILE}"
}
### internal functions for beegfs-ondemand stop ###
# Print the option string handed to "do_stoplocal" on the remote side.
# The string always starts with " -q" (quiet); further switches are appended depending
# on DELETE_DATA, DELETE_LOGS and CLEANUP.
make_stoplocal_args()
{
   local switches=" -q"

   # -d: delete data
   [ "${DELETE_DATA}" = "true" ] && switches+=" -d"
   # -L: delete logs
   [ "${DELETE_LOGS}" = "true" ] && switches+=" -L"
   # -c: don't complain about missing files (from properly shut down beegfs-ondemand
   # instances)
   [ "${CLEANUP}" = "true" ] && switches+=" -c"

   echo "${switches}"
}
stop_procs()
{
   # Stop the non-client services on all hosts through the sourced stoplocal helper,
   # then delete the status file on all hosts. Failures on individual hosts do not
   # abort the script here.
   # $1 - comma-separated host list
   # $2 - "true" to delete the data on disk
   # $3 - "true" to delete the log files
   # Globals read:    BEEOND_STOPLOCAL, STATUSFILE, USE_PDSH
   # Globals written: STOPSERVERSCMD, HOST (loop variable); IFS is set to ',' and unset
   #                  again in ssh mode
   local HOSTS=$1
   # These two locals have to keep their names: make_stoplocal_args reads DELETE_DATA
   # and DELETE_LOGS, and sees the local values because it is called from here.
   local DELETE_DATA=$2
   local DELETE_LOGS=$3
   local DELETESTATUSFILECMD="rm -f ${STATUSFILE}"

   # command line for the remote script
   STOPSERVERSCMD="source ${BEEOND_STOPLOCAL}; do_stoplocal -s -i ${STATUSFILE} $(make_stoplocal_args)"

   if [ "${USE_PDSH}" = "true" ]
   then
      print_info "Stopping remaining processes on the following hosts: ${HOSTS}"
      execute_pdsh_cmd "${HOSTS}" "${STOPSERVERSCMD}" "true"

      print_info "Deleting status file on hosts: ${HOSTS}"
      execute_pdsh_cmd "${HOSTS}" "${DELETESTATUSFILECMD}" "true"
      return
   fi

   # ssh mode - launch each command for each host separately
   IFS=,
   for HOST in ${HOSTS}
   do
      print_info "Stopping remaining processes on host: ${HOST}"
      execute_ssh_cmd "${HOST}" "${STOPSERVERSCMD}"
   done
   for HOST in ${HOSTS}
   do
      print_info "Deleting status file on host: ${HOST}"
      execute_ssh_cmd "${HOST}" "${DELETESTATUSFILECMD}"
   done
   unset IFS
}
unmount_clients()
{
   # Unmount the file system on all hosts through the sourced stoplocal helper.
   # Failures on individual hosts do not abort the script here.
   # $1 - comma-separated host list
   # Globals read:    BEEOND_STOPLOCAL, STATUSFILE, USE_PDSH
   # Globals written: HOSTS (not declared local in this function), HOST (loop variable);
   #                  IFS is set to ',' and unset again in ssh mode
   HOSTS=$1

   # command line for the remote script
   local UMOUNTCMD
   UMOUNTCMD="source ${BEEOND_STOPLOCAL}; do_stoplocal -u -i ${STATUSFILE} $(make_stoplocal_args)"

   if [ "${USE_PDSH}" != "true" ]
   then
      # ssh mode - launch the command for each host separately
      IFS=,
      for HOST in ${HOSTS}
      do
         print_info "Unmounting file system on host: ${HOST}"
         execute_ssh_cmd "${HOST}" "${UMOUNTCMD}"
      done
      unset IFS
      return
   fi

   print_info "Unmounting file system on the following hosts: ${HOSTS}"
   execute_pdsh_cmd "${HOSTS}" "${UMOUNTCMD}" "true"
}
# Blocks until all nodes of one type are registered, all their targets are online/good
# and all targets have reported their available space. There is no timeout: each phase
# polls the control tool once per second until its condition holds.
# Parameters: nodetype numNodes
# Globals read: CTL_BIN, CTL_GLOBAL_PARAMS
wait_online_good()
{
   local node_kind=$1
   local expected_nodes=$2
   local ctl_query
   local good_rows
   local all_rows

   # Phase 1: wait for the correct number of *nodes* to become available. The first and
   # the last line of the tool output are not counted.
   echo -n "Waiting until all nodes have registered with mgmtd..."
   ctl_query="${CTL_BIN} ${CTL_GLOBAL_PARAMS} node list --node-type ${node_kind}"
   until [ "$(${ctl_query} | head -n-1 | tail -n+2 | wc -l)" = "${expected_nodes}" ]
   do
      echo -n "."
      sleep 1
   done
   echo

   # Phase 2: wait for all *targets* to become online/good. (This also works for
   # metadata servers, since they are internally treated as one target per server.)
   echo -n "Waiting for all nodes/targets to be online and in sync..."
   ctl_query="${CTL_BIN} ${CTL_GLOBAL_PARAMS} target list --node-type ${node_kind} --state --columns reachability,consistency"
   while true
   do
      good_rows=$(${ctl_query} | head -n-1 | tail -n+2 | grep "Online\s\+Good" -c)
      all_rows=$(${ctl_query} | head -n-1 | tail -n+2 | wc -l)
      if [ "${good_rows}" = "${all_rows}" ]
      then
         break
      fi
      echo -n "."
      sleep 1
   done
   echo

   # Phase 3: wait until all targets have reported their available space and inodes.
   # Checking space only should be good enough, because both are reported at the same
   # time. Without this, the automatic mirror group creation might fail, because it
   # compares target sizes and free inodes. A row consisting of "-" means "not reported".
   echo -n "Waiting for all nodes/targets to report their available space..."
   ctl_query="${CTL_BIN} ${CTL_GLOBAL_PARAMS} target list --node-type ${node_kind} --raw --columns space"
   until [ "$(${ctl_query} | head -n-1 | tail -n+2 | grep -s '^-\s\+$' | wc -l)" == 0 ]
   do
      echo -n "."
      sleep 1
   done
   echo
}
### main functions ###
# Implements the "start" action: parses the command line options, runs the sanity checks, starts
# mgmtd, storage and metadata services, optionally sets up mirroring, starts the clients and
# finally prints a summary on how to point the `beegfs` tool to the new mgmtd.
# Parameters: the command line arguments following "start"
# Note: the option variables are set as globals on purpose; the check_* / start_* helper functions
# are called without most of them being passed as arguments.
do_start()
{
   CLEAR_DATA="false"
   USE_PDSH="false"
   PREFER_LOCAL="false"
   QUIET="false"
   USE_TMPFS="false"
   STORAGE_MIRROR="false"
   META_MIRROR="false"
   PORT_SHIFT=${DEFAULT_PORT_SHIFT}
   ASSIGN_STORAGE_POOL_DIRS=true
   CONNAUTH_FLAG="--auth-disable"
   CONNAUTH_LEGACY="connDisableAuthentication=true"
   TLS_DISABLE="--tls-disable"
   TLS_CERT_FILE=""
   TLS_KEY_FILE=""
   LICENSE_FILE=""
   MGMTD_GRPC_PORT=${DEFAULT_MGMTD_GRPC_PORT}

   # Remember whether -C / -E were given. CONNAUTH_FLAG and TLS_DISABLE are never empty (they
   # hold either the "disable" or the "enable" variant), so they cannot be used to decide which
   # hints to print in the final summary.
   local CONNAUTH_ENABLED="false"
   local TLS_ENABLED="false"

   # leading ':' selects silent error reporting, so that the '\?' and ':' cases below are used
   while getopts ":c:d:f:Fi:m:n:p:G:lL:Pb:s:qrkjt:TCEH" opt; do
      case $opt in
         n)
            HOSTFILE=${OPTARG}
            ;;
         d)
            DATA_PATH=${OPTARG}
            ;;
         F)
            CLEAR_DATA="true"
            ;;
         c)
            MOUNTPOINT=${OPTARG}
            ;;
         i)
            STATUSFILE=${OPTARG}
            ;;
         L)
            LOG_PATH=${OPTARG}
            ;;
         m)
            if ! [[ ${OPTARG} =~ ^[0-9]+$ ]]
            then
               print_error_and_exit "number of metadata servers must be numeric"
            fi
            NUM_META_SERVER=${OPTARG}
            ;;
         p)
            if ! [[ ${OPTARG} =~ ^[0-9]+$ ]]
            then
               print_error_and_exit "port shift must be numeric"
            fi
            PORT_SHIFT=${OPTARG}
            ;;
         G)
            if ! [[ ${OPTARG} =~ ^[0-9]+$ ]]
            then
               print_error_and_exit "management gRPC port must be numeric"
            fi
            MGMTD_GRPC_PORT=${OPTARG}
            ;;
         P)
            USE_PDSH="true"
            ;;
         b)
            PDSH=${OPTARG}
            ;;
         s)
            if ! [[ ${OPTARG} =~ ^[0-9]+$ ]]
            then
               print_error_and_exit "number of storage servers must be numeric"
            fi
            NUM_STORAGE_SERVER=$OPTARG
            ;;
         f)
            if ! [[ -d ${OPTARG} ]]; then
               print_error_and_exit "The -f option expects a path to a directory: ${OPTARG}"
            fi
            CONFIGPATH=${OPTARG}
            ;;
         l)
            PREFER_LOCAL="true"
            ;;
         q)
            QUIET="true"
            ;;
         r)
            USE_TMPFS="true"
            ;;
         k)
            # options are evaluated in command line order, hence -H has to precede -k/-j/-t
            if [[ -z ${LICENSE_FILE} ]] ; then
               print_error_and_exit "To use mirroring, licensing (option -H) must be configured before the option -k"
            fi
            STORAGE_MIRROR="true"
            ;;
         j)
            if [[ -z ${LICENSE_FILE} ]] ; then
               print_error_and_exit "To use mirroring, licensing (option -H) must be configured before the option -j"
            fi
            META_MIRROR="true"
            ;;
         t)
            if [[ -z ${LICENSE_FILE} ]] ; then
               print_error_and_exit "To use storage pools, licensing (option -H) must be configured before the option -t"
            fi
            TARGETFILE=${OPTARG}
            ;;
         T)
            ASSIGN_STORAGE_POOL_DIRS="false"
            ;;
         C)
            # for the same reason, -f has to precede -C/-E/-H
            if ! [[ -f ${CONFIGPATH}/conn.auth ]] ; then
               print_error_and_exit "To use connection authentication, a config path (option -f) that contains a \"conn.auth\" file and is available on all nodes needs to be specified before the option -C"
            fi
            CONNAUTH_FLAG="--auth-file ${CONFIGPATH}/conn.auth"
            CONNAUTH_LEGACY="connAuthFile=${CONFIGPATH}/conn.auth connDisableAuthentication=false"
            CONNAUTH_ENABLED="true"
            ;;
         E)
            if ! [[ -f ${CONFIGPATH}/cert.pem && -f ${CONFIGPATH}/key.pem ]] ; then
               print_error_and_exit "To use TLS encryption, a config path (option -f) that contains a \"cert.pem\" and a \"key.pem\" file and is available on all nodes needs to be specified before the option -E"
            fi
            TLS_DISABLE="--tls-disable=false"
            TLS_CERT_FILE="--tls-cert-file ${CONFIGPATH}/cert.pem"
            TLS_KEY_FILE="--tls-key-file ${CONFIGPATH}/key.pem"
            TLS_ENABLED="true"
            ;;
         H)
            if ! [[ -f ${CONFIGPATH}/license.pem ]] ; then
               print_error_and_exit "To use enterprise features, a config path (option -f) that contains a \"license.pem\" file and is available on all nodes needs to be specified before the option -H"
            fi
            LICENSE_FILE="--license-cert-file ${CONFIGPATH}/license.pem"
            ;;
         \?)
            echo "ERROR: invalid option: -${OPTARG}" >&2
            print_usage_and_exit
            ;;
         :)
            echo "ERROR: Option -${OPTARG} requires an argument" >&2
            print_usage_and_exit
            ;;
      esac
   done

   if [ "${USE_PDSH}" = "true" ]
   then
      # an explicitly given pdsh binary (-b) takes precedence over the autodetected one
      PDSH=${PDSH:-${DEFAULT_PDSH_PATH}}
      if [ -z "${PDSH}" ]; then
         echo "Unable to autodetect pdsh. Please specify using the -b option."
         exit 1
      fi
   fi

   check_hostfile
   check_datapath
   check_mountpoint

   if [ "${STORAGE_MIRROR}" = "true" ] && [ "${PREFER_LOCAL}" = "true" ]
   then
      print_error_and_exit "Options -k and -l are mutually exclusive."
   fi

   if [ "${TARGETFILE}" != "" ]
   then
      check_targetfile
   fi

   print_info "Using status information file: ${STATUSFILE}"

   # HOSTFILE is quoted, so that a node file path containing whitespace works
   NODECOUNT=$(grep -v '^$' "${HOSTFILE}" | uniq | wc -l) #ignore empty lines
   # the command substitution is intentionally unquoted: word splitting fills the array
   NODES=( $(grep -v '^$' "${HOSTFILE}" | uniq) ) #store as array and ignore empty lines

   # make list of all nodes first - needed for clients and tmpfs mounts
   ALLNODES=$(IFS=,; echo "${NODES[*]}")

   if [ "${USE_PDSH}" = "true" ]
   then
      # check all nodes for reachability and working PDSH
      check_pdsh "${ALLNODES}"
   else
      # check reachability of all nodes
      for HOST in "${NODES[@]}"
      do
         check_reachability "${HOST}"
      done
   fi

   check_statusfile "${ALLNODES}"

   # if the number of meta servers given is 0 or greater than node count, start it on all hosts
   if [ "${NUM_META_SERVER}" -eq 0 ] || [ "${NUM_META_SERVER}" -gt "${NODECOUNT}" ]
   then
      NUM_META_SERVER=${NODECOUNT}
      print_info "Number of metadata servers automatically set to ${NUM_META_SERVER}"
   fi

   # if the number of storage servers given is 0 or greater than node count, start it on all hosts
   if [ "${NUM_STORAGE_SERVER}" -eq 0 ] || [ "${NUM_STORAGE_SERVER}" -gt "${NODECOUNT}" ]
   then
      NUM_STORAGE_SERVER=${NODECOUNT}
      print_info "Number of storage servers automatically set to ${NUM_STORAGE_SERVER}"
   fi

   # create the log path on all nodes if it doesn't exist yet
   # without an existing logfile path, the server won't start up
   create_log_path "${NODES[@]}"

   # take the first host as master host
   MASTERHOST=${NODES[0]}

   # delete STATUS_FILE
   execute_ssh_cmd "${MASTERHOST}" "rm -f ${STATUSFILE}"

   # mount tmpfs
   if [ "${USE_TMPFS}" = "true" ]
   then
      start_tmpfs "${ALLNODES}" "${DATA_PATH}"
   fi

   # MASTERHOST is also mgmtd host
   MGMTD=${MASTERHOST}

   # The gRPC port for MGMTD, important for CTL
   MGMTD_GRPC_PORT=$((MGMTD_GRPC_PORT+PORT_SHIFT))

   # Combine variables relevant to CTL into one
   # (used unquoted in the CTL calls below, so that it is split into separate arguments)
   CTL_GLOBAL_PARAMS="--mgmtd-addr ${MGMTD}:${MGMTD_GRPC_PORT} ${TLS_DISABLE} ${TLS_CERT_FILE} ${CONNAUTH_FLAG}"

   # port shift and config path may be empty, but that's ok
   start_mgmtd "${MGMTD}" "${DATA_PATH}" "${PORT_SHIFT}" "${CONFIGPATH}"

   # take the first NUM_STORAGE_SERVER as storage servers
   STORAGENODES=$(IFS=,; echo "${NODES[*]:0:${NUM_STORAGE_SERVER}}")

   # port shift and config path may be empty, but that's ok
   start_storage_servers "${STORAGENODES}" "${DATA_PATH}" "${MGMTD}" "${PORT_SHIFT}" "${CONFIGPATH}"

   # take the first NUM_META_SERVER as metadata servers
   METANODES=$(IFS=,; echo "${NODES[*]:0:${NUM_META_SERVER}}")

   # port shift and config path may be empty, but that's ok
   start_meta_servers "${METANODES}" "${DATA_PATH}" "${MGMTD}" "${PORT_SHIFT}" "${CONFIGPATH}"

   # give the management daemon some time to get all information from servers
   wait_online_good storage "${NUM_STORAGE_SERVER}"
   wait_online_good meta "${NUM_META_SERVER}"

   # enable mirroring
   if [ "${STORAGE_MIRROR}" = "true" ]
   then
      if ! ${CTL_BIN} ${CTL_GLOBAL_PARAMS} \
         mirror autocreate storage > /dev/null
      then
         print_error_and_exit "Unable to create storage target buddy mirror groups."
      fi

      # all metadata servers need to know about the storage mirror groups
      sleep 8
   fi

   if [ "${META_MIRROR}" = "true" ]
   then
      if ! ${CTL_BIN} ${CTL_GLOBAL_PARAMS} \
         mirror autocreate meta > /dev/null
      then
         print_error_and_exit "Unable to create metadata server buddy mirror groups."
      fi

      # all metadata servers need to know about the newly created mirror groups
      sleep 8

      if ! ${CTL_BIN} ${CTL_GLOBAL_PARAMS} \
         mirror init --yes > /dev/null
      then
         print_error_and_exit "Unable to enable metadata mirroring."
      fi

      # wait for initial resync
      wait_online_good meta "${NUM_META_SERVER}"
   fi

   # take all hosts as client
   # port shift and config path may be empty, but that's ok
   start_clients "${ALLNODES}" "${MGMTD}" "${MOUNTPOINT}" "${PORT_SHIFT}" "${CONFIGPATH}"

   if [ "${ASSIGN_STORAGE_POOL_DIRS}" = "true" ] && [ "${TARGETFILE}" != "" ]
   then
      assign_storage_pool_dirs
   fi

   # the mirrored pattern can only be set once the file system is mounted
   if [ "${STORAGE_MIRROR}" = "true" ]
   then
      if ! ${CTL_BIN} ${CTL_GLOBAL_PARAMS} \
         entry set --chunk-size 512ki --num-targets 4 --pattern mirrored "${MOUNTPOINT}" > /dev/null
      then
         print_error_and_exit "Unable to enable mirroring pattern."
      fi
   fi

   echo " ****************************************************************************** "
   echo "* BeeOND setup finished successfully! To configure the \`beegfs\` command line"
   echo "* utility to talk to the BeeOND mgmtd service, some additional configuration"
   echo "* might be necessary."
   echo "*"
   echo "* If there is more than one BeeGFS mounted on the node that runs \`beegfs\`,"
   echo "* the correct mgmtd will need to be configured by using"
   echo "* --mgmtd-addr \"${MGMTD}:${MGMTD_GRPC_PORT}\" or"
   echo "* export BEEGFS_MGMTD_ADDR=\"${MGMTD}:${MGMTD_GRPC_PORT}\""

   # only point to conn.auth / cert.pem if the respective feature was actually enabled;
   # otherwise name the flag this script itself passes to the `beegfs` tool
   if [ "${CONNAUTH_ENABLED}" = "true" ]
   then
      echo "*"
      echo "* To configure \`beegfs\` to use the correct connection authentication file,"
      echo "* please use"
      echo "* --auth-file \"${CONFIGPATH}/conn.auth\" or"
      echo "* export BEEGFS_AUTH_FILE=\"${CONFIGPATH}/conn.auth\""
   else
      echo "*"
      echo "* Connection authentication is disabled for this BeeOND instance, so \`beegfs\`"
      echo "* needs to be called with --auth-disable"
   fi

   if [ "${TLS_ENABLED}" = "true" ]
   then
      echo "*"
      echo "* To configure \`beegfs\` to use TLS encryption when talking to mgmtd, use"
      echo "* --tls-cert-file \"${CONFIGPATH}/cert.pem\" or"
      echo "* export BEEGFS_TLS_CERT_FILE=\"${CONFIGPATH}/cert.pem\""
   else
      echo "*"
      echo "* TLS encryption is disabled for this BeeOND instance, so \`beegfs\` needs to be"
      echo "* called with --tls-disable"
   fi
   echo " ****************************************************************************** "
}
# Implements the "stop" action: parses the command line options, checks the node file and the
# reachability of all nodes, then unmounts the clients and stops the remaining processes on all
# nodes.
# Parameters: the command line arguments following "stop"
do_stop()
{
   DELETE_DATA="false"
   USE_PDSH="false"
   DELETE_LOGS="false"
   QUIET="false"
   CLEANUP="false"

   # The leading ':' selects silent error reporting. Without it getopts reports a missing
   # argument as '?' with OPTARG unset, so the ':' case was never reached and the messages below
   # were printed without the offending option letter.
   while getopts ":di:n:Pb:Lcq" opt; do
      case $opt in
         n)
            HOSTFILE=${OPTARG}
            ;;
         i)
            STATUSFILE=${OPTARG}
            ;;
         d)
            DELETE_DATA="true"
            ;;
         c)
            CLEANUP="true"
            ;;
         P)
            USE_PDSH="true"
            ;;
         b)
            PDSH=${OPTARG}
            ;;
         L)
            DELETE_LOGS="true"
            ;;
         q)
            QUIET="true"
            ;;
         \?)
            echo "ERROR: invalid option: -${OPTARG}" >&2
            print_usage_and_exit
            ;;
         :)
            echo "ERROR: Option -${OPTARG} requires an argument" >&2
            print_usage_and_exit
            ;;
      esac
   done

   if [ "${USE_PDSH}" = "true" ]
   then
      # an explicitly given pdsh binary (-b) takes precedence over the autodetected one
      PDSH=${PDSH:-${DEFAULT_PDSH_PATH}}
      if [ -z "${PDSH}" ]; then
         echo "Unable to autodetect pdsh. Please specify using the -b option."
         exit 1
      fi
   fi

   check_hostfile

   print_info "Using status information file: ${STATUSFILE}"

   # HOSTFILE is quoted, so that a node file path containing whitespace works; the command
   # substitution itself is intentionally unquoted: word splitting fills the array
   NODES=( $(grep -v '^$' "${HOSTFILE}" | uniq) ) #store as array and ignore empty lines
   ALLNODES=$(IFS=,; echo "${NODES[*]}")

   if [ "${USE_PDSH}" = "true" ]
   then
      # check all nodes for reachability and working PDSH
      check_pdsh "${ALLNODES}"
   else
      # check reachability of all nodes
      for HOST in "${NODES[@]}"
      do
         check_reachability "${HOST}"
      done
   fi

   # take the first host as master host
   # (not used in this function; kept because it is a global that other code may read)
   MASTERHOST=${NODES[0]}

   # unmount the file system on all nodes first
   unmount_clients "${ALLNODES}"

   # then stop the remaining processes on all nodes
   stop_procs "${ALLNODES}" "${DELETE_DATA}" "${DELETE_LOGS}"
}
# without any argument there is nothing to do but to show the help text
(( $# > 0 )) || print_usage_and_exit

# the first argument selects the action, everything after it is handed to that action
ACTION=$1
case "${ACTION}" in
   start)
      shift
      do_start "$@"
      ;;
   stop)
      shift
      # The stop function does not abort on error. ERROR records whether something went wrong,
      # so that a proper status code can be returned afterwards.
      ERROR="false"
      do_stop "$@"
      if [ "${ERROR}" = "true" ]
      then
         exit 1
      fi
      ;;
   stoplocal)
      shift
      do_stoplocal "$@"
      exit $?
      ;;
   *)
      print_usage_and_exit
      ;;
esac