beegfs/beeond/scripts/lib/beeond-lib
2025-08-10 01:34:16 +02:00

393 lines
14 KiB
Bash

#!/bin/bash
# This file contains some functions used across all of the BeeOND scripts.
# Common prefix of every helper file name defined below. The file list generator skips
# top-level entries whose name starts with this prefix.
BEEOND_FILENAME_PREFIX=".beeond_"

# Session info file (written by beeond_save_session_info).
BEEOND_SESSION_FILE="${BEEOND_FILENAME_PREFIX}session"

# Lists used by the parallel copy.
BEEOND_COPY_SCAN_LIST="${BEEOND_FILENAME_PREFIX}scan_list" # List of dirs that have to be scanned.
BEEOND_COPY_DIR_LIST="${BEEOND_FILENAME_PREFIX}dirs_copy"
BEEOND_COPY_FILE_LIST="${BEEOND_FILENAME_PREFIX}files_copy"

# File and directory lists taken after stage-in.
BEEOND_START_FILE_LIST="${BEEOND_FILENAME_PREFIX}files_start"
BEEOND_START_DIR_LIST="${BEEOND_FILENAME_PREFIX}dirs_start"

# File and directory lists taken at stage-out: the complete lists, and the lists of entries
# that are newer than the corresponding start list.
BEEOND_END_FILE_LIST="${BEEOND_FILENAME_PREFIX}files_end"
BEEOND_END_DIR_LIST="${BEEOND_FILENAME_PREFIX}dirs_end"
BEEOND_END_UPDATED_FILE_LIST="${BEEOND_FILENAME_PREFIX}files_end_updated"
BEEOND_END_UPDATED_DIR_LIST="${BEEOND_FILENAME_PREFIX}dirs_end_updated"

# Value passed to ${PARALLEL} as -N (records per job) by the functions in this file.
BEEOND_BATCH_SIZE=20
# Print an error message followed by an empty line.
#   $1 - message text (printed as "ERROR: <text>")
# Both the message and the separating empty line are written to stderr, so that stdout of
# the calling script is not polluted by diagnostics.
beeond_print_error()
{
   printf 'ERROR: %s\n\n' "${1}" >&2
}
# Report a fatal error and terminate the shell.
#   $1 - message text, handed to beeond_print_error
# Does not return: the current shell exits with status 1.
beeond_print_error_and_exit()
{
   local message="${1}"

   # Everything beeond_print_error writes (stdout included) is sent to stderr.
   beeond_print_error "${message}" 1>&2
   exit 1
}
# Print an informational message to stdout as "INFO: <text>".
#   $1 - message text
# Globals: QUIET (read) - when set to "true" nothing is printed.
beeond_print_info()
{
   # Quiet mode: stay silent and report success.
   [ "${QUIET}" = "true" ] && return 0

   echo "INFO: ${1}"
}
# Write the session info file, which records the node file and the global store path.
#   $1 - path of the node file
#   $2 - path of the global store
# Globals: LOCAL_PATH (read) - directory the session file is written to,
#          BEEOND_SESSION_FILE (read) - name of the session file.
# Both values are written shell-quoted (printf %q) as "NodeFile=..." and "GlobalPath=...".
# Exits via beeond_print_error_and_exit if the file cannot be written.
beeond_save_session_info()
{
   local NODEFILE="${1}"
   local GLOBAL_PATH="${2}"
   local SESSION_FILE="${LOCAL_PATH}/${BEEOND_SESSION_FILE}"

   printf "NodeFile=%q\nGlobalPath=%q\n" "${NODEFILE}" "${GLOBAL_PATH}" > "${SESSION_FILE}" \
      || beeond_print_error_and_exit "Could not write to session file."
}
# Generate the list of files in the beeond folder.
#   $1 - directory to scan
#   $2 - list file to write; a relative path is resolved relative to $1
#   $3 - optional reference file (a relative path is resolved relative to $1). When given,
#        only entries whose change or modification time is newer than the reference file
#        are listed; when empty, the full list is generated (e.g. on startup).
# The list contains regular files and symlinks as shell-quoted (printf %q) paths relative to
# $1, one per line, sorted. Top-level entries starting with BEEOND_FILENAME_PREFIX are
# skipped.
# Exits via beeond_print_error_and_exit if $1 cannot be entered; otherwise find would scan
# whatever the current directory happens to be.
beeond_generate_file_list()
{
   local LOCAL_PATH="${1}"
   local LISTFILE="${2}"
   local REFERENCE_FILE="${3}"
   local -a NEWER_FILTER=()

   # Directory stack output is discarded so that it does not end up on stdout.
   if ! pushd "${LOCAL_PATH}" > /dev/null
   then
      beeond_print_error_and_exit "Could not change to directory: ${LOCAL_PATH}"
   fi

   if [ "${REFERENCE_FILE}" = "" ]
   then # No reference file - just generate the full list (e.g. on startup).
      beeond_print_info "Generating file list ${LISTFILE}..."
   else # Reference file given: Compare timestamps.
      beeond_print_info \
         "Generating file list ${LISTFILE}. Timestamp reference: ${REFERENCE_FILE}..."
      NEWER_FILTER=( \( -cnewer "${REFERENCE_FILE}" -or -newer "${REFERENCE_FILE}" \) )
   fi

   # "-exec ... {} +" hands the paths to bash in batches instead of starting one bash per
   # file; printf repeats its format for every argument, so the output is the same.
   # The grep statement filters out file names with newlines in them (printf %q prints those
   # in $'...' form). While they are technically legal they would cause problems later on due
   # to the way the script run by parallel handles the arguments.
   find . ! -path "./${BEEOND_FILENAME_PREFIX}*" \( -type f -or -type l \) \
         "${NEWER_FILTER[@]}" \
         -exec bash -c 'printf "%q\n" "$@"' bash {} + \
      | grep -v '^\$' | sort > "${LISTFILE}"

   popd > /dev/null
}
# Generate list of directories - this is necessary in case directories were created during the
# session and need to be created on the global store as well.
#   $1 - directory to scan
#   $2 - list file to write; a relative path is resolved relative to $1
#   $3 - optional reference file (a relative path is resolved relative to $1). When given,
#        only directories whose change or modification time is newer than the reference file
#        are listed; when empty, the full list is generated (e.g. on startup).
# The list contains all directories below $1 (not $1 itself) as shell-quoted (printf %q)
# paths relative to $1, one per line, sorted.
# Exits via beeond_print_error_and_exit if $1 cannot be entered; otherwise find would scan
# whatever the current directory happens to be.
beeond_generate_directory_list()
{
   local LOCAL_PATH="${1}"
   local LISTFILE="${2}"
   local REFERENCE_FILE="${3}"
   local -a NEWER_FILTER=()

   # Directory stack output is discarded so that it does not end up on stdout.
   if ! pushd "${LOCAL_PATH}" > /dev/null
   then
      beeond_print_error_and_exit "Could not change to directory: ${LOCAL_PATH}"
   fi

   if [ "${REFERENCE_FILE}" = "" ]
   then # No reference file - just generate the full list (e.g. on startup).
      beeond_print_info "Generating directory list ${LISTFILE}..."
   else # Reference file given: Compare timestamps.
      beeond_print_info \
         "Generating directory list ${LISTFILE}. Timestamp reference: ${REFERENCE_FILE}..."
      NEWER_FILTER=( \( -cnewer "${REFERENCE_FILE}" -or -newer "${REFERENCE_FILE}" \) )
   fi

   # "-exec ... {} +" hands the paths to bash in batches instead of starting one bash per
   # directory; printf repeats its format for every argument, so the output is the same.
   # The grep statement drops names that printf %q had to print in $'...' form (e.g. names
   # containing newlines).
   find . ! -path . -type d \
         "${NEWER_FILTER[@]}" \
         -exec bash -c 'printf "%q\n" "$@"' bash {} + \
      | grep -v '^\$' | sort > "${LISTFILE}"

   popd > /dev/null
}
# Generate copy file list. If we do a parallel copy, we can't just list the paths to all the files
# to be copied, because we have to "flatten" the folder hierarchy (e.g. when the user says
# "copy dir/subdir/file_a anotherdir/file_b target_dir" we want to end up with
# target_dir/file_a and target_dir/file_b). To achieve this, we just save an explicit target path
# to each source file. We also have to keep a list of directories we encounter because we want to
# create them before we start copying.
#   $1   - target directory (relative paths are resolved against the current directory)
#   $2   - node list, passed to ${PARALLEL} as -S
#   $3   - concurrency, passed to ${PARALLEL} as -j
#   $4.. - source files and directories
# Writes three lists into the target directory:
#   BEEOND_COPY_SCAN_LIST - source directories that have to be scanned,
#   BEEOND_COPY_DIR_LIST  - directories to create, relative to the target,
#   BEEOND_COPY_FILE_LIST - "<source> <target>" pairs, both shell-quoted (printf %q).
# Exits via beeond_print_error_and_exit if a source does not exist.
beeond_generate_copy_lists()
{
   local TARGET="${1}"
   local NODE_LIST="${2}"
   local CONCURRENCY="${3}"
   local ENTRY
   shift 3

   # Note: We do relative path expansion here, (and not directly in the do_... functions)
   # because this is the first time we iterate over the source list.
   # Expand target path.
   if [ ! "${TARGET:0:1}" = "/" ]
   then
      TARGET="${PWD}/${TARGET}"
   fi

   # Delete possibly left over file lists.
   rm -f "${TARGET}/${BEEOND_COPY_SCAN_LIST}" \
      "${TARGET}/${BEEOND_COPY_DIR_LIST}" \
      "${TARGET}/${BEEOND_COPY_FILE_LIST}"

   # Generate lists: A list of files which can be used directly, and a list of directories which
   # have to be scanned first.
   for ENTRY in "$@"
   do
      # Expand path if it's relative.
      if [ ! "${ENTRY:0:1}" = "/" ]
      then
         ENTRY="${PWD}/${ENTRY}"
      fi

      beeond_print_info "Path to scan: ${ENTRY}"

      if [ -d "${ENTRY}" ]; then
         printf "%q\n" "${ENTRY}" >> "${TARGET}/${BEEOND_COPY_SCAN_LIST}"
      elif [ -f "${ENTRY}" ]; then
         # A plain file goes straight into the target directory. The entry uses the same
         # "<source> <target>" format as the entries produced by the directory scan below.
         printf "%q %q\n" "${ENTRY}" "${TARGET}/$(basename "${ENTRY}")" \
            >> "${TARGET}/${BEEOND_COPY_FILE_LIST}"
      else
         beeond_print_error_and_exit "File or directory does not exist: ${ENTRY}"
      fi
   done

   if [ ! -s "${TARGET}/${BEEOND_COPY_SCAN_LIST}" ]
   then
      # No directories were given, so there is nothing to scan. Make sure both lists exist
      # (without touching file entries written above) for the copy step.
      : > "${TARGET}/${BEEOND_COPY_DIR_LIST}"
      : >> "${TARGET}/${BEEOND_COPY_FILE_LIST}"
      return 0
   fi

   beeond_print_info "Scanning sources..."

   # NOTE(review): the quoting of the two scripts handed to ${PARALLEL} is reproduced
   # unchanged; `read` is used without -r, which looks deliberate (it removes the backslash
   # escaping that printf %q added to the list entries) - confirm before changing.
   < "${TARGET}/${BEEOND_COPY_SCAN_LIST}" \
      ${PARALLEL} -S "${NODE_LIST}" -j"${CONCURRENCY}" --pipe --recend "\n" \
      -N${BEEOND_BATCH_SIZE} --controlmaster --will-cite \
      " \
      while read DIR; do \
         cd \"\${DIR}\"; \
         find . -type d -exec bash -c \\\'printf \"%s/%q\n\" \"\$0\" \"\$1\"\' \"\`basename \"\${DIR}\"\`\" \{\} \; \
         | grep -v ^\\$; \
      done;
      " \
      | sort > "${TARGET}/${BEEOND_COPY_DIR_LIST}"

   # Append (>>) the scan result: the file list may already hold the entries of plain files
   # given on the command line, which must not be discarded.
   < "${TARGET}/${BEEOND_COPY_SCAN_LIST}" \
      ${PARALLEL} -S "${NODE_LIST}" -j"${CONCURRENCY}" --pipe --recend "\n" \
      -N${BEEOND_BATCH_SIZE} --controlmaster --will-cite \
      " \
      while read DIR; do \
         cd \"\${DIR}\"; \
         find . \( -type f -or -type l \) -exec bash -c \
         \\\'printf \"%q %q\n\" \"\${PWD}/\$0\" \"${TARGET}/\`basename \"\${PWD}\"\`/\$0\"\' \{\} \; \
         | grep -v ^\\$; \
      done; \
      " \
      | sort >> "${TARGET}/${BEEOND_COPY_FILE_LIST}"
}
# Copy the files named in a previously generated file list into the target directory, in
# parallel. The directory structure is created first, then the files are copied into it.
#   $1 - target directory (relative paths are resolved against the current directory); the
#        lists BEEOND_COPY_DIR_LIST and BEEOND_COPY_FILE_LIST are read from it
#   $2 - node list, passed to ${PARALLEL} as -S
#   $3 - concurrency, passed to ${PARALLEL} as -j
beeond_parallel_copy()
{
   local TARGET="${1}"
   local NODE_LIST="${2}"
   local CONCURRENCY="${3}"
   local -a PARALLEL_CMD

   # Make the target path absolute.
   [ "${TARGET:0:1}" = "/" ] || TARGET="${PWD}/${TARGET}"

   # Invocation shared by both steps below; only the input list and the script differ.
   PARALLEL_CMD=( ${PARALLEL} -S "${NODE_LIST}" -j"${CONCURRENCY}" --pipe --recend "\n"
      -N${BEEOND_BATCH_SIZE} --controlmaster --will-cite )

   beeond_print_info "Generating target directory structure..."
   # Each line of the directory list is a path relative to the target.
   "${PARALLEL_CMD[@]}" \
      " \
      while read DIR; do \
         mkdir -pv \"${TARGET}/\${DIR}\"; \
      done; \
      " \
      < "${TARGET}/${BEEOND_COPY_DIR_LIST}"

   beeond_print_info "Copying files..."
   # Each line of the file list holds two fields: source path and target path.
   "${PARALLEL_CMD[@]}" \
      " \
      while read -a LINE; do \
         cp -av \"\${LINE[0]}\" \"\${LINE[1]}\"; \
      done; \
      " \
      < "${TARGET}/${BEEOND_COPY_FILE_LIST}"

   # Delete temporary files.
   # rm "${TARGET}/${BEEOND_COPY_SCAN_LIST}" \
   #    "${TARGET}/${BEEOND_COPY_DIR_LIST}" \
   #    "${TARGET}/${BEEOND_COPY_FILE_LIST}"
}
# Remove from the global store everything that was deleted during the session, i.e. every
# entry that is in the start list but no longer in the end list.
#   $1 - global store path
#   $2 - local path holding the start/end file and directory lists
#   $3 - node list, passed to ${PARALLEL} as -S
#   $4 - concurrency, passed to ${PARALLEL} as -j
beeond_remove_removed_files()
{
   local GLOBAL_PATH="${1}"
   local LOCAL_PATH="${2}"
   local NODE_LIST="${3}"
   local CONCURRENCY="${4}"
   local -a PARALLEL_CMD=( ${PARALLEL} -S "${NODE_LIST}" -j"${CONCURRENCY}" --pipe --recend "\n"
      -N${BEEOND_BATCH_SIZE} --controlmaster --will-cite )

   beeond_print_info "Deleting files:"
   # comm -23 keeps the lines that appear only in the start list.
   comm -23 "${LOCAL_PATH}/${BEEOND_START_FILE_LIST}" "${LOCAL_PATH}/${BEEOND_END_FILE_LIST}" \
      | "${PARALLEL_CMD[@]}" \
         " \
         while read FILE; do \
            rm -v \"${GLOBAL_PATH}/\${FILE}\"; \
         done; \
         "

   beeond_print_info "Deleting directories:"
   # Not being done in parallel to avoid deleting a subdirectory before its parent (this is also
   # the reason the list is inverted (tac)).
   comm -23 "${LOCAL_PATH}/${BEEOND_START_DIR_LIST}" "${LOCAL_PATH}/${BEEOND_END_DIR_LIST}" \
      | tac \
      | xargs -I{} rmdir -v "${GLOBAL_PATH}/{}"
}
# Copy everything that was created or changed during the session back to the global store.
#   $1 - global store path
#   $2 - local path holding the end lists and the data to copy
#   $3 - node list, passed to ${PARALLEL} as -S
#   $4 - concurrency, passed to ${PARALLEL} as -j
# Three steps: create all directories of the end directory list, copy the files of the
# updated file list, then copy the files found directly inside updated directories.
beeond_copy_updated_files()
{
   local GLOBAL_PATH="${1}"
   local LOCAL_PATH="${2}"
   local NODE_LIST="${3}"
   local CONCURRENCY="${4}"
   local -a PARALLEL_CMD=( ${PARALLEL} -S "${NODE_LIST}" -j"${CONCURRENCY}" --pipe --recend "\n"
      -N${BEEOND_BATCH_SIZE} --controlmaster --will-cite )

   beeond_print_info "Creating new directories:"
   "${PARALLEL_CMD[@]}" \
      " \
      while read DIR; do \
         mkdir -pv \"${GLOBAL_PATH}/\${DIR}\"; \
      done; \
      " \
      < "${LOCAL_PATH}/${BEEOND_END_DIR_LIST}"

   beeond_print_info "Copying back changed files:"
   "${PARALLEL_CMD[@]}" \
      " \
      while read FILE; do \
         cp -uv \"${LOCAL_PATH}/\${FILE}\" \"${GLOBAL_PATH}/\${FILE}\"; \
      done; \
      " \
      < "${LOCAL_PATH}/${BEEOND_END_UPDATED_FILE_LIST}"

   # Copy files into updated directories. (When a directory is renamed or files
   # are moved to a directory, the files in it don't have their timestamp
   # updated. Therefore, we need to check all the updated directories again).
   beeond_print_info "Copying back changed files (updated dirs):"
   # The directory list holds relative paths, so find has to run inside the local path.
   pushd "${LOCAL_PATH}"
   xargs -I{} find {} -maxdepth 1 \( -type f -or -type l \) \
         < "${LOCAL_PATH}/${BEEOND_END_UPDATED_DIR_LIST}" \
      | "${PARALLEL_CMD[@]}" \
         " \
         while read FILE; do \
            cp -uv \"${LOCAL_PATH}/\${FILE}\" \"\`dirname \"${GLOBAL_PATH}/\${FILE}\"\`\"; \
         done; \
         "
   popd
}
# Stage in process: Copy all files from the global store to the local store.
#   $1 - global store path (source)
#   $2 - local path (destination; also receives the start lists)
#   $3 - node list, passed to ${PARALLEL} as -S
#   $4 - concurrency, passed to ${PARALLEL} as -j
# A failed file copy is reported with beeond_print_error; the function then carries on.
beeond_stage_in()
{
   local GLOBAL_PATH="${1}"
   local LOCAL_PATH="${2}"
   local NODE_LIST="${3}"
   local CONCURRENCY="${4}"
   local START_FILES="${LOCAL_PATH}/${BEEOND_START_FILE_LIST}"
   local START_DIRS="${LOCAL_PATH}/${BEEOND_START_DIR_LIST}"
   local -a PARALLEL_CMD=( ${PARALLEL} -S "${NODE_LIST}" -j"${CONCURRENCY}" --pipe --recend "\n"
      -N${BEEOND_BATCH_SIZE} --controlmaster --will-cite )

   # Generate list of files that have to be copied.
   beeond_generate_file_list "${GLOBAL_PATH}" "${START_FILES}"
   beeond_generate_directory_list "${GLOBAL_PATH}" "${START_DIRS}"

   beeond_print_info "Creating directory structure..."
   # Each created directory gets the timestamps of its counterpart on the global store.
   "${PARALLEL_CMD[@]}" \
      " \
      while read DIR; do \
         mkdir -pv \"${LOCAL_PATH}/\${DIR}\"; \
         touch --reference=\"${GLOBAL_PATH}/\${DIR}\" \"${LOCAL_PATH}/\${DIR}\"; \
      done; \
      " \
      < "${START_DIRS}"

   beeond_print_info "Copying files to local directory..."
   "${PARALLEL_CMD[@]}" \
      " \
      while read FILE; do \
         cp -av \"${GLOBAL_PATH}/\${FILE}\" \"\`dirname \"${LOCAL_PATH}/\${FILE}\"\`\"/; \
      done;
      " \
      < "${START_FILES}" \
      || beeond_print_error "Stage-in copy did not succeed. Data is incompletely staged in."

   # Generate list of files and dirs that were actually copied to keep track if the user deletes
   # files during a session. (re-generate list here, so that if something went wrong during the
   # stage-in copy, we don't start deleting stuff from the global store by accident).
   beeond_generate_file_list "${LOCAL_PATH}" "${START_FILES}"
   beeond_generate_directory_list "${LOCAL_PATH}" "${START_DIRS}"
}
# Stage out process: remove all the files that have been removed during the beeond session,
# and copy back the files which have been changed.
#   $1 - global store path
#   $2 - local path (data and list files)
#   $3 - node list for the parallel steps
#   $4 - concurrency for the parallel steps
beeond_stage_out()
{
   local GLOBAL_PATH="${1}"
   local LOCAL_PATH="${2}"
   local NODE_LIST="${3}"
   local CONCURRENCY="${4}"
   local END_FILES="${LOCAL_PATH}/${BEEOND_END_FILE_LIST}"
   local END_DIRS="${LOCAL_PATH}/${BEEOND_END_DIR_LIST}"
   local UPDATED_FILES="${LOCAL_PATH}/${BEEOND_END_UPDATED_FILE_LIST}"
   local UPDATED_DIRS="${LOCAL_PATH}/${BEEOND_END_UPDATED_DIR_LIST}"

   beeond_print_info "Nodes for parallel stage out: ${NODE_LIST}."

   # Files: the complete end list, then the list of files newer than the start list. The
   # reference is passed by name only, without the local path.
   beeond_generate_file_list "${LOCAL_PATH}" "${END_FILES}"
   beeond_generate_file_list "${LOCAL_PATH}" "${UPDATED_FILES}" "${BEEOND_START_FILE_LIST}"

   # Directories: same two lists.
   beeond_generate_directory_list "${LOCAL_PATH}" "${END_DIRS}"
   beeond_generate_directory_list "${LOCAL_PATH}" "${UPDATED_DIRS}" "${BEEOND_START_DIR_LIST}"

   # Deletions first, then copy back what changed.
   beeond_remove_removed_files "${GLOBAL_PATH}" "${LOCAL_PATH}" "${NODE_LIST}" "${CONCURRENCY}"
   beeond_copy_updated_files "${GLOBAL_PATH}" "${LOCAL_PATH}" "${NODE_LIST}" "${CONCURRENCY}"
}
# Parallel copy process: copy all files and directories (recursively) into the target directory.
#   $1   - node list for the parallel steps
#   $2   - concurrency for the parallel steps
#   $3.. - sources, followed by the target directory as the last argument
beeond_copy()
{
   local NODE_LIST="${1}"
   local CONCURRENCY="${2}"
   local TARGET_DIR
   shift 2

   # After the shift the last remaining argument is the target; all others are sources.
   TARGET_DIR="${@:$#}"

   beeond_print_info "Nodes for parallel copy: ${NODE_LIST}; Concurrency: ${CONCURRENCY}"

   beeond_generate_copy_lists "${TARGET_DIR}" "${NODE_LIST}" "${CONCURRENCY}" "${@:1:$#-1}"
   beeond_parallel_copy "${TARGET_DIR}" "${NODE_LIST}" "${CONCURRENCY}"
}