#include <program/Program.h>
#include <common/components/worker/IncSyncedCounterWork.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesMsg.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesRespMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedRespMsg.h>
#include <common/threading/Barrier.h>
#include <common/toolkit/DebugVariable.h>
#include <common/toolkit/SynchronizedCounter.h>
#include <app/App.h>
#include <components/buddyresyncer/BuddyResyncerBulkSyncSlave.h>
#include <components/buddyresyncer/BuddyResyncerModSyncSlave.h>
#include <components/worker/BarrierWork.h>
#include <toolkit/BuddyCommTk.h>
#include "BuddyResyncJob.h"
BuddyResyncJob::BuddyResyncJob() :
PThread("BuddyResyncJob"),
state(BuddyResyncJobState_NOTSTARTED),
startTime(0), endTime(0),
gatherSlave(boost::make_unique<BuddyResyncerGatherSlave>(&syncCandidates))
{
App* app = Program::getApp();
Config* cfg = app->getConfig();
buddyNodeID =
NumNodeID(app->getMetaBuddyGroupMapper()->getBuddyTargetID(app->getLocalNodeNumID().val()));
const unsigned numSyncSlaves = std::max<unsigned>(cfg->getTuneNumResyncSlaves(), 1);
for (size_t i = 0; i < numSyncSlaves; i++)
bulkSyncSlaves.emplace_back(
boost::make_unique<BuddyResyncerBulkSyncSlave>(*this, &syncCandidates, i, buddyNodeID));
sessionStoreResyncer = boost::make_unique<SessionStoreResyncer>(buddyNodeID);
modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(*this, &syncCandidates, 1, buddyNodeID);
}
BuddyResyncJob::~BuddyResyncJob() = default;
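/**
* Main resync routine: suspends the worker threads, notifies the buddy that a resync has started,
* starts the gather and sync slaves, releases the workers, joins the slaves, suspends the workers
* again to resync the session store in a quiescent state, and finally reports the resulting
* consistency state to the buddy and the management node.
*/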
void BuddyResyncJob::run()
{
const char* logContext = "Run resync job";
InternodeSyncer* internodeSyncer = Program::getApp()->getInternodeSyncer();
App* app = Program::getApp();
WorkerList* workers = app->getWorkers();
NodeStore* metaNodes = app->getMetaNodes();
const std::string metaPath = app->getMetaPath();
const std::string metaBuddyMirPath = app->getMetaPath() + "/" + CONFIG_BUDDYMIRROR_SUBDIR_NAME;
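// barrier shared by all worker threads plus this resync thread; used below to suspend and resume
// the workers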
Barrier workerBarrier(workers->size() + 1);
bool workersStopped = false;
startTime = time(NULL);
syncCandidates.clear();
auto buddyNode = metaNodes->referenceNode(buddyNodeID);
if (!buddyNode)
{
LOG(MIRRORING, ERR, "Unable to resolve buddy node. Resync will not start.");
setState(BuddyResyncJobState_FAILURE);
goto cleanup;
}
DEBUG_ENV_VAR(unsigned, DIE_AT_RESYNC_N, 0, "BEEGFS_RESYNC_DIE_AT_N");
if (DIE_AT_RESYNC_N) {
static unsigned resyncs = 0;
// for #479: terminating a server at this point caused the workers to terminate before the
// resyncer had communicated with them, causing a deadlock on shutdown
if (++resyncs == DIE_AT_RESYNC_N) {
::kill(0, SIGTERM);
sleep(4);
}
}
stopAllWorkersOn(workerBarrier);
{
// Notify buddy that resync started and wait for confirmation
StorageResyncStartedMsg msg(buddyNodeID.val());
const auto respMsg = MessagingTk::requestResponse(*buddyNode, msg,
NETMSGTYPE_StorageResyncStartedResp);
if (!respMsg)
{
LogContext(logContext).logErr("Unable to notify buddy about resync attempt. "
"Resync will not start.");
setState(BuddyResyncJobState_FAILURE);
workerBarrier.wait();
goto cleanup;
}
// resync could have been aborted before we got here. if so, exit as soon as possible without
// setting the resync job state to something else.
{
std::unique_lock<Mutex> lock(stateMutex);
if (state == BuddyResyncJobState_INTERRUPTED)
{
lock.unlock();
workerBarrier.wait();
goto cleanup;
}
state = BuddyResyncJobState_RUNNING;
}
internodeSyncer->setResyncInProgress(true);
const bool startGatherSlaveRes = startGatherSlaves();
if (!startGatherSlaveRes)
{
setState(BuddyResyncJobState_FAILURE);
workerBarrier.wait();
goto cleanup;
}
const bool startResyncSlaveRes = startSyncSlaves();
if (!startResyncSlaveRes)
{
setState(BuddyResyncJobState_FAILURE);
workerBarrier.wait();
goto cleanup;
}
}
workerBarrier.wait();
LOG_DEBUG(__func__, Log_DEBUG, "Going to join gather slaves.");
joinGatherSlaves();
LOG_DEBUG(__func__, Log_DEBUG, "Joined gather slaves.");
LOG_DEBUG(__func__, Log_DEBUG, "Going to join sync slaves.");
// The gather slaves have finished. Tell the sync slaves to stop once their work packages are
// empty, then wait for them.
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
{
(*it)->setOnlyTerminateIfIdle(true);
(*it)->selfTerminate();
}
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
(*it)->join();
// here we can be in one of two situations:
// 1. bulk resync has succeeded. we then totally stop the workers: the session store must be in
// a quiescent state for resync, so for simplicity, we suspend all client operations here.
// we do not want to do this any earlier than this point, because bulk syncers may take a
// very long time to complete.
// 2. bulk resync has failed. in this case, the bulk syncers have aborted the currently running
// job, and the mod syncer is either dead or in the process of dying. here we MUST NOT stop
// the workers, because they are very likely blocked on the mod sync queue already and will
// not unblock before the queue is cleared.
if (getState() == BuddyResyncJobState_RUNNING)
{
stopAllWorkersOn(workerBarrier);
workersStopped = true;
}
modSyncSlave->setOnlyTerminateIfIdle(true);
modSyncSlave->selfTerminate();
modSyncSlave->join();
// The gatherers are done and the workers have been stopped, so we can safely resync the session
// store now.
LOG_DEBUG(__func__, Log_DEBUG, "Joined sync slaves.");
// Perform session store resync
// the job may have been aborted or terminated by errors. in this case, do not resync the session
// store. end the sync as quickly as possible.
if (getState() == BuddyResyncJobState_RUNNING)
sessionStoreResyncer->doSync();
// session store is now synced, and future actions can be forwarded safely. we do not restart
// the workers here because the resync may still enter FAILED state, and we don't want to forward
// to the secondary in this case.
cleanup:
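// cleanup phase: wait for all slaves to finish, aggregate their error counters, set and report
// the final job state, then release the suspended workers again.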
bool syncErrors = false;
{
std::lock_guard<Mutex> lock(gatherSlave->stateMutex);
while (gatherSlave->isRunning)
gatherSlave->isRunningChangeCond.wait(&gatherSlave->stateMutex);
syncErrors |= gatherSlave->getStats().errors != 0;
}
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
{
BuddyResyncerBulkSyncSlave* slave = it->get();
std::lock_guard<Mutex> lock(slave->stateMutex);
while (slave->isRunning)
slave->isRunningChangeCond.wait(&slave->stateMutex);
syncErrors |= slave->getStats().dirErrors != 0;
syncErrors |= slave->getStats().fileErrors != 0;
}
syncErrors |= sessionStoreResyncer->getStats().errors;
{
// as with the other slaves above, hold the state mutex while waiting on the running condition
std::lock_guard<Mutex> lock(modSyncSlave->stateMutex);
while (modSyncSlave->isRunning)
modSyncSlave->isRunningChangeCond.wait(&modSyncSlave->stateMutex);
syncErrors |= modSyncSlave->getStats().errors != 0;
}
if (getState() == BuddyResyncJobState_RUNNING || getState() == BuddyResyncJobState_INTERRUPTED)
{
if (syncErrors)
setState(BuddyResyncJobState_ERRORS);
else if (getState() == BuddyResyncJobState_RUNNING)
setState(BuddyResyncJobState_SUCCESS);
// delete timestamp override file if it exists.
BuddyCommTk::setBuddyNeedsResync(metaPath, false);
const TargetConsistencyState buddyState = newBuddyState();
informBuddy(buddyState);
informMgmtd(buddyState);
const bool interrupted = getState() != BuddyResyncJobState_SUCCESS;
LOG(MIRRORING, WARNING, "Resync finished.", interrupted, syncErrors);
}
internodeSyncer->setResyncInProgress(false);
endTime = time(NULL);
// restart all the worker threads
if (workersStopped)
workerBarrier.wait();
// if the resync was aborted, the mod sync queue may still contain items. additionally, workers
// may be waiting for a changeset slot, or they may have started executing after the resync was
// aborted by the sync slaves, but before the resync was officially set to "not running".
// we cannot set the resync to "not running" in abort() because we have no upper bound for the
// number of worker threads. even if we did set the resync to "not running" in abort() and
// cleared the sync queues at the same time, there may still be an arbitrary number of threads
// waiting for a changeset slot.
// instead, we have to wait for each thread to "see" that the resync is over, and periodically
// clear the sync queue to unblock those workers that are still waiting for slots.
if (syncErrors)
{
SynchronizedCounter counter;
for (auto it = workers->begin(); it != workers->end(); ++it)
{
auto& worker = **it;
worker.getWorkQueue()->addPersonalWork(
new IncSyncedCounterWork(&counter),
worker.getPersonalWorkQueue());
}
while (!counter.timedWaitForCount(workers->size(), 100))
{
while (!syncCandidates.isFilesEmpty())
{
MetaSyncCandidateFile candidate;
syncCandidates.fetch(candidate, this);
candidate.signal();
}
}
}
}
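/**
* Suspends all worker threads by adding a BarrierWork to each worker's personal work queue and
* then waiting on the given barrier until every worker has reached it.
*/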
void BuddyResyncJob::stopAllWorkersOn(Barrier& barrier)
{
WorkerList* workers = Program::getApp()->getWorkers();
for (WorkerListIter workerIt = workers->begin(); workerIt != workers->end(); ++workerIt)
{
Worker* worker = *workerIt;
PersonalWorkQueue* personalQ = worker->getPersonalWorkQueue();
MultiWorkQueue* workQueue = worker->getWorkQueue();
workQueue->addPersonalWork(new BarrierWork(&barrier), personalQ);
}
barrier.wait(); // Wait until all workers are blocked.
}
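/**
* Aborts a running resync: marks the job as INTERRUPTED, tells the gather and mod sync slaves to
* terminate and clears onlyTerminateIfIdle on the bulk sync slaves so the main loop stops them.
* If wait_for_completion is set, the sync candidate queue is drained until the pending worker
* threads have finished (bounded by the retry counter).
*/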
void BuddyResyncJob::abort(bool wait_for_completion)
{
setState(BuddyResyncJobState_INTERRUPTED);
gatherSlave->selfTerminate();
// set onlyTerminateIfIdle to false on the bulk sync slaves - the main loop will then stop them.
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
{
BuddyResyncerBulkSyncSlave* slave = it->get();
slave->setOnlyTerminateIfIdle(false);
}
modSyncSlave->selfTerminate();
int retry = 600;
/* Wait until all ongoing thread events are fetched, or for at most 30 minutes.
* (fetch() waits for 3 seconds if there are no files to be fetched.)
*/
if (wait_for_completion)
{
modSyncSlave->join();
while (threadCount > 0 && retry)
{
LOG(MIRRORING, WARNING, "Wait for pending worker threads to finish");
if (!syncCandidates.isFilesEmpty())
{
MetaSyncCandidateFile candidate;
syncCandidates.fetch(candidate, this);
candidate.signal();
}
retry--;
}
if (threadCount)
LOG(MIRRORING, ERR, "Cleanup of aborted resync failed: I/O worker threads"
" did not finish properly: ",
("threadCount", threadCount.load()));
}
}
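/**
* Starts the gather slave thread.
* @returns false if the thread could not be started.
*/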
bool BuddyResyncJob::startGatherSlaves()
{
try
{
gatherSlave->resetSelfTerminate();
gatherSlave->start();
gatherSlave->setIsRunning(true);
}
catch (PThreadCreateException& e)
{
LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());
return false;
}
return true;
}
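/**
* Starts the bulk sync slave threads and the mod sync slave thread. If one of them fails to
* start, the slaves that were already started are told to terminate.
* @returns false if any thread could not be started.
*/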
bool BuddyResyncJob::startSyncSlaves()
{
App* app = Program::getApp();
const NumNodeID localNodeID = app->getLocalNodeNumID();
const NumNodeID buddyNodeID(
app->getMetaBuddyGroupMapper()->getBuddyTargetID(localNodeID.val(), NULL) );
for (size_t i = 0; i < bulkSyncSlaves.size(); i++)
{
try
{
bulkSyncSlaves[i]->resetSelfTerminate();
bulkSyncSlaves[i]->start();
bulkSyncSlaves[i]->setIsRunning(true);
}
catch (PThreadCreateException& e)
{
LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what() );
for (size_t j = 0; j < i; j++)
bulkSyncSlaves[j]->selfTerminate();
return false;
}
}
try
{
modSyncSlave->resetSelfTerminate();
modSyncSlave->start();
modSyncSlave->setIsRunning(true);
}
catch (PThreadCreateException& e)
{
LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what() );
for (size_t j = 0; j < bulkSyncSlaves.size(); j++)
bulkSyncSlaves[j]->selfTerminate();
return false;
}
return true;
}
void BuddyResyncJob::joinGatherSlaves()
{
gatherSlave->join();
}
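/**
* Aggregates the current statistics of the gather slave, the bulk sync slaves, the session store
* resyncer and the mod sync slave into a single statistics object.
*/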
MetaBuddyResyncJobStatistics BuddyResyncJob::getJobStats()
{
std::lock_guard<Mutex> lock(stateMutex);
BuddyResyncerGatherSlave::Stats gatherStats = gatherSlave->getStats();
const uint64_t dirsDiscovered = gatherStats.dirsDiscovered;
const uint64_t gatherErrors = gatherStats.errors;
uint64_t dirsSynced = 0;
uint64_t filesSynced = 0;
uint64_t dirErrors = 0;
uint64_t fileErrors = 0;
for (auto syncerIt = bulkSyncSlaves.begin(); syncerIt != bulkSyncSlaves.end(); ++syncerIt)
{
BuddyResyncerBulkSyncSlave::Stats bulkSyncStats = (*syncerIt)->getStats();
dirsSynced += bulkSyncStats.dirsSynced;
filesSynced += bulkSyncStats.filesSynced;
dirErrors += bulkSyncStats.dirErrors;
fileErrors += bulkSyncStats.fileErrors;
}
SessionStoreResyncer::Stats sessionSyncStats = sessionStoreResyncer->getStats();
const uint64_t sessionsToSync = sessionSyncStats.sessionsToSync;
const uint64_t sessionsSynced = sessionSyncStats.sessionsSynced;
const bool sessionSyncErrors = sessionSyncStats.errors;
BuddyResyncerModSyncSlave::Stats modSyncStats = modSyncSlave->getStats();
uint64_t modObjectsSynced = modSyncStats.objectsSynced;
uint64_t modSyncErrors = modSyncStats.errors;
return MetaBuddyResyncJobStatistics(
state, startTime, endTime,
dirsDiscovered, gatherErrors,
dirsSynced, filesSynced, dirErrors, fileErrors,
sessionsToSync, sessionsSynced, sessionSyncErrors,
modObjectsSynced, modSyncErrors);
}
/**
* Determine the state for the buddy after the end of a resync job.
* @returns the new state to be set on the buddy according to this job's JobState.
*/
TargetConsistencyState BuddyResyncJob::newBuddyState()
{
switch (getState())
{
case BuddyResyncJobState_ERRORS:
case BuddyResyncJobState_INTERRUPTED:
case BuddyResyncJobState_FAILURE:
return TargetConsistencyState_BAD;
case BuddyResyncJobState_SUCCESS:
return TargetConsistencyState_GOOD;
default:
LOG(MIRRORING, ERR, "Undefined resync state.", state);
return TargetConsistencyState_BAD;
}
}
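/**
* Informs the buddy metadata node about its new consistency state after the resync has finished.
*/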
void BuddyResyncJob::informBuddy(const TargetConsistencyState newTargetState)
{
App* app = Program::getApp();
NodeStore* metaNodes = app->getMetaNodes();
MirrorBuddyGroupMapper* buddyGroups = app->getMetaBuddyGroupMapper();
NumNodeID buddyNodeID =
NumNodeID(buddyGroups->getBuddyTargetID(app->getLocalNodeNumID().val()));
auto metaNode = metaNodes->referenceNode(buddyNodeID);
if (!metaNode)
{
LOG(MIRRORING, ERR, "Unable to inform buddy about finished resync", buddyNodeID.str());
return;
}
UInt16List nodeIDs(1, buddyNodeID.val());
UInt8List states(1, newTargetState);
SetTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDs, &states, false);
const auto respMsg = MessagingTk::requestResponse(*metaNode, msg,
NETMSGTYPE_SetTargetConsistencyStatesResp);
if (!respMsg)
{
LogContext(__func__).logErr(
"Unable to inform buddy about finished resync. "
"BuddyNodeID: " + buddyNodeID.str() + "; "
"error: Communication Error");
return;
}
{
auto* respMsgCast = static_cast<SetTargetConsistencyStatesRespMsg*>(respMsg.get());
FhgfsOpsErr result = respMsgCast->getResult();
if (result != FhgfsOpsErr_SUCCESS)
{
LogContext(__func__).logErr(
"Error while informing buddy about finished resync. "
"BuddyNodeID: " + buddyNodeID.str() + "; "
"error: " + boost::lexical_cast<std::string>(result) );
}
}
}
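/**
* Reports the buddy's new consistency state to the management node.
*/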
void BuddyResyncJob::informMgmtd(const TargetConsistencyState newTargetState)
{
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if (!mgmtNode)
{
LOG(MIRRORING, ERR, "Unable to communicate with management node.");
return;
}
UInt16List nodeIDs(1, buddyNodeID.val());
UInt8List states(1, newTargetState);
SetTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDs, &states, false);
const auto respMsg = MessagingTk::requestResponse(*mgmtNode, msg,
NETMSGTYPE_SetTargetConsistencyStatesResp);
if (!respMsg)
{
LOG(MIRRORING, ERR,
"Unable to inform management node about finished resync: Communication error.");
return;
}
{
auto* respMsgCast = static_cast<SetTargetConsistencyStatesRespMsg*>(respMsg.get());
FhgfsOpsErr result = respMsgCast->getResult();
if (result != FhgfsOpsErr_SUCCESS)
LOG(MIRRORING, ERR, "Error informing management node about finished resync.", result);
}
}