#include <common/net/message/storage/mirroring/StorageResyncStartedMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedRespMsg.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesMsg.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesRespMsg.h>
#include <common/threading/Barrier.h>
#include <common/toolkit/DebugVariable.h>
#include <common/toolkit/MessagingTk.h>
#include <common/toolkit/SynchronizedCounter.h>
#include <components/worker/BarrierWork.h>
#include <program/Program.h>
#include <toolkit/BuddyCommTk.h>

#include <boost/lexical_cast.hpp>
#include <boost/make_unique.hpp>

#include <signal.h>
#include <unistd.h>

#include "BuddyResyncJob.h"

BuddyResyncJob::BuddyResyncJob() :
   PThread("BuddyResyncJob"),
   state(BuddyResyncJobState_NOTSTARTED),
   startTime(0), endTime(0),
   gatherSlave(boost::make_unique<BuddyResyncerGatherSlave>(&syncCandidates))
{
   App* app = Program::getApp();
   Config* cfg = app->getConfig();

   buddyNodeID = NumNodeID(
      app->getMetaBuddyGroupMapper()->getBuddyTargetID(app->getLocalNodeNumID().val()));

   const unsigned numSyncSlaves = std::max<unsigned>(cfg->getTuneNumResyncSlaves(), 1);

   for (size_t i = 0; i < numSyncSlaves; i++)
      bulkSyncSlaves.emplace_back(
         boost::make_unique<BuddyResyncerBulkSyncSlave>(*this, &syncCandidates, i, buddyNodeID));

   sessionStoreResyncer = boost::make_unique<SessionStoreResyncer>(buddyNodeID);
   modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(*this, &syncCandidates, 1,
      buddyNodeID);
}

BuddyResyncJob::~BuddyResyncJob() = default;

/**
 * Main resync loop: notifies the buddy, runs the gather and bulk/mod sync slaves, resyncs the
 * session store while all workers are suspended, and finally reports the result to the buddy and
 * the management node.
 */
void BuddyResyncJob::run()
{
   const char* logContext = "Run resync job";

   InternodeSyncer* internodeSyncer = Program::getApp()->getInternodeSyncer();

   App* app = Program::getApp();
   WorkerList* workers = app->getWorkers();
   NodeStore* metaNodes = app->getMetaNodes();
   const std::string metaPath = app->getMetaPath();
   const std::string metaBuddyMirPath = app->getMetaPath() + "/" + CONFIG_BUDDYMIRROR_SUBDIR_NAME;

   Barrier workerBarrier(workers->size() + 1);
   bool workersStopped = false;

   startTime = time(NULL);
   syncCandidates.clear();

   auto buddyNode = metaNodes->referenceNode(buddyNodeID);
   if (!buddyNode)
   {
      LOG(MIRRORING, ERR, "Unable to resolve buddy node. Resync will not start.");
      setState(BuddyResyncJobState_FAILURE);
      goto cleanup;
   }

   DEBUG_ENV_VAR(unsigned, DIE_AT_RESYNC_N, 0, "BEEGFS_RESYNC_DIE_AT_N");
   if (DIE_AT_RESYNC_N)
   {
      static unsigned resyncs = 0;
      // for #479: terminating a server at this point caused the workers to terminate before the
      // resyncer had communicated with them, causing a deadlock on shutdown
      if (++resyncs == DIE_AT_RESYNC_N)
      {
         ::kill(0, SIGTERM);
         sleep(4);
      }
   }

   stopAllWorkersOn(workerBarrier);
   {
      // notify the buddy that a resync has started and wait for confirmation.
      StorageResyncStartedMsg msg(buddyNodeID.val());
      const auto respMsg = MessagingTk::requestResponse(*buddyNode, msg,
         NETMSGTYPE_StorageResyncStartedResp);

      if (!respMsg)
      {
         LogContext(logContext).logErr("Unable to notify buddy about resync attempt. "
            "Resync will not start.");
         setState(BuddyResyncJobState_FAILURE);
         workerBarrier.wait();
         goto cleanup;
      }

      // the resync could have been aborted before we got here. if so, exit as soon as possible
      // without setting the resync job state to something else.
      {
         std::unique_lock<Mutex> lock(stateMutex);

         if (state == BuddyResyncJobState_INTERRUPTED)
         {
            lock.unlock();
            workerBarrier.wait();
            goto cleanup;
         }

         state = BuddyResyncJobState_RUNNING;
      }

      internodeSyncer->setResyncInProgress(true);

      const bool startGatherSlaveRes = startGatherSlaves();
      if (!startGatherSlaveRes)
      {
         setState(BuddyResyncJobState_FAILURE);
         workerBarrier.wait();
         goto cleanup;
      }

      const bool startResyncSlaveRes = startSyncSlaves();
      if (!startResyncSlaveRes)
      {
         setState(BuddyResyncJobState_FAILURE);
         workerBarrier.wait();
         goto cleanup;
      }
   }

   workerBarrier.wait();

   LOG_DEBUG(__func__, Log_DEBUG, "Going to join gather slaves.");
   joinGatherSlaves();
   LOG_DEBUG(__func__, Log_DEBUG, "Joined gather slaves.");

   LOG_DEBUG(__func__, Log_DEBUG, "Going to join sync slaves.");
   // the gather slaves have finished. tell the sync slaves to stop once their work packages are
   // empty, then wait for them.
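   // the two-step shutdown below relies on the slaves' "only terminate if idle" mode: with the
   // gatherers joined, no new bulk work can arrive, so a slave may exit as soon as its queue
   // runs dry without dropping queued candidates. as a sketch of the assumed exit check in the
   // slaves' work loop (names are illustrative; the real loop lives in the slave classes):
   //
   //    if (getSelfTerminate() && (!onlyTerminateIfIdle || queueIsEmpty()))
   //       break;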
   for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
   {
      (*it)->setOnlyTerminateIfIdle(true);
      (*it)->selfTerminate();
   }

   for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
      (*it)->join();

   // here we can be in one of two situations:
   // 1. bulk resync has succeeded. we then totally stop the workers: the session store must be
   //    in a quiescent state for resync, so for simplicity, we suspend all client operations
   //    here. we do not want to do this any earlier than this point, because the bulk syncers
   //    may take a very long time to complete.
   // 2. bulk resync has failed. in this case, the bulk syncers have aborted the currently
   //    running job, and the mod syncer is either dead or in the process of dying. here we MUST
   //    NOT stop the workers, because they are very likely blocked on the mod sync queue already
   //    and will not unblock before the queue is cleared.
   if (getState() == BuddyResyncJobState_RUNNING)
   {
      stopAllWorkersOn(workerBarrier);
      workersStopped = true;
   }

   modSyncSlave->setOnlyTerminateIfIdle(true);
   modSyncSlave->selfTerminate();
   modSyncSlave->join();

   // the gatherers are done and the workers have been stopped, so we can safely resync the
   // session store now.
   LOG_DEBUG(__func__, Log_DEBUG, "Joined sync slaves.");

   // perform the session store resync.
   // the job may have been aborted or terminated by errors. in this case, do not resync the
   // session store; end the sync as quickly as possible.
   if (getState() == BuddyResyncJobState_RUNNING)
      sessionStoreResyncer->doSync();

   // the session store is now synced, and future actions can be forwarded safely. we do not
   // restart the workers here because the resync may still enter FAILED state, and we don't want
   // to forward to the secondary in this case.

cleanup:
   bool syncErrors = false;

   {
      std::lock_guard<Mutex> lock(gatherSlave->stateMutex);

      while (gatherSlave->isRunning)
         gatherSlave->isRunningChangeCond.wait(&gatherSlave->stateMutex);

      syncErrors |= gatherSlave->getStats().errors != 0;
   }

   for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
   {
      BuddyResyncerBulkSyncSlave* slave = it->get();

      std::lock_guard<Mutex> lock(slave->stateMutex);

      while (slave->isRunning)
         slave->isRunningChangeCond.wait(&slave->stateMutex);

      syncErrors |= slave->getStats().dirErrors != 0;
      syncErrors |= slave->getStats().fileErrors != 0;
   }

   syncErrors |= sessionStoreResyncer->getStats().errors;

   {
      std::lock_guard<Mutex> lock(modSyncSlave->stateMutex);

      while (modSyncSlave->isRunning)
         modSyncSlave->isRunningChangeCond.wait(&modSyncSlave->stateMutex);

      syncErrors |= modSyncSlave->getStats().errors != 0;
   }

   if (getState() == BuddyResyncJobState_RUNNING || getState() == BuddyResyncJobState_INTERRUPTED)
   {
      if (syncErrors)
         setState(BuddyResyncJobState_ERRORS);
      else if (getState() == BuddyResyncJobState_RUNNING)
         setState(BuddyResyncJobState_SUCCESS);

      // delete the timestamp override file if it exists.
      BuddyCommTk::setBuddyNeedsResync(metaPath, false);

      const TargetConsistencyState buddyState = newBuddyState();
      informBuddy(buddyState);
      informMgmtd(buddyState);

      const bool interrupted = getState() != BuddyResyncJobState_SUCCESS;
      LOG(MIRRORING, WARNING, "Resync finished.", interrupted, syncErrors);
   }

   internodeSyncer->setResyncInProgress(false);
   endTime = time(NULL);

   // restart all the worker threads
   if (workersStopped)
      workerBarrier.wait();
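   // how the suspend/resume around the session store resync works: workerBarrier was created
   // for workers->size() + 1 parties. stopAllWorkersOn() enqueues a BarrierWork for every worker
   // and then joins the barrier itself, so its return means all workers are parked. presumably
   // the workers then block on the same barrier a second time (an assumption about BarrierWork's
   // implementation), so they stay parked until this thread calls workerBarrier.wait() again, as
   // done just above, which releases them all at once:
   //
   //    stopAllWorkersOn(workerBarrier); // rendezvous: all workers parked
   //    ...                              // exclusive section (e.g. session store resync)
   //    workerBarrier.wait();            // second rendezvous: workers resume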
   // if the resync was aborted, the mod sync queue may still contain items. additionally,
   // workers may be waiting for a changeset slot, or they may have started executing after the
   // resync was aborted by the sync slaves, but before the resync was officially set to
   // "not running".
   // we cannot set the resync to "not running" in abort() because we have no upper bound for the
   // number of worker threads. even if we did set the resync to "not running" in abort() and
   // cleared the sync queues at the same time, there may still be an arbitrary number of threads
   // waiting for a changeset slot.
   // instead, we have to wait for each thread to "see" that the resync is over, and periodically
   // clear the sync queue to unblock those workers that are still waiting for slots.
   if (syncErrors)
   {
      SynchronizedCounter counter;

      for (auto it = workers->begin(); it != workers->end(); ++it)
      {
         auto& worker = **it;

         worker.getWorkQueue()->addPersonalWork(
            new IncSyncedCounterWork(&counter), worker.getPersonalWorkQueue());
      }

      while (!counter.timedWaitForCount(workers->size(), 100))
      {
         while (!syncCandidates.isFilesEmpty())
         {
            MetaSyncCandidateFile candidate;
            syncCandidates.fetch(candidate, this);
            candidate.signal();
         }
      }
   }
}

void BuddyResyncJob::stopAllWorkersOn(Barrier& barrier)
{
   WorkerList* workers = Program::getApp()->getWorkers();

   for (WorkerListIter workerIt = workers->begin(); workerIt != workers->end(); ++workerIt)
   {
      Worker* worker = *workerIt;
      PersonalWorkQueue* personalQ = worker->getPersonalWorkQueue();
      MultiWorkQueue* workQueue = worker->getWorkQueue();

      workQueue->addPersonalWork(new BarrierWork(&barrier), personalQ);
   }

   barrier.wait(); // wait until all workers are blocked.
}
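/**
 * Abort a running resync.
 *
 * Marks the job as INTERRUPTED, tells the gather and mod sync slaves to terminate, and clears
 * the bulk sync slaves' "only terminate if idle" flag so that the main loop in run() can stop
 * them even while work is still queued.
 *
 * @param wait_for_completion if true, block until the mod sync slave has been joined and all
 *    worker threads have drained their pending sync candidates (bounded by the retry loop,
 *    roughly 30 minutes).
 */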
void BuddyResyncJob::abort(bool wait_for_completion)
{
   setState(BuddyResyncJobState_INTERRUPTED);

   gatherSlave->selfTerminate();

   // set onlyTerminateIfIdle on the slaves to false - they will be stopped by the main loop then.
   for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
   {
      BuddyResyncerBulkSyncSlave* slave = it->get();
      slave->setOnlyTerminateIfIdle(false);
   }

   modSyncSlave->selfTerminate();

   int retry = 600; /* wait until all ongoing thread events are fetched, for 30 minutes at most
                     * (fetch waits for 3 secs if there are no files to be fetched). */

   if (wait_for_completion)
   {
      modSyncSlave->join();

      while (threadCount > 0 && retry)
      {
         LOG(MIRRORING, WARNING, "Waiting for pending worker threads to finish.");

         if (!syncCandidates.isFilesEmpty())
         {
            MetaSyncCandidateFile candidate;
            syncCandidates.fetch(candidate, this);
            candidate.signal();
         }

         retry--;
      }

      if (threadCount)
         LOG(MIRRORING, ERR, "Cleanup of aborted resync failed: I/O worker threads"
            " did not finish properly: ", ("threadCount", threadCount.load()));
   }
}

bool BuddyResyncJob::startGatherSlaves()
{
   try
   {
      gatherSlave->resetSelfTerminate();
      gatherSlave->start();
      gatherSlave->setIsRunning(true);
   }
   catch (PThreadCreateException& e)
   {
      LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());
      return false;
   }

   return true;
}

bool BuddyResyncJob::startSyncSlaves()
{
   App* app = Program::getApp();
   const NumNodeID localNodeID = app->getLocalNodeNumID();
   const NumNodeID buddyNodeID(
      app->getMetaBuddyGroupMapper()->getBuddyTargetID(localNodeID.val(), NULL));

   for (size_t i = 0; i < bulkSyncSlaves.size(); i++)
   {
      try
      {
         bulkSyncSlaves[i]->resetSelfTerminate();
         bulkSyncSlaves[i]->start();
         bulkSyncSlaves[i]->setIsRunning(true);
      }
      catch (PThreadCreateException& e)
      {
         LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());

         for (size_t j = 0; j < i; j++)
            bulkSyncSlaves[j]->selfTerminate();

         return false;
      }
   }

   try
   {
      modSyncSlave->resetSelfTerminate();
      modSyncSlave->start();
      modSyncSlave->setIsRunning(true);
   }
   catch (PThreadCreateException& e)
   {
      LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());

      for (size_t j = 0; j < bulkSyncSlaves.size(); j++)
         bulkSyncSlaves[j]->selfTerminate();

      return false;
   }

   return true;
}

void BuddyResyncJob::joinGatherSlaves()
{
   gatherSlave->join();
}

MetaBuddyResyncJobStatistics BuddyResyncJob::getJobStats()
{
   std::lock_guard<Mutex> lock(stateMutex);

   BuddyResyncerGatherSlave::Stats gatherStats = gatherSlave->getStats();
   const uint64_t dirsDiscovered = gatherStats.dirsDiscovered;
   const uint64_t gatherErrors = gatherStats.errors;

   uint64_t dirsSynced = 0;
   uint64_t filesSynced = 0;
   uint64_t dirErrors = 0;
   uint64_t fileErrors = 0;

   for (auto syncerIt = bulkSyncSlaves.begin(); syncerIt != bulkSyncSlaves.end(); ++syncerIt)
   {
      BuddyResyncerBulkSyncSlave::Stats bulkSyncStats = (*syncerIt)->getStats();

      dirsSynced += bulkSyncStats.dirsSynced;
      filesSynced += bulkSyncStats.filesSynced;
      dirErrors += bulkSyncStats.dirErrors;
      fileErrors += bulkSyncStats.fileErrors;
   }

   SessionStoreResyncer::Stats sessionSyncStats = sessionStoreResyncer->getStats();
   const uint64_t sessionsToSync = sessionSyncStats.sessionsToSync;
   const uint64_t sessionsSynced = sessionSyncStats.sessionsSynced;
   const bool sessionSyncErrors = sessionSyncStats.errors;

   BuddyResyncerModSyncSlave::Stats modSyncStats = modSyncSlave->getStats();
   const uint64_t modObjectsSynced = modSyncStats.objectsSynced;
   const uint64_t modSyncErrors = modSyncStats.errors;

   return MetaBuddyResyncJobStatistics(
      state, startTime, endTime,
      dirsDiscovered, gatherErrors,
      dirsSynced, filesSynced, dirErrors, fileErrors,
      sessionsToSync, sessionsSynced, sessionSyncErrors,
      modObjectsSynced, modSyncErrors);
}
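/*
 * Note on getJobStats(): each slave's getStats() returns its counters by value, so polling it
 * periodically for progress reporting is cheap (assuming the slaves synchronize their counters
 * internally). A hypothetical progress poller, not part of this file, might do:
 *
 *    MetaBuddyResyncJobStatistics stats = job->getJobStats();
 *    // e.g. compare the synced dir/file counts against dirsDiscovered for a progress estimate
 */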
/**
 * Determine the state for the buddy after the end of a resync job.
 * @returns the new state to be set on the buddy according to this job's JobState.
 */
TargetConsistencyState BuddyResyncJob::newBuddyState()
{
   switch (getState())
   {
      case BuddyResyncJobState_ERRORS:
      case BuddyResyncJobState_INTERRUPTED:
      case BuddyResyncJobState_FAILURE:
         return TargetConsistencyState_BAD;

      case BuddyResyncJobState_SUCCESS:
         return TargetConsistencyState_GOOD;

      default:
         LOG(MIRRORING, ERR, "Undefined resync state.", state);
         return TargetConsistencyState_BAD;
   }
}

void BuddyResyncJob::informBuddy(const TargetConsistencyState newTargetState)
{
   App* app = Program::getApp();
   NodeStore* metaNodes = app->getMetaNodes();
   MirrorBuddyGroupMapper* buddyGroups = app->getMetaBuddyGroupMapper();

   NumNodeID buddyNodeID =
      NumNodeID(buddyGroups->getBuddyTargetID(app->getLocalNodeNumID().val()));

   auto metaNode = metaNodes->referenceNode(buddyNodeID);
   if (!metaNode)
   {
      LOG(MIRRORING, ERR, "Unable to inform buddy about finished resync.", buddyNodeID.str());
      return;
   }

   UInt16List nodeIDs(1, buddyNodeID.val());
   UInt8List states(1, newTargetState);

   SetTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDs, &states, false);

   const auto respMsg = MessagingTk::requestResponse(*metaNode, msg,
      NETMSGTYPE_SetTargetConsistencyStatesResp);
   if (!respMsg)
   {
      LogContext(__func__).logErr(
         "Unable to inform buddy about finished resync. "
         "BuddyNodeID: " + buddyNodeID.str() + "; "
         "error: Communication Error");
      return;
   }

   {
      auto* respMsgCast = static_cast<SetTargetConsistencyStatesRespMsg*>(respMsg.get());
      FhgfsOpsErr result = respMsgCast->getResult();

      if (result != FhgfsOpsErr_SUCCESS)
      {
         LogContext(__func__).logErr(
            "Error while informing buddy about finished resync. "
            "BuddyNodeID: " + buddyNodeID.str() + "; "
            "error: " + boost::lexical_cast<std::string>(result));
      }
   }
}

void BuddyResyncJob::informMgmtd(const TargetConsistencyState newTargetState)
{
   App* app = Program::getApp();
   NodeStore* mgmtNodes = app->getMgmtNodes();

   auto mgmtNode = mgmtNodes->referenceFirstNode();
   if (!mgmtNode)
   {
      LOG(MIRRORING, ERR, "Unable to communicate with management node.");
      return;
   }

   UInt16List nodeIDs(1, buddyNodeID.val());
   UInt8List states(1, newTargetState);

   SetTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDs, &states, false);

   const auto respMsg = MessagingTk::requestResponse(*mgmtNode, msg,
      NETMSGTYPE_SetTargetConsistencyStatesResp);
   if (!respMsg)
   {
      LOG(MIRRORING, ERR,
         "Unable to inform management node about finished resync: Communication error.");
      return;
   }

   {
      auto* respMsgCast = static_cast<SetTargetConsistencyStatesRespMsg*>(respMsg.get());
      FhgfsOpsErr result = respMsgCast->getResult();

      if (result != FhgfsOpsErr_SUCCESS)
         LOG(MIRRORING, ERR, "Error informing management node about finished resync.", result);
   }
}
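/*
 * Lifecycle sketch (illustrative; the real driver is the owning resyncer component, and the
 * exact call sites are assumptions):
 *
 *    BuddyResyncJob* job = ...;   // owned by the app's buddy resyncer
 *    job->start();                // PThread::start() runs run() in a separate thread
 *    ...                          // poll job->getJobStats() for progress
 *    job->abort(true);            // on shutdown: interrupt and wait for cleanup
 *
 * On completion, run() maps the final job state to a TargetConsistencyState via newBuddyState()
 * and pushes it to both the buddy (informBuddy) and the management node (informMgmtd).
 */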