New upstream version 8.1.0
745
storage/source/components/buddyresyncer/BuddyResyncJob.cpp
Normal file
@@ -0,0 +1,745 @@
#include <program/Program.h>

#include <common/components/worker/IncSyncedCounterWork.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesMsg.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesRespMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedRespMsg.h>
#include <common/toolkit/StringTk.h>
#include "BuddyResyncJob.h"

#include <boost/lexical_cast.hpp>

#define BUDDYRESYNCJOB_MAXDIRWALKDEPTH 2

BuddyResyncJob::BuddyResyncJob(uint16_t targetID) :
   PThread("BuddyResyncJob_" + StringTk::uintToStr(targetID)),
   targetID(targetID),
   status(BuddyResyncJobState_NOTSTARTED),
   startTime(0), endTime(0)
{
   App* app = Program::getApp();
   unsigned numGatherSlaves = app->getConfig()->getTuneNumResyncGatherSlaves();
   unsigned numSyncSlavesTotal = app->getConfig()->getTuneNumResyncSlaves();
   unsigned numFileSyncSlaves = BEEGFS_MAX((numSyncSlavesTotal / 2), 1);
   unsigned numDirSyncSlaves = BEEGFS_MAX((numSyncSlavesTotal / 2), 1);

   // prepare slaves (vectors) and result vector
   gatherSlaveVec.resize(numGatherSlaves);
   fileSyncSlaveVec.resize(numFileSyncSlaves);
   dirSyncSlaveVec.resize(numDirSyncSlaves);
}

BuddyResyncJob::~BuddyResyncJob()
{
   for(BuddyResyncerGatherSlaveVecIter iter = gatherSlaveVec.begin(); iter != gatherSlaveVec.end();
      iter++)
   {
      BuddyResyncerGatherSlave* slave = *iter;
      SAFE_DELETE(slave);
   }

   for(BuddyResyncerFileSyncSlaveVecIter iter = fileSyncSlaveVec.begin();
      iter != fileSyncSlaveVec.end(); iter++)
   {
      BuddyResyncerFileSyncSlave* slave = *iter;
      SAFE_DELETE(slave);
   }

   for(BuddyResyncerDirSyncSlaveVecIter iter = dirSyncSlaveVec.begin();
      iter != dirSyncSlaveVec.end(); iter++)
   {
      BuddyResyncerDirSyncSlave* slave = *iter;
      SAFE_DELETE(slave);
   }
}

void BuddyResyncJob::run()
{
   // make sure only one job at a time can run!
   {
      std::lock_guard<Mutex> mutexLock(statusMutex);

      if (status == BuddyResyncJobState_RUNNING)
      {
         LogContext(__func__).logErr("Refusing to run the same BuddyResyncJob twice!");
         return;
      }
      else
      {
         status = BuddyResyncJobState_RUNNING;
         startTime = time(NULL);
         endTime = 0;
      }
   }

   App* app = Program::getApp();
   StorageTargets* storageTargets = app->getStorageTargets();
   MirrorBuddyGroupMapper* buddyGroupMapper = app->getMirrorBuddyGroupMapper();
   TargetMapper* targetMapper = app->getTargetMapper();
   NodeStoreServers* storageNodes = app->getStorageNodes();
   WorkerList* workerList = app->getWorkers();

   bool startGatherSlavesRes;
   bool startSyncSlavesRes;

   std::string targetPath;
   std::string chunksPath;

   bool buddyCommIsOverride = false; // treat errors during lastbuddycomm read as "0, no override"
   int64_t lastBuddyCommTimeSecs;
   int64_t lastBuddyCommSafetyThresholdSecs;
   bool checkTopLevelDirRes;
   bool walkRes;

   auto& target = *storageTargets->getTargets().at(targetID);

   shallAbort.setZero();
   targetWasOffline = false;

   // delete sync candidates and gather queue, just in case something is left from a previous run
   syncCandidates.clear();
   gatherSlavesWorkQueue.clear();

   target.setBuddyResyncInProgress(true);

   LogContext(__func__).log(Log_NOTICE,
      "Started resync of targetID " + StringTk::uintToStr(targetID));

   // before starting the threads, make sure every worker knows about the resync (i.e. the current
   // work package must be finished); for that we use a dummy package
   Mutex mutex;
   Condition counterIncrementedCond;

   SynchronizedCounter numReadyWorkers;
   size_t numWorkers = workerList->size();
   for (WorkerListIter iter = workerList->begin(); iter != workerList->end(); iter++)
   {
      Worker* worker = *iter;
      PersonalWorkQueue* personalQueue = worker->getPersonalWorkQueue();
      MultiWorkQueue* workQueue = worker->getWorkQueue();
      IncSyncedCounterWork* incCounterWork = new IncSyncedCounterWork(&numReadyWorkers);

      workQueue->addPersonalWork(incCounterWork, personalQueue);
   }

   numReadyWorkers.waitForCount(numWorkers);

   // notify buddy that resync started and wait for confirmation
   uint16_t buddyTargetID = buddyGroupMapper->getBuddyTargetID(targetID);
   NumNodeID buddyNodeID = targetMapper->getNodeID(buddyTargetID);
   auto buddyNode = storageNodes->referenceNode(buddyNodeID);
   StorageResyncStartedMsg storageResyncStartedMsg(buddyTargetID);
   const auto respMsg = MessagingTk::requestResponse(*buddyNode, storageResyncStartedMsg,
      NETMSGTYPE_StorageResyncStartedResp);

   std::pair<bool, std::chrono::system_clock::time_point> lastBuddyComm;

   if (!respMsg)
   {
      LOG(MIRRORING, ERR, "Unable to notify buddy about resync attempt. Resync will not start.",
         targetID, buddyTargetID);
      setStatus(BuddyResyncJobState_FAILURE);
      goto cleanup;
   }

   startGatherSlavesRes = startGatherSlaves(target);
   if (!startGatherSlavesRes)
   {
      setStatus(BuddyResyncJobState_FAILURE);
      goto cleanup;
   }

   startSyncSlavesRes = startSyncSlaves();
   if (!startSyncSlavesRes)
   {
      setStatus(BuddyResyncJobState_FAILURE);

      // terminate gather slaves
      for (size_t i = 0; i < gatherSlaveVec.size(); i++)
         gatherSlaveVec[i]->selfTerminate();

      goto cleanup;
   }

   numDirsDiscovered.setZero();
   numDirsMatched.setZero();

   // walk over the directories until we reach a certain level and then pass the directories to
   // the gather slaves to parallelize it
   targetPath = target.getPath().str();
   chunksPath = targetPath + "/" + CONFIG_BUDDYMIRROR_SUBDIR_NAME;

   lastBuddyComm = target.getLastBuddyComm();
   buddyCommIsOverride = lastBuddyComm.first;
   lastBuddyCommTimeSecs = std::chrono::system_clock::to_time_t(lastBuddyComm.second);

   lastBuddyCommSafetyThresholdSecs = app->getConfig()->getSysResyncSafetyThresholdMins()*60;
   if ( (lastBuddyCommSafetyThresholdSecs == 0) && (!buddyCommIsOverride) ) // ignore timestamp file
      lastBuddyCommTimeSecs = 0;
   else
   if (lastBuddyCommTimeSecs > lastBuddyCommSafetyThresholdSecs)
      lastBuddyCommTimeSecs -= lastBuddyCommSafetyThresholdSecs;

   checkTopLevelDirRes = checkTopLevelDir(chunksPath, lastBuddyCommTimeSecs);
   if (!checkTopLevelDirRes)
   {
      setStatus(BuddyResyncJobState_FAILURE);

      // terminate gather slaves
      for (size_t i = 0; i < gatherSlaveVec.size(); i++)
         gatherSlaveVec[i]->selfTerminate();

      // terminate sync slaves
      for (size_t i = 0; i < fileSyncSlaveVec.size(); i++)
         fileSyncSlaveVec[i]->selfTerminate();

      for (size_t i = 0; i < dirSyncSlaveVec.size(); i++)
         dirSyncSlaveVec[i]->selfTerminate();

      goto cleanup;
   }

   walkRes = walkDirs(chunksPath, "", 0, lastBuddyCommTimeSecs);
   if (!walkRes)
   {
      setStatus(BuddyResyncJobState_FAILURE);

      // terminate gather slaves
      for (size_t i = 0; i < gatherSlaveVec.size(); i++)
         gatherSlaveVec[i]->selfTerminate();

      // terminate sync slaves
      for (size_t i = 0; i < fileSyncSlaveVec.size(); i++)
         fileSyncSlaveVec[i]->selfTerminate();

      for (size_t i = 0; i < dirSyncSlaveVec.size(); i++)
         dirSyncSlaveVec[i]->selfTerminate();

      goto cleanup;
   }

   // all directories are read => tell the gather slaves to stop when the work queue is empty and
   // wait for all of them to stop
   for(size_t i = 0; i < gatherSlaveVec.size(); i++)
   {
      if (likely(shallAbort.read() == 0))
         gatherSlaveVec[i]->setOnlyTerminateIfIdle(true);
      else
         gatherSlaveVec[i]->setOnlyTerminateIfIdle(false);

      gatherSlaveVec[i]->selfTerminate();
   }

   joinGatherSlaves();

   // gather slaves have finished => tell sync slaves to stop when work packages are empty and wait
   for(size_t i = 0; i < fileSyncSlaveVec.size(); i++)
   {
      if (likely(shallAbort.read() == 0))
         fileSyncSlaveVec[i]->setOnlyTerminateIfIdle(true);
      else
         fileSyncSlaveVec[i]->setOnlyTerminateIfIdle(false);

      fileSyncSlaveVec[i]->selfTerminate();
   }

   for(size_t i = 0; i < dirSyncSlaveVec.size(); i++)
   {
      if (likely(shallAbort.read() == 0))
         dirSyncSlaveVec[i]->setOnlyTerminateIfIdle(true);
      else
         dirSyncSlaveVec[i]->setOnlyTerminateIfIdle(false);

      dirSyncSlaveVec[i]->selfTerminate();
   }

   joinSyncSlaves();

cleanup:
   // wait for gather slaves to stop
   for(BuddyResyncerGatherSlaveVecIter iter = gatherSlaveVec.begin();
      iter != gatherSlaveVec.end(); iter++)
   {
      BuddyResyncerGatherSlave* slave = *iter;
      if(slave)
      {
         std::lock_guard<Mutex> safeLock(slave->statusMutex);
         while (slave->isRunning)
            slave->isRunningChangeCond.wait(&(slave->statusMutex));
      }
   }

   bool syncErrors = false;

   // wait for sync slaves to stop and remember whether any errors occurred
   for(BuddyResyncerFileSyncSlaveVecIter iter = fileSyncSlaveVec.begin();
      iter != fileSyncSlaveVec.end(); iter++)
   {
      BuddyResyncerFileSyncSlave* slave = *iter;
      if(slave)
      {
         {
            std::lock_guard<Mutex> safeLock(slave->statusMutex);
            while (slave->isRunning)
               slave->isRunningChangeCond.wait(&(slave->statusMutex));
         }

         if (slave->getErrorCount() != 0)
            syncErrors = true;
      }
   }

   for(BuddyResyncerDirSyncSlaveVecIter iter = dirSyncSlaveVec.begin();
      iter != dirSyncSlaveVec.end(); iter++)
   {
      BuddyResyncerDirSyncSlave* slave = *iter;
      if(slave)
      {
         {
            std::lock_guard<Mutex> safeLock(slave->statusMutex);
            while (slave->isRunning)
               slave->isRunningChangeCond.wait(&(slave->statusMutex));
         }

         if (slave->getErrorCount() != 0)
            syncErrors = true;
      }
   }

   if (getStatus() == BuddyResyncJobState_RUNNING) // status not set to anything special
   { // (e.g. FAILURE)
      if (shallAbort.read() != 0) // job aborted?
      {
         setStatus(BuddyResyncJobState_INTERRUPTED);
         informBuddy();
      }
      else if (syncErrors || targetWasOffline.read()) // any sync errors or success?
      {
         // we must set the buddy BAD if it has been offline during any period of time during which
         // the resync was also running. we implicitly do this during resync proper, since resync
         // slaves abort with errors if the target is offline. if the target goes offline *after*
         // the last proper resync message has been sent and comes *back* before we try to inform
         // it, we will never detect that it has been offline at all. concurrently executing
         // messages (eg TruncFile) may run between our opportunities to detect the offline state
         // and may fail to forward their actions *even though they should forward*. this would
         // lead to an inconsistent secondary. since the target has gone offline, the only
         // reasonable course of action is to fail the resync entirely.
         setStatus(BuddyResyncJobState_ERRORS);
         informBuddy();
      }
      else
      {
         setStatus(BuddyResyncJobState_SUCCESS);
         // unset timestamp override file if an override was set
         target.setLastBuddyComm(std::chrono::system_clock::from_time_t(0), true);
         // if the target went offline after the previous check "syncErrors || targetWasOffline",
         // any message that tried to forward itself in the intervening time will have seen the
         // offline state, but will have been unable to set the buddy to needs-resync because it
         // still *is* needs-resync. the resync itself has been perfectly successful, but we have
         // to start another one anyway once the target comes back, to ensure that no information
         // was lost.
         target.setBuddyNeedsResync(targetWasOffline.read());
         informBuddy();

         if (targetWasOffline.read())
            LOG(MIRRORING, WARNING,
               "Resync successful, but target went offline during finalization. "
               "Setting target to needs-resync again.", targetID);
      }
   }

   target.setBuddyResyncInProgress(false);
   endTime = time(NULL);
}

void BuddyResyncJob::abort()
{
   shallAbort.set(1); // tell the file walk in this class to abort

   // set onlyTerminateIfIdle on the slaves to false; the main loop will then stop them
   for(BuddyResyncerGatherSlaveVecIter iter = gatherSlaveVec.begin(); iter != gatherSlaveVec.end();
      iter++)
   {
      BuddyResyncerGatherSlave* slave = *iter;
      if(slave)
      {
         slave->setOnlyTerminateIfIdle(false);
      }
   }

   // stop sync slaves
   for(BuddyResyncerFileSyncSlaveVecIter iter = fileSyncSlaveVec.begin();
      iter != fileSyncSlaveVec.end(); iter++)
   {
      BuddyResyncerFileSyncSlave* slave = *iter;
      if(slave)
      {
         slave->setOnlyTerminateIfIdle(false);
      }
   }

   for(BuddyResyncerDirSyncSlaveVecIter iter = dirSyncSlaveVec.begin();
      iter != dirSyncSlaveVec.end(); iter++)
   {
      BuddyResyncerDirSyncSlave* slave = *iter;
      if(slave)
      {
         slave->setOnlyTerminateIfIdle(false);
      }
   }
}

bool BuddyResyncJob::startGatherSlaves(const StorageTarget& target)
{
   // create gather slaves if they don't exist yet and start them
   for (size_t i = 0; i < gatherSlaveVec.size(); i++)
   {
      if(!gatherSlaveVec[i])
         gatherSlaveVec[i] = new BuddyResyncerGatherSlave(target, &syncCandidates,
            &gatherSlavesWorkQueue, i);

      try
      {
         gatherSlaveVec[i]->resetSelfTerminate();
         gatherSlaveVec[i]->start();
         gatherSlaveVec[i]->setIsRunning(true);
      }
      catch (PThreadCreateException& e)
      {
         LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());

         return false;
      }
   }

   return true;
}

bool BuddyResyncJob::startSyncSlaves()
{
   // create sync slaves and start them
   for(size_t i = 0; i < fileSyncSlaveVec.size(); i++)
   {
      if(!fileSyncSlaveVec[i])
         fileSyncSlaveVec[i] = new BuddyResyncerFileSyncSlave(targetID, &syncCandidates, i);

      try
      {
         fileSyncSlaveVec[i]->resetSelfTerminate();
         fileSyncSlaveVec[i]->start();
         fileSyncSlaveVec[i]->setIsRunning(true);
      }
      catch (PThreadCreateException& e)
      {
         LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());

         // stop the already started sync slaves
         for(size_t j = 0; j < i; j++)
            fileSyncSlaveVec[j]->selfTerminate();

         return false;
      }
   }

   for(size_t i = 0; i < dirSyncSlaveVec.size(); i++)
   {
      if(!dirSyncSlaveVec[i])
         dirSyncSlaveVec[i] = new BuddyResyncerDirSyncSlave(targetID, &syncCandidates, i);

      try
      {
         dirSyncSlaveVec[i]->resetSelfTerminate();
         dirSyncSlaveVec[i]->start();
         dirSyncSlaveVec[i]->setIsRunning(true);
      }
      catch (PThreadCreateException& e)
      {
         LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());

         // stop the already started sync slaves
         for (size_t j = 0; j < fileSyncSlaveVec.size(); j++)
            fileSyncSlaveVec[j]->selfTerminate();

         for (size_t j = 0; j < i; j++)
            dirSyncSlaveVec[j]->selfTerminate();

         return false;
      }
   }

   return true;
}

void BuddyResyncJob::joinGatherSlaves()
{
   for (size_t i = 0; i < gatherSlaveVec.size(); i++)
      gatherSlaveVec[i]->join();
}

void BuddyResyncJob::joinSyncSlaves()
{
   for (size_t i = 0; i < fileSyncSlaveVec.size(); i++)
      fileSyncSlaveVec[i]->join();

   for (size_t i = 0; i < dirSyncSlaveVec.size(); i++)
      dirSyncSlaveVec[i]->join();
}

void BuddyResyncJob::getJobStats(StorageBuddyResyncJobStatistics& outStats)
{
   uint64_t discoveredFiles = 0;
   uint64_t matchedFiles = 0;
   uint64_t discoveredDirs = numDirsDiscovered.read();
   uint64_t matchedDirs = numDirsMatched.read();
   uint64_t syncedFiles = 0;
   uint64_t syncedDirs = 0;
   uint64_t errorFiles = 0;
   uint64_t errorDirs = 0;

   for(size_t i = 0; i < gatherSlaveVec.size(); i++)
   {
      BuddyResyncerGatherSlave* slave = gatherSlaveVec[i];
      if(slave)
      {
         uint64_t tmpDiscoveredFiles = 0;
         uint64_t tmpMatchedFiles = 0;
         uint64_t tmpDiscoveredDirs = 0;
         uint64_t tmpMatchedDirs = 0;
         slave->getCounters(tmpDiscoveredFiles, tmpMatchedFiles, tmpDiscoveredDirs, tmpMatchedDirs);

         discoveredFiles += tmpDiscoveredFiles;
         matchedFiles += tmpMatchedFiles;
         discoveredDirs += tmpDiscoveredDirs;
         matchedDirs += tmpMatchedDirs;
      }
   }

   for(size_t i = 0; i < fileSyncSlaveVec.size(); i++)
   {
      BuddyResyncerFileSyncSlave* slave = fileSyncSlaveVec[i];
      if(slave)
      {
         syncedFiles += slave->getNumChunksSynced();
         errorFiles += slave->getErrorCount();
      }
   }

   for (size_t i = 0; i < dirSyncSlaveVec.size(); i++)
   {
      BuddyResyncerDirSyncSlave* slave = dirSyncSlaveVec[i];
      if (slave)
      {
         syncedDirs += slave->getNumDirsSynced();
         discoveredDirs += slave->getNumAdditionalDirsMatched();
         matchedDirs += slave->getNumAdditionalDirsMatched();
         errorDirs += slave->getErrorCount();
      }
   }

   outStats = StorageBuddyResyncJobStatistics(status, startTime, endTime, discoveredFiles,
      discoveredDirs, matchedFiles, matchedDirs, syncedFiles, syncedDirs, errorFiles, errorDirs);
}

void BuddyResyncJob::informBuddy()
{
   App* app = Program::getApp();
   NodeStore* storageNodes = app->getStorageNodes();
   MirrorBuddyGroupMapper* buddyGroupMapper = app->getMirrorBuddyGroupMapper();
   TargetMapper* targetMapper = app->getTargetMapper();

   BuddyResyncJobState status = getStatus();
   TargetConsistencyState newTargetState;
   if ( (status == BuddyResyncJobState_ERRORS) || (status == BuddyResyncJobState_INTERRUPTED))
      newTargetState = TargetConsistencyState_BAD;
   else
   if (status == BuddyResyncJobState_SUCCESS)
      newTargetState = TargetConsistencyState_GOOD;
   else
   {
      LogContext(__func__).log(Log_NOTICE, "Refusing to set a state for the buddy target, because "
         "the resync status isn't well-defined. "
         "localTargetID: " + StringTk::uintToStr(targetID) + "; "
         "resyncState: " + StringTk::intToStr(status));
      return;
   }

   uint16_t buddyTargetID = buddyGroupMapper->getBuddyTargetID(targetID);
   NumNodeID buddyNodeID = targetMapper->getNodeID(buddyTargetID);
   auto storageNode = storageNodes->referenceNode(buddyNodeID);

   if (!storageNode)
   {
      LogContext(__func__).logErr(
         "Unable to inform buddy about finished resync. TargetID: " + StringTk::uintToStr(targetID)
         + "; buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; buddyNodeID: "
         + buddyNodeID.str() + "; error: unknown storage node");
      return;
   }

   SetTargetConsistencyStatesRespMsg* respMsgCast;
   FhgfsOpsErr result;
   UInt16List targetIDs;
   UInt8List states;

   targetIDs.push_back(buddyTargetID);
   states.push_back(newTargetState);

   SetTargetConsistencyStatesMsg msg(NODETYPE_Storage, &targetIDs, &states, false);

   const auto respMsg = MessagingTk::requestResponse(*storageNode, msg,
      NETMSGTYPE_SetTargetConsistencyStatesResp);
   if (!respMsg)
   {
      LogContext(__func__).logErr(
         "Unable to inform buddy about finished resync. "
         "targetID: " + StringTk::uintToStr(targetID) + "; "
         "buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
         "buddyNodeID: " + buddyNodeID.str() + "; "
         "error: Communication error");
      return;
   }

   respMsgCast = (SetTargetConsistencyStatesRespMsg*) respMsg.get();
   result = respMsgCast->getResult();

   if(result != FhgfsOpsErr_SUCCESS)
   {
      LogContext(__func__).logErr(
         "Error while informing buddy about finished resync. "
         "targetID: " + StringTk::uintToStr(targetID) + "; "
         "buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
         "buddyNodeID: " + buddyNodeID.str() + "; "
         "error: " + boost::lexical_cast<std::string>(result));
   }
}

/*
 * check the CONFIG_BUDDYMIRROR_SUBDIR_NAME directory
 */
bool BuddyResyncJob::checkTopLevelDir(std::string& path, int64_t lastBuddyCommTimeSecs)
{
   struct stat statBuf;
   int statRes = stat(path.c_str(), &statBuf);

   if(statRes != 0)
   {
      LogContext(__func__).log(Log_WARNING,
         "Couldn't stat chunks directory; resync job can't run. targetID: "
         + StringTk::uintToStr(targetID) + "; path: " + path
         + "; Error: " + System::getErrString(errno));

      return false;
   }

   numDirsDiscovered.increase();
   int64_t dirMTime = (int64_t) statBuf.st_mtim.tv_sec;
   if(dirMTime > lastBuddyCommTimeSecs)
   { // sync candidate
      ChunkSyncCandidateDir candidate("", targetID);
      syncCandidates.add(candidate, this);
      numDirsMatched.increase();
   }

   return true;
}

/*
 * recursively walk through the buddy mirror directory until a depth of
 * BUDDYRESYNCJOB_MAXDIRWALKDEPTH is reached; everything with a greater depth gets passed to the
 * gather slaves to work on it in parallel
 */
bool BuddyResyncJob::walkDirs(std::string chunksPath, std::string relPath, int level,
   int64_t lastBuddyCommTimeSecs)
{
   bool retVal = true;

   DIR* dirHandle;
   struct dirent* dirEntry;

   dirHandle = opendir(std::string(chunksPath + "/" + relPath).c_str());

   if(!dirHandle)
   {
      LogContext(__func__).logErr("Unable to open path. "
         "targetID: " + StringTk::uintToStr(targetID) + "; "
         "Rel. path: " + relPath + "; "
         "Error: " + System::getErrString(errno) );
      return false;
   }

   while ((dirEntry = StorageTk::readdirFiltered(dirHandle)) != NULL)
   {
      if(shallAbort.read() != 0)
         break;

      // get stat info
      std::string currentRelPath;
      if(unlikely(relPath.empty()))
         currentRelPath = dirEntry->d_name;
      else
         currentRelPath = relPath + "/" + dirEntry->d_name;

      std::string currentFullPath = chunksPath + "/" + currentRelPath;
      struct stat statBuf;
      int statRes = stat(currentFullPath.c_str(), &statBuf);

      if(statRes != 0)
      {
         LogContext(__func__).log(Log_WARNING,
            "Couldn't stat directory, which was discovered previously. Resync job might not be "
            "complete. targetID " + StringTk::uintToStr(targetID) + "; "
            "Rel. path: " + relPath + "; "
            "Error: " + System::getErrString(errno));

         retVal = false;

         break; // => one error aborts it all
      }

      if(S_ISDIR(statBuf.st_mode))
      {
         // if the level of the dir is smaller than the max, take care of it and recurse into it
         if(level < BUDDYRESYNCJOB_MAXDIRWALKDEPTH)
         {
            numDirsDiscovered.increase();
            int64_t dirMTime = (int64_t) statBuf.st_mtim.tv_sec;
            if(dirMTime > lastBuddyCommTimeSecs)
            { // sync candidate
               ChunkSyncCandidateDir candidate(currentRelPath, targetID);
               syncCandidates.add(candidate, this);
               numDirsMatched.increase();
            }

            bool walkRes = walkDirs(chunksPath, currentRelPath, level+1, lastBuddyCommTimeSecs);

            if (!walkRes)
               retVal = false;
         }
         else
            // otherwise pass it to the slaves; NOTE: gather slave takes the full path
            gatherSlavesWorkQueue.add(currentFullPath, this);
      }
      else
      {
         LOG_DEBUG(__func__, Log_WARNING, "Found a file in the directory structure");
      }
   }

   if(!dirEntry && errno) // error occurred
   {
      LogContext(__func__).logErr(
         "Unable to read all directories; chunksPath: " + chunksPath + "; relativePath: " + relPath
         + "; SysErr: " + System::getErrString(errno));

      retVal = false;
   }

   int closedirRes = closedir(dirHandle);
   if (closedirRes != 0)
      LOG_DEBUG(__func__, Log_WARNING,
         "Unable to close path. targetID " + StringTk::uintToStr(targetID) + "; Rel. path: "
         + relPath + "; Error: " + System::getErrString(errno));

   return retVal;
}
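run() starts by draining the worker queues: one dummy IncSyncedCounterWork package is queued per worker, and the job blocks on a SynchronizedCounter until every worker has processed it, which guarantees no worker is still executing a pre-resync work package when the resync threads start. A minimal, self-contained sketch of that barrier pattern follows; the SyncedCounter class and the thread-per-worker loop are simplified stand-ins, not the BeeGFS classes.

   #include <condition_variable>
   #include <cstdio>
   #include <mutex>
   #include <thread>
   #include <vector>

   // Stand-in for SynchronizedCounter: waitForCount() blocks until increase()
   // has been called at least the requested number of times.
   class SyncedCounter
   {
      std::mutex m;
      std::condition_variable cond;
      size_t count = 0;

   public:
      void increase()
      {
         { std::lock_guard<std::mutex> lock(m); ++count; }
         cond.notify_all();
      }

      void waitForCount(size_t target)
      {
         std::unique_lock<std::mutex> lock(m);
         cond.wait(lock, [&]{ return count >= target; });
      }
   };

   int main()
   {
      const size_t numWorkers = 4;
      SyncedCounter numReadyWorkers;
      std::vector<std::thread> workers;

      // each worker runs one "dummy work package" that just bumps the counter,
      // mirroring IncSyncedCounterWork being added to each personal queue
      for (size_t i = 0; i < numWorkers; i++)
         workers.emplace_back([&]{ numReadyWorkers.increase(); });

      numReadyWorkers.waitForCount(numWorkers); // the barrier in run()
      std::puts("all workers passed the barrier; resync threads may start");

      for (auto& t : workers)
         t.join();
   }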
90
storage/source/components/buddyresyncer/BuddyResyncJob.h
Normal file
@@ -0,0 +1,90 @@
#pragma once

#include <common/storage/mirroring/BuddyResyncJobStatistics.h>
#include <components/buddyresyncer/BuddyResyncerDirSyncSlave.h>
#include <components/buddyresyncer/BuddyResyncerFileSyncSlave.h>
#include <components/buddyresyncer/BuddyResyncerGatherSlave.h>

#define GATHERSLAVEQUEUE_MAXSIZE 5000

class BuddyResyncJob : public PThread
{
   friend class GenericDebugMsgEx;

   public:
      BuddyResyncJob(uint16_t targetID);
      virtual ~BuddyResyncJob();

      virtual void run();

      void abort();
      void getJobStats(StorageBuddyResyncJobStatistics& outStats);

   private:
      uint16_t targetID;
      Mutex statusMutex;
      BuddyResyncJobState status;

      int64_t startTime;
      int64_t endTime;

      ChunkSyncCandidateStore syncCandidates;
      BuddyResyncerGatherSlaveWorkQueue gatherSlavesWorkQueue;

      BuddyResyncerGatherSlaveVec gatherSlaveVec;
      BuddyResyncerFileSyncSlaveVec fileSyncSlaveVec;
      BuddyResyncerDirSyncSlaveVec dirSyncSlaveVec;

      // this thread walks over the top-level dir structures itself, so we need to track that
      AtomicUInt64 numDirsDiscovered;
      AtomicUInt64 numDirsMatched;

      AtomicInt16 shallAbort; // quasi-boolean
      AtomicInt16 targetWasOffline;

      bool checkTopLevelDir(std::string& path, int64_t lastBuddyCommTimeSecs);
      bool walkDirs(std::string chunksPath, std::string relPath, int level,
         int64_t lastBuddyCommTimeSecs);

      bool startGatherSlaves(const StorageTarget& target);
      bool startSyncSlaves();
      void joinGatherSlaves();
      void joinSyncSlaves();

   public:
      uint16_t getTargetID() const
      {
         return targetID;
      }

      BuddyResyncJobState getStatus()
      {
         std::lock_guard<Mutex> mutexLock(statusMutex);
         return status;
      }

      bool isRunning()
      {
         std::lock_guard<Mutex> mutexLock(statusMutex);
         return status == BuddyResyncJobState_RUNNING;
      }

      void setTargetOffline()
      {
         targetWasOffline.set(1);
      }

   private:
      void setStatus(BuddyResyncJobState status)
      {
         std::lock_guard<Mutex> mutexLock(statusMutex);
         this->status = status;
      }

      void informBuddy();
};

typedef std::map<uint16_t, BuddyResyncJob*> BuddyResyncJobMap; // mapping: targetID -> job
typedef BuddyResyncJobMap::iterator BuddyResyncJobMapIter;
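shallAbort and targetWasOffline are atomic quasi-booleans: abort() and setTargetOffline() set them from other threads, and the walk and finalization code polls them with read(). A small stand-alone sketch of the same cooperative-abort idea, using std::atomic rather than the BeeGFS Atomic types (the names merely mirror the members above):

   #include <atomic>
   #include <chrono>
   #include <cstdio>
   #include <thread>

   int main()
   {
      std::atomic<int> shallAbort{0}; // quasi-boolean, like AtomicInt16

      // stand-in for the directory walk: polls the flag between units of work
      std::thread walker([&]{
         for (int dir = 0; dir < 1000; dir++)
         {
            if (shallAbort.load() != 0) // same check as shallAbort.read() != 0
            {
               std::printf("walk interrupted after %d dirs\n", dir);
               return;
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
         }
      });

      std::this_thread::sleep_for(std::chrono::milliseconds(20));
      shallAbort.store(1); // what abort() does with shallAbort.set(1)
      walker.join();
   }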
40
storage/source/components/buddyresyncer/BuddyResyncer.cpp
Normal file
@@ -0,0 +1,40 @@
#include <program/Program.h>

#include "BuddyResyncer.h"

BuddyResyncer::~BuddyResyncer()
{
   // delete remaining jobs
   for (BuddyResyncJobMapIter iter = resyncJobMap.begin(); iter != resyncJobMap.end(); iter++)
   {
      BuddyResyncJob* job = iter->second;
      if( job->isRunning() )
      {
         job->abort();
         job->join();
      }

      SAFE_DELETE(job);
   }
}

/**
 * @return FhgfsOpsErr_SUCCESS if everything was successfully started, FhgfsOpsErr_INUSE if a job
 * is already running
 */
FhgfsOpsErr BuddyResyncer::startResync(uint16_t targetID)
{
   bool isNewJob;

   // try to add a resync job; if one already exists for this target, we get the existing one
   BuddyResyncJob* resyncJob = addResyncJob(targetID, isNewJob);

   // job already exists *and* is already running:
   if (!isNewJob && resyncJob->isRunning() )
      return FhgfsOpsErr_INUSE;

   // job is ready and not running
   resyncJob->start();

   return FhgfsOpsErr_SUCCESS;
}
59
storage/source/components/buddyresyncer/BuddyResyncer.h
Normal file
@@ -0,0 +1,59 @@
#pragma once

#include <components/buddyresyncer/BuddyResyncJob.h>

#include <mutex>

/**
 * This is not a component that represents a separate thread by itself. Instead, it is the
 * controlling frontend for slave threads, which are started and stopped on request (i.e. it is not
 * automatically started when the app is started).
 *
 * Callers should only use methods in this controlling frontend and not access the slaves' methods
 * directly.
 */
class BuddyResyncer
{
   public:
      ~BuddyResyncer();

      FhgfsOpsErr startResync(uint16_t targetID);

   private:
      BuddyResyncJobMap resyncJobMap;
      Mutex resyncJobMapMutex;

   public:
      BuddyResyncJob* getResyncJob(uint16_t targetID)
      {
         std::lock_guard<Mutex> mutexLock(resyncJobMapMutex);

         BuddyResyncJobMapIter iter = resyncJobMap.find(targetID);
         if (iter != resyncJobMap.end())
            return iter->second;
         else
            return NULL;
      }

   private:
      BuddyResyncJob* addResyncJob(uint16_t targetID, bool& outIsNew)
      {
         std::lock_guard<Mutex> mutexLock(resyncJobMapMutex);

         BuddyResyncJobMapIter iter = resyncJobMap.find(targetID);
         if (iter != resyncJobMap.end())
         {
            outIsNew = false;
            return iter->second;
         }
         else
         {
            BuddyResyncJob* job = new BuddyResyncJob(targetID);
            resyncJobMap.insert(BuddyResyncJobMap::value_type(targetID, job) );
            outIsNew = true;
            return job;
         }
      }
};
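addResyncJob() implements a mutex-guarded get-or-create on the job map, and startResync() turns that into at-most-one running job per target. A condensed, runnable sketch of the same pattern with standard-library types follows; Job, StartResult, and the bookkeeping are simplified stand-ins, not the BeeGFS classes.

   #include <cstdio>
   #include <map>
   #include <mutex>

   enum class StartResult { Success, InUse };

   struct Job { bool running = false; };

   class Resyncer
   {
      std::map<uint16_t, Job> jobs;
      std::mutex jobsMutex;

      Job& addJob(uint16_t targetID, bool& outIsNew)
      {
         std::lock_guard<std::mutex> lock(jobsMutex);

         auto result = jobs.try_emplace(targetID); // get-or-create, like addResyncJob()
         outIsNew = result.second;
         return result.first->second;
      }

   public:
      StartResult startResync(uint16_t targetID)
      {
         bool isNew;
         Job& job = addJob(targetID, isNew);

         if (!isNew && job.running) // job exists *and* is already running
            return StartResult::InUse;

         job.running = true; // stands in for resyncJob->start()
         return StartResult::Success;
      }
   };

   int main()
   {
      Resyncer resyncer;
      std::printf("first start: %s\n",
         resyncer.startResync(1) == StartResult::Success ? "success" : "in use");
      std::printf("second start: %s\n",
         resyncer.startResync(1) == StartResult::Success ? "success" : "in use");
   }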
395
storage/source/components/buddyresyncer/BuddyResyncerDirSyncSlave.cpp
Normal file
@@ -0,0 +1,395 @@
#include <app/App.h>
#include <common/net/message/storage/creating/RmChunkPathsMsg.h>
#include <common/net/message/storage/creating/RmChunkPathsRespMsg.h>
#include <common/net/message/storage/listing/ListChunkDirIncrementalMsg.h>
#include <common/net/message/storage/listing/ListChunkDirIncrementalRespMsg.h>
#include <toolkit/StorageTkEx.h>
#include <program/Program.h>

#include "BuddyResyncerDirSyncSlave.h"

#include <boost/lexical_cast.hpp>

#define CHECK_AT_ONCE 50

BuddyResyncerDirSyncSlave::BuddyResyncerDirSyncSlave(uint16_t targetID,
   ChunkSyncCandidateStore* syncCandidates, uint8_t slaveID) :
   PThread("BuddyResyncerDirSyncSlave_" + StringTk::uintToStr(targetID) + "-"
      + StringTk::uintToStr(slaveID))
{
   this->isRunning = false;
   this->targetID = targetID;
   this->syncCandidates = syncCandidates;
}

BuddyResyncerDirSyncSlave::~BuddyResyncerDirSyncSlave()
{
}

/**
 * This is a component which is started through its control frontend on-demand at runtime and
 * terminates when it's done.
 * We have to ensure (in cooperation with the control frontend) that we don't get multiple
 * instances of this thread running at the same time.
 */
void BuddyResyncerDirSyncSlave::run()
{
   setIsRunning(true);

   try
   {
      LogContext(__func__).log(Log_DEBUG, "Component started.");

      registerSignalHandler();

      numAdditionalDirsMatched.setZero();
      numDirsSynced.setZero();
      errorCount.setZero();

      syncLoop();

      LogContext(__func__).log(Log_DEBUG, "Component stopped.");
   }
   catch (std::exception& e)
   {
      PThread::getCurrentThreadApp()->handleComponentException(e);
   }

   setIsRunning(false);
}

void BuddyResyncerDirSyncSlave::syncLoop()
{
   App* app = Program::getApp();
   MirrorBuddyGroupMapper* buddyGroupMapper = app->getMirrorBuddyGroupMapper();

   while (! getSelfTerminateNotIdle())
   {
      if((syncCandidates->isDirsEmpty()) && (getSelfTerminate()))
         break;

      ChunkSyncCandidateDir candidate;

      syncCandidates->fetch(candidate, this);

      if (unlikely(candidate.getTargetID() == 0)) // ignore targetID 0
         continue;

      std::string relativePath = candidate.getRelativePath();
      uint16_t localTargetID = candidate.getTargetID();

      // get buddy targetID
      uint16_t buddyTargetID = buddyGroupMapper->getBuddyTargetID(localTargetID);
      // perform sync
      FhgfsOpsErr resyncRes = doSync(relativePath, localTargetID, buddyTargetID);
      if (resyncRes == FhgfsOpsErr_SUCCESS)
         numDirsSynced.increase();
      else if (resyncRes != FhgfsOpsErr_INTERRUPTED)
         errorCount.increase(); // increment error count if an error occurred; note: if the slaves
                                // were interrupted from the outside (e.g. ctl) this is not an error
   }
}

FhgfsOpsErr BuddyResyncerDirSyncSlave::doSync(const std::string& dirPath, uint16_t localTargetID,
   uint16_t buddyTargetID)
{
   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;

   App* app = Program::getApp();
   TargetMapper* targetMapper = app->getTargetMapper();
   NodeStoreServers* storageNodes = app->getStorageNodes();

   // try to find the node with the buddyTargetID
   NumNodeID buddyNodeID = targetMapper->getNodeID(buddyTargetID);
   auto node = storageNodes->referenceNode(buddyNodeID);

   if(!node)
   {
      LogContext(__func__).logErr(
         "Storage node does not exist; nodeID " + buddyNodeID.str());

      return FhgfsOpsErr_UNKNOWNNODE;
   }

   int64_t offset = 0;
   unsigned entriesFetched;

   do
   {
      int64_t newOffset;
      StringList names;
      IntList entryTypes;

      FhgfsOpsErr listRes = getBuddyDirContents(*node, dirPath, buddyTargetID, offset, names,
         entryTypes, newOffset);

      if(listRes != FhgfsOpsErr_SUCCESS)
      {
         retVal = listRes;
         break;
      }

      offset = newOffset;
      entriesFetched = names.size();

      // match locally
      FhgfsOpsErr findRes = findChunks(localTargetID, dirPath, names, entryTypes);

      if(findRes != FhgfsOpsErr_SUCCESS)
      {
         retVal = findRes;
         break;
      }

      // delete the remaining chunks/dirs on the buddy
      StringList rmPaths;
      for (StringListIter iter = names.begin(); iter != names.end(); iter++)
      {
         std::string path = dirPath + "/" + *iter;
         rmPaths.push_back(path);
      }

      FhgfsOpsErr rmRes = removeBuddyChunkPaths(*node, localTargetID, buddyTargetID, rmPaths);

      if (rmRes != FhgfsOpsErr_SUCCESS)
      {
         retVal = rmRes;
         break;
      }

      if (getSelfTerminateNotIdle())
      {
         retVal = FhgfsOpsErr_INTERRUPTED;
         break;
      }

   } while (entriesFetched == CHECK_AT_ONCE);

   return retVal;
}

FhgfsOpsErr BuddyResyncerDirSyncSlave::getBuddyDirContents(Node& node, const std::string& dirPath,
   uint16_t targetID, int64_t offset, StringList& outNames, IntList& outEntryTypes,
   int64_t& outNewOffset)
{
   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;
   unsigned msgRetryIntervalMS = 5000;

   // get a part of the dir contents from the buddy target
   ListChunkDirIncrementalMsg listMsg(targetID, true, dirPath, offset, CHECK_AT_ONCE, false, true);
   listMsg.setMsgHeaderTargetID(targetID);

   CombinedTargetState state;
   bool getStateRes = Program::getApp()->getTargetStateStore()->getState(targetID, state);

   // send request to node and receive response
   std::unique_ptr<NetMessage> respMsg;

   while ( (!respMsg) && (getStateRes)
      && (state.reachabilityState != TargetReachabilityState_OFFLINE) )
   {
      respMsg = MessagingTk::requestResponse(node, listMsg, NETMSGTYPE_ListChunkDirIncrementalResp);

      if (!respMsg)
      {
         LOG_DEBUG(__func__, Log_NOTICE,
            "Unable to communicate, but target is not offline; sleeping "
            + StringTk::uintToStr(msgRetryIntervalMS) + "ms before retry. targetID: "
            + StringTk::uintToStr(targetID));

         PThread::sleepMS(msgRetryIntervalMS);

         // if thread shall terminate, break loop here
         if ( getSelfTerminateNotIdle() )
            break;

         getStateRes = Program::getApp()->getTargetStateStore()->getState(targetID, state);
      }
   }

   if (!respMsg)
   { // communication error
      LogContext(__func__).logErr(
         "Communication with storage node failed: " + node.getTypedNodeID());

      retVal = FhgfsOpsErr_COMMUNICATION;
   }
   else
   if(!getStateRes)
   {
      LogContext(__func__).logErr("No valid state for node ID: " + node.getTypedNodeID() );

      retVal = FhgfsOpsErr_INTERNAL;
   }
   else
   {
      // correct response type received
      ListChunkDirIncrementalRespMsg* respMsgCast = (ListChunkDirIncrementalRespMsg*) respMsg.get();

      FhgfsOpsErr listRes = respMsgCast->getResult();

      if (listRes == FhgfsOpsErr_SUCCESS)
      {
         outNewOffset = respMsgCast->getNewOffset();
         respMsgCast->getNames().swap(outNames);
         respMsgCast->getEntryTypes().swap(outEntryTypes);
      }
      else
      if (listRes != FhgfsOpsErr_PATHNOTEXISTS)
      { // "path does not exist" is OK, because the path might have been deleted
         LogContext(__func__).log(Log_WARNING, "Error listing chunks dir; "
            "dirPath: " + dirPath + "; "
            "targetID: " + StringTk::uintToStr(targetID) + "; "
            "node: " + node.getTypedNodeID() + "; "
            "Error: " + boost::lexical_cast<std::string>(listRes));

         retVal = listRes;
      }
   }

   return retVal;
}

FhgfsOpsErr BuddyResyncerDirSyncSlave::findChunks(uint16_t targetID, const std::string& dirPath,
   StringList& inOutNames, IntList& inOutEntryTypes)
{
   App* app = Program::getApp();
   ChunkLockStore* chunkLockStore = app->getChunkLockStore();

   const auto& target = app->getStorageTargets()->getTargets().at(targetID);

   const int targetFD = *target->getMirrorFD();

   StringListIter namesIter = inOutNames.begin();
   IntListIter typesIter = inOutEntryTypes.begin();
   while (namesIter != inOutNames.end())
   {
      std::string entryID = *namesIter;
      DirEntryType entryType = (DirEntryType)*typesIter;

      std::string entryPath;
      if (likely(!dirPath.empty()))
         entryPath = dirPath + "/" + entryID;
      else
         entryPath = entryID;

      if (DirEntryType_ISDIR(entryType))
      {
         bool entryExists = StorageTk::pathExists(targetFD, entryPath);

         if (!entryExists)
         {
            // dir not found, so we didn't know about it yet => add it to the sync candidate
            // store, so that it gets checked and we get a list of its contents
            ChunkSyncCandidateDir syncCandidate(entryPath, targetID);
            syncCandidates->add(syncCandidate, this);
            numAdditionalDirsMatched.increase();
         }

         // no matter if found or not: remove it from the list, because we do not explicitly
         // delete directories on the buddy
         namesIter = inOutNames.erase(namesIter);
         typesIter = inOutEntryTypes.erase(typesIter);
      }
      else
      {
         // need to lock the chunk to check it
         chunkLockStore->lockChunk(targetID, entryID);

         bool entryExists = StorageTk::pathExists(targetFD, entryPath);

         if (entryExists)
         {
            // chunk found => delete it from the list and unlock it
            namesIter = inOutNames.erase(namesIter);
            typesIter = inOutEntryTypes.erase(typesIter);
            chunkLockStore->unlockChunk(targetID, entryID);
         }
         else
         {
            // chunk not found => keep lock; will be unlocked after removal
            namesIter++;
            typesIter++;
         }
      }
   }

   return FhgfsOpsErr_SUCCESS;
}

FhgfsOpsErr BuddyResyncerDirSyncSlave::removeBuddyChunkPaths(Node& node, uint16_t localTargetID,
   uint16_t buddyTargetID, StringList& paths)
{
   unsigned msgRetryIntervalMS = 5000;

   ChunkLockStore* chunkLockStore = Program::getApp()->getChunkLockStore();
   RmChunkPathsMsg rmMsg(buddyTargetID, &paths);
   rmMsg.addMsgHeaderFeatureFlag(RMCHUNKPATHSMSG_FLAG_BUDDYMIRROR);
   rmMsg.setMsgHeaderTargetID(buddyTargetID);

   CombinedTargetState state;
   bool getStateRes = Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);

   // send request to node and receive response
   std::unique_ptr<NetMessage> respMsg;

   while ((!respMsg) && (getStateRes)
      && (state.reachabilityState != TargetReachabilityState_OFFLINE))
   {
      respMsg = MessagingTk::requestResponse(node, rmMsg, NETMSGTYPE_RmChunkPathsResp);

      if (!respMsg)
      {
         LOG_DEBUG(__func__, Log_NOTICE,
            "Unable to communicate, but target is not offline; sleeping "
            + StringTk::uintToStr(msgRetryIntervalMS) + "ms before retry. targetID: "
            + StringTk::uintToStr(targetID));
         PThread::sleepMS(msgRetryIntervalMS);

         // if thread shall terminate, break loop here
         if ( getSelfTerminateNotIdle() )
            break;

         getStateRes = Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);
      }
   }

   // no matter whether that succeeded or not, we unlock all chunks here first
   for (StringListIter iter = paths.begin(); iter != paths.end(); iter++)
   {
      std::string entryID = StorageTk::getPathBasename(*iter);
      chunkLockStore->unlockChunk(localTargetID, entryID);
   }

   if (!respMsg)
   { // communication error
      LogContext(__func__).logErr(
         "Communication with storage node failed: " + node.getTypedNodeID());

      return FhgfsOpsErr_COMMUNICATION;
   }
   else
   if(!getStateRes)
   {
      LogContext(__func__).logErr("No valid state for node ID: " + node.getTypedNodeID() );

      return FhgfsOpsErr_INTERNAL;
   }
   else
   {
      // correct response type received
      RmChunkPathsRespMsg* respMsgCast = (RmChunkPathsRespMsg*) respMsg.get();
      StringList& failedPaths = respMsgCast->getFailedPaths();

      for(StringListIter iter = failedPaths.begin(); iter != failedPaths.end(); iter++)
      {
         LogContext(__func__).logErr("Chunk path could not be deleted; "
            "path: " + *iter + "; "
            "buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
            "node: " + node.getTypedNodeID());
      }
   }

   return FhgfsOpsErr_SUCCESS;
}
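doSync() pages through the buddy's chunk directory with an offset cursor: each ListChunkDirIncrementalMsg returns up to CHECK_AT_ONCE entries plus a new offset, and the loop ends as soon as a batch comes back short. The same cursor-pagination shape reduced to a runnable sketch over an in-memory "remote" listing; listIncremental() is a stand-in for the network round trip, and CHECK_AT_ONCE is shrunk to 5 for the demo.

   #include <algorithm>
   #include <cstdio>
   #include <string>
   #include <vector>

   constexpr size_t CHECK_AT_ONCE = 5; // demo value; the code above uses 50

   // stand-in for ListChunkDirIncrementalMsg: returns up to CHECK_AT_ONCE
   // entries starting at 'offset' and reports the new cursor position
   std::vector<std::string> listIncremental(const std::vector<std::string>& remote,
      size_t offset, size_t& outNewOffset)
   {
      size_t end = std::min(offset + CHECK_AT_ONCE, remote.size());
      outNewOffset = end;
      return {remote.begin() + offset, remote.begin() + end};
   }

   int main()
   {
      std::vector<std::string> remote;
      for (int i = 0; i < 12; i++)
         remote.push_back("chunk" + std::to_string(i));

      size_t offset = 0;
      size_t entriesFetched;

      do // same loop shape as doSync(): a full batch means there may be more
      {
         size_t newOffset;
         std::vector<std::string> names = listIncremental(remote, offset, newOffset);

         offset = newOffset;
         entriesFetched = names.size();

         std::printf("batch of %zu entries, cursor now at %zu\n", entriesFetched, offset);
         // ...match locally and remove leftovers, as findChunks()/removeBuddyChunkPaths() do...
      } while (entriesFetched == CHECK_AT_ONCE);
   }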
106
storage/source/components/buddyresyncer/BuddyResyncerDirSyncSlave.h
Normal file
@@ -0,0 +1,106 @@
#pragma once

#include <common/nodes/Node.h>
#include <common/storage/StorageErrors.h>
#include <common/threading/PThread.h>
#include <components/buddyresyncer/SyncCandidate.h>

class BuddyResyncerDirSyncSlave : public PThread
{
   friend class BuddyResyncer; // (to grant access to internal mutex)
   friend class BuddyResyncJob; // (to grant access to internal mutex)

   public:
      BuddyResyncerDirSyncSlave(uint16_t targetID, ChunkSyncCandidateStore* syncCandidates,
         uint8_t slaveID);
      virtual ~BuddyResyncerDirSyncSlave();

   private:
      Mutex statusMutex; // protects isRunning
      Condition isRunningChangeCond;

      AtomicSizeT onlyTerminateIfIdle;

      AtomicUInt64 numDirsSynced;
      AtomicUInt64 numAdditionalDirsMatched;
      AtomicUInt64 errorCount;

      bool isRunning; // true if an instance of this component is currently running

      uint16_t targetID;
      ChunkSyncCandidateStore* syncCandidates;

      virtual void run();
      void syncLoop();

      FhgfsOpsErr doSync(const std::string& dirPath, uint16_t localTargetID,
         uint16_t buddyTargetID);
      FhgfsOpsErr getBuddyDirContents(Node& node, const std::string& dirPath, uint16_t targetID,
         int64_t offset, StringList& outNames, IntList& outEntryTypes, int64_t& outNewOffset);
      FhgfsOpsErr findChunks(uint16_t targetID, const std::string& dirPath, StringList& inOutNames,
         IntList& inOutEntryTypes);
      FhgfsOpsErr removeBuddyChunkPaths(Node& node, uint16_t localTargetID, uint16_t buddyTargetID,
         StringList& paths);

   public:
      // getters & setters
      bool getIsRunning()
      {
         const std::lock_guard<Mutex> lock(statusMutex);

         return this->isRunning;
      }

      void setOnlyTerminateIfIdle(bool value)
      {
         if (value)
            onlyTerminateIfIdle.set(1);
         else
            onlyTerminateIfIdle.setZero();
      }

      bool getOnlyTerminateIfIdle()
      {
         if (onlyTerminateIfIdle.read() == 0)
            return false;
         else
            return true;
      }

      uint64_t getNumDirsSynced()
      {
         return numDirsSynced.read();
      }

      uint64_t getNumAdditionalDirsMatched()
      {
         return numAdditionalDirsMatched.read();
      }

      uint64_t getErrorCount()
      {
         return errorCount.read();
      }

   private:
      // getters & setters

      void setIsRunning(bool isRunning)
      {
         const std::lock_guard<Mutex> lock(statusMutex);

         this->isRunning = isRunning;
         isRunningChangeCond.broadcast();
      }

      bool getSelfTerminateNotIdle()
      {
         return ( (getSelfTerminate() && (!getOnlyTerminateIfIdle())) );
      }
};

typedef std::list<BuddyResyncerDirSyncSlave*> BuddyResyncerDirSyncSlaveList;
typedef BuddyResyncerDirSyncSlaveList::iterator BuddyResyncerDirSyncSlaveListIter;
typedef std::vector<BuddyResyncerDirSyncSlave*> BuddyResyncerDirSyncSlaveVec;
typedef BuddyResyncerDirSyncSlaveVec::iterator BuddyResyncerDirSyncSlaveVecIter;
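The slaves combine two flags into their shutdown protocol: selfTerminate() requests an exit, and onlyTerminateIfIdle decides whether that request may preempt queued work. getSelfTerminateNotIdle() is the "stop right now" test; a terminate request with onlyTerminateIfIdle still set only takes effect once the candidate queue runs dry. A compact runnable sketch of the same two-flag logic with std::atomic stand-ins (the real slaves block in fetch() rather than polling a deque):

   #include <atomic>
   #include <cstdio>
   #include <deque>
   #include <string>

   std::atomic<bool> selfTerminate{false};
   std::atomic<bool> onlyTerminateIfIdle{true};

   // mirrors getSelfTerminateNotIdle(): terminate immediately only if we are
   // allowed to stop with work still queued
   bool selfTerminateNotIdle()
   {
      return selfTerminate.load() && !onlyTerminateIfIdle.load();
   }

   int main()
   {
      std::deque<std::string> queue = {"a", "b", "c"};

      selfTerminate.store(true); // graceful stop requested; onlyTerminateIfIdle stays true

      while (!selfTerminateNotIdle())
      {
         if (queue.empty())
         {
            if (selfTerminate.load()) // drained and asked to stop => exit
               break;
            continue; // the real slaves block in fetch() here instead of spinning
         }

         std::printf("processing %s\n", queue.front().c_str());
         queue.pop_front();
      }
      // an abort would instead set onlyTerminateIfIdle=false, making
      // selfTerminateNotIdle() true and stopping before the queue drains
      std::printf("stopped with %zu entries left\n", queue.size());
   }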
471
storage/source/components/buddyresyncer/BuddyResyncerFileSyncSlave.cpp
Normal file
@@ -0,0 +1,471 @@
#include <app/App.h>
#include <common/net/message/storage/creating/RmChunkPathsMsg.h>
#include <common/net/message/storage/creating/RmChunkPathsRespMsg.h>
#include <common/net/message/storage/mirroring/ResyncLocalFileMsg.h>
#include <common/net/message/storage/mirroring/ResyncLocalFileRespMsg.h>
#include <toolkit/StorageTkEx.h>
#include <program/Program.h>

#include "BuddyResyncerFileSyncSlave.h"

#include <boost/lexical_cast.hpp>

#define PROCESS_AT_ONCE 1
#define SYNC_BLOCK_SIZE (1024*1024) // 1M

BuddyResyncerFileSyncSlave::BuddyResyncerFileSyncSlave(uint16_t targetID,
   ChunkSyncCandidateStore* syncCandidates, uint8_t slaveID) :
   PThread("BuddyResyncerFileSyncSlave_" + StringTk::uintToStr(targetID) + "-"
      + StringTk::uintToStr(slaveID))
{
   this->isRunning = false;
   this->syncCandidates = syncCandidates;
   this->targetID = targetID;
}

BuddyResyncerFileSyncSlave::~BuddyResyncerFileSyncSlave()
{
}

/**
 * This is a component which is started through its control frontend on-demand at runtime and
 * terminates when it's done.
 * We have to ensure (in cooperation with the control frontend) that we don't get multiple
 * instances of this thread running at the same time.
 */
void BuddyResyncerFileSyncSlave::run()
{
   setIsRunning(true);

   try
   {
      LogContext(__func__).log(Log_DEBUG, "Component started.");

      registerSignalHandler();

      numChunksSynced.setZero();
      errorCount.setZero();

      syncLoop();

      LogContext(__func__).log(Log_DEBUG, "Component stopped.");
   }
   catch (std::exception& e)
   {
      PThread::getCurrentThreadApp()->handleComponentException(e);
   }

   setIsRunning(false);
}

void BuddyResyncerFileSyncSlave::syncLoop()
{
   App* app = Program::getApp();
   MirrorBuddyGroupMapper* buddyGroupMapper = app->getMirrorBuddyGroupMapper();

   while (! getSelfTerminateNotIdle())
   {
      if((syncCandidates->isFilesEmpty()) && (getSelfTerminate()))
         break;

      ChunkSyncCandidateFile candidate;

      syncCandidates->fetch(candidate, this);

      if (unlikely(candidate.getTargetID() == 0)) // ignore targetID 0
         continue;

      std::string relativePath = candidate.getRelativePath();
      uint16_t localTargetID = candidate.getTargetID();

      // get buddy targetID
      uint16_t buddyTargetID = buddyGroupMapper->getBuddyTargetID(localTargetID);
      // perform sync
      FhgfsOpsErr resyncRes = doResync(relativePath, localTargetID, buddyTargetID);
      if (resyncRes == FhgfsOpsErr_SUCCESS)
         numChunksSynced.increase();
      else
      if (resyncRes != FhgfsOpsErr_INTERRUPTED)
         errorCount.increase();
   }
}

FhgfsOpsErr BuddyResyncerFileSyncSlave::doResync(std::string& chunkPathStr, uint16_t localTargetID,
   uint16_t buddyTargetID)
{
   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;
   unsigned msgRetryIntervalMS = 5000;

   App* app = Program::getApp();
   TargetMapper* targetMapper = app->getTargetMapper();
   NodeStoreServers* storageNodes = app->getStorageNodes();
   ChunkLockStore* chunkLockStore = app->getChunkLockStore();

   std::string entryID = StorageTk::getPathBasename(chunkPathStr);

   // try to find the node with the buddyTargetID
   NumNodeID buddyNodeID = targetMapper->getNodeID(buddyTargetID);

   auto node = storageNodes->referenceNode(buddyNodeID);

   if(!node)
   {
      LogContext(__func__).log(Log_WARNING,
         "Storage node does not exist; nodeID " + buddyNodeID.str());

      return FhgfsOpsErr_UNKNOWNNODE;
   }

   int64_t offset = 0;
   ssize_t readRes = 0;
   unsigned resyncMsgFlags = 0;
   resyncMsgFlags |= RESYNCLOCALFILEMSG_FLAG_BUDDYMIRROR;

   LogContext(__func__).log(Log_DEBUG,
      "File sync started. chunkPath: " + chunkPathStr + "; localTargetID: "
      + StringTk::uintToStr(localTargetID) + "; buddyTargetID: "
      + StringTk::uintToStr(buddyTargetID));

   do
   {
      boost::scoped_array<char> data(new char[SYNC_BLOCK_SIZE]);

      const auto& target = app->getStorageTargets()->getTargets().at(localTargetID);

      // lock the chunk
      chunkLockStore->lockChunk(localTargetID, entryID);

      const int fd = openat(*target->getMirrorFD(), chunkPathStr.c_str(), O_RDONLY | O_NOATIME);

      if (fd == -1)
      {
         int errCode = errno;

         if(errCode == ENOENT)
         { // chunk was deleted => no error
            // delete the mirror chunk and return
            bool rmRes = removeBuddyChunkUnlocked(*node, buddyTargetID, chunkPathStr);

            if (!rmRes) // rm failed; stop resync
            {
               LogContext(__func__).log(Log_WARNING,
                  "File sync not started. chunkPath: " + chunkPathStr + "; localTargetID: "
                  + StringTk::uintToStr(localTargetID) + "; buddyTargetID: "
                  + StringTk::uintToStr(buddyTargetID));

               retVal = FhgfsOpsErr_INTERNAL;
            }
         }
         else // error => log and return
         {
            LogContext(__func__).logErr(
               "Open of chunk failed. chunkPath: " + chunkPathStr + "; targetID: "
               + StringTk::uintToStr(localTargetID) + "; Error: "
               + System::getErrString(errCode));

            retVal = FhgfsOpsErr_INTERNAL;
         }

         chunkLockStore->unlockChunk(localTargetID, entryID);

         goto cleanup;
      }

      int seekRes = lseek(fd, offset, SEEK_SET);

      if (seekRes == -1)
      {
         LogContext(__func__).logErr(
            "Seeking in chunk failed. chunkPath: " + chunkPathStr + "; targetID: "
            + StringTk::uintToStr(localTargetID) + "; offset: " + StringTk::int64ToStr(offset));

         chunkLockStore->unlockChunk(localTargetID, entryID);

         goto cleanup;
      }

      readRes = read(fd, data.get(), SYNC_BLOCK_SIZE);

      if( readRes == -1)
      {
         LogContext(__func__).logErr("Error during read; "
            "chunkPath: " + chunkPathStr + "; "
            "targetID: " + StringTk::uintToStr(localTargetID) + "; "
            "BuddyNode: " + node->getTypedNodeID() + "; "
            "buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
            "Error: " + System::getErrString(errno));

         retVal = FhgfsOpsErr_INTERNAL;

         goto end_of_loop;
      }

      if(readRes > 0)
      {
         const char zeroBuf[RESYNCER_SPARSE_BLOCK_SIZE] = { 0 };

         // check if sparse blocks are in the buffer
         ssize_t bufPos = 0;
         bool dataFound = false;
         while (bufPos < readRes)
         {
            size_t cmpLen = BEEGFS_MIN(readRes-bufPos, RESYNCER_SPARSE_BLOCK_SIZE);

            int cmpRes = memcmp(data.get() + bufPos, zeroBuf, cmpLen);
            if(cmpRes != 0)
               dataFound = true;
            else // sparse area detected
            {
               if(dataFound) // had data before
               {
                  resyncMsgFlags |= RESYNCLOCALFILEMSG_CHECK_SPARSE; // let the receiver do a check
                  break; // and stop checking here
               }
            }

            bufPos += cmpLen;
         }

         // this inner loop is over and there are only sparse areas

         /* make sure we always send a msg at offset==0 to truncate the file and allow concurrent
            writers in a big initial sparse area */
         if(offset && (readRes > 0) && (readRes == SYNC_BLOCK_SIZE) && !dataFound)
         {
            goto end_of_loop;
            // => no transfer needed
         }

         /* let the receiver do a check, because we might be sending a sparse block at the
            beginning or end of the file */
         if(!dataFound)
            resyncMsgFlags |= RESYNCLOCALFILEMSG_CHECK_SPARSE;
      }

      {
         ResyncLocalFileMsg resyncMsg(data.get(), chunkPathStr, buddyTargetID, offset, readRes);

         if (!readRes || (readRes < SYNC_BLOCK_SIZE) ) // last iteration: set attribs and trunc
         {                                             // the buddy chunk
            struct stat statBuf;
            int statRes = fstat(fd, &statBuf);

            if (statRes == 0)
            {
               if(statBuf.st_size < offset)
               { // in case someone truncated the file while we're reading at a high offset
                  offset = statBuf.st_size;
                  resyncMsg.setOffset(offset);
               }
               else
               if(offset && !readRes)
                  resyncMsgFlags |= RESYNCLOCALFILEMSG_FLAG_TRUNC;

               int mode = statBuf.st_mode;
               unsigned userID = statBuf.st_uid;
               unsigned groupID = statBuf.st_gid;
               int64_t mtimeSecs = statBuf.st_mtim.tv_sec;
               int64_t atimeSecs = statBuf.st_atim.tv_sec;
               SettableFileAttribs chunkAttribs = {mode, userID, groupID, mtimeSecs, atimeSecs};
               resyncMsg.setChunkAttribs(chunkAttribs);
               resyncMsgFlags |= RESYNCLOCALFILEMSG_FLAG_SETATTRIBS;
            }
            else
            {
               LogContext(__func__).logErr("Error getting chunk attributes; "
                  "chunkPath: " + chunkPathStr + "; "
                  "targetID: " + StringTk::uintToStr(localTargetID) + "; "
                  "BuddyNode: " + node->getTypedNodeID() + "; "
                  "buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
                  "Error: " + System::getErrString(errno));
            }
         }

         resyncMsg.setMsgHeaderFeatureFlags(resyncMsgFlags);
         resyncMsg.setMsgHeaderTargetID(buddyTargetID);

         CombinedTargetState state;
         bool getStateRes =
            Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);

         // send request to node and receive response
         std::unique_ptr<NetMessage> respMsg;

         while ( (!respMsg) && (getStateRes)
            && (state.reachabilityState != TargetReachabilityState_OFFLINE) )
         {
            respMsg = MessagingTk::requestResponse(*node, resyncMsg,
               NETMSGTYPE_ResyncLocalFileResp);

            if (!respMsg)
            {
               LOG_DEBUG(__func__, Log_NOTICE,
                  "Unable to communicate, but target is not offline; sleeping "
                  + StringTk::uintToStr(msgRetryIntervalMS) + "ms before retry. targetID: "
                  + StringTk::uintToStr(targetID));

               PThread::sleepMS(msgRetryIntervalMS);

               // if thread shall terminate, break loop here
               if ( getSelfTerminateNotIdle() )
                  break;

               getStateRes =
                  Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);
            }
         }

         if (!respMsg)
         { // communication error
            LogContext(__func__).log(Log_WARNING,
               "Communication with storage node failed: " + node->getTypedNodeID());

            retVal = FhgfsOpsErr_COMMUNICATION;

            // set readRes to non-zero to force exiting the loop
            readRes = -2;
         }
         else
         if(!getStateRes)
         {
            LogContext(__func__).log(Log_WARNING,
               "No valid state for node ID: " + node->getTypedNodeID());

            retVal = FhgfsOpsErr_INTERNAL;

            // set readRes to non-zero to force exiting the loop
            readRes = -2;
         }
         else
         {
|
||||
// correct response type received
|
||||
ResyncLocalFileRespMsg* respMsgCast = (ResyncLocalFileRespMsg*) respMsg.get();
|
||||
|
||||
FhgfsOpsErr syncRes = respMsgCast->getResult();
|
||||
|
||||
if(syncRes != FhgfsOpsErr_SUCCESS)
|
||||
{
|
||||
LogContext(__func__).log(Log_WARNING, "Error during resync; "
|
||||
"chunkPath: " + chunkPathStr + "; "
|
||||
"targetID: " + StringTk::uintToStr(localTargetID) + "; "
|
||||
"BuddyNode: " + node->getTypedNodeID() + "; "
|
||||
"buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
|
||||
"Error: " + boost::lexical_cast<std::string>(syncRes));
|
||||
|
||||
retVal = syncRes;
|
||||
|
||||
// set readRes to non-zero to force exiting loop
|
||||
readRes = -2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
end_of_loop:
|
||||
int closeRes = close(fd);
|
||||
if (closeRes == -1)
|
||||
{
|
||||
LogContext(__func__).log(Log_WARNING, "Error closing file descriptor; "
|
||||
"chunkPath: " + chunkPathStr + "; "
|
||||
"targetID: " + StringTk::uintToStr(localTargetID) + "; "
|
||||
"BuddyNode: " + node->getTypedNodeID() + "; "
|
||||
"buddyTargetID: " + StringTk::uintToStr(buddyTargetID) + "; "
|
||||
"Error: " + System::getErrString(errno));
|
||||
}
|
||||
// unlock the chunk
|
||||
chunkLockStore->unlockChunk(localTargetID, entryID);
|
||||
|
||||
// increment offset for next iteration
|
||||
offset += readRes;
|
||||
|
||||
if ( getSelfTerminateNotIdle() )
|
||||
{
|
||||
retVal = FhgfsOpsErr_INTERRUPTED;
|
||||
break;
|
||||
}
|
||||
|
||||
} while (readRes == SYNC_BLOCK_SIZE);
|
||||
|
||||
cleanup:
|
||||
LogContext(__func__).log(Log_DEBUG, "File sync finished. chunkPath: " + chunkPathStr);
|
||||
|
||||
return retVal;
|
||||
}
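
/* For reference, the sparse-block handling inside the read loop above reduces to a single scan:
 * compare the buffer in RESYNCER_SPARSE_BLOCK_SIZE steps against a zero buffer and track whether
 * any data was seen and whether data preceded a zero run. A condensed, illustrative sketch of that
 * logic follows (disabled; the helper name and signature are ours, not part of the upstream code):
 */
#if 0
static bool bufferNeedsSparseCheck(const char* data, ssize_t len, bool& outDataFound)
{
   static const char zeroBuf[RESYNCER_SPARSE_BLOCK_SIZE] = { 0 };

   outDataFound = false;

   for (ssize_t bufPos = 0; bufPos < len; )
   {
      size_t cmpLen = BEEGFS_MIN(len - bufPos, RESYNCER_SPARSE_BLOCK_SIZE);

      if (memcmp(data + bufPos, zeroBuf, cmpLen) != 0)
         outDataFound = true; // found a non-zero region
      else if (outDataFound)
         return true; // data followed by a zero run => receiver must re-check for sparse areas

      bufPos += cmpLen;
   }

   return !outDataFound; // an all-zero buffer is also left to the receiver to verify
}
#endif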

/**
 * Note: Chunk has to be locked by caller.
 */
bool BuddyResyncerFileSyncSlave::removeBuddyChunkUnlocked(Node& node, uint16_t buddyTargetID,
   std::string& pathStr)
{
   bool retVal = true;
   unsigned msgRetryIntervalMS = 5000;

   std::string entryID = StorageTk::getPathBasename(pathStr);
   StringList rmPaths;
   rmPaths.push_back(pathStr);

   RmChunkPathsMsg rmMsg(buddyTargetID, &rmPaths);
   rmMsg.addMsgHeaderFeatureFlag(RMCHUNKPATHSMSG_FLAG_BUDDYMIRROR);
   rmMsg.setMsgHeaderTargetID(buddyTargetID);

   CombinedTargetState state;
   bool getStateRes = Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);

   // send request to node and receive response
   std::unique_ptr<NetMessage> respMsg;

   while ((!respMsg) && (getStateRes)
      && (state.reachabilityState != TargetReachabilityState_OFFLINE))
   {
      respMsg = MessagingTk::requestResponse(node, rmMsg, NETMSGTYPE_RmChunkPathsResp);

      if (!respMsg)
      {
         LOG_DEBUG(__func__, Log_NOTICE,
            "Unable to communicate, but target is not offline; "
            "sleeping " + StringTk::uintToStr(msgRetryIntervalMS) + " ms before retry. "
            "targetID: " + StringTk::uintToStr(targetID));

         PThread::sleepMS(msgRetryIntervalMS);

         // if thread shall terminate, break loop here
         if (getSelfTerminateNotIdle())
            break;

         getStateRes = Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);
      }
   }

   if (!respMsg)
   { // communication error
      LogContext(__func__).logErr(
         "Communication with storage node failed: " + node.getTypedNodeID());

      return false;
   }
   else
   if (!getStateRes)
   {
      LogContext(__func__).log(Log_WARNING,
         "No valid state for node ID: " + node.getTypedNodeID());

      return false;
   }
   else
   {
      // correct response type received
      RmChunkPathsRespMsg* respMsgCast = (RmChunkPathsRespMsg*) respMsg.get();
      StringList& failedPaths = respMsgCast->getFailedPaths();

      for (StringListIter iter = failedPaths.begin(); iter != failedPaths.end(); iter++)
      {
         LogContext(__func__).logErr("Chunk path could not be deleted; "
            "path: " + *iter + "; "
            "targetID: " + StringTk::uintToStr(targetID) + "; "
            "node: " + node.getTypedNodeID());
         retVal = false;
      }
   }

   return retVal;
}
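
/* doResync() and removeBuddyChunkUnlocked() share the same messaging idiom: retry while the buddy
 * target is merely unreachable (a transient state), but give up once it is reported OFFLINE or the
 * thread is asked to terminate. A sketch of that loop factored into a helper; the function name
 * and the hard-coded retry interval are illustrative assumptions, not upstream API:
 */
#if 0
static std::unique_ptr<NetMessage> requestWithRetry(Node& node, NetMessage& msg,
   unsigned respMsgType, uint16_t buddyTargetID)
{
   std::unique_ptr<NetMessage> respMsg;
   CombinedTargetState state;
   bool getStateRes = Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);

   while (!respMsg && getStateRes
      && (state.reachabilityState != TargetReachabilityState_OFFLINE))
   {
      respMsg = MessagingTk::requestResponse(node, msg, respMsgType);

      if (!respMsg)
      { // no response yet: back off, then refresh the target state and retry
         PThread::sleepMS(5000);
         getStateRes = Program::getApp()->getTargetStateStore()->getState(buddyTargetID, state);
      }
   }

   return respMsg; // empty => communication failed or target went offline
}
#endif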

@@ -0,0 +1,98 @@
#pragma once

#include <common/storage/mirroring/SyncCandidateStore.h>
#include <common/nodes/Node.h>
#include <common/storage/StorageErrors.h>
#include <common/threading/PThread.h>

#include <mutex>

class BuddyResyncerFileSyncSlave : public PThread
{
   friend class BuddyResyncer; // (to grant access to internal mutex)
   friend class BuddyResyncJob; // (to grant access to internal mutex)

   public:
      BuddyResyncerFileSyncSlave(uint16_t targetID, ChunkSyncCandidateStore* syncCandidates,
         uint8_t slaveID);
      virtual ~BuddyResyncerFileSyncSlave();

   private:
      AtomicSizeT onlyTerminateIfIdle; // atomic quasi-bool

      Mutex statusMutex; // protects isRunning
      Condition isRunningChangeCond;

      AtomicUInt64 numChunksSynced;
      AtomicUInt64 errorCount;

      bool isRunning; // true if an instance of this component is currently running

      uint16_t targetID;

      ChunkSyncCandidateStore* syncCandidates;

      virtual void run();
      void syncLoop();
      FhgfsOpsErr doResync(std::string& chunkPathStr, uint16_t localTargetID,
         uint16_t buddyTargetID);
      bool removeBuddyChunkUnlocked(Node& node, uint16_t buddyTargetID, std::string& pathStr);

   public:
      // getters & setters
      bool getIsRunning()
      {
         const std::lock_guard<Mutex> lock(statusMutex);

         return this->isRunning;
      }

      void setOnlyTerminateIfIdle(bool value)
      {
         if (value)
            onlyTerminateIfIdle.set(1);
         else
            onlyTerminateIfIdle.setZero();
      }

      bool getOnlyTerminateIfIdle()
      {
         return (onlyTerminateIfIdle.read() != 0);
      }

      uint64_t getNumChunksSynced()
      {
         return numChunksSynced.read();
      }

      uint64_t getErrorCount()
      {
         return errorCount.read();
      }

   private:
      // getters & setters

      void setIsRunning(bool isRunning)
      {
         const std::lock_guard<Mutex> lock(statusMutex);

         this->isRunning = isRunning;
         isRunningChangeCond.broadcast();
      }

      bool getSelfTerminateNotIdle()
      {
         return (getSelfTerminate() && (!getOnlyTerminateIfIdle()));
      }
};

typedef std::list<BuddyResyncerFileSyncSlave*> BuddyResyncerFileSyncSlaveList;
typedef BuddyResyncerFileSyncSlaveList::iterator BuddyResyncerFileSyncSlaveListIter;

typedef std::vector<BuddyResyncerFileSyncSlave*> BuddyResyncerFileSyncSlaveVec;
typedef BuddyResyncerFileSyncSlaveVec::iterator BuddyResyncerFileSyncSlaveVecIter;
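
/* Typical shutdown handshake as used by the owning job (a sketch under assumptions:
 * selfTerminate() is inherited from PThread, and a real caller would wait on isRunningChangeCond
 * instead of polling):
 */
#if 0
static void stopSlaveGracefully(BuddyResyncerFileSyncSlave& slave)
{
   slave.setOnlyTerminateIfIdle(true); // finish queued candidates instead of aborting mid-chunk
   slave.selfTerminate();              // request termination (assumed PThread method)

   while (slave.getIsRunning())
      PThread::sleepMS(100);           // poll until run() flips isRunning back to false
}
#endif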

@@ -0,0 +1,162 @@
#include <app/App.h>
#include <toolkit/StorageTkEx.h>
#include <storage/StorageTargets.h>

#include <program/Program.h>

#include <mutex>

#include "BuddyResyncerGatherSlave.h"

Mutex BuddyResyncerGatherSlave::staticGatherSlavesMutex;
std::map<std::string, BuddyResyncerGatherSlave*> BuddyResyncerGatherSlave::staticGatherSlaves;

BuddyResyncerGatherSlave::BuddyResyncerGatherSlave(const StorageTarget& target,
   ChunkSyncCandidateStore* syncCandidates, BuddyResyncerGatherSlaveWorkQueue* workQueue,
   uint8_t slaveID) :
   PThread("BuddyResyncerGatherSlave_" + StringTk::uintToStr(target.getID()) + "-" +
      StringTk::uintToStr(slaveID)),
   target(target)
{
   this->isRunning = false;
   this->syncCandidates = syncCandidates;
   this->workQueue = workQueue;

   const std::lock_guard<Mutex> lock(staticGatherSlavesMutex);

   staticGatherSlaves[this->getName()] = this;
}

BuddyResyncerGatherSlave::~BuddyResyncerGatherSlave()
{
}

/**
 * This component is started on demand through its control frontend at runtime and terminates when
 * it's done.
 * We have to ensure (in cooperation with the control frontend) that we don't get multiple
 * instances of this thread running at the same time.
 */
void BuddyResyncerGatherSlave::run()
{
   setIsRunning(true);

   numChunksDiscovered.setZero();
   numChunksMatched.setZero();
   numDirsDiscovered.setZero();
   numDirsMatched.setZero();

   try
   {
      LogContext(__func__).log(Log_DEBUG, "Component started.");

      registerSignalHandler();

      workLoop();

      LogContext(__func__).log(Log_DEBUG, "Component stopped.");
   }
   catch(std::exception& e)
   {
      PThread::getCurrentThreadApp()->handleComponentException(e);
   }

   setIsRunning(false);
}

void BuddyResyncerGatherSlave::workLoop()
{
   const unsigned maxOpenFDsNum = 20; // max open FDs => max path sub-depth for efficient traversal

   while (!getSelfTerminateNotIdle())
   {
      if ((workQueue->queueEmpty()) && (getSelfTerminate()))
         break;

      // get a directory to scan
      std::string pathStr = workQueue->fetch(this);

      if (unlikely(pathStr.empty()))
         continue;

      int nftwRes = nftw(pathStr.c_str(), handleDiscoveredEntry, maxOpenFDsNum, FTW_ACTIONRETVAL);
      if (nftwRes == -1)
      { // error occurred
         LogContext(__func__).logErr("Error during chunks walk. SysErr: " + System::getErrString());
      }
   }
}
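
/* nftw() drives the walk: it invokes handleDiscoveredEntry() for every entry below the fetched
 * path while keeping at most maxOpenFDsNum directory FDs open, and FTW_ACTIONRETVAL (a GNU
 * extension) lets the callback steer the traversal via FTW_CONTINUE / FTW_SKIP_SUBTREE / FTW_STOP.
 * A minimal standalone sketch of the same pattern (illustrative; assumes _GNU_SOURCE is defined
 * before including <ftw.h>):
 */
#if 0
#include <ftw.h>
#include <cstdio>

static int printEntry(const char* path, const struct stat* statBuf, int ftwEntryType,
   struct FTW* ftwBuf)
{
   std::printf("%s (type %d, depth %d)\n", path, ftwEntryType, ftwBuf->level);
   return FTW_CONTINUE;
}

int main()
{
   // walk /tmp with the same FD bound the slave uses above
   return (nftw("/tmp", printEntry, 20, FTW_ACTIONRETVAL) == -1) ? 1 : 0;
}
#endif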

int BuddyResyncerGatherSlave::handleDiscoveredEntry(const char* path,
   const struct stat* statBuf, int ftwEntryType, struct FTW* ftwBuf)
{
   std::string chunksPath;

   BuddyResyncerGatherSlave* thisStatic = nullptr;
   {
      const std::lock_guard<Mutex> lock(staticGatherSlavesMutex);

      thisStatic = staticGatherSlaves[PThread::getCurrentThreadName()];
   }

   App* app = Program::getApp();
   Config* cfg = app->getConfig();

   const auto& targetPath = thisStatic->target.getPath().str();
   chunksPath = targetPath + "/" + CONFIG_BUDDYMIRROR_SUBDIR_NAME;

   if (strlen(path) <= chunksPath.length())
      return FTW_CONTINUE;

   std::string relPathStr = path + chunksPath.size() + 1;

   if (relPathStr.empty())
      return FTW_CONTINUE;

   const auto lastBuddyComm = thisStatic->target.getLastBuddyComm();

   const bool buddyCommIsOverride = lastBuddyComm.first;
   int64_t lastBuddyCommTimeSecs = std::chrono::system_clock::to_time_t(lastBuddyComm.second);
   int64_t lastBuddyCommSafetyThresholdSecs = cfg->getSysResyncSafetyThresholdMins() * 60;
   if ((lastBuddyCommSafetyThresholdSecs == 0) && (!buddyCommIsOverride)) // ignore timestamp file
      lastBuddyCommTimeSecs = 0;
   else
   if (lastBuddyCommTimeSecs > lastBuddyCommSafetyThresholdSecs)
      lastBuddyCommTimeSecs -= lastBuddyCommSafetyThresholdSecs;

   if (ftwEntryType == FTW_D) // directory
   {
      thisStatic->numDirsDiscovered.increase();

      int64_t dirModificationTime = (int64_t)statBuf->st_mtim.tv_sec;

      if (dirModificationTime > lastBuddyCommTimeSecs)
      { // sync candidate
         ChunkSyncCandidateDir candidate(relPathStr, thisStatic->target.getID());
         thisStatic->syncCandidates->add(candidate, thisStatic);
         thisStatic->numDirsMatched.increase();
      }
   }
   else
   if (ftwEntryType == FTW_F) // file
   {
      // we found a chunk
      thisStatic->numChunksDiscovered.increase();

      // we need to use ctime here, because mtime can be set manually (even to the future)
      time_t chunkChangeTime = statBuf->st_ctim.tv_sec;

      if (chunkChangeTime > lastBuddyCommTimeSecs)
      { // sync candidate (relPathStr was already derived from the full path above)
         ChunkSyncCandidateFile candidate(relPathStr, thisStatic->target.getID());
         thisStatic->syncCandidates->add(candidate, thisStatic);

         thisStatic->numChunksMatched.increase();
      }
   }

   return FTW_CONTINUE;
}
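
/* The timestamp handling above boils down to a small pure function: the cutoff for "changed since
 * the buddy was last in sync" is the last buddy communication time minus a configurable safety
 * margin, and a zero margin without a manual override means "consider everything". Sketch with
 * illustrative names:
 */
#if 0
static int64_t effectiveResyncCutoffSecs(int64_t lastBuddyCommTimeSecs, bool isOverride,
   int64_t safetyThresholdSecs)
{
   if ((safetyThresholdSecs == 0) && !isOverride)
      return 0; // no threshold and no override => every entry becomes a sync candidate

   if (lastBuddyCommTimeSecs > safetyThresholdSecs)
      return lastBuddyCommTimeSecs - safetyThresholdSecs; // rewind by the safety margin

   return lastBuddyCommTimeSecs;
}
#endif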

@@ -0,0 +1,182 @@
#pragma once

#include <common/app/log/LogContext.h>
#include <common/storage/mirroring/SyncCandidateStore.h>
#include <common/components/ComponentInitException.h>
#include <common/threading/PThread.h>

#include <ftw.h>

class StorageTarget;

#define GATHERSLAVEQUEUE_MAXSIZE 5000

class BuddyResyncerGatherSlaveWorkQueue
{
   /*
    * This is more or less just a small convenience class, tightly coupled to
    * BuddyResyncerGatherSlave and BuddyResyncJob.
    */
   public:
      BuddyResyncerGatherSlaveWorkQueue(): gatherSlavesWorkQueueLen(0) { }

   private:
      StringList paths;
      size_t gatherSlavesWorkQueueLen; // used to avoid constantly calling the list's size() method
      Mutex mutex;
      Condition pathAddedCond;
      Condition pathFetchedCond;

   public:
      void add(std::string& path, PThread* caller)
      {
         unsigned waitTimeoutMS = 3000;

         const std::lock_guard<Mutex> lock(mutex);

         while (gatherSlavesWorkQueueLen > GATHERSLAVEQUEUE_MAXSIZE)
         {
            if ((caller) && (unlikely(caller->getSelfTerminate())))
               break;
            pathFetchedCond.timedwait(&mutex, waitTimeoutMS);
         }

         paths.push_back(path);
         gatherSlavesWorkQueueLen++;
         pathAddedCond.signal();
      }

      std::string fetch(PThread* caller)
      {
         unsigned waitTimeoutMS = 3000;

         const std::lock_guard<Mutex> lock(mutex);

         while (paths.empty())
         {
            if ((caller) && (unlikely(caller->getSelfTerminate())))
            {
               return "";
            }

            pathAddedCond.timedwait(&mutex, waitTimeoutMS);
         }

         std::string retVal = paths.front();
         paths.pop_front();
         gatherSlavesWorkQueueLen--;
         pathFetchedCond.signal();

         return retVal;
      }

      bool queueEmpty()
      {
         const std::lock_guard<Mutex> lock(mutex);

         return gatherSlavesWorkQueueLen == 0;
      }

      void clear()
      {
         const std::lock_guard<Mutex> lock(mutex);

         paths.clear();
         gatherSlavesWorkQueueLen = 0;
      }
};
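
/* The queue above is a classic bounded producer/consumer handoff: add() blocks while more than
 * GATHERSLAVEQUEUE_MAXSIZE paths are pending, fetch() blocks while the queue is empty, and both
 * wake up every 3s to honor getSelfTerminate(). Hypothetical usage (process() stands in for the
 * consumer-side work):
 */
#if 0
static void exampleHandoff(BuddyResyncerGatherSlaveWorkQueue& queue, PThread* producer,
   PThread* consumer)
{
   std::string path = "/data/target1/buddymir"; // illustrative path

   queue.add(path, producer); // blocks while the queue is over its size limit

   std::string next = queue.fetch(consumer); // blocks while the queue is empty
   if (!next.empty()) // an empty string means the consumer was asked to terminate
      process(next);
}
#endif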

class BuddyResyncerGatherSlave : public PThread
{
   friend class BuddyResyncer; // (to grant access to internal mutex)
   friend class BuddyResyncJob; // (to grant access to internal mutex)

   public:
      BuddyResyncerGatherSlave(const StorageTarget& target, ChunkSyncCandidateStore* syncCandidates,
         BuddyResyncerGatherSlaveWorkQueue* workQueue, uint8_t slaveID);
      virtual ~BuddyResyncerGatherSlave();

      void workLoop();

   private:
      AtomicSizeT onlyTerminateIfIdle; // atomic quasi-bool

      Mutex statusMutex; // protects isRunning
      Condition isRunningChangeCond;

      const StorageTarget& target;

      AtomicUInt64 numChunksDiscovered;
      AtomicUInt64 numChunksMatched;

      AtomicUInt64 numDirsDiscovered;
      AtomicUInt64 numDirsMatched;

      bool isRunning; // true if an instance of this component is currently running

      ChunkSyncCandidateStore* syncCandidates;
      BuddyResyncerGatherSlaveWorkQueue* workQueue;

      // the nftw() callback needs access to the slave threads
      static Mutex staticGatherSlavesMutex;
      static std::map<std::string, BuddyResyncerGatherSlave*> staticGatherSlaves;

      virtual void run();

      static int handleDiscoveredEntry(const char* path, const struct stat* statBuf,
         int ftwEntryType, struct FTW* ftwBuf);

   public:
      // getters & setters
      bool getIsRunning()
      {
         const std::lock_guard<Mutex> lock(statusMutex);

         return this->isRunning;
      }

      void getCounters(uint64_t& outNumChunksDiscovered, uint64_t& outNumChunksMatched,
         uint64_t& outNumDirsDiscovered, uint64_t& outNumDirsMatched)
      {
         outNumChunksDiscovered = numChunksDiscovered.read();
         outNumChunksMatched = numChunksMatched.read();
         outNumDirsDiscovered = numDirsDiscovered.read();
         outNumDirsMatched = numDirsMatched.read();
      }

      void setOnlyTerminateIfIdle(bool value)
      {
         if (value)
            onlyTerminateIfIdle.set(1);
         else
            onlyTerminateIfIdle.setZero();
      }

      bool getOnlyTerminateIfIdle()
      {
         return (onlyTerminateIfIdle.read() != 0);
      }

   private:
      // getters & setters

      void setIsRunning(bool isRunning)
      {
         const std::lock_guard<Mutex> lock(statusMutex);

         this->isRunning = isRunning;
         isRunningChangeCond.broadcast();
      }

      bool getSelfTerminateNotIdle()
      {
         return (getSelfTerminate() && (!getOnlyTerminateIfIdle()));
      }
};

typedef std::vector<BuddyResyncerGatherSlave*> BuddyResyncerGatherSlaveVec;
typedef BuddyResyncerGatherSlaveVec::iterator BuddyResyncerGatherSlaveVecIter;

44
storage/source/components/buddyresyncer/SyncCandidate.h
Normal file
@@ -0,0 +1,44 @@
#pragma once

#include <common/storage/mirroring/SyncCandidateStore.h>

#include <string>

/**
 * A storage sync candidate. Has a target ID and a path.
 */
class ChunkSyncCandidateDir
{
   public:
      ChunkSyncCandidateDir(const std::string& relativePath, const uint16_t targetID)
         : relativePath(relativePath), targetID(targetID)
      { }

      ChunkSyncCandidateDir()
         : targetID(0)
      { }

   private:
      std::string relativePath;
      uint16_t targetID;

   public:
      const std::string& getRelativePath() const { return relativePath; }
      uint16_t getTargetID() const { return targetID; }
};

/**
 * A storage sync candidate for a single chunk file.
 */
class ChunkSyncCandidateFile : public ChunkSyncCandidateDir
{
   public:
      ChunkSyncCandidateFile(const std::string& relativePath, uint16_t targetID)
         : ChunkSyncCandidateDir(relativePath, targetID)
      { }

      ChunkSyncCandidateFile() = default;
};

typedef SyncCandidateStore<ChunkSyncCandidateDir, ChunkSyncCandidateFile> ChunkSyncCandidateStore;
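
/* The candidate classes are plain value objects keyed by (relativePath, targetID): gather slaves
 * produce them, sync slaves consume them through the shared ChunkSyncCandidateStore. A sketch of
 * the producer side, matching the add() call used by BuddyResyncerGatherSlave (the path and
 * targetID values are illustrative):
 */
#if 0
static void queueCandidate(ChunkSyncCandidateStore& store, PThread* caller)
{
   ChunkSyncCandidateFile candidate("u2D8/5A9C/chunk0", 1);

   // caller is passed so the store can honor the thread's terminate flag
   // (assumed, mirroring the gather-slave work queue above)
   store.add(candidate, caller);
}
#endif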