// beegfs/meta/source/components/InternodeSyncer.cpp
#include <common/net/message/nodes/HeartbeatMsg.h>
#include <common/net/message/nodes/ChangeTargetConsistencyStatesMsg.h>
#include <common/net/message/nodes/ChangeTargetConsistencyStatesRespMsg.h>
#include <common/net/message/nodes/RefreshCapacityPoolsMsg.h>
#include <common/net/message/storage/SetStorageTargetInfoMsg.h>
#include <common/net/message/storage/SetStorageTargetInfoRespMsg.h>
#include <common/net/message/storage/quota/RequestExceededQuotaMsg.h>
#include <common/net/message/storage/quota/RequestExceededQuotaRespMsg.h>
#include <common/toolkit/MessagingTk.h>
#include <common/toolkit/NodesTk.h>
#include <common/toolkit/SessionTk.h>
#include <common/toolkit/Time.h>
#include <common/toolkit/ZipIterator.h>
#include <common/nodes/NodeStore.h>
#include <common/nodes/TargetCapacityPools.h>
#include <app/App.h>
#include <app/config/Config.h>
#include <net/msghelpers/MsgHelperClose.h>
#include <program/Program.h>
#include <toolkit/BuddyCommTk.h>
#include "InternodeSyncer.h"
#include <boost/lexical_cast.hpp>
// forward declaration
namespace UUID {
std::string getMachineUUID();
}
InternodeSyncer::InternodeSyncer(TargetConsistencyState initialConsistencyState)
: PThread("XNodeSync"),
log("XNodeSync"),
forcePoolsUpdate(true),
forceTargetStatesUpdate(true),
forcePublishCapacities(true),
forceStoragePoolsUpdate(true),
forceCheckNetwork(false), // used via exchange() in syncLoop, so it must not start indeterminate
offlineWait(Program::getApp()->getConfig() ),
nodeConsistencyState(initialConsistencyState),
buddyResyncInProgress(false)
{
MirrorBuddyGroupMapper* mbg = Program::getApp()->getMetaBuddyGroupMapper();
MirrorBuddyState buddyState = mbg->getBuddyState(Program::getApp()->getLocalNodeNumID().val() );
if ((buddyState == BuddyState_PRIMARY)
&& (nodeConsistencyState == TargetConsistencyState_NEEDS_RESYNC))
offlineWait.startTimer();
}
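/* note: if this node is the primary of its buddy group but starts up in needs-resync state,
* the timer started here makes syncLoop() hold the local state at needs-resync without
* reporting it to the mgmtd (see the offlineWait.hasTimeout() branch there), presumably so the
* mgmtd can offline this node and a clean switchover to the secondary can happen first. */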
void InternodeSyncer::run()
{
try
{
registerSignalHandler();
syncLoop();
log.log(Log_DEBUG, "Component stopped.");
}
catch(std::exception& e)
{
PThread::getCurrentThreadApp()->handleComponentException(e);
}
}
void InternodeSyncer::syncLoop()
{
const App* app = Program::getApp();
const Config* cfg = app->getConfig();
const int sleepIntervalMS = 3*1000; // 3sec
// If (undocumented) sysUpdateTargetStatesSecs is set in config, use that value, otherwise
// default to 1/6 sysTargetOfflineTimeoutSecs.
const unsigned updateTargetStatesMS =
(cfg->getSysUpdateTargetStatesSecs() != 0)
? cfg->getSysUpdateTargetStatesSecs() * 1000
: cfg->getSysTargetOfflineTimeoutSecs() * 166; // secs * ~(1000/6), i.e. 1/6 of the timeout in ms
const unsigned updateCapacityPoolsMS = 4 * updateTargetStatesMS;
const unsigned metaCacheSweepNormalMS = 5*1000; // 5sec
const unsigned metaCacheSweepStressedMS = 2*1000; // 2sec
const unsigned idleDisconnectIntervalMS = 70*60*1000; /* 70 minutes (must be less than half the
streamlis idle disconnect interval to avoid cases where streamlis disconnects first) */
const unsigned updateIDTimeMS = 60 * 1000; // 1 min
const unsigned downloadNodesIntervalMS = 300000; // 5 min
const unsigned updateStoragePoolsMS = downloadNodesIntervalMS;
const unsigned checkNetworkIntervalMS = 60*1000; // 1 minute
Time lastCapacityUpdateT;
Time lastMetaCacheSweepT;
Time lastIdleDisconnectT;
Time lastTimeIDSet;
Time lastTargetStatesUpdateT;
Time lastDownloadNodesT;
Time lastStoragePoolsUpdateT;
Time lastCapacityPublishedT;
Time lastCheckNetworkT;
bool doRegisterLocalNode = false;
unsigned currentCacheSweepMS = metaCacheSweepNormalMS; // (adapted inside the loop below)
while(!waitForSelfTerminateOrder(sleepIntervalMS) )
{
const bool capacityPoolsUpdateForced = forcePoolsUpdate.exchange(false);
const bool doCapacityPoolsUpdate = capacityPoolsUpdateForced
|| (lastCapacityUpdateT.elapsedMS() > updateCapacityPoolsMS);
const bool doTargetStatesUpdate = forceTargetStatesUpdate.exchange(false)
|| (lastTargetStatesUpdateT.elapsedMS() > updateTargetStatesMS);
const bool doPublishCapacities = forcePublishCapacities.exchange(false)
|| (lastCapacityPublishedT.elapsedMS() > updateTargetStatesMS);
const bool doStoragePoolsUpdate = forceStoragePoolsUpdate.exchange(false)
|| (lastStoragePoolsUpdateT.elapsedMS() > updateStoragePoolsMS);
const bool doCheckNetwork = forceCheckNetwork.exchange(false)
|| (lastCheckNetworkT.elapsedMS() > checkNetworkIntervalMS);
if (doCheckNetwork)
{
if (checkNetwork())
doRegisterLocalNode = true;
lastCheckNetworkT.setToNow();
}
if (doRegisterLocalNode)
doRegisterLocalNode = !registerNode(app->getDatagramListener());
// download & sync nodes
if (lastDownloadNodesT.elapsedMS() > downloadNodesIntervalMS)
{
downloadAndSyncNodes();
downloadAndSyncTargetMappings();
lastDownloadNodesT.setToNow();
}
if (doStoragePoolsUpdate)
{
downloadAndSyncStoragePools();
lastStoragePoolsUpdateT.setToNow();
}
if(doCapacityPoolsUpdate)
{
updateMetaCapacityPools();
updateMetaBuddyCapacityPools();
if ( (capacityPoolsUpdateForced) && (!doStoragePoolsUpdate) )
{ // capacity pools changed, but storage pools were not downloaded (i.e. no update on
// storage capacity pools was made)
updateStorageCapacityPools();
updateTargetBuddyCapacityPools();
}
lastCapacityUpdateT.setToNow();
}
if(lastMetaCacheSweepT.elapsedMS() > currentCacheSweepMS)
{
bool flushTriggered = app->getMetaStore()->cacheSweepAsync();
currentCacheSweepMS = (flushTriggered ? metaCacheSweepStressedMS : metaCacheSweepNormalMS);
lastMetaCacheSweepT.setToNow();
}
if(lastIdleDisconnectT.elapsedMS() > idleDisconnectIntervalMS)
{
dropIdleConns();
lastIdleDisconnectT.setToNow();
}
if(lastTimeIDSet.elapsedMS() > updateIDTimeMS)
{
StorageTk::resetIDCounterToNow();
lastTimeIDSet.setToNow();
}
if(doTargetStatesUpdate)
{
if (this->offlineWait.hasTimeout() )
{
// if we're waiting to be offlined, set our local state to needs-resync and don't report
// anything to the mgmtd
setNodeConsistencyState(TargetConsistencyState_NEEDS_RESYNC);
}
else
{
TargetConsistencyState newConsistencyState;
if (updateMetaStatesAndBuddyGroups(newConsistencyState, true))
setNodeConsistencyState(newConsistencyState);
downloadAndSyncTargetStatesAndBuddyGroups();
}
lastTargetStatesUpdateT.setToNow();
}
if (doPublishCapacities)
{
publishNodeCapacity();
lastCapacityPublishedT.setToNow();
}
}
}
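/* The force* members polled at the top of the loop are atomics that other components can flip
* to trigger an immediate update on the next iteration instead of waiting for the regular
* interval. A caller would do something like the following (sketch; the setter name is an
* assumption, the actual accessors live in InternodeSyncer.h):
*
*    InternodeSyncer* syncer = Program::getApp()->getInternodeSyncer();
*    syncer->setForcePoolsUpdate(); // e.g. simply sets forcePoolsUpdate = true
*/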
/**
* Inspect the available and allowed network interfaces for any changes.
*
* @return true if the local NIC list changed (the caller should then re-register the node)
*/
bool InternodeSyncer::checkNetwork()
{
App* app = Program::getApp();
NicAddressList newLocalNicList;
bool res = false;
app->findAllowedInterfaces(newLocalNicList);
app->findAllowedRDMAInterfaces(newLocalNicList);
NicAddressList localNicList = app->getLocalNicList();
// compare sizes first: std::equal requires the second range to be at least as long as the first
if ( (newLocalNicList.size() != localNicList.size() )
|| !std::equal(newLocalNicList.begin(), newLocalNicList.end(), localNicList.begin() ) )
{
log.log(Log_NOTICE, "checkNetwork: local interfaces have changed");
app->updateLocalNicList(newLocalNicList);
res = true;
}
return res;
}
bool InternodeSyncer::updateMetaCapacityPools()
{
NodeCapacityPools* pools = Program::getApp()->getMetaCapacityPools();
auto downloadRes = downloadCapacityPools(CapacityPoolQuery_META);
if(!downloadRes.first)
return false;
// in this case, poolsMap only contains a single element, keyed by INVALID_POOL_ID
const auto& capacityPoolLists = downloadRes.second[StoragePoolStore::INVALID_POOL_ID];
pools->syncPoolsFromLists(capacityPoolLists);
return true;
}
bool InternodeSyncer::updateMetaBuddyCapacityPools()
{
NodeCapacityPools* pools = Program::getApp()->getMetaBuddyCapacityPools();
auto downloadRes = downloadCapacityPools(CapacityPoolQuery_METABUDDIES);
if(!downloadRes.first)
return false;
// in this case, poolsMap only contains a single element, keyed by INVALID_POOL_ID
const auto& capacityPoolLists = downloadRes.second[StoragePoolStore::INVALID_POOL_ID];
pools->syncPoolsFromLists(capacityPoolLists);
return true;
}
bool InternodeSyncer::updateStorageCapacityPools()
{
const App* app = Program::getApp();
const Config* cfg = app->getConfig();
const TargetMapper* targetMapper = app->getTargetMapper();
const auto downloadRes = downloadCapacityPools(CapacityPoolQuery_STORAGE);
if(!downloadRes.first)
return false;
TargetMap targetMap;
if(cfg->getSysTargetAttachmentMap() )
targetMap = *cfg->getSysTargetAttachmentMap(); // user-provided custom assignment map
else
targetMap = targetMapper->getMapping();
const GetNodeCapacityPoolsRespMsg::PoolsMap& poolsMap = downloadRes.second;
bool failed = false;
for (auto iter = poolsMap.begin(); iter != poolsMap.end(); iter++)
{
StoragePoolPtr storagePool = app->getStoragePoolStore()->getPool(iter->first);
if (!storagePool)
{
LOG(CAPACITY, ERR, "Received capacity pools for unknown storage pool.",
("storagePoolId", iter->first));
failed = true;
continue;
}
storagePool->getTargetCapacityPools()->syncPoolsFromLists(iter->second, targetMap);
}
return !failed;
}
bool InternodeSyncer::updateTargetBuddyCapacityPools()
{
const App* app = Program::getApp();
auto downloadRes = downloadCapacityPools(CapacityPoolQuery_STORAGEBUDDIES);
if(!downloadRes.first)
return false;
const GetNodeCapacityPoolsRespMsg::PoolsMap& poolsMap = downloadRes.second;
bool failed = false;
for (auto iter = poolsMap.begin(); iter != poolsMap.end(); iter++)
{
StoragePoolPtr storagePool = app->getStoragePoolStore()->getPool(iter->first);
if (!storagePool)
{
LOG(CAPACITY, ERR, "Received capacity pools for unknown storage pool.",
("storagePoolId", iter->first));
failed = true;
continue;
}
storagePool->getBuddyCapacityPools()->syncPoolsFromLists(iter->second);
}
return !failed;
}
/**
* @return a pair: first is false on error / true on success; second is the downloaded map of
* capacity pools, keyed by storage pool ID
*/
std::pair<bool, GetNodeCapacityPoolsRespMsg::PoolsMap> InternodeSyncer::downloadCapacityPools(
CapacityPoolQueryType poolType)
{
LOG(STATES, DEBUG, "Downloading capacity pools.",
("Pool type", GetNodeCapacityPoolsMsg::queryTypeToStr(poolType)));
NodeStore* mgmtNodes = Program::getApp()->getMgmtNodes();
auto node = mgmtNodes->referenceFirstNode();
if(!node)
return std::make_pair(false, GetNodeCapacityPoolsRespMsg::PoolsMap());
GetNodeCapacityPoolsMsg msg(poolType);
RequestResponseArgs rrArgs(node.get(), &msg, NETMSGTYPE_GetNodeCapacityPoolsResp);
GetNodeCapacityPoolsRespMsg* respMsgCast;
#ifndef BEEGFS_DEBUG
rrArgs.logFlags |= REQUESTRESPONSEARGS_LOGFLAG_CONNESTABLISHFAILED
| REQUESTRESPONSEARGS_LOGFLAG_RETRY;
#endif // BEEGFS_DEBUG
// connect & communicate
bool commRes = MessagingTk::requestResponse(&rrArgs);
if(!commRes)
return std::make_pair(false, GetNodeCapacityPoolsRespMsg::PoolsMap());
// handle result
respMsgCast = (GetNodeCapacityPoolsRespMsg*)rrArgs.outRespMsg.get();
GetNodeCapacityPoolsRespMsg::PoolsMap poolsMap = respMsgCast->getPoolsMap();
return std::make_pair(true, poolsMap);
}
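/* note: callers must check .first before using .second; on a missing mgmt node or a
* communication error an empty PoolsMap is returned and the current local pools are kept. */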
/**
* @return true if an ack was received for the heartbeat, false otherwise
*/
bool InternodeSyncer::registerNode(AbstractDatagramListener* dgramLis)
{
const char* logContext = "Register node";
static bool registrationFailureLogged = false; // to avoid log spamming
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
Config* cfg = app->getConfig();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return false;
Node& localNode = Program::getApp()->getLocalNode();
NumNodeID localNodeNumID = localNode.getNumID();
NumNodeID rootNodeID = app->getMetaRoot().getID();
bool rootIsBuddyMirrored = app->getMetaRoot().getIsMirrored();
NicAddressList nicList(localNode.getNicList());
HeartbeatMsg msg(localNode.getAlias(), localNodeNumID, NODETYPE_Meta, &nicList);
msg.setRootNumID(rootNodeID);
msg.setRootIsBuddyMirrored(rootIsBuddyMirrored);
msg.setPorts(cfg->getConnMetaPort(), cfg->getConnMetaPort() );
auto uuid = UUID::getMachineUUID();
if (uuid.empty()) {
LogContext(logContext).log(Log_CRITICAL,
"Couldn't determine UUID for machine. Node registration not possible.");
return false;
}
msg.setMachineUUID(uuid);
bool nodeRegistered = dgramLis->sendToNodeUDPwithAck(mgmtNode, &msg);
if(nodeRegistered)
LogContext(logContext).log(Log_WARNING, "Node registration successful.");
else
if(!registrationFailureLogged)
{
LogContext(logContext).log(Log_CRITICAL,
"Node registration not successful. Management node offline? Will keep on trying...");
registrationFailureLogged = true;
}
return nodeRegistered;
}
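/* note: registerNode() is driven from syncLoop(): once checkNetwork() detects a NIC change,
* doRegisterLocalNode stays true (and the heartbeat is resent on every loop iteration) until
* the mgmtd acks it. */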
/**
* Download and sync metadata server target states and mirror buddy groups.
*
* @param outConsistencyState the new node consistency state
* @param publish whether to push a local state change to the management node
* @return false if the management node is not defined, true otherwise
*/
bool InternodeSyncer::updateMetaStatesAndBuddyGroups(TargetConsistencyState& outConsistencyState,
bool publish)
{
LOG(STATES, DEBUG, "Starting state update.");
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
TargetStateStore* metaStateStore = app->getMetaStateStore();
MirrorBuddyGroupMapper* buddyGroupMapper = app->getMetaBuddyGroupMapper();
static bool downloadFailedLogged = false; // to avoid log spamming
static bool publishFailedLogged = false;
NumNodeID localNodeID = app->getLocalNodeNumID();
auto node = mgmtNodes->referenceFirstNode();
if(!node)
{
LOG(STATES, ERR, "Management node not defined.");
return false;
}
unsigned numRetries = 10; // If publishing states fails 10 times, give up (-> POFFLINE).
// Note: Publishing fails if between downloadStatesAndBuddyGroups and
// publishLocalTargetStateChanges, a state on the mgmtd is changed (e.g. because the primary
// sets NEEDS_RESYNC for the secondary). In that case, we will retry.
LOG(STATES, DEBUG, "Beginning target state update...");
bool publishSuccess = false;
while (!publishSuccess && (numRetries--) )
{
MirrorBuddyGroupMap buddyGroups;
TargetStateMap states;
bool downloadRes = NodesTk::downloadStatesAndBuddyGroups(*node, NODETYPE_Meta, buddyGroups,
states, true);
if (!downloadRes)
{
if (!downloadFailedLogged)
{
LOG(STATES, WARNING,
"Downloading target states from management node failed. "
"Setting all target states to probably-offline.");
downloadFailedLogged = true;
}
metaStateStore->setAllStates(TargetReachabilityState_POFFLINE);
break;
}
downloadFailedLogged = false;
// Sync buddy groups here, because decideResync depends on it.
metaStateStore->syncStatesAndGroups(buddyGroupMapper, states,
std::move(buddyGroups), localNodeID);
CombinedTargetState newStateFromMgmtd;
// Find local state which was sent by mgmtd
for (const auto& state : states)
{
if (state.first == localNodeID.val())
{
newStateFromMgmtd = CombinedTargetState(state.second.reachabilityState,
state.second.consistencyState);
}
}
TargetConsistencyState localChangedState = decideResync(newStateFromMgmtd);
outConsistencyState = localChangedState;
if (!publish)
{
metaStateStore->setState(localNodeID.val(),
CombinedTargetState(TargetReachabilityState_ONLINE, localChangedState) );
return true;
}
// Note: In this case "old" means "before we changed it locally".
TargetConsistencyState oldState = newStateFromMgmtd.consistencyState;
publishSuccess = publishNodeStateChange(oldState, localChangedState);
if (publishSuccess)
{
metaStateStore->setState(localNodeID.val(),
CombinedTargetState(TargetReachabilityState_ONLINE, localChangedState) );
BuddyCommTk::checkBuddyNeedsResync();
}
}
if (!publishSuccess)
{
if (!publishFailedLogged)
{
LOG(STATES, WARNING, "Pushing local state to management node failed.");
publishFailedLogged = true;
}
}
else
publishFailedLogged = false;
return true;
}
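/* In short, each pass of the retry loop above does the following (derived from the code):
*   1) download states and buddy groups from the mgmtd and sync them locally
*   2) derive our own consistency state via decideResync()
*   3) publish the (old state -> new state) transition back to the mgmtd; this fails if the
*      mgmtd-side state changed in between (e.g. our buddy set needs-resync for us), in which
*      case we re-download and retry, up to 10 times. */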
/**
* Synchronize local client sessions with registered mgmtd clients to release orphaned sessions.
*
* @param clientsList must be ordered; contained nodes will be removed and may no longer be
* accessed after calling this method.
* @param allowRemoteComm usually true; setting this to false is only useful when called during
* app shutdown to avoid communication; if false, unlocking of user locks, closing of storage server
* files and disposal of unlinked files won't be performed
*/
void InternodeSyncer::syncClients(const std::vector<NodeHandle>& clientsList, bool allowRemoteComm)
{
const char* logContext = "Sync clients";
App* app = Program::getApp();
MetaStore* metaStore = Program::getApp()->getMetaStore();
SessionStore* sessions = app->getSessions();
SessionStore* mirroredSessions = app->getMirroredSessions();
SessionList removedSessions;
NumNodeIDList unremovableSessions;
sessions->syncSessions(clientsList, removedSessions, unremovableSessions);
mirroredSessions->syncSessions(clientsList, removedSessions, unremovableSessions);
// print client session removal results (upfront)
if(!removedSessions.empty() || !unremovableSessions.empty())
{
std::ostringstream logMsgStream;
logMsgStream << "Removing " << removedSessions.size() << " client sessions.";
if(unremovableSessions.empty() )
LogContext(logContext).log(Log_DEBUG, logMsgStream.str() ); // no unremovable sessions
else
{ // unremovable sessions found => log warning
logMsgStream << " (" << unremovableSessions.size() << " are unremovable)";
LogContext(logContext).log(Log_WARNING, logMsgStream.str() );
}
}
// walk over all removed sessions (to cleanup the contained files)
SessionListIter sessionIter = removedSessions.begin();
for( ; sessionIter != removedSessions.end(); sessionIter++)
{ // walk over all client sessions: cleanup each session
Session* session = *sessionIter;
NumNodeID sessionID = session->getSessionID();
SessionFileStore* sessionFiles = session->getFiles();
SessionFileList removedSessionFiles;
UIntList referencedSessionFiles;
sessionFiles->removeAllSessions(&removedSessionFiles, &referencedSessionFiles);
/* note: referencedSessionFiles should always be empty, because otherwise the reference holder
would also hold a reference to the client session (and we wouldn't be here if the client
session had any references) */
// print session files results (upfront)
if (!removedSessionFiles.empty() || !referencedSessionFiles.empty())
{
std::ostringstream logMsgStream;
logMsgStream << "Removing " << removedSessionFiles.size() << " file sessions. ("
<< referencedSessionFiles.size() << " are unremovable). clientNumID: " << sessionID;
if (referencedSessionFiles.empty())
LogContext(logContext).log(Log_SPAM, logMsgStream.str() );
else
LogContext(logContext).log(Log_NOTICE, logMsgStream.str() );
}
// walk over all files in the current session (to clean them up)
SessionFileListIter fileIter = removedSessionFiles.begin();
for( ; fileIter != removedSessionFiles.end(); fileIter++)
{ // walk over all files: unlock user locks, close meta, close local, dispose unlinked
SessionFile* sessionFile = *fileIter;
unsigned ownerFD = sessionFile->getSessionID();
unsigned accessFlags = sessionFile->getAccessFlags();
unsigned numHardlinks;
unsigned numInodeRefs;
bool lastWriterClosed; // ignored here!
MetaFileHandle inode = sessionFile->releaseInode();
std::string fileID = inode->getEntryID();
std::string fileHandleID = SessionTk::generateFileHandleID(ownerFD, fileID);
// save nodeIDs for later
StripePattern* pattern = inode->getStripePattern();
int maxUsedNodesIndex = pattern->getStripeTargetIDs()->size() - 1;
// unlock all user locks
auto appendGranted = inode->flockAppendCancelByClientID(sessionID);
auto flockGranted = inode->flockEntryCancelByClientID(sessionID);
auto rangeGranted = inode->flockRangeCancelByClientID(sessionID);
if(allowRemoteComm)
{
LockingNotifier::notifyWaitersEntryLock(LockEntryNotifyType_APPEND,
inode->getReferenceParentID(), inode->getEntryID(), inode->getIsBuddyMirrored(),
std::move(appendGranted));
LockingNotifier::notifyWaitersEntryLock(LockEntryNotifyType_FLOCK,
inode->getReferenceParentID(), inode->getEntryID(), inode->getIsBuddyMirrored(),
std::move(flockGranted));
LockingNotifier::notifyWaitersRangeLock(inode->getReferenceParentID(),
inode->getEntryID(), inode->getIsBuddyMirrored(), std::move(rangeGranted));
}
EntryInfo* entryInfo = sessionFile->getEntryInfo();
FileIDLock lock(sessions->getEntryLockStore(), entryInfo->getEntryID(), true);
if(allowRemoteComm)
MsgHelperClose::closeChunkFile(sessionID, fileHandleID.c_str(),
maxUsedNodesIndex, *inode, entryInfo, NETMSG_DEFAULT_USERID);
LogContext(logContext).log(Log_NOTICE, "closing file. ParentID: " +
entryInfo->getParentEntryID() + " FileName: " + entryInfo->getFileName() );
metaStore->closeFile(entryInfo, std::move(inode), accessFlags, &numHardlinks,
&numInodeRefs, lastWriterClosed);
if(allowRemoteComm && !numHardlinks && !numInodeRefs)
MsgHelperClose::unlinkDisposableFile(fileID, NETMSG_DEFAULT_USERID,
entryInfo->getIsBuddyMirrored());
delete sessionFile;
} // end of files loop
delete(session);
} // end of client sessions loop
}
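/* usage note: this is called with allowRemoteComm=true from downloadAndSyncClients() below;
* the allowRemoteComm=false variant is meant for app shutdown, where closing chunk files,
* notifying lock waiters and disposing unlinked files over the network must be skipped. */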
bool InternodeSyncer::downloadAndSyncNodes()
{
LOG(STATES, DEBUG, "Starting node list sync.");
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return false;
Node& localNode = app->getLocalNode();
NodeStore* metaNodes = app->getMetaNodes();
NodeStore* storageNodes = app->getStorageNodes();
// metadata nodes
std::vector<NodeHandle> metaNodesList;
NumNodeIDList addedMetaNodes;
NumNodeIDList removedMetaNodes;
NumNodeID rootNodeID;
bool rootIsBuddyMirrored;
if(NodesTk::downloadNodes(*mgmtNode, NODETYPE_Meta, metaNodesList, true, &rootNodeID,
&rootIsBuddyMirrored) )
{
metaNodes->syncNodes(metaNodesList, &addedMetaNodes, &removedMetaNodes, &localNode);
if (app->getMetaRoot().setIfDefault(rootNodeID, rootIsBuddyMirrored))
{
LOG(STATES, CRITICAL,
std::string("Root node ID (from sync results): ") + rootNodeID.str());
app->getRootDir()->setOwnerNodeID(rootNodeID);
}
printSyncResults(NODETYPE_Meta, &addedMetaNodes, &removedMetaNodes);
}
// storage nodes
std::vector<NodeHandle> storageNodesList;
NumNodeIDList addedStorageNodes;
NumNodeIDList removedStorageNodes;
if(NodesTk::downloadNodes(*mgmtNode, NODETYPE_Storage, storageNodesList, true) )
{
storageNodes->syncNodes(storageNodesList, &addedStorageNodes, &removedStorageNodes,
&localNode);
printSyncResults(NODETYPE_Storage, &addedStorageNodes, &removedStorageNodes);
}
return true;
}
void InternodeSyncer::downloadAndSyncClients(bool requeue)
{
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
NodeStoreClients* clientNodes = app->getClientNodes();
TimerQueue* timerQ = app->getTimerQueue();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return;
std::vector<NodeHandle> clientNodesList;
NumNodeIDList addedClientNodes;
NumNodeIDList removedClientNodes;
if(NodesTk::downloadNodes(*mgmtNode, NODETYPE_Client, clientNodesList, true) )
{
clientNodes->syncNodes(clientNodesList, &addedClientNodes, &removedClientNodes);
printSyncResults(NODETYPE_Client, &addedClientNodes, &removedClientNodes);
syncClients(clientNodesList, true); // sync client sessions
}
if (requeue)
timerQ->enqueue(std::chrono::minutes(5),
[] { InternodeSyncer::downloadAndSyncClients(true); });
}
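/* note: this method reschedules itself on the app's TimerQueue every 5 minutes, so startup
* code only needs a single initial call like the following (sketch; the exact call site is an
* assumption):
*
*    app->getTimerQueue()->enqueue(std::chrono::minutes(5),
*       [] { InternodeSyncer::downloadAndSyncClients(true); });
*/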
bool InternodeSyncer::downloadAndSyncTargetMappings()
{
LOG(STATES, DEBUG, "Syncing target mappings.");
App* app = Program::getApp();
TargetMapper* targetMapper = app->getTargetMapper();
NodeStore* mgmtNodes = app->getMgmtNodes();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return false;
auto mappings = NodesTk::downloadTargetMappings(*mgmtNode, true);
if (mappings.first)
targetMapper->syncTargets(std::move(mappings.second));
return true;
}
bool InternodeSyncer::downloadAndSyncStoragePools()
{
LOG(STORAGEPOOLS, DEBUG, "Syncing storage pools.");
const App* app = Program::getApp();
const NodeStore* mgmtNodes = app->getMgmtNodes();
StoragePoolStore* storagePoolStore = app->getStoragePoolStore();
const auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return false;
StoragePoolPtrVec storagePools;
// note: storage pool download does include capacity pools
const bool downloadStoragePoolsRes =
NodesTk::downloadStoragePools(*mgmtNode, storagePools, true);
if (!downloadStoragePoolsRes)
return false;
storagePoolStore->syncFromVector(storagePools);
return true;
}
/**
* Download and sync storage target states and mirror buddy groups.
*/
bool InternodeSyncer::downloadAndSyncTargetStatesAndBuddyGroups()
{
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
MirrorBuddyGroupMapper* buddyGroupMapper = app->getStorageBuddyGroupMapper();
TargetStateStore* targetStates = app->getTargetStateStore();
LOG(STATES, DEBUG, "Downloading target states and buddy groups");
static bool downloadFailedLogged = false;
auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return false;
TargetStateMap states;
MirrorBuddyGroupMap buddyGroups;
bool downloadRes = NodesTk::downloadStatesAndBuddyGroups(*mgmtNode, NODETYPE_Storage,
buddyGroups, states, true);
if ( downloadRes )
{
targetStates->syncStatesAndGroups(buddyGroupMapper, states,
std::move(buddyGroups), app->getLocalNode().getNumID());
downloadFailedLogged = false;
}
else
{ // download failed, so we don't know actual status => carefully set all to poffline
if(!downloadFailedLogged)
{
LOG(STATES, WARNING, "Download from management node failed. "
"Setting all targets to probably-offline.");
downloadFailedLogged = true;
}
targetStates->setAllStates(TargetReachabilityState_POFFLINE);
}
return true;
}
void InternodeSyncer::printSyncResults(NodeType nodeType, NumNodeIDList* addedNodes,
NumNodeIDList* removedNodes)
{
Logger* const logger = Logger::getLogger();
const int logLevel = nodeType == NODETYPE_Client ? Log_DEBUG : Log_WARNING;
if (!addedNodes->empty())
logger->log(LogTopic_STATES, logLevel, __func__,
std::string("Nodes added (sync results): ") +
StringTk::uintToStr(addedNodes->size() ) +
" (Type: " + boost::lexical_cast<std::string>(nodeType) + ")");
if (!removedNodes->empty())
logger->log(LogTopic_STATES, logLevel, __func__,
std::string("Nodes removed (sync results): ") +
StringTk::uintToStr(removedNodes->size() ) +
" (Type: " + boost::lexical_cast<std::string>(nodeType) + ")");
}
/**
* Decides new consistency state based on local state, buddy group membership and state fetched
* from management node.
*
* @param newState New state from the management node.
* @returns The new target consistency state.
*/
TargetConsistencyState InternodeSyncer::decideResync(const CombinedTargetState newState)
{
App* app = Program::getApp();
MirrorBuddyGroupMapper* metaBuddyGroupMapper = app->getMetaBuddyGroupMapper();
InternodeSyncer* internodeSyncer = app->getInternodeSyncer();
NumNodeID localNodeID = app->getLocalNodeNumID();
MirrorBuddyState buddyState = metaBuddyGroupMapper->getBuddyState(localNodeID.val());
if (buddyState == BuddyState_UNMAPPED)
return TargetConsistencyState_GOOD;
// If the consistency state is BAD, it stays BAD until admin intervenes.
if (internodeSyncer && // during early startup, INS is not constructed yet.
internodeSyncer->getNodeConsistencyState() == TargetConsistencyState_BAD)
return TargetConsistencyState_BAD;
const bool isResyncing = newState.consistencyState == TargetConsistencyState_NEEDS_RESYNC;
const bool isBad = newState.consistencyState == TargetConsistencyState_BAD;
if (internodeSyncer &&
internodeSyncer->getNodeConsistencyState() == TargetConsistencyState_NEEDS_RESYNC)
{
// If we're already (locally) marked as needs-resync, this state can only be left when our
// (primary) buddy tells us the resync is finished.
return TargetConsistencyState_NEEDS_RESYNC;
}
else if (isResyncing || isBad)
{
return TargetConsistencyState_NEEDS_RESYNC;
}
else
{
// If the mgmtd reports this node as (P)OFFLINE, we know better (we are evidently up and
// running), so we set our state to GOOD. Otherwise we accept the state reported by the mgmtd.
if ( (newState.reachabilityState == TargetReachabilityState_OFFLINE)
|| (newState.reachabilityState == TargetReachabilityState_POFFLINE) )
return TargetConsistencyState_GOOD;
else
return newState.consistencyState;
}
}
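/* Summary of the decision above:
*   buddy state unmapped                    -> GOOD
*   local state BAD                         -> BAD (sticky until admin intervenes)
*   local state NEEDS_RESYNC                -> NEEDS_RESYNC (cleared only via our primary)
*   mgmtd reports NEEDS_RESYNC or BAD       -> NEEDS_RESYNC
*   mgmtd reports (P)OFFLINE                -> GOOD (we are evidently running)
*   otherwise                               -> consistency state reported by the mgmtd
*/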
bool InternodeSyncer::publishNodeStateChange(const TargetConsistencyState oldState,
const TargetConsistencyState newState)
{
const char* logContext = "Publish node state";
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
const NumNodeID localNodeID = app->getLocalNodeNumID();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if (!mgmtNode)
{
LogContext(logContext).logErr("Management node not defined.");
return false;
}
bool res;
UInt8List oldStateList(1, oldState);
UInt8List newStateList(1, newState);
UInt16List nodeIDList(1, localNodeID.val());
ChangeTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDList, &oldStateList, &newStateList);
RequestResponseArgs rrArgs(mgmtNode.get(), &msg, NETMSGTYPE_ChangeTargetConsistencyStatesResp);
#ifndef BEEGFS_DEBUG
rrArgs.logFlags |= REQUESTRESPONSEARGS_LOGFLAG_CONNESTABLISHFAILED
| REQUESTRESPONSEARGS_LOGFLAG_RETRY;
#endif // BEEGFS_DEBUG
bool sendRes = MessagingTk::requestResponse(&rrArgs);
static bool failureLogged = false;
if (!sendRes)
{
if (!failureLogged)
LogContext(logContext).log(Log_CRITICAL, "Pushing node state to management node failed.");
res = false;
failureLogged = true;
}
else
{
const auto respMsgCast =
static_cast<const ChangeTargetConsistencyStatesRespMsg*>(rrArgs.outRespMsg.get());
if ( (FhgfsOpsErr)respMsgCast->getValue() != FhgfsOpsErr_SUCCESS)
{
LogContext(logContext).log(Log_CRITICAL, "Management node did not accept node state change.");
res = false;
}
else
{
res = true;
}
failureLogged = false;
}
return res;
}
/**
* Send local node free space / free inode info to management node.
*/
void InternodeSyncer::publishNodeCapacity()
{
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if (!mgmtNode)
{
log.logErr("Management node not defined.");
return;
}
int64_t sizeTotal = 0;
int64_t sizeFree = 0;
int64_t inodesTotal = 0;
int64_t inodesFree = 0;
std::string metaPath = app->getMetaPath();
getStatInfo(&sizeTotal, &sizeFree, &inodesTotal, &inodesFree);
StorageTargetInfo targetInfo(app->getLocalNodeNumID().val(), metaPath, sizeTotal, sizeFree,
inodesTotal, inodesFree, getNodeConsistencyState());
// Note: As long as we don't have meta-HA, consistency state will always be GOOD.
StorageTargetInfoList targetInfoList(1, targetInfo);
SetStorageTargetInfoMsg msg(NODETYPE_Meta, &targetInfoList);
RequestResponseArgs rrArgs(mgmtNode.get(), &msg, NETMSGTYPE_SetStorageTargetInfoResp);
#ifndef BEEGFS_DEBUG
rrArgs.logFlags |= REQUESTRESPONSEARGS_LOGFLAG_CONNESTABLISHFAILED
| REQUESTRESPONSEARGS_LOGFLAG_RETRY;
#endif // BEEGFS_DEBUG
bool sendRes = MessagingTk::requestResponse(&rrArgs);
static bool failureLogged = false;
if (!sendRes)
{
if (!failureLogged)
log.log(Log_CRITICAL, "Pushing node free space to management node failed.");
failureLogged = true;
return;
}
else
{
const auto respMsgCast = (const SetStorageTargetInfoRespMsg*)rrArgs.outRespMsg.get();
failureLogged = false;
if ( (FhgfsOpsErr)respMsgCast->getValue() != FhgfsOpsErr_SUCCESS)
{
log.log(Log_CRITICAL, "Management node did not accept free space info message.");
return;
}
}
// If we were just started and are publishing our capacity for the first time, force a pool
// refresh on the mgmtd so we're not stuck in the emergency pool until the first regular
// pool refresh.
static bool firstTimePublished = true;
if (firstTimePublished)
{
forceMgmtdPoolsRefresh();
firstTimePublished = false;
}
}
/**
* Drop/reset idle conns from all server stores.
*/
void InternodeSyncer::dropIdleConns()
{
App* app = Program::getApp();
unsigned numDroppedConns = 0;
numDroppedConns += dropIdleConnsByStore(app->getMgmtNodes() );
numDroppedConns += dropIdleConnsByStore(app->getMetaNodes() );
numDroppedConns += dropIdleConnsByStore(app->getStorageNodes() );
if(numDroppedConns)
{
log.log(Log_DEBUG, "Dropped idle connections: " + StringTk::uintToStr(numDroppedConns) );
}
}
/**
* Walk over all nodes in the given store and drop/reset idle connections.
*
* @return number of dropped connections
*/
unsigned InternodeSyncer::dropIdleConnsByStore(NodeStoreServers* nodes)
{
App* app = Program::getApp();
unsigned numDroppedConns = 0;
for (const auto& node : nodes->referenceAllNodes())
{
/* don't do any idle disconnect stuff with local node
(the LocalNodeConnPool doesn't support and doesn't need this kind of treatment) */
if (node.get() != &app->getLocalNode())
{
NodeConnPool* connPool = node->getConnPool();
numDroppedConns += connPool->disconnectAndResetIdleStreams();
}
}
return numDroppedConns;
}
void InternodeSyncer::getStatInfo(int64_t* outSizeTotal, int64_t* outSizeFree,
int64_t* outInodesTotal, int64_t* outInodesFree)
{
const char* logContext = "GetStorageTargetInfoMsg (stat path)";
std::string targetPathStr = Program::getApp()->getMetaPath();
bool statSuccess = StorageTk::statStoragePath(targetPathStr, outSizeTotal, outSizeFree,
outInodesTotal, outInodesFree);
if(unlikely(!statSuccess) )
{ // error
LogContext(logContext).logErr("Unable to statfs() storage path: " + targetPathStr +
" (SysErr: " + System::getErrString() );
*outSizeTotal = -1;
*outSizeFree = -1;
*outInodesTotal = -1;
*outInodesFree = -1;
}
// read and use value from manual free space override file (if it exists)
StorageTk::statStoragePathOverride(targetPathStr, outSizeFree, outInodesFree);
}
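/* note: statStoragePathOverride() lets an admin override the reported free space via a manual
* override file in the metadata directory; if that file exists, the values read from it
* replace the statfs() results (see StorageTk for the exact file name and format). */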
/**
* Tell mgmtd to update its capacity pools.
*/
void InternodeSyncer::forceMgmtdPoolsRefresh()
{
App* app = Program::getApp();
DatagramListener* dgramLis = app->getDatagramListener();
NodeStoreServers* mgmtNodes = app->getMgmtNodes();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if (!mgmtNode)
{
log.log(Log_DEBUG, "Management node not defined.");
return;
}
RefreshCapacityPoolsMsg msg;
bool ackReceived = dgramLis->sendToNodeUDPwithAck(mgmtNode, &msg);
if (!ackReceived)
log.log(Log_DEBUG, "Management node did not accept pools refresh request.");
}
/**
* @return false on error
*/
bool InternodeSyncer::downloadExceededQuotaList(StoragePoolId storagePoolId, QuotaDataType idType,
QuotaLimitType exType, UIntList* outIDList, FhgfsOpsErr& error)
{
App* app = Program::getApp();
NodeStoreServers* mgmtNodes = app->getMgmtNodes();
bool retVal = false;
auto mgmtNode = mgmtNodes->referenceFirstNode();
if(!mgmtNode)
return false;
RequestExceededQuotaMsg msg(idType, exType, storagePoolId);
RequestExceededQuotaRespMsg* respMsgCast = NULL;
const auto respMsg = MessagingTk::requestResponse(*mgmtNode, msg,
NETMSGTYPE_RequestExceededQuotaResp);
if (!respMsg)
goto err_exit;
// handle result
respMsgCast = (RequestExceededQuotaRespMsg*)respMsg.get();
respMsgCast->getExceededQuotaIDs()->swap(*outIDList);
error = respMsgCast->getError();
retVal = true;
err_exit:
return retVal;
}
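/* note: the FhgfsOpsErr returned in the error parameter reflects the mgmtd's quota status:
* FhgfsOpsErr_NOTSUPP means quota enforcement is disabled on the management daemon;
* downloadAllExceededQuotaLists() below uses this to sync the local quotaEnableEnforcement
* setting. */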
bool InternodeSyncer::downloadAllExceededQuotaLists(const StoragePoolPtrVec& storagePools)
{
bool retVal = true;
// note: this is fairly inefficient, but it is done only once on startup
for (auto iter = storagePools.begin(); iter != storagePools.end(); iter++)
{
if (!downloadAllExceededQuotaLists(*iter))
retVal = false;
}
return retVal;
}
/**
* @return false on error
*/
bool InternodeSyncer::downloadAllExceededQuotaLists(const StoragePoolPtr storagePool)
{
const char* logContext = "Exceeded quota sync";
App* app = Program::getApp();
Config* cfg = app->getConfig();
UInt16Set targets = storagePool->getTargets();
bool retVal = true;
UIntList tmpExceededUIDsSize;
UIntList tmpExceededGIDsSize;
UIntList tmpExceededUIDsInode;
UIntList tmpExceededGIDsInode;
FhgfsOpsErr error;
if (downloadExceededQuotaList(storagePool->getId(), QuotaDataType_USER, QuotaLimitType_SIZE,
&tmpExceededUIDsSize, error) )
{
// update exceeded store for every target in the pool
for (auto iter = targets.begin(); iter != targets.end(); iter++)
{
uint16_t targetId = *iter;
ExceededQuotaStorePtr exceededQuotaStore = app->getExceededQuotaStores()->get(targetId);
if (!exceededQuotaStore)
{
LOG(STORAGEPOOLS, ERR, "Could not access exceeded quota store in file size quota for users.", targetId);
retVal = false;
break;
}
exceededQuotaStore->updateExceededQuota(&tmpExceededUIDsSize, QuotaDataType_USER,
QuotaLimitType_SIZE);
}
// enable or disable quota enforcement
if (error == FhgfsOpsErr_NOTSUPP)
{
if (cfg->getQuotaEnableEnforcement())
{
LogContext(logContext).log(Log_DEBUG,
"Quota enforcement is enabled in the configuration of this metadata server, "
"but not on the management daemon. "
"The configuration from the management daemon overrides the local setting.");
}
else
{
LogContext(logContext).log(Log_DEBUG, "Quota enforcement disabled by management daemon.");
}
cfg->setQuotaEnableEnforcement(false);
return true;
}
else
{
if (!cfg->getQuotaEnableEnforcement())
{
LogContext(logContext).log(Log_DEBUG,
"Quota enforcement is enabled on the management daemon, "
"but not in the configuration of this metadata server. "
"The configuration from the management daemon overrides the local setting.");
}
else
{
LogContext(logContext).log(Log_DEBUG, "Quota enforcement enabled by management daemon.");
}
cfg->setQuotaEnableEnforcement(true);
}
}
else
{ // error
LogContext(logContext).logErr("Unable to download exceeded file size quota for users.");
retVal = false;
}
if (downloadExceededQuotaList(storagePool->getId(), QuotaDataType_GROUP, QuotaLimitType_SIZE,
&tmpExceededGIDsSize, error))
{
for (auto iter = targets.begin(); iter != targets.end(); iter++)
{
uint16_t targetId = *iter;
ExceededQuotaStorePtr exceededQuotaStore = app->getExceededQuotaStores()->get(targetId);
if (!exceededQuotaStore)
{
LOG(STORAGEPOOLS, ERR, "Could not access exceeded quota store in file size quota for groups.", targetId);
retVal = false;
break;
}
exceededQuotaStore->updateExceededQuota(&tmpExceededGIDsSize, QuotaDataType_GROUP,
QuotaLimitType_SIZE);
}
}
else
{ // error
LogContext(logContext).logErr("Unable to download exceeded file size quota for groups.");
retVal = false;
}
if (downloadExceededQuotaList(storagePool->getId(), QuotaDataType_USER, QuotaLimitType_INODE,
&tmpExceededUIDsInode, error))
{
for (auto iter = targets.begin(); iter != targets.end(); iter++)
{
uint16_t targetId = *iter;
ExceededQuotaStorePtr exceededQuotaStore = app->getExceededQuotaStores()->get(targetId);
if (!exceededQuotaStore)
{
LOG(STORAGEPOOLS, ERR, "Could not access exceeded quota store in file number quota for users.", targetId);
retVal = false;
break;
}
exceededQuotaStore->updateExceededQuota(&tmpExceededUIDsInode, QuotaDataType_USER,
QuotaLimitType_INODE);
}
}
else
{ // error
LogContext(logContext).logErr("Unable to download exceeded file number quota for users.");
retVal = false;
}
if (downloadExceededQuotaList(storagePool->getId(), QuotaDataType_GROUP, QuotaLimitType_INODE,
&tmpExceededGIDsInode, error))
{
for (auto iter = targets.begin(); iter != targets.end(); iter++)
{
uint16_t targetId = *iter;
ExceededQuotaStorePtr exceededQuotaStore = app->getExceededQuotaStores()->get(targetId);
if (!exceededQuotaStore)
{
LOG(STORAGEPOOLS, ERR, "Could not access exceeded quota store in file number quota for groups.", targetId);
retVal = false;
break;
}
exceededQuotaStore->updateExceededQuota(&tmpExceededGIDsInode, QuotaDataType_GROUP,
QuotaLimitType_INODE);
}
}
else
{ // error
LogContext(logContext).logErr("Unable to download exceeded file number quota for groups.");
retVal = false;
}
return retVal;
}
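/* usage sketch (hypothetical call site; per the note above, this is expected to run once
* during startup, after the storage pools have been downloaded):
*
*    StoragePoolPtrVec pools; // filled e.g. via NodesTk::downloadStoragePools()
*    if (!internodeSyncer->downloadAllExceededQuotaLists(pools))
*       LOG(STORAGEPOOLS, WARNING, "Downloading exceeded quota lists failed.");
*/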