beegfs/meta/source/net/message/MirroredMessage.h
2025-08-10 01:34:16 +02:00

446 lines
18 KiB
C++

#pragma once
#include <app/App.h>
#include <common/app/log/Logger.h>
#include <common/components/streamlistenerv2/IncomingPreprocessedMsgWork.h>
#include <common/net/message/session/AckNotifyMsg.h>
#include <common/net/message/session/AckNotifyRespMsg.h>
#include <common/net/message/NetMessage.h>
#include <common/storage/StorageErrors.h>
#include <common/toolkit/DebugVariable.h>
#include <common/toolkit/MessagingTk.h>
#include <program/Program.h>
#include <session/MirrorMessageResponseState.h>
#include <toolkit/BuddyCommTk.h>

#include <type_traits>
#include <utility>
// Base class template for messages that may have to be processed on a buddy-mirrored entity.
//
// processIncoming() is the entry point: on a mirrored message without
// Flag_BuddyMirrorSecond it calls lock(), then it looks up the sequence number slot of the
// requestor (if the message carries a sequence number), runs executeLocally() and passes the
// returned response state to finishOperation(), which talks to the secondary
// (forwardToSecondary() / notifySecondaryOfACK()) and sends the response.
//
// BaseT is the message class this template derives from; LockStateT is whatever lock()
// returns for the concrete message.
template<typename BaseT, typename LockStateT>
class MirroredMessage : public BaseT
{
protected:
// lets derived classes refer to this template instantiation by a short name.
typedef MirroredMessage BaseType;
// resync job obtained from the BuddyResyncer; nullptr until processIncoming() or
// finishOperation() assigns it.
BuddyResyncJob* resyncJob;
// result of lock(); assigned in processIncoming(), reset at the end of finishOperation().
LockStateT lockState;
MirroredMessage():
resyncJob(nullptr)
{}
// evaluates the response the secondary sent for a forwarded message. sendToSecondary()
// compares the returned code with the result it was told to expect.
virtual FhgfsOpsErr processSecondaryResponse(NetMessage& resp) = 0;
// log context used by sendToSecondary() for mirroring related log messages.
virtual const char* mirrorLogContext() const = 0;
// performs the operation on this node. isSecondary is true if the message is mirrored and
// carries Flag_BuddyMirrorSecond. the returned state is handed to finishOperation() by
// processIncoming(); a null return is skipped there (see earlyComplete()).
virtual std::unique_ptr<MirroredMessageResponseState> executeLocally(
NetMessage::ResponseContext& ctx, bool isSecondary) = 0;
// whether this message is to be handled as a mirrored message.
virtual bool isMirrored() = 0;
// IMPORTANT NOTE ON LOCKING ORDER:
// * always take locks in the order
// - HashDirLock
// - DirIDLock
// - ParentNameLock
// - FileIDLock
// * always take locks of each type with the order induced by:
// - HashDirLock: id
// - DirIDLock: (id, forWrite)
// - ParentNameLock: (parentID, name)
// - FileIDLock: id
//
// not doing this may result in deadlocks.
//
// called by processIncoming() with the entry lock store of the mirrored sessions; the result
// is kept in lockState.
virtual LockStateT lock(EntryLockStore& store) = 0;
// called by finishOperation() for mirrored messages without Flag_BuddyMirrorSecond whose
// response state reports changesObservableState().
virtual void forwardToSecondary(NetMessage::ResponseContext& ctx) = 0;
// Entry point for processing the message.
//
// Takes the entry locks (mirrored messages without Flag_BuddyMirrorSecond only), resolves the
// sequence number slot for messages that carry a sequence number, and then either replays /
// rejects a request whose slot already existed or executes the operation locally and finishes
// it via finishOperation().
//
// @param ctx response context used to send the response
// @return always true
virtual bool processIncoming(NetMessage::ResponseContext& ctx)
{
Session* session = nullptr;
// stays true unless acquireMirrorStateSlot*() reports that the slot already existed
bool isNewState = true;
// mirrored message that does not carry the "second" flag: remember the resync job if a resync
// is in progress and take the locks before anything else happens.
if (isMirrored() && !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond))
{
if (Program::getApp()->getInternodeSyncer()->getResyncInProgress())
resyncJob = Program::getApp()->getBuddyResyncer()->getResyncJob();
lockState = lock(*Program::getApp()->getMirroredSessions()->getEntryLockStore());
}
// make sure that the thread change set is *always* cleared when we leave this method.
struct _ClearChangeSet {
~_ClearChangeSet()
{
if (BuddyResyncer::getSyncChangeset())
{
LOG(MIRRORING, WARNING, "Abandoning sync changeset");
BuddyResyncer::abandonSyncChangeset();
}
}
} _clearChangeSet;
// the guard is only needed for its destructor (presumably this cast silences an
// unused-variable warning)
(void) _clearChangeSet;
mirrorState.reset();
// mirrored messages work on the mirrored session of the original requestor; the reference
// taken here is released at the exit label.
if (isMirrored())
{
const auto nodeID = this->getRequestorID(ctx).second;
session = Program::getApp()->getMirroredSessions()->referenceSession(nodeID, true);
}
if (isMirrored() && this->hasFlag(NetMessageHeader::Flag_HasSequenceNumber))
{
// special case: client has not been told where to start its sequence. in this case,
// we want to answer with only the new seqNoBase for the client, and do NO processing.
if (this->getSequenceNumber() == 0)
{
GenericResponseMsg response(GenericRespMsgCode_NEWSEQNOBASE, "New seqNoBase");
response.addFlag(NetMessageHeader::Flag_HasSequenceNumber);
response.setSequenceNumber(session->getSeqNoBase());
ctx.sendResponse(response);
goto exit;
}
// a note on locking of mirrorState. since clients process each request in only one
// thread, per client we can have only one request for a given sequence number at any
// given time. retries may reuse the same sequence number, and they may be processed in
// a different thread on the server, but no two threads process the same sequence number
// from the same client at the same time. thus, no locking for the actual structure is
// needed, but extra memory barriers to ensure propagation of results between threads
// are necessary.
__sync_synchronize();
// both variants yield the slot for this sequence number plus a flag telling whether the
// slot is new.
if (this->hasFlag(NetMessageHeader::Flag_IsSelectiveAck))
std::tie(mirrorState, isNewState) = session->acquireMirrorStateSlotSelective(
this->getSequenceNumberDone(),
this->getSequenceNumber());
else
std::tie(mirrorState, isNewState) = session->acquireMirrorStateSlot(
this->getSequenceNumberDone(),
this->getSequenceNumber());
}
// the slot already existed: if a response has been stored in it, send that response again
// instead of executing the operation a second time; otherwise tell the client to try again.
if (!isNewState)
{
if (mirrorState->response)
mirrorState->response->sendResponse(ctx);
else
ctx.sendResponse(
GenericResponseMsg(
GenericRespMsgCode_TRYAGAIN,
"Request for same sequence number is currently in progress"));
}
else
{
// while a resync job is running, register a sync changeset and announce the operation to
// the job. finishOperation() / buddyResyncNotify() commit or abandon the changeset.
if (resyncJob && resyncJob->isRunning())
{
BuddyResyncer::registerSyncChangeset();
resyncJob->registerOps();
}
auto responseState = executeLocally(ctx,
isMirrored() && this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond));
// responseState may be null if the message has called earlyComplete(). do not finish
// the operation twice in this case.
if (responseState)
finishOperation(ctx, std::move(responseState));
}
exit:
// drop the session reference taken above (session is null for non-mirrored messages)
if (session)
Program::getApp()->getMirroredSessions()->releaseSession(session);
return true;
}
// Finishes the operation immediately with the given response state and then hands the socket
// of the response context back via IncomingPreprocessedMsgWork::releaseSocket().
//
// processIncoming() skips finishOperation() when executeLocally() returns null, so an
// implementation that calls this is expected to return null afterwards to avoid finishing the
// operation twice.
//
// `state` is a forwarding reference: the heap copy is built from the decayed type and the
// argument is forwarded, so rvalues are moved (as before) and lvalues are copied rather than
// failing to compile or being moved from silently.
//
// @param ctx response context of the request being processed
// @param state response state to finish the operation with
template<typename ResponseT>
void earlyComplete(NetMessage::ResponseContext& ctx, ResponseT&& state)
{
   typedef typename std::decay<ResponseT>::type StateType;

   finishOperation(ctx, boost::make_unique<StateType>(std::forward<ResponseT>(state)));

   Socket* sock = ctx.getSocket();
   IncomingPreprocessedMsgWork::releaseSocket(Program::getApp(), &sock, this);
}
// Settles the sync changeset of the current operation, if there is one: the changeset is
// committed when the message is mirrored, does not carry Flag_BuddyMirrorSecond and
// stateChanged is set; in every other case it is abandoned.
//
// @param ctx response context of the request (not used here)
// @param stateChanged whether the operation modified state
void buddyResyncNotify(NetMessage::ResponseContext& ctx, bool stateChanged)
{
   // pairs with the memory barrier before acquireMirrorStateSlot
   __sync_synchronize();

   // nothing was registered for this operation, so there is nothing to settle
   if (!BuddyResyncer::getSyncChangeset())
      return;

   const bool commitChanges = isMirrored()
      && !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond)
      && stateChanged;

   if (commitChanges)
      BuddyResyncer::commitThreadChangeSet();
   else
      BuddyResyncer::abandonSyncChangeset();
}
// Completes an operation that was executed locally.
//
// In order: talks to the secondary (mirrored messages without Flag_BuddyMirrorSecond only),
// stores the response state in the sequence number slot (if one was acquired), commits or
// abandons the sync changeset, sends the response and finally resets lockState.
//
// @param ctx response context used for forwarding and for sending the response
// @param state response state of the operation; may be null, in which case nothing is
//    forwarded and no response is sent
void finishOperation(NetMessage::ResponseContext& ctx,
std::unique_ptr<MirroredMessageResponseState> state)
{
// keep a raw pointer: ownership of state may move into mirrorState below, but the response
// still has to be inspected and sent afterwards.
auto* responsePtr = state.get();
if (isMirrored() &&
!this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond) &&
state)
{
// operations that changed observable state are forwarded to the secondary; for all others
// the secondary only gets an ack notification.
if (state->changesObservableState())
forwardToSecondary(ctx);
else
notifySecondaryOfACK(ctx);
}
// store the response in the sequence number slot; processIncoming() sends a stored response
// again when it finds an already existing slot.
if (mirrorState)
mirrorState->response = std::move(state);
// pairs with the memory barrier before acquireMirrorStateSlot
__sync_synchronize();
if (BuddyResyncer::getSyncChangeset())
{
resyncJob = Program::getApp()->getBuddyResyncer()->getResyncJob();
// only changes made by a mirrored message without the "second" flag that altered observable
// state are committed; everything else is abandoned.
if (isMirrored() &&
!this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond) &&
responsePtr &&
responsePtr->changesObservableState())
BuddyResyncer::commitThreadChangeSet();
else
BuddyResyncer::abandonSyncChangeset();
// counterpart of the registerOps() call in processIncoming()
resyncJob->unregisterOps();
}
if (responsePtr)
responsePtr->sendResponse(ctx);
// reset to a default-constructed lock state (presumably this releases the locks taken in
// processIncoming())
lockState = {};
}
// Sends an ack notification to the secondary through sendToSecondary(). finishOperation()
// uses this instead of forwardToSecondary() when the operation did not change observable
// state.
//
// @param ctx response context of the request being processed
void notifySecondaryOfACK(NetMessage::ResponseContext& ctx)
{
   // if the secondary does not respond with SUCCESS, it will automatically be set to
   // needs-resync. eventually, resync will clear the secondary sessions entirely, which will
   // also flush the sequence number store.
   AckNotifiyMsg ackNotification;
   sendToSecondary(ctx, ackNotification, NETMSGTYPE_AckNotifyResp);
}
// Hook called by sendToSecondary() on the request/response arguments before the message is
// sent to the secondary. The default implementation leaves the arguments untouched.
virtual void prepareMirrorRequestArgs(RequestResponseArgs& args) {}
// Sends a message to the secondary of the local buddy group and checks its answer.
//
// Nothing is sent while a resync is in progress. If the secondary is not online/good and no
// resync job is running or has finished successfully, or if communication fails, or if the
// secondary's result differs from expectedResult, the buddy is marked as needs-resync.
//
// @param ctx response context of the request being processed; used to determine the original
//    requestor
// @param message message to send; may be this very object
// @param respType message type of the expected response
// @param expectedResult result that processSecondaryResponse() is expected to return
template<typename T>
void sendToSecondary(NetMessage::ResponseContext& ctx, MirroredMessageBase<T>& message,
unsigned respType, FhgfsOpsErr expectedResult = FhgfsOpsErr_SUCCESS)
{
App* app = Program::getApp();
NodeStoreServers* metaNodes = app->getMetaNodes();
MirrorBuddyGroupMapper* buddyGroups = app->getMetaBuddyGroupMapper();
// optional artificial delay in seconds before forwarding (presumably taken from the
// BEEGFS_FORWARD_DELAY_SECS environment variable - see DEBUG_ENV_VAR)
DEBUG_ENV_VAR(unsigned, FORWARD_DELAY, 0, "BEEGFS_FORWARD_DELAY_SECS");
if (FORWARD_DELAY)
sleep(FORWARD_DELAY);
// if a resync is currently running, abort right here, immediately. we do not need to know
// the exact state of the buddy: a resync is running. it's bad.
if (app->getInternodeSyncer()->getResyncInProgress())
return;
// check whether the secondary is viable at all: if it is not online and good,
// communicating will not do any good. even online/needs-resync must be skipped, because
// the resyncer must be the only entity that changes the secondary as long as it is not
// good yet.
{
CombinedTargetState secondaryState;
NumNodeID secondaryID(buddyGroups->getSecondaryTargetID(
buddyGroups->getLocalGroupID()));
bool getStateRes = app->getMetaStateStore()->getState(secondaryID.val(),
secondaryState);
// if the secondary is anything except online/good, set it to needs-resync immediately.
// whenever we pass this point, the secondary will have missed *something* of
// importance, so anything except online/good must be set to needs-resync right here.
if (!getStateRes
|| secondaryState.reachabilityState != TargetReachabilityState_ONLINE
|| secondaryState.consistencyState != TargetConsistencyState_GOOD)
{
auto* const resyncer = app->getBuddyResyncer();
auto* const job = resyncer->getResyncJob();
// if we have no job or a running job, we must start a resync soon. if we have a
// job that has finished successfully, the management server may not have noticed
// that the secondary is completely resynced, so our buddy's state may well not be
// GOOD even though we have resynced completely. we may assume that a successful
// resync implies that the buddy is good, even if the management server thinks it
// isn't.
if (!job ||
(!job->isRunning() && job->getState() != BuddyResyncJobState_SUCCESS))
{
setBuddyNeedsResync();
return;
}
}
}
// address the request to the local buddy group and let subclasses adjust the request
// arguments before sending.
RequestResponseArgs rrArgs(NULL, &message, respType);
RequestResponseNode rrNode(NumNodeID(buddyGroups->getLocalGroupID()), metaNodes);
rrNode.setMirrorInfo(buddyGroups, true);
rrNode.setTargetStates(app->getMetaStateStore());
prepareMirrorRequestArgs(rrArgs);
// copy sequence numbers and set original requestor info for secondary
message.setSequenceNumber(this->getSequenceNumber());
message.setSequenceNumberDone(this->getSequenceNumberDone());
message.setRequestorID(this->getRequestorID(ctx));
// (almost) all messages do some sort of statistics gathering by user ID
message.setMsgHeaderUserID(this->getMsgHeaderUserID());
// set flag here instead of at the beginning because &message == this is often used
message.addFlag(NetMessageHeader::Flag_BuddyMirrorSecond);
message.addFlag(this->getFlags() & NetMessageHeader::Flag_IsSelectiveAck);
message.addFlag(this->getFlags() & NetMessageHeader::Flag_HasSequenceNumber);
FhgfsOpsErr commRes = MessagingTk::requestResponseNode(&rrNode, &rrArgs);
// remove the flag again right away: message may be this object, and the rest of the
// processing checks for the flag.
message.removeFlag(NetMessageHeader::Flag_BuddyMirrorSecond);
if (commRes != FhgfsOpsErr_SUCCESS)
{
// since we have reached this point, the secondary has indubitably not received
// important information from the primary. we now have two choices to keep the system
// in a consistent, safe state:
//
// 1) set the secondary to needs-resync
// 2) rollback the modifications we have made and let the client retry, hoping that
// some future communication with the secondary is successful
//
// 2 is not a viable option: since some operations may move data off of this metadata
// server and onto another one completely; allowing these to be undone requires a
// two-phase commit protocol, which incurs large communication overhead for a
// (hopefully) very rare error case. other operations delete local state (eg unlink,
// or close of an unlinked file), which would have to be held in limbo until either a
// commit or a rollback is issued.
//
// since we assume that communication errors are very rare, option 1 is the most
// efficient in the general case (as it does not have to keep objects alive past their
// intended lifetimes), so we set the secondary to needs-resync on any kind of
// communication error.
// other errors, e.g. out-of-memory conditions or errors caused by streamout hooks, are
// also assumed to be rare. if any of these happens, the secondary must be resynced no
// matter what actually happened. since the operations itself succeeded, we cannot send
// a notification about the communication error either - we'd have to drop the operation
// result to do that.
#ifdef BEEGFS_DEBUG
int buddyNodeID = buddyGroups->getBuddyTargetID(app->getLocalNodeNumID().val());
LOG_CTX(MIRRORING, DEBUG, mirrorLogContext(), "Communication with secondary failed. "
"Resync will be required when secondary comes back", buddyNodeID, commRes);
#endif
setBuddyNeedsResync();
return;
}
// communication worked; let the concrete message evaluate what the secondary answered.
FhgfsOpsErr respMsgRes = processSecondaryResponse(*rrArgs.outRespMsg);
if (respMsgRes != expectedResult)
{
// whoops; primary and secondary did different things; if secondary is not resyncing
// AND communication was good this is concerning (result must have been success on
// primary, otherwise no forwarding would have happened).
// usually, this would mean that primary and secondary do not have the same state, or
// that the secondary has some kind of system error. (if the primary had a system error,
// it would be more likely to fail than to succeed).
// in either case, the secondary should be resynced, even if the primary experienced
// a hardware fault or similar errors: at this point, we can no longer differentiate
// between good and bad state on the primary, and the secondary may be arbitrarily out
// of sync.
LOG_CTX(MIRRORING, NOTICE, mirrorLogContext(),
"Different return codes from primary and secondary buddy. "
"Setting secondary to needs-resync.",
("Expected response", expectedResult),
("Received response", respMsgRes));
setBuddyNeedsResync();
}
}
// inodes that are changed during mirrored processing on the secondary (eg file creation or
// deletion, setxattr, etc) may have their timestamps changed to different values than on the
// primary. to remedy this, the secondary must explicitly set these timestamps during
// processing.
//
// @return true if the message is mirrored and the tuneMirrorTimestamps config option is set
bool shouldFixTimestamps()
{
   if (!isMirrored())
      return false;

   return Program::getApp()->getConfig()->getTuneMirrorTimestamps();
}
// Transfers the mirrored timestamps of a directory inode. Does nothing for non-mirrored
// messages.
//
// Without Flag_BuddyMirrorSecond the inode's current timestamps are read into ts; with the
// flag set, ts is written into the inode's stat data.
//
// @param inode directory inode; must be loaded
// @param ts timestamps, output without the flag and input with it
void fixInodeTimestamp(DirInode& inode, MirroredTimestamps& ts)
{
   if (!isMirrored())
      return;

   BEEGFS_BUG_ON_DEBUG(!inode.getIsLoaded(), "inode not loaded");

   StatData statData;
   inode.getStatData(statData);

   const bool isSecondaryRequest = this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond);
   if (isSecondaryRequest)
   {
      // apply the timestamps that came with the message
      statData.setMirroredTimestamps(ts);
      inode.setStatData(statData);
      return;
   }

   // report the inode's timestamps back to the caller
   ts = statData.getMirroredTimestamps();
}
// Transfers the mirrored timestamps of a file inode. Does nothing for non-mirrored messages.
//
// Without Flag_BuddyMirrorSecond the inode's current timestamps are read into ts; with the
// flag set, ts is written into the inode's stat data and, if saveEntryInfo is given, the
// inode is updated on disk.
//
// @param inode file inode to read from or write to
// @param ts timestamps, output without the flag and input with it
// @param saveEntryInfo if not null, passed to updateInodeOnDisk() after the timestamps have
//    been set
void fixInodeTimestamp(FileInode& inode, MirroredTimestamps& ts,
   EntryInfo* const saveEntryInfo)
{
   if (!isMirrored())
      return;

   StatData statData;
   inode.getStatData(statData);

   const bool isSecondaryRequest = this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond);
   if (!isSecondaryRequest)
   {
      // report the inode's timestamps back to the caller
      ts = statData.getMirroredTimestamps();
      return;
   }

   // apply the timestamps that came with the message
   statData.setMirroredTimestamps(ts);
   inode.setStatData(statData);

   if (saveEntryInfo)
      inode.updateInodeOnDisk(saveEntryInfo);
}
// Records the operation in the node op statistics for the peer of the request's socket.
// Mirrored messages carrying Flag_BuddyMirrorSecond are counted as MetaOpCounter_MIRROR
// instead of the given type.
//
// @param ctx response context; its socket provides the peer IP
// @param type counter to use unless the message is a mirrored "second" message
void updateNodeOp(NetMessage::ResponseContext& ctx, MetaOpCounterTypes type)
{
   const bool countAsMirrorOp =
      isMirrored() && this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond);
   const auto effectiveType = countAsMirrorOp ? MetaOpCounter_MIRROR : type;

   App* const app = Program::getApp();
   app->getNodeOpStats()->updateNodeOp(
      ctx.getSocket()->getPeerIP(), effectiveType, this->getMsgHeaderUserID());
}
private:
std::shared_ptr<MirrorStateSlot> mirrorState;
void setBuddyNeedsResync()
{
BuddyCommTk::setBuddyNeedsResync(Program::getApp()->getMetaPath(), true);
}
};