beegfs/client_module/source/net/filesystem/FhgfsOpsCommKitCommon.h
2025-08-10 01:34:16 +02:00

188 lines
5.6 KiB
C

#ifndef FHGFSOPSCOMMKITCOMMON_H_
#define FHGFSOPSCOMMKITCOMMON_H_
#include <app/log/Logger.h>
#include <common/net/message/session/rw/WriteLocalFileRespMsg.h>
#include <common/toolkit/MessagingTk.h>
#include <common/toolkit/SocketTk.h>
#include <common/Common.h>
#include <common/Types.h>
#include <net/filesystem/RemotingIOInfo.h>
#include <nodes/NodeStoreEx.h>
#define BEEGFS_COMMKIT_MSGBUF_SIZE 4096
#define COMMKIT_DEBUG_READ_SEND (1 << 0)
#define COMMKIT_DEBUG_READ_HEADER (1 << 1)
#define COMMKIT_DEBUG_RECV_DATA (1 << 2)
#define COMMKIT_DEBUG_WRITE_HEADER (1 << 3)
#define COMMKIT_DEBUG_WRITE_POLL (1 << 4)
#define COMMKIT_DEBUG_WRITE_SEND (1 << 5)
#define COMMKIT_DEBUG_WRITE_RECV (1 << 6)
// #define BEEGFS_COMMKIT_DEBUG 0b010
#ifndef BEEGFS_COMMKIT_DEBUG
#define BEEGFS_COMMKIT_DEBUG 0
#endif
#ifdef BEEGFS_COMMKIT_DEBUG
#define CommKitErrorInjectRate 101 // fail a send/receive req every X jiffies
#endif
struct CommKitContext;
typedef struct CommKitContext CommKitContext;
static inline void __FhgfsOpsCommKitCommon_pollStateSocks(CommKitContext* context,
size_t numStates);
static inline void __FhgfsOpsCommKitCommon_handlePollError(CommKitContext* context,
int pollRes);
struct CommKitTargetInfo;
enum
{
CK_RETRY_BUDDY_FALLBACK = 1 << 0,
CK_RETRY_LOOP_EAGAIN = 1 << 1,
};
enum CKTargetBadAction
{
CK_SKIP_TARGET = 0,
CK_CONTINUE_TARGET = 1,
};
struct CommKitContextOps
{
enum CKTargetBadAction (*selectedTargetBad)(CommKitContext*, struct CommKitTargetInfo*,
const CombinedTargetState*);
unsigned (*prepareHeader)(CommKitContext*, struct CommKitTargetInfo*);
// returns >0 for successful send, 0 to advance to next state, or negative error code
int (*sendData)(CommKitContext*, struct CommKitTargetInfo*);
// return >= for success, 0 to advance, or negative beegfs error code
int (*recvHeader)(CommKitContext*, struct CommKitTargetInfo*);
// return >= for success, 0 to advance, or negative beegfs error code
int (*recvData)(CommKitContext*, struct CommKitTargetInfo*);
void (*printSendDataDetails)(CommKitContext*, struct CommKitTargetInfo*);
void (*printSocketDetails)(CommKitContext*, struct CommKitTargetInfo*);
int retryFlags;
const char* logContext;
};
/**
* Additional data that is required or useful for all the states.
* (This is shared states data.)
*
* invariant: (numRetryWaiters + numDone + numUnconnectable + numPollSocks) <= numStates
*/
struct CommKitContext
{
const struct CommKitContextOps* ops;
App* app;
Logger* log;
void* private;
RemotingIOInfo* ioInfo;
struct list_head* targetInfoList;
unsigned numRetryWaiters; // number of states with a communication error
unsigned numDone; // number of finished states
unsigned numAcquiredConns; // number of acquired conns to decide waiting or not (avoids deadlock)
unsigned numUnconnectable; // states that didn't get a conn this round (reset to 0 in each round)
unsigned numBufferless; // states that couldn't acquire a header buffer
bool pollTimedOut;
bool pollTimeoutLogged;
bool connFailedLogged;
PollState pollState;
unsigned numPollSocks;
unsigned currentRetryNum; // number of used up retries (in case of comm errors)
unsigned maxNumRetries; // 0 if infinite number of retries enabled
#ifdef BEEGFS_NVFS
int gpudRc;
#endif
};
/**
* Poll the state socks that need to be polled.
* This will call _handlePollError() if something goes wrong with the poll(), e.g. timeout.
*
* Note: Caller must have checked that there actually are socks that need polling
* (context.numPollSocks>0) before calling this.
*/
void __FhgfsOpsCommKitCommon_pollStateSocks(CommKitContext* context, size_t numStates)
{
int pollRes;
Config* cfg = App_getConfig(context->app);
size_t numWaiters = context->numPollSocks + context->numRetryWaiters +
context->numDone + context->numUnconnectable + context->numBufferless;
/* if not all the states are polling or done, some states must be ready to do something
immediately => set timeout to 0 (non-blocking) */
/* note: be very careful with timeoutMS==0, because that means we don't release the CPU,
so we need to ensure that timeoutMS==0 is no permanent condition. */
int timeoutMS = (numWaiters < numStates) ? 0 : cfg->connMsgLongTimeout;
BEEGFS_BUG_ON_DEBUG(numWaiters > numStates, "numWaiters > numStates should never happen");
pollRes = SocketTk_poll(&context->pollState, timeoutMS);
if(unlikely( (timeoutMS && pollRes <= 0) || (pollRes < 0) ) )
__FhgfsOpsCommKitCommon_handlePollError(context, pollRes);
}
void __FhgfsOpsCommKitCommon_handlePollError(CommKitContext* context, int pollRes)
{
// note: notifies the waiting sockets via context.pollTimedOut (they do not care whether
// it was a timeout or an error)
if(!context->pollTimeoutLogged)
{
if(!pollRes)
{ // no incoming data => timout or interrupted
if (fatal_signal_pending(current))
{ // interrupted
Logger_logFormatted(context->log, 3, context->ops->logContext,
"Communication (poll() ) interrupted by signal.");
}
else
{ // timout
Logger_logErrFormatted(context->log, context->ops->logContext,
"Communication (poll() ) timeout for %u sockets.", context->numPollSocks);
}
}
else
{ // error
Logger_logErrFormatted(context->log, context->ops->logContext,
"Communication (poll() ) error for %u sockets. ErrCode: %d",
context->numPollSocks, pollRes);
}
}
context->pollTimeoutLogged = true;
context->pollTimedOut = true;
}
#endif /*FHGFSOPSCOMMKITCOMMON_H_*/