2025-08-10 01:34:16 +02:00

201 lines
7.3 KiB
C++

#pragma once
#include <common/toolkit/serialization/Serialization.h>
#include <common/Common.h>
#include <net/sock/ibvsocket/OpenTk_IBVSocket.h>
#include <arpa/inet.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <netdb.h>
#include <queue>
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#ifdef BEEGFS_NVFS
#include <common/threading/Mutex.h>
#include <unordered_map>
#endif /* BEEGFS_NVFS */
// Work request ID constants. Each one is defined as "1 + the previous one", so they expand to the
// consecutive values 1 (recv), 2 (send), 3 (write) and 4 (read).
// NOTE(review): the "_OFFSET" suffix suggests that recv/send IDs are formed by adding something
// (e.g. a buffer index) to these bases - confirm against the implementation.
#define IBVSOCKET_RECV_WORK_ID_OFFSET (1)
#define IBVSOCKET_SEND_WORK_ID_OFFSET (1 + IBVSOCKET_RECV_WORK_ID_OFFSET)
#define IBVSOCKET_WRITE_WORK_ID (1 + IBVSOCKET_SEND_WORK_ID_OFFSET)
#define IBVSOCKET_READ_WORK_ID (1 + IBVSOCKET_WRITE_WORK_ID)
// NOTE(review): presumably the number of completion channel events gathered before they are
// acknowledged in one go (see numUnackedRecvCompChannelEvents in IBVCommContext) - confirm.
#define IBVSOCKET_EVENTS_GATHER_NUM (64)
// Verification string for the handshake data: 7 visible characters plus the terminating NUL give
// the 8 bytes that IBVSOCKET_PRIVATEDATA_STR_LEN (the size of IBVCommDest::verificationStr) demands.
#define IBVSOCKET_PRIVATEDATA_STR "fhgfs0 " // must be exactly(!!) 8 bytes long
#define IBVSOCKET_PRIVATEDATA_STR_LEN 8
// Protocol version constant; IBVCommDest has a protocolVersion field for the handshake.
#define IBVSOCKET_PRIVATEDATA_PROTOCOL_VER 1
// Forward declarations. Each typedef also introduces the struct tag it names, so these types can
// be used by pointer in the prototypes below before the struct bodies are defined.
typedef struct IBVIncompleteRecv IBVIncompleteRecv;
typedef struct IBVIncompleteSend IBVIncompleteSend;
typedef struct IBVCommContext IBVCommContext;
typedef struct IBVCommDest IBVCommDest;

// FIFO of rdma_cm event pointers (used by IBVSocket::delayedCmEventsQ).
using CmEventQueue = std::queue<struct rdma_cm_event*>;

#ifdef BEEGFS_NVFS
// Lookup tables that only exist in BEEGFS_NVFS builds (used by IBVCommContext).
using MRMap = std::unordered_map<char*, struct ibv_mr*>; // buffer pointer -> memory region
using CQMap = std::unordered_map<uint64_t, int>;         // 64 bit key -> int value
#endif /* BEEGFS_NVFS */
extern void __IBVSocket_initFromCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
IBVCommContext* commContext);
extern IBVSocket* __IBVSocket_constructFromCommContext(struct rdma_cm_id* cm_id,
IBVCommContext* commContext);
extern int __IBVSocket_registerBuf(IBVCommContext* commContext, void* buf, size_t bufLen,
struct ibv_mr **outMR);
extern char* __IBVSocket_allocAndRegisterBuf(IBVCommContext* commContext, size_t bufLen,
struct ibv_mr **outMR);
extern bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
IBVCommConfig* commCfg, IBVCommContext** outCommContext);
extern void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext);
extern void __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest);
extern bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest);
extern int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex);
extern int __IBVSocket_postWrite(IBVSocket* _this, IBVCommDest* remoteDest,
struct ibv_mr* localMR, char* localBuf, int bufLen);
extern int __IBVSocket_postRead(IBVSocket* _this, IBVCommDest* remoteDest,
struct ibv_mr* localMR, char* localBuf, int bufLen);
#ifdef BEEGFS_NVFS
extern int __IBVSocket_postWrite(IBVSocket* _this, char* localBuf, int bufLen, unsigned lkey,
uint64_t remoteBuf, unsigned rkey);
extern int __IBVSocket_postRead(IBVSocket* _this, char* localBuf, int bufLen, unsigned lkey,
uint64_t remoteBuf, unsigned rkey);
#endif /* BEEGFS_NVFS */
extern int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex, int bufLen);
extern int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ibv_wc* outWC);
extern int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS);
extern void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this);
extern int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS);
extern int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS,
struct ibv_wc* outWC);
extern int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this,
int numSendElements, int numWriteElements, int numReadElements);
extern int __IBVSocket_waitForUsedSendBufsReset(IBVSocket* _this);
extern ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this,
char* buf, size_t bufLen);
extern void __IBVSocket_disconnect(IBVSocket* _this);
extern void __IBVSocket_close(IBVSocket* _this);
extern bool __IBVSocket_initEpollFD(IBVSocket* _this);
extern const char* __IBVSocket_wcStatusStr(int wcStatusCode);
// State of a partially consumed receive; embedded by value in IBVCommContext (incompleteRecv).
// NOTE(review): presumably consumed by __IBVSocket_recvContinueIncomplete() - confirm.
struct IBVIncompleteRecv
{
int isAvailable; // flag stored as int; presumably non-zero while an incomplete recv is pending
int completedOffset; // presumably the number of bytes of the buffer already consumed - confirm
struct ibv_wc wc; // verbs work completion belonging to this receive
};
// Send-side counterpart of IBVIncompleteRecv; embedded by value in IBVCommContext
// (incompleteSend).
struct IBVIncompleteSend
{
unsigned numAvailable; // NOTE(review): presumably a count of pending send elements - confirm
};
// Timeout settings of a socket; stored by value in IBVSocket (timeoutCfg).
// NOTE(review): the "MS" suffix suggests all values are in milliseconds - confirm.
struct IBVTimeoutConfig
{
int connectMS; // timeout related to connecting
int flowSendMS; // timeout related to send-side flow control
int pollMS; // timeout related to polling
};
// Per-connection communication state: the verbs objects (context, protection domain, memory
// regions, completion queues, queue pair), the registered send/recv buffers and the flow control
// counters. An IBVSocket refers to one of these through its commContext pointer.
struct IBVCommContext
{
struct ibv_context* context;
struct ibv_pd* pd; // protection domain
struct ibv_mr* recvMR; // recvBuf mem region
struct ibv_mr* sendMR; // sendBuf mem region
struct ibv_mr* controlMR; // flow/flood control mem region
struct ibv_mr* controlResetMR; // flow/flood control reset mem region
struct ibv_comp_channel* recvCompChannel; // recv completion event channel
unsigned numUnackedRecvCompChannelEvents; // number of gathered events
struct ibv_cq* recvCQ; // recv completion queue
struct ibv_cq* sendCQ; // send completion queue
struct ibv_qp* qp; // send+recv queue pair
IBVCommConfig commCfg;
char* recvBuf; // large alloc'ed and reg'ed buffer for recvBufs
char** recvBufs; // points to chunks inside recvBuf
char* sendBuf; // large alloc'ed and reg'ed buffer for sendBufs
char** sendBufs; // points to chunks inside sendBuf
volatile uint64_t numUsedSendBufs; // sender's flow/flood control counter (volatile!!)
volatile uint64_t numUsedSendBufsReset; // flow/flood control reset value
uint64_t numUsedRecvBufs; // receiver's flow/flood control (reset) counter
unsigned numReceivedBufsLeft; // flow control v2 to avoid IB rnr timeout
unsigned numSendBufsLeft; // flow control v2 to avoid IB rnr timeout
IBVIncompleteRecv incompleteRecv;
IBVIncompleteSend incompleteSend;
#ifdef BEEGFS_NVFS
// The following members only exist in BEEGFS_NVFS builds, so the struct layout differs between
// builds with and without that define.
uint64_t wr_id; // NOTE(review): presumably the work request ID counter for NVFS transfers
Mutex *cqMutex; // NOTE(review): presumably guards cqCompletions - confirm
CQMap *cqCompletions; // map from a 64 bit key to an int (see CQMap)
MRMap *workerMRs; // map from a buffer pointer to its memory region (see MRMap)
#endif /* BEEGFS_NVFS */
};
#pragma pack(push, 1)
/**
 * Connection parameters that are exchanged as private_data during the handshake.
 *
 * Note: This struct must have the same size on all architectures (because we use
 * sizeof(IBVCommDest) for private_data during handshake). For that reason it is packed to 1-byte
 * alignment and uses fixed-width integer types only; the static_assert below enforces the
 * resulting size (36 bytes with IBVSOCKET_PRIVATEDATA_STR_LEN == 8).
 */
struct IBVCommDest
{
   char verificationStr[IBVSOCKET_PRIVATEDATA_STR_LEN]; // sized for IBVSOCKET_PRIVATEDATA_STR
   uint64_t protocolVersion;
   uint64_t vaddr;
   uint32_t rkey;
   uint32_t recvBufNum;
   uint32_t recvBufSize;
};
#pragma pack(pop)

// Fail the build instead of the handshake if padding or a type width ever changes the layout.
static_assert(sizeof(struct IBVCommDest) ==
   IBVSOCKET_PRIVATEDATA_STR_LEN + 2 * sizeof(uint64_t) + 3 * sizeof(uint32_t),
   "IBVCommDest must be packed and have the same size on all architectures");
// An RDMA (ibverbs / rdma_cm) based socket. Holds the rdma_cm handles, the local and remote
// connection parameters, the communication context and per-socket settings.
struct IBVSocket
{
struct rdma_event_channel* cm_channel; // rdma_cm event channel
struct rdma_cm_id* cm_id; // rdma_cm identifier of this socket
IBVCommDest localDest; // local connection parameters (stored by value)
IBVCommDest* remoteDest; // connection parameters of the peer (pointer)
IBVCommContext* commContext; // verbs objects, buffers and flow control state
int epollFD; // only for connected sockets, invalid (-1) for listeners
bool sockValid;
int errState;
CmEventQueue* delayedCmEventsQ; // queue of rdma_cm event pointers (see CmEventQueue)
uint8_t typeOfService;
// NOTE(review): the next two look like a fault injection facility (reject connections at a
// configured rate) - confirm against the implementation.
unsigned connectionRejectionRate;
unsigned connectionRejectionCount;
IBVTimeoutConfig timeoutCfg; // connect / flow control / poll timeouts
struct in_addr bindIP; // IPv4 address; presumably the local address to bind to - confirm
};