New upstream version 8.1.0

This commit is contained in:
geos_one
2025-08-10 01:34:16 +02:00
commit c891bb7105
4398 changed files with 838833 additions and 0 deletions

View File

@@ -0,0 +1,152 @@
#include "IBVBuffer.h"
#include "IBVSocket.h"
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
/**
 * Allocate the fragments of an IBVBuffer and map each of them for DMA.
 *
 * The request is split into ceil(bufLen / fragmentLen) fragments. Every fragment is a separate
 * kmalloc() allocation of MIN(fragmentLen, bufLen) bytes, mapped on ctx->pd->device and
 * described by one entry of buffer->lists (using the PD's local_dma_lkey).
 *
 * @param buffer instance to initialize; its previous contents are overwritten
 * @param ctx communication context that provides the protection domain (ctx->pd)
 * @param bufLen total number of bytes requested
 * @param fragmentLen size of a single fragment; 0 means "one fragment of bufLen bytes"
 * @param dma_dir DMA direction used for mapping and, later, unmapping the fragments
 * @return true on success; false if an allocation or a DMA mapping failed. On failure
 *    everything acquired here has been released and the instance is left empty, so a later
 *    IBVBuffer_free() on it is harmless.
 */
bool IBVBuffer_init(IBVBuffer* buffer, IBVCommContext* ctx, size_t bufLen,
   size_t fragmentLen, enum dma_data_direction dma_dir)
{
   unsigned count;
   unsigned i;

   if (fragmentLen == 0)
      fragmentLen = bufLen;

   // bufLen != 0 implies fragmentLen != 0 at this point, so the division cannot trap
   count = bufLen ? (bufLen + fragmentLen - 1) / fragmentLen : 0;
   bufLen = MIN(fragmentLen, bufLen);

   // Publish the geometry before allocating anything: the failure path relies on
   // IBVBuffer_free(), which walks bufferCount entries and unmaps bufferSize bytes each.
   buffer->dma_dir = dma_dir;
   buffer->mr = NULL; // no registration yet; keeps the failure path from touching garbage
   buffer->bufferSize = bufLen;
   buffer->listLength = count;
   buffer->bufferCount = count;

   buffer->buffers = kzalloc(count * sizeof(*buffer->buffers), GFP_KERNEL);
   buffer->lists = kzalloc(count * sizeof(*buffer->lists), GFP_KERNEL);
   if(!buffer->buffers || !buffer->lists)
      goto fail;

   for(i = 0; i < count; i++)
   {
      buffer->lists[i].lkey = ctx->pd->local_dma_lkey;
      buffer->lists[i].length = bufLen;
      buffer->buffers[i] = kmalloc(bufLen, GFP_KERNEL);
      if(unlikely(!buffer->buffers[i]))
      {
         printk_fhgfs(KERN_ERR, "Failed to allocate buffer size=%zu\n", bufLen);
         goto fail;
      }

      buffer->lists[i].addr = ib_dma_map_single(ctx->pd->device, buffer->buffers[i],
         bufLen, dma_dir);
      if (unlikely(ib_dma_mapping_error(ctx->pd->device, buffer->lists[i].addr)))
      {
         // addr == 0 marks "not mapped" for IBVBuffer_free()
         buffer->lists[i].addr = 0;
         printk_fhgfs(KERN_ERR, "Failed to dma map buffer size=%zu\n", bufLen);
         goto fail;
      }

      // a valid mapping at address 0 would be indistinguishable from "not mapped"
      BUG_ON(buffer->lists[i].addr == 0);
   }

   return true;

fail:
   IBVBuffer_free(buffer, ctx);

   // leave an empty instance behind so the owner's cleanup cannot free anything twice
   buffer->buffers = NULL;
   buffer->lists = NULL;
   buffer->mr = NULL;
   buffer->bufferCount = 0;
   buffer->listLength = 0;

   return false;
}
/**
 * Allocate a memory region (ib_mr) for the buffer and map all fragments into it.
 *
 * A temporary scatterlist is built from the DMA addresses and lengths stored in
 * buffer->lists and handed to ib_map_mr_sg() with a page size of PAGE_SIZE.
 *
 * @param buffer buffer previously set up by IBVBuffer_init()
 * @param ctx communication context that provides the protection domain (ctx->pd)
 * @return true if the MR was allocated and covers every fragment; false otherwise, in which
 *    case buffer->mr is NULL again
 */
bool IBVBuffer_initRegistration(IBVBuffer* buffer, IBVCommContext* ctx)
{
   struct scatterlist* sg;
   int res;
   unsigned i;

   buffer->mr = ib_alloc_mr(ctx->pd, IB_MR_TYPE_MEM_REG, buffer->bufferCount);
   if (IS_ERR(buffer->mr))
   {
      printk_fhgfs(KERN_ERR, "Failed to alloc mr, errCode=%ld\n", PTR_ERR(buffer->mr));
      buffer->mr = NULL; // do not leave an ERR_PTR behind for the cleanup code
      goto fail;
   }

   sg = kzalloc(buffer->bufferCount * sizeof(struct scatterlist), GFP_KERNEL);
   if (sg == NULL)
   {
      printk_fhgfs(KERN_ERR, "Failed to alloc sg\n");
      goto fail;
   }

   for (i = 0; i < buffer->bufferCount; ++i)
   {
      sg_dma_address(&sg[i]) = buffer->lists[i].addr;
      sg_dma_len(&sg[i]) = buffer->lists[i].length;
   }

   res = ib_map_mr_sg(buffer->mr, sg, buffer->bufferCount, NULL, PAGE_SIZE);
   kfree(sg); // only needed for the duration of the mapping call

   if (res < 0)
   {
      printk_fhgfs(KERN_ERR, "Failed to map mr res=%d\n", res);
      goto fail;
   }

   // ib_map_mr_sg() returns the number of entries it mapped. Fewer than requested means the
   // MR covers only a prefix of the buffer, which must not be reported as success.
   if ((unsigned)res != buffer->bufferCount)
   {
      printk_fhgfs(KERN_ERR, "Failed to map mr completely, mapped=%d expected=%u\n",
         res, buffer->bufferCount);
      goto fail;
   }

   return true;

fail:
   if (buffer->mr)
   {
      ib_dereg_mr(buffer->mr);
      buffer->mr = NULL;
   }

   return false;
}
/**
 * Release everything owned by an IBVBuffer: the DMA mappings, the fragment memory, the
 * memory region (if one was registered) and the bookkeeping arrays. The IBVBuffer object
 * itself is not freed.
 *
 * The instance is reset to an empty state afterwards, so calling this function again on the
 * same instance (e.g. from a generic cleanup path after a failed init) is a no-op.
 *
 * @param buffer instance to release
 * @param ctx communication context whose device was used to map the fragments
 */
void IBVBuffer_free(IBVBuffer* buffer, IBVCommContext* ctx)
{
   if(buffer->buffers && buffer->lists)
   {
      unsigned i;

      for(i = 0; i < buffer->bufferCount; i++)
      {
         // addr == 0 marks a fragment that was never (successfully) mapped
         if (buffer->lists[i].addr)
            ib_dma_unmap_single(ctx->pd->device, buffer->lists[i].addr,
               buffer->bufferSize, buffer->dma_dir);

         kfree(buffer->buffers[i]); // kfree(NULL) is a no-op
      }
   }

   if (buffer->mr)
      ib_dereg_mr(buffer->mr);

   kfree(buffer->buffers);
   kfree(buffer->lists);

   // forget everything that was just released so that a repeated call cannot double free
   buffer->buffers = NULL;
   buffer->lists = NULL;
   buffer->mr = NULL;
   buffer->bufferCount = 0;
   buffer->listLength = 0;
}
/**
 * Copy data from an iov_iter into the fragments of the buffer, in fragment order.
 *
 * Each fragment receives at most buffer->bufferSize bytes. For every fragment that was
 * written, the length of its list entry is set to the number of bytes copied and
 * buffer->listLength is advanced to cover it. If nothing is copied, listLength is left
 * untouched.
 *
 * @param buffer destination buffer
 * @param iter source of the data; it is advanced by the copy
 * @return number of bytes copied in total, or -EFAULT if a copy came up short
 */
ssize_t IBVBuffer_fill(IBVBuffer* buffer, struct iov_iter* iter)
{
   ssize_t copied = 0;
   unsigned idx = 0;

   while (idx < buffer->bufferCount && iov_iter_count(iter) > 0)
   {
      // limited by the remaining data, the fragment size and a 32 bit length
      size_t chunk = MIN(MIN(iov_iter_count(iter), buffer->bufferSize), 0xFFFFFFFF);

      if (copy_from_iter(buffer->buffers[idx], chunk, iter) != chunk)
         return -EFAULT;

      buffer->lists[idx].length = chunk;
      copied += chunk;

      idx++;
      buffer->listLength = idx;
   }

   return copied;
}
#endif

View File

@@ -0,0 +1,49 @@
#ifndef IBVBuffer_h_aMQFNfzrjbEHDOcv216fi
#define IBVBuffer_h_aMQFNfzrjbEHDOcv216fi
#include <common/Common.h>
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <os/iov_iter.h>
// Forward declarations; the struct layouts are only needed by the implementation.
struct IBVBuffer;
typedef struct IBVBuffer IBVBuffer;
struct IBVCommContext;
struct IBVSocket;
/**
 * Allocate the fragments of a buffer and map each of them for DMA on the device of
 * ctx->pd.
 *
 * @param bufLen total number of bytes requested
 * @param fragmentLen size of one fragment; 0 means a single fragment of bufLen bytes
 * @param dma_dir DMA direction used for the mappings
 * @return true on success; false if an allocation or mapping failed (IBVBuffer_free() has
 *    already been called on the instance in that case)
 */
extern bool IBVBuffer_init(IBVBuffer* buffer, struct IBVCommContext* ctx, size_t bufLen,
size_t fragmentLen, enum dma_data_direction dma_dir);
/**
 * Prepare the instance to use its internal ib_mr. This is only needed for buffers used
 * with RDMA READ/WRITE and when not using a global rkey. This may be called before
 * the connection is established. Once the connection has been established,
 * the registration must be completed via a call to IBVSocket_registerMr().
 *
 * @return true on success; false on failure (buffer->mr is NULL then)
 */
extern bool IBVBuffer_initRegistration(IBVBuffer* buffer, struct IBVCommContext* ctx);
/**
 * Unmap and free all fragments, deregister the ib_mr if one is set and free the internal
 * arrays. The IBVBuffer object itself is not freed.
 */
extern void IBVBuffer_free(IBVBuffer* buffer, struct IBVCommContext* ctx);
/**
 * Copy data from iter into the fragments (at most bufferSize bytes per fragment) and update
 * the list entry lengths and listLength accordingly.
 *
 * @return total number of bytes copied, or -EFAULT if a copy came up short
 */
extern ssize_t IBVBuffer_fill(IBVBuffer* buffer, struct iov_iter* iter);
/**
 * A buffer made of one or more separately allocated, DMA-mapped fragments.
 */
struct IBVBuffer
{
// one allocated memory area per fragment (bufferCount entries)
char** buffers;
// one scatter/gather entry per fragment: DMA address, length and lkey
struct ib_sge* lists;
// memory region set by IBVBuffer_initRegistration(); NULL if that call failed
struct ib_mr* mr;
// allocated size of each fragment in bytes (also the size used for DMA unmapping)
size_t bufferSize;
// number of fragments, i.e. number of entries in buffers and lists
unsigned bufferCount;
// number of list entries currently in use; IBVBuffer_fill() sets it to the number of
// fragments it wrote
unsigned listLength;
// direction the fragments were DMA-mapped with
enum dma_data_direction dma_dir;
};
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,283 @@
#ifndef OPENTK_IBVSOCKET_H_
#define OPENTK_IBVSOCKET_H_
#include <common/Common.h>
#include <common/toolkit/Random.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <asm/atomic.h>
#include <os/iov_iter.h>
// Verification string, its size and the protocol version. IBVCommDest carries a
// verificationStr array of IBVSOCKET_PRIVATEDATA_STR_LEN bytes and a protocolVersion field.
// The string literal is 7 characters plus the terminating NUL.
#define IBVSOCKET_PRIVATEDATA_STR "fhgfs0 " // must be exactly(!!) 8 bytes long
#define IBVSOCKET_PRIVATEDATA_STR_LEN 8
#define IBVSOCKET_PRIVATEDATA_PROTOCOL_VER 1
// Forward declarations so that the prototypes below can use pointers to these types even
// when the full definitions (inside the BEEGFS_RDMA section) are not compiled in.
struct ib_device;
struct ib_mr;
struct IBVIncompleteRecv;
typedef struct IBVIncompleteRecv IBVIncompleteRecv;
struct IBVIncompleteSend;
typedef struct IBVIncompleteSend IBVIncompleteSend;
struct IBVCommContext;
typedef struct IBVCommContext IBVCommContext;
struct IBVCommDest;
typedef struct IBVCommDest IBVCommDest;
struct IBVTimeoutConfig;
typedef struct IBVTimeoutConfig IBVTimeoutConfig;
struct IBVSocket; // forward declaration
typedef struct IBVSocket IBVSocket;
struct IBVCommConfig;
typedef struct IBVCommConfig IBVCommConfig;
struct NicAddressStats;
typedef struct NicAddressStats NicAddressStats;
/**
 * Selects which type of rkey is used for RDMA (see IBVCommConfig.keyType).
 * NOTE(review): the exact meaning of each value is defined by the socket implementation,
 * which is not visible here; the per-value notes below are assumptions to be confirmed.
 */
enum IBVSocketKeyType
{
// presumably a global rkey (the default, value 0) — TODO confirm
IBVSOCKETKEYTYPE_UnsafeGlobal = 0,
// presumably the rkey of the system DMA MR (cf. IBVCommContext.dmaMR) — TODO confirm
IBVSOCKETKEYTYPE_UnsafeDMA,
// presumably per-buffer registration (cf. IBVBuffer_initRegistration) — TODO confirm
IBVSOCKETKEYTYPE_Register
};
typedef enum IBVSocketKeyType IBVSocketKeyType;
// Public interface of IBVSocket. These declarations are visible regardless of BEEGFS_RDMA.

// construction/destruction
extern __must_check bool IBVSocket_init(IBVSocket* _this, struct in_addr srcIpAddr, NicAddressStats* nicStats);
extern void IBVSocket_uninit(IBVSocket* _this);
// static
extern bool IBVSocket_rdmaDevicesExist(void);
// methods
extern bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress,
unsigned short port, IBVCommConfig* commCfg);
extern bool IBVSocket_bindToAddr(IBVSocket* _this, struct in_addr ipAddr,
unsigned short port);
extern bool IBVSocket_listen(IBVSocket* _this);
extern bool IBVSocket_shutdown(IBVSocket* _this);
extern ssize_t IBVSocket_recvT(IBVSocket* _this, struct iov_iter* iter, int flags,
int timeoutMS);
extern ssize_t IBVSocket_send(IBVSocket* _this, struct iov_iter* iter, int flags);
extern int IBVSocket_checkConnection(IBVSocket* _this);
extern unsigned long IBVSocket_poll(IBVSocket* _this, short events, bool finishPoll);
// getters & setters
// The timeout parameters correspond one-to-one to the fields of IBVTimeoutConfig.
extern void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS,
int completionMS, int flowSendMS, int flowRecvMS, int pollMS);
extern void IBVSocket_setTypeOfService(IBVSocket* _this, int typeOfService);
extern void IBVSocket_setConnectionFailureStatus(IBVSocket* _this, unsigned value);
extern struct in_addr IBVSocket_getSrcIpAddr(IBVSocket* _this);
// Only access members of NicAddressStats when the owner NodeConnPool mutex is held.
// OK to access "nic" without holding mutex.
extern NicAddressStats* IBVSocket_getNicStats(IBVSocket* _this);
extern unsigned IBVSocket_getRkey(IBVSocket* _this);
extern struct ib_device* IBVSocket_getDevice(IBVSocket* _this);
// Takes an ib_mr and access flags; IBVBuffer.h refers to this call for completing the
// registration started by IBVBuffer_initRegistration().
// NOTE(review): the non-RDMA stub file does not appear to define this function — confirm
// that no caller is built without BEEGFS_RDMA.
extern int IBVSocket_registerMr(IBVSocket* _this, struct ib_mr* mr, int access);
/**
 * Timeout settings of a socket (stored in IBVSocket.timeoutCfg). The fields mirror the
 * parameters of IBVSocket_setTimeouts().
 * NOTE(review): the "MS" suffix suggests milliseconds — confirm against the implementation.
 */
struct IBVTimeoutConfig
{
int connectMS;
int completionMS;
int flowSendMS;
int flowRecvMS;
int pollMS;
};
/**
 * Buffer and key configuration of a connection; passed to IBVSocket_connectByIP() and
 * kept as a copy in IBVCommContext.commCfg.
 */
struct IBVCommConfig
{
unsigned bufNum; // number of available buffers
unsigned bufSize; // total size of each buffer
/**
 * IBVBuffer can allocate the buffer in multiple memory regions. This
 * is to allow allocation of large buffers without requiring the
 * buffer to be entirely contiguous. A value of 0 means that the
 * buffer should not be fragmented.
 */
unsigned fragmentSize; // size of buffer fragments
IBVSocketKeyType keyType; // Which type of rkey for RDMA
};
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <common/threading/Mutex.h>
#include "IBVBuffer.h"
// Internal interface ("__" prefix); only declared when BEEGFS_RDMA is defined. The
// definitions live in the socket implementation, which is not part of this header.
enum IBVSocketConnState;
typedef enum IBVSocketConnState IBVSocketConnState_t;
// connection setup / teardown helpers
extern bool __IBVSocket_createNewID(IBVSocket* _this);
extern bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
IBVCommConfig* commCfg, IBVCommContext** outCommContext);
extern void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext);
extern bool __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest);
extern bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest);
// send / receive helpers
extern int __IBVSocket_receiveCheck(IBVSocket* _this, int timeoutMS);
extern int __IBVSocket_nonblockingSendCheck(IBVSocket* _this);
extern int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex);
extern int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex);
extern int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ib_wc* outWC);
// flow control helpers
extern int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS);
extern void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this);
extern int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS);
// completion waiting helpers
extern int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS,
struct ib_wc* outWC);
extern int __IBVSocket_waitForSendCompletionEvent(IBVSocket* _this, int oldSendCount,
int timeoutMS);
extern int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this,
unsigned* numSendElements, unsigned* numWriteElements, unsigned* numReadElements, int timeoutMS);
extern ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this, struct iov_iter* iter);
// callbacks: their signatures take rdma_cm / ib_cq / ib_event arguments
extern int __IBVSocket_cmaHandler(struct rdma_cm_id* cm_id, struct rdma_cm_event* event);
extern void __IBVSocket_cqSendEventHandler(struct ib_event* event, void* data);
extern void __IBVSocket_sendCompletionHandler(struct ib_cq* cq, void* cq_context);
extern void __IBVSocket_cqRecvEventHandler(struct ib_event* event, void* data);
extern void __IBVSocket_recvCompletionHandler(struct ib_cq* cq, void* cq_context);
extern void __IBVSocket_qpEventHandler(struct ib_event* event, void* data);
extern int __IBVSocket_routeResolvedHandler(IBVSocket* _this, struct rdma_cm_id* cm_id,
IBVCommConfig* commCfg, IBVCommContext** outCommContext);
extern int __IBVSocket_connectedHandler(IBVSocket* _this, struct rdma_cm_event *event);
extern struct ib_cq* __IBVSocket_createCompletionQueue(struct ib_device* device,
ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *),
void* cq_context, int cqe);
extern const char* __IBVSocket_wcStatusStr(int wcStatusCode);
/**
 * Connection state of a socket, stored in IBVSocket.connState. Waiters for a state change
 * during connect use IBVSocket.eventWaitQ.
 * NOTE(review): the transitions between the states are driven by the implementation, which
 * is not visible here.
 */
enum IBVSocketConnState
{
IBVSOCKETCONNSTATE_UNCONNECTED=0,
IBVSOCKETCONNSTATE_CONNECTING=1,
IBVSOCKETCONNSTATE_ADDRESSRESOLVED=2,
IBVSOCKETCONNSTATE_ROUTERESOLVED=3,
IBVSOCKETCONNSTATE_ESTABLISHED=4,
IBVSOCKETCONNSTATE_FAILED=5,
IBVSOCKETCONNSTATE_REJECTED_STALE=6
};
/**
 * Bookkeeping stored in IBVCommContext.incompleteRecv.
 * NOTE(review): field meanings below are inferred from the names only — confirm against
 * the receive path (cf. __IBVSocket_recvContinueIncomplete).
 */
struct IBVIncompleteRecv
{
int isAvailable; // presumably non-zero while a partially consumed receive exists
int completedOffset; // presumably the number of bytes already handed to the caller
int bufIndex; // presumably the index of the receive buffer holding the data
int totalSize; // presumably the total number of bytes in that buffer
};
/**
 * Bookkeeping stored in IBVCommContext.incompleteSend.
 */
struct IBVIncompleteSend
{
unsigned numAvailable; // NOTE(review): unit (buffers? work requests?) not visible here
bool forceWaitForAll; // true if we received only some completions and need
// to wait for the rest before we can send more data
};
/**
 * Verbs objects, buffers and counters of one connection; referenced through
 * IBVSocket.commContext. The IBVBuffer functions read the protection domain (pd) from it.
 */
struct IBVCommContext
{
struct ib_pd* pd; // protection domain
struct ib_mr* dmaMR; // system DMA MR. Not supported on all platforms.
atomic_t recvCompEventCount; // incremented on incoming event notification
wait_queue_head_t recvCompWaitQ; // for recvCompEvents
wait_queue_t recvWait;
bool recvWaitInitialized; // true if init_wait was called for the thread
atomic_t sendCompEventCount; // incremented on incoming event notification
wait_queue_head_t sendCompWaitQ; // for sendCompEvents
wait_queue_t sendWait;
bool sendWaitInitialized; // true if init_wait was called for the thread
struct ib_cq* recvCQ; // recv completion queue
struct ib_cq* sendCQ; // send completion queue
struct ib_qp* qp; // send+recv queue pair
IBVCommConfig commCfg;
struct IBVBuffer* sendBufs;
struct IBVBuffer* recvBufs;
struct IBVBuffer checkConBuffer;
unsigned numReceivedBufsLeft; // flow control v2 to avoid IB rnr timeout
unsigned numSendBufsLeft; // flow control v2 to avoid IB rnr timeout
IBVIncompleteRecv incompleteRecv;
IBVIncompleteSend incompleteSend;
u32 checkConnRkey;
};
// Packed to 1-byte alignment so that no padding is inserted between the members.
#pragma pack(push, 1)
// Note: Make sure this struct has the same size on all architectures (because we use
// sizeof(IBVCommDest) for private_data during handshake)
struct IBVCommDest
{
char verificationStr[IBVSOCKET_PRIVATEDATA_STR_LEN];
uint64_t protocolVersion;
uint64_t vaddr;
unsigned rkey;
unsigned recvBufNum;
unsigned recvBufSize;
};
#pragma pack(pop)
/**
 * State of one RDMA socket (definition used when BEEGFS_RDMA is defined).
 */
struct IBVSocket
{
wait_queue_head_t eventWaitQ; // used to wait for connState change during connect
struct rdma_cm_id* cm_id;
struct in_addr srcIpAddr;
IBVCommDest localDest;
IBVCommDest* remoteDest;
IBVCommContext* commContext;
int errState; // 0 = <no error>; -1 = <unspecified error>
volatile IBVSocketConnState_t connState;
int typeOfService;
unsigned remapConnectionFailureStatus;
NicAddressStats* nicStats; // Owned by a NodeConnPool instance. Do not access
// members without locking the NodeConnPool mutex.
// Possibly NULL.
IBVTimeoutConfig timeoutCfg;
Mutex cmaMutex; // used to manage concurrency of cm_id and commContext
// with __IBVSocket_cmaHandler
};
#else
/**
 * Placeholder definition used when BEEGFS_RDMA is not defined, so that the type is complete
 * for code that embeds or allocates an IBVSocket.
 */
struct IBVSocket
{
/* empty structs are not allowed, so until this kludge can go, add a dummy member */
unsigned:0;
};
#endif
#endif /*OPENTK_IBVSOCKET_H_*/

View File

@@ -0,0 +1,114 @@
#include "IBVSocket.h"
#ifndef BEEGFS_RDMA
// Logs (KERN_INFO) the name of the calling function and the line of the call. Used by the
// stubs below, which are only compiled when BEEGFS_RDMA is not defined.
#define no_ibvsocket_err() \
printk_fhgfs(KERN_INFO, "%s:%d: You should never see this message\n", __func__, __LINE__)
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports failure.
 *
 * @return always false
 */
bool IBVSocket_init(IBVSocket* _this, struct in_addr srcIpAddr, NicAddressStats* nicStats)
{
   const bool initialized = false; // nothing can be initialized without RDMA support

   no_ibvsocket_err();

   return initialized;
}
/**
 * Stub for builds without BEEGFS_RDMA: there is no state to tear down.
 */
void IBVSocket_uninit(IBVSocket* _this)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @return always false
 */
bool IBVSocket_rdmaDevicesExist(void)
{
   const bool devicesFound = false;

   return devicesFound;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports failure.
 *
 * @return always false
 */
bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress, unsigned short port,
   IBVCommConfig* commCfg)
{
   const bool connected = false;

   no_ibvsocket_err();

   return connected;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports failure.
 *
 * @return always false
 */
bool IBVSocket_bindToAddr(IBVSocket* _this, struct in_addr ipAddr, unsigned short port)
{
   const bool bound = false;

   no_ibvsocket_err();

   return bound;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports failure.
 *
 * @return always false
 */
bool IBVSocket_listen(IBVSocket* _this)
{
   const bool listening = false;

   no_ibvsocket_err();

   return listening;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports failure.
 *
 * @return always false
 */
bool IBVSocket_shutdown(IBVSocket* _this)
{
   const bool shutDown = false;

   no_ibvsocket_err();

   return shutDown;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports an error.
 *
 * @return always -1
 */
ssize_t IBVSocket_recvT(IBVSocket* _this, struct iov_iter* iter, int flags, int timeoutMS)
{
   const ssize_t recvRes = -1;

   no_ibvsocket_err();

   return recvRes;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports an error.
 *
 * @return always -1
 */
ssize_t IBVSocket_send(IBVSocket* _this, struct iov_iter* iter, int flags)
{
   const ssize_t sendRes = -1;

   no_ibvsocket_err();

   return sendRes;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and reports an error.
 *
 * @return 0 on success, -1 on error (this stub always returns -1)
 */
int IBVSocket_checkConnection(IBVSocket* _this)
{
   const int checkRes = -1;

   no_ibvsocket_err();

   return checkRes;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and returns a value with all bits set.
 *
 * @return always ~0 (converted to unsigned long)
 */
unsigned long IBVSocket_poll(IBVSocket* _this, short events, bool finishPoll)
{
   no_ibvsocket_err();

   /* int -1, converted to the return type: every bit set */
   return ~0;
}
/**
 * Stub for builds without BEEGFS_RDMA: logs a notice and returns a value with all bits set.
 *
 * @return always ~0 (converted to unsigned)
 */
unsigned IBVSocket_getRkey(IBVSocket* _this)
{
   no_ibvsocket_err();

   /* int -1, converted to the return type: every bit set */
   return ~0;
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @return always NULL
 */
struct ib_device* IBVSocket_getDevice(IBVSocket* _this)
{
   struct ib_device* device = NULL;

   return device;
}
/**
 * Stub for builds without BEEGFS_RDMA: the values are ignored.
 */
void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS,
   int completionMS, int flowSendMS, int flowRecvMS, int pollMS)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA: the value is ignored.
 */
void IBVSocket_setTypeOfService(IBVSocket* _this, int typeOfService)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA: the value is ignored.
 */
void IBVSocket_setConnectionFailureStatus(IBVSocket* _this, unsigned value)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @return an address whose s_addr has all bits set
 */
struct in_addr IBVSocket_getSrcIpAddr(IBVSocket* _this)
{
   struct in_addr noAddr = { .s_addr = ~0 };

   return noAddr;
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @return always NULL
 */
NicAddressStats* IBVSocket_getNicStats(IBVSocket* _this)
{
   NicAddressStats* stats = NULL;

   return stats;
}
#endif