2025-08-10 01:34:16 +02:00

2093 lines
62 KiB
C

#include "IBVSocket.h"
#ifdef BEEGFS_RDMA
#ifdef KERNEL_HAS_SCSI_FC_COMPAT
#include <scsi/fc_compat.h> // some kernels (e.g. rhel 5.9) forgot this in their rdma headers
#endif // KERNEL_HAS_SCSI_FC_COMPAT
#include <common/toolkit/TimeTk.h>
#include <common/toolkit/Time.h>
#include <linux/in.h>
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/socket.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#define IBVSOCKET_CONN_TIMEOUT_MS 5000
/* this also includes send completion wait times */
#define IBVSOCKET_COMPLETION_TIMEOUT_MS 300000
#define IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS 180000
#define IBVSOCKET_FLOWCONTROL_ONRECV_TIMEOUT_MS 180000
#define IBVSOCKET_SHUTDOWN_TIMEOUT_MS 250
#define IBVSOCKET_POLL_TIMEOUT_MS 10000
#define IBVSOCKET_FLOWCONTROL_MSG_LEN 1
#define IBVSOCKET_STALE_RETRIES_NUM 128
#define IBVSOCKET_MIN_SGE 1
#define IBVSOCKET_MIN_WR 1
/**
* IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS is used by IBVSocket_recvT when timeoutMS
* is passed as < 0, which indicates that __IBVSocket_receiveCheck should be
* called until it does not timeout. Thus, the long timeout value is
* inconsequential for that case. There doesn't appear to be any current code
* that passes timeoutMS < 0 to IBVSocket_recvT.
*/
#define IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS 1000000
#define ibv_print_info(str, ...) printk_fhgfs(KERN_INFO, "%s:%d: " str, __func__, __LINE__, \
##__VA_ARGS__)
#define ibv_print_info_ir(str, ...) printk_fhgfs_ir(KERN_INFO, "%s:%d: " str, __func__, __LINE__, \
##__VA_ARGS__)
#define ibv_pwarn(str, ...) printk_fhgfs(KERN_WARNING, "%s:%d: " str, __func__, __LINE__, \
##__VA_ARGS__)
#define ibv_print_info_debug(str, ...) printk_fhgfs_debug(KERN_INFO, "%s:%d: " str, __func__, \
__LINE__, ##__VA_ARGS__)
#define ibv_print_info_ir_debug(str, ...) printk_fhgfs_ir_debug(KERN_INFO, "%s:%d: " str, \
__func__, __LINE__, ##__VA_ARGS__)
/* 4.19 added const qualifiers to ib_post_send and ib_post_recv. */
typedef __typeof__(
__builtin_choose_expr(
__builtin_types_compatible_p(
__typeof__(&ib_post_send),
int (*)(struct ib_qp*, struct ib_send_wr*, struct ib_send_wr**)),
(struct ib_send_wr*) 0,
(const struct ib_send_wr*) 0))
_bad_send_wr;
typedef __typeof__(
__builtin_choose_expr(
__builtin_types_compatible_p(
__typeof__(&ib_post_recv),
int (*)(struct ib_qp*, struct ib_recv_wr*, struct ib_recv_wr**)),
(struct ib_recv_wr*) 0,
(const struct ib_recv_wr*) 0))
_bad_recv_wr;
/**
 * Initialize the socket structure and allocate the initial rdma_cm_id.
 *
 * @param srcIpAddr source IP to bind outgoing connections to (0 means "any")
 * @param nicStats stats object owned by the caller; only the pointer is stored
 * @return true on success, false if the initial cm_id could not be created
 */
bool IBVSocket_init(IBVSocket* _this, struct in_addr srcIpAddr, NicAddressStats* nicStats)
{
   // start from a fully zeroed state so all pointers/counters are well-defined
   memset(_this, 0, sizeof(*_this) );

   _this->connState = IBVSOCKETCONNSTATE_UNCONNECTED;
   _this->srcIpAddr = srcIpAddr;
   _this->nicStats = nicStats;
   _this->typeOfService = 0;

   // default timeouts; may be overridden later via IBVSocket_setTimeouts()
   _this->timeoutCfg.connectMS = IBVSOCKET_CONN_TIMEOUT_MS;
   _this->timeoutCfg.completionMS = IBVSOCKET_COMPLETION_TIMEOUT_MS;
   _this->timeoutCfg.flowSendMS = IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS;
   _this->timeoutCfg.flowRecvMS = IBVSOCKET_FLOWCONTROL_ONRECV_TIMEOUT_MS;
   _this->timeoutCfg.pollMS = IBVSOCKET_POLL_TIMEOUT_MS;

   init_waitqueue_head(&_this->eventWaitQ);
   Mutex_init(&_this->cmaMutex);

   return __IBVSocket_createNewID(_this);
}
/**
 * Tear down a socket: destroy the communication context (QP, CQs, buffers),
 * the rdma_cm_id and the remote destination info.
 *
 * Note: the comm context cleanup runs under cmaMutex so that concurrent CM
 * event callbacks cannot access it while it is being destroyed.
 */
void IBVSocket_uninit(IBVSocket* _this)
{
   Mutex_lock(&_this->cmaMutex);

   // safe to call with NULL commContext (cleanup is a no-op then)
   __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);

   Mutex_unlock(&_this->cmaMutex);

   if(_this->cm_id)
      rdma_destroy_id(_this->cm_id);

   Mutex_uninit(&_this->cmaMutex);

   SAFE_KFREE(_this->remoteDest);
}
/**
 * Report whether RDMA functionality is available.
 *
 * This compilation unit only exists when RDMA support was built in, so the
 * answer is unconditionally positive; higher layers merely use this as a
 * capability hint.
 */
bool IBVSocket_rdmaDevicesExist()
{
   return true;
}
/**
 * @return the rkey used by peers for connection-check RDMA reads on this socket.
 */
unsigned IBVSocket_getRkey(IBVSocket *_this)
{
   IBVCommContext* commContext = _this->commContext;

   return commContext->checkConnRkey;
}
/**
 * @return the ib_device backing this socket's protection domain.
 */
struct ib_device* IBVSocket_getDevice(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;

   return commContext->pd->device;
}
/**
 * Create a new cm_id.
 * This is not only intended for new sockets, but also for stale cm_ids, so this can cleanup/replace
 * existing cm_ids and resets error states.
 *
 * @return true on success, false if rdma_create_id failed (old cm_id is gone then)
 */
bool __IBVSocket_createNewID(IBVSocket* _this)
{
   struct rdma_cm_id* new_cm_id;

   // We need to unconditionally destroy the old CM id. It is unusable at this point.
   if(_this->cm_id)
   {
      rdma_destroy_id(_this->cm_id);
      _this->cm_id = NULL;
   }

   /* rdma_create_id() signature varies across kernel/OFED versions:
      netns-aware kernels take the namespace as first arg; older versions may
      or may not take the QP type argument */
#if defined(OFED_HAS_NETNS) || defined(rdma_create_id)
   new_cm_id = rdma_create_id(&init_net, __IBVSocket_cmaHandler, _this, RDMA_PS_TCP, IB_QPT_RC);
#elif defined(OFED_HAS_RDMA_CREATE_QPTYPE)
   new_cm_id = rdma_create_id(__IBVSocket_cmaHandler, _this, RDMA_PS_TCP, IB_QPT_RC);
#else
   new_cm_id = rdma_create_id(__IBVSocket_cmaHandler, _this, RDMA_PS_TCP);
#endif

   if(IS_ERR(new_cm_id) )
   {
      ibv_print_info("rdma_create_id failed. ErrCode: %ld\n", PTR_ERR(new_cm_id) );
      return false;
   }

   _this->cm_id = new_cm_id;

   // fresh id => socket is back in a clean unconnected state
   _this->connState = IBVSOCKETCONNSTATE_UNCONNECTED;
   _this->errState = 0;

   return true;
}
/**
 * Connect to a remote RDMA peer by IPv4 address and port.
 *
 * The connect sequence (address resolve -> route resolve -> connect) is driven
 * asynchronously: the rdma_* calls below trigger CM events whose handler
 * advances _this->connState, so this function mostly waits on eventWaitQ for
 * the state to change.
 *
 * @param commCfg buffer configuration for the new connection
 * @return true if the connection reached ESTABLISHED state
 */
bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress, unsigned short port,
   IBVCommConfig* commCfg)
{
   struct sockaddr_in sin;
   struct sockaddr_in src;
   struct sockaddr_in* srcp;
   /* fix: use the configured timeout (settable via IBVSocket_setTimeouts) instead of the
      compile-time default, consistent with the rdma_resolve_addr/route calls below */
   long connTimeoutJiffies = TimeTk_msToJiffiesSchedulable(_this->timeoutCfg.connectMS);
   Time connElapsed;
   int rc;

   /* note: rejected as stale means remote side still had an old open connection associated with
      our current cm_id. what most likely happened is that the client was reset (i.e. no clean
      disconnect) and our new cm_id after reboot now matches one of the old previous cm_ids.
      => only possible solution seems to be retrying with another cm_id. */
   int numStaleRetriesLeft = IBVSOCKET_STALE_RETRIES_NUM;

   Time_setToNow(&connElapsed);

   for( ; ; ) // stale retry loop
   {
      // set type of service for this connection
#ifdef OFED_HAS_SET_SERVICE_TYPE
      if (_this->typeOfService)
         rdma_set_service_type(_this->cm_id, _this->typeOfService);
#endif // OFED_HAS_SET_SERVICE_TYPE

      /* note: the rest of the connect procedure is invoked through the cmaHandler when the
         corresponding asynchronous events arrive => we just have to wait for the connState
         to change here */

      _this->connState = IBVSOCKETCONNSTATE_CONNECTING;

      // resolve IP address ...
      // (async event handler also automatically resolves route on success)

      sin.sin_addr.s_addr = ipaddress.s_addr;
      sin.sin_family = AF_INET;
      sin.sin_port = htons(port);

      // optionally bind the outgoing connection to a specific source IP
      srcp = NULL;
      if (_this->srcIpAddr.s_addr != 0)
      {
         src.sin_addr = _this->srcIpAddr;
         src.sin_family = AF_INET;
         src.sin_port = 0;
         srcp = &src;
      }

      if(rdma_resolve_addr(_this->cm_id, (struct sockaddr*)srcp, (struct sockaddr*)&sin,
         _this->timeoutCfg.connectMS) )
      {
         ibv_print_info_debug("rdma_resolve_addr failed\n");
         goto err_invalidateSock;
      }

      // wait for async event
      wait_event_interruptible(_this->eventWaitQ,
         _this->connState != IBVSOCKETCONNSTATE_CONNECTING);

      if(_this->connState != IBVSOCKETCONNSTATE_ADDRESSRESOLVED)
         goto err_invalidateSock;

      if(rdma_resolve_route(_this->cm_id, _this->timeoutCfg.connectMS) )
      {
         ibv_print_info_debug("rdma_resolve_route failed.\n");
         goto err_invalidateSock;
      }

      wait_event_interruptible(_this->eventWaitQ,
         _this->connState != IBVSOCKETCONNSTATE_ADDRESSRESOLVED);

      if(_this->connState != IBVSOCKETCONNSTATE_ROUTERESOLVED)
         goto err_invalidateSock;

      // establish connection...
      // (handler calls rdma_connect() )
      Mutex_lock(&_this->cmaMutex);
      rc = __IBVSocket_routeResolvedHandler(_this, _this->cm_id, commCfg, &_this->commContext);
      Mutex_unlock(&_this->cmaMutex);
      if (rc)
      {
         ibv_print_info_debug("route resolved handler failed\n");
         goto err_invalidateSock;
      }

      // wait for async event
      // Note: rdma_connect() can take a very long time (>5m) if the peer's HCA has gone down.
      wait_event_interruptible_timeout(_this->eventWaitQ,
         _this->connState != IBVSOCKETCONNSTATE_ROUTERESOLVED,
         connTimeoutJiffies);

      // test point for failed connections
      if((_this->connState != IBVSOCKETCONNSTATE_ESTABLISHED) &&
         (_this->remapConnectionFailureStatus != 0))
         _this->connState = _this->remapConnectionFailureStatus;

      // check if cm_id was reported as stale by remote side
      if(_this->connState == IBVSOCKETCONNSTATE_REJECTED_STALE)
      {
         bool createIDRes;

         if(!numStaleRetriesLeft)
         { // no more stale retries left
            if(IBVSOCKET_STALE_RETRIES_NUM) // did we have any retries at all
               ibv_print_info("Giving up after %d stale connection retries\n",
                  IBVSOCKET_STALE_RETRIES_NUM);

            goto err_invalidateSock;
         }

         printk_fhgfs_connerr(KERN_INFO, "Stale connection detected. Retrying with a new one...\n");

         // We need to clean up the commContext created in the routeResolvedHandler because
         // the next time through the loop it will get recreated. If this is the final try,
         // then we don't need it anymore.
         Mutex_lock(&_this->cmaMutex);
         __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);
         _this->commContext = NULL;
         createIDRes = __IBVSocket_createNewID(_this);
         Mutex_unlock(&_this->cmaMutex);

         if(!createIDRes)
            goto err_invalidateSock;

         numStaleRetriesLeft--;
         continue;
      }

      if(_this->connState != IBVSOCKETCONNSTATE_ESTABLISHED)
      {
         ibv_print_info_debug("Failed after %d stale connection retries, elapsed = %u\n",
            IBVSOCKET_STALE_RETRIES_NUM - numStaleRetriesLeft, Time_elapsedMS(&connElapsed));
         goto err_invalidateSock;
      }

      // connected
      if(numStaleRetriesLeft != IBVSOCKET_STALE_RETRIES_NUM)
      {
         ibv_print_info_debug("Succeeded after %d stale connection retries, elapsed = %u\n",
            IBVSOCKET_STALE_RETRIES_NUM - numStaleRetriesLeft, Time_elapsedMS(&connElapsed));
      }

      return true;
   }

err_invalidateSock:
   // If we have a comm context, we need to delete it since we can't use it. We set an error state
   // on the socket first, so we stop accepting callbacks that would access the commContext that is
   // in the process of being destroyed.
   _this->errState = -1;

   Mutex_lock(&_this->cmaMutex);
   __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);
   _this->commContext = NULL;
   Mutex_unlock(&_this->cmaMutex);

   return false;
}
/**
 * Bind the socket's cm_id to a local IPv4 address and port.
 *
 * @return true on success; on failure the socket is put into error state
 */
bool IBVSocket_bindToAddr(IBVSocket* _this, struct in_addr ipAddr, unsigned short port)
{
   struct sockaddr_in bindAddr =
   {
      .sin_family = AF_INET,
      .sin_addr = ipAddr,
      .sin_port = htons(port),
   };

   if(rdma_bind_addr(_this->cm_id, (struct sockaddr*)&bindAddr) )
   {
      _this->errState = -1;
      return false;
   }

   return true;
}
/**
 * Put the cm_id into listening state (unlimited/default backlog).
 *
 * @return true on success; on failure the socket is put into error state
 */
bool IBVSocket_listen(IBVSocket* _this)
{
   int listenRes = rdma_listen(_this->cm_id, 0);

   if(listenRes)
   {
      ibv_print_info("rdma_listen failed\n");

      _this->errState = -1;
      return false;
   }

   return true;
}
/**
 * Drain pending send work requests before the connection goes away.
 *
 * @return true if there was nothing to drain (error state / never connected)
 *    or all pending sends completed within the shutdown timeout
 */
bool IBVSocket_shutdown(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 0;
   int timeoutMS = IBVSOCKET_SHUTDOWN_TIMEOUT_MS;

   /* a socket in error state (conn is down anyways) or one that never
      connected has nothing to drain */
   if(_this->errState || !commContext)
      return true;

   if(commContext->incompleteSend.numAvailable)
   { // wait for all incomplete sends
      int waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
         &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);

      if(waitRes < 0)
      {
         ibv_print_info_debug("Waiting for incomplete send requests failed\n");
         return false;
      }
   }

   return true;
}
/**
 * Continues an incomplete former recv() by returning immediately available data from the
 * corresponding buffer.
 *
 * Copies as much of the pending receive as fits into the caller's iterator. If the whole
 * message has been consumed afterwards, the receive buffer is re-posted to the QP.
 *
 * @return number of bytes copied, -1 if the socket is in error state, -EFAULT on
 *    copy or re-post failure (socket is invalidated then)
 */
ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this, struct iov_iter* iter)
{
   IBVCommContext* commContext = _this->commContext;
   struct IBVIncompleteRecv* recv = &commContext->incompleteRecv;
   size_t bufIndex = recv->bufIndex;
   struct IBVBuffer* buffer = &commContext->recvBufs[bufIndex];
   size_t copyRes = 0; // 0 doubles as "copy failed" sentinel after the loop
   ssize_t total = 0;

   if(unlikely(_this->errState) )
      return -1;

   /* NOTE(review): if the loop body never executes, copyRes stays 0 and -EFAULT is
      returned; callers apparently only invoke this while data is actually pending
      and the iterator is non-empty - confirm against __IBVSocket_receiveCheck. */
   while(iov_iter_count(iter) > 0 && recv->totalSize != recv->completedOffset)
   {
      /* the buffer is split into equally sized fragments (buffer->bufferSize bytes
         each); translate the linear completedOffset into (fragment, offset) */
      unsigned page = recv->completedOffset / buffer->bufferSize;
      unsigned offset = recv->completedOffset % buffer->bufferSize;
      // copy at most: what the caller wants, what's left in this fragment, what's left of the msg
      unsigned fragment = MIN(MIN(iov_iter_count(iter), buffer->bufferSize - offset),
         recv->totalSize - recv->completedOffset);

      copyRes = copy_to_iter(buffer->buffers[page] + offset, fragment, iter);
      if(copyRes != fragment)
      { // short copy => fault while writing to the destination iterator
         copyRes = 0;
         break;
      }

      total += fragment;
      recv->completedOffset += fragment;
   }

   if(recv->completedOffset == recv->totalSize)
   { // message fully consumed => hand the buffer back to the receive queue
      int postRes;

      commContext->incompleteRecv.isAvailable = 0;

      postRes = __IBVSocket_postRecv(_this, _this->commContext, bufIndex);
      if(unlikely(postRes) )
         goto err_invalidateSock;
   }

   if(!copyRes)
      goto err_fault;

   return total;

err_invalidateSock:
   ibv_print_info_debug("invalidating connection\n");

err_fault:
   _this->errState = -1;
   return -EFAULT;
}
/**
 * Receive with timeout.
 *
 * A negative timeoutMS means "wait indefinitely", implemented by repeating the
 * receive check with a very long per-iteration timeout.
 *
 * @return number of received bytes on success, -ETIMEDOUT on timeout, -ECOMM on error
 */
ssize_t IBVSocket_recvT(IBVSocket* _this, struct iov_iter* iter, int flags, int timeoutMS)
{
   const bool waitForever = (timeoutMS < 0);
   int waitMS = waitForever ? IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS : timeoutMS;
   int checkRes;

   for( ; ; )
   {
      checkRes = __IBVSocket_receiveCheck(_this, waitMS);
      if(checkRes != 0 || !waitForever)
         break; // data, error, or finite timeout expired
   }

   if(checkRes < 0)
      return -ECOMM;

   if(checkRes == 0)
      return -ETIMEDOUT;

   return __IBVSocket_recvContinueIncomplete(_this, iter);
}
/**
 * @flags supports MSG_DONTWAIT
 * @return number of bytes sent or negative error code (-EAGAIN in case of MSG_DONTWAIT if no data
 * could be sent without blocking)
 */
ssize_t IBVSocket_send(IBVSocket* _this, struct iov_iter* iter, int flags)
{
   IBVCommContext* commContext = _this->commContext;
   int flowControlRes;
   size_t currentBufIndex;
   struct iov_iter source = *iter; // local copy; caller's iter only advanced on success
   int postRes;
   size_t postedLen = 0; // total bytes handed to the HCA so far
   ssize_t currentPostLen;
   int waitRes;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 0;
   int timeoutMS = _this->timeoutCfg.completionMS;

   if(unlikely(_this->errState) )
      return -1;

   // handle flags
   if(flags & MSG_DONTWAIT)
   { // send only as much as we can without blocking
      // note: we adapt the bufLen variable as necessary here for simplicity
      int checkSendRes;
      size_t bufNumLeft;
      size_t bufLenLeft;

      checkSendRes = __IBVSocket_nonblockingSendCheck(_this);
      if(!checkSendRes)
      { // we can't send non-blocking at the moment, caller shall try again later
         return -EAGAIN;
      }
      else
      if(unlikely(checkSendRes < 0) )
         goto err_invalidateSock;

      // buffers available => adapt bufLen (if necessary)
      /* limit to what fits into the currently free send buffers, also respecting
         the on-send flow control budget (numSendBufsLeft) */
      bufNumLeft = MIN(commContext->commCfg.bufNum - commContext->incompleteSend.numAvailable,
         commContext->numSendBufsLeft);
      bufLenLeft = bufNumLeft * commContext->commCfg.bufSize;

      iov_iter_truncate(&source, bufLenLeft);
   }

   // send data cut in buf-sized pieces...
   do
   {
      // make sure the peer still has recv buffers ready (avoids IB RNR condition)
      flowControlRes = __IBVSocket_flowControlOnSendWait(_this,
         _this->timeoutCfg.flowSendMS);
      if(unlikely(flowControlRes <= 0) )
         goto err_invalidateSock;

      // note: we only poll for completed sends if forced or after we used up all (!) available bufs
      if(commContext->incompleteSend.forceWaitForAll ||
         (commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum) )
      { // wait for all (!) incomplete sends
         waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
            &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
         if(waitRes <= 0)
            goto err_invalidateSock;

         commContext->incompleteSend.forceWaitForAll = false;
      }

      currentBufIndex = commContext->incompleteSend.numAvailable;

      // fill the next free send buffer from the source iterator (consumes from source)
      currentPostLen = IBVBuffer_fill(&commContext->sendBufs[currentBufIndex], &source);
      if(currentPostLen < 0)
         goto err_fault;

      commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */

      postRes = __IBVSocket_postSend(_this, currentBufIndex);
      if(unlikely(postRes) )
      {
         commContext->incompleteSend.numAvailable--;
         goto err_invalidateSock;
      }

      postedLen += currentPostLen;
   } while(iov_iter_count(&source));

   // only now advance the caller's iterator, by the amount actually posted
   iov_iter_advance(iter, postedLen);
   return (ssize_t)postedLen;

err_invalidateSock:
   _this->errState = -1;
   return -ECOMM;

err_fault:
   _this->errState = -1;
   return -EFAULT;
}
/**
 * Override the socket's timeouts. Any value <= 0 selects the compile-time
 * default for that particular timeout.
 */
void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS,
   int completionMS, int flowSendMS, int flowRecvMS, int pollMS)
{
   _this->timeoutCfg.connectMS = IBVSOCKET_CONN_TIMEOUT_MS;
   if(connectMS > 0)
      _this->timeoutCfg.connectMS = connectMS;

   _this->timeoutCfg.completionMS = IBVSOCKET_COMPLETION_TIMEOUT_MS;
   if(completionMS > 0)
      _this->timeoutCfg.completionMS = completionMS;

   _this->timeoutCfg.flowSendMS = IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS;
   if(flowSendMS > 0)
      _this->timeoutCfg.flowSendMS = flowSendMS;

   _this->timeoutCfg.flowRecvMS = IBVSOCKET_FLOWCONTROL_ONRECV_TIMEOUT_MS;
   if(flowRecvMS > 0)
      _this->timeoutCfg.flowRecvMS = flowRecvMS;

   _this->timeoutCfg.pollMS = IBVSOCKET_POLL_TIMEOUT_MS;
   if(pollMS > 0)
      _this->timeoutCfg.pollMS = pollMS;

#ifdef BEEGFS_DEBUG
   ibv_print_info_debug("connectMS=%d completionMS=%d flowSendMS=%d flowRecvMS=%d pollMS=%d\n",
      _this->timeoutCfg.connectMS, _this->timeoutCfg.completionMS, _this->timeoutCfg.flowSendMS,
      _this->timeoutCfg.flowRecvMS, _this->timeoutCfg.pollMS);
#endif
}
/**
 * Set the type-of-service value for this socket; a non-zero value is applied
 * to future connections via rdma_set_service_type() in IBVSocket_connectByIP
 * (on platforms that support it).
 */
void IBVSocket_setTypeOfService(IBVSocket* _this, int typeOfService)
{
   _this->typeOfService = typeOfService;
}
/**
 * Test hook: when set to a non-zero value, a failed connect attempt in
 * IBVSocket_connectByIP has its connState remapped to this value (used to
 * simulate specific connection failure modes).
 */
void IBVSocket_setConnectionFailureStatus(IBVSocket* _this, unsigned value)
{
   _this->remapConnectionFailureStatus = value;
}
/**
 * Allocate and set up the per-connection communication context: protection
 * domain, optional DMA memory region, registered send/recv buffers, the two
 * completion queues and the queue pair.
 *
 * On any failure the partially constructed context is destroyed via
 * __IBVSocket_cleanupCommContext().
 *
 * @param cm_id passed separately because it's not necessarily _this->cm_id
 *    (e.g. during accept() of incoming connections)
 * @param outCommContext set to the new context on success, NULL on failure
 * @return true on success
 */
bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
   IBVCommConfig* commCfg, IBVCommContext** outCommContext)
{
   IBVCommContext* commContext;
   struct ib_device* dev = cm_id->device;
   struct ib_qp_init_attr qpInitAttr;
   int qpRes;
   unsigned i;
   unsigned fragmentSize;
   unsigned maxSge;
   unsigned maxWr;
   bool globalRkey = (commCfg->keyType == IBVSOCKETKEYTYPE_UnsafeGlobal);

   commContext = kzalloc(sizeof(*commContext), GFP_KERNEL);
   if(!commContext)
      goto err_cleanup;

   ibv_print_info_debug("Alloc CommContext @ %p\n", commContext);

   // prepare recv and send event notification

   init_waitqueue_head(&commContext->recvCompWaitQ);
   init_waitqueue_head(&commContext->sendCompWaitQ);

   atomic_set(&commContext->recvCompEventCount, 0);
   atomic_set(&commContext->sendCompEventCount, 0);

   // protection domain...
   // IB_PD_UNSAFE_GLOBAL_RKEY is still present as of kernel 6.3.
#ifdef OFED_UNSAFE_GLOBAL_RKEY
   commContext->pd = ib_alloc_pd(dev, globalRkey? IB_PD_UNSAFE_GLOBAL_RKEY : 0);
#else
   // platform has no global-rkey support => configuration asking for it must fail
   if (globalRkey)
   {
      ibv_print_info("Unsafe global rkey not supported on this platform.");
      goto err_cleanup;
   }
   commContext->pd = ib_alloc_pd(dev, 0);
#endif

   if(IS_ERR(commContext->pd) )
   {
      ibv_print_info("Couldn't allocate PD. ErrCode: %ld\n", PTR_ERR(commContext->pd) );
      commContext->pd = NULL; // cleanup must not dereference the ERR_PTR
      goto err_cleanup;
   }

#ifdef OFED_UNSAFE_GLOBAL_RKEY
   if (globalRkey)
      commContext->checkConnRkey = commContext->pd->unsafe_global_rkey;
#endif

   if (commCfg->keyType == IBVSOCKETKEYTYPE_UnsafeDMA)
   {
      // DMA system mem region...
      // (Note: IB spec says:
      // "The consumer is not allowed to assign remote-write (or remote-atomic) to
      // a memory region that has not been assigned local-write.")
      // ib_get_dma_mr() goes away in kernel 4.9. Is is still present in MOFED 5.9.
      // If not using the global rkey, then either ib_get_dmr_mr() or an allocated ib_mr
      // needs to be used.
#ifdef OFED_IB_GET_DMA_MR
      commContext->dmaMR = ib_get_dma_mr(commContext->pd,
         IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
      if(IS_ERR_OR_NULL(commContext->dmaMR) )
      {
         ibv_print_info("ib_get_dma_mr failed. ErrCode: %ld\n", PTR_ERR(commContext->dmaMR) );
         commContext->dmaMR = NULL;
         goto err_cleanup;
      }
      commContext->checkConnRkey = commContext->dmaMR->rkey;
#else
      ibv_print_info("RDMA keyType is dma and ib_get_dma_mr() not supported on this platform.\n");
      goto err_cleanup;
#endif
   }

#ifdef BEEGFS_DEBUG
   ibv_print_info("%s: checkConnRkey = %u\n", __func__, commContext->checkConnRkey);
#endif

   // alloc and register buffers...

   commContext->commCfg = *commCfg;

   commContext->recvBufs = kzalloc(commCfg->bufNum * sizeof(struct IBVBuffer), GFP_KERNEL);
   if(!commContext->recvBufs)
   {
      ibv_print_info("couldn't prepare receive buffer list\n");
      goto err_cleanup;
   }

   for(i=0; i < commCfg->bufNum; i++)
   {
      if(!IBVBuffer_init(&commContext->recvBufs[i], commContext, commCfg->bufSize,
            commCfg->fragmentSize, DMA_FROM_DEVICE) )
      {
         ibv_print_info("couldn't prepare recvBuf #%d\n", i + 1);
         goto err_cleanup;
      }
   }

   commContext->sendBufs = kzalloc(commCfg->bufNum * sizeof(struct IBVBuffer), GFP_KERNEL);
   if(!commContext->sendBufs)
   {
      ibv_print_info("couldn't prepare send buffer list\n");
      goto err_cleanup;
   }

   for(i=0; i < commCfg->bufNum; i++)
   {
      if(!IBVBuffer_init(&commContext->sendBufs[i], commContext, commCfg->bufSize,
            commCfg->fragmentSize, DMA_TO_DEVICE) )
      {
         ibv_print_info("couldn't prepare sendBuf #%d\n", i + 1);
         goto err_cleanup;
      }
   }

   // small buffer serving as the local target of checkConnection() RDMA reads
   if(!IBVBuffer_init(&commContext->checkConBuffer, commContext, sizeof(u64),
         0, DMA_TO_DEVICE) )
   {
      ibv_print_info("couldn't alloc dma control memory region\n");
      goto err_cleanup;
   }

   // init flow control v2 (to avoid long receiver-not-ready timeouts)
   /* note: we use -1 because the last buf might not be read by the user (eg during
      nonblockingRecvCheck) and so it might not be immediately available again. */
   commContext->numReceivedBufsLeft = commCfg->bufNum - 1;
   commContext->numSendBufsLeft = commCfg->bufNum - 1;

   // create completion queues...

   commContext->recvCQ = __IBVSocket_createCompletionQueue(cm_id->device,
      __IBVSocket_recvCompletionHandler, __IBVSocket_cqRecvEventHandler,
      _this, commCfg->bufNum);
   if(IS_ERR(commContext->recvCQ) )
   {
      ibv_print_info("couldn't create recv CQ. ErrCode: %ld\n", PTR_ERR(commContext->recvCQ) );
      commContext->recvCQ = NULL;
      goto err_cleanup;
   }

   // note: 1+commCfg->bufNum here for the checkConnection() RDMA read
   commContext->sendCQ = __IBVSocket_createCompletionQueue(cm_id->device,
      __IBVSocket_sendCompletionHandler, __IBVSocket_cqSendEventHandler,
      _this, 1+commCfg->bufNum);
   if(IS_ERR(commContext->sendCQ) )
   {
      ibv_print_info("couldn't create send CQ. ErrCode: %ld\n", PTR_ERR(commContext->sendCQ) );
      commContext->sendCQ = NULL;
      goto err_cleanup;
   }

   // fragmentSize 0 means unfragmented buffers => one SGE per whole buffer
   fragmentSize = commCfg->fragmentSize;
   if (fragmentSize == 0)
      fragmentSize = commCfg->bufSize;
   maxSge = MAX(IBVSOCKET_MIN_SGE, commCfg->bufSize / fragmentSize + 1);
   maxWr = MAX(IBVSOCKET_MIN_WR, commCfg->bufNum + 1);

   // note: 1+commCfg->bufNum here for the checkConnection() RDMA read
   memset(&qpInitAttr, 0, sizeof(qpInitAttr) );

   qpInitAttr.event_handler = __IBVSocket_qpEventHandler;
   qpInitAttr.send_cq = commContext->sendCQ;
   qpInitAttr.recv_cq = commContext->recvCQ;
   qpInitAttr.qp_type = IB_QPT_RC;
   qpInitAttr.sq_sig_type = IB_SIGNAL_REQ_WR;
   qpInitAttr.cap.max_send_wr = maxWr;
   qpInitAttr.cap.max_recv_wr = maxWr;
   qpInitAttr.cap.max_send_sge = maxSge;
   qpInitAttr.cap.max_recv_sge = maxSge;
   qpInitAttr.cap.max_inline_data = 0;

   qpRes = rdma_create_qp(cm_id, commContext->pd, &qpInitAttr);
   if(qpRes)
   {
      ibv_print_info("couldn't create QP. ErrCode: %d\n", qpRes);
      goto err_cleanup;
   }

   commContext->qp = cm_id->qp;

   // prepare event notification...

   // initial event notification requests
   if(ib_req_notify_cq(commContext->recvCQ, IB_CQ_NEXT_COMP) )
   {
      ibv_print_info("couldn't request CQ notification\n");
      goto err_cleanup;
   }

   if(ib_req_notify_cq(commContext->sendCQ, IB_CQ_NEXT_COMP) )
   {
      ibv_print_info("couldn't request CQ notification\n");
      goto err_cleanup;
   }

   *outCommContext = commContext;
   return true;

   // error handling

err_cleanup:
   __IBVSocket_cleanupCommContext(cm_id, commContext);

   *outCommContext = NULL;
   return false;
}
/**
 * Destroy a communication context: QP, CQs, registered buffers, MR and PD.
 *
 * Safe to call with commContext==NULL (no-op) and with partially constructed
 * contexts (each resource is checked before destruction).
 */
void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext)
{
   unsigned i;
   struct ib_device* dev;

   ibv_print_info_debug("Free CommContext @ %p\n", commContext);

   if(!commContext)
      return;

   // without a PD, no device-related resources can have been created yet
   dev = commContext->pd ? commContext->pd->device : NULL;

   if(!dev)
      goto cleanup_no_dev;

   if(cm_id && commContext->qp && cm_id->qp)
      rdma_destroy_qp(cm_id);

   // ib_destroy_cq() returns void on newer kernels, int on older ones
   if(commContext->sendCQ)
#ifdef OFED_IB_DESTROY_CQ_IS_VOID
      ib_destroy_cq(commContext->sendCQ);
#else
   {
      int destroyRes = ib_destroy_cq(commContext->sendCQ);
      if (unlikely(destroyRes) )
      {
         ibv_pwarn("sendCQ destroy failed: %d\n", destroyRes);
         dump_stack();
      }
   }
#endif

   if(commContext->recvCQ)
#ifdef OFED_IB_DESTROY_CQ_IS_VOID
      ib_destroy_cq(commContext->recvCQ);
#else
   {
      int destroyRes = ib_destroy_cq(commContext->recvCQ);
      if (unlikely(destroyRes) )
      {
         ibv_pwarn("recvCQ destroy failed: %d\n", destroyRes);
         dump_stack();
      }
   }
#endif

   IBVBuffer_free(&commContext->checkConBuffer, commContext);

   // buffer arrays may only be partially initialized; free what exists
   for(i=0; i < commContext->commCfg.bufNum; i++)
   {
      if(commContext->recvBufs)
         IBVBuffer_free(&commContext->recvBufs[i], commContext);
      if(commContext->sendBufs)
         IBVBuffer_free(&commContext->sendBufs[i], commContext);
   }

   SAFE_KFREE(commContext->recvBufs);
   SAFE_KFREE(commContext->sendBufs);

   if(commContext->dmaMR)
      ib_dereg_mr(commContext->dmaMR);

   if(commContext->pd)
      ib_dealloc_pd(commContext->pd);

cleanup_no_dev:
   kfree(commContext);
}
/**
 * Initializes a (local) IBVCommDest.
 *
 * Fills the connection-handshake private data with the verification string,
 * protocol version and this side's buffer/rkey parameters; multi-byte fields
 * are stored in little-endian on-wire order.
 *
 * @return always true
 */
bool __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest)
{
   memcpy(outDest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN);

   outDest->protocolVersion = cpu_to_le64(IBVSOCKET_PRIVATEDATA_PROTOCOL_VER);

   outDest->recvBufNum = cpu_to_le32(commContext->commCfg.bufNum);
   outDest->recvBufSize = cpu_to_le32(commContext->commCfg.bufSize);

   // rkey/vaddr allow the peer to RDMA-read our checkConnection buffer
   outDest->rkey = cpu_to_le32(commContext->checkConnRkey);
   outDest->vaddr = cpu_to_le64(commContext->checkConBuffer.lists[0].addr);

#ifdef BEEGFS_DEBUG
   ibv_print_info("%s: rkey=%u vaddr=%llu", __func__, outDest->rkey, outDest->vaddr);
#endif

   return true;
}
/**
 * Parses and checks a (remote) IBVCommDest.
 *
 * @param buf should usually be the private_data of the connection handshake
 * @param outDest will be kmalloced (if true is returned) and needs to be kfree'd by the
 * caller
 * @return true if data is okay, false otherwise
 */
bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest)
{
   const IBVCommDest* src = buf;
   IBVCommDest* parsed = NULL;

   /* "bufLen < ..." (not "!="): the handshake data may carry extra padding */
   if(!buf || (bufLen < sizeof(*parsed) ) )
   {
      ibv_print_info("Bad private data size. length: %zu\n", bufLen);

      return false;
   }

   parsed = kmalloc(sizeof(*parsed), GFP_ATOMIC);
   if(!parsed)
      return false;

   *outDest = parsed;
   *parsed = *src;

   // convert on-wire little-endian fields to host byte order
   parsed->protocolVersion = le64_to_cpu(parsed->protocolVersion);
   parsed->vaddr = le64_to_cpu(parsed->vaddr);
   parsed->rkey = le32_to_cpu(parsed->rkey);
   parsed->recvBufNum = le32_to_cpu(parsed->recvBufNum);
   parsed->recvBufSize = le32_to_cpu(parsed->recvBufSize);

   // verify magic string and protocol version
   if(memcmp(parsed->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN) )
      goto err_cleanup;

   if(parsed->protocolVersion != IBVSOCKET_PRIVATEDATA_PROTOCOL_VER)
      goto err_cleanup;

   return true;

err_cleanup:
   kfree(parsed);
   *outDest = NULL;

   return false;
}
/**
 * Append buffer to receive queue.
 *
 * @param commContext passed seperately because it's not the _this->commContext during
 * accept() of incoming connections
 * @return 0 on success, -1 on error
 */
int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex)
{
   struct ib_recv_wr wr;
   _bad_recv_wr bad_wr;
   int postRes;

   /* note: the recv buffer's SGE list is set up once by IBVBuffer_init() and never
      modified afterwards, so the buffer can simply be re-posted as-is.
      (fix: a previous version overwrote sendBufs[bufIndex].lists[0].length here,
      which clobbered the SGE length of an unrelated - possibly in-flight - send
      buffer instead of touching the recv buffer being posted.) */

   wr.next = NULL;
   wr.wr_id = bufIndex;
   wr.sg_list = commContext->recvBufs[bufIndex].lists;
   wr.num_sge = commContext->recvBufs[bufIndex].listLength;

   postRes = ib_post_recv(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      ibv_print_info("ib_post_recv failed. ErrCode: %d\n", postRes);
      return -1;
   }

   return 0;
}
/**
 * Verify the connection is alive by issuing an RDMA read of the peer's
 * check-connection buffer (vaddr/rkey exchanged during the handshake) and
 * waiting for its completion.
 *
 * @return 0 on success, -1 on error (the socket is put into error state then)
 */
int IBVSocket_checkConnection(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;
   /* kernels with OFED_SPLIT_WR moved the rdma-specific fields into struct
      ib_rdma_wr, which embeds the plain send wr; these macros hide that
      layout difference */
#ifdef OFED_SPLIT_WR
# define rdma_of(wr) (wr)
# define wr_of(wr) (wr.wr)
   struct ib_rdma_wr wr;
#else
# define rdma_of(wr) (wr.wr.rdma)
# define wr_of(wr) (wr)
   struct ib_send_wr wr;
#endif
   _bad_send_wr bad_wr;
   int postRes;
   int waitRes;
   int timeoutMS = _this->timeoutCfg.completionMS;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 1; // we expect exactly one RDMA read completion

   rdma_of(wr).remote_addr = _this->remoteDest->vaddr;
   rdma_of(wr).rkey = _this->remoteDest->rkey;
   wr_of(wr).wr_id = 0;
   wr_of(wr).sg_list = commContext->checkConBuffer.lists;
   wr_of(wr).num_sge = commContext->checkConBuffer.listLength;
   wr_of(wr).opcode = IB_WR_RDMA_READ;
   wr_of(wr).send_flags = IB_SEND_SIGNALED;
   wr_of(wr).next = NULL;

   postRes = ib_post_send(commContext->qp, &wr_of(wr), &bad_wr);
   if(unlikely(postRes) )
   {
      ibv_print_info("ib_post_send() failed. ErrCode: %d\n", postRes);

      goto error;
   }

   waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
      &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
   if(unlikely(waitRes <= 0) )
      goto error;

   commContext->incompleteSend.forceWaitForAll = false;

   return 0;

#undef rdma_of
#undef wr_of

error:
   _this->errState = -1;

   return -1;
}
/**
 * Post one send buffer as a signaled SEND work request.
 *
 * Note: contains flow control.
 *
 * @return 0 on success, -1 on error
 */
int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex)
{
   IBVCommContext* commContext = _this->commContext;
   struct IBVBuffer* sendBuf = &commContext->sendBufs[bufIndex];
   struct ib_send_wr wr;
   _bad_send_wr bad_wr;
   int postRes;

   wr.next = NULL;
   wr.wr_id = bufIndex;
   wr.opcode = IB_WR_SEND;
   wr.send_flags = IB_SEND_SIGNALED;
   wr.sg_list = sendBuf->lists;
   wr.num_sge = sendBuf->listLength;

   postRes = ib_post_send(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      ibv_print_info("ib_post_send() failed. ErrCode: %d\n", postRes);

      return -1;
   }

   // on-send flow control bookkeeping
   __IBVSocket_flowControlOnSendUpdateCounters(_this);

   return 0;
}
/**
 * Receive work completion.
 *
 * Note: contains flow control.
 *
 * @param timeoutMS 0 for non-blocking
 * @param outWC receives the completion element on success
 * @return 1 on success, 0 on timeout, <0 on error
 */
int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ib_wc* outWC)
{
   IBVCommContext* commContext = _this->commContext;
   int waitRes;
   size_t bufIndex;

   waitRes = __IBVSocket_waitForRecvCompletionEvent(_this, timeoutMS, outWC);
   if(waitRes <= 0)
   { // (note: waitRes==0 can often happen, because we call this with timeoutMS==0)
      if(unlikely(waitRes < 0) )
      {
         if(waitRes != -ERESTARTSYS)
         { // only print message if user didn't press "<CTRL>-C"
            ibv_print_info("retrieval of completion event failed. result: %d\n", waitRes);
         }
      }

      return waitRes;
   }

   // we got something...

   if(unlikely(outWC->status != IB_WC_SUCCESS) )
   {
      printk_fhgfs_connerr(KERN_INFO, "%s: Connection error (wc_status: %d; msg: %s)\n",
         "IBVSocket (recv work completion)", (int)outWC->status,
         __IBVSocket_wcStatusStr(outWC->status) );

      return -1;
   }

   // sanity check: wr_id is the recv buffer index this completion belongs to
   bufIndex = outWC->wr_id;

   if(unlikely(bufIndex >= commContext->commCfg.bufNum) )
   {
      ibv_print_info("Completion for unknown/invalid wr_id %d\n", (int)outWC->wr_id);

      return -1;
   }

   // receive completed

   // flow control: possibly answer with a control msg now that another of our recv bufs is in use
   if(unlikely(__IBVSocket_flowControlOnRecv(_this, _this->timeoutCfg.flowRecvMS) ) )
   {
      ibv_print_info_debug("got an error from flowControlOnRecv().\n");

      return -1;
   }

   return 1;
}
/**
 * Intention: Avoid IB rnr by sending control msg when (almost) all our recv bufs are used up to
 * show that we got our new recv bufs ready.
 *
 * @timeoutMS don't set this to 0, we really need to wait here sometimes
 * @return 0 on success, -1 on error
 */
int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;

   // we received a packet, so peer has received all of our currently pending data => reset counter
   commContext->numSendBufsLeft = commContext->commCfg.bufNum - 1; /* (see
      createCommContext() for "-1" reason) */

   // send control packet if recv counter expires...

#ifdef BEEGFS_DEBUG
   if(!commContext->numReceivedBufsLeft)
      ibv_print_info("BUG: numReceivedBufsLeft underflow!\n");
#endif // BEEGFS_DEBUG

   commContext->numReceivedBufsLeft--;
   if(!commContext->numReceivedBufsLeft)
   { // counter expired => notify the peer that our recv bufs are ready again
      size_t currentBufIndex;
      int postRes;

      // may need a free send buffer first, so possibly wait for pending send completions
      if(commContext->incompleteSend.forceWaitForAll ||
         (commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum) )
      { // wait for all (!) incomplete sends
         /* note: it's ok that all send bufs are used up, because it's possible that we do a lot of
            recv without the user sending any data in between (so the bufs were actually used up by
            flow control). */
         unsigned numWaitWrites = 0;
         unsigned numWaitReads = 0;

         int waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
            &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
         if(waitRes <= 0)
         {
            ibv_print_info("problem encountered during waitForTotalSendCompletion(). ErrCode: %d\n",
               waitRes);
            return -1;
         }

         commContext->incompleteSend.forceWaitForAll = false;
      }

      currentBufIndex = commContext->incompleteSend.numAvailable;
      commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */

      // shrink the send buffer's SGE to the 1-byte control message length
      commContext->sendBufs[currentBufIndex].lists[0].length = IBVSOCKET_FLOWCONTROL_MSG_LEN;
      commContext->sendBufs[currentBufIndex].listLength = 1;

      postRes = __IBVSocket_postSend(_this, currentBufIndex);
      if(unlikely(postRes) )
      {
         commContext->incompleteSend.numAvailable--;
         return -1;
      }

      // note: numReceivedBufsLeft is reset during postSend() flow control
   }

   return 0;
}
/**
 * Update the flow control counters after a packet has been sent.
 *
 * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up.
 *
 * Note: This covers only the bookkeeping half of on-send flow control; the waiting half lives in
 * _flowControlOnSendWait().
 */
void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this)
{
   IBVCommContext* ctx = _this->commContext;

   /* sending a packet implies the peer has handed us everything it had in flight, so the
      receive-side counter starts over. (see createCommContext() for the "-1" reason.) */
   ctx->numReceivedBufsLeft = ctx->commCfg.bufNum - 1;

   #ifdef BEEGFS_DEBUG
      if(ctx->numSendBufsLeft == 0)
         ibv_print_info("BUG: numSendBufsLeft underflow!\n");
   #endif

   // one more of the peer's recv bufs is now occupied by our outgoing packet
   ctx->numSendBufsLeft--;
}
/**
 * Receive the peer's flow control packet when our send-side counter indicates that (almost) all
 * of the peer's recv bufs are used up.
 *
 * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up.
 *
 * @param timeoutMS may be 0 for non-blocking operation, otherwise typically
 *    IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS
 * @return >0 on success, 0 on timeout (waiting for flow control packet from peer), <0 on error
 */
int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   struct ib_wc wc;
   int recvRes;
   size_t bufIndex;
   int postRecvRes;

   if(commContext->numSendBufsLeft)
      return 1; // flow control not triggered yet

   // counter expired => the next incoming packet must be the peer's flow control msg
   recvRes = __IBVSocket_recvWC(_this, timeoutMS, &wc);
   if(recvRes <= 0)
      return recvRes;

   bufIndex = wc.wr_id; // wr_id carries the recv buf index (set when the buf was posted)

   if(unlikely(wc.byte_len != IBVSOCKET_FLOWCONTROL_MSG_LEN) )
   { // error (bad length): received something other than the expected control msg
      ibv_print_info("received flow control packet length mismatch %d\n", (int)wc.byte_len);
      return -1;
   }

   // hand the recv buf back to the HCA for the next incoming packet
   postRecvRes = __IBVSocket_postRecv(_this, _this->commContext, bufIndex);
   if(postRecvRes)
      return -1;

   // note: numSendBufsLeft is reset during recvWC() (if it actually received a packet)
   return 1;
}
/**
 * Wait until a recv work completion can be reaped from the recv CQ, periodically re-checking
 * the connection to detect a silently disconnected peer.
 *
 * @param timeoutMS 0 means a single non-sleeping poll of the CQ (callable from atomic-ish
 *    contexts); otherwise total time to wait
 * @param outWC receives the reaped work completion on success
 * @return 1 on available data, 0 on timeout, <0 on error
 */
int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS, struct ib_wc* outWC)
{
   IBVCommContext* commContext = _this->commContext;
   long waitRes;
   int numEvents = 0;
   int checkRes;

   // special quick path: other than in the userspace version of this method, we only need the
   // quick path when timeoutMS==0, because then we might have been called from a special
   // context, in which we don't want to sleep
   if(!timeoutMS)
      return ib_poll_cq(commContext->recvCQ, 1, outWC);

   while(timeoutMS != 0)
   {
      /* note: we use pollTimeoutMS to check the conn every few secs (otherwise we might
         wait for a very long time in case the other side disconnected silently) */
      int pollTimeoutMS = MIN(_this->timeoutCfg.pollMS, timeoutMS);
      long pollTimeoutJiffies = TimeTk_msToJiffiesSchedulable(pollTimeoutMS);

      /* note: don't think about ib_peek_cq here, because it is not implemented in the drivers. */

      waitRes = wait_event_timeout(commContext->recvCompWaitQ,
         (numEvents = ib_poll_cq(commContext->recvCQ, 1, outWC) ), pollTimeoutJiffies);

      /* NOTE(review): wait_event_timeout() returns remaining jiffies and never -ERESTARTSYS
         (that would be the _interruptible variant), so this branch effectively triggers on
         fatal_signal_pending() alone and then returns a non-negative waitRes — confirm intent. */
      if(unlikely(waitRes == -ERESTARTSYS || fatal_signal_pending(current)))
      { // signal pending
         ibv_print_info_debug("wait for recvCompEvent ended by pending signal\n");
         return waitRes;
      }

      if(likely(numEvents) )
      { // we got something
         return numEvents;
      }

      // timeout => verify the peer is still reachable before sleeping again
      checkRes = IBVSocket_checkConnection(_this);
      if(checkRes < 0)
         return -ECONNRESET;

      timeoutMS -= pollTimeoutMS;
   } // end of while-loop

   return 0;
}
/**
 * Wait until the send completion event counter changes, i.e. until the send completion handler
 * has been invoked at least once since oldSendCount was sampled.
 *
 * @param oldSendCount old sendCompEventCount
 * @return 1 on available data, 0 on timeout, -1 on error
 */
int __IBVSocket_waitForSendCompletionEvent(IBVSocket* _this, int oldSendCount, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   long waitRes;

   while(timeoutMS != 0)
   {
      // Note: We use pollTimeoutMS to check the conn every few secs (otherwise we might
      // wait for a very long time in case the other side disconnected silently)
      int pollTimeoutMS = MIN(_this->timeoutCfg.pollMS, timeoutMS);
      long pollTimeoutJiffies = TimeTk_msToJiffiesSchedulable(pollTimeoutMS);

      waitRes = wait_event_timeout(commContext->sendCompWaitQ,
         atomic_read(&commContext->sendCompEventCount) != oldSendCount, pollTimeoutJiffies);

      /* NOTE(review): wait_event_timeout() never returns -ERESTARTSYS (only the _interruptible
         variants do), so this branch triggers on fatal_signal_pending() alone — confirm intent. */
      if(unlikely(waitRes == -ERESTARTSYS || fatal_signal_pending(current)))
      { // signal pending
         ibv_print_info_debug("wait for sendCompEvent ended by pending signal\n");
         return -1;
      }

      // re-check the condition to distinguish a genuine wakeup from a timeout
      if(likely(atomic_read(&commContext->sendCompEventCount) != oldSendCount) )
         return 1;

      // timeout
      timeoutMS -= pollTimeoutMS;
   }

   return 0;
}
/**
 * Reap work completions from the send CQ until all given counters have reached zero.
 *
 * @param numSendElements also used as out-param to return the remaining number
 * @param numWriteElements in/out like numSendElements. NOTE(review): the switch below has no
 *    IB_WC_RDMA_WRITE case, so this counter is never decremented — a nonzero value could never
 *    be satisfied and would loop until timeout; visible callers always pass 0. Confirm intent.
 * @param numReadElements in/out like numSendElements, for RDMA read completions
 * @param timeoutMS 0 for non-blocking; this is a soft timeout that is reset after each received
 *    completion
 * @return 1 if all completions received, 0 if completions missing (in case you wanted non-blocking)
 *    or -1 in case of an error.
 */
int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this,
   unsigned* numSendElements, unsigned* numWriteElements, unsigned* numReadElements, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   int numElements;
   int waitRes;
   int oldSendCount;
   int i;
   size_t bufIndex;
   struct ib_wc wc[2];

   do
   {
      // sample the event counter before polling, so a concurrent completion wakes us below
      oldSendCount = atomic_read(&commContext->sendCompEventCount);

      numElements = ib_poll_cq(commContext->sendCQ, 2, wc);
      if(unlikely(numElements < 0) )
      {
         ibv_print_info("bad ib_poll_cq result: %d\n", numElements);
         return -1;
      }
      else
      if(!numElements)
      { // no completions available yet => wait
         if(!timeoutMS)
            return 0;

         waitRes = __IBVSocket_waitForSendCompletionEvent(_this, oldSendCount, timeoutMS);
         if(likely(waitRes > 0) )
            continue;

         return waitRes;
      }

      // we got something...

      // for each completion element
      for(i=0; i < numElements; i++)
      {
         if(unlikely(wc[i].status != IB_WC_SUCCESS) )
         {
            printk_fhgfs_connerr(KERN_INFO, "%s: Connection error (wc_status: %d; msg: %s)\n",
               "IBVSocket (wait for total send completion)", (int)(wc[i].status),
               __IBVSocket_wcStatusStr(wc[i].status) );

            return -1;
         }

         switch(wc[i].opcode)
         {
            case IB_WC_SEND:
            {
               // wr_id of a send carries the send buf index
               bufIndex = wc[i].wr_id;

               if(unlikely(bufIndex >= commContext->commCfg.bufNum) )
               {
                  ibv_print_info("bad send completion wr_id 0x%x\n", (int)wc[i].wr_id);
                  return -1;
               }

               if(likely(*numSendElements) )
                  (*numSendElements)--;
               else
               {
                  ibv_print_info("received bad/unexpected send completion\n");
                  return -1;
               }
            } break;

            case IB_WC_RDMA_READ:
            {
               // RDMA reads are posted with wr_id 0 (no buf index attached)
               if(unlikely(wc[i].wr_id != 0) )
               {
                  ibv_print_info("bad read completion wr_id 0x%x\n", (int)wc[i].wr_id);
                  return -1;
               }

               if(likely(*numReadElements) )
                  (*numReadElements)--;
               else
               {
                  ibv_print_info("received bad/unexpected RDMA read completion\n");
                  return -1;
               }
            } break;

            default:
            {
               ibv_print_info("received bad/unexpected completion opcode %d\n", wc[i].opcode);
               return -1;
            } break;
         } // end of switch
      } // end of for-loop
   } while(*numSendElements || *numWriteElements || *numReadElements);

   return 1;
}
/**
 * Check whether incoming data is available, storing any received packet as incompleteRecv so a
 * later _recv() can consume it.
 *
 * @param timeoutMS how long to wait for flow control and incoming data (0 for non-blocking)
 * @return <0 on error, 0 if recv would block, >0 if recv would not block
 */
int __IBVSocket_receiveCheck(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   struct ib_wc wc;
   int flowControlRes;
   int recvRes;

   if(unlikely(_this->errState) )
      return -1;

   // a packet from an earlier check is still buffered => data available immediately
   if(commContext->incompleteRecv.isAvailable)
      return 1;

   // check whether we have a pending on-send flow control packet that needs to be received first
   flowControlRes = __IBVSocket_flowControlOnSendWait(_this, timeoutMS);
   if(unlikely(flowControlRes < 0) )
   {
      ibv_print_info_debug("got an error from flowControlOnSendWait(). ErrCode: %d\n",
         flowControlRes);
      goto err_invalidateSock;
   }

   if(!flowControlRes)
      return 0;

   // recv one packet (if available) and add it as incompleteRecv
   recvRes = __IBVSocket_recvWC(_this, timeoutMS, &wc);
   if(unlikely(recvRes < 0) )
   {
      ibv_print_info_debug("got an error from __IBVSocket_recvWC(). ErrCode: %d\n", recvRes);
      goto err_invalidateSock;
   }

   if(!recvRes)
      return 0;

   // we got something => prepare to continue later
   commContext->incompleteRecv.totalSize = wc.byte_len;
   commContext->incompleteRecv.bufIndex = wc.wr_id;
   commContext->incompleteRecv.completedOffset = 0;
   commContext->incompleteRecv.isAvailable = 1;

   return 1;

err_invalidateSock:
   ibv_print_info_debug("invalidating connection\n");
   _this->errState = -1;
   return -1;
}
/**
 * Check (without blocking) whether a send buf can be posted right now, reaping completed sends
 * and honoring on-send flow control.
 *
 * @return <0 on error, 0 if send would block, >0 if send would not block
 */
int __IBVSocket_nonblockingSendCheck(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;
   int flowControlRes;
   int waitRes;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 0;
   int timeoutMS = 0; // non-blocking: completions are only polled, never waited for

   if(unlikely(_this->errState) )
      return -1;

   // check whether we have a pending on-send flow control packet that needs to be received first
   flowControlRes = __IBVSocket_flowControlOnSendWait(_this, 0);
   if(unlikely(flowControlRes < 0) )
      goto err_invalidateSock;

   if(!flowControlRes)
      return flowControlRes;

   // a free send buf exists (and no forced drain is pending) => sending would not block
   if(!commContext->incompleteSend.forceWaitForAll &&
      (commContext->incompleteSend.numAvailable < commContext->commCfg.bufNum) )
      return 1;

   commContext->incompleteSend.forceWaitForAll = true; // always setting saves an "if" below

   // we have to wait for completions before we can send...
   waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
      &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
   if(unlikely(waitRes < 0) )
      goto err_invalidateSock;

   if(waitRes > 0)
      commContext->incompleteSend.forceWaitForAll = false; // no more completions pending

   return waitRes;

err_invalidateSock:
   ibv_print_info_debug("invalidating connection\n");
   _this->errState = -1;
   return -1;
}
/**
 * poll()-style readiness check with waitqueue registration for sleeping callers.
 *
 * Note: Call this only once with finishPoll==true (=> non-blocking) or multiple times with
 * finishPoll==true in the last call from the current thread (for cleanup).
 * Note: It's safe to call this multiple times with finishPoll==true (in case that caller does
 * not want to sleep anymore).
 *
 * @param events the event flags you are interested in (POLL...)
 * @param finishPoll true for cleanup if you don't call poll again from this thread; (it's also ok
 *    to set this to true if you call poll only once and want to avoid blocking)
 * @return mask all available revents (like poll(), POLL... flags), may not only be the events
 *    that were requested (and can also be the error events)
 */
unsigned long IBVSocket_poll(IBVSocket* _this, short events, bool finishPoll)
{
   /* note: there are two possible uses for finishPoll==true:
         1) this method is called multiple times and finishPoll is true in the last loop
            (for cleanup)
         2) this method is called only once non-blocking and finishPoll is true to avoid
            blocking */

   /* note: it's good to call prepare_to_wait more than once for the same thread (e.g. if
      caller woke up from schedule() and decides to sleep again) */

   /* note: condition needs to be re-checked after prepare_to_wait to avoid the race when it became
      true between the initial check and the call to prepare_to_wait */

   /* note: we assume that after we returned a positive result, the caller will not try to sleep
      (but will still call this again with finishPoll==true to cleanup) */

   IBVCommContext* commContext = _this->commContext;
   unsigned long revents = 0; // return value

   if(unlikely(_this->errState) )
   {
      ibv_print_info_debug("called for an erroneous connection. ErrCode: %d\n", _this->errState);
      revents |= POLLERR;
      return revents;
   }

   if(!commContext->numSendBufsLeft)
   { /* special case: on-send flow control triggered, so we actually need to wait for incoming data
        even though the user set POLLOUT */
      events |= POLLIN;

      /* note: the actual checks for POLLIN will be handled below like in the normal POLLIN case.
         (we just want need to wake up when there is incoming data.) */

      /* note: this only works efficiently, because the beegfs client just checks for any revent and
         then calls _send() again. checking for POLLOUT explicitly would result in another call to
         _poll() and then we would notice that we can send now => would work, but is less efficient,
         obviously. */
   }

   /* note: the "if(POLLIN || recvWaitInitialized)" is necessary because !numSendBufsLeft might have
      triggered unrequested POLLIN check and we need to clean that up later. */
   if( (events & POLLIN) || (commContext->recvWaitInitialized) )
   {
      // initial check and wait preparations for incoming data

      if(__IBVSocket_receiveCheck(_this, 0) )
      { // immediate data available => no need to prepare wait
         revents |= POLLIN;
      }
      else
      if(!finishPoll && !commContext->recvWaitInitialized)
      { // no incoming data and caller is planning to schedule()

         #ifdef BEEGFS_DEBUG
            if(waitqueue_active(&commContext->recvCompWaitQ) )
               ibv_print_info("BUG: recvCompWaitQ was not empty\n");
         #endif // BEEGFS_DEBUG

         commContext->recvWaitInitialized = true;

         // register ourselves on the recv waitqueue so a completion handler wakes us up
         init_waitqueue_entry(&commContext->recvWait, current);
         add_wait_queue(&commContext->recvCompWaitQ, &commContext->recvWait);

         // re-check after registration to close the race with a just-arrived completion
         if(__IBVSocket_receiveCheck(_this, 0) )
            revents |= POLLIN;
      }

      // cleanup
      if(finishPoll && commContext->recvWaitInitialized)
      {
         commContext->recvWaitInitialized = false;

         remove_wait_queue(&commContext->recvCompWaitQ, &commContext->recvWait);
      }
   }

   /* note: POLLOUT check must come _after_ POLLIN check, because the pollin-part
      won't set the POLLOUT flag if we recv the on-send flow ctl packet during
      nonblockingRecvCheck(). */
   if(events & POLLOUT)
   {
      // initial check and wait preparations for outgoing data

      if(__IBVSocket_nonblockingSendCheck(_this) )
      { // immediate data available => no need to prepare wait
         revents |= POLLOUT;
      }
      else
      if(!finishPoll && !commContext->sendWaitInitialized)
      { // no incoming data and caller is planning to schedule()

         #ifdef BEEGFS_DEBUG
            if(waitqueue_active(&commContext->sendCompWaitQ) )
               ibv_print_info("BUG: sendCompWaitQ was not empty\n");
         #endif // BEEGFS_DEBUG

         commContext->sendWaitInitialized = true;

         // register ourselves on the send waitqueue so a completion handler wakes us up
         init_waitqueue_entry(&commContext->sendWait, current);
         add_wait_queue(&commContext->sendCompWaitQ, &commContext->sendWait);

         // re-check after registration to close the race with a just-arrived completion
         if(__IBVSocket_nonblockingSendCheck(_this) )
            revents |= POLLOUT;
      }

      // cleanup
      if(finishPoll && commContext->sendWaitInitialized)
      {
         commContext->sendWaitInitialized = false;

         remove_wait_queue(&commContext->sendCompWaitQ, &commContext->sendWait);
      }
   }

   // check errState again in case it was modified during the checks above
   if(unlikely(_this->errState) )
   {
      ibv_print_info_debug("got erroneous connection state. ErrCode: %d\n", _this->errState);
      revents |= POLLERR;
   }

   return revents;
}
/**
 * Handle connection manager event callbacks.
 *
 * Locking of mutexes and sleeping is permitted.
 *
 * @return negative Linux error code on error, 0 otherwise; in case of return!=0, rdma_cm will
 *    automatically call rdma_destroy_id().
 */
int __IBVSocket_cmaHandler(struct rdma_cm_id* cm_id, struct rdma_cm_event* event)
{
   IBVSocket* _this = cm_id->context;
   int retVal = 0;

   if(unlikely(!_this || _this->errState !=0) )
   {
      ibv_print_info_debug("cm_id is being torn down. Event: %d\n", event->event);
      // only a connect request may (and must) be refused with nonzero here
      return (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) ? -EINVAL : 0;
   }

   Mutex_lock(&_this->cmaMutex);

   // guard against callbacks for a cm_id this socket no longer owns
   if (_this->cm_id != cm_id)
   {
      Mutex_unlock(&_this->cmaMutex);
      return -EINVAL;
   }

   ibv_print_info_debug("rdma event: %i, status: %i\n", event->event, event->status);

   switch(event->event)
   {
      case RDMA_CM_EVENT_ADDR_RESOLVED:
         _this->connState = IBVSOCKETCONNSTATE_ADDRESSRESOLVED;
         break;

      case RDMA_CM_EVENT_ADDR_ERROR:
      case RDMA_CM_EVENT_UNREACHABLE:
         retVal = -ENETUNREACH;
         break;

      case RDMA_CM_EVENT_ROUTE_RESOLVED:
         _this->connState = IBVSOCKETCONNSTATE_ROUTERESOLVED;
         break;

      case RDMA_CM_EVENT_ROUTE_ERROR:
      case RDMA_CM_EVENT_CONNECT_ERROR:
         retVal = -ETIMEDOUT;
         break;

      case RDMA_CM_EVENT_CONNECT_REQUEST:
         // incoming connections not supported => reject all
         #ifdef OFED_RDMA_REJECT_NEEDS_REASON
            rdma_reject(cm_id, NULL, 0, 0);
         #else
            rdma_reject(cm_id, NULL, 0);
         #endif // OFED_RDMA_REJECT_NEEDS_REASON
         break;

      case RDMA_CM_EVENT_CONNECT_RESPONSE:
         retVal = rdma_accept(cm_id, NULL);
         break;

      case RDMA_CM_EVENT_REJECTED:
         // stale connections are distinguished so the caller can retry with a fresh cm_id
         retVal = event->status == IB_CM_REJ_STALE_CONN ? -ESTALE : -ECONNREFUSED;
         break;

      case RDMA_CM_EVENT_ESTABLISHED:
         retVal = __IBVSocket_connectedHandler(_this, event);
         _this->connState = IBVSOCKETCONNSTATE_ESTABLISHED;
         break;

      case RDMA_CM_EVENT_DISCONNECTED:
         rdma_disconnect(cm_id);
         break;

      case RDMA_CM_EVENT_DEVICE_REMOVAL:
         /**
          * Sigh... what to do? There were previous attempts to perform cleanup of the
          * IBVCommContext and return -ENETRESET when this event is encountered.
          * Returning a nonzero value causes RDMA CM to destroy the cm_id and anything
          * belonging to that cm_id must be destroyed here before returning nonzero. There
          * are a lot of race conditions with the worker thread and attempting to implement
          * a locking scheme would require significant redesign due to the use of
          * blocking calls to ib_poll_cq. Ignoring the event appears to allow normal
          * cleanup of resources after the RDMA routines return an error to the caller.
          */
         ibv_print_info("Device has been removed: %s\n", cm_id->device->name);
         break;

      default:
         ibv_print_info_debug("Ignoring RDMA_CMA event: %d\n", event->event);
         break;
   }

   if(unlikely(retVal) )
   {
      if(retVal == -ESTALE)
         _this->connState = IBVSOCKETCONNSTATE_REJECTED_STALE;
      else
         _this->connState = IBVSOCKETCONNSTATE_FAILED;

      _this->errState = -1;

      // free connection resources later. freeing everything here may race with send/recv
      // operations in case of a connection breakage.
      retVal = 0;
   }

   Mutex_unlock(&_this->cmaMutex);

   // wake anyone blocked in the connect state machine waiting for a connState change
   wake_up(&_this->eventWaitQ);

   return retVal;
}
/**
 * Invoked when an asynchronous event not associated with a completion occurs on the send CQ.
 * Intentionally a no-op apart from debug logging.
 */
void __IBVSocket_cqSendEventHandler(struct ib_event *event, void *data)
{
   ibv_print_info_debug("called. event type: %d (not handled)\n", event->event);
}
/**
 * Invoked when a completion event occurs on the send CQ.
 *
 * Bumps the send event counter, re-arms the CQ for the next completion and wakes any thread
 * sleeping on the send completion waitqueue.
 *
 * @param cq_context IBVSocket* _this
 */
void __IBVSocket_sendCompletionHandler(struct ib_cq *cq, void *cq_context)
{
   IBVSocket* sock = cq_context;
   IBVCommContext* ctx = sock->commContext;
   int notifyRes;

   // increment before waking, so woken waiters observe the changed counter
   atomic_inc(&ctx->sendCompEventCount);

   // request the next notification before waking waiters
   notifyRes = ib_req_notify_cq(ctx->sendCQ, IB_CQ_NEXT_COMP);
   if(unlikely(notifyRes) )
      ibv_print_info("Couldn't request CQ notification\n");

   wake_up(&ctx->sendCompWaitQ);
}
/**
 * Invoked when an asynchronous event not associated with a completion occurs on the recv CQ.
 * Intentionally a no-op apart from debug logging.
 */
void __IBVSocket_cqRecvEventHandler(struct ib_event *event, void *data)
{
   ibv_print_info_debug("called. event type: %d (not handled)\n", event->event);
}
/**
 * Invoked when a completion event occurs on the recv CQ.
 *
 * Bumps the recv event counter, re-arms the CQ for the next completion and wakes any thread
 * sleeping on the recv completion waitqueue.
 *
 * @param cq_context IBVSocket* _this
 */
void __IBVSocket_recvCompletionHandler(struct ib_cq *cq, void *cq_context)
{
   IBVSocket* sock = cq_context;
   IBVCommContext* ctx = sock->commContext;
   int notifyRes;

   // increment before waking, so woken waiters observe the changed counter
   atomic_inc(&ctx->recvCompEventCount);

   // request the next notification before waking waiters
   notifyRes = ib_req_notify_cq(ctx->recvCQ, IB_CQ_NEXT_COMP);
   if(unlikely(notifyRes) )
      ibv_print_info("Couldn't request CQ notification\n");

   wake_up(&ctx->recvCompWaitQ);
}
/**
 * Invoked on asynchronous QP events. Intentionally a no-op apart from debug logging.
 */
void __IBVSocket_qpEventHandler(struct ib_event *event, void *data)
{
   ibv_print_info_debug("called. event type: %d (not handled)\n", event->event);
}
/**
 * Continue connection setup after the CM route has been resolved: create the communication
 * context, register the connection-check buffer, build the local CommDest and issue
 * rdma_connect() with the CommDest as private data.
 *
 * NOTE(review): the cm_id and outCommContext parameters are not used; the function operates on
 * _this->cm_id and &_this->commContext instead — confirm whether the parameters can be dropped.
 *
 * @return 0 on success, negative Linux error code otherwise (errState is set on failure)
 */
int __IBVSocket_routeResolvedHandler(IBVSocket* _this, struct rdma_cm_id* cm_id,
   IBVCommConfig* commCfg, IBVCommContext** outCommContext)
{
   bool createContextRes;
   struct rdma_conn_param conn_param;

   createContextRes = __IBVSocket_createCommContext(_this, _this->cm_id, commCfg,
      &_this->commContext);
   if(!createContextRes)
   {
      ibv_print_info("creation of CommContext failed\n");
      _this->errState = -1;
      return -EPERM;
   }

   // establish connection...

   // lazily register the connection-check buffer and remember its rkey for the peer
   if (_this->commContext->checkConnRkey == 0)
   {
      if (!IBVBuffer_initRegistration(&_this->commContext->checkConBuffer, _this->commContext))
      {
         _this->errState = -1;
         return -EPERM;
      }
      _this->commContext->checkConnRkey = _this->commContext->checkConBuffer.mr->rkey;
   }

   if (!__IBVSocket_initCommDest(_this->commContext, &_this->localDest))
   {
      ibv_print_info_ir("creation of CommDest failed\n");
      _this->errState = -1;
      return -EPERM;
   }

   memset(&conn_param, 0, sizeof(conn_param) );

   #ifdef BEEGFS_NVFS
      conn_param.responder_resources = _this->commContext->pd->device->attrs.max_qp_rd_atom;
      conn_param.initiator_depth = _this->commContext->pd->device->attrs.max_qp_init_rd_atom;
   #else
      conn_param.responder_resources = 1;
      conn_param.initiator_depth = 1;
   #endif
   conn_param.flow_control = 0;
   conn_param.retry_count = 7; // (3 bits)
   conn_param.rnr_retry_count = 7; // rnr = receiver not ready (3 bits, 7 means infinity)
   // the local CommDest travels to the peer as CM private data
   conn_param.private_data = &_this->localDest;
   conn_param.private_data_len = sizeof(_this->localDest);

   return rdma_connect(_this->cm_id, &conn_param);
}
/**
 * Finish connection setup on RDMA_CM_EVENT_ESTABLISHED: optionally register the check buffer,
 * post the initial recv buffers and parse the peer's CommDest from the CM private data.
 *
 * NOTE(review): when posting a recv buffer fails, the code jumps to err_invalidateSock with
 * retVal still 0, so errState is set but 0 (success) is returned to the caller — confirm
 * whether that path should return a negative error instead.
 *
 * @return 0 on success, negative Linux error code otherwise
 */
int __IBVSocket_connectedHandler(IBVSocket* _this, struct rdma_cm_event* event)
{
   IBVCommContext* commContext = _this->commContext;
   IBVCommConfig* commCfg;
   int retVal = 0;
   bool parseCommDestRes;
   const void* private_data;
   u8 private_data_len;
   int i;

   if (!commContext)
      return -EINVAL;

   commCfg = &commContext->commCfg;

   if (_this->commContext->commCfg.keyType == IBVSOCKETKEYTYPE_Register)
   {
      if(IBVSocket_registerMr(_this, _this->commContext->checkConBuffer.mr, IB_ACCESS_REMOTE_READ))
      {
         ibv_print_info("register buffer failed\n");
         _this->errState = -1;
         return -EPERM;
      }
   }

   // post initial recv buffers...
   for(i=0; i < commCfg->bufNum; i++)
   {
      if(__IBVSocket_postRecv(_this, commContext, i) )
      {
         ibv_print_info("couldn't post recv buffer with index %d\n", i);
         goto err_invalidateSock;
      }
   }

   // private data layout differs between the OFED 1.2 API and newer APIs
   #if defined BEEGFS_OFED_1_2_API && (BEEGFS_OFED_1_2_API == 1)
      private_data = event->private_data;
      private_data_len = event->private_data_len;
   #else // OFED 1.2.5 or higher API
      private_data = event->param.conn.private_data;
      private_data_len = event->param.conn.private_data_len;
   #endif

   // the peer's CommDest arrives as CM private data
   parseCommDestRes = __IBVSocket_parseCommDest(
      private_data, private_data_len, &_this->remoteDest);
   if(!parseCommDestRes)
   {
      ibv_print_info("bad private data received. len: %d\n", private_data_len);

      retVal = -EOPNOTSUPP;
      goto err_invalidateSock;
   }

   return retVal;

err_invalidateSock:
   _this->errState = -1;
   return retVal;
}
/**
 * Compatibility wrapper around ib_create_cq(), whose signature changed across OFED/kernel
 * versions (plain cqe argument, struct ib_cq_init_attr, or cqe + comp_vector).
 *
 * @param cqe number of completion queue entries
 * @return new CQ on success, ERR_PTR on failure (as returned by ib_create_cq)
 */
struct ib_cq* __IBVSocket_createCompletionQueue(struct ib_device* device,
   ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *),
   void* cq_context, int cqe)
{
   #if defined (BEEGFS_OFED_1_2_API) && BEEGFS_OFED_1_2_API >= 1
      return ib_create_cq(device, comp_handler, event_handler, cq_context, cqe);
   #elif defined OFED_HAS_IB_CREATE_CQATTR || defined ib_create_cq
      struct ib_cq_init_attr attrs = {
         .cqe = cqe,
         // spread CQs across the device's completion vectors (interrupt distribution)
   #ifdef KERNEL_HAS_GET_RANDOM_INT
         .comp_vector = get_random_int()%device->num_comp_vectors,
   #else
         .comp_vector = get_random_long()%device->num_comp_vectors,
   #endif
      };

      return ib_create_cq(device, comp_handler, event_handler, cq_context, &attrs);
   #else // OFED 1.2.5 or higher API
      return ib_create_cq(device, comp_handler, event_handler, cq_context, cqe, 0);
   #endif
}
/**
 * Map an ib_wc status code to a human readable description.
 *
 * @param wcStatusCode one of the IB_WC_* work completion status values
 * @return pointer to static buffer with human readable string for a wc status code
 */
const char* __IBVSocket_wcStatusStr(int wcStatusCode)
{
   if(wcStatusCode == IB_WC_WR_FLUSH_ERR)
      return "work request flush error";

   if(wcStatusCode == IB_WC_RETRY_EXC_ERR)
      return "retries exceeded error";

   if(wcStatusCode == IB_WC_RESP_TIMEOUT_ERR)
      return "response timeout error";

   // all other status codes share a generic description
   return "<undefined>";
}
/**
 * Post an IB_WR_REG_MR work request to (re)register the given memory region on the QP with
 * local write access plus the requested remote access flags.
 *
 * @param access additional IB_ACCESS_* flags to combine with IB_ACCESS_LOCAL_WRITE
 * @return 0 on success, -1 if posting the work request failed
 */
int IBVSocket_registerMr(IBVSocket* _this, struct ib_mr* mr, int access)
{
   int postRes;
   struct ib_reg_wr regWR = {
      .wr = { .opcode = IB_WR_REG_MR },
      .mr = mr,
      .key = mr->rkey,
      .access = IB_ACCESS_LOCAL_WRITE | access,
   };

   postRes = ib_post_send(_this->commContext->qp, &regWR.wr, NULL);
   if (unlikely(postRes))
   {
      printk_fhgfs(KERN_ERR, "Failed to post IB_WR_REG_MR res=%d\n", postRes);
      return -1;
   }

   return 0;
}
/**
 * @return the source IP address of this socket (by value)
 */
struct in_addr IBVSocket_getSrcIpAddr(IBVSocket* _this)
{
   return _this->srcIpAddr;
}
/**
 * @return the NIC statistics pointer attached to this socket (no ownership transfer)
 */
NicAddressStats* IBVSocket_getNicStats(IBVSocket* _this)
{
   return _this->nicStats;
}
#endif