#include "IBVSocket.h" #ifdef BEEGFS_RDMA #ifdef KERNEL_HAS_SCSI_FC_COMPAT #include // some kernels (e.g. rhel 5.9) forgot this in their rdma headers #endif // KERNEL_HAS_SCSI_FC_COMPAT #include #include #include #include #include #include #include #include #include #define IBVSOCKET_CONN_TIMEOUT_MS 5000 /* this also includes send completion wait times */ #define IBVSOCKET_COMPLETION_TIMEOUT_MS 300000 #define IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS 180000 #define IBVSOCKET_FLOWCONTROL_ONRECV_TIMEOUT_MS 180000 #define IBVSOCKET_SHUTDOWN_TIMEOUT_MS 250 #define IBVSOCKET_POLL_TIMEOUT_MS 10000 #define IBVSOCKET_FLOWCONTROL_MSG_LEN 1 #define IBVSOCKET_STALE_RETRIES_NUM 128 #define IBVSOCKET_MIN_SGE 1 #define IBVSOCKET_MIN_WR 1 /** * IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS is used by IBVSocket_recvT when timeoutMS * is passed as < 0, which indicates that __IBVSocket_receiveCheck should be * called until it does not timeout. Thus, the long timeout value is * inconsequential for that case. There doesn't appear to be any current code * that passes timeoutMS < 0 to IBVSocket_recvT. */ #define IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS 1000000 #define ibv_print_info(str, ...) printk_fhgfs(KERN_INFO, "%s:%d: " str, __func__, __LINE__, \ ##__VA_ARGS__) #define ibv_print_info_ir(str, ...) printk_fhgfs_ir(KERN_INFO, "%s:%d: " str, __func__, __LINE__, \ ##__VA_ARGS__) #define ibv_pwarn(str, ...) printk_fhgfs(KERN_WARNING, "%s:%d: " str, __func__, __LINE__, \ ##__VA_ARGS__) #define ibv_print_info_debug(str, ...) printk_fhgfs_debug(KERN_INFO, "%s:%d: " str, __func__, \ __LINE__, ##__VA_ARGS__) #define ibv_print_info_ir_debug(str, ...) printk_fhgfs_ir_debug(KERN_INFO, "%s:%d: " str, \ __func__, __LINE__, ##__VA_ARGS__) /* 4.19 added const qualifiers to ib_post_send and ib_post_recv. 
*/
/* _bad_send_wr / _bad_recv_wr resolve at compile time to whichever third-parameter type this
 * kernel's ib_post_send()/ib_post_recv() expect: pre-4.19 kernels take "struct ib_send_wr **",
 * 4.19+ take "const struct ib_send_wr **". */
typedef __typeof__(
   __builtin_choose_expr(
      __builtin_types_compatible_p(
         __typeof__(&ib_post_send),
         int (*)(struct ib_qp*, struct ib_send_wr*, struct ib_send_wr**)),
      (struct ib_send_wr*) 0,
      (const struct ib_send_wr*) 0)) _bad_send_wr;

typedef __typeof__(
   __builtin_choose_expr(
      __builtin_types_compatible_p(
         __typeof__(&ib_post_recv),
         int (*)(struct ib_qp*, struct ib_recv_wr*, struct ib_recv_wr**)),
      (struct ib_recv_wr*) 0,
      (const struct ib_recv_wr*) 0)) _bad_recv_wr;

/**
 * Initialize an IBVSocket: zero the whole struct, apply the default timeout configuration and
 * create the initial rdma cm_id.
 *
 * @param srcIpAddr source IP to bind outgoing connections to (s_addr==0 => kernel chooses)
 * @param nicStats caller-owned stats object; only the pointer is stored here
 * @return true on success (cm_id created), false otherwise
 */
bool IBVSocket_init(IBVSocket* _this, struct in_addr srcIpAddr, NicAddressStats* nicStats)
{
   memset(_this, 0, sizeof(*_this) );

   _this->connState = IBVSOCKETCONNSTATE_UNCONNECTED;
   _this->timeoutCfg.connectMS = IBVSOCKET_CONN_TIMEOUT_MS;
   _this->timeoutCfg.completionMS = IBVSOCKET_COMPLETION_TIMEOUT_MS;
   _this->timeoutCfg.flowSendMS = IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS;
   _this->timeoutCfg.flowRecvMS = IBVSOCKET_FLOWCONTROL_ONRECV_TIMEOUT_MS;
   _this->timeoutCfg.pollMS = IBVSOCKET_POLL_TIMEOUT_MS;
   _this->typeOfService = 0;
   _this->srcIpAddr = srcIpAddr;
   _this->nicStats = nicStats;

   init_waitqueue_head(&_this->eventWaitQ);

   Mutex_init(&_this->cmaMutex);

   return __IBVSocket_createNewID(_this);
}

/**
 * Tear down the socket.
 *
 * The comm context is destroyed under cmaMutex to fence off concurrent cma event callbacks that
 * might still access it; the cm_id itself is destroyed afterwards (outside the lock).
 */
void IBVSocket_uninit(IBVSocket* _this)
{
   Mutex_lock(&_this->cmaMutex);
   __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);
   Mutex_unlock(&_this->cmaMutex);

   if(_this->cm_id)
      rdma_destroy_id(_this->cm_id);

   Mutex_uninit(&_this->cmaMutex);

   SAFE_KFREE(_this->remoteDest);
}

bool IBVSocket_rdmaDevicesExist()
{
   // Note: We use this (currently) just to inform the higher levels
   // about availability of RDMA functionality
   return true;
}

/**
 * @return rkey of the checkConnection() buffer (only meaningful while a comm context exists)
 */
unsigned IBVSocket_getRkey(IBVSocket *_this)
{
   return _this->commContext->checkConnRkey;
}

struct ib_device* IBVSocket_getDevice(IBVSocket* _this)
{
   return _this->commContext->pd->device;
}

/**
 * Create a new cm_id.
 * This is not only intended for new sockets, but also for stale cm_ids, so this can cleanup/replace
 * existing cm_ids and resets error states.
*/
bool __IBVSocket_createNewID(IBVSocket* _this)
{
   struct rdma_cm_id* new_cm_id;

   // We need to unconditionally destroy the old CM id. It is unusable at this point.
   if(_this->cm_id)
   {
      rdma_destroy_id(_this->cm_id);
      _this->cm_id = NULL;
   }

   // cm_id creation signature differs across OFED/kernel generations
#if defined(OFED_HAS_NETNS) || defined(rdma_create_id)
   new_cm_id = rdma_create_id(&init_net, __IBVSocket_cmaHandler, _this, RDMA_PS_TCP, IB_QPT_RC);
#elif defined(OFED_HAS_RDMA_CREATE_QPTYPE)
   new_cm_id = rdma_create_id(__IBVSocket_cmaHandler, _this, RDMA_PS_TCP, IB_QPT_RC);
#else
   new_cm_id = rdma_create_id(__IBVSocket_cmaHandler, _this, RDMA_PS_TCP);
#endif

   if(IS_ERR(new_cm_id) )
   {
      ibv_print_info("rdma_create_id failed. ErrCode: %ld\n", PTR_ERR(new_cm_id) );
      return false;
   }

   _this->cm_id = new_cm_id;
   _this->connState = IBVSOCKETCONNSTATE_UNCONNECTED;
   _this->errState = 0;

   return true;
}

/**
 * Connect to a remote IP/port via the rdma_cm state machine.
 *
 * The asynchronous cma events advance _this->connState; this function drives the sequence
 * (resolve addr -> resolve route -> connect) and sleeps on eventWaitQ between the steps.
 * Connections rejected as stale by the peer are retried with a fresh cm_id.
 *
 * @return true if the connection was established, false otherwise (socket is then invalidated)
 */
bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress, unsigned short port,
   IBVCommConfig* commCfg)
{
   struct sockaddr_in sin;
   struct sockaddr_in src;
   struct sockaddr_in* srcp;
   long connTimeoutJiffies = TimeTk_msToJiffiesSchedulable(IBVSOCKET_CONN_TIMEOUT_MS);
   Time connElapsed;
   int rc;

   /* note: rejected as stale means remote side still had an old open connection associated with
      our current cm_id. what most likely happened is that the client was reset (i.e. no clean
      disconnect) and our new cm_id after reboot now matches one of the old previous cm_ids.
      => only possible solution seems to be retrying with another cm_id. */
   int numStaleRetriesLeft = IBVSOCKET_STALE_RETRIES_NUM;

   Time_setToNow(&connElapsed);

   for( ; ; )
   { // stale retry loop

      // set type of service for this connection
#ifdef OFED_HAS_SET_SERVICE_TYPE
      if (_this->typeOfService)
         rdma_set_service_type(_this->cm_id, _this->typeOfService);
#endif // OFED_HAS_SET_SERVICE_TYPE

      /* note: the rest of the connect procedure is invoked through the cmaHandler when the
         corresponding asynchronous events arrive => we just have to wait for the connState to
         change here */
      _this->connState = IBVSOCKETCONNSTATE_CONNECTING;

      // resolve IP address ...
      // (async event handler also automatically resolves route on success)
      sin.sin_addr.s_addr = ipaddress.s_addr;
      sin.sin_family = AF_INET;
      sin.sin_port = htons(port);

      srcp = NULL;
      if (_this->srcIpAddr.s_addr != 0)
      { // bind the outgoing connection to the configured source address
         src.sin_addr = _this->srcIpAddr;
         src.sin_family = AF_INET;
         src.sin_port = 0;
         srcp = &src;
      }

      if(rdma_resolve_addr(_this->cm_id, (struct sockaddr*)srcp, (struct sockaddr*)&sin,
         _this->timeoutCfg.connectMS) )
      {
         ibv_print_info_debug("rdma_resolve_addr failed\n");
         goto err_invalidateSock;
      }

      // wait for async event
      wait_event_interruptible(_this->eventWaitQ,
         _this->connState != IBVSOCKETCONNSTATE_CONNECTING);

      if(_this->connState != IBVSOCKETCONNSTATE_ADDRESSRESOLVED)
         goto err_invalidateSock;

      if(rdma_resolve_route(_this->cm_id, _this->timeoutCfg.connectMS) )
      {
         ibv_print_info_debug("rdma_resolve_route failed.\n");
         goto err_invalidateSock;
      }

      wait_event_interruptible(_this->eventWaitQ,
         _this->connState != IBVSOCKETCONNSTATE_ADDRESSRESOLVED);

      if(_this->connState != IBVSOCKETCONNSTATE_ROUTERESOLVED)
         goto err_invalidateSock;

      // establish connection...
      // (handler calls rdma_connect() )
      Mutex_lock(&_this->cmaMutex);
      rc = __IBVSocket_routeResolvedHandler(_this, _this->cm_id, commCfg, &_this->commContext);
      Mutex_unlock(&_this->cmaMutex);
      if (rc)
      {
         ibv_print_info_debug("route resolved handler failed\n");
         goto err_invalidateSock;
      }

      // wait for async event
      // Note: rdma_connect() can take a very long time (>5m) if the peer's HCA has gone down.
      wait_event_interruptible_timeout(_this->eventWaitQ,
         _this->connState != IBVSOCKETCONNSTATE_ROUTERESOLVED, connTimeoutJiffies);

      // test point for failed connections
      if((_this->connState != IBVSOCKETCONNSTATE_ESTABLISHED) &&
         (_this->remapConnectionFailureStatus != 0))
         _this->connState = _this->remapConnectionFailureStatus;

      // check if cm_id was reported as stale by remote side
      if(_this->connState == IBVSOCKETCONNSTATE_REJECTED_STALE)
      {
         bool createIDRes;

         if(!numStaleRetriesLeft)
         { // no more stale retries left
            if(IBVSOCKET_STALE_RETRIES_NUM) // did we have any retries at all
               ibv_print_info("Giving up after %d stale connection retries\n",
                  IBVSOCKET_STALE_RETRIES_NUM);

            goto err_invalidateSock;
         }

         printk_fhgfs_connerr(KERN_INFO, "Stale connection detected. Retrying with a new one...\n");

         // We need to clean up the commContext created in the routeResolvedHandler because
         // the next time through the loop it will get recreated. If this is the final try,
         // then we don't need it anymore.
         Mutex_lock(&_this->cmaMutex);
         __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);
         _this->commContext = NULL;
         createIDRes = __IBVSocket_createNewID(_this);
         Mutex_unlock(&_this->cmaMutex);
         if(!createIDRes)
            goto err_invalidateSock;

         numStaleRetriesLeft--;
         continue;
      }

      if(_this->connState != IBVSOCKETCONNSTATE_ESTABLISHED)
      {
         ibv_print_info_debug("Failed after %d stale connection retries, elapsed = %u\n",
            IBVSOCKET_STALE_RETRIES_NUM - numStaleRetriesLeft, Time_elapsedMS(&connElapsed));
         goto err_invalidateSock;
      }

      // connected
      if(numStaleRetriesLeft != IBVSOCKET_STALE_RETRIES_NUM)
      {
         ibv_print_info_debug("Succeeded after %d stale connection retries, elapsed = %u\n",
            IBVSOCKET_STALE_RETRIES_NUM - numStaleRetriesLeft, Time_elapsedMS(&connElapsed));
      }

      return true;
   }

err_invalidateSock:
   // If we have a comm context, we need to delete it since we can't use it. We set an error state
   // on the socket first, so we stop accepting callbacks that would access the commContext that is
   // in the process of being destroyed.
   _this->errState = -1;
   Mutex_lock(&_this->cmaMutex);
   __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);
   _this->commContext = NULL;
   Mutex_unlock(&_this->cmaMutex);

   return false;
}

/**
 * Bind the cm_id to a local address/port (server side).
 *
 * @return true on success; on failure the socket is marked as failed (errState)
 */
bool IBVSocket_bindToAddr(IBVSocket* _this, struct in_addr ipAddr, unsigned short port)
{
   struct sockaddr_in bindAddr;

   bindAddr.sin_family = AF_INET;
   bindAddr.sin_addr = ipAddr;
   bindAddr.sin_port = htons(port);

   if(rdma_bind_addr(_this->cm_id, (struct sockaddr*)&bindAddr) )
   {
      _this->errState = -1;
      return false;
   }

   return true;
}

/**
 * @return true on success
 */
bool IBVSocket_listen(IBVSocket* _this)
{
   if(rdma_listen(_this->cm_id, 0) )
   {
      ibv_print_info("rdma_listen failed\n");

      _this->errState = -1;
      return false;
   }

   return true;
}

/**
 * Graceful shutdown: wait (bounded) for all posted sends to complete.
 *
 * @return true if nothing was pending or all pending sends completed; also true when the
 *    connection is already in an error state (it is down anyways)
 */
bool IBVSocket_shutdown(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 0;
   int timeoutMS = IBVSOCKET_SHUTDOWN_TIMEOUT_MS;

   if(_this->errState)
      return true; // true, because the conn is down anyways

   if(!commContext)
      return true; // this socket has never been connected

   if(commContext->incompleteSend.numAvailable)
   { // wait for all incomplete sends
      int waitRes;

      waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
         &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
      if(waitRes < 0)
      {
         ibv_print_info_debug("Waiting for incomplete send requests failed\n");
         return false;
      }
   }

   return true;
}

/**
 * Continues an incomplete former recv() by returning immediately available data from the
 * corresponding buffer.
*/
ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this, struct iov_iter* iter)
{
   IBVCommContext* commContext = _this->commContext;
   struct IBVIncompleteRecv* recv = &commContext->incompleteRecv;
   size_t bufIndex = recv->bufIndex;
   struct IBVBuffer* buffer = &commContext->recvBufs[bufIndex];
   size_t copyRes = 0;
   ssize_t total = 0;

   if(unlikely(_this->errState) )
      return -1;

   // copy out fragment-wise: the recv buffer is a list of pages of size buffer->bufferSize
   while(iov_iter_count(iter) > 0 && recv->totalSize != recv->completedOffset)
   {
      unsigned page = recv->completedOffset / buffer->bufferSize;
      unsigned offset = recv->completedOffset % buffer->bufferSize;
      unsigned fragment = MIN(MIN(iov_iter_count(iter), buffer->bufferSize - offset),
         recv->totalSize - recv->completedOffset);

      copyRes = copy_to_iter(buffer->buffers[page] + offset, fragment, iter);
      if(copyRes != fragment)
      { // user-copy fault => bail out below
         copyRes = 0;
         break;
      }

      total += fragment;
      recv->completedOffset += fragment;
   }

   if(recv->completedOffset == recv->totalSize)
   { // buffer fully consumed => hand it back to the HCA
      int postRes;

      commContext->incompleteRecv.isAvailable = 0;

      postRes = __IBVSocket_postRecv(_this, _this->commContext, bufIndex);
      if(unlikely(postRes) )
         goto err_invalidateSock;
   }

   if(!copyRes)
      goto err_fault;

   return total;

err_invalidateSock:
   ibv_print_info_debug("invalidating connection\n");

err_fault:
   _this->errState = -1;
   return -EFAULT;
}

/**
 * @return number of received bytes on success, -ETIMEDOUT on timeout, -ECOMM on error
 */
ssize_t IBVSocket_recvT(IBVSocket* _this, struct iov_iter* iter, int flags, int timeoutMS)
{
   int checkRes;
   /* timeoutMS < 0 means "wait forever": loop on receiveCheck with a very long per-iteration
      timeout (see IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS comment above) */
   int wait = timeoutMS < 0 ? IBVSOCKET_RECVT_INFINITE_TIMEOUT_MS : timeoutMS;

   do
   {
      checkRes = __IBVSocket_receiveCheck(_this, wait);
   } while (checkRes == 0 && timeoutMS < 0);

   if(checkRes < 0)
      return -ECOMM;

   if(checkRes == 0)
      return -ETIMEDOUT;

   return __IBVSocket_recvContinueIncomplete(_this, iter);
}

/**
 * @flags supports MSG_DONTWAIT
 * @return number of bytes sent or negative error code (-EAGAIN in case of MSG_DONTWAIT if no data
 * could be sent without blocking)
 */
ssize_t IBVSocket_send(IBVSocket* _this, struct iov_iter* iter, int flags)
{
   IBVCommContext* commContext = _this->commContext;
   int flowControlRes;
   size_t currentBufIndex;
   struct iov_iter source = *iter;
   int postRes;
   size_t postedLen = 0;
   ssize_t currentPostLen;
   int waitRes;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 0;
   int timeoutMS = _this->timeoutCfg.completionMS;

   if(unlikely(_this->errState) )
      return -1;

   // handle flags
   if(flags & MSG_DONTWAIT)
   { // send only as much as we can without blocking
      // note: we adapt the bufLen variable as necessary here for simplicity
      int checkSendRes;
      size_t bufNumLeft;
      size_t bufLenLeft;

      checkSendRes = __IBVSocket_nonblockingSendCheck(_this);
      if(!checkSendRes)
      { // we can't send non-blocking at the moment, caller shall try again later
         return -EAGAIN;
      }
      else
      if(unlikely(checkSendRes < 0) )
         goto err_invalidateSock;

      // buffers available => adapt bufLen (if necessary)
      bufNumLeft = MIN(commContext->commCfg.bufNum - commContext->incompleteSend.numAvailable,
         commContext->numSendBufsLeft);
      bufLenLeft = bufNumLeft * commContext->commCfg.bufSize;

      iov_iter_truncate(&source, bufLenLeft);
   }

   // send data cut in buf-sized pieces...
   do
   {
      flowControlRes = __IBVSocket_flowControlOnSendWait(_this, _this->timeoutCfg.flowSendMS);
      if(unlikely(flowControlRes <= 0) )
         goto err_invalidateSock;

      // note: we only poll for completed sends if forced or after we used up all (!)
      // available bufs
      if(commContext->incompleteSend.forceWaitForAll ||
         (commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum) )
      { // wait for all (!) incomplete sends
         waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
            &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
         if(waitRes <= 0)
            goto err_invalidateSock;

         commContext->incompleteSend.forceWaitForAll = false;
      }

      currentBufIndex = commContext->incompleteSend.numAvailable;

      currentPostLen = IBVBuffer_fill(&commContext->sendBufs[currentBufIndex], &source);
      if(currentPostLen < 0)
         goto err_fault;

      commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */

      postRes = __IBVSocket_postSend(_this, currentBufIndex);
      if(unlikely(postRes) )
      {
         commContext->incompleteSend.numAvailable--;
         goto err_invalidateSock;
      }

      postedLen += currentPostLen;
   } while(iov_iter_count(&source));

   iov_iter_advance(iter, postedLen);
   return (ssize_t)postedLen;

err_invalidateSock:
   _this->errState = -1;
   return -ECOMM;

err_fault:
   _this->errState = -1;
   return -EFAULT;
}

/**
 * Override the default timeouts; any value <= 0 selects the compiled-in default.
 */
void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS, int completionMS,
   int flowSendMS, int flowRecvMS, int pollMS)
{
   _this->timeoutCfg.connectMS = connectMS > 0? connectMS : IBVSOCKET_CONN_TIMEOUT_MS;
   _this->timeoutCfg.completionMS = completionMS > 0?
      completionMS : IBVSOCKET_COMPLETION_TIMEOUT_MS;
   _this->timeoutCfg.flowSendMS = flowSendMS > 0?
      flowSendMS : IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS;
   _this->timeoutCfg.flowRecvMS = flowRecvMS > 0?
      flowRecvMS : IBVSOCKET_FLOWCONTROL_ONRECV_TIMEOUT_MS;
   _this->timeoutCfg.pollMS = pollMS > 0? pollMS : IBVSOCKET_POLL_TIMEOUT_MS;

#ifdef BEEGFS_DEBUG
   ibv_print_info_debug("connectMS=%d completionMS=%d flowSendMS=%d flowRecvMS=%d pollMS=%d\n",
      _this->timeoutCfg.connectMS, _this->timeoutCfg.completionMS, _this->timeoutCfg.flowSendMS,
      _this->timeoutCfg.flowRecvMS, _this->timeoutCfg.pollMS);
#endif
}

void IBVSocket_setTypeOfService(IBVSocket* _this, int typeOfService)
{
   _this->typeOfService = typeOfService;
}

// test hook: remaps a failed connect's state to the given value (see connectByIP)
void IBVSocket_setConnectionFailureStatus(IBVSocket* _this, unsigned value)
{
   _this->remapConnectionFailureStatus = value;
}

/**
 * Allocate and wire up the per-connection comm context: PD (and optional unsafe rkey / DMA MR),
 * registered send/recv buffer lists, completion queues, QP and initial CQ notification requests.
 *
 * @param outCommContext set to the new context on success, NULL on failure
 * @return true on success; on failure everything partially created is cleaned up
 */
bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
   IBVCommConfig* commCfg, IBVCommContext** outCommContext)
{
   IBVCommContext* commContext;
   struct ib_device* dev = cm_id->device;
   struct ib_qp_init_attr qpInitAttr;
   int qpRes;
   unsigned i;
   unsigned fragmentSize;
   unsigned maxSge;
   unsigned maxWr;
   bool globalRkey = (commCfg->keyType == IBVSOCKETKEYTYPE_UnsafeGlobal);

   commContext = kzalloc(sizeof(*commContext), GFP_KERNEL);
   if(!commContext)
      goto err_cleanup;

   ibv_print_info_debug("Alloc CommContext @ %p\n", commContext);

   // prepare recv and send event notification
   init_waitqueue_head(&commContext->recvCompWaitQ);
   init_waitqueue_head(&commContext->sendCompWaitQ);

   atomic_set(&commContext->recvCompEventCount, 0);
   atomic_set(&commContext->sendCompEventCount, 0);

   // protection domain...
   // IB_PD_UNSAFE_GLOBAL_RKEY is still present as of kernel 6.3.
#ifdef OFED_UNSAFE_GLOBAL_RKEY
   commContext->pd = ib_alloc_pd(dev, globalRkey? IB_PD_UNSAFE_GLOBAL_RKEY : 0);
#else
   if (globalRkey)
   {
      ibv_print_info("Unsafe global rkey not supported on this platform.");
      goto err_cleanup;
   }
   commContext->pd = ib_alloc_pd(dev, 0);
#endif
   if(IS_ERR(commContext->pd) )
   {
      ibv_print_info("Couldn't allocate PD. ErrCode: %ld\n", PTR_ERR(commContext->pd) );
      commContext->pd = NULL;
      goto err_cleanup;
   }

#ifdef OFED_UNSAFE_GLOBAL_RKEY
   if (globalRkey)
      commContext->checkConnRkey = commContext->pd->unsafe_global_rkey;
#endif

   if (commCfg->keyType == IBVSOCKETKEYTYPE_UnsafeDMA)
   {
      // DMA system mem region...
      // (Note: IB spec says:
      //    "The consumer is not allowed to assign remote-write (or remote-atomic) to
      //    a memory region that has not been assigned local-write.")
      // ib_get_dma_mr() goes away in kernel 4.9. Is is still present in MOFED 5.9.
      // If not using the global rkey, then either ib_get_dmr_mr() or an allocated ib_mr
      // needs to be used.
#ifdef OFED_IB_GET_DMA_MR
      commContext->dmaMR = ib_get_dma_mr(commContext->pd,
         IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
      if(IS_ERR_OR_NULL(commContext->dmaMR) )
      {
         ibv_print_info("ib_get_dma_mr failed. ErrCode: %ld\n", PTR_ERR(commContext->dmaMR) );
         commContext->dmaMR = NULL;
         goto err_cleanup;
      }
      commContext->checkConnRkey = commContext->dmaMR->rkey;
#else
      ibv_print_info("RDMA keyType is dma and ib_get_dma_mr() not supported on this platform.\n");
      goto err_cleanup;
#endif
   }

#ifdef BEEGFS_DEBUG
   ibv_print_info("%s: checkConnRkey = %u\n", __func__, commContext->checkConnRkey);
#endif

   // alloc and register buffers...

   commContext->commCfg = *commCfg;

   commContext->recvBufs = kzalloc(commCfg->bufNum * sizeof(struct IBVBuffer), GFP_KERNEL);
   if(!commContext->recvBufs)
   {
      ibv_print_info("couldn't prepare receive buffer list\n");
      goto err_cleanup;
   }

   for(i=0; i < commCfg->bufNum; i++)
   {
      if(!IBVBuffer_init(&commContext->recvBufs[i], commContext, commCfg->bufSize,
            commCfg->fragmentSize, DMA_FROM_DEVICE) )
      {
         ibv_print_info("couldn't prepare recvBuf #%d\n", i + 1);
         goto err_cleanup;
      }
   }

   commContext->sendBufs = kzalloc(commCfg->bufNum * sizeof(struct IBVBuffer), GFP_KERNEL);
   if(!commContext->sendBufs)
   {
      ibv_print_info("couldn't prepare send buffer list\n");
      goto err_cleanup;
   }

   for(i=0; i < commCfg->bufNum; i++)
   {
      if(!IBVBuffer_init(&commContext->sendBufs[i], commContext, commCfg->bufSize,
            commCfg->fragmentSize, DMA_TO_DEVICE) )
      {
         ibv_print_info("couldn't prepare sendBuf #%d\n", i + 1);
         goto err_cleanup;
      }
   }

   // small buffer used as target of the checkConnection() RDMA read
   if(!IBVBuffer_init(&commContext->checkConBuffer, commContext, sizeof(u64), 0, DMA_TO_DEVICE) )
   {
      ibv_print_info("couldn't alloc dma control memory region\n");
      goto err_cleanup;
   }

   // init flow control v2 (to avoid long receiver-not-ready timeouts)
   /* note: we use -1 because the last buf might not be read by the user (eg during
      nonblockingRecvCheck) and so it might not be immediately available again. */
   commContext->numReceivedBufsLeft = commCfg->bufNum - 1;
   commContext->numSendBufsLeft = commCfg->bufNum - 1;

   // create completion queues...

   commContext->recvCQ = __IBVSocket_createCompletionQueue(cm_id->device,
      __IBVSocket_recvCompletionHandler, __IBVSocket_cqRecvEventHandler, _this,
      commCfg->bufNum);
   if(IS_ERR(commContext->recvCQ) )
   {
      ibv_print_info("couldn't create recv CQ. ErrCode: %ld\n", PTR_ERR(commContext->recvCQ) );
      commContext->recvCQ = NULL;
      goto err_cleanup;
   }

   // note: 1+commCfg->bufNum here for the checkConnection() RDMA read
   commContext->sendCQ = __IBVSocket_createCompletionQueue(cm_id->device,
      __IBVSocket_sendCompletionHandler, __IBVSocket_cqSendEventHandler, _this,
      1+commCfg->bufNum);
   if(IS_ERR(commContext->sendCQ) )
   {
      ibv_print_info("couldn't create send CQ. ErrCode: %ld\n", PTR_ERR(commContext->sendCQ) );
      commContext->sendCQ = NULL;
      goto err_cleanup;
   }

   fragmentSize = commCfg->fragmentSize;
   if (fragmentSize == 0)
      fragmentSize = commCfg->bufSize;
   maxSge = MAX(IBVSOCKET_MIN_SGE, commCfg->bufSize / fragmentSize + 1);
   maxWr = MAX(IBVSOCKET_MIN_WR, commCfg->bufNum + 1);

   // note: 1+commCfg->bufNum here for the checkConnection() RDMA read
   memset(&qpInitAttr, 0, sizeof(qpInitAttr) );

   qpInitAttr.event_handler = __IBVSocket_qpEventHandler;
   qpInitAttr.send_cq = commContext->sendCQ;
   qpInitAttr.recv_cq = commContext->recvCQ;
   qpInitAttr.qp_type = IB_QPT_RC;
   qpInitAttr.sq_sig_type = IB_SIGNAL_REQ_WR;
   qpInitAttr.cap.max_send_wr = maxWr;
   qpInitAttr.cap.max_recv_wr = maxWr;
   qpInitAttr.cap.max_send_sge = maxSge;
   qpInitAttr.cap.max_recv_sge = maxSge;
   qpInitAttr.cap.max_inline_data = 0;

   qpRes = rdma_create_qp(cm_id, commContext->pd, &qpInitAttr);
   if(qpRes)
   {
      ibv_print_info("couldn't create QP. ErrCode: %d\n", qpRes);
      goto err_cleanup;
   }

   commContext->qp = cm_id->qp;

   // prepare event notification...

   // initial event notification requests
   if(ib_req_notify_cq(commContext->recvCQ, IB_CQ_NEXT_COMP) )
   {
      ibv_print_info("couldn't request CQ notification\n");
      goto err_cleanup;
   }

   if(ib_req_notify_cq(commContext->sendCQ, IB_CQ_NEXT_COMP) )
   {
      ibv_print_info("couldn't request CQ notification\n");
      goto err_cleanup;
   }

   *outCommContext = commContext;
   return true;

   // error handling

err_cleanup:
   __IBVSocket_cleanupCommContext(cm_id, commContext);

   *outCommContext = NULL;
   return false;
}

/**
 * Release everything __IBVSocket_createCommContext() created; safe to call on a partially
 * initialized context (NULL members are skipped).
 */
void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext)
{
   unsigned i;
   struct ib_device* dev;

   ibv_print_info_debug("Free CommContext @ %p\n", commContext);

   if(!commContext)
      return;

   dev = commContext->pd ? commContext->pd->device : NULL;

   if(!dev)
      goto cleanup_no_dev;

   if(cm_id && commContext->qp && cm_id->qp)
      rdma_destroy_qp(cm_id);

   if(commContext->sendCQ)
#ifdef OFED_IB_DESTROY_CQ_IS_VOID
      ib_destroy_cq(commContext->sendCQ);
#else
   {
      int destroyRes = ib_destroy_cq(commContext->sendCQ);
      if (unlikely(destroyRes) )
      {
         ibv_pwarn("sendCQ destroy failed: %d\n", destroyRes);
         dump_stack();
      }
   }
#endif

   if(commContext->recvCQ)
#ifdef OFED_IB_DESTROY_CQ_IS_VOID
      ib_destroy_cq(commContext->recvCQ);
#else
   {
      int destroyRes = ib_destroy_cq(commContext->recvCQ);
      if (unlikely(destroyRes) )
      {
         ibv_pwarn("recvCQ destroy failed: %d\n", destroyRes);
         dump_stack();
      }
   }
#endif

   IBVBuffer_free(&commContext->checkConBuffer, commContext);

   for(i=0; i < commContext->commCfg.bufNum; i++)
   {
      if(commContext->recvBufs)
         IBVBuffer_free(&commContext->recvBufs[i], commContext);

      if(commContext->sendBufs)
         IBVBuffer_free(&commContext->sendBufs[i], commContext);
   }

   SAFE_KFREE(commContext->recvBufs);
   SAFE_KFREE(commContext->sendBufs);

   if(commContext->dmaMR)
      ib_dereg_mr(commContext->dmaMR);

   if(commContext->pd)
      ib_dealloc_pd(commContext->pd);

cleanup_no_dev:
   kfree(commContext);
}

/**
 * Initializes a (local) IBVCommDest.
*/ bool __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest) { memcpy(outDest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN); outDest->protocolVersion = cpu_to_le64(IBVSOCKET_PRIVATEDATA_PROTOCOL_VER); outDest->rkey = cpu_to_le32(commContext->checkConnRkey); outDest->vaddr = cpu_to_le64(commContext->checkConBuffer.lists[0].addr); outDest->recvBufNum = cpu_to_le32(commContext->commCfg.bufNum); outDest->recvBufSize = cpu_to_le32(commContext->commCfg.bufSize); #ifdef BEEGFS_DEBUG ibv_print_info("%s: rkey=%u vaddr=%llu", __func__, outDest->rkey, outDest->vaddr); #endif return true; } /** * Parses and checks a (remote) IBVCommDest. * * @param buf should usually be the private_data of the connection handshake * @param outDest will be kmalloced (if true is returned) and needs to be kfree'd by the * caller * @return true if data is okay, false otherwise */ bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest) { const IBVCommDest* src = buf; IBVCommDest* dest = NULL; // Note: "bufLen < ..." (and not "!="), because there might be some extra padding if(!buf || (bufLen < sizeof(*dest) ) ) { ibv_print_info("Bad private data size. length: %zu\n", bufLen); return false; } dest = kmalloc(sizeof(*dest), GFP_ATOMIC); if(!dest) return false; *outDest = dest; *dest = *src; dest->protocolVersion = le64_to_cpu(dest->protocolVersion); dest->vaddr = le64_to_cpu(dest->vaddr); dest->rkey = le32_to_cpu(dest->rkey); dest->recvBufNum = le32_to_cpu(dest->recvBufNum); dest->recvBufSize = le32_to_cpu(dest->recvBufSize); if(memcmp(dest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN) ) goto err_cleanup; if(dest->protocolVersion != IBVSOCKET_PRIVATEDATA_PROTOCOL_VER) goto err_cleanup; return true; err_cleanup: kfree(dest); *outDest = NULL; return false; } /** * Append buffer to receive queue. 
* * @param commContext passed seperately because it's not the _this->commContext during * accept() of incoming connections * @return 0 on success, -1 on error */ int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex) { struct ib_recv_wr wr; _bad_recv_wr bad_wr; int postRes; commContext->sendBufs[bufIndex].lists[0].length = commContext->commCfg.bufSize; wr.next = NULL; wr.wr_id = bufIndex; wr.sg_list = commContext->recvBufs[bufIndex].lists; wr.num_sge = commContext->recvBufs[bufIndex].listLength; postRes = ib_post_recv(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { ibv_print_info("ib_post_recv failed. ErrCode: %d\n", postRes); return -1; } return 0; } int IBVSocket_checkConnection(IBVSocket* _this) { IBVCommContext* commContext = _this->commContext; #ifdef OFED_SPLIT_WR # define rdma_of(wr) (wr) # define wr_of(wr) (wr.wr) struct ib_rdma_wr wr; #else # define rdma_of(wr) (wr.wr.rdma) # define wr_of(wr) (wr) struct ib_send_wr wr; #endif _bad_send_wr bad_wr; int postRes; int waitRes; int timeoutMS = _this->timeoutCfg.completionMS; unsigned numWaitWrites = 0; unsigned numWaitReads = 1; rdma_of(wr).remote_addr = _this->remoteDest->vaddr; rdma_of(wr).rkey = _this->remoteDest->rkey; wr_of(wr).wr_id = 0; wr_of(wr).sg_list = commContext->checkConBuffer.lists; wr_of(wr).num_sge = commContext->checkConBuffer.listLength; wr_of(wr).opcode = IB_WR_RDMA_READ; wr_of(wr).send_flags = IB_SEND_SIGNALED; wr_of(wr).next = NULL; postRes = ib_post_send(commContext->qp, &wr_of(wr), &bad_wr); if(unlikely(postRes) ) { ibv_print_info("ib_post_send() failed. ErrCode: %d\n", postRes); goto error; } waitRes = __IBVSocket_waitForTotalSendCompletion(_this, &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS); if(unlikely(waitRes <= 0) ) goto error; commContext->incompleteSend.forceWaitForAll = false; return 0; #undef rdma_of #undef wr_of error: _this->errState = -1; return -1; } /** * Note: contains flow control. 
* * @return 0 on success, -1 on error */ int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex) { IBVCommContext* commContext = _this->commContext; struct ib_send_wr wr; _bad_send_wr bad_wr; int postRes; wr.wr_id = bufIndex; wr.sg_list = commContext->sendBufs[bufIndex].lists; wr.num_sge = commContext->sendBufs[bufIndex].listLength; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.next = NULL; postRes = ib_post_send(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { ibv_print_info("ib_post_send() failed. ErrCode: %d\n", postRes); return -1; } // flow control __IBVSocket_flowControlOnSendUpdateCounters(_this); return 0; } /** * Receive work completion. * * Note: contains flow control. * * @param timeoutMS 0 for non-blocking * @return 1 on success, 0 on timeout, <0 on error */ int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ib_wc* outWC) { IBVCommContext* commContext = _this->commContext; int waitRes; size_t bufIndex; waitRes = __IBVSocket_waitForRecvCompletionEvent(_this, timeoutMS, outWC); if(waitRes <= 0) { // (note: waitRes==0 can often happen, because we call this with timeoutMS==0) if(unlikely(waitRes < 0) ) { if(waitRes != -ERESTARTSYS) { // only print message if user didn't press "-C" ibv_print_info("retrieval of completion event failed. result: %d\n", waitRes); } } return waitRes; } // we got something... 
if(unlikely(outWC->status != IB_WC_SUCCESS) ) { printk_fhgfs_connerr(KERN_INFO, "%s: Connection error (wc_status: %d; msg: %s)\n", "IBVSocket (recv work completion)", (int)outWC->status, __IBVSocket_wcStatusStr(outWC->status) ); return -1; } bufIndex = outWC->wr_id; if(unlikely(bufIndex >= commContext->commCfg.bufNum) ) { ibv_print_info("Completion for unknown/invalid wr_id %d\n", (int)outWC->wr_id); return -1; } // receive completed // flow control if(unlikely(__IBVSocket_flowControlOnRecv(_this, _this->timeoutCfg.flowRecvMS) ) ) { ibv_print_info_debug("got an error from flowControlOnRecv().\n"); return -1; } return 1; } /** * Intention: Avoid IB rnr by sending control msg when (almost) all our recv bufs are used up to * show that we got our new recv bufs ready. * * @timeoutMS don't set this to 0, we really need to wait here sometimes * @return 0 on success, -1 on error */ int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS) { IBVCommContext* commContext = _this->commContext; // we received a packet, so peer has received all of our currently pending data => reset counter commContext->numSendBufsLeft = commContext->commCfg.bufNum - 1; /* (see createCommContext() for "-1" reason) */ // send control packet if recv counter expires... #ifdef BEEGFS_DEBUG if(!commContext->numReceivedBufsLeft) ibv_print_info("BUG: numReceivedBufsLeft underflow!\n"); #endif // BEEGFS_DEBUG commContext->numReceivedBufsLeft--; if(!commContext->numReceivedBufsLeft) { size_t currentBufIndex; int postRes; if(commContext->incompleteSend.forceWaitForAll || (commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum) ) { // wait for all (!) incomplete sends /* note: it's ok that all send bufs are used up, because it's possible that we do a lot of recv without the user sending any data in between (so the bufs were actually used up by flow control). 
*/ unsigned numWaitWrites = 0; unsigned numWaitReads = 0; int waitRes = __IBVSocket_waitForTotalSendCompletion(_this, &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS); if(waitRes <= 0) { ibv_print_info("problem encountered during waitForTotalSendCompletion(). ErrCode: %d\n", waitRes); return -1; } commContext->incompleteSend.forceWaitForAll = false; } currentBufIndex = commContext->incompleteSend.numAvailable; commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */ commContext->sendBufs[currentBufIndex].lists[0].length = IBVSOCKET_FLOWCONTROL_MSG_LEN; commContext->sendBufs[currentBufIndex].listLength = 1; postRes = __IBVSocket_postSend(_this, currentBufIndex); if(unlikely(postRes) ) { commContext->incompleteSend.numAvailable--; return -1; } // note: numReceivedBufsLeft is reset during postSend() flow control } return 0; } /** * Called after sending a packet to update flow control counters. * * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up. * * Note: This is only one part of the on-send flow control. The other one is * _flowControlOnSendWait(). */ void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this) { IBVCommContext* commContext = _this->commContext; // we sent a packet, so we received all currently pending data from the peer => reset counter commContext->numReceivedBufsLeft = commContext->commCfg.bufNum - 1; /* (see createCommContext() for "-1" reason) */ #ifdef BEEGFS_DEBUG if(!commContext->numSendBufsLeft) ibv_print_info("BUG: numSendBufsLeft underflow!\n"); #endif commContext->numSendBufsLeft--; } /** * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up. 
 *
 * @timeoutMS may be 0 for non-blocking operation, otherwise typically
 *    IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS
 * @return >0 on success, 0 on timeout (waiting for flow control packet from peer), <0 on error
 */
int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   struct ib_wc wc;
   int recvRes;
   size_t bufIndex;
   int postRecvRes;

   // counter still positive => peer still has recv buffers left, no control packet expected yet
   if(commContext->numSendBufsLeft)
      return 1; // flow control not triggered yet

   // peer recv buffers used up => wait for the peer's flow control packet
   recvRes = __IBVSocket_recvWC(_this, timeoutMS, &wc);
   if(recvRes <= 0)
      return recvRes;

   bufIndex = wc.wr_id;

   // a flow control packet has a fixed length; anything else arrived here unexpectedly
   if(unlikely(wc.byte_len != IBVSOCKET_FLOWCONTROL_MSG_LEN) )
   { // error (bad length)
      ibv_print_info("received flow control packet length mismatch %d\n", (int)wc.byte_len);
      return -1;
   }

   // hand the consumed recv buffer back to the hardware
   postRecvRes = __IBVSocket_postRecv(_this, _this->commContext, bufIndex);
   if(postRecvRes)
      return -1;

   // note: numSendBufsLeft is reset during recvWC() (if it actually received a packet)

   return 1;
}

/**
 * Sleep until a recv completion is available on the recv CQ (or until timeout).
 *
 * @return 1 on available data, 0 on timeout, <0 on error
 */
int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS, struct ib_wc* outWC)
{
   IBVCommContext* commContext = _this->commContext;
   long waitRes;
   int numEvents = 0;
   int checkRes;

   /* special quick path: other than in the userspace version of this method, we only need the
      quick path when timeoutMS==0, because then we might have been called from a special
      context, in which we don't want to sleep */
   if(!timeoutMS)
      return ib_poll_cq(commContext->recvCQ, 1, outWC);

   while(timeoutMS != 0)
   {
      /* note: we use pollTimeoutMS to check the conn every few secs (otherwise we might wait
         for a very long time in case the other side disconnected silently) */
      int pollTimeoutMS = MIN(_this->timeoutCfg.pollMS, timeoutMS);
      long pollTimeoutJiffies = TimeTk_msToJiffiesSchedulable(pollTimeoutMS);

      /* note: don't think about ib_peek_cq here, because it is not implemented in the
         drivers. */
      waitRes = wait_event_timeout(commContext->recvCompWaitQ,
         (numEvents = ib_poll_cq(commContext->recvCQ, 1, outWC) ), pollTimeoutJiffies);

      /* NOTE(review): wait_event_timeout() does not return -ERESTARTSYS (that is the
         _interruptible variant), so the left half of this condition looks dead and the value
         returned on a fatal signal is >= 0 — confirm the intended signal semantics. */
      if(unlikely(waitRes == -ERESTARTSYS || fatal_signal_pending(current)))
      { // signal pending
         ibv_print_info_debug("wait for recvCompEvent ended by pending signal\n");
         return waitRes;
      }

      if(likely(numEvents) )
      { // we got something
         return numEvents;
      }

      // timeout => probe the connection before waiting another round
      checkRes = IBVSocket_checkConnection(_this);
      if(checkRes < 0)
         return -ECONNRESET;

      timeoutMS -= pollTimeoutMS;
   } // end of for-loop

   return 0;
}

/**
 * Sleep until the send completion event counter advances past oldSendCount (or until timeout).
 *
 * @param oldSendCount old sendCompEventCount
 * @return 1 on available data, 0 on timeout, -1 on error
 */
int __IBVSocket_waitForSendCompletionEvent(IBVSocket* _this, int oldSendCount, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   long waitRes;

   while(timeoutMS != 0)
   {
      // Note: We use pollTimeoutMS to check the conn every few secs (otherwise we might
      // wait for a very long time in case the other side disconnected silently)
      int pollTimeoutMS = MIN(_this->timeoutCfg.pollMS, timeoutMS);
      long pollTimeoutJiffies = TimeTk_msToJiffiesSchedulable(pollTimeoutMS);

      waitRes = wait_event_timeout(commContext->sendCompWaitQ,
         atomic_read(&commContext->sendCompEventCount) != oldSendCount, pollTimeoutJiffies);

      // NOTE(review): wait_event_timeout() never returns -ERESTARTSYS; only the
      // fatal_signal_pending() half of this check can actually trigger — confirm intended.
      if(unlikely(waitRes == -ERESTARTSYS || fatal_signal_pending(current)))
      { // signal pending
         ibv_print_info_debug("wait for sendCompEvent ended by pending signal\n");
         return -1;
      }

      if(likely(atomic_read(&commContext->sendCompEventCount) != oldSendCount) )
         return 1;

      // timeout
      timeoutMS -= pollTimeoutMS;
   }

   return 0;
}

/**
 * @param numSendElements also used as out-param to return the remaining number
 * @param timeoutMS 0 for non-blocking; this is a soft timeout that is reset after each received
 *    completion
 * @return 1 if all completions received, 0 if completions missing (in case you wanted
 *    non-blocking) or -1 in case of an error.
*/ int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this, unsigned* numSendElements, unsigned* numWriteElements, unsigned* numReadElements, int timeoutMS) { IBVCommContext* commContext = _this->commContext; int numElements; int waitRes; int oldSendCount; int i; size_t bufIndex; struct ib_wc wc[2]; do { oldSendCount = atomic_read(&commContext->sendCompEventCount); numElements = ib_poll_cq(commContext->sendCQ, 2, wc); if(unlikely(numElements < 0) ) { ibv_print_info("bad ib_poll_cq result: %d\n", numElements); return -1; } else if(!numElements) { // no completions available yet => wait if(!timeoutMS) return 0; waitRes = __IBVSocket_waitForSendCompletionEvent(_this, oldSendCount, timeoutMS); if(likely(waitRes > 0) ) continue; return waitRes; } // we got something... // for each completion element for(i=0; i < numElements; i++) { if(unlikely(wc[i].status != IB_WC_SUCCESS) ) { printk_fhgfs_connerr(KERN_INFO, "%s: Connection error (wc_status: %d; msg: %s)\n", "IBVSocket (wait for total send completion)", (int)(wc[i].status), __IBVSocket_wcStatusStr(wc[i].status) ); return -1; } switch(wc[i].opcode) { case IB_WC_SEND: { bufIndex = wc[i].wr_id; if(unlikely(bufIndex >= commContext->commCfg.bufNum) ) { ibv_print_info("bad send completion wr_id 0x%x\n", (int)wc[i].wr_id); return -1; } if(likely(*numSendElements) ) (*numSendElements)--; else { ibv_print_info("received bad/unexpected send completion\n"); return -1; } } break; case IB_WC_RDMA_READ: { if(unlikely(wc[i].wr_id != 0) ) { ibv_print_info("bad read completion wr_id 0x%x\n", (int)wc[i].wr_id); return -1; } if(likely(*numReadElements) ) (*numReadElements)--; else { ibv_print_info("received bad/unexpected RDMA read completion\n"); return -1; } } break; default: { ibv_print_info("received bad/unexpected completion opcode %d\n", wc[i].opcode); return -1; } break; } // end of switch } // end of for-loop } while(*numSendElements || *numWriteElements || *numReadElements); return 1; } /** * @return <0 on error, 0 if recv 
 * would block, >0 if recv would not block
 */
int __IBVSocket_receiveCheck(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   struct ib_wc wc;
   int flowControlRes;
   int recvRes;

   if(unlikely(_this->errState) )
      return -1;

   // a previously received packet that the caller has not fully consumed counts as readable
   if(commContext->incompleteRecv.isAvailable)
      return 1;

   // check whether we have a pending on-send flow control packet that needs to be received first
   flowControlRes = __IBVSocket_flowControlOnSendWait(_this, timeoutMS);
   if(unlikely(flowControlRes < 0) )
   {
      ibv_print_info_debug("got an error from flowControlOnSendWait(). ErrCode: %d\n",
         flowControlRes);

      goto err_invalidateSock;
   }

   if(!flowControlRes)
      return 0;

   // recv one packet (if available) and add it as incompleteRecv
   recvRes = __IBVSocket_recvWC(_this, timeoutMS, &wc);
   if(unlikely(recvRes < 0) )
   {
      ibv_print_info_debug("got an error from __IBVSocket_recvWC(). ErrCode: %d\n",
         recvRes);

      goto err_invalidateSock;
   }

   if(!recvRes)
      return 0;

   // we got something => prepare to continue later
   commContext->incompleteRecv.totalSize = wc.byte_len;
   commContext->incompleteRecv.bufIndex = wc.wr_id;
   commContext->incompleteRecv.completedOffset = 0;
   commContext->incompleteRecv.isAvailable = 1;

   return 1;

err_invalidateSock:
   ibv_print_info_debug("invalidating connection\n");

   _this->errState = -1;
   return -1;
}

/**
 * @return <0 on error, 0 if send would block, >0 if send would not block
 */
int __IBVSocket_nonblockingSendCheck(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;
   int flowControlRes;
   int waitRes;
   unsigned numWaitWrites = 0;
   unsigned numWaitReads = 0;
   int timeoutMS = 0; // non-blocking: never sleep in here

   if(unlikely(_this->errState) )
      return -1;

   // check whether we have a pending on-send flow control packet that needs to be received first
   flowControlRes = __IBVSocket_flowControlOnSendWait(_this, 0);
   if(unlikely(flowControlRes < 0) )
      goto err_invalidateSock;

   if(!flowControlRes)
      return flowControlRes;

   // a free send buffer exists and no forced completion wait is pending => sendable right now
   if(!commContext->incompleteSend.forceWaitForAll &&
      (commContext->incompleteSend.numAvailable < commContext->commCfg.bufNum) )
      return 1;

   commContext->incompleteSend.forceWaitForAll = true; // always setting saves an "if" below

   // we have to wait for completions before we can send...
   waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
      &commContext->incompleteSend.numAvailable, &numWaitWrites, &numWaitReads, timeoutMS);
   if(unlikely(waitRes < 0) )
      goto err_invalidateSock;

   if(waitRes > 0)
      commContext->incompleteSend.forceWaitForAll = false; // no more completions pending

   return waitRes;

err_invalidateSock:
   ibv_print_info_debug("invalidating connection\n");

   _this->errState = -1;
   return -1;
}

/**
 * Note: Call this only once with finishPoll==true (=> non-blocking) or multiple times with
 * finishPoll==true in the last call from the current thread (for cleanup).
 * Note: It's safe to call this multiple times with finishPoll==true (in case that caller does
 * not want to sleep anymore).
 *
 * @param events the event flags you are interested in (POLL...)
 * @param finishPoll true for cleanup if you don't call poll again from this thread; (it's also ok
 *    to set this to true if you call poll only once and want to avoid blocking)
 * @return mask all available revents (like poll(), POLL... flags), may not only be the events
 *    that were requested (and can also be the error events)
 */
unsigned long IBVSocket_poll(IBVSocket* _this, short events, bool finishPoll)
{
   /* note: there are two possible uses for finishPoll==true:
      1) this method is called multiple times and finishPoll is true in the last loop (for cleanup)
      2) this method is called only once non-blocking and finishPoll is true to avoid blocking */

   /* note: it's good to call prepare_to_wait more than once for the same thread (e.g.
if caller woke up from schedule() and decides to sleep again) */

   /* note: condition needs to be re-checked after prepare_to_wait to avoid the race when it
      became true between the initial check and the call to prepare_to_wait */

   /* note: we assume that after we returned a positive result, the caller will not try to sleep
      (but will still call this again with finishPoll==true to cleanup) */

   IBVCommContext* commContext = _this->commContext;
   unsigned long revents = 0; // return value

   if(unlikely(_this->errState) )
   {
      ibv_print_info_debug("called for an erroneous connection. ErrCode: %d\n", _this->errState);

      revents |= POLLERR;
      return revents;
   }

   if(!commContext->numSendBufsLeft)
   { /* special case: on-send flow control triggered, so we actually need to wait for incoming
        data even though the user set POLLOUT */
      events |= POLLIN;

      /* note: the actual checks for POLLIN will be handled below like in the normal POLLIN
         case. (we just want need to wake up when there is incoming data.) */
      /* note: this only works efficiently, because the beegfs client just checks for any revent
         and then calls _send() again. checking for POLLOUT explicitly would result in another
         call to _poll() and then we would notice that we can send now => would work, but is
         less efficient, obviously. */
   }

   /* note: the "if(POLLIN || recvWaitInitialized)" is necessary because !numSendBufsLeft might
      have triggered unrequested POLLIN check and we need to clean that up later. */

   if( (events & POLLIN) || (commContext->recvWaitInitialized) )
   { // initial check and wait preparations for incoming data
      if(__IBVSocket_receiveCheck(_this, 0) )
      { // immediate data available => no need to prepare wait
         revents |= POLLIN;
      }
      else
      if(!finishPoll && !commContext->recvWaitInitialized)
      { // no incoming data and caller is planning to schedule()

#ifdef BEEGFS_DEBUG
         if(waitqueue_active(&commContext->recvCompWaitQ) )
            ibv_print_info("BUG: recvCompWaitQ was not empty\n");
#endif // BEEGFS_DEBUG

         commContext->recvWaitInitialized = true;

         init_waitqueue_entry(&commContext->recvWait, current);
         add_wait_queue(&commContext->recvCompWaitQ, &commContext->recvWait);

         // re-check after queueing ourselves to close the check-then-sleep race
         if(__IBVSocket_receiveCheck(_this, 0) )
            revents |= POLLIN;
      }

      // cleanup
      if(finishPoll && commContext->recvWaitInitialized)
      {
         commContext->recvWaitInitialized = false;

         remove_wait_queue(&commContext->recvCompWaitQ, &commContext->recvWait);
      }
   }

   /* note: POLLOUT check must come _after_ POLLIN check, because the pollin-part won't set the
      POLLOUT flag if we recv the on-send flow ctl packet during nonblockingRecvCheck(). */

   if(events & POLLOUT)
   { // initial check and wait preparations for outgoing data
      if(__IBVSocket_nonblockingSendCheck(_this) )
      { // immediate data available => no need to prepare wait
         revents |= POLLOUT;
      }
      else
      if(!finishPoll && !commContext->sendWaitInitialized)
      { // no incoming data and caller is planning to schedule()

#ifdef BEEGFS_DEBUG
         if(waitqueue_active(&commContext->sendCompWaitQ) )
            ibv_print_info("BUG: sendCompWaitQ was not empty\n");
#endif // BEEGFS_DEBUG

         commContext->sendWaitInitialized = true;

         init_waitqueue_entry(&commContext->sendWait, current);
         add_wait_queue(&commContext->sendCompWaitQ, &commContext->sendWait);

         // re-check after queueing ourselves to close the check-then-sleep race
         if(__IBVSocket_nonblockingSendCheck(_this) )
            revents |= POLLOUT;
      }

      // cleanup
      if(finishPoll && commContext->sendWaitInitialized)
      {
         commContext->sendWaitInitialized = false;

         remove_wait_queue(&commContext->sendCompWaitQ, &commContext->sendWait);
      }
   }

   // check errState again in case it was modified during the checks above
   if(unlikely(_this->errState) )
   {
      ibv_print_info_debug("got erroneous connection state. ErrCode: %d\n", _this->errState);

      revents |= POLLERR;
   }

   return revents;
}

/**
 * Handle connection manager event callbacks.
 *
 * Locking of mutexes and sleeping is permitted.
 *
 * @return negative Linux error code on error, 0 otherwise; in case of return!=0, rdma_cm will
 *    automatically call rdma_destroy_id().
 */
int __IBVSocket_cmaHandler(struct rdma_cm_id* cm_id, struct rdma_cm_event* event)
{
   IBVSocket* _this = cm_id->context;
   int retVal = 0;

   // socket already gone or erroneous => only reject incoming connect requests, ignore the rest
   if(unlikely(!_this || _this->errState !=0) )
   {
      ibv_print_info_debug("cm_id is being torn down. Event: %d\n", event->event);
      return (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) ? -EINVAL : 0;
   }

   Mutex_lock(&_this->cmaMutex);

   // a stale cm_id may still deliver events after the socket created a replacement id
   if (_this->cm_id != cm_id)
   {
      Mutex_unlock(&_this->cmaMutex);
      return -EINVAL;
   }

   ibv_print_info_debug("rdma event: %i, status: %i\n", event->event, event->status);

   switch(event->event)
   {
      case RDMA_CM_EVENT_ADDR_RESOLVED:
         _this->connState = IBVSOCKETCONNSTATE_ADDRESSRESOLVED;
         break;

      case RDMA_CM_EVENT_ADDR_ERROR:
      case RDMA_CM_EVENT_UNREACHABLE:
         retVal = -ENETUNREACH;
         break;

      case RDMA_CM_EVENT_ROUTE_RESOLVED:
         _this->connState = IBVSOCKETCONNSTATE_ROUTERESOLVED;
         break;

      case RDMA_CM_EVENT_ROUTE_ERROR:
      case RDMA_CM_EVENT_CONNECT_ERROR:
         retVal = -ETIMEDOUT;
         break;

      case RDMA_CM_EVENT_CONNECT_REQUEST:
         // incoming connections not supported => reject all
#ifdef OFED_RDMA_REJECT_NEEDS_REASON
         rdma_reject(cm_id, NULL, 0, 0);
#else
         rdma_reject(cm_id, NULL, 0);
#endif // OFED_RDMA_REJECT_NEEDS_REASON
         break;

      case RDMA_CM_EVENT_CONNECT_RESPONSE:
         retVal = rdma_accept(cm_id, NULL);
         break;

      case RDMA_CM_EVENT_REJECTED:
         // a stale-connection reject is retriable (new cm_id), other rejects are final
         retVal = event->status == IB_CM_REJ_STALE_CONN ? -ESTALE : -ECONNREFUSED;
         break;

      case RDMA_CM_EVENT_ESTABLISHED:
         retVal = __IBVSocket_connectedHandler(_this, event);
         _this->connState = IBVSOCKETCONNSTATE_ESTABLISHED;
         break;

      case RDMA_CM_EVENT_DISCONNECTED:
         rdma_disconnect(cm_id);
         break;

      case RDMA_CM_EVENT_DEVICE_REMOVAL:
         /**
          * Sigh... what to do? There were previous attempts to perform cleanup of the
          * IBVCommContext and return -ENETRESET when this event is encountered.
          * Returning a nonzero value causes RDMA CM to destroy the cm_id and anything
          * belonging to that cm_id must be destroyed here before returning nonzero. There
          * are a lot of race conditions with the worker thread and attempting to implement
          * a locking scheme would require significant redesign due to the use of
          * blocking calls to ib_poll_cq. Ignoring the event appears to allow normal
          * cleanup of resources after the RDMA routines return an error to the caller.
          */
         ibv_print_info("Device has been removed: %s\n", cm_id->device->name);
         break;

      default:
         ibv_print_info_debug("Ignoring RDMA_CMA event: %d\n", event->event);
         break;
   }

   if(unlikely(retVal) )
   {
      if(retVal == -ESTALE)
         _this->connState = IBVSOCKETCONNSTATE_REJECTED_STALE;
      else
         _this->connState = IBVSOCKETCONNSTATE_FAILED;

      _this->errState = -1;

      // free connection resources later. freeing everything here may race with send/recv
      // operations in case of a connection breakage.
      retVal = 0;
   }

   Mutex_unlock(&_this->cmaMutex);
   wake_up(&_this->eventWaitQ);
   return retVal;
}

/**
 * Invoked when an asynchronous event not associated with a completion occurs on the CQ.
 */
void __IBVSocket_cqSendEventHandler(struct ib_event *event, void *data)
{
   ibv_print_info_debug("called. event type: %d (not handled)\n", event->event);
}

/**
 * Invoked when a completion event occurs on the CQ.
 *
 * @param cq_context IBVSocket* _this
 */
void __IBVSocket_sendCompletionHandler(struct ib_cq *cq, void *cq_context)
{
   IBVSocket* _this = cq_context;
   IBVCommContext* commContext = _this->commContext;
   int reqNotifySendRes;

   // count the event and re-arm the CQ for the next completion notification
   atomic_inc(&commContext->sendCompEventCount);

   reqNotifySendRes = ib_req_notify_cq(commContext->sendCQ, IB_CQ_NEXT_COMP);
   if(unlikely(reqNotifySendRes) )
      ibv_print_info("Couldn't request CQ notification\n");

   // wake any thread sleeping in __IBVSocket_waitForSendCompletionEvent / poll
   wake_up(&commContext->sendCompWaitQ);
}

/**
 * Invoked when an asynchronous event not associated with a completion occurs on the CQ.
 */
void __IBVSocket_cqRecvEventHandler(struct ib_event *event, void *data)
{
   ibv_print_info_debug("called. event type: %d (not handled)\n", event->event);
}

/**
 * Invoked when a completion event occurs on the CQ.
* * @param cq_context IBVSocket* _this */ void __IBVSocket_recvCompletionHandler(struct ib_cq *cq, void *cq_context) { IBVSocket* _this = cq_context; IBVCommContext* commContext = _this->commContext; int reqNotifyRecvRes; atomic_inc(&commContext->recvCompEventCount); reqNotifyRecvRes = ib_req_notify_cq(commContext->recvCQ, IB_CQ_NEXT_COMP); if(unlikely(reqNotifyRecvRes) ) ibv_print_info("Couldn't request CQ notification\n"); wake_up(&commContext->recvCompWaitQ); } void __IBVSocket_qpEventHandler(struct ib_event *event, void *data) { ibv_print_info_debug("called. event type: %d (not handled)\n", event->event); } int __IBVSocket_routeResolvedHandler(IBVSocket* _this, struct rdma_cm_id* cm_id, IBVCommConfig* commCfg, IBVCommContext** outCommContext) { bool createContextRes; struct rdma_conn_param conn_param; createContextRes = __IBVSocket_createCommContext(_this, _this->cm_id, commCfg, &_this->commContext); if(!createContextRes) { ibv_print_info("creation of CommContext failed\n"); _this->errState = -1; return -EPERM; } // establish connection... 
if (_this->commContext->checkConnRkey == 0) { if (!IBVBuffer_initRegistration(&_this->commContext->checkConBuffer, _this->commContext)) { _this->errState = -1; return -EPERM; } _this->commContext->checkConnRkey = _this->commContext->checkConBuffer.mr->rkey; } if (!__IBVSocket_initCommDest(_this->commContext, &_this->localDest)) { ibv_print_info_ir("creation of CommDest failed\n"); _this->errState = -1; return -EPERM; } memset(&conn_param, 0, sizeof(conn_param) ); #ifdef BEEGFS_NVFS conn_param.responder_resources = _this->commContext->pd->device->attrs.max_qp_rd_atom; conn_param.initiator_depth = _this->commContext->pd->device->attrs.max_qp_init_rd_atom; #else conn_param.responder_resources = 1; conn_param.initiator_depth = 1; #endif conn_param.flow_control = 0; conn_param.retry_count = 7; // (3 bits) conn_param.rnr_retry_count = 7; // rnr = receiver not ready (3 bits, 7 means infinity) conn_param.private_data = &_this->localDest; conn_param.private_data_len = sizeof(_this->localDest); return rdma_connect(_this->cm_id, &conn_param); } int __IBVSocket_connectedHandler(IBVSocket* _this, struct rdma_cm_event* event) { IBVCommContext* commContext = _this->commContext; IBVCommConfig* commCfg; int retVal = 0; bool parseCommDestRes; const void* private_data; u8 private_data_len; int i; if (!commContext) return -EINVAL; commCfg = &commContext->commCfg; if (_this->commContext->commCfg.keyType == IBVSOCKETKEYTYPE_Register) { if(IBVSocket_registerMr(_this, _this->commContext->checkConBuffer.mr, IB_ACCESS_REMOTE_READ)) { ibv_print_info("register buffer failed\n"); _this->errState = -1; return -EPERM; } } // post initial recv buffers... 
/* hand all recv buffers to the hardware before the connection carries user data */
   for(i=0; i < commCfg->bufNum; i++)
   {
      if(__IBVSocket_postRecv(_this, commContext, i) )
      {
         ibv_print_info("couldn't post recv buffer with index %d\n", i);
         goto err_invalidateSock;
      }
   }

#if defined BEEGFS_OFED_1_2_API && (BEEGFS_OFED_1_2_API == 1)
   private_data = event->private_data;
   private_data_len = event->private_data_len;
#else // OFED 1.2.5 or higher API
   private_data = event->param.conn.private_data;
   private_data_len = event->param.conn.private_data_len;
#endif

   // the peer sent its CommDest as private data of the connection event
   parseCommDestRes = __IBVSocket_parseCommDest(
      private_data, private_data_len, &_this->remoteDest);
   if(!parseCommDestRes)
   {
      ibv_print_info("bad private data received. len: %d\n", private_data_len);

      retVal = -EOPNOTSUPP;
      goto err_invalidateSock;
   }

   return retVal;

err_invalidateSock:
   _this->errState = -1;

   return retVal;
}

/**
 * Create a completion queue, papering over the ib_create_cq() signature differences between
 * OFED/kernel versions.
 */
struct ib_cq* __IBVSocket_createCompletionQueue(struct ib_device* device,
   ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *),
   void* cq_context, int cqe)
{
#if defined (BEEGFS_OFED_1_2_API) && BEEGFS_OFED_1_2_API >= 1
   return ib_create_cq(device, comp_handler, event_handler, cq_context, cqe);
#elif defined OFED_HAS_IB_CREATE_CQATTR || defined ib_create_cq
   struct ib_cq_init_attr attrs =
   {
      .cqe = cqe,
      // pick a random completion vector to spread CQ interrupt load across vectors
#ifdef KERNEL_HAS_GET_RANDOM_INT
      .comp_vector = get_random_int()%device->num_comp_vectors,
#else
      .comp_vector = get_random_long()%device->num_comp_vectors,
#endif
   };

   return ib_create_cq(device, comp_handler, event_handler, cq_context, &attrs);
#else // OFED 1.2.5 or higher API
   return ib_create_cq(device, comp_handler, event_handler, cq_context, cqe, 0);
#endif
}

/**
 * @return pointer to static buffer with human readable string for a wc status code
 */
const char* __IBVSocket_wcStatusStr(int wcStatusCode)
{
   switch(wcStatusCode)
   {
      case IB_WC_WR_FLUSH_ERR:
         return "work request flush error";
      case IB_WC_RETRY_EXC_ERR:
         return "retries exceeded error";
      case IB_WC_RESP_TIMEOUT_ERR:
         return "response timeout error";

      default:
         // note: unknown codes intentionally map to an empty string
         return "";
   }
}

/**
 * Post an IB_WR_REG_MR work request to (re)register a memory region with the given access
 * flags on this socket's queue pair.
 *
 * @param access additional IB access flags, OR'ed with IB_ACCESS_LOCAL_WRITE
 * @return 0 on success, -1 on error
 */
int IBVSocket_registerMr(IBVSocket* _this, struct ib_mr* mr, int access)
{
   struct ib_reg_wr wr;
   int res;

   memset(&wr, 0, sizeof(wr));
   wr.wr.opcode = IB_WR_REG_MR;
   wr.mr = mr;
   wr.key = mr->rkey;
   wr.access = IB_ACCESS_LOCAL_WRITE | access;

   res = ib_post_send(_this->commContext->qp, &wr.wr, NULL);
   if (unlikely(res))
   {
      printk_fhgfs(KERN_ERR, "Failed to post IB_WR_REG_MR res=%d\n", res);
      return -1;
   }

   return 0;
}

struct in_addr IBVSocket_getSrcIpAddr(IBVSocket* _this)
{
   return _this->srcIpAddr;
}

NicAddressStats* IBVSocket_getNicStats(IBVSocket* _this)
{
   return _this->nicStats;
}

#endif