#include "IBVSocket.h" #include #include #include #include #ifdef BEEGFS_NVFS // only for WORKER_BUFOUT_SIZE #include #endif /* BEEGFS_NVFS */ #define IBVSOCKET_CONN_TIMEOUT_MS 3000 // IBVSOCKET_CONN_TIMEOUT_INITIAL_POLL_MS is the initial timeout to wait between checks for // RDMA response events while establishing outgoing connections. Will be scaled up exponentially // after every poll (up to IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS) to reduce load but allow for quick // turnaround in the majority of cases where an event will come shortly after initiating the // connection. // // There used to be a fixed timeout of 500ms here, which lead to long initial connection initiation // times when an event was not received immediately // (https://github.com/ThinkParQ/beegfs-core/issues/4054). #define IBVSOCKET_CONN_TIMEOUT_INITIAL_POLL_MS 1 #define IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS 512 #define IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS 180000 #define IBVSOCKET_POLL_TIMEOUT_MS 7500 #define IBVSOCKET_LISTEN_BACKLOG 128 #define IBVSOCKET_FLOWCONTROL_MSG_LEN 1 #define IBVSOCKET_DEFAULT_TOS 0 /** * IBVSOCKET_RECV_TIMEOUT_MS is used by IBVSocket_recv, which does not take a * timeout value. It is very long because IBVSocket_recv continues to call * IBVSocket_recvT until it does not timeout. */ #define IBVSOCKET_RECV_TIMEOUT_MS (1024*1024) #define IBVSOCKET_MIN_BUF_NUM 1 #define IBVSOCKET_MIN_BUF_SIZE 4096 // 4kiB #define IBVSOCKET_MAX_BUF_SIZE_NUM 134217728 // num * size <= 128MiB #ifdef BEEGFS_NVFS #define IBVSOCKET_WC_ENTRIES 1 #endif /* BEEGFS_NVFS */ void IBVSocket_init(IBVSocket* _this) { memset(_this, 0, sizeof(*_this) ); _this->sockValid = false; _this->epollFD = -1; _this->typeOfService = IBVSOCKET_DEFAULT_TOS; _this->timeoutCfg.connectMS = IBVSOCKET_CONN_TIMEOUT_MS; _this->timeoutCfg.flowSendMS = IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS; _this->cm_channel = rdma_create_event_channel(); if(!_this->cm_channel) { LOG(SOCKLIB, WARNING, "rdma_create_event_channel failed."); return; } if(rdma_create_id(_this->cm_channel, &_this->cm_id, NULL, RDMA_PS_TCP) ) { LOG(SOCKLIB, WARNING, "rdma_create_id failed."); return; } _this->sockValid = true; return; } /** * Note: Intended for incoming accepted connections. * * @param commContext belongs to this object (so do not use or free it after calling this!) 
*/ void __IBVSocket_initFromCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id, IBVCommContext* commContext) { memset(_this, 0, sizeof(*_this) ); _this->sockValid = false; _this->epollFD = -1; _this->typeOfService = IBVSOCKET_DEFAULT_TOS; _this->cm_id = cm_id; _this->commContext = commContext; #ifdef SYSTEM_HAS_RDMA_MIGRATE_ID__disabled // note: see _accept() for the reasons why this is currently disabled _this->cm_channel = rdma_create_event_channel(); if(!_this->cm_channel) { LOG(SOCKLIB, WARNING, "rdma_create_event_channel failed."); return; } #endif // SYSTEM_HAS_RDMA_MIGRATE_ID _this->sockValid = true; LOG(SOCKLIB, DEBUG, __func__, ("_this", StringTk::uint64ToHexStr((uint64_t) _this)), ("device", cm_id->verbs->device->name)); return; } IBVSocket* IBVSocket_construct() { IBVSocket* _this = (IBVSocket*)malloc(sizeof(*_this) ); IBVSocket_init(_this); return _this; } IBVSocket* __IBVSocket_constructFromCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext) { IBVSocket* _this = (IBVSocket*)malloc(sizeof(*_this) ); __IBVSocket_initFromCommContext(_this, cm_id, commContext); return _this; } void IBVSocket_uninit(IBVSocket* _this) { if(_this->epollFD != -1) close(_this->epollFD); __IBVSocket_close(_this); } void IBVSocket_destruct(IBVSocket* _this) { IBVSocket_uninit(_this); free(_this); } bool IBVSocket_rdmaDevicesExist() { bool devicesExist; int numDevices = 1; struct ibv_context** devicesRes; devicesRes = rdma_get_devices(&numDevices); devicesExist = (devicesRes != NULL) && (numDevices > 0); if(devicesRes) rdma_free_devices(devicesRes); return devicesExist; } /** * Prepare ibverbs for forking a child process. This is only required if the parent process * has mapped memory for RDMA. * Call this only once in your program. * * Note: There is no corresponding uninit-method that needs to be called. */ void IBVSocket_fork_init_once() { ibv_fork_init(); } bool IBVSocket_connectByName(IBVSocket* _this, const char* hostname, unsigned short port, IBVCommConfig* commCfg) { struct addrinfo *res; struct addrinfo hints; int getInfoRes; struct in_addr ipaddress; memset(&hints, 0, sizeof(hints) ); hints.ai_family = PF_INET; hints.ai_socktype = SOCK_STREAM; getInfoRes = getaddrinfo(hostname, NULL, &hints, &res); if(getInfoRes < 0) { LOG(SOCKLIB, WARNING, "Name resolution error.", hostname, port, ("error", gai_strerror(getInfoRes))); return false; } ipaddress.s_addr = ( (struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; // clean-up freeaddrinfo(res); return IBVSocket_connectByIP(_this, ipaddress, port, commCfg); } bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress, unsigned short port, IBVCommConfig* commCfg) { struct rdma_cm_event* event; struct sockaddr_in sin; bool createContextRes; struct rdma_conn_param conn_param; bool parseCommDestRes; bool epollInitRes; int rc; int connTimeoutMS = IBVSOCKET_CONN_TIMEOUT_INITIAL_POLL_MS; int connTimeoutRemaining = IBVSOCKET_CONN_TIMEOUT_MS; int oldChannelFlags; int setOldFlagsRes; LOG(SOCKLIB, DEBUG, "Connect RDMASocket", ("socket", _this), ("addr", Socket::endpointAddrToStr(ipaddress, port)), ("bindIP", Socket::ipaddrToStr(_this->bindIP))); // resolve IP address... 
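// Overview of the RDMA-CM handshake implemented below (each step is confirmed by an event on
// the cm event channel): rdma_resolve_addr() -> ADDR_RESOLVED, rdma_resolve_route() ->
// ROUTE_RESOLVED, rdma_connect() -> ESTABLISHED; the local IBVCommDest is exchanged with the
// peer as connection private data.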
sin.sin_addr.s_addr = ipaddress.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(port); if(rdma_resolve_addr(_this->cm_id, NULL, (struct sockaddr*)&sin, _this->timeoutCfg.connectMS) ) { LOG(SOCKLIB, WARNING, "rdma_resolve_addr failed."); goto err_invalidateSock; } if(rdma_get_cm_event(_this->cm_channel, &event)) goto err_invalidateSock; if(event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { LOG(SOCKLIB, DEBUG, "Unexpected CM event.", ("event", rdma_event_str(event->event))); goto err_ack_and_invalidateSock; } rdma_ack_cm_event(event); // set type of service for connection if (_this->typeOfService) { if (rdma_set_option(_this->cm_id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS, &(_this->typeOfService), sizeof(_this->typeOfService))) { LOG(SOCKLIB, WARNING, "Failed to set Type Of Service.", ("tos", _this->typeOfService)); goto err_invalidateSock; } } // resolve route... if(rdma_resolve_route(_this->cm_id, _this->timeoutCfg.connectMS) ) { LOG(SOCKLIB, WARNING, "rdma_resolve_route failed."); goto err_invalidateSock; } if(rdma_get_cm_event(_this->cm_channel, &event)) goto err_invalidateSock; if(event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { LOG(SOCKLIB, WARNING, "Unexpected CM event.", ("event", rdma_event_str(event->event))); goto err_ack_and_invalidateSock; } rdma_ack_cm_event(event); // create comm context... createContextRes = __IBVSocket_createCommContext(_this, _this->cm_id, commCfg, &_this->commContext); if(!createContextRes) { LOG(SOCKLIB, WARNING, "creation of CommContext failed."); goto err_invalidateSock; } // establish connection... __IBVSocket_initCommDest(_this->commContext, &_this->localDest); memset(&conn_param, 0, sizeof(conn_param) ); #ifdef BEEGFS_NVFS conn_param.responder_resources = RDMA_MAX_RESP_RES; conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; #else conn_param.responder_resources = 1; conn_param.initiator_depth = 1; #endif /* BEEGFS_NVFS */ conn_param.flow_control = 0; conn_param.retry_count = 7; // (3 bits) conn_param.rnr_retry_count = 7; // rnr = receiver not ready (3 bits, 7 means infinity) conn_param.private_data = &_this->localDest; conn_param.private_data_len = sizeof(_this->localDest); if(rdma_connect(_this->cm_id, &conn_param)) { LOG(SOCKLIB, DEBUG, "rdma_connect failed."); goto err_invalidateSock; } oldChannelFlags = fcntl(IBVSocket_getConnManagerFD(_this), F_GETFL); rc = fcntl(IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags | O_NONBLOCK); if(rc < 0) { LOG(SOCKLIB, WARNING, "Set conn manager channel non-blocking failed.", sysErr); goto err_invalidateSock; } // rdma_connect() can take a very long time (>5m) to timeout if the peer's HCA is down. // Change the channel to non-blocking and use a custom timeout mechanism. 
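// Worked example of the poll schedule below, derived from the constants at the top of this
// file: sleep 1, 2, 4, ..., 512 ms (then repeatedly 512 ms) between non-blocking
// rdma_get_cm_event() checks, until either an event arrives or roughly
// IBVSOCKET_CONN_TIMEOUT_MS (3000 ms) of polling budget has been used up.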
rc = -1; while (connTimeoutRemaining > 0) { // (non-blocking) check for new events rc = rdma_get_cm_event(_this->cm_channel, &event); if (rc) { if (errno != ETIMEDOUT && errno != EAGAIN) { LOG(SOCKLIB, WARNING, "rdma_get_cm_event failed", ("errno", errno)); break; } } else { // we got an event LOG(SOCKLIB, DEBUG, "Received RDMA connect response event.", ("Milliseconds spent polling", IBVSOCKET_CONN_TIMEOUT_MS - connTimeoutRemaining)); break; } connTimeoutRemaining -= connTimeoutMS; if (connTimeoutRemaining > 0) { struct timespec ts = { .tv_sec = 0, .tv_nsec = (connTimeoutMS * 1000 * 1000) }; // progressively scale the timeout by squaring it until it is larger than // IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS if (connTimeoutMS == 1) { connTimeoutMS = 2; } else if (connTimeoutMS * 2 <= IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS) { connTimeoutMS *= 2; } if (::nanosleep(&ts, NULL) != 0) { LOG(SOCKLIB, DEBUG, "rdma_connect: sleep interrupted"); break; } } else LOG(SOCKLIB, DEBUG, "rdma_connect: timed out"); } // change channel mode back to blocking setOldFlagsRes = fcntl(IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags); if(setOldFlagsRes < 0) { LOG(SOCKLIB, WARNING, "Set conn manager channel blocking failed.", sysErr); if (rc == 0) goto err_ack_and_invalidateSock; else goto err_invalidateSock; } if (rc != 0) goto err_invalidateSock; if(event->event != RDMA_CM_EVENT_ESTABLISHED) { if(event->event == RDMA_CM_EVENT_REJECTED) LOG(SOCKLIB, DEBUG, "Connection rejected."); else LOG(SOCKLIB, WARNING, "Unexpected conn manager event.", ("event", rdma_event_str(event->event))); goto err_ack_and_invalidateSock; } parseCommDestRes = __IBVSocket_parseCommDest( event->param.conn.private_data, event->param.conn.private_data_len, &_this->remoteDest); if(!parseCommDestRes) { LOG(SOCKLIB, WARNING, "Bad private data received.", ("len", event->param.conn.private_data_len)); goto err_ack_and_invalidateSock; } rdma_ack_cm_event(event); epollInitRes = __IBVSocket_initEpollFD(_this); if(!epollInitRes) goto err_invalidateSock; return true; err_ack_and_invalidateSock: rdma_ack_cm_event(event); err_invalidateSock: _this->errState = -1; return false; } /** * @return true on success */ bool IBVSocket_bind(IBVSocket* _this, unsigned short port) { in_addr_t ipAddr = INADDR_ANY; return IBVSocket_bindToAddr(_this, ipAddr, port); } bool IBVSocket_bindToAddr(IBVSocket* _this, in_addr_t ipAddr, unsigned short port) { struct sockaddr_in bindAddr; bindAddr.sin_family = AF_INET; bindAddr.sin_addr.s_addr = ipAddr; bindAddr.sin_port = htons(port); LOG(SOCKLIB, DEBUG, "Bind RDMASocket", ("socket", _this), ("addr", Socket::endpointAddrToStr(ipAddr, port))); if(rdma_bind_addr(_this->cm_id, (struct sockaddr*)&bindAddr) ) { //SyslogLogger::log(LOG_WARNING, "%s:%d rdma_bind_addr failed (port: %d)\n", //__func__, __LINE__, (int)port); // debug in goto err_invalidateSock; } _this->bindIP.s_addr = ipAddr; return true; err_invalidateSock: _this->errState = -1; return false; } /** * Note: This also inits the delayedCmEventsQueue. * * @return true on success */ bool IBVSocket_listen(IBVSocket* _this) { if(rdma_listen(_this->cm_id, IBVSOCKET_LISTEN_BACKLOG) ) { LOG(SOCKLIB, WARNING, "rdma_listen failed."); goto err_invalidateSock; } // init delayed events queue _this->delayedCmEventsQ = new CmEventQueue(); return true; err_invalidateSock: _this->errState = -1; return false; } /** * Note: Call IBVSocket_checkDelayedEvents() after this to find out whether more events * are waiting. 
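 *
 * Caller-side sketch (illustrative only; listenSock is a bound and listening IBVSocket and
 * handleNewConnection() is a hypothetical caller function). After poll()/epoll on
 * IBVSocket_getConnManagerFD(listenSock) reports readability:
 *
 *   struct sockaddr_in peerAddr;
 *   socklen_t peerAddrLen = sizeof(peerAddr);
 *   IBVSocket* newSock;
 *
 *   do
 *   {
 *      IBVSocket_AcceptRes res = IBVSocket_accept(listenSock, &newSock,
 *         (struct sockaddr*)&peerAddr, &peerAddrLen);
 *
 *      if(res == ACCEPTRES_SUCCESS)
 *         handleNewConnection(newSock);
 *   } while(IBVSocket_checkDelayedEvents(listenSock) );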
* Note: Because of the special way ibverbs accept connections, it is possible that we receive * some other events here as well (e.g. a child socket disconnect). In these cases, * ACCEPTRES_IGNORE will be returned. * * @param outAcceptedSock only valid when ACCEPTRES_SUCCESS is returned * @param peerAddr (out) peer address * @param peerAddrLen (out) length of peer address * @return ACCEPTRES_IGNORE in case an irrelevant event occurred */ IBVSocket_AcceptRes IBVSocket_accept(IBVSocket* _this, IBVSocket** outAcceptedSock, struct sockaddr* peerAddr, socklen_t* peerAddrLen) { struct rdma_cm_event* event = NULL; IBVCommContext* childCommContext = NULL; IBVSocket* acceptedSock = NULL; // auto-destructed on error/ignore (internal, not for caller) IBVCommDest* childRemoteDest = NULL; // auto-freed on error/ignore *outAcceptedSock = NULL; // get next waiting event from delay-queue or from event channel if (!_this->delayedCmEventsQ->empty()) { event = _this->delayedCmEventsQ->front(); _this->delayedCmEventsQ->pop(); } else if(rdma_get_cm_event(_this->cm_channel, &event) ) { _this->errState = -1; return ACCEPTRES_ERR; } // handle event type switch(event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: { // got an incoming 'connect request' => check validity of private data and accept/reject bool createContextRes; struct rdma_conn_param conn_param; bool parseCommDestRes; IBVCommConfig commCfg; struct rdma_cm_id* child_cm_id = event->id; //*peerAddrLen = sizeof(struct sockaddr_in); //memcpy(peerAddr, &child_cm_id->route.addr.dst_addr, *peerAddrLen); // parse private data to get remote dest parseCommDestRes = __IBVSocket_parseCommDest( event->param.conn.private_data, event->param.conn.private_data_len, &childRemoteDest); if(!parseCommDestRes) { // bad private data => reject connection LOG(SOCKLIB, WARNING, "Bad private data received.", ("len", event->param.conn.private_data_len)); if(rdma_reject(child_cm_id, NULL, 0) ) LOG(SOCKLIB, WARNING, "rdma_reject failed."); goto ignore; } // private data (remote dest) okay => create local comm context and socket instance // (we use the buffer config as suggested by the connecting peer) commCfg.bufNum = childRemoteDest->recvBufNum; commCfg.bufSize = childRemoteDest->recvBufSize; createContextRes = __IBVSocket_createCommContext(_this, child_cm_id, &commCfg, &childCommContext); if(!createContextRes) { LOG(SOCKLIB, WARNING, "Creation of CommContext failed."); if(rdma_reject(child_cm_id, NULL, 0) ) LOG(SOCKLIB, WARNING, "rdma_reject failed."); goto ignore; } acceptedSock = __IBVSocket_constructFromCommContext(child_cm_id, childCommContext); if(!acceptedSock->sockValid) goto ignore; acceptedSock->remoteDest = childRemoteDest; childRemoteDest = NULL; // would otherwise be destroyed at 'ignore' // send accept message (with local destination info) __IBVSocket_initCommDest(childCommContext, &acceptedSock->localDest); memset(&conn_param, 0, sizeof(conn_param) ); #ifdef BEEGFS_NVFS conn_param.responder_resources = RDMA_MAX_RESP_RES; conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; #else conn_param.responder_resources = 1; conn_param.initiator_depth = 1; #endif /* BEEGFS_NVFS */ conn_param.flow_control = 0; conn_param.retry_count = 7; // (3 bits) conn_param.rnr_retry_count = 7; // rnr = receiver not ready (3 bits, 7 means infinity) conn_param.private_data = &acceptedSock->localDest; conn_param.private_data_len = sizeof(acceptedSock->localDest); // test point for dropping the connect request if(IBVSocket_connectionRejection(_this)) goto ignore; if(rdma_accept(child_cm_id, 
&conn_param) ) { LOG(SOCKLIB, WARNING, "rdma_accept failed."); goto ignore; } if(!__IBVSocket_initEpollFD(acceptedSock) ) goto ignore; // Note that this code returns ACCEPTRES_IGNORE LOG(SOCKLIB, DEBUG, "Connection request on RDMASocket"); child_cm_id->context = acceptedSock; acceptedSock = NULL; // would otherwise be destroyed at 'ignore' } break; case RDMA_CM_EVENT_ESTABLISHED: { // received 'established' (this is what we've actually been waiting for!) *peerAddrLen = sizeof(struct sockaddr_in); memcpy(peerAddr, &event->id->route.addr.dst_addr, *peerAddrLen); *outAcceptedSock = (IBVSocket*)event->id->context; rdma_ack_cm_event(event); #ifdef SYSTEM_HAS_RDMA_MIGRATE_ID__disabled // note: this is currently disabled, because: // a) rdma_migrate_id always returns "invalid argument" // b) we need disconnect events for incoming conns to be handled and the handler must call // rdma_disconnect to enable disconnect detection for the streamlistener // note: migration might deadlock if there are any retrieved but not yet ack'ed events // for the current channel, so we cannot migrate if this is the case // note: the only purpose of migration to a separate channel is that we can do better // disconnect detection in waitForCompletion(). so living without the migration is // generally not a problem (but disconnect detection might take longer). if(_this->delayedCmEventsQ->size() ) { // events waiting => don't migrate LOG(SOCKLIB, WARNING, "Skipping rdma_migrate_id due to waiting events (but we can live without it)."); } else { // migrate cm_id from general accept-channel to its own channel int migrateRes = rdma_migrate_id( (*outAcceptedSock)->cm_id, (*outAcceptedSock)->cm_channel); if(migrateRes) { LOG(SOCKLIB, WARNING, "rdma_migrate_id failed (but we can live without it).", migrateRes, sysErr); } } #endif // SYSTEM_HAS_RDMA_MIGRATE_ID return ACCEPTRES_SUCCESS; } break; case RDMA_CM_EVENT_DISCONNECTED: { // note: be careful about what we do with the event-socket here, because the socket might // already be under destruction in another thread. 
LOG(SOCKLIB, DEBUG, "Disconnect event."); // note: the additional disconnect call is required to get the streamlistener event // channel (the one of the listen sock) to report the disconnect rdma_disconnect(event->id); } break; case RDMA_CM_EVENT_UNREACHABLE: { LOG(SOCKLIB, WARNING, "Remote unreachable event while waiting for 'established'."); acceptedSock = (IBVSocket*)event->id->context; // will be destroyed at 'ignore' } break; case RDMA_CM_EVENT_CONNECT_ERROR: { LOG(SOCKLIB, WARNING, "Connect error event while waiting for 'established'."); acceptedSock = (IBVSocket*)event->id->context; // will be destroyed at 'ignore' } break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: { // log only with enabled debug code LOG(SOCKLIB, DEBUG, "Ignoring conn manager event RDMA_CM_EVENT_TIMEWAIT_EXIT."); } break; case RDMA_CM_EVENT_DEVICE_REMOVAL: { AbstractApp* app = PThread::getCurrentThreadApp(); const char* devname = "unknown"; if (event->id && event->id->verbs) devname = ibv_get_device_name(event->id->verbs->device); LOG(SOCKLIB, ERR, "Device removed", ("device", devname)); app->handleNetworkInterfaceFailure(std::string(devname)); } break; default: { // ignore other events // always log LOG(SOCKLIB, WARNING, "Ignoring conn manager event.", ("event", rdma_event_str(event->event))); } break; } // irrelevant event (irrelevant for the caller) ignore: rdma_ack_cm_event(event); SAFE_FREE(childRemoteDest); if(acceptedSock) IBVSocket_destruct(acceptedSock); *outAcceptedSock = NULL; return ACCEPTRES_IGNORE; } bool IBVSocket_shutdown(IBVSocket* _this) { IBVCommContext* commContext = _this->commContext; if(!commContext) return true; // this socket has never been connected // if object is in errState, then the socket might be in an inconsistent state, // therefore further commands (except for disconnect) should not be executed if(!_this->errState && commContext->incompleteSend.numAvailable) { // wait for all incomplete sends int waitRes; waitRes = __IBVSocket_waitForTotalSendCompletion( _this, commContext->incompleteSend.numAvailable, 0, 0); if(waitRes < 0) { LOG(SOCKLIB, WARNING, "Waiting for incomplete send requests failed."); return false; } } __IBVSocket_disconnect(_this); return true; } /** * Continues an incomplete former recv() by returning immediately available data from the * corresponding buffer. 
 */
ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this, char* buf, size_t bufLen)
{
   IBVCommContext* commContext = _this->commContext;
   int completedOffset = commContext->incompleteRecv.completedOffset;
   size_t availableLen = commContext->incompleteRecv.wc.byte_len - completedOffset;
   size_t bufIndex = commContext->incompleteRecv.wc.wr_id - IBVSOCKET_RECV_WORK_ID_OFFSET;

   if(availableLen <= bufLen)
   { // old data fits completely into buf
      memcpy(buf, &(commContext->recvBufs)[bufIndex][completedOffset], availableLen);

      commContext->incompleteRecv.isAvailable = 0;

      int postRes = __IBVSocket_postRecv(_this, _this->commContext, bufIndex);
      if(unlikely(postRes) )
         goto err_invalidateSock;

      return availableLen;
   }
   else
   { // still too much data for the buf => copy partially
      memcpy(buf, &(commContext->recvBufs)[bufIndex][completedOffset], bufLen);

      commContext->incompleteRecv.completedOffset += bufLen;

      return bufLen;
   }

err_invalidateSock:
   _this->errState = -1;

   return -1;
}

ssize_t IBVSocket_recv(IBVSocket* _this, char* buf, size_t bufLen, int flags)
{
   const int timeoutMS = IBVSOCKET_RECV_TIMEOUT_MS;
   ssize_t recvTRes;

   do
   {
      recvTRes = IBVSocket_recvT(_this, buf, bufLen, flags, timeoutMS);
   } while(recvTRes == -ETIMEDOUT);

   return recvTRes;
}

/**
 * @return number of received bytes on success, -ETIMEDOUT on timeout, -1 or -ECOMM on error
 */
ssize_t IBVSocket_recvT(IBVSocket* _this, char* buf, size_t bufLen, int flags, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   struct ibv_wc* wc = &commContext->incompleteRecv.wc;
   int flowControlRes;
   int recvWCRes;

   if(unlikely(_this->errState) )
      return -1;

   // check whether an old buffer has not been fully read yet
   if(!commContext->incompleteRecv.isAvailable)
   { // no partially read data available => recv new buffer

      // check whether we have a pending on-send flow control packet that needs to be received first
      flowControlRes = __IBVSocket_flowControlOnSendWait(_this, timeoutMS);
      if(flowControlRes <= 0)
      {
         if(likely(!flowControlRes) )
            return -ETIMEDOUT; // timeout

         goto err_invalidateSock;
      }

      // recv a new buffer (into the incompleteRecv structure)
      recvWCRes = __IBVSocket_recvWC(_this, timeoutMS, wc);
      if(recvWCRes <= 0)
      {
         if(likely(!recvWCRes) )
            return -ETIMEDOUT; // timeout

         goto err_invalidateSock; // error occurred
      }

      // recvWC was positive => we're guaranteed to have an incompleteRecv buf available
      commContext->incompleteRecv.completedOffset = 0;
      commContext->incompleteRecv.isAvailable = 1;
   }

   return __IBVSocket_recvContinueIncomplete(_this, buf, bufLen);

err_invalidateSock:
   _this->errState = -1;

   return -ECOMM;
}

ssize_t IBVSocket_send(IBVSocket* _this, const char* buf, size_t bufLen, int flags)
{
   IBVCommContext* commContext = _this->commContext;
   int flowControlRes;
   size_t currentBufIndex;
   int postRes;
   size_t postedLen = 0;
   int currentPostLen;
   int waitRes;

   if(unlikely(_this->errState) )
      return -1;

   do
   {
      flowControlRes = __IBVSocket_flowControlOnSendWait(_this, _this->timeoutCfg.flowSendMS);
      if(unlikely(flowControlRes <= 0) )
         goto err_invalidateSock;

      // note: we only poll for completed sends after we used up all (!) available bufs
      if(commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum)
      { // wait for all (!)
incomplete sends waitRes = __IBVSocket_waitForTotalSendCompletion( _this, commContext->incompleteSend.numAvailable, 0, 0); if(waitRes < 0) goto err_invalidateSock; commContext->incompleteSend.numAvailable = 0; } currentPostLen = BEEGFS_MIN(bufLen-postedLen, commContext->commCfg.bufSize); currentBufIndex = commContext->incompleteSend.numAvailable; memcpy( (commContext->sendBufs)[currentBufIndex], &buf[postedLen], currentPostLen); commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */ postRes = __IBVSocket_postSend(_this, currentBufIndex, currentPostLen); if(unlikely(postRes) ) { commContext->incompleteSend.numAvailable--; goto err_invalidateSock; } postedLen += currentPostLen; } while(postedLen < bufLen); return (ssize_t)bufLen; err_invalidateSock: _this->errState = -1; return -ECOMM; } int __IBVSocket_registerBuf(IBVCommContext* commContext, void* buf, size_t bufLen, struct ibv_mr** outMR) { /* note: IB spec says: "The consumer is not allowed to assign remote-write or remote-atomic to a memory region that has not been assigned local-write." */ enum ibv_access_flags accessFlags = (enum ibv_access_flags) (IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); *outMR = ibv_reg_mr(commContext->pd, buf, bufLen, accessFlags); if(!*outMR) { LOG(SOCKLIB, WARNING, "Couldn't allocate MR."); return -1; } return 0; } char* __IBVSocket_allocAndRegisterBuf(IBVCommContext* commContext, size_t bufLen, struct ibv_mr** outMR) { void* buf; int registerRes; int allocRes = posix_memalign(&buf, sysconf(_SC_PAGESIZE), bufLen); if(allocRes) { LOG(SOCKLIB, WARNING, "Couldn't allocate work buf."); return NULL; } memset(buf, 0, bufLen); registerRes = __IBVSocket_registerBuf(commContext, buf, bufLen, outMR); if(registerRes < 0) { free(buf); return NULL; } return (char*)buf; } bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id, IBVCommConfig* commCfg, IBVCommContext** outCommContext) { IBVCommContext* commContext = NULL; int registerControlRes; int registerControlResReset; struct ibv_qp_init_attr qpInitAttr; int createQPRes; unsigned i; // sanity checks if (unlikely(commCfg->bufNum < IBVSOCKET_MIN_BUF_NUM) ) { LOG(SOCKLIB, WARNING, "bufNum too small!", ("got", commCfg->bufNum), ("minimum", IBVSOCKET_MIN_BUF_NUM)); goto err_cleanup; } if (unlikely(commCfg->bufSize < IBVSOCKET_MIN_BUF_SIZE) ) // sanity check { LOG(SOCKLIB, WARNING, "bufSize too small!", ("got", commCfg->bufSize), ("minimum", IBVSOCKET_MIN_BUF_SIZE)); goto err_cleanup; } if (commCfg->bufSize * commCfg->bufNum > IBVSOCKET_MAX_BUF_SIZE_NUM) { LOG(SOCKLIB, WARNING, "bufSize*bufNum too large!", ("got", commCfg->bufSize * commCfg->bufNum), ("maximum", IBVSOCKET_MAX_BUF_SIZE_NUM)); goto err_cleanup; } commContext = (IBVCommContext*)calloc(1, sizeof(*commContext) ); if(!commContext) goto err_cleanup; commContext->context = cm_id->verbs; if(!commContext->context) { LOG(SOCKLIB, WARNING, "Unbound cm_id!!"); goto err_cleanup; } commContext->pd = ibv_alloc_pd(commContext->context); if(!commContext->pd) { LOG(SOCKLIB, WARNING, "Couldn't allocate PD."); goto err_cleanup; } // alloc and register buffers... 
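// Sizing example (illustrative values): with bufNum=12 and bufSize=8192, the code below pins two
// contiguous regions of 96 KiB each (one for recv, one for send) and slices them into per-buffer
// pointers; the sanity checks above cap bufNum*bufSize at 128 MiB per direction.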
commContext->commCfg = *commCfg; commContext->recvBuf = __IBVSocket_allocAndRegisterBuf( commContext, commCfg->bufSize * commCfg->bufNum, &commContext->recvMR); if(!commContext->recvBuf) { LOG(SOCKLIB, WARNING, "Couldn't prepare recvBuf."); goto err_cleanup; } commContext->recvBufs = (char**)calloc(1, commCfg->bufNum * sizeof(char*) ); for(i=0; i < commCfg->bufNum; i++) commContext->recvBufs[i] = &commContext->recvBuf[i * commCfg->bufSize]; commContext->sendBuf = __IBVSocket_allocAndRegisterBuf( commContext, commCfg->bufSize * commCfg->bufNum, &commContext->sendMR); if(!commContext->sendBuf) { LOG(SOCKLIB, WARNING, "Couldn't prepare sendBuf."); goto err_cleanup; } commContext->sendBufs = (char**)calloc(1, commCfg->bufNum * sizeof(char*) ); for(i=0; i < commCfg->bufNum; i++) commContext->sendBufs[i] = &commContext->sendBuf[i * commCfg->bufSize]; registerControlRes = __IBVSocket_registerBuf( commContext, (char*)&commContext->numUsedSendBufs, sizeof(commContext->numUsedSendBufs), &commContext->controlMR); if(registerControlRes < 0) { LOG(SOCKLIB, WARNING, "Couldn't register control memory region."); goto err_cleanup; } registerControlResReset = __IBVSocket_registerBuf( commContext, (char*)&commContext->numUsedSendBufsReset, sizeof(commContext->numUsedSendBufsReset), &commContext->controlResetMR); if(registerControlResReset < 0) { LOG(SOCKLIB, WARNING, "Couldn't register control memory reset region."); goto err_cleanup; } // init flow control v2 (to avoid long receiver-not-ready timeouts) /* note: we use -1 because the last buf might not be read by the user (eg during nonblockingRecvCheck) and so it might not be immediately available again. */ commContext->numReceivedBufsLeft = commCfg->bufNum - 1; commContext->numSendBufsLeft = commCfg->bufNum - 1; // create completion channel and queues... commContext->recvCompChannel = ibv_create_comp_channel(commContext->context); if(!commContext->recvCompChannel) { LOG(SOCKLIB, WARNING, "Couldn't create comp channel."); goto err_cleanup; } commContext->recvCQ = ibv_create_cq( commContext->context, commCfg->bufNum, commContext, commContext->recvCompChannel, rand()%commContext->context->num_comp_vectors); if(!commContext->recvCQ) { LOG(SOCKLIB, WARNING, "Couldn't create recv CQ."); goto err_cleanup; } // note: 1+commCfg->bufNum here for the RDMA write usedBufs reset work (=> flow/flood control) commContext->sendCQ = ibv_create_cq( commContext->context, 1+commCfg->bufNum, NULL, NULL, rand()%commContext->context->num_comp_vectors); if(!commContext->sendCQ) { LOG(SOCKLIB, WARNING, "Couldn't create send CQ."); goto err_cleanup; } // note: 1+commCfg->bufNum here for the RDMA write usedBufs reset work memset(&qpInitAttr, 0, sizeof(qpInitAttr) ); qpInitAttr.send_cq = commContext->sendCQ; qpInitAttr.recv_cq = commContext->recvCQ; qpInitAttr.qp_type = IBV_QPT_RC; qpInitAttr.sq_sig_all = 1; qpInitAttr.cap.max_send_wr = 1+commCfg->bufNum; qpInitAttr.cap.max_recv_wr = commCfg->bufNum; qpInitAttr.cap.max_send_sge = 1; qpInitAttr.cap.max_recv_sge = 1; qpInitAttr.cap.max_inline_data = 0; createQPRes = rdma_create_qp(cm_id, commContext->pd, &qpInitAttr); if(createQPRes) { LOG(SOCKLIB, WARNING, "Couldn't create QP.", sysErr); goto err_cleanup; } commContext->qp = cm_id->qp; // post initial recv buffers... for(i=0; i < commCfg->bufNum; i++) { if(__IBVSocket_postRecv(_this, commContext, i) ) { LOG(SOCKLIB, WARNING, "Couldn't post recv buffer.", ("index", i)); goto err_cleanup; } } // prepare event notification... 
// initial event notification request if(ibv_req_notify_cq(commContext->recvCQ, 0) ) { LOG(SOCKLIB, WARNING, "Couldn't request CQ notification."); goto err_cleanup; } #ifdef BEEGFS_NVFS commContext->workerMRs = new MRMap(); commContext->cqMutex = new Mutex(); commContext->cqCompletions = new CQMap(); // RDMA id. (This variable will increment for each RDMA operation.) commContext->wr_id = 1; #endif /* BEEGFS_NVFS */ LOG(SOCKLIB, DEBUG, __func__, ("_this", StringTk::uint64ToHexStr((uint64_t) _this)), ("device", cm_id->verbs->device->name)); *outCommContext = commContext; return true; // error handling err_cleanup: __IBVSocket_cleanupCommContext(cm_id, commContext); *outCommContext = NULL; return false; } void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext) { if(!commContext) return; if(commContext->qp) { // see recommendation here: https://www.rdmamojo.com/2012/12/28/ibv_destroy_qp/ // the qp should be set to error state, so that no more events can be pushed to that queue. struct ibv_qp_attr qpAttr; qpAttr.qp_state = IBV_QPS_ERR; if (ibv_modify_qp(commContext->qp, &qpAttr, IBV_QP_STATE)) { LOG(SOCKLIB, WARNING, "Failed to modify qp IBV_QP_STATE."); } } // ack remaining delayed acks if(commContext->recvCQ && commContext->numUnackedRecvCompChannelEvents) ibv_ack_cq_events(commContext->recvCQ, commContext->numUnackedRecvCompChannelEvents); if(commContext->qp) { rdma_destroy_qp(cm_id); } if(commContext->sendCQ) { if(ibv_destroy_cq(commContext->sendCQ) ) LOG(SOCKLIB, WARNING, "Failed to destroy sendCQ."); } if(commContext->recvCQ) { if(ibv_destroy_cq(commContext->recvCQ) ) LOG(SOCKLIB, WARNING, "Failed to destroy recvCQ."); } if(commContext->recvCompChannel) { if(ibv_destroy_comp_channel(commContext->recvCompChannel) ) LOG(SOCKLIB, WARNING, "Failed to destroy recvCompChannel."); } if(commContext->controlMR) { if(ibv_dereg_mr(commContext->controlMR) ) LOG(SOCKLIB, WARNING, "Failed to deregister controlMR."); } if(commContext->controlResetMR) { if(ibv_dereg_mr(commContext->controlResetMR) ) LOG(SOCKLIB, WARNING, "Failed to deregister controlResetMR."); } if(commContext->recvMR) { if(ibv_dereg_mr(commContext->recvMR) ) LOG(SOCKLIB, WARNING, "Failed to deregister recvMR."); } if(commContext->sendMR) { if(ibv_dereg_mr(commContext->sendMR) ) LOG(SOCKLIB, WARNING, "Failed to deregister sendMR."); } #ifdef BEEGFS_NVFS if (commContext->workerMRs) { for (auto& iter: *(commContext->workerMRs)) { if(ibv_dereg_mr(iter.second) ) LOG(SOCKLIB, WARNING, "Failed to deregister workerMR."); } commContext->workerMRs->clear(); delete(commContext->workerMRs); } if (commContext->cqCompletions) { commContext->cqCompletions->clear(); delete(commContext->cqCompletions); } delete(commContext->cqMutex); #endif /* BEEGFS_NVFS */ SAFE_FREE(commContext->recvBuf); SAFE_FREE(commContext->sendBuf); SAFE_FREE(commContext->recvBufs); SAFE_FREE(commContext->sendBufs); if(commContext->pd) { if(ibv_dealloc_pd(commContext->pd) ) LOG(SOCKLIB, WARNING, "Failed to dealloc pd."); } free(commContext); } /** * Initializes a (local) IBVCommDest. 
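 *
 * The resulting struct is what travels as connection private data: a verification string and
 * protocol version, the rkey/vaddr of the local flow control word (numUsedSendBufs), and the
 * recv buffer count/size that this side suggests to the peer (see __IBVSocket_parseCommDest for
 * the receiving end). Multi-byte fields are stored little-endian.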
*/ void __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest) { memcpy(outDest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN); outDest->protocolVersion = HOST_TO_LE_64(IBVSOCKET_PRIVATEDATA_PROTOCOL_VER); outDest->rkey = HOST_TO_LE_32(commContext->controlMR->rkey); outDest->vaddr = HOST_TO_LE_64((uintptr_t)&commContext->numUsedSendBufs); outDest->recvBufNum = HOST_TO_LE_32(commContext->commCfg.bufNum); outDest->recvBufSize = HOST_TO_LE_32(commContext->commCfg.bufSize); } /** * Checks and parses a (remote) IBVCommDest. * * @param buf should usually be the private_data of the connection handshake * @param outDest will be alloced (if true is returned) and needs to be free'd by the caller * @return true if data is okay, false otherwise */ bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest) { IBVCommDest* dest = NULL; *outDest = NULL; // Note: "bufLen < ..." (and not "!="), because there might be some extra padding if(!buf || (bufLen < sizeof(*dest) ) ) { LOG(SOCKLIB, WARNING, "Bad private data size.", bufLen); return false; } dest = (IBVCommDest*)malloc(sizeof(*dest) ); if(!dest) return false; memcpy(dest, buf, sizeof(*dest) ); if(memcmp(dest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN) != 0 ) goto err_cleanup; dest->protocolVersion = LE_TO_HOST_64(dest->protocolVersion); if (dest->protocolVersion != IBVSOCKET_PRIVATEDATA_PROTOCOL_VER) goto err_cleanup; dest->rkey = LE_TO_HOST_32(dest->rkey); dest->vaddr = LE_TO_HOST_64(dest->vaddr); dest->recvBufNum = LE_TO_HOST_32(dest->recvBufNum); dest->recvBufSize = LE_TO_HOST_32(dest->recvBufSize); *outDest = dest; return true; err_cleanup: SAFE_FREE(dest); return false; } /** * Append buffer to receive queue. 
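 *
 * The work request id encodes the buffer index (bufIndex + IBVSOCKET_RECV_WORK_ID_OFFSET), so a
 * later completion can be mapped back to the matching entry of recvBufs.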
* * @param commContext passed seperately because it's not the _this->commContext during * accept() of incoming connections * @return 0 on success, -1 on error */ int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex) { struct ibv_sge list; struct ibv_recv_wr wr; struct ibv_recv_wr* bad_wr; int postRes; list.addr = (uint64_t)commContext->recvBufs[bufIndex]; list.length = commContext->commCfg.bufSize; list.lkey = commContext->recvMR->lkey; wr.next = NULL; wr.wr_id = bufIndex + IBVSOCKET_RECV_WORK_ID_OFFSET; wr.sg_list = &list; wr.num_sge = 1; postRes = ibv_post_recv(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { LOG(SOCKLIB, WARNING, "ibv_post_recv failed.", postRes, sysErr(postRes)); return -1; } return 0; } /** * Synchronous RDMA write (waits for completion) * * @return 0 on success, -1 on error */ int __IBVSocket_postWrite(IBVSocket* _this, IBVCommDest* remoteDest, struct ibv_mr* localMR, char* localBuf, int bufLen) { IBVCommContext* commContext = _this->commContext; struct ibv_sge list; struct ibv_send_wr wr; struct ibv_send_wr *bad_wr; int postRes; int waitRes; list.addr = (uint64_t)localBuf; list.length = bufLen; list.lkey = localMR->lkey; wr.wr.rdma.remote_addr = remoteDest->vaddr; wr.wr.rdma.rkey = remoteDest->rkey; wr.wr_id = IBVSOCKET_WRITE_WORK_ID; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_WRITE; wr.send_flags = IBV_SEND_SIGNALED; wr.next = NULL; postRes = ibv_post_send(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes)); return -1; } waitRes = __IBVSocket_waitForTotalSendCompletion(_this, commContext->incompleteSend.numAvailable, 1, 0); if(unlikely(waitRes) ) return -1; commContext->incompleteSend.numAvailable = 0; return 0; } /** * Synchronous RDMA read (waits for completion). * * @return 0 on success, -1 on error */ int __IBVSocket_postRead(IBVSocket* _this, IBVCommDest* remoteDest, struct ibv_mr* localMR, char* localBuf, int bufLen) { IBVCommContext* commContext = _this->commContext; struct ibv_sge list; struct ibv_send_wr wr; struct ibv_send_wr *bad_wr; int postRes; int waitRes; list.addr = (uint64_t) localBuf; list.length = bufLen; list.lkey = localMR->lkey; wr.wr.rdma.remote_addr = remoteDest->vaddr; wr.wr.rdma.rkey = remoteDest->rkey; wr.wr_id = IBVSOCKET_READ_WORK_ID; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; wr.send_flags = IBV_SEND_SIGNALED; wr.next = NULL; postRes = ibv_post_send(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes)); return -1; } waitRes = __IBVSocket_waitForTotalSendCompletion(_this, commContext->incompleteSend.numAvailable, 0, 1); if(unlikely(waitRes) ) return -1; commContext->incompleteSend.numAvailable = 0; return 0; } #ifdef BEEGFS_NVFS static bool __IBVSocket_getBufferKey(IBVCommContext *commContext, char *buffer, unsigned *key) { struct ibv_mr *mr = NULL; MRMap::const_iterator iter = commContext->workerMRs->find(buffer); if (iter == commContext->workerMRs->end()) { // It is assumed that buffer came from a Worker and is WORKER_BUFOUT_SIZE. // TODO: pass around a Buffer with a length instead of unqualified char*. // This cache of ibv_mr will potentially grow to Workers * Targets // and the ibv_mr instances hang around until the IBVSocket is destroyed. // That is probably something to look into... 
if (unlikely(__IBVSocket_registerBuf(commContext, buffer, WORKER_BUFOUT_SIZE, &mr))) { LOG(SOCKLIB, WARNING, "ibv_postWrite(): failed to register buffer."); return false; } commContext->workerMRs->insert({buffer, mr}); } else { mr = iter->second; } *key = mr->lkey; return true; } /** * Wait for the completion of a specific RDMA operation. * @return number of completed elements or -1 in case of an error */ static int __IBVSocket_waitForRDMACompletion(IBVCommContext* commContext, uint64_t id) { struct ibv_wc wc[IBVSOCKET_WC_ENTRIES]; int i = 0; int found = 0; int status = 0; int num_wc = 0; /* * This function is locked so that we don't get a race condition between two workers * looking for completions. */ commContext->cqMutex->lock(); CQMap::const_iterator iter = commContext->cqCompletions->find(id); /* * Check to see if we have already found the completion we are looking for. */ if (iter != commContext->cqCompletions->end()) { commContext->cqCompletions->erase(id); commContext->cqMutex->unlock(); return 0; } /* * Continue to poll the CQ until we find the entry in question or we encounter a * bad status. */ while (!found && !status) { num_wc = ibv_poll_cq(commContext->sendCQ, IBVSOCKET_WC_ENTRIES, wc); if (num_wc > 0) { for (i = 0; i < num_wc; i++) { if (unlikely(wc[i].status != IBV_WC_SUCCESS)) { LOG(SOCKLIB, DEBUG, "Connection error.", wc[i].status); status = -1; break; } if ((wc[i].opcode == IBV_WC_RDMA_WRITE) || (wc[i].opcode == IBV_WC_RDMA_READ)) { if (wc[i].wr_id == id) { found = 1; } else { commContext->cqCompletions->insert({wc[i].wr_id, wc[i].opcode}); } } else if (wc[i].opcode == IBV_WC_SEND) { if (likely(commContext->incompleteSend.numAvailable)) { commContext->incompleteSend.numAvailable--; } else { LOG(SOCKLIB, WARNING, "Received bad/unexpected send completion."); status = -1; break; } } else { LOG(SOCKLIB, WARNING, "Received unexpected CQ opcode.", wc[i].opcode); status = -1; break; } } } } commContext->cqMutex->unlock(); return status; } /** * Process RDMA requests. 
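 *
 * Each request is posted as a signaled IBV_WR_RDMA_READ/IBV_WR_RDMA_WRITE with a unique wr_id
 * taken from the per-connection counter, then __IBVSocket_waitForRDMACompletion polls the send
 * CQ for that wr_id and parks completions belonging to other in-flight requests in
 * cqCompletions, so several worker threads can share the same send CQ.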
* * @return 0 on success, -1 on error */ static int __IBVSocket_postRDMA(IBVSocket* _this, ibv_wr_opcode opcode, char* localBuf, int bufLen, unsigned lkey, uint64_t remoteBuf, unsigned rkey) { IBVCommContext* commContext = _this->commContext; struct ibv_sge list; struct ibv_send_wr wr; struct ibv_send_wr *bad_wr; int postRes; int waitRes; if (unlikely(lkey == 0)) { if (unlikely(!__IBVSocket_getBufferKey(commContext, localBuf, &lkey))) { LOG(SOCKLIB, WARNING, "ibv_postRDMA(): no local key."); return -1; } } list.addr = (uint64_t) localBuf; list.length = bufLen; list.lkey = lkey; wr.wr_id = __atomic_fetch_add(&commContext->wr_id, 1, __ATOMIC_SEQ_CST); wr.next = NULL; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = opcode; wr.send_flags = IBV_SEND_SIGNALED; wr.wr.rdma.remote_addr = remoteBuf; wr.wr.rdma.rkey = rkey; postRes = ibv_post_send(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes)); return -1; } waitRes = __IBVSocket_waitForRDMACompletion(commContext, wr.wr_id); return waitRes; } int __IBVSocket_postWrite(IBVSocket* _this, char* localBuf, int bufLen, unsigned lkey, uint64_t remoteBuf, unsigned rkey) { return __IBVSocket_postRDMA(_this, IBV_WR_RDMA_WRITE, localBuf, bufLen, lkey, remoteBuf, rkey); } int __IBVSocket_postRead(IBVSocket* _this, char* localBuf, int bufLen, unsigned lkey, uint64_t remoteBuf, unsigned rkey) { return __IBVSocket_postRDMA(_this, IBV_WR_RDMA_READ, localBuf, bufLen, lkey, remoteBuf, rkey); } ssize_t IBVSocket_read(IBVSocket* _this, const char* buf, size_t bufLen, unsigned lkey, const uint64_t rbuf, unsigned rkey) { return __IBVSocket_postRead(_this, (char *)buf, bufLen, lkey, rbuf, rkey); } ssize_t IBVSocket_write(IBVSocket* _this, const char* buf, size_t bufLen, unsigned lkey, const uint64_t rbuf, unsigned rkey) { return __IBVSocket_postWrite(_this, (char *)buf, bufLen, lkey, rbuf, rkey); } #endif /* BEEGFS_NVFS */ /** * Note: Contains flow control. * * @return 0 on success, -1 on error */ int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex, int bufLen) { IBVCommContext* commContext = _this->commContext; struct ibv_sge list; struct ibv_send_wr wr; struct ibv_send_wr *bad_wr; int postRes; list.addr = (uint64_t)commContext->sendBufs[bufIndex]; list.length = bufLen; list.lkey = commContext->sendMR->lkey; wr.wr_id = bufIndex + IBVSOCKET_SEND_WORK_ID_OFFSET; wr.next = NULL; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IBV_WR_SEND; wr.send_flags = IBV_SEND_SIGNALED; postRes = ibv_post_send(commContext->qp, &wr, &bad_wr); if(unlikely(postRes) ) { LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes)); return -1; } // flow control __IBVSocket_flowControlOnSendUpdateCounters(_this); return 0; } /** * Note: Contains flow control. * * @return 1 on success, 0 on timeout, -1 on error */ int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ibv_wc* outWC) { IBVCommContext* commContext = _this->commContext; size_t bufIndex; int waitRes = __IBVSocket_waitForRecvCompletionEvent(_this, timeoutMS, outWC); if(waitRes <= 0) { // (note: waitRes==0 can often happen, because we call this with timeoutMS==0) if(unlikely(waitRes < 0) ) LOG(SOCKLIB, DEBUG, "Retrieval of completion event failed.", waitRes); else if(unlikely(timeoutMS) ) LOG(SOCKLIB, DEBUG, "Waiting for recv completion timed out."); return waitRes; } // we got something... 
if(unlikely(outWC->status != IBV_WC_SUCCESS) ) { LOG(SOCKLIB, DEBUG, "Connection error.", outWC->status); return -1; } bufIndex = outWC->wr_id - IBVSOCKET_RECV_WORK_ID_OFFSET; if(unlikely(bufIndex >= commContext->commCfg.bufNum) ) { LOG(SOCKLIB, WARNING, "Completion for unknown/invalid wr_id.", outWC->wr_id); return -1; } // receive completed //printf("%s: Recveived %u bytes.\n", __func__, outWC->byte_len); // debug in // flow control if(unlikely(__IBVSocket_flowControlOnRecv(_this, timeoutMS) ) ) return -1; return 1; } /** * Intention: Avoid IB rnr by sending control msg when (almost) all our recv bufs are used up to * show that we got our new recv bufs ready. * * @return 0 on success, -1 on error */ int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS) { IBVCommContext* commContext = _this->commContext; // we received a packet, so peer has received all of our currently pending data => reset counter commContext->numSendBufsLeft = commContext->commCfg.bufNum - 1; /* (see createCommContext() for "-1" reason) */ // send control packet if recv counter expires... #ifdef BEEGFS_DEBUG if(!commContext->numReceivedBufsLeft) LOG(SOCKLIB, WARNING, "BUG: numReceivedBufsLeft underflow!"); #endif // BEEGFS_DEBUG commContext->numReceivedBufsLeft--; if(!commContext->numReceivedBufsLeft) { size_t currentBufIndex; int postRes; if(commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum) { // wait for all (!) incomplete sends /* note: it's ok that all send bufs are used up, because it's possible that we do a lot of recv without the user sending any data in between (so the bufs were actually used up by flow control). */ int waitRes = __IBVSocket_waitForTotalSendCompletion( _this, commContext->incompleteSend.numAvailable, 0, 0); if(waitRes < 0) return -1; commContext->incompleteSend.numAvailable = 0; } currentBufIndex = commContext->incompleteSend.numAvailable; commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */ postRes = __IBVSocket_postSend(_this, currentBufIndex, IBVSOCKET_FLOWCONTROL_MSG_LEN); if(unlikely(postRes) ) { commContext->incompleteSend.numAvailable--; return -1; } // note: numReceivedBufsLeft is reset during postSend() flow control } return 0; } /** * Called after sending a packet to update flow control counters. * * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up. * * Note: This is only one part of the on-send flow control. The other one is * _flowControlOnSendWait(). */ void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this) { IBVCommContext* commContext = _this->commContext; // we sent a packet, so we received all currently pending data from the peer => reset counter commContext->numReceivedBufsLeft = commContext->commCfg.bufNum - 1; /* (see createCommContext() for "-1" reason) */ #ifdef BEEGFS_DEBUG if(!commContext->numSendBufsLeft) LOG(SOCKLIB, WARNING, "BUG: numSendBufsLeft underflow!"); #endif commContext->numSendBufsLeft--; } /** * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up. 
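 *
 * Example (illustrative bufNum): with commCfg.bufNum == 16, numSendBufsLeft starts at 15; after
 * 15 sends without receiving anything from the peer it reaches 0, and the next send first blocks
 * here until the peer's 1-byte flow control message (posted by the peer's _flowControlOnRecv)
 * arrives and resets the counter.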
* * @timeoutMS may be 0 for non-blocking operation, otherwise typically * IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS * @return >0 on success, 0 on timeout (waiting for flow control packet from peer), <0 on error */ int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS) { IBVCommContext* commContext = _this->commContext; struct ibv_wc wc; int recvRes; size_t bufIndex; int postRecvRes; if(commContext->numSendBufsLeft) return 1; // flow control not triggered yet recvRes = __IBVSocket_recvWC(_this, timeoutMS, &wc); if(recvRes <= 0) return recvRes; bufIndex = wc.wr_id - IBVSOCKET_RECV_WORK_ID_OFFSET; if(unlikely(wc.byte_len != IBVSOCKET_FLOWCONTROL_MSG_LEN) ) { // error (bad length) LOG(SOCKLIB, WARNING, "Received flow control packet length mismatch.", wc.byte_len); return -1; } postRecvRes = __IBVSocket_postRecv(_this, commContext, bufIndex); if(postRecvRes) return -1; // note: numSendBufsLeft is reset during recvWC() (if it actually received a packet) return 1; } /** * @return 1 on available data, 0 on timeout, -1 on error */ int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS, struct ibv_wc* outWC) { /* Note: This will also be called with timeoutMS==0 from nonblockingRecvCheck to remove * a potentially outdated event notification. for this reason, we have to check the event * channel even if "ibv_poll_cq returns 0" and "timeoutMS==0". */ IBVCommContext* commContext = _this->commContext; struct ibv_cq* ev_cq; // event completion queue void* ev_ctx; // event context struct epoll_event epollEvent; // check quick path (is an event available without waiting?) int numImmediateEvents = ibv_poll_cq(commContext->recvCQ, 1, outWC); if(unlikely(numImmediateEvents < 0) ) { LOG(SOCKLIB, WARNING, "Poll CQ failed.", numImmediateEvents); return -1; } else if(numImmediateEvents > 0) return 1; // no immediate event available => wait for them... for( ; ; ) /* (loop until "wc retrieved" or "timeout" or "error") */ { /* note: we use pollTimeoutMS to check the conn every few secs (otherwise we might wait for a very long time in case the other side disconnected silently) */ int pollTimeoutMS = BEEGFS_MIN(_this->timeoutCfg.pollMS, timeoutMS); int epollRes = epoll_wait(_this->epollFD, &epollEvent, 1, pollTimeoutMS); if(unlikely(epollRes < 0) ) { if(errno == EINTR) continue; // ignore EINTR, because debugger causes it LOG(SOCKLIB, WARNING, "Epoll error.", sysErr); return -1; } if(epollRes == 0) { // poll timed out // Note: we check "timeoutMS != 0" here because we don't want to run the // connCheck each time this method is called from nonblockingRecvCheck if(timeoutMS) { int checkRes = IBVSocket_checkConnection(_this); if(checkRes < 0) return -1; } timeoutMS -= pollTimeoutMS; if(!timeoutMS) return 0; continue; } if(unlikely(_this->cm_channel && (epollEvent.data.fd == _this->cm_channel->fd) ) ) { // cm event incoming struct rdma_cm_event* event = 0; if (rdma_get_cm_event(_this->cm_channel, &event) < 0) { LOG(SOCKLIB, DEBUG, "Disconnected by rdma_get_cm_event error."); _this->errState = -1; return -1; } // Note: this code doesn't encounter RDMA_CM_EVENT_DEVICE_REMOVAL if(event->event == RDMA_CM_EVENT_DISCONNECTED) { LOG(SOCKLIB, DEBUG, "Disconnect event received."); rdma_ack_cm_event(event); _this->errState = -1; return -1; } else { LOG(SOCKLIB, DEBUG, "Ingoring received event", ("event", rdma_event_str(event->event))); // debug in rdma_ack_cm_event(event); continue; } } // we received a completion event notification => retrieve the event... 
int getEventRes = ibv_get_cq_event(commContext->recvCompChannel, &ev_cq, &ev_ctx); if(unlikely(getEventRes) ) { LOG(SOCKLIB, WARNING, "Failed to get cq_event."); return -1; } if(unlikely(ev_cq != commContext->recvCQ) ) { LOG(SOCKLIB, WARNING, "CQ event for unknown CQ.", ev_cq); return -1; } // request notification for next event int reqNotifyRes = ibv_req_notify_cq(commContext->recvCQ, 0); if(unlikely(reqNotifyRes) ) { LOG(SOCKLIB, WARNING, "Couldn't request CQ notification."); return -1; } // ack is expensive, so we gather and ack multiple events // note: spec says we need this, but current send_bw.c & co don't use ibv_ack_cq_events. commContext->numUnackedRecvCompChannelEvents++; if(commContext->numUnackedRecvCompChannelEvents == IBVSOCKET_EVENTS_GATHER_NUM) { // ack events and reset counter ibv_ack_cq_events(commContext->recvCQ, commContext->numUnackedRecvCompChannelEvents); commContext->numUnackedRecvCompChannelEvents = 0; } // query event... /* note: ibv_poll_cq() does not necessarily return "!=0" after a received event, because the event might be outdated */ int numEvents = ibv_poll_cq(commContext->recvCQ, 1, outWC); if(unlikely(numEvents < 0) ) { LOG(SOCKLIB, WARNING, "Poll CQ failed.", numEvents); return -1; } else if(numEvents > 0) return 1; // we received a notification for an outdated event => wait again in the next round } // end of for-loop } /** * @return number of completed elements or -1 in case of an error */ int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this, int numSendElements, int numWriteElements, int numReadElements) { IBVCommContext* commContext = _this->commContext; int numElements; int i; size_t bufIndex; struct ibv_wc wc[2]; do { numElements = ibv_poll_cq(commContext->sendCQ, 2, wc); if(unlikely(numElements < 0) ) { LOG(SOCKLIB, WARNING, "Bad ibv_poll_cq result.", numElements); return -1; } // for each completion element for(i=0; i < numElements; i++) { if(unlikely(wc[i].status != IBV_WC_SUCCESS) ) { LOG(SOCKLIB, DEBUG, "Connection error.", wc[i].status); return -1; } switch(wc[i].opcode) { case IBV_WC_SEND: { bufIndex = wc[i].wr_id - IBVSOCKET_SEND_WORK_ID_OFFSET; if(unlikely(bufIndex >= commContext->commCfg.bufNum) ) { LOG(SOCKLIB, WARNING, "Bad send completion wr_id.", wc[i].wr_id); return -1; } if(likely(numSendElements) ) numSendElements--; else { LOG(SOCKLIB, WARNING, "Received bad/unexpected send completion."); return -1; } } break; case IBV_WC_RDMA_WRITE: { if(unlikely(wc[i].wr_id != IBVSOCKET_WRITE_WORK_ID) ) { LOG(SOCKLIB, WARNING, "bad write completion wr_id.", wc[i].wr_id); return -1; } if(likely(numWriteElements) ) numWriteElements--; else { LOG(SOCKLIB, WARNING, "Received bad/unexpected RDMA write completion."); return -1; } } break; case IBV_WC_RDMA_READ: { if(unlikely(wc[i].wr_id != IBVSOCKET_READ_WORK_ID) ) { LOG(SOCKLIB, WARNING, "Bad read completion wr_id.", wc[i].wr_id); return -1; } if(likely(numReadElements) ) numReadElements--; else { LOG(SOCKLIB, WARNING, "Received bad/unexpected RDMA read completion."); return -1; } } break; default: { LOG(SOCKLIB, WARNING, "Bad/unexpected completion opcode.", wc[i].opcode); return -1; } break; } // end of switch } // end of for-loop } while(numSendElements || numWriteElements || numReadElements); return 0; } /** * @return 0 on success, -1 on error */ int IBVSocket_checkConnection(IBVSocket* _this) { struct ibv_qp_attr qpAttr; struct ibv_qp_init_attr qpInitAttr; int qpRes; int postRes; IBVCommContext* commContext = _this->commContext; //printf("%s: querying qp...\n", __func__); // debug in // 
check qp status qpRes = ibv_query_qp(commContext->qp, &qpAttr, IBV_QP_STATE, &qpInitAttr); if(qpRes || (qpAttr.qp_state == IBV_QPS_ERR) ) { LOG(SOCKLIB, WARNING, "Detected QP error state."); _this->errState = -1; return -1; } // note: we read a remote value into the numUsedSendBufsReset field, which is actually // meant for something else, so we need to reset the value afterwards //printf("%d:%s: post rdma_read to check connection...\n", __LINE__, __func__); // debug in postRes = __IBVSocket_postRead(_this, _this->remoteDest, commContext->controlResetMR, (char*)&commContext->numUsedSendBufsReset, sizeof(commContext->numUsedSendBufsReset) ); if(postRes) { _this->errState = -1; return -1; } commContext->numUsedSendBufsReset = 0; //printf("%d:%s: rdma_read succeeded\n", __LINE__, __func__); // debug in return 0; } /** * @return <0 on error, 0 if recv would block, >0 if recv would not block */ ssize_t IBVSocket_nonblockingRecvCheck(IBVSocket* _this) { /* note: this will also be called from the stream listener for false alarm checks, so make * sure that we remove an (outdated) event from the channel to mute the false alarm. */ IBVCommContext* commContext = _this->commContext; struct ibv_wc* wc = &commContext->incompleteRecv.wc; int flowControlRes; int recvRes; if(unlikely(_this->errState) ) return -1; if(commContext->incompleteRecv.isAvailable) return 1; // check whether we have a pending on-send flow control packet that needs to be received first flowControlRes = __IBVSocket_flowControlOnSendWait(_this, 0); if(unlikely(flowControlRes < 0) ) goto err_invalidateSock; if(!flowControlRes) return 0; // recv one packet (if available) and add it as incompleteRecv // or remove event channel notification otherwise (to avoid endless false alerts) recvRes = __IBVSocket_recvWC(_this, 0, wc); if(unlikely(recvRes < 0) ) goto err_invalidateSock; if(!recvRes) return 0; // we got something => prepare to continue later commContext->incompleteRecv.completedOffset = 0; commContext->incompleteRecv.isAvailable = 1; return 1; err_invalidateSock: _this->errState = -1; return -1; } /** * Call this after accept() to find out whether more events are waiting (for which * no notification would not be delivered through the file descriptor). * * @return true if more events are waiting and accept() should be called again */ bool IBVSocket_checkDelayedEvents(IBVSocket* _this) { bool retVal = false; struct rdma_cm_event* event; // check for events in the delay queue if (!_this->delayedCmEventsQ->empty()) return true; // Switch channel fd to non-blocking, check for waiting events and switch back to blocking. // (Quite inefficient, but we really don't have to care about efficiency in this case.) 
// Note: We do this to avoid race conditions (lost events) before we're waiting for // new notifications with poll() // change mode of the connection manager channel to non-blocking int oldChannelFlags = fcntl(IBVSocket_getConnManagerFD(_this), F_GETFL); int setNewFlagsRes = fcntl( IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags | O_NONBLOCK); if(setNewFlagsRes < 0) { LOG(SOCKLIB, WARNING, "Set conn manager channel non-blocking failed.", sysErr); return false; } // (non-blocking) check for new events if(rdma_get_cm_event(_this->cm_channel, &event) ) { // non-blocking mode, so we ignore "pseudo-errors" here } else { // incoming event available //printf("%d:%s: enqueueing an event (during non-blocking check): %d (%s)\n", // __LINE__, __func__, event->event, rdma_event_str(event->event) ); // debug in _this->delayedCmEventsQ->push(event); retVal = true; } // change channel mode back to blocking int setOldFlagsRes = fcntl(IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags); if(setOldFlagsRes < 0) { LOG(SOCKLIB, WARNING, "Set conn manager channel blocking failed.", sysErr); return false; } return retVal; } void __IBVSocket_disconnect(IBVSocket* _this) { /* note: we only call rdma_disconnect() here if the socket is not connected to the common listen sock event channel to avoid a race condition (because the sock accept method also calls rdma_disconnect() in the streamlistener thread. ...but that's ok. we really don't need that additional event if we're actively disconnecting the sock. */ if(_this->cm_channel) { int disconnectRes = rdma_disconnect(_this->cm_id); if(disconnectRes) { LOG(SOCKLIB, WARNING, "rdma disconnect error.", sysErr); return; } // note: we can't wait for events here, because the disconnect event might // be received by the listen socket channel (for accepted sockets with older // ofed versions). /* if(!_this->cm_channel || !waitForEvent) return; rdma_get_cm_event(_this->cm_channel, &event); if(event->event != RDMA_CM_EVENT_DISCONNECTED) { SyslogLogger::log(LOG_WARNING, "%s: unexpected event during disconnect %d: %s\n", __func__, event->event, rdma_event_str(event->event) ); } rdma_ack_cm_event(event); */ } } void __IBVSocket_close(IBVSocket* _this) { SAFE_FREE(_this->remoteDest); if(_this->delayedCmEventsQ) { // ack all queued events while (!_this->delayedCmEventsQ->empty()) { struct rdma_cm_event* nextEvent = _this->delayedCmEventsQ->front(); rdma_ack_cm_event(nextEvent); _this->delayedCmEventsQ->pop(); } delete(_this->delayedCmEventsQ); } if(_this->commContext) __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext); if(_this->cm_id) rdma_destroy_id(_this->cm_id); if(_this->cm_channel) rdma_destroy_event_channel(_this->cm_channel); } /** * Note: Call this for connected sockets only. 
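 *
 * The epoll set built here contains the recv completion channel fd and, if this socket owns its
 * own connection manager channel (outgoing connections), the cm channel fd as well; accepted
 * sockets normally have no channel of their own and see disconnects via the listen socket's
 * channel instead.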
*/ bool __IBVSocket_initEpollFD(IBVSocket* _this) { _this->epollFD = epoll_create(1); // "1" is just a hint (and is actually ignored) if(_this->epollFD == -1) { LOG(SOCKLIB, WARNING, "epoll initialization error.", sysErr); return false; } struct epoll_event epollEvent; epollEvent.events = EPOLLIN; epollEvent.data.fd = IBVSocket_getRecvCompletionFD(_this); // note: we only add the recvCompletionFD here and not commContext->context->async_fd, because // accepted sockets don't have their own async event channel (they receive events through // their parent's fd) int epollAddRes = epoll_ctl(_this->epollFD, EPOLL_CTL_ADD, IBVSocket_getRecvCompletionFD(_this), &epollEvent); if(epollAddRes == -1) { LOG(SOCKLIB, WARNING, "Unable to add sock to epoll set.", sysErr); close(_this->epollFD); _this->epollFD = -1; return false; } if(_this->cm_channel) { epollEvent.events = EPOLLIN; epollEvent.data.fd = _this->cm_channel->fd; int epollAddRes = epoll_ctl(_this->epollFD, EPOLL_CTL_ADD, _this->cm_channel->fd, &epollEvent); if(epollAddRes == -1) { LOG(SOCKLIB, WARNING, "Unable to add sock to epoll set.", sysErr); close(_this->epollFD); _this->epollFD = -1; return false; } } return true; } /** * @return pointer to static buffer with human readable string for a wc status code */ const char* __IBVSocket_wcStatusStr(int wcStatusCode) { switch(wcStatusCode) { case IBV_WC_WR_FLUSH_ERR: return "work request flush error"; case IBV_WC_RETRY_EXC_ERR: return "retries exceeded error"; case IBV_WC_RESP_TIMEOUT_ERR: return "response timeout error"; default: return ""; } } bool IBVSocket_getSockValid(IBVSocket* _this) { return _this->sockValid; } int IBVSocket_getRecvCompletionFD(IBVSocket* _this) { IBVCommContext* commContext = _this->commContext; return commContext ? commContext->recvCompChannel->fd : (-1); } int IBVSocket_getConnManagerFD(IBVSocket* _this) { return _this->cm_channel ? _this->cm_channel->fd : (-1); } void IBVSocket_setTypeOfService(IBVSocket* _this, uint8_t typeOfService) { _this->typeOfService = typeOfService; } void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS, int flowSendMS, int pollMS) { _this->timeoutCfg.connectMS = connectMS > 0 ? connectMS : IBVSOCKET_CONN_TIMEOUT_MS; _this->timeoutCfg.flowSendMS = flowSendMS > 0? flowSendMS : IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS; _this->timeoutCfg.pollMS = pollMS > 0? pollMS : IBVSOCKET_POLL_TIMEOUT_MS; LOG(SOCKLIB, DEBUG, "timeouts", ("connectMS", _this->timeoutCfg.connectMS), ("flowSendMS", _this->timeoutCfg.flowSendMS), ("pollMS", _this->timeoutCfg.pollMS)); } void IBVSocket_setConnectionRejectionRate(IBVSocket* _this, unsigned rate) { _this->connectionRejectionRate = rate; } bool IBVSocket_connectionRejection(IBVSocket* _this) { if(_this->connectionRejectionRate) { ++_this->connectionRejectionCount; if((_this->connectionRejectionCount % _this->connectionRejectionRate) != 0) { LOG(SOCKLIB, WARNING, "dropping connection for testing.", _this->connectionRejectionCount, _this->connectionRejectionRate); return true; } } return false; }
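/*
 * Minimal client-side usage sketch (illustrative only: host name, port, buffer geometry and the
 * request/reply buffers are placeholders, and in BeeGFS these calls are normally driven through
 * the higher-level RDMASocket wrapper rather than used directly):
 *
 *   IBVCommConfig cfg;
 *   cfg.bufNum = 12;
 *   cfg.bufSize = 8192;
 *
 *   IBVSocket* sock = IBVSocket_construct();
 *   IBVSocket_setTimeouts(sock, 0, 0, 0); // values <= 0 select the built-in defaults
 *
 *   if(IBVSocket_getSockValid(sock) && IBVSocket_connectByName(sock, "storage01", 8003, &cfg) )
 *   {
 *      IBVSocket_send(sock, requestBuf, requestLen, 0);
 *      IBVSocket_recvT(sock, replyBuf, replyBufLen, 0, 5000);
 *      IBVSocket_shutdown(sock);
 *   }
 *
 *   IBVSocket_destruct(sock);
 */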