#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef BEEGFS_NVFS
#include
#endif
#include "Node.h"
#include "NodeConnPool.h"

#define NODECONNPOOL_SHUTDOWN_WAITTIMEMS 100

/**
 * Note: localNicCaps should be initialized before acquiring a stream.
 *
 * @param parentNode the node object to which this conn pool belongs.
 * @param nicList an internal copy will be created.
 */
void NodeConnPool_init(NodeConnPool* this, struct App* app, struct Node* parentNode,
   unsigned short streamPort, NicAddressList* nicList, NicAddressList* localRdmaNicList)
{
   Config* cfg = App_getConfig(app);

   this->app = app;

   Mutex_init(&this->mutex);
   Condition_init(&this->changeCond);
   ConnectionList_init(&this->connList, true);
   ListTk_cloneNicAddressList(nicList, &this->nicList, true);

   if (localRdmaNicList)
      ListTk_cloneNicAddressList(localRdmaNicList, &this->localRdmaNicList, true);
   else
      NicAddressList_init(&this->localRdmaNicList);

   this->establishedConns = 0;
   this->availableConns = 0;
   this->maxConns = Config_getConnMaxInternodeNum(cfg);
   this->fallbackExpirationSecs = Config_getConnFallbackExpirationSecs(cfg);
   this->maxConcurrentAttempts = Config_getConnMaxConcurrentAttempts(cfg);
   sema_init(&this->connSemaphore, this->maxConcurrentAttempts);
   this->parentNode = parentNode;
   this->streamPort = streamPort;

   memset(&this->localNicCaps, 0, sizeof(this->localNicCaps) );
   memset(&this->stats, 0, sizeof(this->stats) );
   memset(&this->errState, 0, sizeof(this->errState) );

   // rdmaNicStatsList should only have elements when parentNode "type" is meta or storage, but
   // parentNode "type" isn't yet known. Initialize elements in the first call to
   // acquireStreamSocketEx per NodeConnPool instance.
   NicAddressStatsList_init(&this->rdmaNicStatsList);
   this->rdmaNicCount = -1;
   this->logConnErrors = true;
   this->enableTCPFallback = Config_getConnTCPFallbackEnabled(cfg);
}

static bool NodeConnPool_loadRdmaNicStatsList(NodeConnPool* this)
{
   NicAddressListIter nicIter;
   NicAddressStatsList statsList;
   NicAddressStatsListIter stIter;
   NicAddressStatsListIter srcIter;
   NicAddress* nic;
   NicAddressStats* st;
   NicAddressStats* cur;
   int rdmaNicCount = 0;
   int nodeType;

   // indicate that this function has been called at least once
   this->rdmaNicCount = 0;

   nodeType = Node_getNodeType(this->parentNode);

   if (nodeType == NODETYPE_Meta || nodeType == NODETYPE_Storage)
   {
      // Construct a temporary list of stats per RDMA NIC. Search for an existing
      // stats entry by interface name and use that if it exists. Remove
      // matching stats from this->rdmaNicStatsList.
      NicAddressStatsList_init(&statsList);
      NicAddressListIter_init(&nicIter, &this->localRdmaNicList);

      for ( ; !NicAddressListIter_end(&nicIter); NicAddressListIter_next(&nicIter) )
      {
         nic = NicAddressListIter_value(&nicIter);
         st = NULL;

         NicAddressStatsListIter_init(&srcIter, &this->rdmaNicStatsList);
         while (!NicAddressStatsListIter_end(&srcIter))
         {
            cur = NicAddressStatsListIter_value(&srcIter);
            if (!strncmp(cur->nic.name, nic->name, sizeof(nic->name)))
            {
               NicAddressStatsListIter_remove(&srcIter);
               st = cur;
               NicAddressStats_setValid(cur, nic);
               break;
            }

            NicAddressStatsListIter_next(&srcIter);
         }

         if (st == NULL)
         {
            st = (NicAddressStats*)os_kmalloc(sizeof(NicAddressStats));
            NicAddressStats_init(st, nic);
         }

         NicAddressStatsList_append(&statsList, st);
         rdmaNicCount++;
      }
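      /* Example: if localRdmaNicList now holds ib0 and ib1 while the previous stats list held
       * entries for ib0 and ib2, the ib0 entry is reused above, a fresh entry is created for
       * ib1, and the stale ib2 entry is kept but invalidated below, because an IBVSocket may
       * still reference it. (Interface names here are only an illustration.) */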
      // Add any remaining stats from this->rdmaNicStatsList. These were not removed
      // during the iteration of RDMA NICs, so they are stats for devices that have disappeared.
      // They have to be kept around because there may be IBVSocket instances that still
      // point to the memory.
      NicAddressStatsListIter_init(&stIter, &this->rdmaNicStatsList);
      while (!NicAddressStatsListIter_end(&stIter))
      {
         st = NicAddressStatsListIter_value(&stIter);
         NicAddressStats_invalidate(st);
         NicAddressStatsList_append(&statsList, st);
         stIter = NicAddressStatsListIter_remove(&stIter);
      }

      NicAddressStatsList_uninit(&this->rdmaNicStatsList);
      this->rdmaNicStatsList = statsList;
      this->rdmaNicCount = rdmaNicCount;
   }

#ifdef BEEGFS_DEBUG
   {
      const char* logContext = "NodeConn (load RDMA NIC stats)";
      Logger* log = App_getLogger(this->app);

      Logger_logFormatted(log, Log_DEBUG, logContext, "%d RDMA NICs", this->rdmaNicCount);

      NicAddressStatsListIter_init(&stIter, &this->rdmaNicStatsList);
      for(; !NicAddressStatsListIter_end(&stIter); NicAddressStatsListIter_next(&stIter))
      {
         char* addr;

         st = NicAddressStatsListIter_value(&stIter);
         addr = SocketTk_ipaddrToStr(st->nic.ipAddr);
         Logger_logFormatted(log, Log_DEBUG, logContext,
            "NIC: %s Addr: %s nicValid=%d established=%d available=%d",
            st->nic.name, addr, st->nicValid, st->established, st->available);
         kfree(addr);
      }
   }
#endif

   return true;
}

NodeConnPool* NodeConnPool_construct(struct App* app, struct Node* parentNode,
   unsigned short streamPort, NicAddressList* nicList, NicAddressList* localRdmaNicList)
{
   NodeConnPool* this = (NodeConnPool*)os_kmalloc(sizeof(*this) );

   NodeConnPool_init(this, app, parentNode, streamPort, nicList, localRdmaNicList);

   return this;
}

void NodeConnPool_uninit(NodeConnPool* this)
{
   Logger* log = App_getLogger(this->app);
   const char* logContext = "NodeConn (uninit)";

   // close open connections
   if(ConnectionList_length(&this->connList) > 0)
   {
      Logger_logFormatted(log, Log_DEBUG, logContext, "Closing %lu connections...",
         ConnectionList_length(&this->connList) );
   }

   // Note: invalidateStreamSocket changes the original connList
   // (so we must invalidate the iterator each time)
   while(ConnectionList_length(&this->connList) )
   {
      ConnectionListIter connIter;
      PooledSocket* socket;

      ConnectionListIter_init(&connIter, &this->connList);
      socket = ConnectionListIter_value(&connIter);

      __NodeConnPool_invalidateSpecificStreamSocket(this, (Socket*)socket);
   }

   // delete nicList elems
   ListTk_kfreeNicAddressListElems(&this->nicList);
   ListTk_kfreeNicAddressListElems(&this->localRdmaNicList);
   ListTk_kfreeNicAddressStatsListElems(&this->rdmaNicStatsList);

   // normal clean-up
   NicAddressList_uninit(&this->nicList);
   NicAddressList_uninit(&this->localRdmaNicList);
   NicAddressStatsList_uninit(&this->rdmaNicStatsList);

   Mutex_uninit(&this->mutex);
}

void NodeConnPool_destruct(NodeConnPool* this)
{
   NodeConnPool_uninit(this);

   kfree(this);
}

static NicAddressStats* NodeConnPool_rdmaNicPriority(NodeConnPool* this,
   DevicePriorityContext* ctx)
{
   NicAddressStats* cur;
   NicAddressStats* min = NULL;
   NicAddressStatsListIter statsIter;
   bool skipped;
   int numa;
   Time now;
#ifdef BEEGFS_NVFS
   int minNvfsPrio = INT_MAX;
   int nvfsPrio;

   numa = cpu_to_node(current->cpu);
#endif

   if (this->rdmaNicCount < 1)
      return NULL;

#ifdef KERNEL_HAS_CPU_IN_THREAD_INFO
   numa = cpu_to_node(task_thread_info(current)->cpu);
#else
   numa = cpu_to_node(current->cpu);
#endif

   Time_init(&now);

   NicAddressStatsListIter_init(&statsIter, &this->rdmaNicStatsList);
   for (; !NicAddressStatsListIter_end(&statsIter); NicAddressStatsListIter_next(&statsIter))
   {
      cur = NicAddressStatsListIter_value(&statsIter);

      // elements with nicValid == false are at the end of the list
      if (!cur->nicValid)
         break;

      skipped = false;
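      /* Selection rule: a NIC is a candidate only while it still has capacity for another
       * connection (NicAddressStats_usable() checks against the per-NIC share passed in
       * ctx->maxConns) and while its last connect error has expired (fallbackExpirationSecs).
       * Among the candidates, NicAddressStats_comparePriority() picks the preferred interface
       * from NUMA locality and the per-NIC usage counters; with NVFS, the NIC's device priority
       * for the requesting GPU is weighed first. */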
      if (!NicAddressStats_usable(cur, ctx->maxConns))
         skipped = true;
      else if (!NicAddressStats_lastErrorExpired(cur, &now, this->fallbackExpirationSecs))
         skipped = true;
      else
      {
#ifndef BEEGFS_NVFS
         if (min == NULL || NicAddressStats_comparePriority(min, cur, numa) > 0)
            min = cur;
#else
         if (ctx->gpuIndex < 0)
         {
            if (min == NULL || NicAddressStats_comparePriority(min, cur, numa) > 0)
               min = cur;
         }
         else
         {
            nvfsPrio = RdmaInfo_nvfsDevicePriority(cur->nic.ibdev, ctx->gpuIndex);
#ifdef BEEGFS_DEBUG
            printk_fhgfs(KERN_INFO, "%s:%d cur=%p minNvfsPrio=%d nvfsPrio=%d gpuIndex=%d\n",
               __func__, __LINE__, cur, minNvfsPrio, nvfsPrio, ctx->gpuIndex);
#endif
            if (min == NULL || minNvfsPrio > nvfsPrio)
            {
               minNvfsPrio = nvfsPrio;
               min = cur;
            }
            else if (minNvfsPrio == nvfsPrio &&
               NicAddressStats_comparePriority(min, cur, numa) > 0)
               min = cur;
         }
#endif
      }

#ifdef BEEGFS_DEBUG
      printk_fhgfs(KERN_INFO,
         "%s:%d: min=%p cur=%p skipped=%d cur.avail=%d cur.established=%d cur.used=%lld\n",
         __func__, __LINE__, min, cur, skipped, cur->available, cur->established,
         Time_toNS(&cur->used));
#endif
   }

   if (min)
      NicAddressStats_updateUsed(min);

#ifdef BEEGFS_DEBUG
   printk_fhgfs(KERN_INFO, "%s:%d: return %p\n", __func__, __LINE__, min);
#endif

   return min;
}

/**
 * Note: Will block if no stream socket is immediately available.
 *
 * @return connected socket; NULL on error or pending signal
 */
Socket* NodeConnPool_acquireStreamSocket(NodeConnPool* this)
{
   return NodeConnPool_acquireStreamSocketEx(this, true, NULL);
}

/**
 * Note: this initially checks for pending signals and returns NULL immediately if a signal is
 * pending.
 *
 * @param allowWaiting true to allow waiting if all available conns are currently in use; if
 * waiting is not allowed, there is no difference between all conns in use and connection error
 * (so it is expected that the caller will sooner or later allow waiting to detect conn errors).
 * @return connected socket; NULL on error or pending signal
 */
Socket* NodeConnPool_acquireStreamSocketEx(NodeConnPool* this, bool allowWaiting,
   DevicePriorityContext* devPrioCtx)
{
   const char* logContext = "NodeConn (acquire stream)";
   Logger* log = App_getLogger(this->app);
   NetFilter* netFilter = App_getNetFilter(this->app);
   NetFilter* tcpOnlyFilter = App_getTcpOnlyFilter(this->app);
   Socket* sock = NULL;
   unsigned short port;
   NicAddressList nicListCopy;
   NicAddressListIter nicIter;
   bool isPrimaryInterface = true; // used to set expiration for non-primary interfaces;
      // "primary" means: first interface in the list that is supported by client and server
   NicAddressStats* srcRdma = NULL;
   NodeType nodeType;
   bool rdmaConnectInterrupted = false;
   bool includeTcp = true;

   if (unlikely(fatal_signal_pending(current)))
      return NULL; // no need to try if the process will terminate soon anyway

   Mutex_lock(&this->mutex); // L O C K

   if (unlikely(this->rdmaNicCount < 0))
      NodeConnPool_loadRdmaNicStatsList(this);

   if(!this->availableConns && (this->establishedConns == this->maxConns) )
   {
      // wait for a conn to become available (or disconnected)

      if(!allowWaiting)
      { // all conns in use and waiting not allowed => exit
         Mutex_unlock(&this->mutex); // U N L O C K
         return NULL;
      }

      while(!this->availableConns && (this->establishedConns == this->maxConns) )
      {
         // sleep interruptible
         cond_wait_res_t waitRes = Condition_waitKillable(&this->changeCond, &this->mutex);
         if(unlikely(waitRes == COND_WAIT_SIGNAL) )
         { // interrupted by signal
            Mutex_unlock(&this->mutex); // U N L O C K
            return NULL;
         }
      }
   }

   if (this->rdmaNicCount > 0)
   {
      DevicePriorityContext* dpc = devPrioCtx;
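      /* The per-NIC budget computed below spreads maxConns evenly across the local RDMA NICs,
       * e.g. with connMaxInternodeNum=12 and 3 RDMA NICs each NIC is limited to 12/3 = 4
       * connections (integer division; the numbers are only an illustration). */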
      // metaDevPrioCtx is hacky. When the node is meta and the client is doing
      // multi-rail for RDMA, this structure is used for the call to
      // NodeConnPool_rdmaNicPriority(). This allows use of multiple RDMA
      // interfaces to communicate with meta. When the node is storage, devPrioCtx
      // will be used if not null. In that case, devPrioCtx has been initialized with
      // information from NVFS detection that happens before the stream is
      // acquired.
      DevicePriorityContext metaDevPrioCtx =
      {
         .maxConns = 0,
#ifdef BEEGFS_NVFS
         .gpuIndex = -1,
#endif
      };

      if (!dpc && Node_getNodeType(this->parentNode) == NODETYPE_Meta)
         dpc = &metaDevPrioCtx;

      if (dpc)
      {
         dpc->maxConns = this->maxConns / this->rdmaNicCount;
         srcRdma = NodeConnPool_rdmaNicPriority(this, dpc);
#ifdef BEEGFS_DEBUG
         Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
            "Preferred IP addr is 0x%x", srcRdma == NULL? 0 : srcRdma->nic.ipAddr.s_addr);
#endif
      }
   }

   if(likely(this->availableConns) )
   {
      // established connection available => grab it
      PooledSocket* pooledSock;
      ConnectionListIter connIter;

      ConnectionListIter_init(&connIter, &this->connList);
      for(; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter))
      {
         pooledSock = ConnectionListIter_value(&connIter);
         if (PooledSocket_isAvailable(pooledSock) &&
             (srcRdma == NULL ||
              (Socket_getSockType((Socket*) pooledSock) == NICADDRTYPE_RDMA &&
               IBVSocket_getNicStats(&((RDMASocket*) pooledSock)->ibvsock) == srcRdma)))
            break;

         pooledSock = NULL;
      }

      if (pooledSock != NULL)
      {
         PooledSocket_setAvailable(pooledSock, false);
         PooledSocket_setHasActivity(pooledSock);
         // this invalidates the iterator but this code returns before the next iteration
         ConnectionList_moveToTail(&this->connList, pooledSock);
         this->availableConns--;
         if (srcRdma)
            srcRdma->available--;

         Mutex_unlock(&this->mutex); // U N L O C K
         return (Socket*)pooledSock;
      }
   }

   // no conn available, but maxConns not reached yet => establish a new conn

   port = this->streamPort;
   nodeType = Node_getNodeType(this->parentNode);

   if (!this->enableTCPFallback && (nodeType == NODETYPE_Meta || nodeType == NODETYPE_Storage))
      includeTcp = false;

   ListTk_cloneNicAddressList(&this->nicList, &nicListCopy, includeTcp);

   this->establishedConns++;
   if (srcRdma)
      srcRdma->established++;

   Mutex_unlock(&this->mutex); // U N L O C K

   if (this->maxConcurrentAttempts > 0)
   {
      if (down_killable(&this->connSemaphore))
         return NULL;
   }

   // walk over all available NICs, create the corresponding socket and try to connect
   NicAddressListIter_init(&nicIter, &nicListCopy);
   for( ; !NicAddressListIter_end(&nicIter); NicAddressListIter_next(&nicIter) )
   {
      NicAddress* nicAddr = NicAddressListIter_value(&nicIter);
      const char* nodeTypeStr = Node_getNodeTypeStr(this->parentNode);
      char* endpointStr;
      bool connectRes;

      if(!NetFilter_isAllowed(netFilter, nicAddr->ipAddr) )
         continue;

      if( (nicAddr->nicType != NICADDRTYPE_STANDARD) &&
          NetFilter_isContained(tcpOnlyFilter, nicAddr->ipAddr) )
         continue;

      endpointStr = SocketTk_endpointAddrToStr(nicAddr->ipAddr, port);

      switch(nicAddr->nicType)
      {
         case NICADDRTYPE_RDMA:
         { // RDMA
            char from[NICADDRESS_IP_STR_LEN];
            struct in_addr srcAddr;

            if(!this->localNicCaps.supportsRDMA)
               goto continue_clean_endpointStr;

            if (srcRdma)
            {
               srcAddr = srcRdma->nic.ipAddr;
               NicAddress_ipToStr(srcAddr, from);
            }
            else
            {
               srcAddr.s_addr = 0;
               strncpy(from, "any", sizeof(from));
            }

            Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
               "Establishing new RDMA connection from %s to: %s@%s", from, nodeTypeStr,
               endpointStr);
            sock = (Socket*)RDMASocket_construct(srcAddr, srcRdma);
         } break;
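         /* Note: when srcRdma is set, the new RDMA connection is created with that specific
          * local RDMA interface (srcAddr) as its source, which is how connections get spread
          * across multiple NICs for multi-rail; srcAddr 0 / "any" leaves the choice of local
          * interface to the RDMA stack. */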
         case NICADDRTYPE_STANDARD:
         { // TCP
            Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
               "Establishing new TCP connection to: %s@%s", nodeTypeStr, endpointStr);
            sock = (Socket*)StandardSocket_constructTCP();
         } break;

         default:
         { // unknown
            if(this->logConnErrors)
               Logger_logTopFormatted(log, LogTopic_CONN, Log_WARNING, logContext,
                  "Skipping unknown connection type to: %s@%s", nodeTypeStr, endpointStr);

            goto continue_clean_endpointStr;
         }
      } // end of switch

      // check whether the socket was successfully initialized
      if(!sock)
      { // socket initialization failed
         if(this->logConnErrors)
            Logger_logFormatted(log, Log_WARNING, logContext,
               "Socket initialization failed: %s@%s", nodeTypeStr, endpointStr);

         goto continue_clean_endpointStr;
      }

      // the actual connection attempt
      __NodeConnPool_applySocketOptionsPreConnect(this, sock);

      connectRes = sock->ops->connectByIP(sock, nicAddr->ipAddr, port);

      if(connectRes)
      { // connected
         struct in_addr peerIP = Socket_getPeerIP(sock);
         NicAddrType_t sockType = Socket_getSockType(sock);

         if(__NodeConnPool_shouldPrintConnectedLogMsg(this, peerIP, sockType) )
         {
            const char* protocolStr = NIC_nicTypeToString(sockType);
            const char* fallbackStr = isPrimaryInterface ? "" : "; fallback route";

            Logger_logTopFormatted(log, LogTopic_CONN, Log_NOTICE, logContext,
               "Connected: %s@%s (protocol: %s%s)", nodeTypeStr, endpointStr, protocolStr,
               fallbackStr);
         }

         __NodeConnPool_applySocketOptionsConnected(this, (Socket*)sock);

         if(!isPrimaryInterface) // non-primary => set expiration counter
         {
            PooledSocket* psock = (PooledSocket*) sock;

            PooledSocket_setExpireTimeStart(psock);

            /* connection establishment can be interrupted for rdma, not for tcp. if we were
             * interrupted during rdma setup and connected via tcp instead, we should close that
             * tcp connection as soon as possible. */
            psock->closeOnRelease = signal_pending(current);
         }

         kfree(endpointStr);
         break;
      }
      else
      { // not connected
         if(this->logConnErrors)
         {
            struct in_addr peerIP = Socket_getPeerIP(sock);
            NicAddrType_t sockType = Socket_getSockType(sock);

            if(__NodeConnPool_shouldPrintConnectFailedLogMsg(this, peerIP, sockType) )
            {
               Logger_logTopFormatted(log, LogTopic_CONN, Log_NOTICE, logContext,
                  "Connect failed: %s@%s (protocol: %s)", nodeTypeStr, endpointStr,
                  NIC_nicTypeToString(sockType) );
            }
         }

         // the next interface is definitely non-primary
         isPrimaryInterface = false;

         if (nicAddr->nicType == NICADDRTYPE_RDMA && fatal_signal_pending(current))
            rdmaConnectInterrupted = true;

         Socket_virtualDestruct(sock);
      }

   continue_clean_endpointStr:
      kfree(endpointStr);

      if (fatal_signal_pending(current))
         break;
   }

   if (this->maxConcurrentAttempts > 0)
   {
      up(&this->connSemaphore);
   }

   Mutex_lock(&this->mutex); // L O C K

   if(!NicAddressListIter_end(&nicIter) && !fatal_signal_pending(current))
   { // success => add to list (as unavailable) and update stats
      PooledSocket* pooledSock = (PooledSocket*)sock;

      if (srcRdma && Socket_getSockType(sock) != NICADDRTYPE_RDMA)
      {
         srcRdma->established--;
         if (!rdmaConnectInterrupted)
            NicAddressStats_updateLastError(srcRdma);
      }

      ConnectionList_append(&this->connList, pooledSock);
      __NodeConnPool_statsAddNic(this, PooledSocket_getNicType(pooledSock) );
      __NodeConnPool_setConnSuccess(this, Socket_getPeerIP(sock), Socket_getSockType(sock) );
   }
   else
   { // absolutely unable to connect
      sock = NULL;
      this->establishedConns--;

      if (srcRdma)
      {
         srcRdma->established--;
         if (!rdmaConnectInterrupted)
            NicAddressStats_updateLastError(srcRdma);
      }

      // connect may be interrupted by signals.
      // return no connection without setting up alternate routes or other error handling in
      // that case.
      if (!fatal_signal_pending(current))
      {
         if(this->logConnErrors && !__NodeConnPool_getWasLastTimeCompleteFail(this) )
         {
            NodeString nodeAndType;
            Node_copyAliasWithTypeStr(this->parentNode, &nodeAndType);

            Logger_logTopFormatted(log, LogTopic_CONN, Log_CRITICAL, logContext,
               "Connect failed on all available routes: %s", nodeAndType.buf);
         }

         __NodeConnPool_setCompleteFail(this);
      }

      // we are not using this connection => notify next waiter
      Condition_signal(&this->changeCond);
   }

   ListTk_kfreeNicAddressListElems(&nicListCopy);
   NicAddressList_uninit(&nicListCopy);

   Mutex_unlock(&this->mutex); // U N L O C K

   return sock;
}

void NodeConnPool_releaseStreamSocket(NodeConnPool* this, Socket* sock)
{
   PooledSocket* pooledSock = (PooledSocket*)sock;

   // test whether this socket has expired
   if(unlikely(PooledSocket_getHasExpired(pooledSock, this->fallbackExpirationSecs) ||
         pooledSock->closeOnRelease))
   { // this socket just expired => invalidate it
      __NodeConnPool_invalidateSpecificStreamSocket(this, sock);
      return;
   }

   // mark the socket as available
   Mutex_lock(&this->mutex); // L O C K

   if (unlikely(PooledSocket_getPool(pooledSock) != &this->connList))
   {
      printk_fhgfs(KERN_ERR, "%s:%d: socket %p not in pool %p\n", __func__, __LINE__,
         pooledSock, this);
      goto exit;
   }

   this->availableConns++;

   PooledSocket_setAvailable(pooledSock, true);
   ConnectionList_moveToHead(&this->connList, pooledSock);

   if (Socket_getSockType((Socket*) pooledSock) == NICADDRTYPE_RDMA)
   {
      NicAddressStats* st = IBVSocket_getNicStats(&((RDMASocket*) pooledSock)->ibvsock);
      if (st)
         st->available++;
   }

   Condition_signal(&this->changeCond);

exit:
   Mutex_unlock(&this->mutex); // U N L O C K
}

void NodeConnPool_invalidateStreamSocket(NodeConnPool* this, Socket* sock)
{
   Logger* log = App_getLogger(this->app);
   const char* logContext = "NodeConn (invalidate stream)";
   unsigned numInvalidated;

   /* note: we use invalidateAvailableStreams first here, because we want to keep other threads
      from trying to acquire an available socket immediately - and the parameter socket isn't
      available anyway. (we don't want to acquire a mutex here that would block the whole pool
      while we're just waiting for the sock-close response messages) */

   numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, false);

   Logger_logTopFormatted(log, LogTopic_CONN, 4, logContext,
      "Invalidated %u pooled connections. (1 more invalidation pending...)", numInvalidated);
   __NodeConnPool_invalidateSpecificStreamSocket(this, sock);
}

void __NodeConnPool_invalidateSpecificStreamSocket(NodeConnPool* this, Socket* sock)
{
   const char* logContext = "NodeConn (invalidate stream)";
   Logger* log = App_getLogger(this->app);
   bool sockValid = true;
   PooledSocket* pooledSock = (PooledSocket*)sock;
   bool shutdownRes;

   Mutex_lock(&this->mutex); // L O C K

   if (unlikely(PooledSocket_getPool(pooledSock) != &this->connList))
   { // socket not in the list
      Logger_logErrFormatted(log, logContext,
         "Tried to remove a socket that was not found in the pool: %s",
         Socket_getPeername(sock) );
      sockValid = false;
   }
   else
   {
      this->establishedConns--;

      if (Socket_getSockType((Socket*) sock) == NICADDRTYPE_RDMA)
      {
         NicAddressStats* st = IBVSocket_getNicStats(&((RDMASocket*) sock)->ibvsock);
         if (st)
            st->established--;
      }

      ConnectionList_remove(&this->connList, pooledSock);
      __NodeConnPool_statsRemoveNic(this, PooledSocket_getNicType(pooledSock) );

      Condition_signal(&this->changeCond);
   }

   Mutex_unlock(&this->mutex); // U N L O C K

   if(!sockValid)
      return;

   shutdownRes = sock->ops->shutdownAndRecvDisconnect(sock, NODECONNPOOL_SHUTDOWN_WAITTIMEMS);
   if(shutdownRes)
   {
      Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext, "Disconnected: %s@%s",
         Node_getNodeTypeStr(this->parentNode), Socket_getPeername(sock) );
   }
   else
   {
      Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext, "Hard disconnect: %s@%s",
         Node_getNodeTypeStr(this->parentNode), Socket_getPeername(sock) );
   }

   Socket_virtualDestruct(sock);
}

/**
 * Invalidate (disconnect) connections that are currently not acquired.
 *
 * @param idleStreamsOnly invalidate only conns that are marked as idle (ie don't have the activity
 * flag set).
 * @param closeOnRelease if true set every PooledSocket's closeOnRelease flag
 * @return number of invalidated streams
 */
unsigned __NodeConnPool_invalidateAvailableStreams(NodeConnPool* this, bool idleStreamsOnly,
   bool closeOnRelease)
{
   /* note: we have TWO STAGES here, because we don't want to hold the mutex and block everything
      while we're waiting for the conns to be dropped. */
   unsigned numInvalidated = 0; // retVal
   ConnectionListIter connIter;
   ConnectionListIter availableIter;
   ConnectionList availableConnsList;

   ConnectionList_init(&availableConnsList, false);

   Mutex_lock(&this->mutex); // L O C K

   // STAGE 1: grab all sockets that should be disconnected
   ConnectionListIter_init(&connIter, &this->connList);
   for( ; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter) )
   {
      PooledSocket* sock = ConnectionListIter_value(&connIter);

      if(closeOnRelease)
         sock->closeOnRelease = true;

      if(!PooledSocket_isAvailable(sock) )
         continue; // this one is currently in use

      if(idleStreamsOnly && PooledSocket_getHasActivity(sock) )
         continue; // idle-only requested and this one was not idle

      // don't worry about reordering the connList here, the socket
      // will be removed in stage 2
      PooledSocket_setAvailable(sock, false);
      this->availableConns--;

      if (Socket_getSockType((Socket*)sock) == NICADDRTYPE_RDMA)
      {
         NicAddressStats* st = IBVSocket_getNicStats(&((RDMASocket*) sock)->ibvsock);
         if (st)
            st->available--;
      }

      ConnectionList_append(&availableConnsList, sock);

      numInvalidated++;
   }

   Mutex_unlock(&this->mutex); // U N L O C K

   // STAGE 2: invalidate all grabbed sockets
   ConnectionListIter_init(&availableIter, &availableConnsList);
   for( ; !ConnectionListIter_end(&availableIter); ConnectionListIter_next(&availableIter) )
   {
      PooledSocket* sock = ConnectionListIter_value(&availableIter);

      __NodeConnPool_invalidateSpecificStreamSocket(this, (Socket*)sock);
   }

   ConnectionList_uninit(&availableConnsList);

   return numInvalidated;
}

/**
 * Disconnect all established (available) connections.
 *
 * @return number of disconnected streams
 */
unsigned NodeConnPool_disconnectAvailableStreams(NodeConnPool* this)
{
   unsigned numInvalidated;

   numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, false);

   return numInvalidated;
}

/**
 * Note: There is no locking around dropping and resetting afterwards, so there should only be one
 * thread which calls this function (=> InternodeSyncer).
 *
 * @return number of disconnected streams
 */
unsigned NodeConnPool_disconnectAndResetIdleStreams(NodeConnPool* this)
{
   unsigned numInvalidated;

   numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, true, false);

   __NodeConnPool_resetStreamsIdleFlag(this);

   return numInvalidated;
}
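/*
 * Illustrative note (caller-side, not part of this file): because the reset pass below clears the
 * activity flags after each run, a pooled stream is only dropped if it has seen no activity since
 * the previous call, i.e. a periodic caller gets an idle timeout of roughly one call interval:
 *
 *    // hypothetical periodic maintenance loop (names are placeholders, not BeeGFS API)
 *    while (keepRunning)
 *    {
 *       waitSeconds(idleDisconnectIntervalSecs);
 *       NodeConnPool_disconnectAndResetIdleStreams(connPool);
 *    }
 */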
/**
 * Resets the activity flag of all available connections to mark them as idle.
 */
void __NodeConnPool_resetStreamsIdleFlag(NodeConnPool* this)
{
   ConnectionListIter connIter;

   Mutex_lock(&this->mutex); // L O C K

   ConnectionListIter_init(&connIter, &this->connList);
   for( ; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter) )
   {
      PooledSocket* sock = ConnectionListIter_value(&connIter);

      if(!PooledSocket_isAvailable(sock) )
         continue; // this one is currently in use

      PooledSocket_resetHasActivity(sock);
   }

   Mutex_unlock(&this->mutex); // U N L O C K
}

bool __NodeConnPool_applySocketOptionsPreConnect(NodeConnPool* this, Socket* sock)
{
   Config* cfg = App_getConfig(this->app);
   NicAddrType_t sockType = Socket_getSockType(sock);

   if(sockType == NICADDRTYPE_STANDARD)
   {
      StandardSocket* stdSock = (StandardSocket*)sock;
      int bufSize = Config_getConnTCPRcvBufSize(cfg);

      if (bufSize > 0)
         StandardSocket_setSoRcvBuf(stdSock, bufSize);
   }
   else if(sockType == NICADDRTYPE_RDMA)
   {
      RDMASocket* rdmaSock = (RDMASocket*)sock;

      if (Node_getNodeType(this->parentNode) == NODETYPE_Meta)
         RDMASocket_setBuffers(rdmaSock, Config_getConnRDMAMetaBufNum(cfg),
            Config_getConnRDMAMetaBufSize(cfg), Config_getConnRDMAMetaFragmentSize(cfg),
            Config_getConnRDMAKeyTypeNum(cfg));
      else
         RDMASocket_setBuffers(rdmaSock, Config_getConnRDMABufNum(cfg),
            Config_getConnRDMABufSize(cfg), Config_getConnRDMAFragmentSize(cfg),
            Config_getConnRDMAKeyTypeNum(cfg));

      RDMASocket_setTimeouts(rdmaSock, Config_getConnRDMATimeoutConnect(cfg),
         Config_getConnRDMATimeoutCompletion(cfg), Config_getConnRDMATimeoutFlowSend(cfg),
         Config_getConnRDMATimeoutFlowRecv(cfg), Config_getConnRDMATimeoutPoll(cfg));
      RDMASocket_setTypeOfService(rdmaSock, Config_getConnRDMATypeOfService(cfg));
      RDMASocket_setConnectionFailureStatus(rdmaSock, Config_getRemapConnectionFailureStatus(cfg));
   }

   return true;
}
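/*
 * The two __NodeConnPool_applySocketOptions* helpers split socket setup around the connect call:
 * options that have to be in place before the connection is established (TCP receive buffer size,
 * RDMA buffer counts/sizes, timeouts, type of service) are applied in the *PreConnect variant
 * above, while TCP tweaks (cork/nodelay/keepalive) and the channel handshake (authentication,
 * switching the channel to indirect mode, sending local peer info) happen in the *Connected
 * variant below, once a stream is actually established.
 */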
/**
 * Apply socket tweaks and send channel control messages.
 */
bool __NodeConnPool_applySocketOptionsConnected(NodeConnPool* this, Socket* sock)
{
   const char* logContext = "NodeConn (apply socket options)";
   Logger* log = App_getLogger(this->app);
   Config* cfg = App_getConfig(this->app);
   uint64_t authHash = Config_getConnAuthHash(cfg);
   bool allSuccessful = true;
   NicAddrType_t sockType = Socket_getSockType(sock);

   // apply general socket options
   if(sockType == NICADDRTYPE_STANDARD)
   {
      StandardSocket* standardSock = (StandardSocket*)sock;
      bool corkRes;
      bool noDelayRes;
      bool keepAliveRes;

      corkRes = StandardSocket_setTcpCork(standardSock, false);
      if(!corkRes)
      {
         Logger_log(log, Log_NOTICE, logContext, "Failed to disable TcpCork");
         allSuccessful = false;
      }

      noDelayRes = StandardSocket_setTcpNoDelay(standardSock, true);
      if(!noDelayRes)
      {
         Logger_log(log, Log_NOTICE, logContext, "Failed to enable TcpNoDelay");
         allSuccessful = false;
      }

      keepAliveRes = StandardSocket_setSoKeepAlive(standardSock, true);
      if(!keepAliveRes)
      {
         Logger_log(log, Log_NOTICE, logContext, "Failed to enable SoKeepAlive");
         allSuccessful = false;
      }
   }

   // send control messages

   if(authHash)
   { // authenticate channel
      char* sendBuf;
      size_t sendBufLen;
      AuthenticateChannelMsg authMsg;
      size_t sendRes;

      AuthenticateChannelMsg_initFromValue(&authMsg, authHash);
      sendBufLen = NetMessage_getMsgLength( (NetMessage*)&authMsg);
      sendBuf = (char*)os_kmalloc(sendBufLen);

      NetMessage_serialize( (NetMessage*)&authMsg, sendBuf, sendBufLen);

      sendRes = Socket_send_kernel(sock, sendBuf, sendBufLen, 0);
      if(sendRes <= 0)
      {
         Logger_log(log, Log_WARNING, logContext, "Failed to send authentication");
         allSuccessful = false;
      }

      kfree(sendBuf);
   }

   { // make channel indirect
      char* sendBuf;
      size_t sendBufLen;
      SetChannelDirectMsg directMsg;
      size_t sendRes;

      SetChannelDirectMsg_initFromValue(&directMsg, 0);
      sendBufLen = NetMessage_getMsgLength( (NetMessage*)&directMsg);
      sendBuf = (char*)os_kmalloc(sendBufLen);

      NetMessage_serialize( (NetMessage*)&directMsg, sendBuf, sendBufLen);

      sendRes = Socket_send_kernel(sock, sendBuf, sendBufLen, 0);
      if(sendRes <= 0)
      {
         Logger_log(log, Log_WARNING, logContext, "Failed to set channel to indirect mode");
         allSuccessful = false;
      }

      kfree(sendBuf);
   }

   { // send local peer info
      char* sendBuf;
      size_t sendBufLen;
      PeerInfoMsg peerInfo;
      size_t sendRes = -1;

      PeerInfoMsg_init(&peerInfo, NODETYPE_Client, this->app->localNode->numID);
      sendBufLen = NetMessage_getMsgLength(&peerInfo.netMessage);
      sendBuf = kmalloc(sendBufLen, GFP_KERNEL);
      if (sendBuf)
      {
         NetMessage_serialize(&peerInfo.netMessage, sendBuf, sendBufLen);
         sendRes = Socket_send_kernel(sock, sendBuf, sendBufLen, 0);
      }

      if(sendRes <= 0)
      {
         Logger_log(log, Log_WARNING, logContext, "Failed to transmit local node info");
         allSuccessful = false;
      }

      kfree(sendBuf);
   }

   return allSuccessful;
}

void NodeConnPool_getStats(NodeConnPool* this, NodeConnPoolStats* outStats)
{
   Mutex_lock(&this->mutex); // L O C K

   *outStats = this->stats;

   Mutex_unlock(&this->mutex); // U N L O C K
}
/**
 * Gets the first socket peer name for a connection to the specified nicType.
 * Only works on currently available connections.
 *
 * @param outBuf buffer for the peer name string (contains "busy" if no matching connection is
 * currently available).
 * @param outIsNonPrimary set to true if conn in outBuf has an expiration counter, false otherwise
 * @return number of chars written to buf
 */
unsigned NodeConnPool_getFirstPeerName(NodeConnPool* this, NicAddrType_t nicType,
   ssize_t outBufLen, char* outBuf, bool* outIsNonPrimary)
{
   unsigned numWritten = 0; // return value
   ConnectionListIter connIter;
   bool foundMatch = false;

   Mutex_lock(&this->mutex); // L O C K

   ConnectionListIter_init(&connIter, &this->connList);
   for( ; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter) )
   {
      PooledSocket* pooledSock = ConnectionListIter_value(&connIter);
      Socket* sock = (Socket*)pooledSock;

      if(PooledSocket_isAvailable(pooledSock) && (Socket_getSockType(sock) == nicType) )
      { // found a match => print to buf and stop
         numWritten += scnprintf(outBuf, outBufLen, "%s", Socket_getPeername(sock) );
         *outIsNonPrimary = PooledSocket_getHasExpirationTimer(pooledSock);
         foundMatch = true;
         break;
      }
   }

   if(!foundMatch)
   { // no matching available connection => print "busy"
      numWritten += scnprintf(outBuf, outBufLen, "busy");
      *outIsNonPrimary = false;
   }

   Mutex_unlock(&this->mutex); // U N L O C K

   return numWritten;
}

/**
 * Increase stats counter for number of established conns (by NIC type).
 */
void __NodeConnPool_statsAddNic(NodeConnPool* this, NicAddrType_t nicType)
{
   switch(nicType)
   {
      case NICADDRTYPE_RDMA:
      {
         (this->stats.numEstablishedRDMA)++;
      } break;

      default:
      {
         (this->stats.numEstablishedStd)++;
      } break;
   }
}

/**
 * Decrease stats counter for number of established conns (by NIC type).
 */
void __NodeConnPool_statsRemoveNic(NodeConnPool* this, NicAddrType_t nicType)
{
   switch(nicType)
   {
      case NICADDRTYPE_RDMA:
      {
         (this->stats.numEstablishedRDMA)--;
      } break;

      default:
      {
         (this->stats.numEstablishedStd)--;
      } break;
   }
}

/**
 * @param streamPort value 0 will be ignored
 * @param nicList will be copied
 * @return true if the port or the interface list has changed
 */
bool NodeConnPool_updateInterfaces(NodeConnPool* this, unsigned short streamPort,
   NicAddressList* nicList)
{
   Logger* log = App_getLogger(this->app);
   const char* logContext = "NodeConn (update stream port)";
   bool hasChanged = false; // retVal
   NodeString alias;

   Mutex_lock(&this->mutex); // L O C K

   Node_copyAlias(this->parentNode, &alias);

   if(streamPort && (streamPort != this->streamPort) )
   {
      this->streamPort = streamPort;
      hasChanged = true;
      Logger_logFormatted(log, Log_NOTICE, logContext, "Node %s port has changed", alias.buf);
   }

   if (!NicAddressList_equals(&this->nicList, nicList))
   {
      // update nicList (if allocation of new list fails, we just keep the old list)
      NicAddressList newNicList;

      hasChanged = true;
      Logger_logFormatted(log, Log_NOTICE, logContext, "Node %s interfaces have changed",
         alias.buf);

      ListTk_cloneNicAddressList(nicList, &newNicList, true);
      ListTk_kfreeNicAddressListElems(&this->nicList);
      NicAddressList_uninit(&this->nicList);
      this->nicList = newNicList;
   }

   Mutex_unlock(&this->mutex); // U N L O C K

   if (unlikely(hasChanged))
   {
      // closeOnRelease is true: all of these sockets need to be invalidated ASAP
      unsigned numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, true);

      if(numInvalidated)
      {
         Logger_logFormatted(log, Log_DEBUG, logContext,
            "Invalidated %u pooled connections (due to port/interface change)", numInvalidated);
      }
   }

   return hasChanged;
}

void __NodeConnPool_setCompleteFail(NodeConnPool* this)
{
   this->errState.lastSuccessPeerIP.s_addr = 0;
   this->errState.wasLastTimeCompleteFail = true;
}

void __NodeConnPool_setConnSuccess(NodeConnPool* this, struct in_addr lastSuccessPeerIP,
   NicAddrType_t lastSuccessNicType)
{
   this->errState.lastSuccessPeerIP = lastSuccessPeerIP;
   this->errState.lastSuccessNicType = lastSuccessNicType;
   this->errState.wasLastTimeCompleteFail = false;
}

bool __NodeConnPool_equalsLastSuccess(NodeConnPool* this, struct in_addr lastSuccessPeerIP,
   NicAddrType_t lastSuccessNicType)
{
   return (this->errState.lastSuccessPeerIP.s_addr == lastSuccessPeerIP.s_addr) &&
      (this->errState.lastSuccessNicType == lastSuccessNicType);
}

bool __NodeConnPool_isLastSuccessInitialized(NodeConnPool* this)
{
   return (this->errState.lastSuccessPeerIP.s_addr != 0);
}

bool __NodeConnPool_shouldPrintConnectedLogMsg(NodeConnPool* this, struct in_addr currentPeerIP,
   NicAddrType_t currentNicType)
{
   /* log only if we didn't succeed at all last time, or if we succeeded last time and it was
      with a different IP/NIC pair */

   return this->errState.wasLastTimeCompleteFail ||
      !__NodeConnPool_isLastSuccessInitialized(this) ||
      !__NodeConnPool_equalsLastSuccess(this, currentPeerIP, currentNicType);
}

bool __NodeConnPool_shouldPrintConnectFailedLogMsg(NodeConnPool* this, struct in_addr currentPeerIP,
   NicAddrType_t currentNicType)
{
   /* log only if this is the first connection attempt or if we succeeded last time with this
      IP/NIC pair */

   return (!this->errState.wasLastTimeCompleteFail &&
      (!__NodeConnPool_isLastSuccessInitialized(this) ||
         __NodeConnPool_equalsLastSuccess(this, currentPeerIP, currentNicType) ) );
}

void NodeConnPool_updateLocalInterfaces(NodeConnPool* this, NicAddressList* localNicList,
   NicListCapabilities* localNicCaps, NicAddressList* localRdmaNicList)
{
   Logger* log = App_getLogger(this->app);
   const char* logContext = "NodeConn (update local NIC list)";
   unsigned numInvalidated;

   Mutex_lock(&this->mutex); // L O C K

   this->localNicCaps = *localNicCaps;

   if (localRdmaNicList)
   {
      ListTk_kfreeNicAddressListElems(&this->localRdmaNicList);
      NicAddressList_uninit(&this->localRdmaNicList);
      ListTk_cloneNicAddressList(localRdmaNicList, &this->localRdmaNicList, true);
      NodeConnPool_loadRdmaNicStatsList(this);
   }

   Mutex_unlock(&this->mutex); // U N L O C K

   numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, true);

   if(numInvalidated)
   {
      Logger_logFormatted(log, Log_DEBUG, logContext,
         "Invalidated %u pooled connections (due to local interface change)", numInvalidated);
   }
}
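
/*
 * Illustrative usage sketch (not part of this file): the typical caller pattern for the pool is
 * acquire -> communicate -> release, with invalidate on communication errors so that a broken
 * stream is torn down instead of being returned to the pool. doRequest() below is a hypothetical
 * placeholder for a request/response exchange.
 *
 *    Socket* sock = NodeConnPool_acquireStreamSocket(connPool);
 *    if (!sock)
 *       return -EINTR; // error or pending signal
 *
 *    if (doRequest(sock) )
 *       NodeConnPool_releaseStreamSocket(connPool, sock);    // healthy => back into the pool
 *    else
 *       NodeConnPool_invalidateStreamSocket(connPool, sock); // broken => disconnect and destroy
 */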