#include <app/config/Config.h>
#include <app/App.h>
#include <common/net/message/control/AuthenticateChannelMsg.h>
#include <common/net/message/control/SetChannelDirectMsg.h>
#include <common/net/message/control/PeerInfoMsg.h>
#include <common/net/sock/RDMASocket.h>
#include <common/net/sock/NicAddressStatsListIter.h>
#include <common/threading/Thread.h>
#include <common/toolkit/ListTk.h>
#include <common/toolkit/NetFilter.h>
#ifdef BEEGFS_NVFS
#include <common/storage/RdmaInfo.h>
#endif
#include "Node.h"
#include "NodeConnPool.h"
#define NODECONNPOOL_SHUTDOWN_WAITTIMEMS 100
/**
* Note: localNicCaps should be initialized before acquiring a stream.
*
* @param parentNode the node object to which this conn pool belongs.
* @param nicList an internal copy will be created.
* @param localRdmaNicList an internal copy will be created; may be NULL.
*/
void NodeConnPool_init(NodeConnPool* this, struct App* app, struct Node* parentNode,
unsigned short streamPort, NicAddressList* nicList, NicAddressList* localRdmaNicList)
{
Config* cfg = App_getConfig(app);
this->app = app;
Mutex_init(&this->mutex);
Condition_init(&this->changeCond);
ConnectionList_init(&this->connList, true);
ListTk_cloneNicAddressList(nicList, &this->nicList, true);
if (localRdmaNicList)
ListTk_cloneNicAddressList(localRdmaNicList, &this->localRdmaNicList, true);
else
NicAddressList_init(&this->localRdmaNicList);
this->establishedConns = 0;
this->availableConns = 0;
this->maxConns = Config_getConnMaxInternodeNum(cfg);
this->fallbackExpirationSecs = Config_getConnFallbackExpirationSecs(cfg);
this->maxConcurrentAttempts = Config_getConnMaxConcurrentAttempts(cfg);
sema_init(&this->connSemaphore, this->maxConcurrentAttempts);
this->parentNode = parentNode;
this->streamPort = streamPort;
memset(&this->localNicCaps, 0, sizeof(this->localNicCaps) );
memset(&this->stats, 0, sizeof(this->stats) );
memset(&this->errState, 0, sizeof(this->errState) );
// rdmaNicStatsList should only have elements when the parentNode type is meta or storage, but
// the node type isn't known yet at this point. The elements are therefore initialized lazily
// in the first call to acquireStreamSocketEx per NodeConnPool instance.
NicAddressStatsList_init(&this->rdmaNicStatsList);
this->rdmaNicCount = -1;
this->logConnErrors = true;
this->enableTCPFallback = Config_getConnTCPFallbackEnabled(cfg);
}
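/**
* (Re)build the per-RDMA-NIC statistics list (this->rdmaNicStatsList) from localRdmaNicList.
*
* Existing stats entries are matched by interface name and reused; entries for devices that have
* disappeared are invalidated but kept in the list, because IBVSocket instances may still point
* to their memory. Stats are only populated when parentNode is a meta or storage node; for other
* node types only rdmaNicCount is set to 0 to mark that this function has been called.
*
* Note: expected to be called with this->mutex held (all current callers do).
*
* @return always true
*/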
static bool NodeConnPool_loadRdmaNicStatsList(NodeConnPool* this)
{
NicAddressListIter nicIter;
NicAddressStatsList statsList;
NicAddressStatsListIter stIter;
NicAddressStatsListIter srcIter;
NicAddress* nic;
NicAddressStats* st;
NicAddressStats* cur;
int rdmaNicCount = 0;
int nodeType;
// indicate that this function has been called at least once
this->rdmaNicCount = 0;
nodeType = Node_getNodeType(this->parentNode);
if (nodeType == NODETYPE_Meta || nodeType == NODETYPE_Storage)
{
// Construct temporary list of stat per RDMA NIC. Search for existing
// stat by interface name and use that if it exists. Remove
// matching stats from this->rdmaNicStatsList.
NicAddressStatsList_init(&statsList);
NicAddressListIter_init(&nicIter, &this->localRdmaNicList);
for ( ; !NicAddressListIter_end(&nicIter); NicAddressListIter_next(&nicIter) )
{
nic = NicAddressListIter_value(&nicIter);
st = NULL;
NicAddressStatsListIter_init(&srcIter, &this->rdmaNicStatsList);
while (!NicAddressStatsListIter_end(&srcIter))
{
cur = NicAddressStatsListIter_value(&srcIter);
if (!strncmp(cur->nic.name, nic->name, sizeof(nic->name)))
{
NicAddressStatsListIter_remove(&srcIter);
st = cur;
NicAddressStats_setValid(cur, nic);
break;
}
NicAddressStatsListIter_next(&srcIter);
}
if (st == NULL)
{
st = (NicAddressStats*)os_kmalloc(sizeof(NicAddressStats));
NicAddressStats_init(st, nic);
}
NicAddressStatsList_append(&statsList, st);
rdmaNicCount++;
}
// Add any remaining stats from this->rdmaNicStatsList. These were not removed
// during the iteration of RDMA NICs, so they are stats for devices that have disappeared.
// They have to be kept around because there may be IBVSocket instances that still
// point to the memory.
NicAddressStatsListIter_init(&stIter, &this->rdmaNicStatsList);
while (!NicAddressStatsListIter_end(&stIter))
{
st = NicAddressStatsListIter_value(&stIter);
NicAddressStats_invalidate(st);
NicAddressStatsList_append(&statsList, st);
stIter = NicAddressStatsListIter_remove(&stIter);
}
NicAddressStatsList_uninit(&this->rdmaNicStatsList);
this->rdmaNicStatsList = statsList;
this->rdmaNicCount = rdmaNicCount;
}
#ifdef BEEGFS_DEBUG
{
const char* logContext = "NodeConn (load RDMA NIC stats)";
Logger* log = App_getLogger(this->app);
Logger_logFormatted(log, Log_DEBUG, logContext, "%d RDMA NICs", this->rdmaNicCount);
NicAddressStatsListIter_init(&stIter, &this->rdmaNicStatsList);
for(; !NicAddressStatsListIter_end(&stIter); NicAddressStatsListIter_next(&stIter))
{
char* addr;
st = NicAddressStatsListIter_value(&stIter);
addr = SocketTk_ipaddrToStr(st->nic.ipAddr);
Logger_logFormatted(log, Log_DEBUG, logContext, "NIC: %s Addr: %s nicValid=%d established=%d available=%d",
st->nic.name, addr, st->nicValid, st->established, st->available);
kfree(addr);
}
}
#endif
return true;
}
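/**
* Heap allocation wrapper for NodeConnPool_init().
*
* @return kmalloc'ed pool; free with NodeConnPool_destruct()
*/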
NodeConnPool* NodeConnPool_construct(struct App* app, struct Node* parentNode,
unsigned short streamPort, NicAddressList* nicList, NicAddressList* localRdmaNicList)
{
NodeConnPool* this = (NodeConnPool*)os_kmalloc(sizeof(*this) );
NodeConnPool_init(this, app, parentNode, streamPort, nicList, localRdmaNicList);
return this;
}
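/**
* Close and free all pooled connections and free the internal NIC and stats lists.
*
* Note: does not free the NodeConnPool structure itself; NodeConnPool_destruct() does that for
* heap-allocated pools.
*/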
void NodeConnPool_uninit(NodeConnPool* this)
{
Logger* log = App_getLogger(this->app);
const char* logContext = "NodeConn (uninit)";
// close open connections
if(ConnectionList_length(&this->connList) > 0)
{
Logger_logFormatted(log, Log_DEBUG, logContext, "Closing %lu connections...",
ConnectionList_length(&this->connList) );
}
// Note: invalidateStreamSocket changes the original connList
// (so we must invalidate the iterator each time)
while(ConnectionList_length(&this->connList) )
{
ConnectionListIter connIter;
PooledSocket* socket;
ConnectionListIter_init(&connIter, &this->connList);
socket = ConnectionListIter_value(&connIter);
__NodeConnPool_invalidateSpecificStreamSocket(this, (Socket*)socket);
}
// delete nicList elems
ListTk_kfreeNicAddressListElems(&this->nicList);
ListTk_kfreeNicAddressListElems(&this->localRdmaNicList);
ListTk_kfreeNicAddressStatsListElems(&this->rdmaNicStatsList);
// normal clean-up
NicAddressList_uninit(&this->nicList);
NicAddressList_uninit(&this->localRdmaNicList);
NicAddressStatsList_uninit(&this->rdmaNicStatsList);
Mutex_uninit(&this->mutex);
}
void NodeConnPool_destruct(NodeConnPool* this)
{
NodeConnPool_uninit(this);
kfree(this);
}
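/**
* Select the local RDMA NIC to which a new outgoing connection should be bound.
*
* NICs that are marked invalid, that are not usable for another connection (ctx->maxConns) or
* whose last connect error has not yet expired are skipped. Among the remaining NICs, the one
* with the best priority according to NicAddressStats_comparePriority() (which takes the
* caller's NUMA node into account) is selected; with NVFS and a valid gpuIndex, the NVFS
* GPU-to-NIC priority takes precedence and comparePriority() only breaks ties. The selected
* NIC's "used" timestamp is updated.
*
* Note: expected to be called with this->mutex held.
*
* @return stats entry of the selected NIC; NULL if no RDMA NICs exist or none is usable
*/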
static NicAddressStats* NodeConnPool_rdmaNicPriority(NodeConnPool* this, DevicePriorityContext* ctx)
{
NicAddressStats* cur;
NicAddressStats* min = NULL;
NicAddressStatsListIter statsIter;
bool skipped;
int numa;
Time now;
#ifdef BEEGFS_NVFS
int minNvfsPrio = INT_MAX;
int nvfsPrio;
#endif
if (this->rdmaNicCount < 1)
return NULL;
#ifdef KERNEL_HAS_CPU_IN_THREAD_INFO
numa = cpu_to_node(task_thread_info(current)->cpu);
#else
numa = cpu_to_node(current->cpu);
#endif
Time_init(&now);
NicAddressStatsListIter_init(&statsIter, &this->rdmaNicStatsList);
for (; !NicAddressStatsListIter_end(&statsIter); NicAddressStatsListIter_next(&statsIter))
{
cur = NicAddressStatsListIter_value(&statsIter);
// elements with nicValid == false are at the end of the list
if (!cur->nicValid)
break;
skipped = false;
if (!NicAddressStats_usable(cur, ctx->maxConns))
skipped = true;
else if (!NicAddressStats_lastErrorExpired(cur, &now, this->fallbackExpirationSecs))
skipped = true;
else
{
#ifndef BEEGFS_NVFS
if (min == NULL || NicAddressStats_comparePriority(min, cur, numa) > 0)
min = cur;
#else
if (ctx->gpuIndex < 0)
{
if (min == NULL || NicAddressStats_comparePriority(min, cur, numa) > 0)
min = cur;
}
else
{
nvfsPrio = RdmaInfo_nvfsDevicePriority(cur->nic.ibdev, ctx->gpuIndex);
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s:%d cur=%p minNvfsPrio=%d nvfsPrio=%d gpuIndex=%d\n",
__func__, __LINE__, cur, minNvfsPrio, nvfsPrio, ctx->gpuIndex);
#endif
if (min == NULL || minNvfsPrio > nvfsPrio)
{
minNvfsPrio = nvfsPrio;
min = cur;
}
else if (minNvfsPrio == nvfsPrio && NicAddressStats_comparePriority(min, cur, numa) > 0)
min = cur;
}
#endif
}
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s:%d: min=%p cur=%p skipped=%d cur.avail=%d cur.established=%d cur.used=%lld\n",
__func__, __LINE__, min, cur, skipped, cur->available, cur->established, Time_toNS(&cur->used));
#endif
}
if (min)
NicAddressStats_updateUsed(min);
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s:%d: return %p\n", __func__, __LINE__, min);
#endif
return min;
}
/**
* Note: Will block if no stream socket is immediately available.
*
* @return connected socket; NULL on error or pending signal
*/
Socket* NodeConnPool_acquireStreamSocket(NodeConnPool* this)
{
return NodeConnPool_acquireStreamSocketEx(this, true, NULL);
}
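/*
* Typical caller pattern (illustrative sketch only, not part of this file; the connPool pointer
* would usually be obtained from the parent Node object):
*
*    Socket* sock = NodeConnPool_acquireStreamSocket(connPool);
*    if(!sock)
*       return false; // connect error or pending signal
*
*    // ... send request and receive response on sock ...
*
*    if(commError)
*       NodeConnPool_invalidateStreamSocket(connPool, sock); // drop the broken connection
*    else
*       NodeConnPool_releaseStreamSocket(connPool, sock); // hand the connection back to the pool
*/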
/**
* note: this initially checks for pending signals and returns NULL immediately if a signal is
* pending.
*
* @param allowWaiting true to allow waiting if all available conns are currently in use; if
* waiting is not allowed, there is no difference between all conns in use and a connection error
* (so it is expected that the caller will sooner or later allow waiting to detect conn errors).
* @return connected socket; NULL on error or pending signal
*/
Socket* NodeConnPool_acquireStreamSocketEx(NodeConnPool* this, bool allowWaiting,
DevicePriorityContext* devPrioCtx)
{
const char* logContext = "NodeConn (acquire stream)";
Logger* log = App_getLogger(this->app);
NetFilter* netFilter = App_getNetFilter(this->app);
NetFilter* tcpOnlyFilter = App_getTcpOnlyFilter(this->app);
Socket* sock = NULL;
unsigned short port;
NicAddressList nicListCopy;
NicAddressListIter nicIter;
bool isPrimaryInterface = true; // used to set expiration for non-primary interfaces;
// "primary" means: first interface in the list that is supported by client and server
NicAddressStats* srcRdma = NULL;
NodeType nodeType;
bool rdmaConnectInterrupted = false;
bool includeTcp = true;
if (unlikely(fatal_signal_pending(current)))
return NULL; // no need to try if the process will terminate soon anyway
Mutex_lock(&this->mutex); // L O C K
if (unlikely(this->rdmaNicCount < 0))
NodeConnPool_loadRdmaNicStatsList(this);
if(!this->availableConns && (this->establishedConns == this->maxConns) )
{ // wait for a conn to become available (or disconnected)
if(!allowWaiting)
{ // all conns in use and waiting not allowed => exit
Mutex_unlock(&this->mutex); // U N L O C K
return NULL;
}
while(!this->availableConns && (this->establishedConns == this->maxConns) )
{ // sleep interruptibly
cond_wait_res_t waitRes = Condition_waitKillable(&this->changeCond, &this->mutex);
if(unlikely(waitRes == COND_WAIT_SIGNAL) )
{ // interrupted by signal
Mutex_unlock(&this->mutex); // U N L O C K
return NULL;
}
}
}
if (this->rdmaNicCount > 0)
{
DevicePriorityContext* dpc = devPrioCtx;
// metaDevPrioCtx is hacky. When the node is meta and the client is doing
// multi-rail RDMA, this structure is used for the call to
// NodeConnPool_rdmaNicPriority(). This allows use of multiple RDMA
// interfaces to communicate with meta. When the node is storage, devPrioCtx
// is used if not NULL; in that case devPrioCtx has been initialized with
// information from the NVFS detection that happens before the stream is
// acquired.
DevicePriorityContext metaDevPrioCtx =
{
.maxConns = 0,
#ifdef BEEGFS_NVFS
.gpuIndex = -1,
#endif
};
if (!dpc && Node_getNodeType(this->parentNode) == NODETYPE_Meta)
dpc = &metaDevPrioCtx;
if (dpc)
{
dpc->maxConns = this->maxConns / this->rdmaNicCount;
srcRdma = NodeConnPool_rdmaNicPriority(this, dpc);
#ifdef BEEGFS_DEBUG
Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
"Preferred IP addr is 0x%x", srcRdma == NULL? 0 : srcRdma->nic.ipAddr.s_addr);
#endif
}
}
if(likely(this->availableConns) )
{ // established connection available => grab it
PooledSocket* pooledSock = NULL;
ConnectionListIter connIter;
ConnectionListIter_init(&connIter, &this->connList);
for(; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter))
{
pooledSock = ConnectionListIter_value(&connIter);
if (PooledSocket_isAvailable(pooledSock) &&
(srcRdma == NULL ||
(Socket_getSockType((Socket*) pooledSock) == NICADDRTYPE_RDMA
&& IBVSocket_getNicStats(&((RDMASocket*) pooledSock)->ibvsock) == srcRdma)))
break;
pooledSock = NULL;
}
if (pooledSock != NULL)
{
PooledSocket_setAvailable(pooledSock, false);
PooledSocket_setHasActivity(pooledSock);
// this invalidates the iterator but this code returns before the next iteration
ConnectionList_moveToTail(&this->connList, pooledSock);
this->availableConns--;
if (srcRdma)
srcRdma->available--;
Mutex_unlock(&this->mutex); // U N L O C K
return (Socket*)pooledSock;
}
}
// no conn available, but maxConns not reached yet => establish a new conn
port = this->streamPort;
nodeType = Node_getNodeType(this->parentNode);
if (!this->enableTCPFallback && (nodeType == NODETYPE_Meta || nodeType == NODETYPE_Storage))
includeTcp = false;
ListTk_cloneNicAddressList(&this->nicList, &nicListCopy, includeTcp);
this->establishedConns++;
if (srcRdma)
srcRdma->established++;
Mutex_unlock(&this->mutex); // U N L O C K
if (this->maxConcurrentAttempts > 0)
{
if (down_killable(&this->connSemaphore))
return NULL;
}
// walk over all available NICs, create the corresponding socket and try to connect
NicAddressListIter_init(&nicIter, &nicListCopy);
for( ; !NicAddressListIter_end(&nicIter); NicAddressListIter_next(&nicIter) )
{
NicAddress* nicAddr = NicAddressListIter_value(&nicIter);
const char* nodeTypeStr = Node_getNodeTypeStr(this->parentNode);
char* endpointStr;
bool connectRes;
if(!NetFilter_isAllowed(netFilter, nicAddr->ipAddr) )
continue;
if( (nicAddr->nicType != NICADDRTYPE_STANDARD) &&
NetFilter_isContained(tcpOnlyFilter, nicAddr->ipAddr) )
continue;
endpointStr = SocketTk_endpointAddrToStr(nicAddr->ipAddr, port);
switch(nicAddr->nicType)
{
case NICADDRTYPE_RDMA:
{ // RDMA
char from[NICADDRESS_IP_STR_LEN];
struct in_addr srcAddr;
if(!this->localNicCaps.supportsRDMA)
goto continue_clean_endpointStr;
if (srcRdma)
{
srcAddr = srcRdma->nic.ipAddr;
NicAddress_ipToStr(srcAddr, from);
}
else
{
srcAddr.s_addr = 0;
strncpy(from, "any", sizeof(from));
}
Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
"Establishing new RDMA connection from %s to: %s@%s", from, nodeTypeStr, endpointStr);
sock = (Socket*)RDMASocket_construct(srcAddr, srcRdma);
} break;
case NICADDRTYPE_STANDARD:
{ // TCP
Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
"Establishing new TCP connection to: %s@%s", nodeTypeStr, endpointStr);
sock = (Socket*)StandardSocket_constructTCP();
} break;
default:
{ // unknown
if(this->logConnErrors)
Logger_logTopFormatted(log, LogTopic_CONN, Log_WARNING, logContext,
"Skipping unknown connection type to: %s@%s", nodeTypeStr, endpointStr);
goto continue_clean_endpointStr;
}
} // end of switch
// check whether the socket was successfully initialized
if(!sock)
{ // socket initialization failed
if(this->logConnErrors)
Logger_logFormatted(log, Log_WARNING, logContext, "Socket initialization failed: %s@%s",
nodeTypeStr, endpointStr);
goto continue_clean_endpointStr;
}
// the actual connection attempt
__NodeConnPool_applySocketOptionsPreConnect(this, sock);
connectRes = sock->ops->connectByIP(sock, nicAddr->ipAddr, port);
if(connectRes)
{ // connected
struct in_addr peerIP = Socket_getPeerIP(sock);
NicAddrType_t sockType = Socket_getSockType(sock);
if(__NodeConnPool_shouldPrintConnectedLogMsg(this, peerIP, sockType) )
{
const char* protocolStr = NIC_nicTypeToString(sockType);
const char* fallbackStr = isPrimaryInterface ? "" : "; fallback route";
Logger_logTopFormatted(log, LogTopic_CONN, Log_NOTICE, logContext,
"Connected: %s@%s (protocol: %s%s)",
nodeTypeStr, endpointStr, protocolStr, fallbackStr);
}
__NodeConnPool_applySocketOptionsConnected(this, (Socket*)sock);
if(!isPrimaryInterface) // non-primary => set expiration counter
{
PooledSocket* psock = (PooledSocket*) sock;
PooledSocket_setExpireTimeStart(psock);
/* connection establishment can be interrupted for rdma, not for tcp. if we were
* interrupted during rdma setup and connected via tcp instead, we should close that
* tcp connection as soon as possible. */
psock->closeOnRelease = signal_pending(current);
}
kfree(endpointStr);
break;
}
else
{ // not connected
if(this->logConnErrors)
{
struct in_addr peerIP = Socket_getPeerIP(sock);
NicAddrType_t sockType = Socket_getSockType(sock);
if(__NodeConnPool_shouldPrintConnectFailedLogMsg(this, peerIP, sockType) )
{
Logger_logTopFormatted(log, LogTopic_CONN, Log_NOTICE, logContext,
"Connect failed: %s@%s (protocol: %s)",
nodeTypeStr, endpointStr, NIC_nicTypeToString(sockType) );
}
}
// the next interface is definitely non-primary
isPrimaryInterface = false;
if (nicAddr->nicType == NICADDRTYPE_RDMA && fatal_signal_pending(current))
rdmaConnectInterrupted = true;
Socket_virtualDestruct(sock);
}
continue_clean_endpointStr:
kfree(endpointStr);
if (fatal_signal_pending(current))
break;
}
if (this->maxConcurrentAttempts > 0)
{
up(&this->connSemaphore);
}
Mutex_lock(&this->mutex); // L O C K
if(!NicAddressListIter_end(&nicIter) && !fatal_signal_pending(current))
{ // success => add to list (as unavailable) and update stats
PooledSocket* pooledSock = (PooledSocket*)sock;
if (srcRdma && Socket_getSockType(sock) != NICADDRTYPE_RDMA)
{
srcRdma->established--;
if (!rdmaConnectInterrupted)
NicAddressStats_updateLastError(srcRdma);
}
ConnectionList_append(&this->connList, pooledSock);
__NodeConnPool_statsAddNic(this, PooledSocket_getNicType(pooledSock) );
__NodeConnPool_setConnSuccess(this, Socket_getPeerIP(sock), Socket_getSockType(sock) );
}
else
{ // absolutely unable to connect
sock = NULL;
this->establishedConns--;
if (srcRdma)
{
srcRdma->established--;
if (!rdmaConnectInterrupted)
NicAddressStats_updateLastError(srcRdma);
}
// connect may be interrupted by signals. return no connection without setting up alternate
// routes or other error handling in that case.
if (!fatal_signal_pending(current))
{
if(this->logConnErrors && !__NodeConnPool_getWasLastTimeCompleteFail(this) )
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(this->parentNode, &nodeAndType);
Logger_logTopFormatted(log, LogTopic_CONN, Log_CRITICAL, logContext,
"Connect failed on all available routes: %s", nodeAndType.buf);
}
__NodeConnPool_setCompleteFail(this);
}
// we are not using this connection => notify next waiter
Condition_signal(&this->changeCond);
}
ListTk_kfreeNicAddressListElems(&nicListCopy);
NicAddressList_uninit(&nicListCopy);
Mutex_unlock(&this->mutex); // U N L O C K
return sock;
}
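/**
* Return a previously acquired socket to the pool and mark it as available again.
*
* Sockets that have exceeded their expiration time (non-primary routes) or that are flagged
* closeOnRelease are invalidated (disconnected) instead of being made available.
*/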
void NodeConnPool_releaseStreamSocket(NodeConnPool* this, Socket* sock)
{
PooledSocket* pooledSock = (PooledSocket*)sock;
// test whether this socket has expired
if(unlikely(PooledSocket_getHasExpired(pooledSock, this->fallbackExpirationSecs) ||
pooledSock->closeOnRelease))
{ // this socket just expired => invalidate it
__NodeConnPool_invalidateSpecificStreamSocket(this, sock);
return;
}
// mark the socket as available
Mutex_lock(&this->mutex); // L O C K
if (unlikely(PooledSocket_getPool(pooledSock) != &this->connList))
{
printk_fhgfs(KERN_ERR, "%s:%d: socket %p not in pool %p\n",
__func__, __LINE__, pooledSock, this);
goto exit;
}
this->availableConns++;
PooledSocket_setAvailable(pooledSock, true);
ConnectionList_moveToHead(&this->connList, pooledSock);
if (Socket_getSockType((Socket*) pooledSock) == NICADDRTYPE_RDMA)
{
NicAddressStats* st = IBVSocket_getNicStats(&((RDMASocket*) pooledSock)->ibvsock);
if (st)
st->available++;
}
Condition_signal(&this->changeCond);
exit:
Mutex_unlock(&this->mutex); // U N L O C K
}
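/**
* Invalidate (disconnect) a connection after a communication error.
*
* Besides the given socket, all currently available connections of this pool are invalidated
* first, so that other threads don't immediately acquire a pooled connection that is possibly
* affected by the same problem.
*/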
void NodeConnPool_invalidateStreamSocket(NodeConnPool* this, Socket* sock)
{
Logger* log = App_getLogger(this->app);
const char* logContext = "NodeConn (invalidate stream)";
unsigned numInvalidated;
/* note: we use invalidateAvailableStreams first here, because we want to keep other threads
from trying to acquire an available socket immediately - and the parameter socket isn't
available anyway. (we don't want to acquire a mutex here that would block the whole pool
while we're just waiting for the sock-close response messages) */
numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, false);
Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext,
"Invalidated %u pooled connections. (1 more invalidation pending...)", numInvalidated);
__NodeConnPool_invalidateSpecificStreamSocket(this, sock);
}
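/**
* Remove a single socket from the pool, update the stats counters, shut the connection down
* (waiting up to NODECONNPOOL_SHUTDOWN_WAITTIMEMS for the peer's disconnect) and destruct it.
*
* If the socket is not part of this pool, an error is logged and the socket is left untouched.
*/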
void __NodeConnPool_invalidateSpecificStreamSocket(NodeConnPool* this, Socket* sock)
{
const char* logContext = "NodeConn (invalidate stream)";
Logger* log = App_getLogger(this->app);
bool sockValid = true;
PooledSocket* pooledSock = (PooledSocket*)sock;
bool shutdownRes;
Mutex_lock(&this->mutex); // L O C K
if (unlikely(PooledSocket_getPool(pooledSock) != &this->connList))
{ // socket not in the list
Logger_logErrFormatted(log, logContext,
"Tried to remove a socket that was not found in the pool: %s", Socket_getPeername(sock) );
sockValid = false;
}
else
{
this->establishedConns--;
if (Socket_getSockType((Socket*) sock) == NICADDRTYPE_RDMA)
{
NicAddressStats* st = IBVSocket_getNicStats(&((RDMASocket*) sock)->ibvsock);
if (st)
st->established--;
}
ConnectionList_remove(&this->connList, pooledSock);
__NodeConnPool_statsRemoveNic(this, PooledSocket_getNicType(pooledSock) );
Condition_signal(&this->changeCond);
}
Mutex_unlock(&this->mutex); // U N L O C K
if(!sockValid)
return;
shutdownRes = sock->ops->shutdownAndRecvDisconnect(sock, NODECONNPOOL_SHUTDOWN_WAITTIMEMS);
if(shutdownRes)
{
Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext, "Disconnected: %s@%s",
Node_getNodeTypeStr(this->parentNode), Socket_getPeername(sock) );
}
else
{
Logger_logTopFormatted(log, LogTopic_CONN, Log_DEBUG, logContext, "Hard disconnect: %s@%s",
Node_getNodeTypeStr(this->parentNode), Socket_getPeername(sock) );
}
Socket_virtualDestruct(sock);
}
/**
* Invalidate (disconnect) connections that are currently not acquired.
*
* @param idleStreamsOnly invalidate only conns that are marked as idle (ie don't have the activity
* flag set).
* @param closeOnRelease if true set every PooledSocket's closeOnRelease flag
* @return number of invalidated streams
*/
unsigned __NodeConnPool_invalidateAvailableStreams(NodeConnPool* this, bool idleStreamsOnly,
bool closeOnRelease)
{
/* note: we have TWO STAGES here, because we don't want to hold the mutex and block everything
while we're waiting for the conns to be dropped. */
unsigned numInvalidated = 0; // retVal
ConnectionListIter connIter;
ConnectionListIter availableIter;
ConnectionList availableConnsList;
ConnectionList_init(&availableConnsList, false);
Mutex_lock(&this->mutex); // L O C K
// STAGE 1: grab all sockets that should be disconnected
ConnectionListIter_init(&connIter, &this->connList);
for( ; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter) )
{
PooledSocket* sock = ConnectionListIter_value(&connIter);
if(closeOnRelease)
sock->closeOnRelease = true;
if(!PooledSocket_isAvailable(sock) )
continue; // this one is currently in use
if(idleStreamsOnly && PooledSocket_getHasActivity(sock) )
continue; // idle-only requested and this one was not idle
// don't worry about reordering the connList here, the socket
// will be removed in stage 2
PooledSocket_setAvailable(sock, false);
this->availableConns--;
if (Socket_getSockType((Socket*)sock) == NICADDRTYPE_RDMA)
{
NicAddressStats* st = IBVSocket_getNicStats(&((RDMASocket*) sock)->ibvsock);
if (st)
st->available--;
}
ConnectionList_append(&availableConnsList, sock);
numInvalidated++;
}
Mutex_unlock(&this->mutex); // U N L O C K
// STAGE 2: invalidate all grabbed sockets
ConnectionListIter_init(&availableIter, &availableConnsList);
for( ; !ConnectionListIter_end(&availableIter); ConnectionListIter_next(&availableIter) )
{
PooledSocket* sock = ConnectionListIter_value(&availableIter);
__NodeConnPool_invalidateSpecificStreamSocket(this, (Socket*)sock);
}
ConnectionList_uninit(&availableConnsList);
return numInvalidated;
}
/**
* Disconnect all established (available) connections.
*
* @return number of disconnected streams
*/
unsigned NodeConnPool_disconnectAvailableStreams(NodeConnPool* this)
{
unsigned numInvalidated;
numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, false);
return numInvalidated;
}
/**
* Note: There is no locking around dropping and resetting afterwards, so there should only be one
* thread which calls this function (=> InternodeSyncer).
*
* @return number of disconnected streams
*/
unsigned NodeConnPool_disconnectAndResetIdleStreams(NodeConnPool* this)
{
unsigned numInvalidated;
numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, true, false);
__NodeConnPool_resetStreamsIdleFlag(this);
return numInvalidated;
}
/**
* Resets the activity flag of all available connections to mark them as idle.
*/
void __NodeConnPool_resetStreamsIdleFlag(NodeConnPool* this)
{
ConnectionListIter connIter;
Mutex_lock(&this->mutex); // L O C K
ConnectionListIter_init(&connIter, &this->connList);
for( ; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter) )
{
PooledSocket* sock = ConnectionListIter_value(&connIter);
if(!PooledSocket_isAvailable(sock) )
continue; // this one is currently in use
PooledSocket_resetHasActivity(sock);
}
Mutex_unlock(&this->mutex); // U N L O C K
}
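/**
* Apply configuration-dependent socket options that have to be set before connecting:
* the TCP receive buffer size for standard sockets; buffer, timeout, type-of-service and
* connection-failure-status settings for RDMA sockets (with separate buffer settings when the
* parent node is a meta node).
*
* @return always true (currently cannot fail)
*/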
bool __NodeConnPool_applySocketOptionsPreConnect(NodeConnPool* this, Socket* sock)
{
Config* cfg = App_getConfig(this->app);
NicAddrType_t sockType = Socket_getSockType(sock);
if(sockType == NICADDRTYPE_STANDARD)
{
StandardSocket* stdSock = (StandardSocket*)sock;
int bufSize = Config_getConnTCPRcvBufSize(cfg);
if (bufSize > 0)
StandardSocket_setSoRcvBuf(stdSock, bufSize);
}
else
if(sockType == NICADDRTYPE_RDMA)
{
RDMASocket* rdmaSock = (RDMASocket*)sock;
if (Node_getNodeType(this->parentNode) == NODETYPE_Meta)
RDMASocket_setBuffers(rdmaSock, Config_getConnRDMAMetaBufNum(cfg),
Config_getConnRDMAMetaBufSize(cfg), Config_getConnRDMAMetaFragmentSize(cfg),
Config_getConnRDMAKeyTypeNum(cfg));
else
RDMASocket_setBuffers(rdmaSock, Config_getConnRDMABufNum(cfg),
Config_getConnRDMABufSize(cfg), Config_getConnRDMAFragmentSize(cfg),
Config_getConnRDMAKeyTypeNum(cfg));
RDMASocket_setTimeouts(rdmaSock, Config_getConnRDMATimeoutConnect(cfg),
Config_getConnRDMATimeoutCompletion(cfg), Config_getConnRDMATimeoutFlowSend(cfg),
Config_getConnRDMATimeoutFlowRecv(cfg), Config_getConnRDMATimeoutPoll(cfg));
RDMASocket_setTypeOfService(rdmaSock, Config_getConnRDMATypeOfService(cfg));
RDMASocket_setConnectionFailureStatus(rdmaSock, Config_getRemapConnectionFailureStatus(cfg));
}
return true;
}
/**
* Apply socket tweaks and send channel control messages.
*/
bool __NodeConnPool_applySocketOptionsConnected(NodeConnPool* this, Socket* sock)
{
const char* logContext = "NodeConn (apply socket options)";
Logger* log = App_getLogger(this->app);
Config* cfg = App_getConfig(this->app);
uint64_t authHash = Config_getConnAuthHash(cfg);
bool allSuccessful = true;
NicAddrType_t sockType = Socket_getSockType(sock);
// apply general socket options
if(sockType == NICADDRTYPE_STANDARD)
{
StandardSocket* standardSock = (StandardSocket*)sock;
bool corkRes;
bool noDelayRes;
bool keepAliveRes;
corkRes = StandardSocket_setTcpCork(standardSock, false);
if(!corkRes)
{
Logger_log(log, Log_NOTICE, logContext, "Failed to disable TcpCork");
allSuccessful = false;
}
noDelayRes = StandardSocket_setTcpNoDelay(standardSock, true);
if(!noDelayRes)
{
Logger_log(log, Log_NOTICE, logContext, "Failed to enable TcpNoDelay");
allSuccessful = false;
}
keepAliveRes = StandardSocket_setSoKeepAlive(standardSock, true);
if(!keepAliveRes)
{
Logger_log(log, Log_NOTICE, logContext, "Failed to enable SoKeepAlive");
allSuccessful = false;
}
}
// send control messages
if(authHash)
{ // authenticate channel
char* sendBuf;
size_t sendBufLen;
AuthenticateChannelMsg authMsg;
ssize_t sendRes;
AuthenticateChannelMsg_initFromValue(&authMsg, authHash);
sendBufLen = NetMessage_getMsgLength( (NetMessage*)&authMsg);
sendBuf = (char*)os_kmalloc(sendBufLen);
NetMessage_serialize( (NetMessage*)&authMsg, sendBuf, sendBufLen);
sendRes = Socket_send_kernel(sock, sendBuf, sendBufLen, 0);
if(sendRes <= 0)
{
Logger_log(log, Log_WARNING, logContext, "Failed to send authentication");
allSuccessful = false;
}
kfree(sendBuf);
}
{ // make channel indirect
char* sendBuf;
size_t sendBufLen;
SetChannelDirectMsg directMsg;
ssize_t sendRes;
SetChannelDirectMsg_initFromValue(&directMsg, 0);
sendBufLen = NetMessage_getMsgLength( (NetMessage*)&directMsg);
sendBuf = (char*)os_kmalloc(sendBufLen);
NetMessage_serialize( (NetMessage*)&directMsg, sendBuf, sendBufLen);
sendRes = Socket_send_kernel(sock, sendBuf, sendBufLen, 0);
if(sendRes <= 0)
{
Logger_log(log, Log_WARNING, logContext, "Failed to set channel to indirect mode");
allSuccessful = false;
}
kfree(sendBuf);
}
{
char* sendBuf;
size_t sendBufLen;
PeerInfoMsg peerInfo;
ssize_t sendRes = -1;
PeerInfoMsg_init(&peerInfo, NODETYPE_Client, this->app->localNode->numID);
sendBufLen = NetMessage_getMsgLength(&peerInfo.netMessage);
sendBuf = kmalloc(sendBufLen, GFP_KERNEL);
if (sendBuf)
{
NetMessage_serialize(&peerInfo.netMessage, sendBuf, sendBufLen);
sendRes = Socket_send_kernel(sock, sendBuf, sendBufLen, 0);
}
if(sendRes <= 0)
{
Logger_log(log, Log_WARNING, logContext, "Failed to transmit local node info");
allSuccessful = false;
}
kfree(sendBuf);
}
return allSuccessful;
}
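/**
* Get a snapshot of the current connection statistics.
*
* @param outStats copy of the current stats (taken under the pool mutex)
*/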
void NodeConnPool_getStats(NodeConnPool* this, NodeConnPoolStats* outStats)
{
Mutex_lock(&this->mutex); // L O C K
*outStats = this->stats;
Mutex_unlock(&this->mutex); // U N L O C K
}
/**
* Gets the first socket peer name for a connection to the specified nicType.
* Only works on currently available connections.
*
* @param outBuf buffer for the peer name string (contains "busy" if no such connection exists or
* none is currently available).
* @param outIsNonPrimary set to true if conn in outBuf has an expiration counter, false otherwise
* @return number of chars written to buf
*/
unsigned NodeConnPool_getFirstPeerName(NodeConnPool* this, NicAddrType_t nicType, ssize_t outBufLen,
char* outBuf, bool* outIsNonPrimary)
{
unsigned numWritten = 0; // return value
ConnectionListIter connIter;
bool foundMatch = false;
Mutex_lock(&this->mutex); // L O C K
ConnectionListIter_init(&connIter, &this->connList);
for( ; !ConnectionListIter_end(&connIter); ConnectionListIter_next(&connIter) )
{
PooledSocket* pooledSock = ConnectionListIter_value(&connIter);
Socket* sock = (Socket*)pooledSock;
if(PooledSocket_isAvailable(pooledSock) &&
(Socket_getSockType(sock) == nicType) )
{ // found a match => print to buf and stop
numWritten += scnprintf(outBuf, outBufLen, "%s", Socket_getPeername(sock) );
*outIsNonPrimary = PooledSocket_getHasExpirationTimer(pooledSock);
foundMatch = true;
break;
}
}
if(!foundMatch)
{ // print "n/a"
numWritten += scnprintf(outBuf, outBufLen, "busy");
*outIsNonPrimary = false;
}
Mutex_unlock(&this->mutex); // U N L O C K
return numWritten;
}
/**
* Increase stats counter for number of established conns (by NIC type).
*/
void __NodeConnPool_statsAddNic(NodeConnPool* this, NicAddrType_t nicType)
{
switch(nicType)
{
case NICADDRTYPE_RDMA:
{
(this->stats.numEstablishedRDMA)++;
} break;
default:
{
(this->stats.numEstablishedStd)++;
} break;
}
}
/**
* Decrease stats counter for number of established conns (by NIC type).
*/
void __NodeConnPool_statsRemoveNic(NodeConnPool* this, NicAddrType_t nicType)
{
switch(nicType)
{
case NICADDRTYPE_RDMA:
{
(this->stats.numEstablishedRDMA)--;
} break;
default:
{
(this->stats.numEstablishedStd)--;
} break;
}
}
/**
* @param streamPort value 0 will be ignored
* @param nicList will be copied
* @return true if port has changed
*/
bool NodeConnPool_updateInterfaces(NodeConnPool* this, unsigned short streamPort,
NicAddressList* nicList)
{
Logger* log = App_getLogger(this->app);
const char* logContext = "NodeConn (update stream port)";
bool hasChanged = false; // retVal
NodeString alias;
Mutex_lock(&this->mutex); // L O C K
Node_copyAlias(this->parentNode, &alias);
if(streamPort && (streamPort != this->streamPort) )
{
this->streamPort = streamPort;
hasChanged = true;
Logger_logFormatted(log, Log_NOTICE, logContext,
"Node %s port has changed", alias.buf);
}
if (!NicAddressList_equals(&this->nicList, nicList))
{ // update nicList (if allocation of new list fails, we just keep the old list)
NicAddressList newNicList;
hasChanged = true;
Logger_logFormatted(log, Log_NOTICE, logContext,
"Node %s interfaces have changed", alias.buf);
ListTk_cloneNicAddressList(nicList, &newNicList, true);
ListTk_kfreeNicAddressListElems(&this->nicList);
NicAddressList_uninit(&this->nicList);
this->nicList = newNicList;
}
Mutex_unlock(&this->mutex); // U N L O C K
if (unlikely(hasChanged))
{
// closeOnRelease is true, all of these sockets need to be invalidated ASAP
unsigned numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, true);
if(numInvalidated)
{
Logger_logFormatted(log, Log_DEBUG, logContext,
"Invalidated %u pooled connections (due to port/interface change)", numInvalidated);
}
}
return hasChanged;
}
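/*
* The errState handled below tracks whether the last connect attempt failed on all routes and
* which peer IP / NIC type the last successful connect used. It is only consulted by
* __NodeConnPool_shouldPrintConnectedLogMsg() and __NodeConnPool_shouldPrintConnectFailedLogMsg()
* to avoid flooding the log with repeated "Connected" / "Connect failed" messages for the same
* route.
*/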
void __NodeConnPool_setCompleteFail(NodeConnPool* this)
{
this->errState.lastSuccessPeerIP.s_addr = 0;
this->errState.wasLastTimeCompleteFail = true;
}
void __NodeConnPool_setConnSuccess(NodeConnPool* this, struct in_addr lastSuccessPeerIP,
NicAddrType_t lastSuccessNicType)
{
this->errState.lastSuccessPeerIP = lastSuccessPeerIP;
this->errState.lastSuccessNicType = lastSuccessNicType;
this->errState.wasLastTimeCompleteFail = false;
}
bool __NodeConnPool_equalsLastSuccess(NodeConnPool* this, struct in_addr lastSuccessPeerIP,
NicAddrType_t lastSuccessNicType)
{
return (this->errState.lastSuccessPeerIP.s_addr == lastSuccessPeerIP.s_addr) &&
(this->errState.lastSuccessNicType == lastSuccessNicType);
}
bool __NodeConnPool_isLastSuccessInitialized(NodeConnPool* this)
{
return (this->errState.lastSuccessPeerIP.s_addr != 0);
}
bool __NodeConnPool_shouldPrintConnectedLogMsg(NodeConnPool* this, struct in_addr currentPeerIP,
NicAddrType_t currentNicType)
{
/* log only if we didn't succeed at all last time, or if we succeeded last time and it was
with a different IP/NIC pair */
return this->errState.wasLastTimeCompleteFail ||
!__NodeConnPool_isLastSuccessInitialized(this) ||
!__NodeConnPool_equalsLastSuccess(this, currentPeerIP, currentNicType);
}
bool __NodeConnPool_shouldPrintConnectFailedLogMsg(NodeConnPool* this, struct in_addr currentPeerIP,
NicAddrType_t currentNicType)
{
/* log only if this is the first connection attempt or if we succeeded last time with this
IP/NIC pair */
return (!this->errState.wasLastTimeCompleteFail &&
(!__NodeConnPool_isLastSuccessInitialized(this) ||
__NodeConnPool_equalsLastSuccess(this, currentPeerIP, currentNicType) ) );
}
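/**
* Update the local NIC capabilities and (optionally) the local RDMA NIC list after a local
* interface change.
*
* Rebuilds the RDMA NIC stats list and invalidates all currently available connections (setting
* closeOnRelease on the remaining ones), so that new connections use the updated interfaces.
*
* @param localRdmaNicList will be copied; may be NULL to keep the current RDMA NIC list
*/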
void NodeConnPool_updateLocalInterfaces(NodeConnPool* this, NicAddressList* localNicList,
NicListCapabilities* localNicCaps, NicAddressList* localRdmaNicList)
{
Logger* log = App_getLogger(this->app);
const char* logContext = "NodeConn (update local NIC list)";
unsigned numInvalidated;
Mutex_lock(&this->mutex); // L O C K
this->localNicCaps = *localNicCaps;
if (localRdmaNicList)
{
ListTk_kfreeNicAddressListElems(&this->localRdmaNicList);
NicAddressList_uninit(&this->localRdmaNicList);
ListTk_cloneNicAddressList(localRdmaNicList, &this->localRdmaNicList, true);
NodeConnPool_loadRdmaNicStatsList(this);
}
Mutex_unlock(&this->mutex); // U N L O C K
numInvalidated = __NodeConnPool_invalidateAvailableStreams(this, false, true);
if(numInvalidated)
{
Logger_logFormatted(log, Log_DEBUG, logContext,
"Invalidated %u pooled connections (due to local interface change)", numInvalidated);
}
}