#include "IBVSocket.h"
|
|
|
|
#include <sys/epoll.h>
|
|
|
|
#include <common/app/log/Logger.h>
|
|
#include <common/app/AbstractApp.h>
|
|
#include <common/threading/PThread.h>
|
|
|
|
#ifdef BEEGFS_NVFS
|
|
// only for WORKER_BUFOUT_SIZE
|
|
#include <common/components/worker/Worker.h>
|
|
#endif /* BEEGFS_NVFS */
|
|
|
|
#define IBVSOCKET_CONN_TIMEOUT_MS 3000
|
|
// IBVSOCKET_CONN_TIMEOUT_INITIAL_POLL_MS is the initial timeout to wait between checks for
|
|
// RDMA response events while establishing outgoing connections. Will be scaled up exponentially
|
|
// after every poll (up to IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS) to reduce load but allow for quick
|
|
// turnaround in the majority of cases where an event will come shortly after initiating the
|
|
// connection.
|
|
//
|
|
// There used to be a fixed timeout of 500ms here, which lead to long initial connection initiation
|
|
// times when an event was not received immediately
|
|
// (https://github.com/ThinkParQ/beegfs-core/issues/4054).
|
|
#define IBVSOCKET_CONN_TIMEOUT_INITIAL_POLL_MS 1
|
|
#define IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS 512
|
|
|
|
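/* Note(added): with doubling from 1ms to the 512ms cap, the poll intervals in
 * IBVSocket_connectByIP() are 1, 2, 4, ..., 512, 512, ... ms. The first second
 * of the 3s connect budget thus gets ten quick polls, so a prompt CM event is
 * picked up within milliseconds, while a silent peer costs at most one 512ms
 * sleep per further iteration. */
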
#define IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS 180000
#define IBVSOCKET_POLL_TIMEOUT_MS 7500
#define IBVSOCKET_LISTEN_BACKLOG 128
#define IBVSOCKET_FLOWCONTROL_MSG_LEN 1
#define IBVSOCKET_DEFAULT_TOS 0
/**
 * IBVSOCKET_RECV_TIMEOUT_MS is used by IBVSocket_recv, which does not take a
 * timeout value. It is very long because IBVSocket_recv continues to call
 * IBVSocket_recvT until it does not time out.
 */
#define IBVSOCKET_RECV_TIMEOUT_MS (1024*1024)

#define IBVSOCKET_MIN_BUF_NUM 1
#define IBVSOCKET_MIN_BUF_SIZE 4096 // 4kiB
#define IBVSOCKET_MAX_BUF_SIZE_NUM 134217728 // num * size <= 128MiB
#ifdef BEEGFS_NVFS
#define IBVSOCKET_WC_ENTRIES 1
#endif /* BEEGFS_NVFS */

void IBVSocket_init(IBVSocket* _this)
{
   memset(_this, 0, sizeof(*_this) );

   _this->sockValid = false;
   _this->epollFD = -1;
   _this->typeOfService = IBVSOCKET_DEFAULT_TOS;
   _this->timeoutCfg.connectMS = IBVSOCKET_CONN_TIMEOUT_MS;
   _this->timeoutCfg.flowSendMS = IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS;

   _this->cm_channel = rdma_create_event_channel();
   if(!_this->cm_channel)
   {
      LOG(SOCKLIB, WARNING, "rdma_create_event_channel failed.");
      return;
   }

   if(rdma_create_id(_this->cm_channel, &_this->cm_id, NULL, RDMA_PS_TCP) )
   {
      LOG(SOCKLIB, WARNING, "rdma_create_id failed.");
      return;
   }

   _this->sockValid = true;

   return;
}

/**
 * Note: Intended for incoming accepted connections.
 *
 * @param commContext belongs to this object (so do not use or free it after calling this!)
 */
void __IBVSocket_initFromCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
   IBVCommContext* commContext)
{
   memset(_this, 0, sizeof(*_this) );

   _this->sockValid = false;
   _this->epollFD = -1;

   _this->typeOfService = IBVSOCKET_DEFAULT_TOS;

   _this->cm_id = cm_id;
   _this->commContext = commContext;

#ifdef SYSTEM_HAS_RDMA_MIGRATE_ID__disabled

   // note: see _accept() for the reasons why this is currently disabled

   _this->cm_channel = rdma_create_event_channel();
   if(!_this->cm_channel)
   {
      LOG(SOCKLIB, WARNING, "rdma_create_event_channel failed.");
      return;
   }

#endif // SYSTEM_HAS_RDMA_MIGRATE_ID

   _this->sockValid = true;
   LOG(SOCKLIB, DEBUG, __func__,
      ("_this", StringTk::uint64ToHexStr((uint64_t) _this)),
      ("device", cm_id->verbs->device->name));

   return;
}

IBVSocket* IBVSocket_construct()
{
   IBVSocket* _this = (IBVSocket*)malloc(sizeof(*_this) );

   IBVSocket_init(_this);

   return _this;
}

IBVSocket* __IBVSocket_constructFromCommContext(struct rdma_cm_id* cm_id,
   IBVCommContext* commContext)
{
   IBVSocket* _this = (IBVSocket*)malloc(sizeof(*_this) );

   __IBVSocket_initFromCommContext(_this, cm_id, commContext);

   return _this;
}

void IBVSocket_uninit(IBVSocket* _this)
{
   if(_this->epollFD != -1)
      close(_this->epollFD);

   __IBVSocket_close(_this);
}

void IBVSocket_destruct(IBVSocket* _this)
{
   IBVSocket_uninit(_this);

   free(_this);
}

bool IBVSocket_rdmaDevicesExist()
{
   bool devicesExist;

   int numDevices = 1;
   struct ibv_context** devicesRes;

   devicesRes = rdma_get_devices(&numDevices);

   devicesExist = (devicesRes != NULL) && (numDevices > 0);

   if(devicesRes)
      rdma_free_devices(devicesRes);

   return devicesExist;
}

/**
 * Prepare ibverbs for forking a child process. This is only required if the parent process
 * has mapped memory for RDMA.
 * Call this only once in your program.
 *
 * Note: There is no corresponding uninit-method that needs to be called.
 */
void IBVSocket_fork_init_once()
{
   ibv_fork_init();
}

bool IBVSocket_connectByName(IBVSocket* _this, const char* hostname, unsigned short port,
   IBVCommConfig* commCfg)
{
   struct addrinfo *res;
   struct addrinfo hints;

   int getInfoRes;
   struct in_addr ipaddress;

   memset(&hints, 0, sizeof(hints) );
   hints.ai_family = PF_INET;
   hints.ai_socktype = SOCK_STREAM;

   getInfoRes = getaddrinfo(hostname, NULL, &hints, &res);

   if(getInfoRes != 0) // note: getaddrinfo() returns a non-zero EAI_* code on failure
   {
      LOG(SOCKLIB, WARNING, "Name resolution error.", hostname, port,
         ("error", gai_strerror(getInfoRes)));

      return false;
   }

   ipaddress.s_addr = ( (struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;


   // clean-up
   freeaddrinfo(res);


   return IBVSocket_connectByIP(_this, ipaddress, port, commCfg);
}

bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress, unsigned short port,
   IBVCommConfig* commCfg)
{
   struct rdma_cm_event* event;
   struct sockaddr_in sin;
   bool createContextRes;
   struct rdma_conn_param conn_param;
   bool parseCommDestRes;
   bool epollInitRes;
   int rc;
   int connTimeoutMS = IBVSOCKET_CONN_TIMEOUT_INITIAL_POLL_MS;
   int connTimeoutRemaining = IBVSOCKET_CONN_TIMEOUT_MS;
   int oldChannelFlags;
   int setOldFlagsRes;

   LOG(SOCKLIB, DEBUG, "Connect RDMASocket", ("socket", _this),
      ("addr", Socket::endpointAddrToStr(ipaddress, port)),
      ("bindIP", Socket::ipaddrToStr(_this->bindIP)));

   // resolve IP address...

   sin.sin_addr.s_addr = ipaddress.s_addr;
   sin.sin_family = AF_INET;
   sin.sin_port = htons(port);

   if(rdma_resolve_addr(_this->cm_id, NULL, (struct sockaddr*)&sin, _this->timeoutCfg.connectMS) )
   {
      LOG(SOCKLIB, WARNING, "rdma_resolve_addr failed.");
      goto err_invalidateSock;
   }

   if(rdma_get_cm_event(_this->cm_channel, &event))
      goto err_invalidateSock;

   if(event->event != RDMA_CM_EVENT_ADDR_RESOLVED)
   {
      LOG(SOCKLIB, DEBUG, "Unexpected CM event.", ("event", rdma_event_str(event->event)));
      goto err_ack_and_invalidateSock;
   }

   rdma_ack_cm_event(event);

   // set type of service for connection
   if (_this->typeOfService)
   {
      if (rdma_set_option(_this->cm_id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS, &(_this->typeOfService),
          sizeof(_this->typeOfService)))
      {
         LOG(SOCKLIB, WARNING, "Failed to set Type Of Service.",
            ("tos", _this->typeOfService));
         goto err_invalidateSock;
      }
   }

   // resolve route...

   if(rdma_resolve_route(_this->cm_id, _this->timeoutCfg.connectMS) )
   {
      LOG(SOCKLIB, WARNING, "rdma_resolve_route failed.");
      goto err_invalidateSock;
   }

   if(rdma_get_cm_event(_this->cm_channel, &event))
      goto err_invalidateSock;

   if(event->event != RDMA_CM_EVENT_ROUTE_RESOLVED)
   {
      LOG(SOCKLIB, WARNING, "Unexpected CM event.",
         ("event", rdma_event_str(event->event)));
      goto err_ack_and_invalidateSock;
   }

   rdma_ack_cm_event(event);

   // create comm context...

   createContextRes = __IBVSocket_createCommContext(_this, _this->cm_id, commCfg,
      &_this->commContext);
   if(!createContextRes)
   {
      LOG(SOCKLIB, WARNING, "creation of CommContext failed.");
      goto err_invalidateSock;
   }

   // establish connection...

   __IBVSocket_initCommDest(_this->commContext, &_this->localDest);

   memset(&conn_param, 0, sizeof(conn_param) );
#ifdef BEEGFS_NVFS
   conn_param.responder_resources = RDMA_MAX_RESP_RES;
   conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
#else
   conn_param.responder_resources = 1;
   conn_param.initiator_depth = 1;
#endif /* BEEGFS_NVFS */
   conn_param.flow_control = 0;
   conn_param.retry_count = 7; // (3 bits)
   conn_param.rnr_retry_count = 7; // rnr = receiver not ready (3 bits, 7 means infinity)
   conn_param.private_data = &_this->localDest;
   conn_param.private_data_len = sizeof(_this->localDest);

   if(rdma_connect(_this->cm_id, &conn_param))
   {
      LOG(SOCKLIB, DEBUG, "rdma_connect failed.");
      goto err_invalidateSock;
   }

   // rdma_connect() can take a very long time (>5m) to time out if the peer's HCA is down.
   // Change the channel to non-blocking and use a custom timeout mechanism.

   oldChannelFlags = fcntl(IBVSocket_getConnManagerFD(_this), F_GETFL);

   rc = fcntl(IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags | O_NONBLOCK);
   if(rc < 0)
   {
      LOG(SOCKLIB, WARNING, "Set conn manager channel non-blocking failed.", sysErr);
      goto err_invalidateSock;
   }

   rc = -1;
   while (connTimeoutRemaining > 0)
   {
      // (non-blocking) check for new events
      rc = rdma_get_cm_event(_this->cm_channel, &event);

      if (rc)
      {
         if (errno != ETIMEDOUT && errno != EAGAIN)
         {
            LOG(SOCKLIB, WARNING, "rdma_get_cm_event failed", ("errno", errno));
            break;
         }
      }
      else
      {
         // we got an event
         LOG(SOCKLIB, DEBUG, "Received RDMA connect response event.",
            ("Milliseconds spent polling", IBVSOCKET_CONN_TIMEOUT_MS - connTimeoutRemaining));
         break;
      }

      connTimeoutRemaining -= connTimeoutMS;
      if (connTimeoutRemaining > 0)
      {
         struct timespec ts = {
            .tv_sec = 0,
            .tv_nsec = (connTimeoutMS * 1000 * 1000)
         };

         // progressively scale the poll interval by doubling it until it reaches
         // IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS
         if (connTimeoutMS == 1)
         {
            connTimeoutMS = 2;
         }
         else if (connTimeoutMS * 2 <= IBVSOCKET_CONN_TIMEOUT_MAX_POLL_MS)
         {
            connTimeoutMS *= 2;
         }

         if (::nanosleep(&ts, NULL) != 0)
         {
            LOG(SOCKLIB, DEBUG, "rdma_connect: sleep interrupted");
            break;
         }
      }
      else
         LOG(SOCKLIB, DEBUG, "rdma_connect: timed out");
   }

   // change channel mode back to blocking
   setOldFlagsRes = fcntl(IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags);
   if(setOldFlagsRes < 0)
   {
      LOG(SOCKLIB, WARNING, "Set conn manager channel blocking failed.", sysErr);
      if (rc == 0)
         goto err_ack_and_invalidateSock;
      else
         goto err_invalidateSock;
   }

   if (rc != 0)
      goto err_invalidateSock;

   if(event->event != RDMA_CM_EVENT_ESTABLISHED)
   {
      if(event->event == RDMA_CM_EVENT_REJECTED)
         LOG(SOCKLIB, DEBUG, "Connection rejected.");
      else
         LOG(SOCKLIB, WARNING, "Unexpected conn manager event.",
            ("event", rdma_event_str(event->event)));
      goto err_ack_and_invalidateSock;
   }

   parseCommDestRes = __IBVSocket_parseCommDest(
      event->param.conn.private_data, event->param.conn.private_data_len, &_this->remoteDest);
   if(!parseCommDestRes)
   {
      LOG(SOCKLIB, WARNING, "Bad private data received.",
         ("len", event->param.conn.private_data_len));
      goto err_ack_and_invalidateSock;
   }

   rdma_ack_cm_event(event);

   epollInitRes = __IBVSocket_initEpollFD(_this);
   if(!epollInitRes)
      goto err_invalidateSock;

   return true;


err_ack_and_invalidateSock:
   rdma_ack_cm_event(event);
err_invalidateSock:
   _this->errState = -1;

   return false;
}

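/* Typical client-side usage (illustrative sketch only; names, buffer sizes and
 * error handling are made-up simplifications, not taken from a caller in this
 * codebase):
 *
 *    IBVCommConfig commCfg = {};
 *    commCfg.bufNum = 16;      // number of send/recv buffers
 *    commCfg.bufSize = 8192;   // bytes per buffer
 *
 *    IBVSocket* sock = IBVSocket_construct();
 *    if(sock->sockValid && IBVSocket_connectByName(sock, "server", 8003, &commCfg) )
 *       IBVSocket_send(sock, data, dataLen, 0);
 *
 *    IBVSocket_destruct(sock);
 */
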
/**
 * @return true on success
 */
bool IBVSocket_bind(IBVSocket* _this, unsigned short port)
{
   in_addr_t ipAddr = INADDR_ANY;

   return IBVSocket_bindToAddr(_this, ipAddr, port);
}

bool IBVSocket_bindToAddr(IBVSocket* _this, in_addr_t ipAddr, unsigned short port)
{
   struct sockaddr_in bindAddr;

   bindAddr.sin_family = AF_INET;
   bindAddr.sin_addr.s_addr = ipAddr;
   bindAddr.sin_port = htons(port);

   LOG(SOCKLIB, DEBUG, "Bind RDMASocket", ("socket", _this),
      ("addr", Socket::endpointAddrToStr(ipAddr, port)));

   if(rdma_bind_addr(_this->cm_id, (struct sockaddr*)&bindAddr) )
   {
      //SyslogLogger::log(LOG_WARNING, "%s:%d rdma_bind_addr failed (port: %d)\n",
      //   __func__, __LINE__, (int)port); // debug in
      goto err_invalidateSock;
   }

   _this->bindIP.s_addr = ipAddr;

   return true;


err_invalidateSock:
   _this->errState = -1;

   return false;
}

/**
 * Note: This also inits the delayedCmEventsQueue.
 *
 * @return true on success
 */
bool IBVSocket_listen(IBVSocket* _this)
{
   if(rdma_listen(_this->cm_id, IBVSOCKET_LISTEN_BACKLOG) )
   {
      LOG(SOCKLIB, WARNING, "rdma_listen failed.");
      goto err_invalidateSock;
   }

   // init delayed events queue
   _this->delayedCmEventsQ = new CmEventQueue();


   return true;


err_invalidateSock:
   _this->errState = -1;

   return false;
}

/**
 * Note: Call IBVSocket_checkDelayedEvents() after this to find out whether more events
 * are waiting.
 * Note: Because of the special way ibverbs accepts connections, it is possible that we receive
 * some other events here as well (e.g. a child socket disconnect). In these cases,
 * ACCEPTRES_IGNORE will be returned.
 *
 * @param outAcceptedSock only valid when ACCEPTRES_SUCCESS is returned
 * @param peerAddr (out) peer address
 * @param peerAddrLen (out) length of peer address
 * @return ACCEPTRES_IGNORE in case an irrelevant event occurred
 */
IBVSocket_AcceptRes IBVSocket_accept(IBVSocket* _this, IBVSocket** outAcceptedSock,
   struct sockaddr* peerAddr, socklen_t* peerAddrLen)
{
   struct rdma_cm_event* event = NULL;
   IBVCommContext* childCommContext = NULL;
   IBVSocket* acceptedSock = NULL; // auto-destructed on error/ignore (internal, not for caller)
   IBVCommDest* childRemoteDest = NULL; // auto-freed on error/ignore

   *outAcceptedSock = NULL;


   // get next waiting event from delay-queue or from event channel

   if (!_this->delayedCmEventsQ->empty())
   {
      event = _this->delayedCmEventsQ->front();
      _this->delayedCmEventsQ->pop();
   }
   else
   if(rdma_get_cm_event(_this->cm_channel, &event) )
   {
      _this->errState = -1;
      return ACCEPTRES_ERR;
   }


   // handle event type

   switch(event->event)
   {
      case RDMA_CM_EVENT_CONNECT_REQUEST:
      {
         // got an incoming 'connect request' => check validity of private data and accept/reject

         bool createContextRes;
         struct rdma_conn_param conn_param;
         bool parseCommDestRes;
         IBVCommConfig commCfg;

         struct rdma_cm_id* child_cm_id = event->id;

         //*peerAddrLen = sizeof(struct sockaddr_in);
         //memcpy(peerAddr, &child_cm_id->route.addr.dst_addr, *peerAddrLen);


         // parse private data to get remote dest

         parseCommDestRes = __IBVSocket_parseCommDest(
            event->param.conn.private_data, event->param.conn.private_data_len, &childRemoteDest);
         if(!parseCommDestRes)
         { // bad private data => reject connection
            LOG(SOCKLIB, WARNING, "Bad private data received.",
               ("len", event->param.conn.private_data_len));

            if(rdma_reject(child_cm_id, NULL, 0) )
               LOG(SOCKLIB, WARNING, "rdma_reject failed.");

            goto ignore;
         }


         // private data (remote dest) okay => create local comm context and socket instance

         // (we use the buffer config as suggested by the connecting peer)
         commCfg.bufNum = childRemoteDest->recvBufNum;
         commCfg.bufSize = childRemoteDest->recvBufSize;

         createContextRes = __IBVSocket_createCommContext(_this, child_cm_id, &commCfg,
            &childCommContext);
         if(!createContextRes)
         {
            LOG(SOCKLIB, WARNING, "Creation of CommContext failed.");

            if(rdma_reject(child_cm_id, NULL, 0) )
               LOG(SOCKLIB, WARNING, "rdma_reject failed.");

            goto ignore;
         }

         acceptedSock = __IBVSocket_constructFromCommContext(child_cm_id, childCommContext);
         if(!acceptedSock->sockValid)
            goto ignore;


         acceptedSock->remoteDest = childRemoteDest;
         childRemoteDest = NULL; // would otherwise be destroyed at 'ignore'


         // send accept message (with local destination info)

         __IBVSocket_initCommDest(childCommContext, &acceptedSock->localDest);

         memset(&conn_param, 0, sizeof(conn_param) );
#ifdef BEEGFS_NVFS
         conn_param.responder_resources = RDMA_MAX_RESP_RES;
         conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
#else
         conn_param.responder_resources = 1;
         conn_param.initiator_depth = 1;
#endif /* BEEGFS_NVFS */
         conn_param.flow_control = 0;
         conn_param.retry_count = 7; // (3 bits)
         conn_param.rnr_retry_count = 7; // rnr = receiver not ready (3 bits, 7 means infinity)
         conn_param.private_data = &acceptedSock->localDest;
         conn_param.private_data_len = sizeof(acceptedSock->localDest);

         // test point for dropping the connect request
         if(IBVSocket_connectionRejection(_this))
            goto ignore;

         if(rdma_accept(child_cm_id, &conn_param) )
         {
            LOG(SOCKLIB, WARNING, "rdma_accept failed.");

            goto ignore;
         }

         if(!__IBVSocket_initEpollFD(acceptedSock) )
            goto ignore;


         // Note that this code returns ACCEPTRES_IGNORE
         LOG(SOCKLIB, DEBUG, "Connection request on RDMASocket");
         child_cm_id->context = acceptedSock;
         acceptedSock = NULL; // would otherwise be destroyed at 'ignore'

      } break;

      case RDMA_CM_EVENT_ESTABLISHED:
      {
         // received 'established' (this is what we've actually been waiting for!)

         *peerAddrLen = sizeof(struct sockaddr_in);
         memcpy(peerAddr, &event->id->route.addr.dst_addr, *peerAddrLen);

         *outAcceptedSock = (IBVSocket*)event->id->context;

         rdma_ack_cm_event(event);

#ifdef SYSTEM_HAS_RDMA_MIGRATE_ID__disabled

         // note: this is currently disabled, because:
         // a) rdma_migrate_id always returns "invalid argument"
         // b) we need disconnect events for incoming conns to be handled and the handler must call
         //    rdma_disconnect to enable disconnect detection for the streamlistener

         // note: migration might deadlock if there are any retrieved but not yet ack'ed events
         //    for the current channel, so we cannot migrate if this is the case

         // note: the only purpose of migration to a separate channel is that we can do better
         //    disconnect detection in waitForCompletion(). so living without the migration is
         //    generally not a problem (but disconnect detection might take longer).

         if(_this->delayedCmEventsQ->size() )
         { // events waiting => don't migrate
            LOG(SOCKLIB, WARNING,
               "Skipping rdma_migrate_id due to waiting events (but we can live without it).");
         }
         else
         { // migrate cm_id from general accept-channel to its own channel
            int migrateRes = rdma_migrate_id(
               (*outAcceptedSock)->cm_id, (*outAcceptedSock)->cm_channel);

            if(migrateRes)
            {
               LOG(SOCKLIB, WARNING, "rdma_migrate_id failed (but we can live without it).",
                  migrateRes, sysErr);
            }
         }

#endif // SYSTEM_HAS_RDMA_MIGRATE_ID

         return ACCEPTRES_SUCCESS;
      } break;

      case RDMA_CM_EVENT_DISCONNECTED:
      {
         // note: be careful about what we do with the event-socket here, because the socket might
         //    already be under destruction in another thread.

         LOG(SOCKLIB, DEBUG, "Disconnect event.");

         // note: the additional disconnect call is required to get the streamlistener event
         //    channel (the one of the listen sock) to report the disconnect
         rdma_disconnect(event->id);

      } break;

      case RDMA_CM_EVENT_UNREACHABLE:
      {
         LOG(SOCKLIB, WARNING, "Remote unreachable event while waiting for 'established'.");
         acceptedSock = (IBVSocket*)event->id->context; // will be destroyed at 'ignore'
      } break;

      case RDMA_CM_EVENT_CONNECT_ERROR:
      {
         LOG(SOCKLIB, WARNING, "Connect error event while waiting for 'established'.");
         acceptedSock = (IBVSocket*)event->id->context; // will be destroyed at 'ignore'
      } break;

      case RDMA_CM_EVENT_TIMEWAIT_EXIT:
      { // log only with enabled debug code
         LOG(SOCKLIB, DEBUG, "Ignoring conn manager event RDMA_CM_EVENT_TIMEWAIT_EXIT.");
      } break;

      case RDMA_CM_EVENT_DEVICE_REMOVAL:
      {
         AbstractApp* app = PThread::getCurrentThreadApp();
         const char* devname = "unknown";
         if (event->id && event->id->verbs)
            devname = ibv_get_device_name(event->id->verbs->device);
         LOG(SOCKLIB, ERR, "Device removed", ("device", devname));
         app->handleNetworkInterfaceFailure(std::string(devname));
      } break;

      default:
      { // ignore other events
         // always log
         LOG(SOCKLIB, WARNING, "Ignoring conn manager event.",
            ("event", rdma_event_str(event->event)));
      } break;
   }


   // irrelevant event (irrelevant for the caller)
ignore:
   rdma_ack_cm_event(event);

   SAFE_FREE(childRemoteDest);
   if(acceptedSock)
      IBVSocket_destruct(acceptedSock);

   *outAcceptedSock = NULL;

   return ACCEPTRES_IGNORE;
}

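/* Note(added): accepting an RDMA connection is a two-step handshake, so a
 * listener must call IBVSocket_accept() in a loop and treat ACCEPTRES_IGNORE
 * as "keep going":
 *    1) RDMA_CM_EVENT_CONNECT_REQUEST: the child socket is created, rdma_accept()
 *       is sent and the socket is parked in child_cm_id->context => ACCEPTRES_IGNORE.
 *    2) RDMA_CM_EVENT_ESTABLISHED: the parked socket is handed to the caller
 *       => ACCEPTRES_SUCCESS.
 *
 * Illustrative listener loop (simplified; handleNewConnection is a hypothetical
 * handler, not part of this file):
 *
 *    struct sockaddr_storage peerAddr;
 *    socklen_t peerAddrLen;
 *    IBVSocket* newSock;
 *
 *    for( ; ; )
 *    {
 *       IBVSocket_AcceptRes res = IBVSocket_accept(listenSock, &newSock,
 *          (struct sockaddr*)&peerAddr, &peerAddrLen);
 *
 *       if(res == ACCEPTRES_SUCCESS)
 *          handleNewConnection(newSock);
 *       else
 *       if(res == ACCEPTRES_ERR)
 *          break;
 *       // ACCEPTRES_IGNORE: just loop again
 *    }
 */
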
bool IBVSocket_shutdown(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;


   if(!commContext)
      return true; // this socket has never been connected

   // if object is in errState, then the socket might be in an inconsistent state,
   // therefore further commands (except for disconnect) should not be executed
   if(!_this->errState && commContext->incompleteSend.numAvailable)
   { // wait for all incomplete sends
      int waitRes;

      waitRes = __IBVSocket_waitForTotalSendCompletion(
         _this, commContext->incompleteSend.numAvailable, 0, 0);
      if(waitRes < 0)
      {
         LOG(SOCKLIB, WARNING, "Waiting for incomplete send requests failed.");
         return false;
      }
   }

   __IBVSocket_disconnect(_this);

   return true;
}

/**
 * Continues an incomplete former recv() by returning immediately available data from the
 * corresponding buffer.
 */
ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this, char* buf, size_t bufLen)
{
   IBVCommContext* commContext = _this->commContext;
   int completedOffset = commContext->incompleteRecv.completedOffset;
   size_t availableLen = commContext->incompleteRecv.wc.byte_len - completedOffset;
   size_t bufIndex = commContext->incompleteRecv.wc.wr_id - IBVSOCKET_RECV_WORK_ID_OFFSET;


   if(availableLen <= bufLen)
   { // old data fits completely into buf
      memcpy(buf, &(commContext->recvBufs)[bufIndex][completedOffset], availableLen);

      commContext->incompleteRecv.isAvailable = 0;

      int postRes = __IBVSocket_postRecv(_this, _this->commContext, bufIndex);
      if(unlikely(postRes) )
         goto err_invalidateSock;

      return availableLen;
   }
   else
   { // still too much data for the buf => copy partially
      memcpy(buf, &(commContext->recvBufs)[bufIndex][completedOffset], bufLen);

      commContext->incompleteRecv.completedOffset += bufLen;

      return bufLen;
   }


err_invalidateSock:
   _this->errState = -1;

   return -1;
}

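/* Note(added): worked example of the partial-read bookkeeping above: if a work
 * completion delivered wc.byte_len == 8192 but the caller only asked for
 * bufLen == 4096, the first call copies bytes [0..4095] and advances
 * completedOffset to 4096; the next call copies [4096..8191], clears
 * isAvailable and only then re-posts the receive buffer to the QP. */
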
ssize_t IBVSocket_recv(IBVSocket* _this, char* buf, size_t bufLen, int flags)
{
   const int timeoutMS = IBVSOCKET_RECV_TIMEOUT_MS;
   ssize_t recvTRes;

   do
   {
      recvTRes = IBVSocket_recvT(_this, buf, bufLen, flags, timeoutMS);
   } while(recvTRes == -ETIMEDOUT);

   return recvTRes;
}


/**
 * @return number of received bytes on success, -ETIMEDOUT on timeout, negative value on error
 */
ssize_t IBVSocket_recvT(IBVSocket* _this, char* buf, size_t bufLen, int flags, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;
   struct ibv_wc* wc = &commContext->incompleteRecv.wc;
   int flowControlRes;
   int recvWCRes;

   if(unlikely(_this->errState) )
      return -1;

   // check whether an old buffer has not been fully read yet
   if(!commContext->incompleteRecv.isAvailable)
   { // no partially read data available => recv new buffer

      // check whether we have a pending on-send flow control packet that needs to be received first
      flowControlRes = __IBVSocket_flowControlOnSendWait(_this, timeoutMS);
      if(flowControlRes <= 0)
      {
         if(likely(!flowControlRes) )
            return -ETIMEDOUT; // timeout

         goto err_invalidateSock;
      }

      // recv a new buffer (into the incompleteRecv structure)
      recvWCRes = __IBVSocket_recvWC(_this, timeoutMS, wc);
      if(recvWCRes <= 0)
      {
         if(likely(!recvWCRes) )
            return -ETIMEDOUT; // timeout

         goto err_invalidateSock; // error occurred
      }

      // recvWC was positive => we're guaranteed to have an incompleteRecv buf available

      commContext->incompleteRecv.completedOffset = 0;
      commContext->incompleteRecv.isAvailable = 1;
   }

   return __IBVSocket_recvContinueIncomplete(_this, buf, bufLen);


err_invalidateSock:
   _this->errState = -1;

   return -ECOMM;
}

ssize_t IBVSocket_send(IBVSocket* _this, const char* buf, size_t bufLen, int flags)
{
   IBVCommContext* commContext = _this->commContext;
   int flowControlRes;
   size_t currentBufIndex;
   int postRes;
   size_t postedLen = 0;
   int currentPostLen;
   int waitRes;

   if(unlikely(_this->errState) )
      return -1;

   do
   {
      flowControlRes = __IBVSocket_flowControlOnSendWait(_this,
         _this->timeoutCfg.flowSendMS);
      if(unlikely(flowControlRes <= 0) )
         goto err_invalidateSock;

      // note: we only poll for completed sends after we used up all (!) available bufs

      if(commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum)
      { // wait for all (!) incomplete sends
         waitRes = __IBVSocket_waitForTotalSendCompletion(
            _this, commContext->incompleteSend.numAvailable, 0, 0);
         if(waitRes < 0)
            goto err_invalidateSock;

         commContext->incompleteSend.numAvailable = 0;
      }

      currentPostLen = BEEGFS_MIN(bufLen-postedLen, commContext->commCfg.bufSize);
      currentBufIndex = commContext->incompleteSend.numAvailable;

      memcpy( (commContext->sendBufs)[currentBufIndex], &buf[postedLen], currentPostLen);

      commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */

      postRes = __IBVSocket_postSend(_this, currentBufIndex, currentPostLen);
      if(unlikely(postRes) )
      {
         commContext->incompleteSend.numAvailable--;
         goto err_invalidateSock;
      }


      postedLen += currentPostLen;

   } while(postedLen < bufLen);

   return (ssize_t)bufLen;


err_invalidateSock:
   _this->errState = -1;

   return -ECOMM;
}

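/* Note(added): worked example of the chunking above: with commCfg.bufSize == 8192,
 * a 100000 byte send is copied into ceil(100000/8192) == 13 registered send bufs
 * and posted as 13 IBV_WR_SEND work requests. Send completions are only reaped
 * once all commCfg.bufNum bufs are in flight, which amortizes the
 * __IBVSocket_waitForTotalSendCompletion() polling over many sends. */
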
int __IBVSocket_registerBuf(IBVCommContext* commContext, void* buf, size_t bufLen,
   struct ibv_mr** outMR)
{
   /* note: IB spec says:
      "The consumer is not allowed to assign remote-write or remote-atomic to
      a memory region that has not been assigned local-write." */
   enum ibv_access_flags accessFlags = (enum ibv_access_flags)
      (IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);

   *outMR = ibv_reg_mr(commContext->pd, buf, bufLen, accessFlags);
   if(!*outMR)
   {
      LOG(SOCKLIB, WARNING, "Couldn't allocate MR.");
      return -1;
   }

   return 0;
}

char* __IBVSocket_allocAndRegisterBuf(IBVCommContext* commContext, size_t bufLen,
   struct ibv_mr** outMR)
{
   void* buf;
   int registerRes;

   int allocRes = posix_memalign(&buf, sysconf(_SC_PAGESIZE), bufLen);
   if(allocRes)
   {
      LOG(SOCKLIB, WARNING, "Couldn't allocate work buf.");
      return NULL;
   }

   memset(buf, 0, bufLen);

   registerRes = __IBVSocket_registerBuf(commContext, buf, bufLen, outMR);
   if(registerRes < 0)
   {
      free(buf);
      return NULL;
   }

   return (char*)buf;
}

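/* Note(added, assumption): ibv_reg_mr() itself does not require page alignment;
 * allocating page-aligned buffers here presumably just avoids pinning partial
 * pages shared with unrelated allocations, and the memset() zero-initializes
 * the buffers before they are exposed to the peer. */
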
bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
   IBVCommConfig* commCfg, IBVCommContext** outCommContext)
{
   IBVCommContext* commContext = NULL;
   int registerControlRes;
   int registerControlResReset;
   struct ibv_qp_init_attr qpInitAttr;
   int createQPRes;
   unsigned i;


   // sanity checks

   if (unlikely(commCfg->bufNum < IBVSOCKET_MIN_BUF_NUM) )
   {
      LOG(SOCKLIB, WARNING, "bufNum too small!",
         ("got", commCfg->bufNum), ("minimum", IBVSOCKET_MIN_BUF_NUM));
      goto err_cleanup;
   }

   if (unlikely(commCfg->bufSize < IBVSOCKET_MIN_BUF_SIZE) ) // sanity check
   {
      LOG(SOCKLIB, WARNING, "bufSize too small!",
         ("got", commCfg->bufSize), ("minimum", IBVSOCKET_MIN_BUF_SIZE));
      goto err_cleanup;
   }

   if (commCfg->bufSize * commCfg->bufNum > IBVSOCKET_MAX_BUF_SIZE_NUM)
   {
      LOG(SOCKLIB, WARNING, "bufSize*bufNum too large!",
         ("got", commCfg->bufSize * commCfg->bufNum),
         ("maximum", IBVSOCKET_MAX_BUF_SIZE_NUM));
      goto err_cleanup;
   }


   commContext = (IBVCommContext*)calloc(1, sizeof(*commContext) );
   if(!commContext)
      goto err_cleanup;

   commContext->context = cm_id->verbs;
   if(!commContext->context)
   {
      LOG(SOCKLIB, WARNING, "Unbound cm_id!!");
      goto err_cleanup;
   }

   commContext->pd = ibv_alloc_pd(commContext->context);
   if(!commContext->pd)
   {
      LOG(SOCKLIB, WARNING, "Couldn't allocate PD.");
      goto err_cleanup;
   }

   // alloc and register buffers...

   commContext->commCfg = *commCfg;

   commContext->recvBuf = __IBVSocket_allocAndRegisterBuf(
      commContext, commCfg->bufSize * commCfg->bufNum, &commContext->recvMR);
   if(!commContext->recvBuf)
   {
      LOG(SOCKLIB, WARNING, "Couldn't prepare recvBuf.");
      goto err_cleanup;
   }

   commContext->recvBufs = (char**)calloc(1, commCfg->bufNum * sizeof(char*) );

   for(i=0; i < commCfg->bufNum; i++)
      commContext->recvBufs[i] = &commContext->recvBuf[i * commCfg->bufSize];


   commContext->sendBuf = __IBVSocket_allocAndRegisterBuf(
      commContext, commCfg->bufSize * commCfg->bufNum, &commContext->sendMR);
   if(!commContext->sendBuf)
   {
      LOG(SOCKLIB, WARNING, "Couldn't prepare sendBuf.");
      goto err_cleanup;
   }

   commContext->sendBufs = (char**)calloc(1, commCfg->bufNum * sizeof(char*) );

   for(i=0; i < commCfg->bufNum; i++)
      commContext->sendBufs[i] = &commContext->sendBuf[i * commCfg->bufSize];


   registerControlRes = __IBVSocket_registerBuf(
      commContext, (char*)&commContext->numUsedSendBufs,
      sizeof(commContext->numUsedSendBufs), &commContext->controlMR);
   if(registerControlRes < 0)
   {
      LOG(SOCKLIB, WARNING, "Couldn't register control memory region.");
      goto err_cleanup;
   }

   registerControlResReset = __IBVSocket_registerBuf(
      commContext, (char*)&commContext->numUsedSendBufsReset,
      sizeof(commContext->numUsedSendBufsReset), &commContext->controlResetMR);
   if(registerControlResReset < 0)
   {
      LOG(SOCKLIB, WARNING, "Couldn't register control memory reset region.");
      goto err_cleanup;
   }

   // init flow control v2 (to avoid long receiver-not-ready timeouts)

   /* note: we use -1 because the last buf might not be read by the user (eg during
      nonblockingRecvCheck) and so it might not be immediately available again. */
   commContext->numReceivedBufsLeft = commCfg->bufNum - 1;
   commContext->numSendBufsLeft = commCfg->bufNum - 1;

   // create completion channel and queues...

   commContext->recvCompChannel = ibv_create_comp_channel(commContext->context);
   if(!commContext->recvCompChannel)
   {
      LOG(SOCKLIB, WARNING, "Couldn't create comp channel.");
      goto err_cleanup;
   }

   commContext->recvCQ = ibv_create_cq(
      commContext->context, commCfg->bufNum, commContext, commContext->recvCompChannel,
      rand()%commContext->context->num_comp_vectors);
   if(!commContext->recvCQ)
   {
      LOG(SOCKLIB, WARNING, "Couldn't create recv CQ.");
      goto err_cleanup;
   }

   // note: 1+commCfg->bufNum here for the RDMA write usedBufs reset work (=> flow/flood control)
   commContext->sendCQ = ibv_create_cq(
      commContext->context, 1+commCfg->bufNum, NULL, NULL,
      rand()%commContext->context->num_comp_vectors);
   if(!commContext->sendCQ)
   {
      LOG(SOCKLIB, WARNING, "Couldn't create send CQ.");
      goto err_cleanup;
   }

   // note: 1+commCfg->bufNum here for the RDMA write usedBufs reset work
   memset(&qpInitAttr, 0, sizeof(qpInitAttr) );

   qpInitAttr.send_cq = commContext->sendCQ;
   qpInitAttr.recv_cq = commContext->recvCQ;
   qpInitAttr.qp_type = IBV_QPT_RC;
   qpInitAttr.sq_sig_all = 1;
   qpInitAttr.cap.max_send_wr = 1+commCfg->bufNum;
   qpInitAttr.cap.max_recv_wr = commCfg->bufNum;
   qpInitAttr.cap.max_send_sge = 1;
   qpInitAttr.cap.max_recv_sge = 1;
   qpInitAttr.cap.max_inline_data = 0;

   createQPRes = rdma_create_qp(cm_id, commContext->pd, &qpInitAttr);
   if(createQPRes)
   {
      LOG(SOCKLIB, WARNING, "Couldn't create QP.", sysErr);
      goto err_cleanup;
   }

   commContext->qp = cm_id->qp;

   // post initial recv buffers...

   for(i=0; i < commCfg->bufNum; i++)
   {
      if(__IBVSocket_postRecv(_this, commContext, i) )
      {
         LOG(SOCKLIB, WARNING, "Couldn't post recv buffer.", ("index", i));
         goto err_cleanup;
      }
   }

   // prepare event notification...

   // initial event notification request
   if(ibv_req_notify_cq(commContext->recvCQ, 0) )
   {
      LOG(SOCKLIB, WARNING, "Couldn't request CQ notification.");
      goto err_cleanup;
   }

#ifdef BEEGFS_NVFS
   commContext->workerMRs = new MRMap();
   commContext->cqMutex = new Mutex();
   commContext->cqCompletions = new CQMap();
   // RDMA id. (This variable will increment for each RDMA operation.)
   commContext->wr_id = 1;
#endif /* BEEGFS_NVFS */

   LOG(SOCKLIB, DEBUG, __func__,
      ("_this", StringTk::uint64ToHexStr((uint64_t) _this)),
      ("device", cm_id->verbs->device->name));

   *outCommContext = commContext;
   return true;


   // error handling

err_cleanup:
   __IBVSocket_cleanupCommContext(cm_id, commContext);

   *outCommContext = NULL;
   return false;
}

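/* Note(added): the setup order above mirrors the teardown in
 * __IBVSocket_cleanupCommContext() below: PD -> registered buffers/MRs ->
 * completion channel -> recv/send CQs -> QP -> initial recvs posted ->
 * CQ notification armed. Posting all bufNum receives before the connection is
 * established means the peer cannot hit receiver-not-ready on its first
 * bufNum sends. */
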
void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext)
{
   if(!commContext)
      return;

   if(commContext->qp)
   {
      // see recommendation here: https://www.rdmamojo.com/2012/12/28/ibv_destroy_qp/
      // the qp should be set to error state, so that no more events can be pushed to that queue.

      struct ibv_qp_attr qpAttr;
      qpAttr.qp_state = IBV_QPS_ERR;
      if (ibv_modify_qp(commContext->qp, &qpAttr, IBV_QP_STATE))
      {
         LOG(SOCKLIB, WARNING, "Failed to modify qp IBV_QP_STATE.");
      }
   }

   // ack remaining delayed acks
   if(commContext->recvCQ && commContext->numUnackedRecvCompChannelEvents)
      ibv_ack_cq_events(commContext->recvCQ, commContext->numUnackedRecvCompChannelEvents);

   if(commContext->qp)
   {
      rdma_destroy_qp(cm_id);
   }

   if(commContext->sendCQ)
   {
      if(ibv_destroy_cq(commContext->sendCQ) )
         LOG(SOCKLIB, WARNING, "Failed to destroy sendCQ.");
   }

   if(commContext->recvCQ)
   {
      if(ibv_destroy_cq(commContext->recvCQ) )
         LOG(SOCKLIB, WARNING, "Failed to destroy recvCQ.");
   }

   if(commContext->recvCompChannel)
   {
      if(ibv_destroy_comp_channel(commContext->recvCompChannel) )
         LOG(SOCKLIB, WARNING, "Failed to destroy recvCompChannel.");
   }

   if(commContext->controlMR)
   {
      if(ibv_dereg_mr(commContext->controlMR) )
         LOG(SOCKLIB, WARNING, "Failed to deregister controlMR.");
   }

   if(commContext->controlResetMR)
   {
      if(ibv_dereg_mr(commContext->controlResetMR) )
         LOG(SOCKLIB, WARNING, "Failed to deregister controlResetMR.");
   }

   if(commContext->recvMR)
   {
      if(ibv_dereg_mr(commContext->recvMR) )
         LOG(SOCKLIB, WARNING, "Failed to deregister recvMR.");
   }

   if(commContext->sendMR)
   {
      if(ibv_dereg_mr(commContext->sendMR) )
         LOG(SOCKLIB, WARNING, "Failed to deregister sendMR.");
   }

#ifdef BEEGFS_NVFS
   if (commContext->workerMRs)
   {
      for (auto& iter: *(commContext->workerMRs))
      {
         if(ibv_dereg_mr(iter.second) )
            LOG(SOCKLIB, WARNING, "Failed to deregister workerMR.");
      }
      commContext->workerMRs->clear();
      delete(commContext->workerMRs);
   }

   if (commContext->cqCompletions)
   {
      commContext->cqCompletions->clear();
      delete(commContext->cqCompletions);
   }

   delete(commContext->cqMutex);
#endif /* BEEGFS_NVFS */

   SAFE_FREE(commContext->recvBuf);
   SAFE_FREE(commContext->sendBuf);
   SAFE_FREE(commContext->recvBufs);
   SAFE_FREE(commContext->sendBufs);

   if(commContext->pd)
   {
      if(ibv_dealloc_pd(commContext->pd) )
         LOG(SOCKLIB, WARNING, "Failed to dealloc pd.");
   }

   free(commContext);
}

/**
 * Initializes a (local) IBVCommDest.
 */
void __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest)
{
   memcpy(outDest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN);

   outDest->protocolVersion = HOST_TO_LE_64(IBVSOCKET_PRIVATEDATA_PROTOCOL_VER);
   outDest->rkey = HOST_TO_LE_32(commContext->controlMR->rkey);
   outDest->vaddr = HOST_TO_LE_64((uintptr_t)&commContext->numUsedSendBufs);
   outDest->recvBufNum = HOST_TO_LE_32(commContext->commCfg.bufNum);
   outDest->recvBufSize = HOST_TO_LE_32(commContext->commCfg.bufSize);
}

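/* Note(added): IBVCommDest is the handshake payload exchanged as rdma_conn_param
 * private_data in both directions. Per the init/parse routines here it carries
 * (all multi-byte fields little-endian on the wire):
 *    verificationStr  - magic string to recognize BeeGFS peers
 *    protocolVersion  - private data protocol version (64 bit)
 *    rkey / vaddr     - remote key and address of numUsedSendBufs, so the peer
 *                       can update our flow control counter via RDMA write
 *    recvBufNum/Size  - the receiver's buffer config, adopted by the accepting
 *                       side in IBVSocket_accept()
 * (The exact struct layout/packing is defined in IBVSocket.h.) */
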
/**
 * Checks and parses a (remote) IBVCommDest.
 *
 * @param buf should usually be the private_data of the connection handshake
 * @param outDest will be alloced (if true is returned) and needs to be free'd by the caller
 * @return true if data is okay, false otherwise
 */
bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest)
{
   IBVCommDest* dest = NULL;

   *outDest = NULL;


   // Note: "bufLen < ..." (and not "!="), because there might be some extra padding
   if(!buf || (bufLen < sizeof(*dest) ) )
   {
      LOG(SOCKLIB, WARNING, "Bad private data size.", bufLen);

      return false;
   }

   dest = (IBVCommDest*)malloc(sizeof(*dest) );
   if(!dest)
      return false;

   memcpy(dest, buf, sizeof(*dest) );

   if(memcmp(dest->verificationStr, IBVSOCKET_PRIVATEDATA_STR, IBVSOCKET_PRIVATEDATA_STR_LEN) != 0 )
      goto err_cleanup;

   dest->protocolVersion = LE_TO_HOST_64(dest->protocolVersion);

   if (dest->protocolVersion != IBVSOCKET_PRIVATEDATA_PROTOCOL_VER)
      goto err_cleanup;

   dest->rkey = LE_TO_HOST_32(dest->rkey);
   dest->vaddr = LE_TO_HOST_64(dest->vaddr);
   dest->recvBufNum = LE_TO_HOST_32(dest->recvBufNum);
   dest->recvBufSize = LE_TO_HOST_32(dest->recvBufSize);

   *outDest = dest;

   return true;


err_cleanup:
   SAFE_FREE(dest);

   return false;
}

/**
 * Append buffer to receive queue.
 *
 * @param commContext passed separately because it's not the _this->commContext during
 *    accept() of incoming connections
 * @return 0 on success, -1 on error
 */
int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex)
{
   struct ibv_sge list;
   struct ibv_recv_wr wr;
   struct ibv_recv_wr* bad_wr;
   int postRes;

   list.addr = (uint64_t)commContext->recvBufs[bufIndex];
   list.length = commContext->commCfg.bufSize;
   list.lkey = commContext->recvMR->lkey;

   wr.next = NULL;
   wr.wr_id = bufIndex + IBVSOCKET_RECV_WORK_ID_OFFSET;
   wr.sg_list = &list;
   wr.num_sge = 1;

   postRes = ibv_post_recv(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      LOG(SOCKLIB, WARNING, "ibv_post_recv failed.", postRes, sysErr(postRes));
      return -1;
   }

   return 0;
}

/**
 * Synchronous RDMA write (waits for completion).
 *
 * @return 0 on success, -1 on error
 */
int __IBVSocket_postWrite(IBVSocket* _this, IBVCommDest* remoteDest,
   struct ibv_mr* localMR, char* localBuf, int bufLen)
{
   IBVCommContext* commContext = _this->commContext;
   struct ibv_sge list;
   struct ibv_send_wr wr;
   struct ibv_send_wr *bad_wr;
   int postRes;
   int waitRes;

   list.addr = (uint64_t)localBuf;
   list.length = bufLen;
   list.lkey = localMR->lkey;

   wr.wr.rdma.remote_addr = remoteDest->vaddr;
   wr.wr.rdma.rkey = remoteDest->rkey;

   wr.wr_id = IBVSOCKET_WRITE_WORK_ID;
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_WRITE;
   wr.send_flags = IBV_SEND_SIGNALED;
   wr.next = NULL;

   postRes = ibv_post_send(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes));
      return -1;
   }

   waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
      commContext->incompleteSend.numAvailable, 1, 0);
   if(unlikely(waitRes) )
      return -1;

   commContext->incompleteSend.numAvailable = 0;

   return 0;
}

/**
 * Synchronous RDMA read (waits for completion).
 *
 * @return 0 on success, -1 on error
 */
int __IBVSocket_postRead(IBVSocket* _this, IBVCommDest* remoteDest,
   struct ibv_mr* localMR, char* localBuf, int bufLen)
{
   IBVCommContext* commContext = _this->commContext;
   struct ibv_sge list;
   struct ibv_send_wr wr;
   struct ibv_send_wr *bad_wr;
   int postRes;
   int waitRes;

   list.addr = (uint64_t) localBuf;
   list.length = bufLen;
   list.lkey = localMR->lkey;

   wr.wr.rdma.remote_addr = remoteDest->vaddr;
   wr.wr.rdma.rkey = remoteDest->rkey;

   wr.wr_id = IBVSOCKET_READ_WORK_ID;
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_READ;
   wr.send_flags = IBV_SEND_SIGNALED;
   wr.next = NULL;

   postRes = ibv_post_send(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes));
      return -1;
   }

   waitRes = __IBVSocket_waitForTotalSendCompletion(_this,
      commContext->incompleteSend.numAvailable, 0, 1);
   if(unlikely(waitRes) )
      return -1;

   commContext->incompleteSend.numAvailable = 0;

   return 0;
}

#ifdef BEEGFS_NVFS
static bool __IBVSocket_getBufferKey(IBVCommContext *commContext, char *buffer, unsigned *key)
{
   struct ibv_mr *mr = NULL;

   MRMap::const_iterator iter = commContext->workerMRs->find(buffer);

   if (iter == commContext->workerMRs->end())
   {
      // It is assumed that buffer came from a Worker and is WORKER_BUFOUT_SIZE.
      // TODO: pass around a Buffer with a length instead of unqualified char*.
      // This cache of ibv_mr will potentially grow to Workers * Targets
      // and the ibv_mr instances hang around until the IBVSocket is destroyed.
      // That is probably something to look into...
      if (unlikely(__IBVSocket_registerBuf(commContext, buffer, WORKER_BUFOUT_SIZE, &mr)))
      {
         LOG(SOCKLIB, WARNING, "ibv_postWrite(): failed to register buffer.");
         return false;
      }

      commContext->workerMRs->insert({buffer, mr});
   }
   else
   {
      mr = iter->second;
   }

   *key = mr->lkey;
   return true;
}

/**
 * Wait for the completion of a specific RDMA operation.
 * @return 0 on success (completion found), -1 in case of an error
 */
static int __IBVSocket_waitForRDMACompletion(IBVCommContext* commContext, uint64_t id)
{
   struct ibv_wc wc[IBVSOCKET_WC_ENTRIES];
   int i = 0;
   int found = 0;
   int status = 0;
   int num_wc = 0;

   /*
    * This function is locked so that we don't get a race condition between two workers
    * looking for completions.
    */
   commContext->cqMutex->lock();
   CQMap::const_iterator iter = commContext->cqCompletions->find(id);

   /*
    * Check to see if we have already found the completion we are looking for.
    */
   if (iter != commContext->cqCompletions->end())
   {
      commContext->cqCompletions->erase(id);
      commContext->cqMutex->unlock();
      return 0;
   }

   /*
    * Continue to poll the CQ until we find the entry in question or we encounter a
    * bad status.
    */
   while (!found && !status)
   {
      num_wc = ibv_poll_cq(commContext->sendCQ, IBVSOCKET_WC_ENTRIES, wc);
      if (num_wc > 0)
      {
         for (i = 0; i < num_wc; i++)
         {
            if (unlikely(wc[i].status != IBV_WC_SUCCESS))
            {
               LOG(SOCKLIB, DEBUG, "Connection error.", wc[i].status);
               status = -1;
               break;
            }

            if ((wc[i].opcode == IBV_WC_RDMA_WRITE) || (wc[i].opcode == IBV_WC_RDMA_READ))
            {
               if (wc[i].wr_id == id)
               {
                  found = 1;
               }
               else
               {
                  commContext->cqCompletions->insert({wc[i].wr_id, wc[i].opcode});
               }
            }
            else if (wc[i].opcode == IBV_WC_SEND)
            {
               if (likely(commContext->incompleteSend.numAvailable))
               {
                  commContext->incompleteSend.numAvailable--;
               }
               else
               {
                  LOG(SOCKLIB, WARNING, "Received bad/unexpected send completion.");
                  status = -1;
                  break;
               }
            }
            else
            {
               LOG(SOCKLIB, WARNING, "Received unexpected CQ opcode.", wc[i].opcode);
               status = -1;
               break;
            }
         }
      }
   }

   commContext->cqMutex->unlock();
   return status;
}

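/* Note(added): the cqCompletions map above exists because the send CQ is shared
 * by concurrent workers: a worker polling for its own wr_id may drain another
 * worker's RDMA completion first. Instead of dropping it, the completion is
 * stashed in the map keyed by wr_id, and the owning worker picks it up via the
 * fast-path lookup at the top of the function on its next call. */
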
/**
 * Process RDMA requests.
 *
 * @return 0 on success, -1 on error
 */
static int __IBVSocket_postRDMA(IBVSocket* _this, ibv_wr_opcode opcode,
   char* localBuf, int bufLen, unsigned lkey,
   uint64_t remoteBuf, unsigned rkey)
{
   IBVCommContext* commContext = _this->commContext;
   struct ibv_sge list;
   struct ibv_send_wr wr;
   struct ibv_send_wr *bad_wr;
   int postRes;
   int waitRes;

   if (unlikely(lkey == 0))
   {
      if (unlikely(!__IBVSocket_getBufferKey(commContext, localBuf, &lkey)))
      {
         LOG(SOCKLIB, WARNING, "ibv_postRDMA(): no local key.");
         return -1;
      }
   }

   list.addr = (uint64_t) localBuf;
   list.length = bufLen;
   list.lkey = lkey;

   wr.wr_id = __atomic_fetch_add(&commContext->wr_id, 1, __ATOMIC_SEQ_CST);
   wr.next = NULL;
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = opcode;
   wr.send_flags = IBV_SEND_SIGNALED;
   wr.wr.rdma.remote_addr = remoteBuf;
   wr.wr.rdma.rkey = rkey;

   postRes = ibv_post_send(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes));
      return -1;
   }

   waitRes = __IBVSocket_waitForRDMACompletion(commContext, wr.wr_id);
   return waitRes;
}

int __IBVSocket_postWrite(IBVSocket* _this, char* localBuf, int bufLen,
   unsigned lkey, uint64_t remoteBuf, unsigned rkey)
{
   return __IBVSocket_postRDMA(_this, IBV_WR_RDMA_WRITE, localBuf, bufLen,
      lkey, remoteBuf, rkey);
}

int __IBVSocket_postRead(IBVSocket* _this, char* localBuf, int bufLen,
   unsigned lkey, uint64_t remoteBuf, unsigned rkey)
{
   return __IBVSocket_postRDMA(_this, IBV_WR_RDMA_READ, localBuf, bufLen,
      lkey, remoteBuf, rkey);
}

ssize_t IBVSocket_read(IBVSocket* _this, const char* buf, size_t bufLen,
   unsigned lkey, const uint64_t rbuf, unsigned rkey)
{
   return __IBVSocket_postRead(_this, (char *)buf, bufLen, lkey, rbuf, rkey);
}

ssize_t IBVSocket_write(IBVSocket* _this, const char* buf, size_t bufLen,
   unsigned lkey, const uint64_t rbuf, unsigned rkey)
{
   return __IBVSocket_postWrite(_this, (char *)buf, bufLen, lkey, rbuf, rkey);
}

#endif /* BEEGFS_NVFS */

/**
 * Note: Contains flow control.
 *
 * @return 0 on success, -1 on error
 */
int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex, int bufLen)
{
   IBVCommContext* commContext = _this->commContext;
   struct ibv_sge list;
   struct ibv_send_wr wr;
   struct ibv_send_wr *bad_wr;
   int postRes;

   list.addr = (uint64_t)commContext->sendBufs[bufIndex];
   list.length = bufLen;
   list.lkey = commContext->sendMR->lkey;

   wr.wr_id = bufIndex + IBVSOCKET_SEND_WORK_ID_OFFSET;
   wr.next = NULL;
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_SEND;
   wr.send_flags = IBV_SEND_SIGNALED;

   postRes = ibv_post_send(commContext->qp, &wr, &bad_wr);
   if(unlikely(postRes) )
   {
      LOG(SOCKLIB, WARNING, "ibv_post_send() failed.", sysErr(postRes));
      return -1;
   }

   // flow control
   __IBVSocket_flowControlOnSendUpdateCounters(_this);

   return 0;
}

/**
 * Note: Contains flow control.
 *
 * @return 1 on success, 0 on timeout, -1 on error
 */
int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ibv_wc* outWC)
{
   IBVCommContext* commContext = _this->commContext;
   size_t bufIndex;

   int waitRes = __IBVSocket_waitForRecvCompletionEvent(_this, timeoutMS, outWC);
   if(waitRes <= 0)
   { // (note: waitRes==0 can often happen, because we call this with timeoutMS==0)

      if(unlikely(waitRes < 0) )
         LOG(SOCKLIB, DEBUG, "Retrieval of completion event failed.", waitRes);
      else
      if(unlikely(timeoutMS) )
         LOG(SOCKLIB, DEBUG, "Waiting for recv completion timed out.");

      return waitRes;
   }

   // we got something...

   if(unlikely(outWC->status != IBV_WC_SUCCESS) )
   {
      LOG(SOCKLIB, DEBUG, "Connection error.", outWC->status);
      return -1;
   }

   bufIndex = outWC->wr_id - IBVSOCKET_RECV_WORK_ID_OFFSET;

   if(unlikely(bufIndex >= commContext->commCfg.bufNum) )
   {
      LOG(SOCKLIB, WARNING, "Completion for unknown/invalid wr_id.", outWC->wr_id);
      return -1;
   }

   // receive completed

   //printf("%s: Received %u bytes.\n", __func__, outWC->byte_len); // debug in

   // flow control

   if(unlikely(__IBVSocket_flowControlOnRecv(_this, timeoutMS) ) )
      return -1;

   return 1;
}

/**
 * Intention: Avoid IB rnr by sending control msg when (almost) all our recv bufs are used up to
 * show that we got our new recv bufs ready.
 *
 * @return 0 on success, -1 on error
 */
int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;

   // we received a packet, so peer has received all of our currently pending data => reset counter
   commContext->numSendBufsLeft = commContext->commCfg.bufNum - 1; /* (see
      createCommContext() for "-1" reason) */

   // send control packet if recv counter expires...

#ifdef BEEGFS_DEBUG
   if(!commContext->numReceivedBufsLeft)
      LOG(SOCKLIB, WARNING, "BUG: numReceivedBufsLeft underflow!");
#endif // BEEGFS_DEBUG

   commContext->numReceivedBufsLeft--;

   if(!commContext->numReceivedBufsLeft)
   {
      size_t currentBufIndex;
      int postRes;

      if(commContext->incompleteSend.numAvailable == commContext->commCfg.bufNum)
      { // wait for all (!) incomplete sends

         /* note: it's ok that all send bufs are used up, because it's possible that we do a lot of
            recv without the user sending any data in between (so the bufs were actually used up by
            flow control). */

         int waitRes = __IBVSocket_waitForTotalSendCompletion(
            _this, commContext->incompleteSend.numAvailable, 0, 0);
         if(waitRes < 0)
            return -1;

         commContext->incompleteSend.numAvailable = 0;
      }

      currentBufIndex = commContext->incompleteSend.numAvailable;

      commContext->incompleteSend.numAvailable++; /* inc'ed before postSend() for conn checks */

      postRes = __IBVSocket_postSend(_this, currentBufIndex, IBVSOCKET_FLOWCONTROL_MSG_LEN);
      if(unlikely(postRes) )
      {
         commContext->incompleteSend.numAvailable--;
         return -1;
      }


      // note: numReceivedBufsLeft is reset during postSend() flow control
   }

   return 0;
}

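/* Note(added): worked example of the credit scheme: with commCfg.bufNum == 4,
 * both sides start with 3 credits (bufNum - 1, see createCommContext()). Every
 * received packet costs one numReceivedBufsLeft credit; when it hits 0 after
 * 3 receives without an intervening send, a 1-byte control message
 * (IBVSOCKET_FLOWCONTROL_MSG_LEN) is sent to tell the peer its data was
 * consumed. Symmetrically, a sender whose numSendBufsLeft hits 0 blocks in
 * _flowControlOnSendWait() until that control message arrives, so the peer's
 * receive queue is not overrun (avoiding IB receiver-not-ready retries). */
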
/**
 * Called after sending a packet to update flow control counters.
 *
 * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up.
 *
 * Note: This is only one part of the on-send flow control. The other one is
 *    _flowControlOnSendWait().
 */
void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;

   // we sent a packet, so we received all currently pending data from the peer => reset counter
   commContext->numReceivedBufsLeft = commContext->commCfg.bufNum - 1; /* (see
      createCommContext() for "-1" reason) */

#ifdef BEEGFS_DEBUG

   if(!commContext->numSendBufsLeft)
      LOG(SOCKLIB, WARNING, "BUG: numSendBufsLeft underflow!");

#endif

   commContext->numSendBufsLeft--;
}

/**
 * Intention: Avoid IB rnr by waiting for control msg when (almost) all peer bufs are used up.
 *
 * @param timeoutMS may be 0 for non-blocking operation, otherwise typically
 *    IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS
 * @return >0 on success, 0 on timeout (waiting for flow control packet from peer), <0 on error
 */
int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS)
{
   IBVCommContext* commContext = _this->commContext;

   struct ibv_wc wc;
   int recvRes;
   size_t bufIndex;
   int postRecvRes;

   if(commContext->numSendBufsLeft)
      return 1; // flow control not triggered yet

   recvRes = __IBVSocket_recvWC(_this, timeoutMS, &wc);
   if(recvRes <= 0)
      return recvRes;

   bufIndex = wc.wr_id - IBVSOCKET_RECV_WORK_ID_OFFSET;

   if(unlikely(wc.byte_len != IBVSOCKET_FLOWCONTROL_MSG_LEN) )
   { // error (bad length)
      LOG(SOCKLIB, WARNING, "Received flow control packet length mismatch.", wc.byte_len);
      return -1;
   }

   postRecvRes = __IBVSocket_postRecv(_this, commContext, bufIndex);
   if(postRecvRes)
      return -1;

   // note: numSendBufsLeft is reset during recvWC() (if it actually received a packet)

   return 1;
}


/**
|
|
* @return 1 on available data, 0 on timeout, -1 on error
|
|
*/
|
|
int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS, struct ibv_wc* outWC)
|
|
{
|
|
/* Note: This will also be called with timeoutMS==0 from nonblockingRecvCheck to remove
|
|
* a potentially outdated event notification. for this reason, we have to check the event
|
|
* channel even if "ibv_poll_cq returns 0" and "timeoutMS==0". */
|
|
|
|
IBVCommContext* commContext = _this->commContext;
|
|
struct ibv_cq* ev_cq; // event completion queue
|
|
void* ev_ctx; // event context
|
|
struct epoll_event epollEvent;
|
|
|
|
// check quick path (is an event available without waiting?)
|
|
|
|
int numImmediateEvents = ibv_poll_cq(commContext->recvCQ, 1, outWC);
|
|
if(unlikely(numImmediateEvents < 0) )
|
|
{
|
|
LOG(SOCKLIB, WARNING, "Poll CQ failed.", numImmediateEvents);
|
|
return -1;
|
|
}
|
|
else
|
|
if(numImmediateEvents > 0)
|
|
return 1;

   // no immediate event available => wait for them...

   for( ; ; ) /* (loop until "wc retrieved" or "timeout" or "error") */
   {
      /* note: we use pollTimeoutMS to check the conn every few secs (otherwise we might
         wait for a very long time in case the other side disconnected silently) */
      int pollTimeoutMS = BEEGFS_MIN(_this->timeoutCfg.pollMS, timeoutMS);

      int epollRes = epoll_wait(_this->epollFD, &epollEvent, 1, pollTimeoutMS);
      if(unlikely(epollRes < 0) )
      {
         if(errno == EINTR)
            continue; // ignore EINTR, because debugger causes it

         LOG(SOCKLIB, WARNING, "Epoll error.", sysErr);
         return -1;
      }

      if(epollRes == 0)
      { // poll timed out

         // Note: we check "timeoutMS != 0" here because we don't want to run the
         // connCheck each time this method is called from nonblockingRecvCheck
         if(timeoutMS)
         {
            int checkRes = IBVSocket_checkConnection(_this);
            if(checkRes < 0)
               return -1;
         }

         timeoutMS -= pollTimeoutMS;
         if(!timeoutMS)
            return 0;

         continue;
      }

      if(unlikely(_this->cm_channel &&
         (epollEvent.data.fd == _this->cm_channel->fd) ) )
      { // cm event incoming
         struct rdma_cm_event* event = 0;

         if (rdma_get_cm_event(_this->cm_channel, &event) < 0)
         {
            LOG(SOCKLIB, DEBUG, "Disconnected by rdma_get_cm_event error.");

            _this->errState = -1;
            return -1;
         }

         // Note: this code doesn't encounter RDMA_CM_EVENT_DEVICE_REMOVAL
         if(event->event == RDMA_CM_EVENT_DISCONNECTED)
         {
            LOG(SOCKLIB, DEBUG, "Disconnect event received.");

            rdma_ack_cm_event(event);

            _this->errState = -1;
            return -1;
         }
         else
         {
            LOG(SOCKLIB, DEBUG, "Ignoring received event",
               ("event", rdma_event_str(event->event))); // debug in

            rdma_ack_cm_event(event);

            continue;
         }
      }

      // we received a completion event notification => retrieve the event...

      int getEventRes = ibv_get_cq_event(commContext->recvCompChannel, &ev_cq, &ev_ctx);
      if(unlikely(getEventRes) )
      {
         LOG(SOCKLIB, WARNING, "Failed to get cq_event.");
         return -1;
      }

      if(unlikely(ev_cq != commContext->recvCQ) )
      {
         LOG(SOCKLIB, WARNING, "CQ event for unknown CQ.", ev_cq);
         return -1;
      }

      // request notification for next event

      int reqNotifyRes = ibv_req_notify_cq(commContext->recvCQ, 0);
      if(unlikely(reqNotifyRes) )
      {
         LOG(SOCKLIB, WARNING, "Couldn't request CQ notification.");
         return -1;
      }
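
      /* note: we re-arm the CQ notification before polling below; a completion that arrives
       * between the poll and the next epoll_wait() then still raises a channel event. This
       * can produce spurious wakeups (handled as "outdated events" below), but never lost
       * ones. */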

      // ack is expensive, so we gather and ack multiple events
      // note: spec says we need this, but current send_bw.c & co don't use ibv_ack_cq_events.

      commContext->numUnackedRecvCompChannelEvents++;
      if(commContext->numUnackedRecvCompChannelEvents == IBVSOCKET_EVENTS_GATHER_NUM)
      { // ack events and reset counter
         ibv_ack_cq_events(commContext->recvCQ, commContext->numUnackedRecvCompChannelEvents);
         commContext->numUnackedRecvCompChannelEvents = 0;
      }

      // query event...

      /* note: ibv_poll_cq() does not necessarily return "!=0" after a received event, because
         the event might be outdated */

      int numEvents = ibv_poll_cq(commContext->recvCQ, 1, outWC);
      if(unlikely(numEvents < 0) )
      {
         LOG(SOCKLIB, WARNING, "Poll CQ failed.", numEvents);
         return -1;
      }
      else
      if(numEvents > 0)
         return 1;

      // we received a notification for an outdated event => wait again in the next round

   } // end of for-loop

}

/**
 * @return 0 when all expected completions were received, or -1 in case of an error
 */
int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this,
   int numSendElements, int numWriteElements, int numReadElements)
{
   IBVCommContext* commContext = _this->commContext;
   int numElements;
   int i;
   size_t bufIndex;
   struct ibv_wc wc[2];
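
   /* note: this busy-polls the send CQ (no completion channel is used here) and counts down
    * the expected completions per opcode; send completions carry a wr_id that maps back to
    * the send buffer index via IBVSOCKET_SEND_WORK_ID_OFFSET. */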

   do
   {
      numElements = ibv_poll_cq(commContext->sendCQ, 2, wc);
      if(unlikely(numElements < 0) )
      {
         LOG(SOCKLIB, WARNING, "Bad ibv_poll_cq result.", numElements);

         return -1;
      }

      // for each completion element
      for(i=0; i < numElements; i++)
      {
         if(unlikely(wc[i].status != IBV_WC_SUCCESS) )
         {
            LOG(SOCKLIB, DEBUG, "Connection error.", wc[i].status);
            return -1;
         }

         switch(wc[i].opcode)
         {
            case IBV_WC_SEND:
            {
               bufIndex = wc[i].wr_id - IBVSOCKET_SEND_WORK_ID_OFFSET;

               if(unlikely(bufIndex >= commContext->commCfg.bufNum) )
               {
                  LOG(SOCKLIB, WARNING, "Bad send completion wr_id.", wc[i].wr_id);
                  return -1;
               }

               if(likely(numSendElements) )
                  numSendElements--;
               else
               {
                  LOG(SOCKLIB, WARNING, "Received bad/unexpected send completion.");

                  return -1;
               }

            } break;

            case IBV_WC_RDMA_WRITE:
            {
               if(unlikely(wc[i].wr_id != IBVSOCKET_WRITE_WORK_ID) )
               {
                  LOG(SOCKLIB, WARNING, "Bad write completion wr_id.", wc[i].wr_id);

                  return -1;
               }

               if(likely(numWriteElements) )
                  numWriteElements--;
               else
               {
                  LOG(SOCKLIB, WARNING, "Received bad/unexpected RDMA write completion.");

                  return -1;
               }
            } break;

            case IBV_WC_RDMA_READ:
            {
               if(unlikely(wc[i].wr_id != IBVSOCKET_READ_WORK_ID) )
               {
                  LOG(SOCKLIB, WARNING, "Bad read completion wr_id.", wc[i].wr_id);

                  return -1;
               }

               if(likely(numReadElements) )
                  numReadElements--;
               else
               {
                  LOG(SOCKLIB, WARNING, "Received bad/unexpected RDMA read completion.");

                  return -1;
               }
            } break;

            default:
            {
               LOG(SOCKLIB, WARNING, "Bad/unexpected completion opcode.", wc[i].opcode);

               return -1;
            } break;

         } // end of switch

      } // end of for-loop

   } while(numSendElements || numWriteElements || numReadElements);

   return 0;
}

/**
 * @return 0 on success, -1 on error
 */
int IBVSocket_checkConnection(IBVSocket* _this)
{
   struct ibv_qp_attr qpAttr;
   struct ibv_qp_init_attr qpInitAttr;
   int qpRes;
   int postRes;
   IBVCommContext* commContext = _this->commContext;

   //printf("%s: querying qp...\n", __func__); // debug in

   // check qp status
   qpRes = ibv_query_qp(commContext->qp, &qpAttr, IBV_QP_STATE, &qpInitAttr);
   if(qpRes || (qpAttr.qp_state == IBV_QPS_ERR) )
   {
      LOG(SOCKLIB, WARNING, "Detected QP error state.");

      _this->errState = -1;
      return -1;
   }
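
   /* note: the RDMA read below also serves as a liveness probe: it can only complete if the
    * remote HCA still responds (no remote CPU involvement), so a silently dead peer surfaces
    * as an error here or as a QP error on a later check. */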

   // note: we read a remote value into the numUsedSendBufsReset field, which is actually
   // meant for something else, so we need to reset the value afterwards

   //printf("%d:%s: post rdma_read to check connection...\n", __LINE__, __func__); // debug in

   postRes = __IBVSocket_postRead(_this, _this->remoteDest, commContext->controlResetMR,
      (char*)&commContext->numUsedSendBufsReset, sizeof(commContext->numUsedSendBufsReset) );
   if(postRes)
   {
      _this->errState = -1;
      return -1;
   }

   commContext->numUsedSendBufsReset = 0;

   //printf("%d:%s: rdma_read succeeded\n", __LINE__, __func__); // debug in

   return 0;
}

/**
 * @return <0 on error, 0 if recv would block, >0 if recv would not block
 */
ssize_t IBVSocket_nonblockingRecvCheck(IBVSocket* _this)
{
   /* note: this will also be called from the stream listener for false alarm checks, so make
    * sure that we remove an (outdated) event from the channel to mute the false alarm. */

   IBVCommContext* commContext = _this->commContext;
   struct ibv_wc* wc = &commContext->incompleteRecv.wc;
   int flowControlRes;
   int recvRes;

   if(unlikely(_this->errState) )
      return -1;

   if(commContext->incompleteRecv.isAvailable)
      return 1;

   // check whether we have a pending on-send flow control packet that needs to be received first
   flowControlRes = __IBVSocket_flowControlOnSendWait(_this, 0);
   if(unlikely(flowControlRes < 0) )
      goto err_invalidateSock;

   if(!flowControlRes)
      return 0;

   // recv one packet (if available) and add it as incompleteRecv
   // or remove event channel notification otherwise (to avoid endless false alerts)
   recvRes = __IBVSocket_recvWC(_this, 0, wc);
   if(unlikely(recvRes < 0) )
      goto err_invalidateSock;

   if(!recvRes)
      return 0;

   // we got something => prepare to continue later
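   // (the stashed completion is picked up later by the recv path, which resumes reading the
   // buffer at completedOffset)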

   commContext->incompleteRecv.completedOffset = 0;
   commContext->incompleteRecv.isAvailable = 1;

   return 1;


err_invalidateSock:
   _this->errState = -1;
   return -1;
}
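
/* Usage sketch for IBVSocket_checkDelayedEvents() below (illustrative only): a stream
 * listener would keep accepting as long as delayed events are pending, e.g.:
 *
 *    do
 *    {
 *       // ...accept() the next incoming connection...
 *    } while(IBVSocket_checkDelayedEvents(listenSock) );
 */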

/**
 * Call this after accept() to find out whether more events are waiting (for which
 * no notification would be delivered through the file descriptor).
 *
 * @return true if more events are waiting and accept() should be called again
 */
bool IBVSocket_checkDelayedEvents(IBVSocket* _this)
{
   bool retVal = false;
   struct rdma_cm_event* event;

   // check for events in the delay queue
   if (!_this->delayedCmEventsQ->empty())
      return true;

   // Switch channel fd to non-blocking, check for waiting events and switch back to blocking.
   // (Quite inefficient, but we really don't have to care about efficiency in this case.)

   // Note: We do this to avoid race conditions (lost events) before we're waiting for
   // new notifications with poll()

   // change mode of the connection manager channel to non-blocking
   int oldChannelFlags = fcntl(IBVSocket_getConnManagerFD(_this), F_GETFL);

   int setNewFlagsRes = fcntl(
      IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags | O_NONBLOCK);
   if(setNewFlagsRes < 0)
   {
      LOG(SOCKLIB, WARNING, "Set conn manager channel non-blocking failed.", sysErr);
      return false;
   }

   // (non-blocking) check for new events
   if(rdma_get_cm_event(_this->cm_channel, &event) )
   {
      // non-blocking mode, so we ignore "pseudo-errors" here
   }
   else
   { // incoming event available
      //printf("%d:%s: enqueueing an event (during non-blocking check): %d (%s)\n",
      //   __LINE__, __func__, event->event, rdma_event_str(event->event) ); // debug in

      _this->delayedCmEventsQ->push(event);

      retVal = true;
   }

   // change channel mode back to blocking
   int setOldFlagsRes = fcntl(IBVSocket_getConnManagerFD(_this), F_SETFL, oldChannelFlags);
   if(setOldFlagsRes < 0)
   {
      LOG(SOCKLIB, WARNING, "Set conn manager channel blocking failed.", sysErr);
      return false;
   }

   return retVal;
}

void __IBVSocket_disconnect(IBVSocket* _this)
{
   /* note: we only call rdma_disconnect() here if the socket is not connected to the common
      listen sock event channel, to avoid a race condition (the sock accept method also calls
      rdma_disconnect() in the streamlistener thread). ...but that's ok, we really don't need
      that additional event if we're actively disconnecting the sock. */

   if(_this->cm_channel)
   {
      int disconnectRes = rdma_disconnect(_this->cm_id);
      if(disconnectRes)
      {
         LOG(SOCKLIB, WARNING, "rdma disconnect error.", sysErr);
         return;
      }

      // note: we can't wait for events here, because the disconnect event might
      // be received by the listen socket channel (for accepted sockets with older
      // ofed versions).

      /*
      if(!_this->cm_channel || !waitForEvent)
         return;

      rdma_get_cm_event(_this->cm_channel, &event);
      if(event->event != RDMA_CM_EVENT_DISCONNECTED)
      {
         SyslogLogger::log(LOG_WARNING, "%s: unexpected event during disconnect %d: %s\n",
            __func__, event->event, rdma_event_str(event->event) );
      }

      rdma_ack_cm_event(event);
      */
   }

}

void __IBVSocket_close(IBVSocket* _this)
{
   SAFE_FREE(_this->remoteDest);

   if(_this->delayedCmEventsQ)
   { // ack all queued events
      while (!_this->delayedCmEventsQ->empty())
      {
         struct rdma_cm_event* nextEvent = _this->delayedCmEventsQ->front();

         rdma_ack_cm_event(nextEvent);

         _this->delayedCmEventsQ->pop();
      }

      delete(_this->delayedCmEventsQ);
   }
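
   /* note: teardown order matters here: the commContext resources (QP, CQs, buffers) are
    * associated with the cm_id, so they must be cleaned up before the id and its event
    * channel are destroyed. */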

   if(_this->commContext)
      __IBVSocket_cleanupCommContext(_this->cm_id, _this->commContext);

   if(_this->cm_id)
      rdma_destroy_id(_this->cm_id);
   if(_this->cm_channel)
      rdma_destroy_event_channel(_this->cm_channel);
}

/**
 * Note: Call this for connected sockets only.
 */
bool __IBVSocket_initEpollFD(IBVSocket* _this)
{
   _this->epollFD = epoll_create(1); // "1" is just a hint (and is actually ignored)
   if(_this->epollFD == -1)
   {
      LOG(SOCKLIB, WARNING, "epoll initialization error.", sysErr);
      return false;
   }

   struct epoll_event epollEvent;

   epollEvent.events = EPOLLIN;
   epollEvent.data.fd = IBVSocket_getRecvCompletionFD(_this);

   // note: we only add the recvCompletionFD here and not commContext->context->async_fd, because
   // accepted sockets don't have their own async event channel (they receive events through
   // their parent's fd)

   int epollAddRes = epoll_ctl(_this->epollFD, EPOLL_CTL_ADD,
      IBVSocket_getRecvCompletionFD(_this), &epollEvent);

   if(epollAddRes == -1)
   {
      LOG(SOCKLIB, WARNING, "Unable to add sock to epoll set.", sysErr);

      close(_this->epollFD);
      _this->epollFD = -1;

      return false;
   }
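
   /* also watch the cm event channel, so that __IBVSocket_waitForRecvCompletionEvent() notices
    * disconnect events while it is blocked waiting for incoming data */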

   if(_this->cm_channel)
   {
      epollEvent.events = EPOLLIN;
      epollEvent.data.fd = _this->cm_channel->fd;

      int epollAddRes = epoll_ctl(_this->epollFD, EPOLL_CTL_ADD,
         _this->cm_channel->fd, &epollEvent);

      if(epollAddRes == -1)
      {
         LOG(SOCKLIB, WARNING, "Unable to add sock to epoll set.", sysErr);

         close(_this->epollFD);
         _this->epollFD = -1;

         return false;
      }
   }

   return true;
}

/**
 * @return pointer to a static human-readable string for a wc status code
 */
const char* __IBVSocket_wcStatusStr(int wcStatusCode)
{
   switch(wcStatusCode)
   {
      case IBV_WC_WR_FLUSH_ERR:
         return "work request flush error";
      case IBV_WC_RETRY_EXC_ERR:
         return "retries exceeded error";
      case IBV_WC_RESP_TIMEOUT_ERR:
         return "response timeout error";

      default:
         return "<undefined>";
   }

}

bool IBVSocket_getSockValid(IBVSocket* _this)
{
   return _this->sockValid;
}

int IBVSocket_getRecvCompletionFD(IBVSocket* _this)
{
   IBVCommContext* commContext = _this->commContext;

   return commContext ? commContext->recvCompChannel->fd : (-1);
}

int IBVSocket_getConnManagerFD(IBVSocket* _this)
{
   return _this->cm_channel ? _this->cm_channel->fd : (-1);
}

void IBVSocket_setTypeOfService(IBVSocket* _this, uint8_t typeOfService)
{
   _this->typeOfService = typeOfService;
}
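
/**
 * Note: Non-positive values fall back to the compile-time defaults (IBVSOCKET_CONN_TIMEOUT_MS,
 * IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS, IBVSOCKET_POLL_TIMEOUT_MS).
 */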
void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS, int flowSendMS, int pollMS)
{
   _this->timeoutCfg.connectMS = connectMS > 0 ? connectMS : IBVSOCKET_CONN_TIMEOUT_MS;
   _this->timeoutCfg.flowSendMS =
      flowSendMS > 0 ? flowSendMS : IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS;
   _this->timeoutCfg.pollMS = pollMS > 0 ? pollMS : IBVSOCKET_POLL_TIMEOUT_MS;
   LOG(SOCKLIB, DEBUG, "timeouts", ("connectMS", _this->timeoutCfg.connectMS),
      ("flowSendMS", _this->timeoutCfg.flowSendMS), ("pollMS", _this->timeoutCfg.pollMS));
}

void IBVSocket_setConnectionRejectionRate(IBVSocket* _this, unsigned rate)
{
   _this->connectionRejectionRate = rate;
}
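
/**
 * Fault-injection helper for testing: when a rejection rate N is configured, only every Nth
 * connection attempt is admitted and all others are dropped.
 *
 * @return true if this connection attempt should be rejected
 */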
bool IBVSocket_connectionRejection(IBVSocket* _this)
{
   if(_this->connectionRejectionRate)
   {
      ++_this->connectionRejectionCount;
      if((_this->connectionRejectionCount % _this->connectionRejectionRate) != 0)
      {
         LOG(SOCKLIB, WARNING, "Dropping connection for testing.",
            _this->connectionRejectionCount,
            _this->connectionRejectionRate);
         return true;
      }
   }

   return false;
}