New upstream version 8.1.0

This commit is contained in:
geos_one
2025-08-10 01:34:16 +02:00
commit c891bb7105
4398 changed files with 838833 additions and 0 deletions

View File

@@ -0,0 +1,261 @@
#include <common/net/sock/RDMASocket.h>
#include <common/net/sock/StandardSocket.h>
#include <common/net/sock/NetworkInterfaceCard.h>
#include <common/toolkit/ListTk.h>
#include <linux/if_arp.h>
#include <linux/in.h>
#include <linux/inetdevice.h>
#include <net/sock.h>
#define NIC_STRING_LEN 1024
static bool __NIC_fillNicAddress(struct net_device* dev, NicAddrType_t nicType,
NicAddress* outAddr);
/**
 * Collects the usable network interfaces of this host.
 *
 * All accepted TCP/IP interfaces are appended to outList first. If RDMA is requested and
 * available, the TCP/IP interfaces are queried a second time and every RDMA-capable one is
 * appended again as an RDMA entry. With onlyRDMA set, all non-RDMA entries are removed from
 * outList (and freed) afterwards.
 *
 * @param allowedInterfaces interface name filter; an empty list accepts every interface
 * @param useRDMA true to also look for RDMA-capable interfaces
 * @param onlyRDMA true to keep only RDMA entries in outList
 * @param outList receives kmalloc'ed NicAddress elements
 */
void NIC_findAll(StrCpyList* allowedInterfaces, bool useRDMA, bool onlyRDMA,
   NicAddressList* outList)
{
   NicAddressListIter iter;

   // standard TCP/IP interfaces
   __NIC_findAllTCP(allowedInterfaces, outList);

   // RDMA interfaces (derived from a separate TCP/IP interfaces query)
   if(useRDMA && RDMASocket_rdmaDevicesExist() )
   {
      NicAddressList rdmaCandidates;

      NicAddressList_init(&rdmaCandidates);

      __NIC_findAllTCP(allowedInterfaces, &rdmaCandidates);
      __NIC_filterInterfacesForRDMA(&rdmaCandidates, outList);

      ListTk_kfreeNicAddressListElems(&rdmaCandidates);
      NicAddressList_uninit(&rdmaCandidates);
   }

   if(!onlyRDMA)
      return;

   // drop everything that is not an RDMA entry
   NicAddressListIter_init(&iter, outList);

   while(!NicAddressListIter_end(&iter) )
   {
      NicAddress* current = NicAddressListIter_value(&iter);

      if(current->nicType == NICADDRTYPE_RDMA)
      {
         NicAddressListIter_next(&iter);
         continue;
      }

      // remove returns the iterator to the following element
      iter = NicAddressListIter_remove(&iter);
      kfree(current);
   }
}
/**
 * Walks all network devices of init_net and appends a NicAddress for each device that can be
 * described by __NIC_fillNicAddress and passes the allowedInterfaces name filter.
 *
 * Stops early (keeping what was collected so far) if an allocation fails.
 *
 * @param allowedInterfaces interface name filter; an empty list accepts every interface
 * @param outList receives kmalloc'ed NicAddress elements of type NICADDRTYPE_STANDARD
 */
void __NIC_findAllTCP(StrCpyList* allowedInterfaces, NicAddressList* outList)
{
   struct net_device* netDev;

   for(netDev = first_net_device(&init_net); netDev; netDev = next_net_device(netDev) )
   {
      ssize_t listPos = 0; // required by ListTk_listContains, not evaluated here
      bool accepted;
      NicAddress* candidate = (NicAddress*)os_kmalloc(sizeof(NicAddress) );

      if(!candidate)
      {
         printk_fhgfs(KERN_WARNING, "%s:%d: memory allocation failed. size: %zu\n",
            __func__, __LINE__, sizeof(*candidate) );
         return;
      }

      accepted = __NIC_fillNicAddress(netDev, NICADDRTYPE_STANDARD, candidate);

      // the name filter only applies if it is non-empty
      if(accepted && StrCpyList_length(allowedInterfaces) )
         accepted = ListTk_listContains(candidate->name, allowedInterfaces, &listPos);

      if(!accepted)
      { // netdevice rejected => clean up
         kfree(candidate);
         continue;
      }

      NicAddressList_append(outList, candidate);
   }
}
/**
 * Fills outAddr with the name, type and first IPv4 address of the given network device.
 *
 * Note: outAddr may be partially written even if false is returned.
 * NOTE(review): __in_dev_get_rtnl suggests the rtnl lock is expected to be held; no locking is
 * visible in this file - confirm against the callers.
 *
 * @param nicType type to store in outAddr
 * @return false for loopback devices and for devices without an IPv4 address
 */
bool __NIC_fillNicAddress(struct net_device* dev, NicAddrType_t nicType, NicAddress* outAddr)
{
   struct in_device* inDev;
   struct in_ifaddr* firstAddr;

#ifdef BEEGFS_RDMA
   outAddr->ibdev = NULL;
#endif

   // name
   strncpy(outAddr->name, dev->name, IFNAMSIZ);

   // loopback by interface flags => skip
   if(dev_get_flags(dev) & IFF_LOOPBACK)
      return false;

   // loopback by hardware type => skip
   // (on Linux see /usr/include/linux/if_arp.h for the whole list of hardware types)
   if(dev->type == ARPHRD_LOOPBACK)
      return false;

   outAddr->nicType = nicType;

   // ip address
   // note: based on inet_gifconf in /net/ipv4/devinet.c
   inDev = __in_dev_get_rtnl(dev);
   if(!inDev)
   {
      printk_fhgfs_debug(KERN_NOTICE, "found interface without in_dev: %s\n", dev->name);
      return false;
   }

   firstAddr = inDev->ifa_list;
   if(!firstAddr)
   {
      printk_fhgfs_debug(KERN_NOTICE, "found interface without ifa_list: %s\n", dev->name);
      return false;
   }

   // only the first address of the device is used (further ones follow via ifa_next)
   outAddr->ipAddr.s_addr = firstAddr->ifa_local;

   return true;
}
/**
 * Maps a NIC type to a short human-readable name.
 *
 * @return static string (not alloc'ed, so don't free it).
 */
const char* NIC_nicTypeToString(NicAddrType_t nicType)
{
   if(nicType == NICADDRTYPE_RDMA)
      return "RDMA";

   if(nicType == NICADDRTYPE_STANDARD)
      return "TCP";

   return "<unknown>";
}
/**
 * Builds a human-readable description of a NIC in the form
 * "<name>[ip addr: <ip>; type: <RDMA|TCP|Unknown>]".
 *
 * @return allocated string that must be kfreed by the caller, or NULL if the allocation
 *    failed
 */
char* NIC_nicAddrToString(NicAddress* nicAddr)
{
   char* nicAddrStr;
   char ipStr[NICADDRESS_IP_STR_LEN];
   const char* typeStr;

   nicAddrStr = (char*)os_kmalloc(NIC_STRING_LEN);
   if(!nicAddrStr)
      return NULL; // don't let snprintf write through a NULL pointer

   NicAddress_ipToStr(nicAddr->ipAddr, ipStr);

   if(nicAddr->nicType == NICADDRTYPE_RDMA)
      typeStr = "RDMA";
   else
   if(nicAddr->nicType == NICADDRTYPE_STANDARD)
      typeStr = "TCP";
   else
      typeStr = "Unknown";

   snprintf(nicAddrStr, NIC_STRING_LEN, "%s[ip addr: %s; type: %s]", nicAddr->name, ipStr,
      typeStr);

   return nicAddrStr;
}
/**
 * @return true if at least one element of nicList is of type NICADDRTYPE_RDMA
 */
bool NIC_supportsRDMA(NicAddressList* nicList)
{
   NicAddressListIter iter;

   NicAddressListIter_init(&iter, nicList);

   while(!NicAddressListIter_end(&iter) )
   {
      if(NicAddressListIter_value(&iter)->nicType == NICADDRTYPE_RDMA)
         return true;

      NicAddressListIter_next(&iter);
   }

   return false;
}
/**
 * Fills outCapabilities with the capabilities offered by the NICs in nicList.
 */
void NIC_supportedCapabilities(NicAddressList* nicList, NicListCapabilities* outCapabilities)
{
   const bool haveRDMA = NIC_supportsRDMA(nicList);

   outCapabilities->supportsRDMA = haveRDMA;
}
/**
 * Checks a list of TCP/IP interfaces for RDMA-capable interfaces.
 *
 * For every element of nicList whose IP address an RDMASocket can be bound to, a copy with
 * type NICADDRTYPE_RDMA is appended to outList. Elements of nicList are not modified.
 *
 * @param nicList TCP/IP interfaces to probe
 * @param outList receives kmalloc'ed NicAddress copies of the RDMA-capable interfaces
 */
void __NIC_filterInterfacesForRDMA(NicAddressList* nicList, NicAddressList* outList)
{
   // Note: This works by binding an RDMASocket to each IP of the passed list.

   NicAddressListIter iter;

   NicAddressListIter_init(&iter, nicList);

   for( ; !NicAddressListIter_end(&iter); NicAddressListIter_next(&iter) )
   {
      RDMASocket rdmaSock;
      Socket* sock = (Socket*)&rdmaSock;
      NicAddress* nicAddr = NicAddressListIter_value(&iter);
      bool bindRes;

      if(!RDMASocket_init(&rdmaSock, nicAddr->ipAddr, NULL) )
         continue;

      bindRes = sock->ops->bindToAddr(sock, nicAddr->ipAddr, 0);

      if(bindRes)
      { // we've got an RDMA-capable interface => append it to outList
         NicAddress* nicAddrCopy = (NicAddress*)os_kmalloc(sizeof(*nicAddrCopy) );

         if(nicAddrCopy)
         {
            *nicAddrCopy = *nicAddr;
#ifdef BEEGFS_RDMA
            nicAddrCopy->ibdev = rdmaSock.ibvsock.cm_id->device;
#endif
            nicAddrCopy->nicType = NICADDRTYPE_RDMA;

            NicAddressList_append(outList, nicAddrCopy);
         }
         else
         { // skip this interface, but still release the socket below
            printk_fhgfs(KERN_WARNING, "%s:%d: memory allocation failed. size: %zu\n",
               __func__, __LINE__, sizeof(*nicAddrCopy) );
         }
      }

      sock->ops->uninit(sock);
   }
}

View File

@@ -0,0 +1,25 @@
#ifndef NETWORKINTERFACECARD_H_
#define NETWORKINTERFACECARD_H_
#include <common/Common.h>
#include <common/toolkit/StringTk.h>
#include <common/net/sock/NicAddress.h>
#include <common/net/sock/NicAddressList.h>
#include <common/net/sock/NicAddressListIter.h>
extern void NIC_findAll(StrCpyList* allowedInterfaces, bool useRDMA, bool onlyRDMA,
NicAddressList* outList);
extern const char* NIC_nicTypeToString(NicAddrType_t nicType);
extern char* NIC_nicAddrToString(NicAddress* nicAddr);
extern bool NIC_supportsRDMA(NicAddressList* nicList);
extern void NIC_supportedCapabilities(NicAddressList* nicList,
NicListCapabilities* outCapabilities);
extern void __NIC_findAllTCP(StrCpyList* allowedInterfaces, NicAddressList* outList);
extern void __NIC_filterInterfacesForRDMA(NicAddressList* list, NicAddressList* outList);
#endif /*NETWORKINTERFACECARD_H_*/

View File

@@ -0,0 +1,34 @@
#include <common/net/sock/NicAddress.h>
#include <common/toolkit/Serialization.h>
/**
 * Compares the preference of two NICs: RDMA NICs are preferred over non-RDMA NICs; among
 * NICs of equal RDMA-ness, the one with the higher IP address (compared in host byte order)
 * is preferred.
 *
 * Note: there is no bandwidth information in the client NicAddress, so bandwidth is not part
 * of the comparison.
 *
 * @return true if lhs (left-hand side) is preferred compared to rhs
 */
bool NicAddress_preferenceComp(const NicAddress* lhs, const NicAddress* rhs)
{
   const bool lhsIsRDMA = (lhs->nicType == NICADDRTYPE_RDMA);
   const bool rhsIsRDMA = (rhs->nicType == NICADDRTYPE_RDMA);
   unsigned lhsHostOrderIP;
   unsigned rhsHostOrderIP;

   // exactly one of them is RDMA => that one wins
   if(lhsIsRDMA != rhsIsRDMA)
      return lhsIsRDMA;

   // otherwise prefer the higher ipAddr
   lhsHostOrderIP = ntohl(lhs->ipAddr.s_addr);
   rhsHostOrderIP = ntohl(rhs->ipAddr.s_addr);

   return lhsHostOrderIP > rhsHostOrderIP;
}

View File

@@ -0,0 +1,70 @@
#ifndef NICADDRESS_H_
#define NICADDRESS_H_
#include <common/Common.h>
#include <linux/if.h>
#define NICADDRESS_IP_STR_LEN 16
enum NicAddrType;
typedef enum NicAddrType NicAddrType_t;
struct NicAddress;
typedef struct NicAddress NicAddress;
struct NicListCapabilities;
typedef struct NicListCapabilities NicListCapabilities;
struct ib_device;
extern bool NicAddress_preferenceComp(const NicAddress* lhs, const NicAddress* rhs);
// inliners
static inline void NicAddress_ipToStr(struct in_addr ipAddr, char* outStr);
static inline bool NicAddress_equals(NicAddress* this, NicAddress* other);
// Kind of transport that a NicAddress (or a socket) stands for.
// Note: value 1 is intentionally left unused (formerly SDP).
enum NicAddrType
{
NICADDRTYPE_STANDARD = 0, // standard TCP/IP interface (printed as "TCP")
// removed: NICADDRTYPE_SDP = 1,
NICADDRTYPE_RDMA = 2 // RDMA-capable interface (printed as "RDMA")
};
// Describes one network interface as used by the client: address, transport type and name.
struct NicAddress
{
struct in_addr ipAddr; // IPv4 address; NicAddress_preferenceComp converts it with ntohl
NicAddrType_t nicType;
char name[IFNAMSIZ]; // interface name, copied from the net_device name
#ifdef BEEGFS_RDMA
struct ib_device *ibdev; // RDMA device of the interface; NULL if not (or no longer) known
#endif
};
// Summary of what a list of NICs is able to do (filled by NIC_supportedCapabilities).
struct NicListCapabilities
{
bool supportsRDMA; // true if the list contains at least one RDMA NIC
};
/**
* @param outStr must be at least NICADDRESS_STR_LEN bytes long
*/
void NicAddress_ipToStr(struct in_addr ipAddr, char* outStr)
{
u8* ipArray = (u8*)&ipAddr.s_addr;
sprintf(outStr, "%u.%u.%u.%u", ipArray[0], ipArray[1], ipArray[2], ipArray[3]);
}
/**
 * @return true if both NicAddresses have the same IP address, NIC type and interface name
 */
bool NicAddress_equals(NicAddress* this, NicAddress* other)
{
   if(this->ipAddr.s_addr != other->ipAddr.s_addr)
      return false;

   if(this->nicType != other->nicType)
      return false;

   return strncmp(this->name, other->name, IFNAMSIZ) == 0;
}
#endif /*NICADDRESS_H_*/

View File

@@ -0,0 +1,35 @@
#include <common/net/sock/RDMASocket.h>
#include <common/net/sock/StandardSocket.h>
#include <common/net/sock/NetworkInterfaceCard.h>
#include <common/net/sock/NicAddressList.h>
#if 0
#include <linux/if_arp.h>
#include <linux/in.h>
#include <linux/inetdevice.h>
#include <net/sock.h>
#endif
/**
 * Compares two lists element by element (in list order) via NicAddress_equals.
 *
 * @return true if both lists have the same length and all element pairs are equal
 */
bool NicAddressList_equals(NicAddressList* this, NicAddressList* other)
{
   PointerListIter thisIter;
   PointerListIter otherIter;

   if (NicAddressList_length(this) != NicAddressList_length(other))
      return false;

   PointerListIter_init(&thisIter, (PointerList*) this);
   PointerListIter_init(&otherIter, (PointerList*) other);

   while (!PointerListIter_end(&thisIter) && !PointerListIter_end(&otherIter))
   {
      NicAddress* thisElem = (NicAddress*) PointerListIter_value(&thisIter);
      NicAddress* otherElem = (NicAddress*) PointerListIter_value(&otherIter);

      if (!NicAddress_equals(thisElem, otherElem))
         return false;

      PointerListIter_next(&thisIter);
      PointerListIter_next(&otherIter);
   }

   return true;
}

View File

@@ -0,0 +1,45 @@
#ifndef NICADDRESSLIST_H_
#define NICADDRESSLIST_H_
#include <common/toolkit/list/PointerList.h>
#include <common/toolkit/list/PointerListIter.h>
#include <common/Common.h>
#include "NicAddress.h"
struct NicAddressList;
typedef struct NicAddressList NicAddressList;
static inline void NicAddressList_init(NicAddressList* this);
static inline void NicAddressList_uninit(NicAddressList* this);
static inline void NicAddressList_append(NicAddressList* this, NicAddress* nicAddress);
static inline size_t NicAddressList_length(NicAddressList* this);
extern bool NicAddressList_equals(NicAddressList* this, NicAddressList* other);
// List of NicAddress pointers; a thin typed wrapper around PointerList.
struct NicAddressList
{
struct PointerList pointerList; // only member, so the list can be cast to PointerList*
};
/**
 * Initializes the underlying PointerList.
 */
void NicAddressList_init(NicAddressList* this)
{
   PointerList_init(&this->pointerList);
}
/**
 * Uninitializes the underlying PointerList.
 */
void NicAddressList_uninit(NicAddressList* this)
{
   PointerList_uninit(&this->pointerList);
}
/**
 * Appends the given NicAddress pointer to the end of the list.
 */
void NicAddressList_append(NicAddressList* this, NicAddress* nicAddress)
{
   PointerList_append(&this->pointerList, nicAddress);
}
/**
 * @return number of elements in the list
 */
size_t NicAddressList_length(NicAddressList* this)
{
   return PointerList_length(&this->pointerList);
}
#endif /*NICADDRESSLIST_H_*/

View File

@@ -0,0 +1,59 @@
#ifndef NICADDRESSLISTITER_H_
#define NICADDRESSLISTITER_H_
#include <common/Common.h>
#include "NicAddressList.h"
struct NicAddressListIter;
typedef struct NicAddressListIter NicAddressListIter;
static inline void NicAddressListIter_init(NicAddressListIter* this, NicAddressList* list);
static inline void NicAddressListIter_next(NicAddressListIter* this);
static inline NicAddress* NicAddressListIter_value(NicAddressListIter* this);
static inline bool NicAddressListIter_end(NicAddressListIter* this);
static inline NicAddressListIter NicAddressListIter_remove(NicAddressListIter* this);
// Iterator over a NicAddressList; a thin typed wrapper around PointerListIter.
struct NicAddressListIter
{
PointerListIter pointerListIter; // only member, so the iterator can be cast to PointerListIter*
};
/**
 * Initializes the iterator for the given list.
 */
void NicAddressListIter_init(NicAddressListIter* this, NicAddressList* list)
{
   PointerListIter_init(&this->pointerListIter, &list->pointerList);
}
/**
 * Advances the iterator to the next element.
 */
void NicAddressListIter_next(NicAddressListIter* this)
{
   PointerListIter_next(&this->pointerListIter);
}
/**
 * @return the NicAddress that the iterator currently points to
 */
NicAddress* NicAddressListIter_value(NicAddressListIter* this)
{
   return (NicAddress*)PointerListIter_value(&this->pointerListIter);
}
/**
 * @return true if the iterator has moved past the last element
 */
bool NicAddressListIter_end(NicAddressListIter* this)
{
   return PointerListIter_end(&this->pointerListIter);
}
/**
 * Removes the current element from the list. The element's value is not freed.
 *
 * note: the current iterator becomes invalid after the call (use the returned iterator)
 *
 * @return the new iterator that points to the element just behind the erased one
 */
NicAddressListIter NicAddressListIter_remove(NicAddressListIter* this)
{
   NicAddressListIter successor = *this;

   // step the copy forward before the current element disappears
   NicAddressListIter_next(&successor);

   PointerListIter_remove(&this->pointerListIter);

   return successor;
}
#endif /*NICADDRESSLISTITER_H_*/

View File

@@ -0,0 +1,137 @@
#ifndef NICADDRESSSTATS_H_
#define NICADDRESSSTATS_H_
#include <common/Common.h>
#include <common/toolkit/Time.h>
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
#endif
struct NicAddressStats;
typedef struct NicAddressStats NicAddressStats;
static inline void NicAddressStats_init(NicAddressStats* this, NicAddress* nic);
static inline void NicAddressStats_uninit(NicAddressStats* this);
/**
* Called when an associated NIC has gone down. This indicates
* that this particular statistic should not be considered for load balancing.
*/
static inline void NicAddressStats_invalidate(NicAddressStats* this);
/**
* Called when an associated NIC has come online. This updates the internal NicAddress
* and indicates that this particular statistic should be considered for load balancing.
*/
static inline void NicAddressStats_setValid(NicAddressStats* this, NicAddress* nic);
static inline int NicAddressStats_comparePriority(NicAddressStats* this, NicAddressStats* o,
int numa);
static inline void NicAddressStats_updateUsed(NicAddressStats* this);
static inline void NicAddressStats_updateLastError(NicAddressStats* this);
static inline bool NicAddressStats_lastErrorExpired(NicAddressStats* this, Time* now,
int expirationSecs);
static inline bool NicAddressStats_usable(NicAddressStats* this, int maxConns);
// Per-NIC bookkeeping that NicAddressStats_comparePriority uses to rank NICs.
struct NicAddressStats
{
NicAddress nic; // private copy of the NIC description (see init/setValid)
int established; // established connections count; fewer => higher priority
int available; // available connections count; more => higher priority
Time used; // set to "now" by NicAddressStats_updateUsed
Time lastError; // set to "now" by NicAddressStats_updateLastError
/**
* nicValid indicates if the NicAddress can be used for connections.
* This may be tracking stats for a device that has gone offline.
*/
bool nicValid;
};
/**
 * Initializes the stats for the given NIC: counters and timestamps are zeroed and the NIC is
 * marked valid.
 *
 * @param nic copied into the stats object
 */
void NicAddressStats_init(NicAddressStats* this, NicAddress* nic)
{
   Time_initZero(&this->used);
   Time_initZero(&this->lastError);

   this->established = 0;
   this->available = 0;

   this->nic = *nic;
   this->nicValid = true;
}
/**
 * Counterpart of NicAddressStats_init. Currently there is nothing to release.
 */
void NicAddressStats_uninit(NicAddressStats* this)
{
// intentionally empty
}
/**
 * Marks the NIC as not usable for connections and (with RDMA support) forgets its ib_device
 * pointer.
 */
void NicAddressStats_invalidate(NicAddressStats* this)
{
#ifdef BEEGFS_RDMA
   this->nic.ibdev = NULL;
#endif

   this->nicValid = false;
}
/**
 * Replaces the stored NIC description with a copy of nic and marks it as valid again.
 * Counters and timestamps are left untouched.
 */
void NicAddressStats_setValid(NicAddressStats* this, NicAddress* nic)
{
   this->nic = *nic;
   this->nicValid = true;
}
/*
 * Compare the priority of this and o.
 *
 * Criteria in order of evaluation: NUMA locality of the RDMA device (only with BEEGFS_RDMA
 * and only if both devices are known), number of available connections, number of
 * established connections, time of last use.
 *
 * @param numa NUMA node that the device node IDs are compared against
 * Return value is < 0 if this has higher priority, > 0 if o has higher priority.
 */
int NicAddressStats_comparePriority(NicAddressStats* this, NicAddressStats* o,
   int numa)
{
   int availableDiff;
   int establishedDiff;

#ifdef BEEGFS_RDMA
   // a device on the given numa node beats a device on a different node
   if (likely(this->nic.ibdev && o->nic.ibdev))
   {
      int ownNode = this->nic.ibdev->dma_device->numa_node;
      int otherNode = o->nic.ibdev->dma_device->numa_node;

      if (ownNode != otherNode)
      {
         if (ownNode == numa)
            return -1;
         if (otherNode == numa)
            return 1;
      }
   }
#endif

   // more available connections => higher priority
   availableDiff = o->available - this->available;
   if (availableDiff != 0)
      return availableDiff;

   // fewer established connections => higher priority
   establishedDiff = this->established - o->established;
   if (establishedDiff != 0)
      return establishedDiff;

   // used less recently => higher priority
   return Time_compare(&this->used, &o->used);
}
/**
 * Records the current time as the time of last use.
 */
void NicAddressStats_updateUsed(NicAddressStats* this)
{
   Time* lastUsed = &this->used;

   Time_setToNow(lastUsed);
}
/**
 * Records the current time as the time of the last error.
 */
void NicAddressStats_updateLastError(NicAddressStats* this)
{
   Time* errorTime = &this->lastError;

   Time_setToNow(errorTime);
}
/**
 * @param now reference time for the comparison
 * @param expirationSecs expiration period in seconds
 * @return true if at least expirationSecs have elapsed between the last error and now
 */
bool NicAddressStats_lastErrorExpired(NicAddressStats* this, Time* now, int expirationSecs)
{
   const int expirationMS = expirationSecs * 1000;

   return Time_elapsedSinceMS(now, &this->lastError) >= expirationMS;
}
/**
 * @param maxConns upper limit for established connections
 * @return true if there is an available connection or another one may still be established;
 *    with BEEGFS_RDMA always false if the ib_device is not set
 */
bool NicAddressStats_usable(NicAddressStats* this, int maxConns)
{
#ifdef BEEGFS_RDMA
   if (unlikely(!this->nic.ibdev))
      return false;
#endif

   if (this->available > 0)
      return true;

   return this->established < maxConns;
}
#endif /*NICADDRESSSTATS_H_*/

View File

@@ -0,0 +1,42 @@
#ifndef NICADDRESSSTATSLIST_H_
#define NICADDRESSSTATSLIST_H_
#include <common/Common.h>
#include <common/toolkit/list/PointerList.h>
#include "NicAddressStats.h"
struct NicAddressStatsList;
typedef struct NicAddressStatsList NicAddressStatsList;
static inline void NicAddressStatsList_init(NicAddressStatsList* this);
static inline void NicAddressStatsList_uninit(NicAddressStatsList* this);
static inline void NicAddressStatsList_append(NicAddressStatsList* this, NicAddressStats* stats);
static inline size_t NicAddressStatsList_length(NicAddressStatsList* this);
// List of NicAddressStats pointers; a thin typed wrapper around PointerList.
struct NicAddressStatsList
{
PointerList pointerList; // only member, so the list can be cast to PointerList*
};
/**
 * Initializes the underlying PointerList.
 */
void NicAddressStatsList_init(NicAddressStatsList* this)
{
   PointerList_init(&this->pointerList);
}
/**
 * Uninitializes the underlying PointerList.
 */
void NicAddressStatsList_uninit(NicAddressStatsList* this)
{
   PointerList_uninit(&this->pointerList);
}
/**
 * Appends the given NicAddressStats pointer to the end of the list.
 */
void NicAddressStatsList_append(NicAddressStatsList* this, struct NicAddressStats* stats)
{
   PointerList_append(&this->pointerList, stats);
}
/**
 * @return number of elements in the list
 */
static inline size_t NicAddressStatsList_length(NicAddressStatsList* this)
{
   return PointerList_length(&this->pointerList);
}
#endif /*NICADDRESSSTATSLIST_H_*/

View File

@@ -0,0 +1,56 @@
#ifndef NICADDRESSSTATSLISTITER_H_
#define NICADDRESSSTATSLISTITER_H_
#include <common/toolkit/list/PointerListIter.h>
#include "NicAddressStatsList.h"
struct NicAddressStatsListIter;
typedef struct NicAddressStatsListIter NicAddressStatsListIter;
static inline void NicAddressStatsListIter_init(NicAddressStatsListIter* this, NicAddressStatsList* list);
static inline void NicAddressStatsListIter_next(NicAddressStatsListIter* this);
static inline NicAddressStats* NicAddressStatsListIter_value(NicAddressStatsListIter* this);
static inline bool NicAddressStatsListIter_end(NicAddressStatsListIter* this);
static inline NicAddressStatsListIter NicAddressStatsListIter_remove(NicAddressStatsListIter* this);
// Iterator over a NicAddressStatsList; a thin typed wrapper around PointerListIter.
struct NicAddressStatsListIter
{
PointerListIter pointerListIter; // only member, so the iterator can be cast to PointerListIter*
};
/**
 * Initializes the iterator for the given list.
 */
void NicAddressStatsListIter_init(NicAddressStatsListIter* this, NicAddressStatsList* list)
{
   PointerListIter_init(&this->pointerListIter, &list->pointerList);
}
/**
 * Advances the iterator to the next element.
 */
void NicAddressStatsListIter_next(NicAddressStatsListIter* this)
{
   PointerListIter_next(&this->pointerListIter);
}
/**
 * @return the NicAddressStats that the iterator currently points to
 */
NicAddressStats* NicAddressStatsListIter_value(NicAddressStatsListIter* this)
{
   return (struct NicAddressStats*)PointerListIter_value(&this->pointerListIter);
}
/**
 * @return true if the iterator has moved past the last element
 */
bool NicAddressStatsListIter_end(NicAddressStatsListIter* this)
{
   return PointerListIter_end(&this->pointerListIter);
}
/**
 * Removes the current element from the list. The element's value is not freed.
 *
 * note: the current iterator becomes invalid after the call (use the returned iterator)
 *
 * @return the new iterator that points to the element just behind the erased one
 */
NicAddressStatsListIter NicAddressStatsListIter_remove(NicAddressStatsListIter* this)
{
   NicAddressStatsListIter successor = *this;

   // step the copy forward before the current element disappears
   NicAddressStatsListIter_next(&successor);

   PointerListIter_remove(&this->pointerListIter);

   return successor;
}
#endif /*NICADDRESSSTATSLISTITER_H_*/

View File

@@ -0,0 +1,143 @@
#ifndef POOLEDSOCKET_H_
#define POOLEDSOCKET_H_
#include <common/net/sock/Socket.h>
#include <common/toolkit/Time.h>
struct PooledSocket;
typedef struct PooledSocket PooledSocket;
struct ConnectionList;
typedef struct ConnectionList ConnectionList;
static inline void _PooledSocket_init(PooledSocket* this, NicAddrType_t nicType);
static inline void _PooledSocket_uninit(Socket* this);
// inliners
static inline bool PooledSocket_getHasExpired(PooledSocket* this, unsigned expireSecs);
// getters & setters
static inline bool PooledSocket_isAvailable(PooledSocket* this);
static inline void PooledSocket_setAvailable(PooledSocket* this, bool available);
static inline bool PooledSocket_getHasActivity(PooledSocket* this);
static inline void PooledSocket_setHasActivity(PooledSocket* this);
static inline void PooledSocket_resetHasActivity(PooledSocket* this);
static inline bool PooledSocket_getHasExpirationTimer(PooledSocket* this);
static inline void PooledSocket_setExpireTimeStart(PooledSocket* this);
static inline NicAddrType_t PooledSocket_getNicType(PooledSocket* this);
static inline ConnectionList* PooledSocket_getPool(PooledSocket* this);
static inline PointerListElem* PooledSocket_getPoolElem(PooledSocket* this);
static inline void PooledSocket_setPool(PooledSocket* this, ConnectionList* pool,
PointerListElem* poolElem);
/**
* This class provides special extensions for sockets in a NodeConnPool.
*
* The Socket base must stay the first member, because PooledSocket pointers are cast to
* Socket pointers (see _PooledSocket_init).
*/
struct PooledSocket
{
Socket socket;
ConnectionList* pool; // set together with poolElem via PooledSocket_setPool; NULL after init
PointerListElem* poolElem;
bool available; // == !acquired
bool hasActivity; // true if channel was not idle (part of channel class in fhgfs_common)
bool closeOnRelease; /* release must close socket. used for signal handling */
Time expireTimeStart; // 0 means "doesn't expire", otherwise time when conn was established
NicAddrType_t nicType; // same as the interface for which this conn was established
};
/**
 * Initializes the Socket base and the pool-related members.
 *
 * @param nicType type of the interface that this connection is established for
 */
void _PooledSocket_init(PooledSocket* this, NicAddrType_t nicType)
{
   _Socket_init(&this->socket);

   this->pool = NULL;
   this->poolElem = NULL;

   this->nicType = nicType;
   this->available = false;
   this->closeOnRelease = false;

   // initially active to avoid immediate disconnection
   this->hasActivity = true;

   // zero => no expiration
   Time_initZero(&this->expireTimeStart);
}
/**
 * Uninitializes the Socket base; the pool-related members need no cleanup.
 */
void _PooledSocket_uninit(Socket* this)
{
   _Socket_uninit(this);
}
/**
 * Tests whether this socket is set to expire and whether its expire time has been exceeded.
 *
 * @param expireSecs the time in seconds after which an expire-enabled socket expires.
 * @return true if this socket has expired; always false if no expiration timer is set.
 */
bool PooledSocket_getHasExpired(PooledSocket* this, unsigned expireSecs)
{
   // zero start time => socket doesn't expire
   if(likely(Time_getIsZero(&this->expireTimeStart) ) )
      return false;

   // "*1000" for milliseconds
   return Time_elapsedMS(&this->expireTimeStart) > (expireSecs*1000);
}
/**
 * @return true if the socket is available, i.e. currently not acquired
 */
bool PooledSocket_isAvailable(PooledSocket* this)
{
   return this->available;
}
/**
 * @param available true to mark the socket as available (not acquired)
 */
void PooledSocket_setAvailable(PooledSocket* this, bool available)
{
   this->available = available;
}
/**
 * @return true if the activity flag is set (initially set, see _PooledSocket_init)
 */
bool PooledSocket_getHasActivity(PooledSocket* this)
{
   return this->hasActivity;
}
/**
 * Sets the activity flag.
 */
void PooledSocket_setHasActivity(PooledSocket* this)
{
   this->hasActivity = true;
}
/**
 * Clears the activity flag.
 */
void PooledSocket_resetHasActivity(PooledSocket* this)
{
   this->hasActivity = false;
}
/**
 * @return true if an expiration start time has been set (i.e. it is not zero)
 */
bool PooledSocket_getHasExpirationTimer(PooledSocket* this)
{
   if(Time_getIsZero(&this->expireTimeStart) )
      return false;

   return true;
}
/**
 * Starts the expiration timer by setting its start time to now.
 */
void PooledSocket_setExpireTimeStart(PooledSocket* this)
{
   Time* startTime = &this->expireTimeStart;

   Time_setToNow(startTime);
}
/**
 * @return type of the interface that this connection was established for
 */
NicAddrType_t PooledSocket_getNicType(PooledSocket* this)
{
   return this->nicType;
}
/**
 * Stores the given pool and pool list element pointers in this socket.
 */
void PooledSocket_setPool(PooledSocket* this, ConnectionList* pool,
   PointerListElem* poolElem)
{
   this->poolElem = poolElem;
   this->pool = pool;
}
/**
 * @return the pool set via PooledSocket_setPool (NULL after init)
 */
ConnectionList* PooledSocket_getPool(PooledSocket* this)
{
   return this->pool;
}
/**
 * @return the pool list element set via PooledSocket_setPool (NULL after init)
 */
PointerListElem* PooledSocket_getPoolElem(PooledSocket* this)
{
   return this->poolElem;
}
#endif /*POOLEDSOCKET_H_*/

View File

@@ -0,0 +1,233 @@
#include <common/net/sock/RDMASocket.h>
#include <common/Common.h>
#include <linux/in.h>
#include <linux/poll.h>
// Note: These are historical defaults designed for SDR IB and do not provide
// the best performance for current IB fabrics. Ideally, buf_size should be
// configured as the largest chunksize used by the filesystem and buf_num
// will be 3. It would be ideal to take buf_num down to 1, but the current
// protocol requires at least 3 buffers.
// buf_num=64; buf_size=4*1024 (=> 512kB per socket for send and recv)
#define RDMASOCKET_DEFAULT_BUF_NUM (128) // moved to config
#define RDMASOCKET_DEFAULT_BUF_SIZE (4*1024) // moved to config
#define RDMASOCKET_DEFAULT_FRAGMENT_SIZE RDMASOCKET_DEFAULT_BUF_SIZE // moved to config
#define RDMASOCKET_DEFAULT_KEY_TYPE RDMAKEYTYPE_UnsafeGlobal
// Socket operations table for RDMA sockets; installed as Socket::ops by RDMASocket_init.
static const struct SocketOps rdmaOps = {
.uninit = _RDMASocket_uninit,
.connectByIP = _RDMASocket_connectByIP,
.bindToAddr = _RDMASocket_bindToAddr,
.listen = _RDMASocket_listen,
.shutdown = _RDMASocket_shutdown,
.shutdownAndRecvDisconnect = _RDMASocket_shutdownAndRecvDisconnect,
.sendto = _RDMASocket_sendto,
.recvT = _RDMASocket_recvT,
};
/**
 * Initializes an RDMASocket with the default buffer configuration.
 *
 * @param src passed to IBVSocket_init
 * @param nicStats passed to IBVSocket_init (callers in this file may pass NULL)
 * @return false if the IBVSocket could not be initialized; the socket must not be used or
 *    uninited in that case
 */
bool RDMASocket_init(RDMASocket* this, struct in_addr src, NicAddressStats* nicStats)
{
   Socket* base = &this->pooledSocket.socket;

   // init super class
   _PooledSocket_init(&this->pooledSocket, NICADDRTYPE_RDMA);

   base->ops = &rdmaOps;
   base->sockType = NICADDRTYPE_RDMA;

   // default communication config (may be changed later via RDMASocket_setBuffers)
   this->commCfg.bufNum = RDMASOCKET_DEFAULT_BUF_NUM;
   this->commCfg.bufSize = RDMASOCKET_DEFAULT_BUF_SIZE;
   this->commCfg.fragmentSize = RDMASOCKET_DEFAULT_FRAGMENT_SIZE;
   this->commCfg.keyType = RDMASocket_toIBVSocketKeyType(RDMASOCKET_DEFAULT_KEY_TYPE);

   if(IBVSocket_init(&this->ibvsock, src, nicStats) )
      return true;

   // ibv init failed => undo the super class init
   _PooledSocket_uninit(base);

   return false;
}
/**
 * Allocates (GFP_NOFS) and initializes an RDMASocket.
 *
 * @return NULL if the allocation or the initialization failed
 */
RDMASocket* RDMASocket_construct(struct in_addr src, NicAddressStats *nicStats)
{
   RDMASocket* newSock = kmalloc(sizeof(*newSock), GFP_NOFS);

   if(newSock && RDMASocket_init(newSock, src, nicStats) )
      return newSock;

   // alloc or init failed (kfree of NULL is a no-op)
   kfree(newSock);

   return NULL;
}
/**
 * Uninitializes the IBVSocket first and the super class afterwards. Does not free this.
 */
void _RDMASocket_uninit(Socket* this)
{
   IBVSocket* ibvSock = &( (RDMASocket*)this)->ibvsock;

   IBVSocket_uninit(ibvSock);
   _PooledSocket_uninit(this);
}
/**
 * Note: This is a compile-time decision only; no devices are probed.
 *
 * @return true if the module was built with BEEGFS_RDMA, false otherwise
 */
bool RDMASocket_rdmaDevicesExist(void)
{
   bool builtWithRDMA = false;

#ifdef BEEGFS_RDMA
   builtWithRDMA = true;
#endif

   return builtWithRDMA;
}
/**
 * Connects the socket to the given IP and port using the current buffer configuration.
 *
 * note: does not set the family type to the one of this socket.
 *
 * @return false if the connection attempt failed
 */
bool _RDMASocket_connectByIP(Socket* this, struct in_addr ipaddress, unsigned short port)
{
   RDMASocket* rdmaSock = (RDMASocket*)this;

   // note: intentionally no log message on failure, because it would flood the log if hosts
   // are unreachable on the primary interface
   if(!IBVSocket_connectByIP(&rdmaSock->ibvsock, ipaddress, port, &rdmaSock->commCfg) )
      return false;

   // connected

   // peername already set (e.g. by connect(hostname) ) => keep it
   if(this->peername[0])
      return true;

   SocketTk_endpointAddrToStrNoAlloc(this->peername, SOCKET_PEERNAME_LEN, ipaddress, port);
   this->peerIP = ipaddress;

   return true;
}
/**
 * Binds the socket to the given address and port and remembers the port on success.
 *
 * @return false if binding failed (no log message is printed)
 */
bool _RDMASocket_bindToAddr(Socket* this, struct in_addr ipaddress, unsigned short port)
{
   RDMASocket* rdmaSock = (RDMASocket*)this;

   if(!IBVSocket_bindToAddr(&rdmaSock->ibvsock, ipaddress, port) )
      return false;

   this->boundPort = port;

   return true;
}
/**
 * Switches the socket to listening mode and sets the peername to a "Listen(Port: ...)"
 * description of the bound port.
 *
 * @return false if the socket could not be set to listening mode
 */
bool _RDMASocket_listen(Socket* this)
{
   RDMASocket* rdmaSock = (RDMASocket*)this;

   if(!IBVSocket_listen(&rdmaSock->ibvsock) )
   {
      printk_fhgfs(KERN_WARNING, "Failed to set RDMASocket to listening mode.\n");
      return false;
   }

   snprintf(this->peername, SOCKET_PEERNAME_LEN, "Listen(Port: %u)", this->boundPort);

   return true;
}
/**
 * Shuts down the underlying IBVSocket.
 *
 * @return false if the shutdown failed (a debug message is printed in that case)
 */
bool _RDMASocket_shutdown(Socket* this)
{
   RDMASocket* rdmaSock = (RDMASocket*)this;

   if(IBVSocket_shutdown(&rdmaSock->ibvsock) )
      return true;

   printk_fhgfs_debug(KERN_INFO, "RDMASocket failed to send shutdown.\n");

   return false;
}
/**
 * Note: The RecvDisconnect-part is currently not implemented, so this is equal to the
 * normal shutdown() method.
 *
 * @param timeoutMS ignored
 * @return result of the shutdown operation of this socket
 */
bool _RDMASocket_shutdownAndRecvDisconnect(Socket* this, int timeoutMS)
{
   bool shutRes = this->ops->shutdown(this);

   return shutRes;
}
/**
 * Receives into iter by forwarding to IBVSocket_recvT.
 *
 * @return -ETIMEDOUT on timeout
 */
ssize_t _RDMASocket_recvT(Socket* this, struct iov_iter* iter, int flags, int timeoutMS)
{
   IBVSocket* ibvSock = &( (RDMASocket*)this)->ibvsock;

   return IBVSocket_recvT(ibvSock, iter, flags, timeoutMS);
}
/**
 * Sends the data of iter by forwarding to IBVSocket_send.
 *
 * Note: This is a connection-based socket type, so the destination address is ignored.
 *
 * @param flags passed through to IBVSocket_send
 * @param to ignored
 */
ssize_t _RDMASocket_sendto(Socket* this, struct iov_iter* iter, int flags,
   fhgfs_sockaddr_in *to)
{
   IBVSocket* ibvSock = &( (RDMASocket*)this)->ibvsock;

   return IBVSocket_send(ibvSock, iter, flags);
}
/**
 * Register for polling (=> this method does not call schedule() !).
 *
 * Note: Call this only once with finishPoll==true (=> non-blocking) or multiple times with
 *    finishPoll==true in the last call from the current thread (for cleanup).
 * Note: It's safe to call this multiple times with finishPoll==true.
 *
 * @param events the event flags you are interested in (POLL...)
 * @param finishPoll true for cleanup if you don't call poll again from this thread; (it's also
 *    ok to set this to true if you call poll only once and want to avoid blocking)
 * @return mask revents mask (like poll() => POLL... flags), but only the events you requested
 *    or error events
 */
unsigned long RDMASocket_poll(RDMASocket* this, short events, bool finishPoll)
{
   IBVSocket* ibvSock = &this->ibvsock;

   return IBVSocket_poll(ibvSock, events, finishPoll);
}

View File

@@ -0,0 +1,133 @@
#ifndef OPEN_RDMASOCKET_H_
#define OPEN_RDMASOCKET_H_
#include <common/toolkit/SocketTk.h>
#include <common/toolkit/StringTk.h>
#include <common/toolkit/Time.h>
#include <common/Common.h>
#include <common/net/sock/ibv/IBVSocket.h>
#include <common/net/sock/PooledSocket.h>
#include <common/net/sock/NicAddressStats.h>
#include <app/config/Config.h>
struct ib_device;
struct ib_mr;
struct RDMASocket;
typedef struct RDMASocket RDMASocket;
struct NicAddressStats;
typedef struct NicAddressStats NicAddressStats;
extern __must_check bool RDMASocket_init(RDMASocket* this, struct in_addr srcIpAddr, NicAddressStats* nicStats);
extern RDMASocket* RDMASocket_construct(struct in_addr srcIpAddr, NicAddressStats* nicStats);
extern void _RDMASocket_uninit(Socket* this);
extern bool RDMASocket_rdmaDevicesExist(void);
extern bool _RDMASocket_connectByIP(Socket* this, struct in_addr ipaddress,
unsigned short port);
extern bool _RDMASocket_bindToAddr(Socket* this, struct in_addr ipaddress,
unsigned short port);
extern bool _RDMASocket_listen(Socket* this);
extern bool _RDMASocket_shutdown(Socket* this);
extern bool _RDMASocket_shutdownAndRecvDisconnect(Socket* this, int timeoutMS);
extern ssize_t _RDMASocket_recvT(Socket* this, struct iov_iter* iter, int flags,
int timeoutMS);
extern ssize_t _RDMASocket_sendto(Socket* this, struct iov_iter* iter, int flags,
fhgfs_sockaddr_in *to);
extern unsigned long RDMASocket_poll(RDMASocket* this, short events, bool finishPoll);
// inliners
static inline struct ib_device* RDMASocket_getDevice(RDMASocket* this);
static inline unsigned RDMASocket_getRkey(RDMASocket* this);
static inline bool RDMASocket_isRkeyGlobal(RDMASocket* this);
static inline void RDMASocket_setBuffers(RDMASocket* this, unsigned bufNum, unsigned bufSize,
unsigned fragmentSize, RDMAKeyType keyType);
static inline void RDMASocket_setTimeouts(RDMASocket* this, int connectMS,
int completionMS, int flowSendMS, int flowRecvMS, int pollMS);
static inline void RDMASocket_setTypeOfService(RDMASocket* this, int typeOfService);
static inline void RDMASocket_setConnectionFailureStatus(RDMASocket* this, unsigned value);
static inline bool RDMASocket_registerMr(RDMASocket* this, struct ib_mr* mr, int access);
static inline IBVSocketKeyType RDMASocket_toIBVSocketKeyType(RDMAKeyType keyType);
// RDMA socket: a PooledSocket that sends and receives through an IBVSocket.
struct RDMASocket
{
PooledSocket pooledSocket; // base class; first member, so casts to PooledSocket*/Socket* work
IBVSocket ibvsock;
IBVCommConfig commCfg; // buffer/key config, handed to IBVSocket_connectByIP on connect
};
/**
 * @return the rkey as reported by IBVSocket_getRkey
 */
unsigned RDMASocket_getRkey(RDMASocket *this)
{
   IBVSocket* ibvSock = &this->ibvsock;

   return IBVSocket_getRkey(ibvSock);
}
/**
 * @return true for every configured key type except IBVSOCKETKEYTYPE_Register
 */
bool RDMASocket_isRkeyGlobal(RDMASocket* this)
{
   if(this->commCfg.keyType == IBVSOCKETKEYTYPE_Register)
      return false;

   return true;
}
/**
 * @return the ib_device reported by the underlying IBVSocket.
 */
struct ib_device* RDMASocket_getDevice(RDMASocket *this)
{
   IBVSocket* ibvSock = &this->ibvsock;

   return IBVSocket_getDevice(ibvSock);
}
/**
 * Translate the RDMASocket-level key type into the IBVSocket-level one.
 *
 * Every value other than RDMAKEYTYPE_UnsafeDMA and RDMAKEYTYPE_Register maps to
 * IBVSOCKETKEYTYPE_UnsafeGlobal.
 */
IBVSocketKeyType RDMASocket_toIBVSocketKeyType(RDMAKeyType keyType)
{
   if (keyType == RDMAKEYTYPE_UnsafeDMA)
      return IBVSOCKETKEYTYPE_UnsafeDMA;

   if (keyType == RDMAKEYTYPE_Register)
      return IBVSOCKETKEYTYPE_Register;

   return IBVSOCKETKEYTYPE_UnsafeGlobal;
}
/**
 * Store the buffer layout and rkey type in the communication config of this socket.
 *
 * Note: Only has an effect for unconnected sockets.
 */
void RDMASocket_setBuffers(RDMASocket* this, unsigned bufNum, unsigned bufSize,
   unsigned fragmentSize, RDMAKeyType keyType)
{
   IBVCommConfig* cfg = &this->commCfg;

   cfg->keyType = RDMASocket_toIBVSocketKeyType(keyType);
   cfg->fragmentSize = fragmentSize;
   cfg->bufSize = bufSize;
   cfg->bufNum = bufNum;
}
/**
 * Pass the given timeouts (all in milliseconds, as the parameter names say) on to the
 * underlying IBVSocket.
 */
void RDMASocket_setTimeouts(RDMASocket* this, int connectMS,
   int completionMS, int flowSendMS, int flowRecvMS, int pollMS)
{
   IBVSocket* ibvSock = &this->ibvsock;

   IBVSocket_setTimeouts(ibvSock, connectMS, completionMS, flowSendMS, flowRecvMS, pollMS);
}
/**
 * Pass the type-of-service value on to the underlying IBVSocket.
 *
 * Note: Only has an effect for unconnected sockets.
 */
void RDMASocket_setTypeOfService(RDMASocket* this, int typeOfService)
{
   IBVSocket* ibvSock = &this->ibvsock;

   IBVSocket_setTypeOfService(ibvSock, typeOfService);
}
/**
 * Pass the connection failure status value on to the underlying IBVSocket.
 *
 * Note: Only has an effect for unconnected sockets.
 */
void RDMASocket_setConnectionFailureStatus(RDMASocket* this, unsigned value)
{
   IBVSocket* ibvSock = &this->ibvsock;

   IBVSocket_setConnectionFailureStatus(ibvSock, value);
}
/**
 * Register the given memory region through the underlying IBVSocket.
 *
 * @return true if IBVSocket_registerMr() returned 0, false for any other result.
 */
bool RDMASocket_registerMr(RDMASocket* this, struct ib_mr* mr, int access)
{
   int registerRes = IBVSocket_registerMr(&this->ibvsock, mr, access);

   return registerRes == 0;
}
#endif /*OPEN_RDMASOCKET_H_*/

View File

@@ -0,0 +1,28 @@
#include <common/toolkit/SocketTk.h>
#include <common/net/sock/Socket.h>
#include <common/threading/Thread.h>
#include <linux/in.h>
/**
 * Base class initializer: zeroes the complete object and then sets the members whose
 * initial value is not zero.
 */
void _Socket_init(Socket* this)
{
   memset(this, 0, sizeof(Socket) );

   this->boundPort = -1; // -1 until a bindToAddr implementation stores the real port
   this->sockType = NICADDRTYPE_STANDARD;
}
/**
 * Base class uninit. Intentionally empty: _Socket_init() only fills in plain members and
 * acquires nothing that would have to be released here.
 */
void _Socket_uninit(Socket* this)
{
}
/**
 * Bind to the given port on the wildcard address (INADDR_ANY).
 *
 * @return result of the socket's bindToAddr implementation.
 */
bool Socket_bind(Socket* this, unsigned short port)
{
   struct in_addr anyAddr = { .s_addr = INADDR_ANY };

   return Socket_bindToAddr(this, anyAddr, port);
}
/**
 * Bind to the given address and port by dispatching to the bindToAddr callback of this
 * socket's ops table.
 */
bool Socket_bindToAddr(Socket* this, struct in_addr ipAddr, unsigned short port)
{
   const struct SocketOps* ops = this->ops;

   return ops->bindToAddr(this, ipAddr, port);
}

View File

@@ -0,0 +1,194 @@
#ifndef SOCKET_H_
#define SOCKET_H_
#include <common/toolkit/StringTk.h>
#include <common/toolkit/Time.h>
#include <common/Common.h>
#include <common/net/sock/NicAddress.h>
#include <linux/socket.h>
#include <os/iov_iter.h>
#define SOCKET_PEERNAME_LEN 24
/*
* This is an abstract class: struct Socket carries the common state and a pointer to a
* struct SocketOps table through which the concrete socket type is invoked.
*/
struct Socket;
typedef struct Socket Socket;
// base class init/uninit, to be called by the init/uninit code of derived socket types
extern void _Socket_init(Socket* this);
extern void _Socket_uninit(Socket* this);
// both bind variants dispatch to ops->bindToAddr; Socket_bind() uses INADDR_ANY
extern bool Socket_bind(Socket* this, unsigned short port);
extern bool Socket_bindToAddr(Socket* this, struct in_addr ipAddr, unsigned short port);
/**
 * Table of "virtual methods" that every concrete socket type provides. Each callback
 * receives the Socket base pointer as its first argument.
 */
struct SocketOps
{
// invoked by Socket_virtualDestruct() before the object is kfree'd
void (*uninit)(Socket* this);
bool (*connectByIP)(Socket* this, struct in_addr ipaddress, unsigned short port);
bool (*bindToAddr)(Socket* this, struct in_addr ipaddress, unsigned short port);
bool (*listen)(Socket* this);
bool (*shutdown)(Socket* this);
bool (*shutdownAndRecvDisconnect)(Socket* this, int timeoutMS);
// "to" may be NULL (Socket_send_kernel() passes NULL)
ssize_t (*sendto)(Socket* this, struct iov_iter* iter, int flags, fhgfs_sockaddr_in *to);
// receive with timeout; the receive helpers in this header document -ETIMEDOUT on timeout
ssize_t (*recvT)(Socket* this, struct iov_iter* iter, int flags, int timeoutMS);
};
/**
 * Common state of all socket types.
 */
struct Socket
{
// set to NICADDRTYPE_STANDARD by _Socket_init()
NicAddrType_t sockType;
// zero-terminated peer description; empty (first byte 0) until an implementation fills it
char peername[SOCKET_PEERNAME_LEN];
struct in_addr peerIP;
// -1 after _Socket_init()
int boundPort;
// implementation of the concrete socket type
const struct SocketOps* ops;
// polling state; callers read "revents" after polling. "_list" and "_events" are
// presumably private to the poll implementation (PollState/SocketTk_poll) - confirm there.
struct {
struct list_head _list;
short _events;
short revents;
} poll;
};
/**
 * @return the NIC address type stored in this socket.
 */
static inline NicAddrType_t Socket_getSockType(Socket* this)
{
   const NicAddrType_t storedType = this->sockType;

   return storedType;
}
/**
 * @return pointer to the peername buffer inside the socket object (not a copy).
 */
static inline char* Socket_getPeername(Socket* this)
{
   return &this->peername[0];
}
/**
 * @return the stored peer IP address (by value).
 */
static inline struct in_addr Socket_getPeerIP(Socket* this)
{
   struct in_addr storedIP = this->peerIP;

   return storedIP;
}
/**
 * Destroys a heap-allocated socket: runs the uninit callback of the concrete socket type
 * and then kfrees the object.
 */
static inline void Socket_virtualDestruct(Socket* this)
{
   const struct SocketOps* ops = this->ops;

   ops->uninit(this);
   kfree(this);
}
/**
 * Receive at most "length" bytes with timeout.
 *
 * The implementation callback gets a copy of the iterator that is truncated to "length";
 * on success the caller's iterator is advanced by the number of bytes received.
 *
 * @return result of the recvT callback (number of bytes received or a negative error code).
 */
static inline ssize_t Socket_recvT(Socket* this, struct iov_iter *iter,
   size_t length, int flags, int timeoutMS)
{
   // TODO: implementation function should accept length as well.
   struct iov_iter limitedIter = *iter;
   ssize_t numRead;

   iov_iter_truncate(&limitedIter, length);

   numRead = this->ops->recvT(this, &limitedIter, flags, timeoutMS);
   if (numRead < 0)
      return numRead;

   // TODO: some parts of the project currently expect the iov_iter to be advanced here.
   // That does not mesh well with how iov_iter is meant to be used: advancing an iov_iter
   // of type ITER_PIPE mutates external state (struct pipe_inode_info), so a copy of an
   // arbitrary iov_iter cannot simply be advanced in isolation.
   //
   // The code should be changed such that advancing only happens in the outermost layers
   // of the beegfs client module.
   iov_iter_advance(iter, numRead);

   return numRead;
}
/**
 * Receive with timeout into a kernel buffer: wraps the buffer in a kvec iterator and hands
 * it to the recvT callback.
 *
 * @return result of the recvT callback.
 */
static inline ssize_t Socket_recvT_kernel(Socket* this, void *buffer,
   size_t length, int flags, int timeoutMS)
{
   struct iov_iter *kvecIter = STACK_ALLOC_BEEGFS_ITER_KVEC(buffer, length, READ);
   const struct SocketOps* ops = this->ops;

   return ops->recvT(this, kvecIter, flags, timeoutMS);
}
/**
 * Receive with timeout, extended version with numReceivedBeforeError.
 *
 * note: this uses a soft timeout that is being reset after each received data packet.
 * note: the recvT callback is invoked at least once, even for len==0, and the loop only
 *    ends when the remaining byte count hits exactly zero.
 *
 * @param outNumReceivedBeforeError number of bytes received before returning (also set in
 *    case of an error, e.g. timeout); given value will only be increased and is
 *    intentionally not set to 0 initially.
 * @return len if everything was received; otherwise the non-positive result of the failing
 *    receive call (-ETIMEDOUT on timeout).
 */
static inline ssize_t Socket_recvExactTEx(Socket* this, struct iov_iter *iter, size_t len, int flags, int timeoutMS,
   size_t* outNumReceivedBeforeError)
{
   ssize_t remaining = len;

   for ( ; ; )
   {
      ssize_t numRecv = this->ops->recvT(this, iter, flags, timeoutMS);

      if (unlikely(numRecv <= 0) )
         return numRecv;

      *outNumReceivedBeforeError += numRecv;
      remaining -= numRecv;

      if (!remaining)
         break;
   }

   // all received if we got here
   return len;
}
/**
 * Kernel buffer variant of Socket_recvExactTEx(): wraps buf/len in a kvec iterator.
 *
 * @param outNumReceivedBeforeError only increased, never reset (see Socket_recvExactTEx() ).
 */
static inline ssize_t Socket_recvExactTEx_kernel(Socket* this, void *buf, size_t len, int flags, int timeoutMS,
   size_t* outNumReceivedBeforeError)
{
   struct iov_iter *kvecIter = STACK_ALLOC_BEEGFS_ITER_KVEC(buf, len, READ);
   ssize_t recvRes;

   recvRes = Socket_recvExactTEx(this, kvecIter, len, flags, timeoutMS,
      outNumReceivedBeforeError);

   return recvRes;
}
/**
 * Receive exactly len bytes with timeout.
 *
 * Convenience wrapper around Socket_recvExactTEx() for callers that are not interested in
 * the number of bytes received before an error.
 *
 * @return len if everything was received; otherwise the non-positive result of the failing
 *    receive call (-ETIMEDOUT on timeout).
 */
static inline ssize_t Socket_recvExactT(Socket* this, struct iov_iter *iter, size_t len, int flags, int timeoutMS)
{
   /* Socket_recvExactTEx() only ever adds to this counter ("+="), so it must start at a
    * defined value; reading it uninitialized would be undefined behavior. */
   size_t numReceivedBeforeError = 0;

   return Socket_recvExactTEx(this, iter, len, flags, timeoutMS, &numReceivedBeforeError);
}
/**
 * Receive exactly len bytes with timeout into a kernel buffer.
 *
 * Convenience wrapper around Socket_recvExactTEx_kernel() for callers that are not
 * interested in the number of bytes received before an error.
 *
 * @return len if everything was received; otherwise the non-positive result of the failing
 *    receive call.
 */
static inline ssize_t Socket_recvExactT_kernel(Socket* this, void *buf, size_t len, int flags, int timeoutMS)
{
   /* the callee only increments this counter, so it must start at a defined value;
    * reading it uninitialized would be undefined behavior. */
   size_t numReceivedBeforeError = 0;

   return Socket_recvExactTEx_kernel(this, buf, len, flags, timeoutMS, &numReceivedBeforeError);
}
/**
 * Send a kernel buffer: wraps buf/len in a kvec iterator and hands it to the sendto
 * callback.
 *
 * @param to destination address; may be NULL (see Socket_send_kernel() ).
 * @return result of the sendto callback.
 */
static inline ssize_t Socket_sendto_kernel(Socket *this, const void *buf, size_t len, int flags,
   fhgfs_sockaddr_in *to)
{
   struct iov_iter *kvecIter = STACK_ALLOC_BEEGFS_ITER_KVEC(buf, len, WRITE);
   const struct SocketOps* ops = this->ops;

   return ops->sendto(this, kvecIter, flags, to);
}
/**
 * Send a kernel buffer without an explicit destination address.
 *
 * @return result of Socket_sendto_kernel().
 */
static inline ssize_t Socket_send_kernel(Socket *this, const void *buf, size_t len, int flags)
{
   fhgfs_sockaddr_in* noDestination = NULL;

   return Socket_sendto_kernel(this, buf, len, flags, noDestination);
}
#endif /*SOCKET_H_*/

View File

@@ -0,0 +1,660 @@
#include <common/net/sock/StandardSocket.h>
#include <common/toolkit/Serialization.h>
#include <common/toolkit/SocketTk.h>
#include <common/Common.h>
#include <linux/in.h>
#include <linux/tcp.h>
// backlog value passed to kernel_listen()
#define SOCKET_LISTEN_BACKLOG 32
// size of the scratch buffer that drains incoming data while waiting for the disconnect
#define SOCKET_SHUTDOWN_RECV_BUF_LEN 32
// how long a non-blocking connect may stay in progress before it is given up
#define STANDARDSOCKET_CONNECT_TIMEOUT_MS 5000
// SocketOps implementation for standard (kernel TCP/UDP) sockets; installed as the ops
// table of every StandardSocket by StandardSocket_init().
static const struct SocketOps standardOps = {
.uninit = _StandardSocket_uninit,
.connectByIP = _StandardSocket_connectByIP,
.bindToAddr = _StandardSocket_bindToAddr,
.listen = _StandardSocket_listen,
.shutdown = _StandardSocket_shutdown,
.shutdownAndRecvDisconnect = _StandardSocket_shutdownAndRecvDisconnect,
.sendto = _StandardSocket_sendto,
.recvT = _StandardSocket_recvT,
};
// "does this socket wait queue have a sleeper?" - maps to skwq_has_sleeper() when the
// build detected it (KERNEL_HAS_SKWQ_HAS_SLEEPER) and to wq_has_sleeper() otherwise.
#ifdef KERNEL_HAS_SKWQ_HAS_SLEEPER
# define __sock_has_sleeper(wq) (skwq_has_sleeper(wq))
#else
# define __sock_has_sleeper(wq) (wq_has_sleeper(wq))
#endif
// Local fallback for kernels that have sk->sk_sleep but no sk_has_sleeper(): reports
// whether the socket has a wait queue with at least one waiter.
#if defined(KERNEL_HAS_SK_SLEEP) && !defined(KERNEL_HAS_SK_HAS_SLEEPER)
static inline int sk_has_sleeper(struct sock* sk)
{
return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
}
#endif
// Wrapper for the two signatures of __wake_up_sync_key(): the 4-argument form gets 1 as
// its additional third argument.
#if defined(KERNEL_WAKE_UP_SYNC_KEY_HAS_3_ARGUMENTS)
# define __wake_up_sync_key_m(wq, state, key) __wake_up_sync_key(wq, state, key)
#else
# define __wake_up_sync_key_m(wq, state, key) __wake_up_sync_key(wq, state, 1, key)
#endif
/* unlike linux sock_def_readable, this will also wake TASK_KILLABLE threads. we need this
* for SocketTk_poll, which wants to wait for fatal signals only.
*
* installed as sk_data_ready callback by _StandardSocket_initSock(). the signature depends
* on the kernel (KERNEL_HAS_SK_DATA_READY_2 adds a length argument, which is not used). */
#ifdef KERNEL_HAS_SK_DATA_READY_2
static void sock_readable(struct sock *sk, int len)
#else
static void sock_readable(struct sock *sk)
#endif
{
#ifdef KERNEL_HAS_SK_SLEEP
// kernels with sk->sk_sleep: the wait queue is accessed under sk_callback_lock
read_lock(&sk->sk_callback_lock);
if (sk_has_sleeper(sk))
{
__wake_up_sync_key_m(sk->sk_sleep, TASK_NORMAL,
(void*) (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND));
}
read_unlock(&sk->sk_callback_lock);
#else
// kernels with sk->sk_wq: the wait queue pointer is dereferenced inside an RCU read section
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (__sock_has_sleeper(wq))
{
__wake_up_sync_key_m(&wq->wait, TASK_NORMAL,
(void*) (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND));
}
rcu_read_unlock();
#endif
}
/* sock_def_write_space will also not wake uninterruptible threads. additionally, in newer
* kernels it uses refcount_t for an optimization that we do not need: linux does not want to
* wake up many writers if many of them cannot make progress. we have only a single writer.
*
* installed as sk_write_space callback by _StandardSocket_initSock(); wakes waiters with
* the POLLOUT family of events. */
static void sock_write_space(struct sock *sk)
{
#ifdef KERNEL_HAS_SK_SLEEP
// kernels with sk->sk_sleep: the wait queue is accessed under sk_callback_lock
read_lock(&sk->sk_callback_lock);
if (sk_has_sleeper(sk))
{
__wake_up_sync_key_m(sk->sk_sleep, TASK_NORMAL,
(void*) (POLLOUT | POLLWRNORM | POLLWRBAND));
}
read_unlock(&sk->sk_callback_lock);
#else
// kernels with sk->sk_wq: the wait queue pointer is dereferenced inside an RCU read section
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (__sock_has_sleeper(wq))
__wake_up_sync_key_m(&wq->wait, TASK_NORMAL, (void*) (POLLOUT | POLLWRNORM | POLLWRBAND));
rcu_read_unlock();
#endif
}
/* sock_def_wakeup, which is called for disconnects, has the same problem.
*
* installed as sk_state_change callback by _StandardSocket_initSock(); wakes all waiters
* of the socket's wait queue (no event key is passed). */
static void sock_wakeup(struct sock *sk)
{
#ifdef KERNEL_HAS_SK_SLEEP
// kernels with sk->sk_sleep: the wait queue is accessed under sk_callback_lock
read_lock(&sk->sk_callback_lock);
if (sk_has_sleeper(sk))
wake_up_all(sk->sk_sleep);
read_unlock(&sk->sk_callback_lock);
#else
// kernels with sk->sk_wq: the wait queue pointer is dereferenced inside an RCU read section
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (__sock_has_sleeper(wq))
wake_up_all(&wq->wait);
rcu_read_unlock();
#endif
}
/* as does sock_def_error_report
*
* installed as sk_error_report callback by _StandardSocket_initSock(); wakes waiters with
* POLLERR as event key. */
static void sock_error_report(struct sock *sk)
{
#ifdef KERNEL_HAS_SK_SLEEP
// kernels with sk->sk_sleep: the wait queue is accessed under sk_callback_lock
read_lock(&sk->sk_callback_lock);
if (sk_has_sleeper(sk))
__wake_up_sync_key_m(sk->sk_sleep, TASK_NORMAL, (void*) (POLLERR));
read_unlock(&sk->sk_callback_lock);
#else
// kernels with sk->sk_wq: the wait queue pointer is dereferenced inside an RCU read section
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (__sock_has_sleeper(wq))
__wake_up_sync_key_m(&wq->wait, TASK_NORMAL, (void*) (POLLERR));
rcu_read_unlock();
#endif
}
/**
 * Initialize a StandardSocket: base class init, ops table, then creation of the kernel
 * socket.
 *
 * @param domain,type,protocol passed on to the kernel socket creation.
 * @return false if the kernel socket could not be created.
 */
bool StandardSocket_init(StandardSocket* this, int domain, int type, int protocol)
{
   Socket* base = (Socket*)this;

   // super class first, then hook up our virtual methods
   _PooledSocket_init( (PooledSocket*)this, NICADDRTYPE_STANDARD);
   base->ops = &standardOps;

   // own members
   this->sockDomain = domain;
   this->sock = NULL;

   return _StandardSocket_initSock(this, domain, type, protocol);
}
StandardSocket* StandardSocket_construct(int domain, int type, int protocol)
{
StandardSocket* this = kmalloc(sizeof(*this), GFP_NOFS);
if(!this ||
!StandardSocket_init(this, domain, type, protocol) )
{
kfree(this);
return NULL;
}
return this;
}
/**
 * @return new PF_INET datagram socket or NULL on error.
 */
StandardSocket* StandardSocket_constructUDP(void)
{
   StandardSocket* udpSock = StandardSocket_construct(PF_INET, SOCK_DGRAM, 0);

   return udpSock;
}
/**
 * @return new PF_INET stream socket or NULL on error.
 */
StandardSocket* StandardSocket_constructTCP(void)
{
   StandardSocket* tcpSock = StandardSocket_construct(PF_INET, SOCK_STREAM, 0);

   return tcpSock;
}
/**
 * SocketOps uninit implementation: base class uninit, then release of the kernel socket
 * (if one was created).
 */
void _StandardSocket_uninit(Socket* this)
{
   StandardSocket* stdSock = (StandardSocket*)this;
   struct socket* rawSock;

   _PooledSocket_uninit(this);

   rawSock = stdSock->sock;
   if (rawSock != NULL)
      sock_release(rawSock);
}
/**
 * Create the kernel socket and install our own wakeup callbacks on it.
 *
 * @return false if sock_create_kern() failed (nothing is logged in that case).
 */
bool _StandardSocket_initSock(StandardSocket* this, int domain, int type, int protocol)
{
int createRes;
// prepare/create socket
// (sock_create_kern() takes a network namespace argument on kernels where the build
// detected KERNEL_HAS_SOCK_CREATE_KERN_NS; we always use init_net there)
#ifndef KERNEL_HAS_SOCK_CREATE_KERN_NS
createRes = sock_create_kern(domain, type, protocol, &this->sock);
#else
createRes = sock_create_kern(&init_net, domain, type, protocol, &this->sock);
#endif
if(createRes < 0)
{
//printk_fhgfs(KERN_WARNING, "Failed to create socket\n");
return false;
}
// allocations on behalf of this socket use GFP_NOFS
__StandardSocket_setAllocMode(this, GFP_NOFS);
// replace the default callbacks with the variants defined at the top of this file (see
// the comments there for why the defaults are not sufficient)
this->sock->sk->sk_data_ready = sock_readable;
this->sock->sk->sk_write_space = sock_write_space;
this->sock->sk->sk_state_change = sock_wakeup;
this->sock->sk->sk_error_report = sock_error_report;
return true;
}
/**
 * Set the allocation flags (sk_allocation) of the underlying kernel socket.
 */
void __StandardSocket_setAllocMode(StandardSocket* this, gfp_t flags)
{
   struct sock* sk = this->sock->sk;

   sk->sk_allocation = flags;
}
/**
 * Use this to change socket options.
 * Note: Behaves (almost) like user-space setsockopt.
 *
 * SOL_SOCKET options go through sock_setsockopt(), all other levels through the
 * protocol's own setsockopt operation.
 *
 * @param optval kernel-space pointer to the option value
 * @param optlen size of the option value in bytes
 * @return 0 on success, error code otherwise (=> different from userspace version)
 */
int _StandardSocket_setsockopt(StandardSocket* this, int level,
int optname, char* optval, int optlen)
{
struct socket *sock = this->sock;
#if defined(KERNEL_HAS_SOCK_SETSOCKOPT_SOCKPTR_T_PARAM)
// sockptr_t API: the kernel pointer is wrapped explicitly
sockptr_t ptr = KERNEL_SOCKPTR(optval);
if (level == SOL_SOCKET)
return sock_setsockopt(sock, level, optname, ptr, optlen);
else
return sock->ops->setsockopt(sock, level, optname, ptr, optlen);
#elif defined(KERNEL_HAS_GET_FS)
// older API that expects a __user pointer; the kernel pointer is force-cast.
// NOTE(review): WITH_PROCESS_CONTEXT is defined elsewhere; presumably it makes the
// following statement run with an address limit that accepts kernel pointers - confirm.
char __user *ptr = (char __user __force *) optval;
int r;
WITH_PROCESS_CONTEXT
if (level == SOL_SOCKET)
r = sock_setsockopt(sock, level, optname, ptr, optlen);
else
r = sock->ops->setsockopt(sock, level, optname, ptr, optlen);
return r;
#else
#error need set_fs()/get_fs() if sockptr_t is not available.
#endif
// unreachable
BUG();
}
/**
 * Enable/disable SO_KEEPALIVE.
 *
 * @return true if the option was set successfully.
 */
bool StandardSocket_setSoKeepAlive(StandardSocket* this, bool enable)
{
   int optVal = enable ? 1 : 0;

   return _StandardSocket_setsockopt(this, SOL_SOCKET, SO_KEEPALIVE,
      (char*)&optVal, sizeof(optVal) ) == 0;
}
/**
 * Enable/disable SO_BROADCAST.
 *
 * @return true if the option was set successfully.
 */
bool StandardSocket_setSoBroadcast(StandardSocket* this, bool enable)
{
   int optVal = enable ? 1 : 0;

   return _StandardSocket_setsockopt(this, SOL_SOCKET, SO_BROADCAST,
      (char*)&optVal, sizeof(optVal) ) == 0;
}
/**
 * @return current sk_rcvbuf value of the underlying kernel socket.
 */
int StandardSocket_getSoRcvBuf(StandardSocket* this)
{
   struct sock* sk = this->sock->sk;

   //TODO: should this be READ_ONCE()? There are different uses in the Linux kernel
   return sk->sk_rcvbuf;
}
/**
 * Raise the receive buffer size of the socket.
 *
 * Note: Increase only (buffer will not be set to a smaller value).
 *
 * @return false on error, true otherwise (decrease skipping is not an error)
 */
bool StandardSocket_setSoRcvBuf(StandardSocket* this, int size)
{
   int halvedSize;
   int setRes;

   // we don't decrease buf sizes (but this is not an error)
   if (StandardSocket_getSoRcvBuf(this) >= size)
      return true;

   /* note: according to socket(7) man page, the value given to setsockopt()
    * is doubled and the doubled value is returned by getsockopt()
    *
    * update 2022-05-13: the kernel doubles the value passed to
    * setsockopt(SO_RCVBUF) to allow for bookkeeping overhead. Halving the
    * value is probably "not correct" but it's been this way since 2010 and
    * changing it will potentially do more harm than good at this point.
    */
   halvedSize = size/2;

   setRes = _StandardSocket_setsockopt(this, SOL_SOCKET, SO_RCVBUF,
      (char*)&halvedSize, sizeof(halvedSize) );
   if (setRes != 0)
   {
      printk_fhgfs_debug(KERN_INFO, "%s: setSoRcvBuf error: %d;\n", __func__, setRes);
   }

   return setRes == 0;
}
/**
 * Enable/disable TCP_NODELAY.
 *
 * @return true if the option was set successfully.
 */
bool StandardSocket_setTcpNoDelay(StandardSocket* this, bool enable)
{
   int optVal = enable ? 1 : 0;

   return _StandardSocket_setsockopt(this, SOL_TCP, TCP_NODELAY,
      (char*)&optVal, sizeof(optVal) ) == 0;
}
/**
 * Enable/disable TCP_CORK.
 *
 * @return true if the option was set successfully.
 */
bool StandardSocket_setTcpCork(StandardSocket* this, bool enable)
{
   int optVal = enable ? 1 : 0;

   return _StandardSocket_setsockopt(this, SOL_TCP, TCP_CORK,
      (char*)&optVal, sizeof(optVal) ) == 0;
}
/**
 * SocketOps connectByIP implementation: non-blocking connect that waits up to
 * STANDARDSOCKET_CONNECT_TIMEOUT_MS for the connection to be established.
 *
 * On success, peername and peerIP are set unless a peername has been set before.
 *
 * note: intentionally silent on failure; error messages here would flood the log if hosts
 *    are unreachable on the primary interface.
 *
 * @return true if connected; false on connect error, poll error or timeout.
 */
bool _StandardSocket_connectByIP(Socket* this, struct in_addr ipaddress, unsigned short port)
{
   StandardSocket* stdSock = (StandardSocket*)this;
   struct sockaddr_in peerAddr =
   {
      .sin_family = AF_INET,
      .sin_addr = ipaddress,
      .sin_port = htons(port),
   };
   int connRes;

   connRes = kernel_connect(stdSock->sock, (struct sockaddr*)&peerAddr, sizeof(peerAddr),
      O_NONBLOCK);

   if (connRes != 0)
   { // not connected immediately
      PollState pollState;
      int pollRes;

      if (connRes != -EINPROGRESS)
         return false; // real connect error

      // wait for "ready to send data"
      PollState_init(&pollState);
      PollState_addSocket(&pollState, this, POLLOUT);

      pollRes = SocketTk_poll(&pollState, STANDARDSOCKET_CONNECT_TIMEOUT_MS);
      if (pollRes <= 0)
         return false; // timeout (0) or poll error (<0)

      /* we got something (could also be an error).
         note: it's important to test ERR/HUP/NVAL here instead of POLLOUT only, because
         POLLOUT and POLLERR can be returned together. */
      if (this->poll.revents & (POLLERR | POLLHUP | POLLNVAL) )
         return false;
   }

   // connection established: set peername if not done so already (e.g. by
   // connect(hostname) )
   if (!this->peername[0])
   {
      SocketTk_endpointAddrToStrNoAlloc(this->peername, SOCKET_PEERNAME_LEN, ipaddress, port);
      this->peerIP = ipaddress;
   }

   return true;
}
/**
 * SocketOps bindToAddr implementation: binds the kernel socket and remembers the port in
 * boundPort on success.
 *
 * @param port in host byte order.
 * @return false if kernel_bind() failed (a warning is logged).
 */
bool _StandardSocket_bindToAddr(Socket* this, struct in_addr ipaddress, unsigned short port)
{
   StandardSocket* stdSock = (StandardSocket*)this;
   struct sockaddr_in localAddr;
   int bindRes;

   localAddr.sin_port = htons(port);
   localAddr.sin_addr = ipaddress;
   localAddr.sin_family = stdSock->sockDomain;

   bindRes = kernel_bind(stdSock->sock, (struct sockaddr*)&localAddr, sizeof(localAddr) );
   if (bindRes == 0)
   {
      this->boundPort = port;
      return true;
   }

   printk_fhgfs(KERN_WARNING, "Failed to bind socket. ErrCode: %d\n", bindRes);
   return false;
}
/**
 * SocketOps listen implementation: switches the kernel socket to listening mode and
 * stores a "Listen(Port: ...)" description as peername.
 *
 * @return false if kernel_listen() failed (a warning is logged).
 */
bool _StandardSocket_listen(Socket* this)
{
   StandardSocket* stdSock = (StandardSocket*)this;
   int listenRes = kernel_listen(stdSock->sock, SOCKET_LISTEN_BACKLOG);

   if (listenRes != 0)
   {
      printk_fhgfs(KERN_WARNING, "Failed to set socket to listening mode. ErrCode: %d\n",
         listenRes);
      return false;
   }

   snprintf(this->peername, SOCKET_PEERNAME_LEN, "Listen(Port: %d)", this->boundPort);

   return true;
}
/**
 * SocketOps shutdown implementation: shuts down the sending direction of the socket.
 *
 * @return true on success and also if the socket was not connected (-ENOTCONN); false on
 *    any other error (a warning is logged).
 */
bool _StandardSocket_shutdown(Socket* this)
{
   StandardSocket* stdSock = (StandardSocket*)this;
   int shutRes = kernel_sock_shutdown(stdSock->sock, SEND_SHUTDOWN);

   if ( (shutRes >= 0) || (shutRes == -ENOTCONN) )
      return true;

   printk_fhgfs(KERN_WARNING, "Failed to send shutdown. ErrCode: %d\n", shutRes);
   return false;
}
/**
 * SocketOps shutdownAndRecvDisconnect implementation: sends the shutdown and then drains
 * incoming data until the peer's disconnect arrives.
 *
 * @param timeoutMS timeout for each single receive call.
 * @return true if the shutdown was sent and the receive loop ended with 0 or -ECONNRESET;
 *    false otherwise.
 */
bool _StandardSocket_shutdownAndRecvDisconnect(Socket* this, int timeoutMS)
{
   char discardBuf[SOCKET_SHUTDOWN_RECV_BUF_LEN];
   int recvRes;

   if (!this->ops->shutdown(this) )
      return false;

   // receive (and throw away) until shutdown arrives
   for ( ; ; )
   {
      recvRes = Socket_recvT_kernel(this, discardBuf, SOCKET_SHUTDOWN_RECV_BUF_LEN, 0,
         timeoutMS);
      if (recvRes <= 0)
         break;
   }

   // everything but a clean end or a connection reset counts as error (but we're
   // disconnecting, so callers usually don't really care about errors)
   return (recvRes == 0) || (recvRes == -ECONNRESET);
}
/* Compatibility wrappers for sock_sendmsg / sock_recvmsg. At some point in the
* 4.x series, the size argument disappeared.
*
* beegfs_recvmsg: "len" is only passed on when the build detected the older signature
* (KERNEL_HAS_RECVMSG_SIZE); otherwise it is ignored. */
static int beegfs_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags)
{
#ifdef KERNEL_HAS_RECVMSG_SIZE
return sock_recvmsg(sock, msg, len, flags);
#else
return sock_recvmsg(sock, msg, flags);
#endif
}
/* Send counterpart of the compatibility wrapper: "len" is only passed on to sock_sendmsg()
* for the older kernel signature.
* NOTE(review): the signature is selected by KERNEL_HAS_RECVMSG_SIZE, i.e. the macro named
* after the receive function - confirm that sock_sendmsg() and sock_recvmsg() lost their
* size argument together on all supported kernels (or that the build check covers both). */
static int beegfs_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
#ifdef KERNEL_HAS_RECVMSG_SIZE
return sock_sendmsg(sock, msg, len);
#else
return sock_sendmsg(sock, msg);
#endif
}
/**
 * SocketOps recvT implementation: receive with timeout, without reporting a sender address.
 *
 * @return -ETIMEDOUT on timeout
 */
ssize_t _StandardSocket_recvT(Socket* this, struct iov_iter* iter, int flags, int timeoutMS)
{
   return StandardSocket_recvfromT( (StandardSocket*)this, iter, flags, NULL, timeoutMS);
}
/**
 * SocketOps sendto implementation.
 *
 * Sends the data described by iter; MSG_NOSIGNAL is always added to the given flags. On
 * success the caller's iterator is advanced by the number of bytes sent.
 *
 * @param to destination address or NULL to send without an explicit destination.
 * @return number of bytes sent or negative error code from the kernel send call.
 */
ssize_t _StandardSocket_sendto(Socket* this, struct iov_iter* iter, int flags,
fhgfs_sockaddr_in *to)
{
StandardSocket* thisCast = (StandardSocket*)this;
struct socket *sock = thisCast->sock;
int sendRes;
size_t len;
struct sockaddr_in toSockAddr;
// note: msg_namelen is sizeof(toSockAddr) even when msg_name is NULL.
// the message works on a copy of the iterator; the caller's iterator is advanced below.
struct msghdr msg =
{
.msg_control = NULL,
.msg_controllen = 0,
.msg_flags = flags | MSG_NOSIGNAL,
.msg_name = (struct sockaddr*)(to ? &toSockAddr : NULL),
.msg_namelen = sizeof(toSockAddr),
.msg_iter = *iter,
};
len = iov_iter_count(iter);
if (to)
{
// toSockAddr is only filled in (and only referenced by msg) when a destination is given
toSockAddr.sin_family = thisCast->sockDomain;
toSockAddr.sin_addr = to->addr;
// copied verbatim, no htons() here - assumes to->port already is in network byte
// order (TODO confirm against fhgfs_sockaddr_in users)
toSockAddr.sin_port = to->port;
}
sendRes = beegfs_sendmsg(sock, &msg, len);
if(sendRes >= 0)
iov_iter_advance(iter, sendRes);
return sendRes;
}
/**
 * Receive without timeout handling (blocks according to the socket/flags).
 *
 * On success (>0 bytes) the caller's iterator is advanced by the number of bytes received.
 *
 * @param from if not NULL, receives the sender address as reported by the kernel; it is
 *    written on every call and is all-zero if the kernel did not report an address (e.g.
 *    on error).
 * @return number of bytes received or negative error code from the kernel receive call.
 */
ssize_t StandardSocket_recvfrom(StandardSocket* this, struct iov_iter* iter, int flags,
   fhgfs_sockaddr_in *from)
{
   int recvRes;
   size_t len;

   /* zero-initialized, because the kernel does not fill in the name in all cases (errors,
    * protocols that do not report a sender) and the address is copied to "from"
    * unconditionally below; without this we would hand out indeterminate stack contents. */
   struct sockaddr_in fromSockAddr = { 0 };

   struct socket *sock = this->sock;

   // the message works on a copy of the iterator; the caller's iterator is advanced below
   struct msghdr msg =
   {
      .msg_control = NULL,
      .msg_controllen = 0,
      .msg_flags = flags,
      .msg_name = (struct sockaddr*)&fromSockAddr,
      .msg_namelen = sizeof(fromSockAddr),
      .msg_iter = *iter,
   };

   len = iov_iter_count(iter);

   recvRes = beegfs_recvmsg(sock, &msg, len, flags);

   if(recvRes > 0)
      iov_iter_advance(iter, recvRes);

   if(from)
   {
      from->addr = fromSockAddr.sin_addr;
      from->port = fromSockAddr.sin_port; // copied verbatim, no byte order conversion
   }

   return recvRes;
}
/**
 * Receive with timeout.
 *
 * @param from may be NULL; see StandardSocket_recvfrom().
 * @param timeoutMS negative value means: skip polling and call StandardSocket_recvfrom()
 *    directly.
 * @return -ETIMEDOUT on timeout, -ECOMM if polling did not report readable data, otherwise
 *    the result of StandardSocket_recvfrom().
 */
ssize_t StandardSocket_recvfromT(StandardSocket* this, struct iov_iter* iter, int flags,
   fhgfs_sockaddr_in *from, int timeoutMS)
{
   Socket* base = (Socket*)this;
   PollState pollState;
   int pollRes;

   if (timeoutMS < 0)
      return StandardSocket_recvfrom(this, iter, flags, from);

   PollState_init(&pollState);
   PollState_addSocket(&pollState, base, POLLIN);

   pollRes = SocketTk_poll(&pollState, timeoutMS);

   if (!pollRes)
      return -ETIMEDOUT;

   if ( (pollRes > 0) && (base->poll.revents & POLLIN) )
      return StandardSocket_recvfrom(this, iter, flags, from);

   // poll failed or reported something other than readable data: log the first matching
   // condition
   if (base->poll.revents & POLLERR)
   {
      printk_fhgfs_debug(KERN_DEBUG, "StandardSocket_recvfromT: poll(): %s: Error condition\n",
         base->peername);
   }
   else if (base->poll.revents & POLLHUP)
   {
      printk_fhgfs_debug(KERN_DEBUG, "StandardSocket_recvfromT: poll(): %s: Hung up\n",
         base->peername);
   }
   else if (base->poll.revents & POLLNVAL)
   {
      printk_fhgfs(KERN_DEBUG, "StandardSocket_recvfromT: poll(): %s: Invalid request\n",
         base->peername);
   }
   else
   {
      printk_fhgfs(KERN_DEBUG, "StandardSocket_recvfromT: poll(): %s: ErrCode: %d\n",
         base->peername, pollRes);
   }

   return -ECOMM;
}

View File

@@ -0,0 +1,68 @@
#ifndef OPEN_STANDARDSOCKET_H_
#define OPEN_STANDARDSOCKET_H_
#include <common/toolkit/StringTk.h>
#include <common/toolkit/Time.h>
#include <common/Common.h>
#include <common/net/sock/PooledSocket.h>
struct StandardSocket;
typedef struct StandardSocket StandardSocket;
// construction/destruction. StandardSocket_init() returns false if the kernel socket could
// not be created; the construct functions return NULL on error.
extern __must_check bool StandardSocket_init(StandardSocket* this, int domain, int type,
int protocol);
extern StandardSocket* StandardSocket_construct(int domain, int type, int protocol);
extern StandardSocket* StandardSocket_constructUDP(void);
extern StandardSocket* StandardSocket_constructTCP(void);
extern void _StandardSocket_uninit(Socket* this);
// socket options. the setters return true on success; setSoRcvBuf only ever increases the
// buffer size.
int StandardSocket_getSoRcvBuf(StandardSocket* this);
extern bool StandardSocket_setSoKeepAlive(StandardSocket* this, bool enable);
extern bool StandardSocket_setSoBroadcast(StandardSocket* this, bool enable);
extern bool StandardSocket_setSoRcvBuf(StandardSocket* this, int size);
extern bool StandardSocket_setTcpNoDelay(StandardSocket* this, bool enable);
extern bool StandardSocket_setTcpCork(StandardSocket* this, bool enable);
// struct SocketOps implementations for standard sockets
extern bool _StandardSocket_connectByIP(Socket* this, struct in_addr ipaddress,
unsigned short port);
extern bool _StandardSocket_bindToAddr(Socket* this, struct in_addr ipaddress,
unsigned short port);
extern bool _StandardSocket_listen(Socket* this);
extern bool _StandardSocket_shutdown(Socket* this);
extern bool _StandardSocket_shutdownAndRecvDisconnect(Socket* this, int timeoutMS);
extern ssize_t _StandardSocket_recvT(Socket* this, struct iov_iter* iter, int flags,
int timeoutMS);
extern ssize_t _StandardSocket_sendto(Socket* this, struct iov_iter* iter, int flags,
fhgfs_sockaddr_in *to);
// receive variants that can report the sender address ("from" may be NULL); the T variant
// returns -ETIMEDOUT on timeout
extern ssize_t StandardSocket_recvfrom(StandardSocket* this, struct iov_iter* iter,
int flags, fhgfs_sockaddr_in *from);
extern ssize_t StandardSocket_recvfromT(StandardSocket* this, struct iov_iter* iter,
int flags, fhgfs_sockaddr_in *from, int timeoutMS);
// internal helpers
extern bool _StandardSocket_initSock(StandardSocket* this, int domain, int type,
int protocol);
extern void __StandardSocket_setAllocMode(StandardSocket* this, gfp_t flags);
// returns 0 on success, error code otherwise
extern int _StandardSocket_setsockopt(StandardSocket* this, int level, int optname, char* optval,
int optlen);
// getters & setters
static inline struct socket* StandardSocket_getRawSock(StandardSocket* this);
/**
 * Socket type that is backed by a kernel struct socket (TCP or UDP).
 */
struct StandardSocket
{
// base part; must stay the first member, because the implementation casts between
// StandardSocket*, PooledSocket* and Socket*
PooledSocket pooledSocket;
// kernel socket; NULL until it has been created successfully
struct socket* sock;
// domain given to StandardSocket_init(); also used as address family for bind/sendto
unsigned short sockDomain;
};
/**
 * @return the underlying kernel socket (ownership stays with the StandardSocket).
 */
struct socket* StandardSocket_getRawSock(StandardSocket* this)
{
   struct socket* rawSock = this->sock;

   return rawSock;
}
#endif /*OPEN_STANDARDSOCKET_H_*/

View File

@@ -0,0 +1,152 @@
#include "IBVBuffer.h"
#include "IBVSocket.h"
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
/**
 * Allocate and DMA-map the memory of an IBVBuffer.
 *
 * The buffer consists of ceil(bufLen / fragmentLen) separately allocated fragments, each
 * of size min(fragmentLen, bufLen).
 *
 * @param ctx provides the protection domain (lkey and device) used for the mapping.
 * @param bufLen total buffer length in bytes.
 * @param fragmentLen fragment size in bytes; 0 means: do not fragment.
 * @param dma_dir DMA direction used for mapping (and later unmapping).
 * @return true on success. On failure everything that has been allocated or mapped so far
 *    is released, the buffer is left in an empty state (no fragments, NULL arrays) and
 *    false is returned.
 */
bool IBVBuffer_init(IBVBuffer* buffer, IBVCommContext* ctx, size_t bufLen,
   size_t fragmentLen, enum dma_data_direction dma_dir)
{
   unsigned count;
   unsigned i;

   if (fragmentLen == 0)
      fragmentLen = bufLen;

   // fragmentLen can still be 0 here if bufLen is 0 as well; don't divide by zero then
   count = fragmentLen ? ( (bufLen + fragmentLen - 1) / fragmentLen) : 0;
   bufLen = MIN(fragmentLen, bufLen);

   /* fill in the bookkeeping before anything is allocated: IBVBuffer_free() relies on
    * bufferCount, bufferSize, dma_dir and mr, and it is our cleanup on the failure path.
    * (the arrays are zero-allocated, so fragments that were not reached are skipped.) */
   buffer->dma_dir = dma_dir;
   buffer->mr = NULL;
   buffer->bufferSize = bufLen;
   buffer->listLength = count;
   buffer->bufferCount = count;

   // kcalloc checks the count*size multiplication for overflow
   buffer->buffers = kcalloc(count, sizeof(*buffer->buffers), GFP_KERNEL);
   buffer->lists = kcalloc(count, sizeof(*buffer->lists), GFP_KERNEL);
   if(!buffer->buffers || !buffer->lists)
      goto fail;

   for(i = 0; i < count; i++)
   {
      buffer->lists[i].lkey = ctx->pd->local_dma_lkey;
      buffer->lists[i].length = bufLen;

      buffer->buffers[i] = kmalloc(bufLen, GFP_KERNEL);
      if(unlikely(!buffer->buffers[i]))
      {
         printk_fhgfs(KERN_ERR, "Failed to allocate buffer size=%zu\n", bufLen);
         goto fail;
      }

      buffer->lists[i].addr = ib_dma_map_single(ctx->pd->device, buffer->buffers[i],
         bufLen, dma_dir);
      if (unlikely(ib_dma_mapping_error(ctx->pd->device, buffer->lists[i].addr)))
      {
         // addr==0 marks "not mapped" for IBVBuffer_free()
         buffer->lists[i].addr = 0;
         printk_fhgfs(KERN_ERR, "Failed to dma map buffer size=%zu\n", bufLen);
         goto fail;
      }

      BUG_ON(buffer->lists[i].addr == 0);
   }

   return true;

fail:
   IBVBuffer_free(buffer, ctx);

   // leave an empty buffer behind, so that another IBVBuffer_free() call is harmless
   buffer->buffers = NULL;
   buffer->lists = NULL;
   buffer->bufferCount = 0;
   buffer->listLength = 0;

   return false;
}
/**
 * Allocate the buffer's own memory region (buffer->mr) and map the already DMA-mapped
 * fragments of the buffer into it.
 *
 * Must be called after IBVBuffer_init(), because it uses bufferCount and the addresses and
 * lengths stored in buffer->lists.
 *
 * @return true on success; on failure buffer->mr is released (if it was allocated), reset
 *    to NULL and false is returned.
 */
bool IBVBuffer_initRegistration(IBVBuffer* buffer, IBVCommContext* ctx)
{
struct scatterlist* sg;
int res;
int i;
// one MR entry per buffer fragment
buffer->mr = ib_alloc_mr(ctx->pd, IB_MR_TYPE_MEM_REG, buffer->bufferCount);
if (IS_ERR(buffer->mr))
{
printk_fhgfs(KERN_ERR, "Failed to alloc mr, errCode=%ld\n", PTR_ERR(buffer->mr));
buffer->mr = NULL;
goto fail;
}
// temporary scatterlist that only carries the DMA addresses/lengths of the fragments.
// NOTE(review): the table is only zero-allocated, sg_init_table() is not called - confirm
// that this is sufficient for ib_map_mr_sg() on all supported kernels.
sg = kzalloc(buffer->bufferCount * sizeof(struct scatterlist), GFP_KERNEL);
if (sg == NULL)
{
printk_fhgfs(KERN_ERR, "Failed to alloc sg\n");
goto fail;
}
for (i = 0; i < buffer->bufferCount; ++i)
{
sg_dma_address(&sg[i]) = buffer->lists[i].addr;
sg_dma_len(&sg[i]) = buffer->lists[i].length;
}
res = ib_map_mr_sg(buffer->mr, sg, buffer->bufferCount, NULL, PAGE_SIZE);
// the scatterlist is not needed any longer, no matter whether mapping succeeded
kfree(sg);
// NOTE(review): only negative results are treated as failure; a result that is smaller
// than bufferCount (partial mapping) is accepted - confirm that this is intended.
if (res < 0)
{
printk_fhgfs(KERN_ERR, "Failed to map mr res=%d\n", res);
goto fail;
}
return true;
fail:
if (buffer->mr)
{
ib_dereg_mr(buffer->mr);
buffer->mr = NULL;
}
return false;
}
/**
 * Release everything that belongs to the buffer: DMA mappings and memory of all fragments,
 * the memory region (if any) and the bookkeeping arrays.
 *
 * note: the pointers inside the buffer are not reset here, so this must not be called
 *    twice on the same contents.
 */
void IBVBuffer_free(IBVBuffer* buffer, IBVCommContext* ctx)
{
   unsigned idx;

   // fragments can only be walked if both arrays exist
   if (buffer->buffers && buffer->lists)
   {
      for (idx = 0; idx < buffer->bufferCount; idx++)
      {
         struct ib_sge* sge = &buffer->lists[idx];

         if (sge->addr) // 0 means: fragment was never mapped
            ib_dma_unmap_single(ctx->pd->device, sge->addr,
               buffer->bufferSize, buffer->dma_dir);

         kfree(buffer->buffers[idx]); // kfree(NULL) is a no-op
      }
   }

   if (buffer->mr)
      ib_dereg_mr(buffer->mr);

   kfree(buffer->buffers);
   kfree(buffer->lists);
}
/**
 * Copy data from the iterator into the fragments of the buffer, starting with the first
 * fragment, until either the iterator is empty or all fragments are full.
 *
 * For every used fragment the sge length is set to the number of bytes copied into it, and
 * listLength is set to the number of used fragments (it is left untouched if nothing is
 * copied).
 *
 * @return number of bytes copied or -EFAULT if copying from the iterator failed.
 */
ssize_t IBVBuffer_fill(IBVBuffer* buffer, struct iov_iter* iter)
{
   ssize_t numCopied = 0;
   unsigned idx = 0;

   while ( (idx < buffer->bufferCount) && (iov_iter_count(iter) > 0) )
   {
      size_t chunkLen = MIN(MIN(iov_iter_count(iter), buffer->bufferSize), 0xFFFFFFFF);

      if (copy_from_iter(buffer->buffers[idx], chunkLen, iter) != chunkLen)
         return -EFAULT;

      buffer->lists[idx].length = chunkLen;
      numCopied += chunkLen;

      idx++;
      buffer->listLength = idx;
   }

   return numCopied;
}
#endif

View File

@@ -0,0 +1,49 @@
#ifndef IBVBuffer_h_aMQFNfzrjbEHDOcv216fi
#define IBVBuffer_h_aMQFNfzrjbEHDOcv216fi
#include <common/Common.h>
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <os/iov_iter.h>
struct IBVBuffer;
typedef struct IBVBuffer IBVBuffer;
struct IBVCommContext;
struct IBVSocket;
/**
 * Allocate and DMA-map the buffer memory. The buffer is split into fragments of
 * fragmentLen bytes; fragmentLen==0 means "do not fragment".
 *
 * @return false on allocation or mapping failure.
 */
extern bool IBVBuffer_init(IBVBuffer* buffer, struct IBVCommContext* ctx, size_t bufLen,
size_t fragmentLen, enum dma_data_direction dma_dir);
/**
 * Prepare the instance to use its internal ib_mr. This is only needed for buffers used
 * with RDMA READ/WRITE and when not using a global rkey. This may be called before
 * the connection is established. Once the connection has been established,
 * the registration must be completed via a call to IBVSocket_registerMr().
 */
extern bool IBVBuffer_initRegistration(IBVBuffer* buffer, struct IBVCommContext* ctx);
// releases mappings, fragment memory, the ib_mr (if any) and the bookkeeping arrays
extern void IBVBuffer_free(IBVBuffer* buffer, struct IBVCommContext* ctx);
// copies from the iterator into the fragments; returns bytes copied or -EFAULT
extern ssize_t IBVBuffer_fill(IBVBuffer* buffer, struct iov_iter* iter);
/**
 * A (possibly fragmented) DMA-mapped communication buffer.
 */
struct IBVBuffer
{
// one pointer per fragment (kmalloc'ed memory)
char** buffers;
// one scatter/gather entry per fragment: DMA address, length and lkey
struct ib_sge* lists;
// memory region, only set by IBVBuffer_initRegistration()
struct ib_mr* mr;
// size of a single fragment in bytes
size_t bufferSize;
// number of fragments (entries in "buffers" and "lists")
unsigned bufferCount;
// number of entries in "lists" that currently carry data; updated by IBVBuffer_fill()
unsigned listLength;
// direction that was used for DMA mapping and is needed again for unmapping
enum dma_data_direction dma_dir;
};
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,283 @@
#ifndef OPENTK_IBVSOCKET_H_
#define OPENTK_IBVSOCKET_H_
#include <common/Common.h>
#include <common/toolkit/Random.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <asm/atomic.h>
#include <os/iov_iter.h>
// Identification string, its length and the protocol version for the connection private
// data (presumably exchanged during connection setup - see IBVSocket.c for the actual use).
#define IBVSOCKET_PRIVATEDATA_STR "fhgfs0 " // must be exactly(!!) 8 bytes long
#define IBVSOCKET_PRIVATEDATA_STR_LEN 8
#define IBVSOCKET_PRIVATEDATA_PROTOCOL_VER 1
// Forward declarations: this part of the header only needs pointers to these types, so it
// can be used without the RDMA headers (those are only included inside BEEGFS_RDMA below).
struct ib_device;
struct ib_mr;
struct IBVIncompleteRecv;
typedef struct IBVIncompleteRecv IBVIncompleteRecv;
struct IBVIncompleteSend;
typedef struct IBVIncompleteSend IBVIncompleteSend;
struct IBVCommContext;
typedef struct IBVCommContext IBVCommContext;
struct IBVCommDest;
typedef struct IBVCommDest IBVCommDest;
struct IBVTimeoutConfig;
typedef struct IBVTimeoutConfig IBVTimeoutConfig;
struct IBVSocket; // forward declaration
typedef struct IBVSocket IBVSocket;
struct IBVCommConfig;
typedef struct IBVCommConfig IBVCommConfig;
struct NicAddressStats;
typedef struct NicAddressStats NicAddressStats;
/**
 * Which type of rkey is used for RDMA (see IBVCommConfig.keyType). UnsafeGlobal is 0 and
 * thus the value of a zeroed config.
 */
enum IBVSocketKeyType
{
IBVSOCKETKEYTYPE_UnsafeGlobal = 0,
IBVSOCKETKEYTYPE_UnsafeDMA,
// presumably: buffers get their own registered memory region (see
// IBVSocket_registerMr() ) - confirm in IBVSocket.c
IBVSOCKETKEYTYPE_Register
};
typedef enum IBVSocketKeyType IBVSocketKeyType;
// construction/destruction
// (the result of IBVSocket_init() must be checked: __must_check)
extern __must_check bool IBVSocket_init(IBVSocket* _this, struct in_addr srcIpAddr, NicAddressStats* nicStats);
extern void IBVSocket_uninit(IBVSocket* _this);
// static
extern bool IBVSocket_rdmaDevicesExist(void);
// methods
extern bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress,
unsigned short port, IBVCommConfig* commCfg);
extern bool IBVSocket_bindToAddr(IBVSocket* _this, struct in_addr ipAddr,
unsigned short port);
extern bool IBVSocket_listen(IBVSocket* _this);
extern bool IBVSocket_shutdown(IBVSocket* _this);
extern ssize_t IBVSocket_recvT(IBVSocket* _this, struct iov_iter* iter, int flags,
int timeoutMS);
extern ssize_t IBVSocket_send(IBVSocket* _this, struct iov_iter* iter, int flags);
extern int IBVSocket_checkConnection(IBVSocket* _this);
extern unsigned long IBVSocket_poll(IBVSocket* _this, short events, bool finishPoll);
// getters & setters
extern void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS,
int completionMS, int flowSendMS, int flowRecvMS, int pollMS);
extern void IBVSocket_setTypeOfService(IBVSocket* _this, int typeOfService);
extern void IBVSocket_setConnectionFailureStatus(IBVSocket* _this, unsigned value);
extern struct in_addr IBVSocket_getSrcIpAddr(IBVSocket* _this);
// Only access members of NicAddressStats when the owner NodeConnPool mutex is held.
// OK to access "nic" without holding mutex.
extern NicAddressStats* IBVSocket_getNicStats(IBVSocket* _this);
extern unsigned IBVSocket_getRkey(IBVSocket* _this);
extern struct ib_device* IBVSocket_getDevice(IBVSocket* _this);
// returns int (not bool); RDMASocket_registerMr() treats 0 as success
extern int IBVSocket_registerMr(IBVSocket* _this, struct ib_mr* mr, int access);
/**
 * Timeout settings of an IBVSocket (see IBVSocket.timeoutCfg). The fields carry the same names
 * as the parameters of IBVSocket_setTimeouts().
 *
 * The "MS" suffix suggests milliseconds (assumption - confirm against the implementation).
 */
struct IBVTimeoutConfig
{
   int connectMS;
   int completionMS;
   int flowSendMS;
   int flowRecvMS;
   int pollMS;
};
/**
 * Buffer layout and rkey type settings for an IBVSocket connection. Passed to
 * IBVSocket_connectByIP() and kept as a copy inside IBVCommContext.
 */
struct IBVCommConfig
{
   unsigned bufNum;  // number of available buffers
   unsigned bufSize; // total size of each buffer

   /**
    * IBVBuffer can allocate the buffer in multiple memory regions. This
    * is to allow allocation of large buffers without requiring the
    * buffer to be entirely contiguous. A value of 0 means that the
    * buffer should not be fragmented.
    */
   unsigned fragmentSize; // size of buffer fragments

   IBVSocketKeyType keyType; // which type of rkey to use for RDMA
};
#ifdef BEEGFS_RDMA
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <common/threading/Mutex.h>
#include "IBVBuffer.h"
// NOTE(review): declaring an enum before its enumerator list is a GNU C extension (ISO C does not
// allow incomplete enum types); the full definition follows further down in this header.
enum IBVSocketConnState;
typedef enum IBVSocketConnState IBVSocketConnState_t;
// Internal helpers (double-underscore prefix). They are only declared when BEEGFS_RDMA is defined.
// connection setup / teardown helpers
extern bool __IBVSocket_createNewID(IBVSocket* _this);
extern bool __IBVSocket_createCommContext(IBVSocket* _this, struct rdma_cm_id* cm_id,
IBVCommConfig* commCfg, IBVCommContext** outCommContext);
extern void __IBVSocket_cleanupCommContext(struct rdma_cm_id* cm_id, IBVCommContext* commContext);
extern bool __IBVSocket_initCommDest(IBVCommContext* commContext, IBVCommDest* outDest);
extern bool __IBVSocket_parseCommDest(const void* buf, size_t bufLen, IBVCommDest** outDest);
// send / receive / flow control helpers (timeoutMS: presumably milliseconds - confirm)
extern int __IBVSocket_receiveCheck(IBVSocket* _this, int timeoutMS);
extern int __IBVSocket_nonblockingSendCheck(IBVSocket* _this);
extern int __IBVSocket_postRecv(IBVSocket* _this, IBVCommContext* commContext, size_t bufIndex);
extern int __IBVSocket_postSend(IBVSocket* _this, size_t bufIndex);
extern int __IBVSocket_recvWC(IBVSocket* _this, int timeoutMS, struct ib_wc* outWC);
extern int __IBVSocket_flowControlOnRecv(IBVSocket* _this, int timeoutMS);
extern void __IBVSocket_flowControlOnSendUpdateCounters(IBVSocket* _this);
extern int __IBVSocket_flowControlOnSendWait(IBVSocket* _this, int timeoutMS);
extern int __IBVSocket_waitForRecvCompletionEvent(IBVSocket* _this, int timeoutMS,
struct ib_wc* outWC);
extern int __IBVSocket_waitForSendCompletionEvent(IBVSocket* _this, int oldSendCount,
int timeoutMS);
extern int __IBVSocket_waitForTotalSendCompletion(IBVSocket* _this,
unsigned* numSendElements, unsigned* numWriteElements, unsigned* numReadElements, int timeoutMS);
extern ssize_t __IBVSocket_recvContinueIncomplete(IBVSocket* _this, struct iov_iter* iter);
// handlers; the (cm_id, event) and (cq, cq_context) signatures look like RDMA CM / completion
// callbacks - presumably registered with the RDMA stack, confirm in the implementation
extern int __IBVSocket_cmaHandler(struct rdma_cm_id* cm_id, struct rdma_cm_event* event);
extern void __IBVSocket_cqSendEventHandler(struct ib_event* event, void* data);
extern void __IBVSocket_sendCompletionHandler(struct ib_cq* cq, void* cq_context);
extern void __IBVSocket_cqRecvEventHandler(struct ib_event* event, void* data);
extern void __IBVSocket_recvCompletionHandler(struct ib_cq* cq, void* cq_context);
extern void __IBVSocket_qpEventHandler(struct ib_event* event, void* data);
extern int __IBVSocket_routeResolvedHandler(IBVSocket* _this, struct rdma_cm_id* cm_id,
IBVCommConfig* commCfg, IBVCommContext** outCommContext);
extern int __IBVSocket_connectedHandler(IBVSocket* _this, struct rdma_cm_event *event);
// takes a completion handler, an event handler, a context pointer and the number of CQ entries
extern struct ib_cq* __IBVSocket_createCompletionQueue(struct ib_device* device,
ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *),
void* cq_context, int cqe);
// maps a work completion status code to a string
extern const char* __IBVSocket_wcStatusStr(int wcStatusCode);
/**
 * Connection state of an IBVSocket (stored in IBVSocket.connState).
 *
 * The numeric values are fixed explicitly, one per state, from 0 to 6.
 */
enum IBVSocketConnState
{
   IBVSOCKETCONNSTATE_UNCONNECTED     = 0,
   IBVSOCKETCONNSTATE_CONNECTING      = 1,
   IBVSOCKETCONNSTATE_ADDRESSRESOLVED = 2,
   IBVSOCKETCONNSTATE_ROUTERESOLVED   = 3,
   IBVSOCKETCONNSTATE_ESTABLISHED     = 4,
   IBVSOCKETCONNSTATE_FAILED          = 5,
   IBVSOCKETCONNSTATE_REJECTED_STALE  = 6
};
/**
 * Bookkeeping for a receive that has not been completely handed out yet (embedded in
 * IBVCommContext as "incompleteRecv").
 *
 * NOTE(review): field meanings are inferred from the names only - confirm against
 * __IBVSocket_recvContinueIncomplete().
 */
struct IBVIncompleteRecv
{
   int isAvailable;     // presumably non-zero while an incomplete receive is pending
   int completedOffset; // presumably how much of the buffer was already consumed
   int bufIndex;        // presumably index of the receive buffer in use
   int totalSize;       // presumably total size of the received data
};
/**
 * Bookkeeping for sends whose completions are still outstanding (embedded in IBVCommContext as
 * "incompleteSend").
 */
struct IBVIncompleteSend
{
   unsigned numAvailable;

   /* true if we received only some completions and need to wait for the rest before we can
      send more data */
   bool forceWaitForAll;
};
// Per-connection communication state of an IBVSocket (referenced by IBVSocket.commContext):
// verbs objects (pd, optional DMA MR, CQs, QP), completion event counters with their wait queues,
// the send/recv buffers, flow control counters and incomplete send/recv bookkeeping.
struct IBVCommContext
{
struct ib_pd* pd; // protection domain
struct ib_mr* dmaMR; // system DMA MR. Not supported on all platforms.
atomic_t recvCompEventCount; // incremented on incoming event notification
wait_queue_head_t recvCompWaitQ; // for recvCompEvents
// NOTE(review): wait_queue_t is the pre-4.13 kernel name of wait_queue_entry_t; presumably a
// compatibility typedef exists elsewhere in the project - confirm.
wait_queue_t recvWait;
bool recvWaitInitialized; // true if init_wait was called for the thread
atomic_t sendCompEventCount; // incremented on incoming event notification
wait_queue_head_t sendCompWaitQ; // for sendCompEvents
wait_queue_t sendWait;
bool sendWaitInitialized; // true if init_wait was called for the thread
struct ib_cq* recvCQ; // recv completion queue
struct ib_cq* sendCQ; // send completion queue
struct ib_qp* qp; // send+recv queue pair
IBVCommConfig commCfg; // stored by value (not a pointer to the caller's config)
struct IBVBuffer* sendBufs; // presumably commCfg.bufNum entries - confirm
struct IBVBuffer* recvBufs; // presumably commCfg.bufNum entries - confirm
struct IBVBuffer checkConBuffer; // single embedded buffer; presumably for IBVSocket_checkConnection
unsigned numReceivedBufsLeft; // flow control v2 to avoid IB rnr timeout
unsigned numSendBufsLeft; // flow control v2 to avoid IB rnr timeout
IBVIncompleteRecv incompleteRecv;
IBVIncompleteSend incompleteSend;
u32 checkConnRkey;
};
// Pack with 1-byte alignment so that no padding is inserted between or after the members.
#pragma pack(push, 1)
// Note: Make sure this struct has the same size on all architectures (because we use
// sizeof(IBVCommDest) for private_data during handshake)
// NOTE(review): rkey, recvBufNum and recvBufSize are plain "unsigned", so the fixed-size
// requirement assumes that unsigned has the same width on every supported architecture - confirm.
// Do not reorder or resize members: the handshake depends on this exact layout.
struct IBVCommDest
{
char verificationStr[IBVSOCKET_PRIVATEDATA_STR_LEN];
uint64_t protocolVersion;
uint64_t vaddr;
unsigned rkey;
unsigned recvBufNum;
unsigned recvBufSize;
};
// restore the packing that was in effect before the push above
#pragma pack(pop)
// Full IBVSocket definition for builds with BEEGFS_RDMA: the rdma_cm id, local/remote handshake
// data, the communication context, connection/error state and the configured timeouts.
struct IBVSocket
{
wait_queue_head_t eventWaitQ; // used to wait for connState change during connect
struct rdma_cm_id* cm_id;
struct in_addr srcIpAddr; // source address as passed to IBVSocket_init (presumably) - confirm
IBVCommDest localDest; // embedded by value
IBVCommDest* remoteDest; // pointer; see __IBVSocket_parseCommDest for an out-parameter of this type
IBVCommContext* commContext;
int errState; // 0 = <no error>; -1 = <unspecified error>
volatile IBVSocketConnState_t connState; // changes are waited for via eventWaitQ
int typeOfService;
unsigned remapConnectionFailureStatus;
NicAddressStats* nicStats; // Owned by a NodeConnPool instance. Do not access
// members without locking the NodeConnPool mutex.
// Possibly NULL.
IBVTimeoutConfig timeoutCfg;
Mutex cmaMutex; // used to manage concurrency of cm_id and commContext
// with __IBVSocket_cmaHandler
};
#else
// Placeholder definition used when BEEGFS_RDMA is not defined, so that IBVSocket is still a
// complete type for code that embeds or allocates it.
// NOTE(review): the only member is an unnamed zero-width bit-field, i.e. the struct has no named
// member. ISO C leaves that case undefined, so this relies on compiler (GNU C) behavior - confirm
// before building with another compiler.
struct IBVSocket
{
/* empty structs are not allowed, so until this kludge can go, add a dummy member */
unsigned:0;
};
#endif
#endif /*OPENTK_IBVSOCKET_H_*/

View File

@@ -0,0 +1,114 @@
#include "IBVSocket.h"
#ifndef BEEGFS_RDMA
// Logs (KERN_INFO, via printk_fhgfs) that one of the non-RDMA stubs below was called, together
// with the name of the calling function and the source line. This has to stay a macro so that
// __func__ and __LINE__ expand at the call site. The expansion carries no trailing semicolon.
#define no_ibvsocket_err() \
printk_fhgfs(KERN_INFO, "%s:%d: You should never see this message\n", __func__, __LINE__)
/**
 * Stub for builds without BEEGFS_RDMA: initialization always fails.
 *
 * @param _this ignored
 * @param srcIpAddr ignored
 * @param nicStats ignored
 * @return always false (after logging via no_ibvsocket_err() )
 */
bool IBVSocket_init(IBVSocket* _this, struct in_addr srcIpAddr, NicAddressStats* nicStats)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return false;
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @param _this ignored
 */
void IBVSocket_uninit(IBVSocket* _this)
{
   /* intentionally empty: the stub init never sets anything up, so there is nothing to undo */
}
/**
 * Stub for builds without BEEGFS_RDMA: reports that no RDMA devices exist. Unlike most other
 * stubs, this one does not log a message.
 *
 * @return always false
 */
bool IBVSocket_rdmaDevicesExist(void)
{
   return false; /* no RDMA support compiled in */
}
/**
 * Stub for builds without BEEGFS_RDMA: connecting always fails.
 *
 * @param _this ignored
 * @param ipaddress ignored
 * @param port ignored
 * @param commCfg ignored
 * @return always false (after logging via no_ibvsocket_err() )
 */
bool IBVSocket_connectByIP(IBVSocket* _this, struct in_addr ipaddress,
   unsigned short port, IBVCommConfig* commCfg)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return false;
}
/**
 * Stub for builds without BEEGFS_RDMA: binding always fails.
 *
 * @param _this ignored
 * @param ipAddr ignored
 * @param port ignored
 * @return always false (after logging via no_ibvsocket_err() )
 */
bool IBVSocket_bindToAddr(IBVSocket* _this, struct in_addr ipAddr,
   unsigned short port)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return false;
}
/**
 * Stub for builds without BEEGFS_RDMA: listening always fails.
 *
 * @param _this ignored
 * @return always false (after logging via no_ibvsocket_err() )
 */
bool IBVSocket_listen(IBVSocket* _this)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return false;
}
/**
 * Stub for builds without BEEGFS_RDMA: shutdown always reports failure.
 *
 * @param _this ignored
 * @return always false (after logging via no_ibvsocket_err() )
 */
bool IBVSocket_shutdown(IBVSocket* _this)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return false;
}
/**
 * Stub for builds without BEEGFS_RDMA: nothing is ever received.
 *
 * @param _this ignored
 * @param iter ignored (left untouched)
 * @param flags ignored
 * @param timeoutMS ignored
 * @return always -1 (after logging via no_ibvsocket_err() )
 */
ssize_t IBVSocket_recvT(IBVSocket* _this, struct iov_iter* iter, int flags,
   int timeoutMS)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return -1;
}
/**
 * Stub for builds without BEEGFS_RDMA: nothing is ever sent.
 *
 * @param _this ignored
 * @param iter ignored (left untouched)
 * @param flags ignored
 * @return always -1 (after logging via no_ibvsocket_err() )
 */
ssize_t IBVSocket_send(IBVSocket* _this, struct iov_iter* iter, int flags)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return -1;
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @param _this ignored
 * @return 0 on success, -1 on error; this stub always logs via no_ibvsocket_err() and
 *    returns -1
 */
int IBVSocket_checkConnection(IBVSocket* _this)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return -1;
}
/**
 * Stub for builds without BEEGFS_RDMA.
 *
 * @param _this ignored
 * @param events ignored
 * @param finishPoll ignored
 * @return always a mask with every bit set (~0 converted to unsigned long), after logging via
 *    no_ibvsocket_err()
 */
unsigned long IBVSocket_poll(IBVSocket* _this, short events, bool finishPoll)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return ~0;
}
/**
 * Stub for builds without BEEGFS_RDMA: there is no rkey to return.
 *
 * @param _this ignored
 * @return always a value with every bit set (~0 converted to unsigned), after logging via
 *    no_ibvsocket_err()
 */
unsigned IBVSocket_getRkey(IBVSocket* _this)
{
   no_ibvsocket_err(); // this stub is not expected to be reached

   return ~0;
}
/**
 * Stub for builds without BEEGFS_RDMA: there is no device. Does not log a message.
 *
 * @param _this ignored
 * @return always NULL
 */
struct ib_device* IBVSocket_getDevice(IBVSocket* _this)
{
   return NULL; /* no RDMA support compiled in */
}
/**
 * Stub for builds without BEEGFS_RDMA: all arguments are ignored and nothing is stored.
 * Does not log a message.
 */
void IBVSocket_setTimeouts(IBVSocket* _this, int connectMS, int completionMS,
   int flowSendMS, int flowRecvMS, int pollMS)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA: the value is ignored and nothing is stored.
 * Does not log a message.
 */
void IBVSocket_setTypeOfService(IBVSocket* _this, int typeOfService)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA: the value is ignored and nothing is stored.
 * Does not log a message.
 */
void IBVSocket_setConnectionFailureStatus(IBVSocket* _this, unsigned value)
{
   /* intentionally empty */
}
/**
 * Stub for builds without BEEGFS_RDMA: there is no source address. Does not log a message.
 *
 * @param _this ignored
 * @return an in_addr whose s_addr has every bit set
 */
struct in_addr IBVSocket_getSrcIpAddr(IBVSocket* _this)
{
   struct in_addr allOnesAddr = { .s_addr = ~0 };

   return allOnesAddr;
}
/**
 * Stub for builds without BEEGFS_RDMA: there are no NIC stats. Does not log a message.
 *
 * @param _this ignored
 * @return always NULL
 */
NicAddressStats* IBVSocket_getNicStats(IBVSocket* _this)
{
   return NULL; /* no RDMA support compiled in */
}
#endif