#ifdef BEEGFS_NVFS
#include "linux/uio.h"
#include "linux/pagemap.h"
#include "linux/kernel.h"
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <common/net/sock/RDMASocket.h>
#include "Nvfs.h"
#include "RdmaInfo.h"
//
// These macros convert a scatterlist entry into a base address (ba) and limit address (la)
// and vice-versa. From this information, we can combine scatterlist entries which are DMA
// contiguous.
//
#define sg_to_ba_la(SG, BA, LA) \
do \
{ \
BA = sg_dma_address(SG); \
LA = BA + sg_dma_len(SG); \
} while (0)
#define ba_la_to_sg(SG, BA, LA) \
do \
{ \
sg_dma_address(SG) = BA; \
sg_dma_len(SG) = LA - BA; \
SG->length = LA - BA; \
} while (0)
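//
// Illustrative example (addresses made up): two entries with DMA ranges
// [0x1000, 0x2000) and [0x2000, 0x3000) are contiguous, so RdmaInfo_coalesceSglist
// below can merge them into a single entry covering [0x1000, 0x3000).
//

/*
 * RdmaInfo_acquireNVFS - Take a reference on the nvidia_fs driver ops table.
 * @returns true if the ops table is available and was acquired, false otherwise
 */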
bool RdmaInfo_acquireNVFS(void)
{
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s\n", __func__);
#endif // BEEGFS_DEBUG
return nvfs_get_ops();
}
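/*
 * RdmaInfo_releaseNVFS - Drop the reference on the nvidia_fs driver ops table
 * taken by RdmaInfo_acquireNVFS.
 */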
void RdmaInfo_releaseNVFS(void)
{
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s\n", __func__);
#endif // BEEGFS_DEBUG
nvfs_put_ops();
}
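/*
 * RdmaInfo_detectNVFSRequest - Determine whether an I/O request targets GPU memory.
 * @dpctx: device priority context; gpuIndex is set when the request is a GPU request
 * @iter: iov_iter describing the request's buffers
 * @returns 1 (true) if the first page of the request is GPU memory, 0 (false) if it
 * is not, or a negative errno if no page could be retrieved from the iov_iter
 */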
int RdmaInfo_detectNVFSRequest(DevicePriorityContext* dpctx,
const struct iov_iter *iter)
{
struct page *page = NULL;
struct iov_iter iter_copy = *iter;
size_t page_offset = 0;
int status = 0;
bool is_gpu = false;
// Test the first page of the request to determine the memory type.
status = iov_iter_get_pages(&iter_copy, &page, PAGE_SIZE, 1, &page_offset);
if (unlikely(status <= 0))
{
// 0 means the iter is empty, so just indicate that it's not an NVFS call.
// Otherwise, indicate an error condition.
if (status < 0)
printk_fhgfs(KERN_WARNING, "%s: can't retrieve page from iov_iter, status=%d\n",
__func__, status);
return status == 0 ? false : status;
}
// At this point, the request did come in through nvidia_fs.
// nvfs_is_gpu_page() will return false if RDMA write support
// is disabled in user space.
// TODO: if a GPU page, keep the retrieved page for a future
// RDMA map operation instead of calling put_page()
if (nvfs_ops->nvfs_is_gpu_page(page))
{
is_gpu = true;
dpctx->gpuIndex = nvfs_ops->nvfs_gpu_index(page);
}
put_page(page);
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s:%d: page=%p is_gpu=%d gpu_index=%d\n",
__func__, __LINE__,
page, is_gpu, dpctx ? dpctx->gpuIndex : -2);
#endif
return is_gpu;
}
/*
* RdmaInfo_putPages - Drop the page references held by the scatter/gather entries.
* @sglist: The array of scatter/gather entries
* @count: The count of entries to process
*/
static inline void RdmaInfo_putPages(struct scatterlist *sglist, int count)
{
int i = 0;
struct scatterlist *sgp = NULL;
if ((sglist != NULL) && (count > 0))
{
for (i = 0, sgp = sglist; i < count; i++, sgp++)
{
put_page(sg_page(sgp));
}
}
}
/*
* RdmaInfo_iovToSglist - Map an iov_iter to scatter/gather list
* @iter: iov_iter
* @sglist: The array of scatter/gather entries (needs to be big enough for all pages)
* @returns number of sg entries set up for the iov_iter, or a negative errno on failure
*/
static int RdmaInfo_iovToSglist(const struct iov_iter *iter,
struct scatterlist *sglist)
{
struct page **pages = NULL;
struct scatterlist *sg = sglist;
struct scatterlist *sg_prev = NULL;
struct iov_iter iter_copy = *iter;
int sg_count = 0;
size_t page_length = 0;
size_t page_offset = 0;
size_t bytes = 0;
ssize_t result = 0;
unsigned i = 0;
unsigned npages = 0;
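// iov_iter_get_pages_alloc() may pin fewer bytes than requested, so keep looping
// until the whole iov_iter has been consumed, appending entries to the scatterlist.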
while (iov_iter_count(&iter_copy) > 0)
{
result = iov_iter_get_pages_alloc(&iter_copy, &pages, iov_iter_count(&iter_copy), &page_offset);
if (result < 0)
{
printk_fhgfs(KERN_ERR, "RdmaInfo_iovToSglist: no memory pages\n");
RdmaInfo_putPages(sglist, sg_count);
return -ENOMEM;
}
bytes = result;
npages = (bytes + page_offset + PAGE_SIZE - 1) / PAGE_SIZE;
sg_count += npages;
for (i = 0; i < npages; i++, sg = sg_next(sg))
{
page_length = min(bytes, PAGE_SIZE - page_offset);
sg_set_page(sg, pages[i], page_length, page_offset);
bytes -= page_length;
page_offset = 0;
sg_prev = sg;
}
kvfree(pages);
iov_iter_advance(&iter_copy, result);
}
if (sg_prev)
{
sg_mark_end(sg_prev);
}
return sg_count;
}
/*
* RdmaInfo_coalesceSglist - Coalesce scatterlist entries for optimal RDMA operations.
* @sglist: input list (not necessarily coalesced)
* @dmalist: output list (coalesced)
* @count: Number of scatterlist entries
* @returns number of entries in the coalesced list
*/
static int RdmaInfo_coalesceSglist(struct scatterlist *sglist,
struct scatterlist *dmalist, int count)
{
struct scatterlist *sgp = sglist;
struct scatterlist *dmap = dmalist;
dma_addr_t dma_ba = 0, dma_la = 0;
dma_addr_t sg_ba = 0, sg_la = 0;
int i = 0;
#ifdef BEEGFS_DEBUG
size_t len = sg_dma_len(sgp);
#endif
//
// Load the first range.
//
sg_to_ba_la(sgp, dma_ba, dma_la);
if (count > 1)
{
for_each_sg(&sglist[1], sgp, count-1, i)
{
#ifdef BEEGFS_DEBUG
len += sg_dma_len(sgp);
#endif
sg_to_ba_la(sgp, sg_ba, sg_la);
//
// If the regions aren't contiguous, then set the current
// range and start a new range. Otherwise, add on to the
// current range.
//
if (dma_la != sg_ba)
{
ba_la_to_sg(dmap, dma_ba, dma_la);
sg_unmark_end(dmap);
dmap = sg_next(dmap);
dma_ba = sg_ba;
dma_la = sg_la;
}
else
{
dma_la = sg_la;
}
}
}
//
// Set the last range.
//
ba_la_to_sg(dmap, dma_ba, dma_la);
sg_mark_end(dmap);
#ifdef BEEGFS_DEBUG
printk_fhgfs(KERN_INFO, "%s len=%zu count=%d return=%d\n", __func__, len, count, (int)(1 + dmap - dmalist));
#endif
return 1 + dmap - dmalist;
}
/*
* RdmaInfo_map - Map GPU buffers for RDMA operations.
* @iter: iov_iter
* @socket: RDMA capable socket struct.
* @dma_dir: read (DMA_FROM_DEVICE) or write (DMA_TO_DEVICE)
* @returns RdmaInfo struct
*/
static RdmaInfo * RdmaInfo_map(const struct iov_iter *iter, Socket *socket,
enum dma_data_direction dma_dir)
{
RdmaInfo *rdmap;
RDMASocket *rs;
struct ib_device *device;
struct scatterlist *sglist;
struct scatterlist *dmalist;
int status = 0;
int sg_count;
int dma_count;
int count;
unsigned npages;
unsigned key;
if (Socket_getSockType(socket) != NICADDRTYPE_RDMA)
return ERR_PTR(-EINVAL);
rs = (RDMASocket*) socket;
if (!RDMASocket_isRkeyGlobal(rs))
{
printk_fhgfs(KERN_ERR, "ERROR: rkey type is not compatible with GDS\n");
return ERR_PTR(-EINVAL);
}
npages = 1 + iov_iter_npages(iter, INT_MAX);
//
// Allocate the scatterlist.
//
rdmap = kzalloc(sizeof(RdmaInfo), GFP_ATOMIC);
sglist = kzalloc(npages * sizeof(struct scatterlist), GFP_ATOMIC);
dmalist = kzalloc(npages * sizeof(struct scatterlist), GFP_ATOMIC);
if (unlikely(!rdmap || !sglist || !dmalist))
{
printk_fhgfs(KERN_ERR, "%s: no memory for scatterlist\n", __func__);
status = -ENOMEM;
goto error_return;
}
//
// Populate the scatterlist from the iov_iter.
//
sg_count = RdmaInfo_iovToSglist(iter, sglist);
if (unlikely(sg_count < 0))
{
printk_fhgfs(KERN_ERR, "%s: can't convert iov_iter to scatterlist\n", __func__);
status = -EIO;
goto error_return;
}
//
// DMA map all of the pages.
//
device = RDMASocket_getDevice(rs);
key = RDMASocket_getRkey(rs);
count = nvfs_ops->nvfs_dma_map_sg_attrs(device->dma_device, sglist, sg_count,
dma_dir, DMA_ATTR_NO_WARN);
if (unlikely(count != sg_count))
{
if (count == NVFS_CPU_REQ)
{
printk_fhgfs(KERN_ERR, "%s: NVFS_CPU_REQ\n", __func__);
status = 0;
}
else if (count == NVFS_IO_ERR)
{
printk_fhgfs(KERN_ERR, "%s: can't DMA map mixed CPU/GPU pages\n", __func__);
status = -EINVAL;
}
else
{
printk_fhgfs(KERN_ERR, "%s: unknown error returned from NVFS (%d)\n", __func__, count);
status = -EIO;
}
goto error_return;
}
//
// Coalesce the scatterlist.
//
dma_count = RdmaInfo_coalesceSglist(sglist, dmalist, count);
if (unlikely(dma_count > RDMA_MAX_DMA_COUNT))
{
printk_fhgfs(KERN_ERR, "%s: too many DMA elements count=%d max=%d\n", __func__,
dma_count, RDMA_MAX_DMA_COUNT);
// Undo the NVFS DMA mapping before the error path releases the page references.
nvfs_ops->nvfs_dma_unmap_sg(device->dma_device, sglist, sg_count, dma_dir);
status = -EIO;
goto error_return;
}
//
// Fill in the rdma info.
//
rdmap->dma_count = dma_count;
rdmap->sg_count = sg_count;
rdmap->tag = 0x00000000;
rdmap->device = device;
rdmap->key = key;
rdmap->sglist = sglist;
rdmap->dmalist = dmalist;
#ifdef BEEGFS_DEBUG_RDMA
RdmaInfo_dumpIovIter(iter);
RdmaInfo_dumpSgtable("MAP", rdmap->dmalist, rdmap->dma_count);
RdmaInfo_dumpRdmaInfo(rdmap);
#endif /* BEEGFS_DEBUG_RDMA */
return rdmap;
error_return:
if (sglist)
{
RdmaInfo_putPages(sglist, sg_count);
kfree(sglist);
}
if (dmalist)
kfree(dmalist);
if (rdmap)
kfree(rdmap);
return (status == 0) ? NULL : ERR_PTR(status);
}
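/*
 * RdmaInfo_mapRead - Map GPU buffers that will receive data from the server
 * (DMA_FROM_DEVICE).
 */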
RdmaInfo* RdmaInfo_mapRead(const struct iov_iter *iter, Socket *socket)
{
return RdmaInfo_map(iter, socket, DMA_FROM_DEVICE);
}
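/*
 * RdmaInfo_mapWrite - Map GPU buffers that supply data sent to the server
 * (DMA_TO_DEVICE).
 */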
RdmaInfo* RdmaInfo_mapWrite(const struct iov_iter *iter, Socket *socket)
{
return RdmaInfo_map(iter, socket, DMA_TO_DEVICE);
}
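/*
 * Illustrative call sequence for a write (a sketch, not taken from a specific
 * caller; dpctx, iter and sock stand for the caller's own context):
 *
 *    RdmaInfo *rdmap;
 *    if (RdmaInfo_detectNVFSRequest(&dpctx, iter) > 0)
 *    {
 *       rdmap = RdmaInfo_mapWrite(iter, sock);
 *       if (!IS_ERR_OR_NULL(rdmap))
 *       {
 *          // ... post RDMA work requests using rdmap->dmalist and rdmap->key ...
 *          RdmaInfo_unmapWrite(rdmap);
 *       }
 *    }
 */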
/*
* RdmaInfo_unmap - Unmap GPU buffers for RDMA operations.
* @rdmap: RdmaInfo created by RdmaInfo_map (see above)
* @dma_dir: read (DMA_FROM_DEVICE) or write (DMA_TO_DEVICE)
*/
static inline void RdmaInfo_unmap(RdmaInfo *rdmap, enum dma_data_direction dma_dir)
{
if (rdmap->sglist)
{
if (rdmap->dmalist)
{
nvfs_ops->nvfs_dma_unmap_sg(rdmap->device->dma_device, rdmap->sglist, rdmap->sg_count, dma_dir);
RdmaInfo_putPages(rdmap->sglist, rdmap->sg_count);
kfree(rdmap->dmalist);
}
kfree(rdmap->sglist);
}
kfree(rdmap);
}
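/*
 * RdmaInfo_unmapRead - Release a mapping created by RdmaInfo_mapRead.
 */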
void RdmaInfo_unmapRead(RdmaInfo *rdmap)
{
RdmaInfo_unmap(rdmap, DMA_FROM_DEVICE);
}
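/*
 * RdmaInfo_unmapWrite - Release a mapping created by RdmaInfo_mapWrite.
 */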
void RdmaInfo_unmapWrite(RdmaInfo *rdmap)
{
RdmaInfo_unmap(rdmap, DMA_TO_DEVICE);
}
#endif /* BEEGFS_NVFS */