#ifdef BEEGFS_NVFS

#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>

#include <common/net/sock/RDMASocket.h>

#include "Nvfs.h"
#include "RdmaInfo.h"

//
// These macros convert a scatterlist entry into a base address (ba) and limit
// address (la) and vice-versa. From this information, we can combine
// scatterlist entries which are DMA contiguous.
//
#define sg_to_ba_la(SG, BA, LA) \
   do \
   { \
      BA = sg_dma_address(SG); \
      LA = BA + sg_dma_len(SG); \
   } while (0)

#define ba_la_to_sg(SG, BA, LA) \
   do \
   { \
      sg_dma_address(SG) = BA; \
      sg_dma_len(SG) = LA - BA; \
      SG->length = LA - BA; \
   } while (0)
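
//
// Example: an entry with ba=0x10000 and len=0x1000 ends at la=0x11000. If the
// next entry starts at ba=0x11000, the two are DMA contiguous and collapse
// into a single entry with ba=0x10000, la=0x12000 (len=0x2000).
//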

bool RdmaInfo_acquireNVFS(void)
{
#ifdef BEEGFS_DEBUG
   printk_fhgfs(KERN_INFO, "%s\n", __func__);
#endif // BEEGFS_DEBUG
   return nvfs_get_ops();
}

void RdmaInfo_releaseNVFS(void)
{
#ifdef BEEGFS_DEBUG
   printk_fhgfs(KERN_INFO, "%s\n", __func__);
#endif // BEEGFS_DEBUG
   nvfs_put_ops();
}
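
/*
 * RdmaInfo_detectNVFSRequest - Determine whether an I/O request targets GPU memory.
 * @dpctx: device priority context; gpuIndex is filled in for GPU requests
 * @iter: iov_iter describing the request buffer
 * @returns true (1) if the first page is a GPU page, false (0) if not,
 *    negative errno on failure
 */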
int RdmaInfo_detectNVFSRequest(DevicePriorityContext* dpctx,
   const struct iov_iter *iter)
{
   struct page *page = NULL;
   struct iov_iter iter_copy = *iter;
   size_t page_offset = 0;
   int status = 0;
   bool is_gpu = false;

   // Test the first page of the request to determine the memory type.
   status = iov_iter_get_pages(&iter_copy, &page, PAGE_SIZE, 1, &page_offset);
   if (unlikely(status <= 0))
   {
      // 0 means the iter is empty, so just indicate that it's not an NVFS call.
      // Otherwise, indicate an error condition.
      if (status < 0)
         printk_fhgfs(KERN_WARNING, "%s: can't retrieve page from iov_iter, status=%d\n",
            __func__, status);
      return status == 0 ? false : status;
   }

   // At this point, the request did come in through nvidia_fs.
   // nvfs_is_gpu_page() will return false if RDMA write support
   // is disabled in user space.
   // TODO: if a GPU page, keep the retrieved page for a future
   // RDMA map operation instead of calling put_page()
   if (nvfs_ops->nvfs_is_gpu_page(page))
   {
      is_gpu = true;
      dpctx->gpuIndex = nvfs_ops->nvfs_gpu_index(page);
   }
   put_page(page);
#ifdef BEEGFS_DEBUG
   printk_fhgfs(KERN_INFO, "%s:%d: page=%p is_gpu=%d gpu_index=%d\n",
      __func__, __LINE__,
      page, is_gpu, dpctx ? dpctx->gpuIndex : -2);
#endif
   return is_gpu;
}

/*
 * RdmaInfo_putPages - Put the pages back into the free list.
 * @sglist: The array of scatter/gather entries
 * @count: The count of entries to process
 */
static inline void RdmaInfo_putPages(struct scatterlist *sglist, int count)
{
   int i = 0;
   struct scatterlist *sgp = NULL;

   if ((sglist != NULL) && (count > 0))
   {
      for (i = 0, sgp = sglist; i < count; i++, sgp++)
      {
         put_page(sg_page(sgp));
      }
   }
}

/*
 * RdmaInfo_iovToSglist - Map an iov_iter to a scatter/gather list
 * @iter: iov_iter
 * @sglist: The array of scatter/gather entries (needs to be big enough for all pages)
 * @returns number of sg entries set up for the iov_iter, negative errno on failure
 *
 * The pages referenced by the returned entries are pinned; the caller must
 * release them with RdmaInfo_putPages() once they are no longer needed.
 */
static int RdmaInfo_iovToSglist(const struct iov_iter *iter,
   struct scatterlist *sglist)
{
   struct page **pages = NULL;
   struct scatterlist *sg = sglist;
   struct scatterlist *sg_prev = NULL;
   struct iov_iter iter_copy = *iter;
   int sg_count = 0;
   size_t page_length = 0;
   size_t page_offset = 0;
   size_t bytes = 0;
   ssize_t result = 0;
   unsigned i = 0;
   unsigned npages = 0;

   while (iov_iter_count(&iter_copy) > 0)
   {
      result = iov_iter_get_pages_alloc(&iter_copy, &pages, iov_iter_count(&iter_copy), &page_offset);
      if (result < 0)
      {
         printk_fhgfs(KERN_ERR, "RdmaInfo_iovToSglist: no memory pages\n");
         RdmaInfo_putPages(sglist, sg_count);
         return -ENOMEM;
      }

      bytes = result;
      npages = (bytes + page_offset + PAGE_SIZE - 1) / PAGE_SIZE;
      sg_count += npages;

      // Continue filling the list where the previous chunk left off; only the
      // first page of each chunk can have a nonzero offset.
      for (i = 0; i < npages; i++, sg = sg_next(sg))
      {
         page_length = min(bytes, PAGE_SIZE - page_offset);
         sg_set_page(sg, pages[i], page_length, page_offset);

         bytes -= page_length;
         page_offset = 0;
         sg_prev = sg;
      }

      kvfree(pages);
      iov_iter_advance(&iter_copy, result);
   }

   if (sg_prev)
   {
      sg_mark_end(sg_prev);
   }
   return sg_count;
}

/*
 * RdmaInfo_coalesceSglist - Coalesce scatterlist entries for optimal RDMA operations.
 * @sglist: input list (not necessarily coalesced)
 * @dmalist: output list (coalesced); must have room for at least @count entries
 * @count: Number of scatterlist entries
 * @returns count of the coalesced list
 */
static int RdmaInfo_coalesceSglist(struct scatterlist *sglist,
   struct scatterlist *dmalist, int count)
{
   struct scatterlist *sgp = sglist;
   struct scatterlist *dmap = dmalist;
   dma_addr_t dma_ba = 0, dma_la = 0;
   dma_addr_t sg_ba = 0, sg_la = 0;
   int i = 0;
#ifdef BEEGFS_DEBUG
   size_t len = sg_dma_len(sgp);
#endif

   //
   // Load the first range.
   //
   sg_to_ba_la(sgp, dma_ba, dma_la);

   if (count > 1)
   {
      for_each_sg(&sglist[1], sgp, count - 1, i)
      {
#ifdef BEEGFS_DEBUG
         len += sg_dma_len(sgp);
#endif
         sg_to_ba_la(sgp, sg_ba, sg_la);

         //
         // If the regions aren't contiguous, then set the current
         // range and start a new range. Otherwise, add on to the
         // current range.
         //
         if (dma_la != sg_ba)
         {
            ba_la_to_sg(dmap, dma_ba, dma_la);
            sg_unmark_end(dmap);
            dmap = sg_next(dmap);

            dma_ba = sg_ba;
            dma_la = sg_la;
         }
         else
         {
            dma_la = sg_la;
         }
      }
   }
   //
   // Set the last range.
   //
   ba_la_to_sg(dmap, dma_ba, dma_la);
   sg_mark_end(dmap);

#ifdef BEEGFS_DEBUG
   printk_fhgfs(KERN_INFO, "%s len=%zu count=%d return=%d\n", __func__, len, count, (int)(1 + dmap - dmalist));
#endif
   return 1 + dmap - dmalist;
}

/*
 * RdmaInfo_map - Map GPU buffers for RDMA operations.
 * @iter: iov_iter
 * @socket: RDMA capable socket struct.
 * @dma_dir: read (DMA_FROM_DEVICE) or write (DMA_TO_DEVICE)
 * @returns RdmaInfo struct, NULL if the buffer is CPU memory, or an ERR_PTR on failure
 */
static RdmaInfo* RdmaInfo_map(const struct iov_iter *iter, Socket *socket,
   enum dma_data_direction dma_dir)
{
   RdmaInfo *rdmap;
   RDMASocket *rs;
   struct ib_device *device;
   struct scatterlist *sglist;
   struct scatterlist *dmalist;
   int status = 0;
   int sg_count = 0; // initialized: read on the error path before any entries exist
   int dma_count;
   int count;
   unsigned npages;
   unsigned key;

   if (Socket_getSockType(socket) != NICADDRTYPE_RDMA)
      return ERR_PTR(-EINVAL);

   rs = (RDMASocket*) socket;
   if (!RDMASocket_isRkeyGlobal(rs))
   {
      printk_fhgfs(KERN_ERR, "ERROR: rkey type is not compatible with GDS\n");
      return ERR_PTR(-EINVAL);
   }

   npages = 1 + iov_iter_npages(iter, INT_MAX);

   //
   // Allocate the scatterlists.
   //
   rdmap = kzalloc(sizeof(RdmaInfo), GFP_ATOMIC);
   sglist = kzalloc(npages * sizeof(struct scatterlist), GFP_ATOMIC);
   dmalist = kzalloc(npages * sizeof(struct scatterlist), GFP_ATOMIC);
   if (unlikely(!rdmap || !sglist || !dmalist))
   {
      printk_fhgfs(KERN_ERR, "%s: no memory for scatterlist\n", __func__);
      status = -ENOMEM;
      goto error_return;
   }

   //
   // Populate the scatterlist from the iov_iter.
   //
   sg_count = RdmaInfo_iovToSglist(iter, sglist);
   if (unlikely(sg_count < 0))
   {
      printk_fhgfs(KERN_ERR, "%s: can't convert iov_iter to scatterlist\n", __func__);
      status = -EIO;
      goto error_return;
   }

   //
   // DMA map all of the pages.
   //
   device = RDMASocket_getDevice(rs);
   key = RDMASocket_getRkey(rs);

   count = nvfs_ops->nvfs_dma_map_sg_attrs(device->dma_device, sglist, sg_count,
      dma_dir, DMA_ATTR_NO_WARN);
   if (unlikely(count != sg_count))
   {
      if (count == NVFS_CPU_REQ)
      {
         printk_fhgfs(KERN_ERR, "%s: NVFS_CPU_REQ\n", __func__);
         status = 0;
      }
      else if (count == NVFS_IO_ERR)
      {
         printk_fhgfs(KERN_ERR, "%s: can't DMA map mixed CPU/GPU pages\n", __func__);
         status = -EINVAL;
      }
      else
      {
         printk_fhgfs(KERN_ERR, "%s: unknown error returned from NVFS (%d)\n", __func__, count);
         status = -EIO;
      }
      goto error_return;
   }

   //
   // Coalesce the scatterlist.
   //
   dma_count = RdmaInfo_coalesceSglist(sglist, dmalist, count);
   if (unlikely(dma_count > RDMA_MAX_DMA_COUNT))
   {
      printk_fhgfs(KERN_ERR, "%s: too many DMA elements count=%d max=%d\n", __func__,
         dma_count, RDMA_MAX_DMA_COUNT);
      // Undo the DMA mapping before the pages are released below.
      nvfs_ops->nvfs_dma_unmap_sg(device->dma_device, sglist, sg_count, dma_dir);
      status = -EIO;
      goto error_return;
   }

   //
   // Fill in the rdma info.
   //
   rdmap->dma_count = dma_count;
   rdmap->sg_count = sg_count;
   rdmap->tag = 0x00000000;
   rdmap->device = device;
   rdmap->key = key;
   rdmap->sglist = sglist;
   rdmap->dmalist = dmalist;

#ifdef BEEGFS_DEBUG_RDMA
   RdmaInfo_dumpIovIter(iter);
   RdmaInfo_dumpSgtable("MAP", rdmap->dmalist, rdmap->dma_count);
   RdmaInfo_dumpRdmaInfo(rdmap);
#endif /* BEEGFS_DEBUG_RDMA */

   return rdmap;

error_return:
   if (sglist)
   {
      RdmaInfo_putPages(sglist, sg_count);
      kfree(sglist);
   }
   kfree(dmalist); // kfree() tolerates NULL
   kfree(rdmap);
   return (status == 0) ? NULL : ERR_PTR(status);
}
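
/*
 * RdmaInfo_mapRead / RdmaInfo_mapWrite - direction-specific wrappers around
 * RdmaInfo_map() for file reads (data DMAed into the buffer) and writes
 * (data DMAed out of the buffer).
 */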
RdmaInfo* RdmaInfo_mapRead(const struct iov_iter *iter, Socket *socket)
{
   return RdmaInfo_map(iter, socket, DMA_FROM_DEVICE);
}

RdmaInfo* RdmaInfo_mapWrite(const struct iov_iter *iter, Socket *socket)
{
   return RdmaInfo_map(iter, socket, DMA_TO_DEVICE);
}
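
/*
 * Typical caller pattern (sketch; caller names are illustrative):
 *
 *    RdmaInfo *rdmap = RdmaInfo_mapRead(iter, socket);
 *    if (IS_ERR(rdmap))
 *       return PTR_ERR(rdmap);     // mapping failed
 *    if (rdmap == NULL)
 *       return doBufferedRead();   // CPU buffer: fall back to the normal path
 *    ... issue the RDMA transfer ...
 *    RdmaInfo_unmapRead(rdmap);
 */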

/*
 * RdmaInfo_unmap - Unmap GPU buffers for RDMA operations.
 * @rdmap: RdmaInfo created by RdmaInfo_map (see above)
 * @dma_dir: read (DMA_FROM_DEVICE) or write (DMA_TO_DEVICE)
 */
static inline void RdmaInfo_unmap(RdmaInfo *rdmap, enum dma_data_direction dma_dir)
{
   if (rdmap->sglist)
   {
      if (rdmap->dmalist)
      {
         nvfs_ops->nvfs_dma_unmap_sg(rdmap->device->dma_device, rdmap->sglist, rdmap->sg_count, dma_dir);
         RdmaInfo_putPages(rdmap->sglist, rdmap->sg_count);
         kfree(rdmap->dmalist);
      }
      kfree(rdmap->sglist);
   }
   kfree(rdmap);
}
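
/*
 * Direction-specific unmap wrappers; each must match the corresponding
 * RdmaInfo_mapRead()/RdmaInfo_mapWrite() call.
 */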
void RdmaInfo_unmapRead(RdmaInfo *rdmap)
{
   RdmaInfo_unmap(rdmap, DMA_FROM_DEVICE);
}

void RdmaInfo_unmapWrite(RdmaInfo *rdmap)
{
   RdmaInfo_unmap(rdmap, DMA_TO_DEVICE);
}

#endif /* BEEGFS_NVFS */