#include <app/App.h>
|
|
#include <app/log/Logger.h>
|
|
#include <linux/pagemap.h>
|
|
#include "FhgfsOpsFileNative.h"
|
|
#include "FhgfsOpsFile.h"
|
|
#include "FhgfsOpsHelper.h"
|
|
#include "FhgfsOpsIoctl.h"
|
|
#include <net/filesystem/FhgfsOpsRemoting.h>
|
|
#include <fault-inject/fault-inject.h>
|
|
#include <common/toolkit/SynchronizedCounter.h>
|
|
#include <os/OsCompat.h>
|
|
#include <linux/aio.h>
|
|
#include <linux/kref.h>
|
|
|
|
static int writepages_init(void);
|
|
static void writepages_release(void);
|
|
|
|
static void readpages_init(void);
|
|
|
|
static ssize_t __beegfs_direct_IO(int rw, struct kiocb* iocb, struct iov_iter* iter, loff_t offset);
|
|
|
|
static struct workqueue_struct* remoting_io_queue;
|
|
|
|
bool beegfs_native_init()
|
|
{
|
|
if(writepages_init() < 0)
|
|
return false;
|
|
|
|
readpages_init();
|
|
|
|
#ifndef KERNEL_HAS_ALLOC_WORKQUEUE
|
|
remoting_io_queue = create_workqueue("beegfs/flush");
|
|
#elif defined(KERNEL_HAS_WQ_RESCUER)
|
|
// WQ_RESCUER and WQ_MEM_RECLAIM are effectively the same thing: both guarantee that at
// least one worker thread is always available to run queued work items.
|
|
remoting_io_queue = alloc_workqueue("beegfs/flush", WQ_RESCUER, num_online_cpus());
|
|
#else
|
|
remoting_io_queue = alloc_workqueue("beegfs/flush", WQ_MEM_RECLAIM, num_online_cpus());
|
|
#endif
|
|
|
|
if(!remoting_io_queue)
|
|
goto fail_queue;
|
|
|
|
return true;
|
|
|
|
fail_queue:
|
|
writepages_release();
|
|
return false;
|
|
}
|
|
|
|
void beegfs_native_release()
|
|
{
|
|
writepages_release();
|
|
|
|
if(remoting_io_queue)
|
|
destroy_workqueue(remoting_io_queue);
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
* PVRs and ARDs use the Private and Checked bits of pages to determine which is attached to a
|
|
* page. Once we support only kernels that have the Private2 bit, we should use Private2 instead
|
|
* of Checked.
|
|
* Note: this is to avoid problems with 64k pages on 32 bit machines, otherwise we could use
|
|
* low-order bits of page->private to discriminate.
|
|
*/
|
|
|
|
enum
|
|
{
|
|
PVR_FIRST_SHIFT = 0,
|
|
PVR_FIRST_MASK = ~PAGE_MASK << PVR_FIRST_SHIFT,
|
|
|
|
PVR_LAST_SHIFT = PAGE_SHIFT,
|
|
PVR_LAST_MASK = ~PAGE_MASK << PVR_LAST_SHIFT,
|
|
};
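/* Example layout (assuming PAGE_SHIFT == 12, i.e. 4 KiB pages): a page whose bytes 5..100 are
 * dirty stores first == 5 in bits 0..11 of page->private and last == 100 in bits 12..23,
 * i.e. private == (100 << 12) | 5. */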
|
|
|
|
static void pvr_init(struct page* page)
|
|
{
|
|
// pvr values *must* fit into the unsigned long private of struct page
|
|
BUILD_BUG_ON(PVR_LAST_SHIFT + PAGE_SHIFT > 8 * sizeof(unsigned long) );
|
|
|
|
SetPagePrivate(page);
|
|
ClearPageChecked(page);
|
|
page->private = 0;
|
|
}
|
|
|
|
static bool pvr_present(struct page* page)
|
|
{
|
|
return PagePrivate(page) && !PageChecked(page);
|
|
}
|
|
|
|
static void pvr_clear(struct page* page)
|
|
{
|
|
ClearPagePrivate(page);
|
|
}
|
|
|
|
static unsigned pvr_get_first(struct page* page)
|
|
{
|
|
return (page->private & PVR_FIRST_MASK) >> PVR_FIRST_SHIFT;
|
|
}
|
|
|
|
static unsigned pvr_get_last(struct page* page)
|
|
{
|
|
return (page->private & PVR_LAST_MASK) >> PVR_LAST_SHIFT;
|
|
}
|
|
|
|
static void pvr_set_first(struct page* page, unsigned first)
|
|
{
|
|
page->private &= ~PVR_FIRST_MASK;
|
|
page->private |= (first << PVR_FIRST_SHIFT) & PVR_FIRST_MASK;
|
|
}
|
|
|
|
static void pvr_set_last(struct page* page, unsigned last)
|
|
{
|
|
page->private &= ~PVR_LAST_MASK;
|
|
page->private |= (last << PVR_LAST_SHIFT) & PVR_LAST_MASK;
|
|
}
|
|
|
|
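/* A new byte range [first, last] may be merged into the existing PVR only if the two ranges
 * touch or overlap; this keeps the tracked dirty region a single contiguous range. */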
static bool pvr_can_merge(struct page* page, unsigned first, unsigned last)
|
|
{
|
|
unsigned oldFirst = pvr_get_first(page);
|
|
unsigned oldLast = pvr_get_last(page);
|
|
|
|
if(first == oldLast + 1 || last + 1 == oldFirst)
|
|
return true;
|
|
|
|
if(oldFirst <= first && first <= oldLast)
|
|
return true;
|
|
|
|
if(oldFirst <= last && last <= oldLast)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static void pvr_merge(struct page* page, unsigned first, unsigned last)
|
|
{
|
|
if(pvr_get_first(page) > first)
|
|
pvr_set_first(page, first);
|
|
|
|
if(pvr_get_last(page) < last)
|
|
pvr_set_last(page, last);
|
|
}
|
|
|
|
|
|
static void beegfs_drop_all_caches(struct inode* inode)
|
|
{
|
|
os_inode_lock(inode);
|
|
|
|
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0)
|
|
// kernels since 2.6.32 also provide truncate_pagecache, but before 3.12 it did not behave
// correctly for our purposes, so do the equivalent manually here
|
|
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
|
|
truncate_inode_pages(inode->i_mapping, 0);
|
|
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
|
|
#else
|
|
truncate_pagecache(inode, 0);
|
|
#endif
|
|
|
|
i_size_write(inode, 0);
|
|
|
|
os_inode_unlock(inode);
|
|
}
|
|
|
|
|
|
|
|
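/* Write back any cached dirty data that intersects the byte range [first, last] and wait for
 * completion. The range is expanded to full pages. Returns 0 on success or a negative errno. */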
static int beegfs_release_range(struct file* filp, loff_t first, loff_t last)
|
|
{
|
|
int writeRes;
|
|
|
|
// expand range to fit full pages
|
|
first &= PAGE_MASK;
|
|
last |= ~PAGE_MASK;
|
|
|
|
if (unlikely(last == -1))
|
|
{
|
|
printk_fhgfs(KERN_DEBUG, "range end given was -1");
|
|
last = LLONG_MAX;
|
|
}
|
|
|
|
clear_bit(AS_EIO, &filp->f_mapping->flags);
|
|
|
|
writeRes = file_write_and_wait_range(filp, first, last);
|
|
if(writeRes < 0)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
FhgfsOpsHelper_logOpMsg(3, app, file_dentry(filp), filp->f_mapping->host, __func__,
|
|
"error %i during flush", writeRes);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
return writeRes;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
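/* Prepare a range for access (mmap, locks, open after remote changes): flush it, refresh the
 * inode attributes from the metadata server and invalidate the affected page-cache pages, so
 * that subsequent accesses observe remote modifications. */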
static int beegfs_acquire_range(struct file* filp, loff_t first, loff_t last)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
FhgfsIsizeHints iSizeHints;
|
|
int err;
|
|
|
|
// expand range to fit full pages
|
|
first &= PAGE_MASK;
|
|
last |= ~PAGE_MASK;
|
|
|
|
if (unlikely(last == -1))
{
printk_fhgfs(KERN_DEBUG, "range end given was -1");
/* Since linux-6.2 the byte-offset sanity check (end_byte < start_byte) moved from
 * __filemap_fdatawrite_range() to the file[map]_write_and_wait_range() interface in order to
 * provide consistent behaviour between write and wait, so clamp the end offset to LLONG_MAX. */
last = LLONG_MAX;
}
|
|
|
|
|
|
err = beegfs_release_range(filp, first, last);
|
|
if(err)
|
|
return err;
|
|
|
|
err = __FhgfsOps_refreshInode(app, file_inode(filp), NULL, &iSizeHints);
|
|
if(err)
|
|
return err;
|
|
|
|
err = invalidate_inode_pages2_range(filp->f_mapping, first >> PAGE_SHIFT, last >> PAGE_SHIFT);
|
|
return err;
|
|
}
|
|
|
|
|
|
|
|
static int beegfs_open(struct inode* inode, struct file* filp)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
struct FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
|
|
int err;
|
|
RemotingIOInfo ioInfo;
|
|
uint32_t remoteVersion;
|
|
bool mustAcquire;
|
|
|
|
FhgfsOpsHelper_logOp(5, app, file_dentry(filp), file_inode(filp), __func__);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
err = FhgfsOps_openReferenceHandle(app, inode, filp, filp->f_flags, NULL, &remoteVersion);
|
|
if(err)
|
|
return err;
|
|
|
|
FsFileInfo_getIOInfo(__FhgfsOps_getFileInfo(filp), fhgfsInode, &ioInfo);
|
|
if(ioInfo.pattern->chunkSize % PAGE_SIZE)
|
|
{
|
|
FhgfsOpsHelper_logOpMsg(1, app, file_dentry(filp), inode, __func__,
|
|
"chunk size is not a multiple of PAGE_SIZE!");
|
|
FhgfsOps_release(inode, filp);
|
|
return -EIO;
|
|
}
|
|
|
|
FhgfsInode_entryInfoWriteLock(fhgfsInode);
|
|
{
|
|
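/* The file version is a wrapping counter. Using wrap-around-safe comparison, treat any
 * difference between our cached version and the remote version as "another client changed the
 * file", in which case the cached pages for this inode must be revalidated below. */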
mustAcquire =
|
|
(fhgfsInode->fileVersion > remoteVersion &&
|
|
fhgfsInode->fileVersion - remoteVersion < 0x80000000ULL) ||
|
|
(fhgfsInode->fileVersion < remoteVersion &&
|
|
remoteVersion - fhgfsInode->fileVersion < 0x80000000ULL);
|
|
|
|
fhgfsInode->fileVersion = remoteVersion;
|
|
}
|
|
FhgfsInode_entryInfoWriteUnlock(fhgfsInode);
|
|
|
|
if(filp->f_flags & O_APPEND)
|
|
AtomicInt_inc(&fhgfsInode->appendFDsOpen);
|
|
|
|
if (mustAcquire)
|
|
err = beegfs_acquire_range(filp, 0, LLONG_MAX);
|
|
|
|
return err;
|
|
}
|
|
|
|
static int beegfs_flush(struct file* filp, fl_owner_t id)
|
|
{
|
|
FhgfsInode* inode = BEEGFS_INODE(file_inode(filp));
|
|
Config* cfg = FhgfsOps_getApp(inode->vfs_inode.i_sb)->cfg;
|
|
int result;
|
|
FhgfsOpsErr bumpRes;
|
|
struct FileEvent event = FILEEVENT_EMPTY;
|
|
|
|
IGNORE_UNUSED_VARIABLE(id);
|
|
|
|
if(filp->f_flags & O_APPEND)
|
|
AtomicInt_dec(&BEEGFS_INODE(file_inode(filp) )->appendFDsOpen);
|
|
|
|
/* if the file was not modified, we need not flush the caches. if we do not flush the caches,
|
|
* we also need not bump the file version - which means that other clients can keep their
|
|
* caches. */
|
|
if (atomic_read(&inode->modified) == 0)
|
|
return 0;
|
|
|
|
/* clear the modified bit *before* any data is written out. if the inode data is modified
|
|
* further, the flag *must* be set, even if all modifications are written out by us right
|
|
* here. clearing the flag only after everything has been written out requires exclusion
|
|
* between modifiers and flushers, which is prohibitively expensive. */
|
|
atomic_set(&inode->modified, 0);
|
|
|
|
result = beegfs_release_range(filp, 0, LLONG_MAX);
|
|
if (result < 0)
|
|
{
|
|
atomic_set(&inode->modified, 1);
|
|
return result;
|
|
}
|
|
|
|
if (cfg->eventLogMask & EventLogMask_FLUSH)
|
|
FileEvent_init(&event, FileEventType_FLUSH, file_dentry(filp));
|
|
|
|
FhgfsInode_entryInfoWriteLock(inode);
|
|
|
|
bumpRes = FhgfsOpsRemoting_bumpFileVersion(
|
|
FhgfsOps_getApp(file_dentry(filp)->d_sb),
|
|
&inode->entryInfo,
|
|
true, cfg->eventLogMask & EventLogMask_FLUSH ? &event : NULL);
|
|
|
|
inode->fileVersion += 1;
|
|
|
|
FhgfsInode_entryInfoWriteUnlock(inode);
|
|
|
|
FileEvent_uninit(&event);
|
|
|
|
if (bumpRes != FhgfsOpsErr_SUCCESS)
|
|
atomic_set(&inode->modified, 1);
|
|
|
|
return FhgfsOpsErr_toSysErr(bumpRes);
|
|
}
|
|
|
|
static int beegfs_release(struct inode* inode, struct file* filp)
|
|
{
|
|
int flushRes;
|
|
|
|
// flush the entire contents of the file. if this fails, a previous release operation has
// likely also failed; the only sensible thing to do then is to drop the entire cache, because
// we may be arbitrarily inconsistent with the rest of the world and would never know.
|
|
flushRes = beegfs_release_range(filp, 0, LLONG_MAX);
|
|
|
|
if(flushRes < 0)
|
|
beegfs_drop_all_caches(inode);
|
|
|
|
return FhgfsOps_release(inode, filp);
|
|
}
|
|
|
|
static ssize_t beegfs_file_write_iter(struct kiocb* iocb, struct iov_iter* from)
|
|
{
|
|
return generic_file_write_iter(iocb, from);
|
|
}
|
|
|
|
static ssize_t beegfs_write_iter_direct(struct kiocb* iocb, struct iov_iter* from)
|
|
{
|
|
iocb->ki_flags |= IOCB_DIRECT;
|
|
return generic_file_write_iter(iocb, from);
|
|
}
|
|
|
|
static ssize_t beegfs_write_iter_locked_append(struct kiocb* iocb, struct iov_iter* from)
|
|
{
|
|
struct file* filp = iocb->ki_filp;
|
|
struct inode *inode = file_inode(filp);
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
FhgfsInode *fhgfsInode = BEEGFS_INODE(inode);
|
|
FhgfsIsizeHints iSizeHints;
|
|
RemotingIOInfo ioInfo;
|
|
FhgfsOpsErr ferr;
|
|
ssize_t ret = 0;
|
|
|
|
FsFileInfo_getIOInfo(__FhgfsOps_getFileInfo(filp), fhgfsInode, &ioInfo);
|
|
|
|
Mutex_lock(&fhgfsInode->appendMutex);
|
|
|
|
ferr = FhgfsOpsHelper_getAppendLock(fhgfsInode, &ioInfo);
|
|
if (ferr)
|
|
{
|
|
ret = FhgfsOpsErr_toSysErr(ferr);
|
|
goto out;
|
|
}
|
|
|
|
ret = __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, false);
|
|
|
|
if (! ret)
|
|
ret = beegfs_write_iter_direct(iocb, from);
|
|
|
|
FhgfsOpsHelper_releaseAppendLock(fhgfsInode, &ioInfo);
|
|
|
|
out:
|
|
Mutex_unlock(&fhgfsInode->appendMutex);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static ssize_t beegfs_write_iter(struct kiocb* iocb, struct iov_iter* from)
|
|
{
|
|
|
|
struct file* filp = iocb->ki_filp;
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
|
|
atomic_set(&BEEGFS_INODE(file_inode(filp))->modified, 1);
|
|
|
|
if ((filp->f_flags & O_APPEND)
|
|
&& Config_getTuneUseGlobalAppendLocks(App_getConfig(app)))
|
|
return beegfs_write_iter_locked_append(iocb, from);
|
|
|
|
// Switch to direct (non-buffered) writes when the request is at least as large as the
// configured file cache buffer, or when fault injection forces a cache bypass.
|
|
if (!iov_iter_is_pipe(from)
|
|
&& (from->count >= Config_getTuneFileCacheBufSize(App_getConfig(app))
|
|
|| BEEGFS_SHOULD_FAIL(write_force_cache_bypass, 1)))
|
|
return beegfs_write_iter_direct(iocb, from);
|
|
|
|
return beegfs_file_write_iter(iocb, from);
|
|
}
|
|
|
|
|
|
static ssize_t beegfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to)
|
|
{
|
|
return generic_file_read_iter(iocb, to);
|
|
}
|
|
|
|
|
|
/* like with write_iter, this is basically the O_DIRECT generic_file_read_iter. */
|
|
static ssize_t beegfs_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
{
|
|
struct file* filp = iocb->ki_filp;
|
|
|
|
struct address_space* mapping = filp->f_mapping;
|
|
struct inode* inode = mapping->host;
|
|
size_t count = to->count;
|
|
loff_t size;
|
|
ssize_t result;
|
|
|
|
if(!count)
|
|
return 0; /* skip atime */
|
|
|
|
size = i_size_read(inode);
|
|
result = beegfs_release_range(filp, iocb->ki_pos, iocb->ki_pos + count - 1);
|
|
if(!result)
|
|
{
|
|
struct iov_iter data = *to;
|
|
result = __beegfs_direct_IO(READ, iocb, &data, iocb->ki_pos);
|
|
}
|
|
|
|
if(result > 0)
|
|
iocb->ki_pos += result;
|
|
|
|
if(result < 0 || to->count == result || iocb->ki_pos + result >= size)
|
|
file_accessed(filp);
|
|
|
|
return result;
|
|
}
|
|
|
|
static ssize_t beegfs_read_iter(struct kiocb* iocb, struct iov_iter* to)
|
|
{
|
|
size_t size = to->count;
|
|
struct file* filp = iocb->ki_filp;
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, file_dentry(filp), file_inode(filp), __func__,
|
|
"(offset: %lld; size: %zu)", iocb->ki_pos, size);
|
|
IGNORE_UNUSED_VARIABLE(size);
|
|
|
|
if (to->count >= Config_getTuneFileCacheBufSize(App_getConfig(app))
|
|
|| BEEGFS_SHOULD_FAIL(read_force_cache_bypass, 1))
|
|
return beegfs_direct_read_iter(iocb, to);
|
|
else
|
|
return beegfs_file_read_iter(iocb, to);
|
|
}
|
|
|
|
|
|
static int __beegfs_fsync(struct file* filp, loff_t start, loff_t end, int datasync)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
Config* cfg = App_getConfig(app);
|
|
int err;
|
|
FhgfsInode* fhgfsInode = BEEGFS_INODE(file_inode(filp));
|
|
|
|
FhgfsOpsHelper_logOp(5, app, file_dentry(filp), file_inode(filp), __func__);
|
|
|
|
IGNORE_UNUSED_VARIABLE(start);
|
|
IGNORE_UNUSED_VARIABLE(end);
|
|
IGNORE_UNUSED_VARIABLE(datasync);
|
|
|
|
/* see comment in beegfs_flush for explanation */
|
|
atomic_set(&fhgfsInode->modified, 0);
|
|
|
|
err = beegfs_release_range(filp, start, end);
|
|
if(err)
|
|
{
|
|
atomic_set(&fhgfsInode->modified, 1); // restore the flag, this fsync did not complete
|
|
return err;
|
|
}
|
|
|
|
if(Config_getTuneRemoteFSync(cfg) )
|
|
{
|
|
RemotingIOInfo ioInfo;
|
|
FhgfsOpsErr res;
|
|
|
|
FsFileInfo_getIOInfo(__FhgfsOps_getFileInfo(filp), fhgfsInode, &ioInfo);
|
|
res = FhgfsOpsRemoting_fsyncfile(&ioInfo, true, true, false);
|
|
if(res != FhgfsOpsErr_SUCCESS)
|
|
{
|
|
atomic_set(&fhgfsInode->modified, 1); // restore the flag, this fsync did not complete
|
|
return FhgfsOpsErr_toSysErr(res);
|
|
}
|
|
}
|
|
|
|
FhgfsInode_entryInfoWriteLock(fhgfsInode);
|
|
|
|
err = FhgfsOpsRemoting_bumpFileVersion(
|
|
FhgfsOps_getApp(file_dentry(filp)->d_sb),
|
|
&fhgfsInode->entryInfo,
|
|
true, NULL);
|
|
if (err != FhgfsOpsErr_SUCCESS)
|
|
{
|
|
atomic_set(&fhgfsInode->modified, 1); // restore the flag, this fsync did not complete
|
|
err = FhgfsOpsErr_toSysErr(err);
|
|
}
|
|
|
|
fhgfsInode->fileVersion += 1;
|
|
|
|
FhgfsInode_entryInfoWriteUnlock(fhgfsInode);
|
|
|
|
return err;
|
|
}
|
|
|
|
#ifdef KERNEL_HAS_FSYNC_RANGE
|
|
static int beegfs_fsync(struct file* file, loff_t start, loff_t end, int datasync)
|
|
{
|
|
return __beegfs_fsync(file, start, end, datasync);
|
|
}
|
|
#elif defined(KERNEL_HAS_FSYNC_2)
|
|
static int beegfs_fsync(struct file* file, int datasync)
|
|
{
|
|
return __beegfs_fsync(file, 0, LLONG_MAX, datasync);
|
|
}
|
|
#else
|
|
static int beegfs_fsync(struct file* file, struct dentry* dentry, int datasync)
|
|
{
|
|
return __beegfs_fsync(file, 0, LLONG_MAX, datasync);
|
|
}
|
|
#endif
|
|
|
|
static int beegfs_flock(struct file* filp, int cmd, struct file_lock* flock)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
int err = -EINVAL;
|
|
|
|
FhgfsOpsHelper_logOp(5, app, file_dentry(filp), file_inode(filp), __func__);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
switch(FhgfsCommon_getFileLockType(flock))
|
|
{
|
|
case F_RDLCK:
|
|
case F_WRLCK:
|
|
err = beegfs_acquire_range(filp, 0, LLONG_MAX);
|
|
break;
|
|
|
|
case F_UNLCK:
|
|
err = beegfs_release_range(filp, 0, LLONG_MAX);
|
|
break;
|
|
}
|
|
|
|
if(err)
|
|
return err;
|
|
|
|
return FhgfsOps_flock(filp, cmd, flock);
|
|
}
|
|
|
|
static int beegfs_lock(struct file* filp, int cmd, struct file_lock* flock)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
int err = -EINVAL;
|
|
|
|
FhgfsOpsHelper_logOp(5, app, file_dentry(filp), file_inode(filp), __func__);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
switch(FhgfsCommon_getFileLockType(flock))
|
|
{
|
|
case F_RDLCK:
|
|
case F_WRLCK:
|
|
err = beegfs_acquire_range(filp, flock->fl_start, flock->fl_end);
|
|
break;
|
|
|
|
case F_UNLCK:
|
|
err = beegfs_release_range(filp, flock->fl_start, flock->fl_end);
|
|
break;
|
|
}
|
|
|
|
if(err)
|
|
return err;
|
|
|
|
return FhgfsOps_lock(filp, cmd, flock);
|
|
}
|
|
|
|
static int beegfs_mmap(struct file* filp, struct vm_area_struct* vma)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
int err = -EINVAL;
|
|
|
|
FhgfsOpsHelper_logOp(5, app, file_dentry(filp), file_inode(filp), __func__);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
err = beegfs_acquire_range(filp, 0, LLONG_MAX);
|
|
if(err)
|
|
return err;
|
|
|
|
err = generic_file_mmap(filp, vma);
|
|
return err;
|
|
}
|
|
|
|
const struct file_operations fhgfs_file_native_ops = {
|
|
.open = beegfs_open,
|
|
.flush = beegfs_flush,
|
|
.release = beegfs_release,
|
|
.fsync = beegfs_fsync,
|
|
.llseek = FhgfsOps_llseek,
|
|
.flock = beegfs_flock,
|
|
.lock = beegfs_lock,
|
|
.mmap = beegfs_mmap,
|
|
|
|
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
|
|
#endif
|
|
|
|
.read_iter = beegfs_read_iter,
|
|
.write_iter = beegfs_write_iter,
|
|
|
|
#ifdef KERNEL_HAS_GENERIC_FILE_SPLICE_READ
|
|
.splice_read = generic_file_splice_read,
|
|
#else
|
|
.splice_read = filemap_splice_read,
|
|
#endif
|
|
#if defined(KERNEL_HAS_ITER_FILE_SPLICE_WRITE)
|
|
.splice_write = iter_file_splice_write,
|
|
#else
|
|
.splice_write = generic_file_splice_write,
|
|
#endif
|
|
};
|
|
|
|
|
|
|
|
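/* Writeback pipeline overview: beegfs_writepages_callback() collects contiguous dirty pages
 * into beegfs_writepages_state blocks; each full (or non-mergeable) block is submitted as a
 * work item on remoting_io_queue, and the SynchronizedCounter barrier in this context is used
 * to wait until all submitted blocks have completed. */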
struct beegfs_writepages_context
|
|
{
|
|
RemotingIOInfo ioInfo;
|
|
struct writeback_control* wbc;
|
|
bool unlockPages;
|
|
|
|
// only ever written by the flusher thread
|
|
struct beegfs_writepages_state* currentState;
|
|
int submitted;
|
|
|
|
SynchronizedCounter barrier;
|
|
};
|
|
|
|
static int writepages_block_size __read_mostly;
|
|
|
|
struct beegfs_writepages_state
|
|
{
|
|
struct page** pages;
|
|
struct kvec* kvecs;
|
|
unsigned nr_pages;
|
|
|
|
struct beegfs_writepages_context* context;
|
|
|
|
struct work_struct work;
|
|
};
|
|
|
|
static mempool_t* writepages_pool;
|
|
|
|
static void* __writepages_pool_alloc(gfp_t mask, void* pool_data)
|
|
{
|
|
struct beegfs_writepages_state* state;
|
|
|
|
state = kmalloc(sizeof(*state), mask);
|
|
if(!state)
|
|
goto fail_state;
|
|
|
|
// should use kmalloc_array, but that's not available everywhere. luckily, this will not
|
|
// overflow.
|
|
state->pages = kmalloc(writepages_block_size * sizeof(struct page*), mask);
|
|
if(!state->pages)
|
|
goto fail_pages;
|
|
|
|
state->kvecs = kmalloc(writepages_block_size * sizeof(*state->kvecs), mask);
|
|
if(!state->kvecs)
|
|
goto fail_kvecs;
|
|
|
|
return state;
|
|
|
|
fail_kvecs:
|
|
kfree(state->pages);
|
|
fail_pages:
|
|
kfree(state);
|
|
fail_state:
|
|
return NULL;
|
|
}
|
|
|
|
static void __writepages_pool_free(void* element, void* pool_data)
|
|
{
|
|
struct beegfs_writepages_state* state = element;
|
|
|
|
if(!state)
|
|
return;
|
|
|
|
kfree(state->kvecs);
|
|
kfree(state->pages);
|
|
kfree(state);
|
|
}
|
|
|
|
static int writepages_init()
|
|
{
|
|
// a state holds a page* array and a kvec array; pick the block size so that the larger of the
// two arrays exactly fills one page, and allocate both arrays with that many entries
writepages_block_size = PAGE_SIZE / MAX(sizeof(struct page*), sizeof(struct kvec) );
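// e.g. on a typical 64-bit build with 4 KiB pages, sizeof(struct kvec) == 16, so one state
// covers 4096 / 16 == 256 pages (1 MiB of file data)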
|
|
|
|
writepages_pool = mempool_create(1, __writepages_pool_alloc, __writepages_pool_free, NULL);
|
|
if(!writepages_pool)
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void writepages_release()
|
|
{
|
|
if(writepages_pool)
|
|
mempool_destroy(writepages_pool);
|
|
}
|
|
|
|
static struct beegfs_writepages_state* wps_alloc(struct beegfs_writepages_context* ctx)
|
|
{
|
|
struct beegfs_writepages_state* result = mempool_alloc(writepages_pool, GFP_NOFS);
|
|
|
|
result->nr_pages = 0;
|
|
result->context = ctx;
|
|
|
|
return result;
|
|
}
|
|
|
|
static void wps_free(struct beegfs_writepages_state* state)
|
|
{
|
|
mempool_free(state, writepages_pool);
|
|
}
|
|
|
|
static int beegfs_wps_prepare(struct beegfs_writepages_state* state, loff_t* offset, size_t* size)
|
|
{
|
|
int i;
|
|
|
|
*size = 0;
|
|
|
|
if(pvr_present(state->pages[0]) )
|
|
{
|
|
*offset = page_offset(state->pages[0]) + pvr_get_first(state->pages[0]);
|
|
|
|
for(i = 0; i < state->nr_pages; i++)
|
|
{
|
|
struct page* page = state->pages[i];
|
|
unsigned length;
|
|
|
|
length = pvr_get_last(page) - pvr_get_first(page) + 1;
|
|
|
|
state->kvecs[i].iov_base = page_address(page) + pvr_get_first(page);
|
|
state->kvecs[i].iov_len = length;
|
|
|
|
*size += length;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
// ARDs were deleted
|
|
BUG();
|
|
}
|
|
|
|
static void __beegfs_writepages_work(struct beegfs_writepages_state* state)
|
|
{
|
|
int err = 0;
|
|
loff_t offset;
|
|
ssize_t size;
|
|
ssize_t written = 0;
|
|
|
|
err = beegfs_wps_prepare(state, &offset, &size);
|
|
|
|
if(err < 0)
|
|
{
|
|
// Probably EIO or EDQUOT
|
|
}
|
|
else if(BEEGFS_SHOULD_FAIL(writepage, 1) )
|
|
{
|
|
// artificial write error
|
|
err = -EIO;
|
|
}
|
|
else
|
|
{
|
|
struct iov_iter iter;
|
|
BEEGFS_IOV_ITER_KVEC(&iter, WRITE, state->kvecs, state->nr_pages, size);
|
|
|
|
written = FhgfsOpsRemoting_writefileVec(&iter, offset, &state->context->ioInfo, false);
|
|
|
|
if(written < 0)
|
|
err = FhgfsOpsErr_toSysErr(-written);
|
|
else
|
|
task_io_account_write(written);
|
|
}
|
|
|
|
size = 0;
|
|
for(unsigned i = 0; i < state->nr_pages; i++)
|
|
{
|
|
struct page* page = state->pages[i];
|
|
struct address_space *mapping = page->mapping;
|
|
BUG_ON(! mapping); //???
|
|
|
|
size += state->kvecs[i].iov_len;
|
|
|
|
if (size <= written)
|
|
{
|
|
pvr_clear(page);
|
|
}
|
|
else if (err)
|
|
{
|
|
fhgfs_set_wb_error(page, err);
|
|
pvr_clear(page);
|
|
}
|
|
else
|
|
{
|
|
// NOTE: this will cause the kernel to retry writeback at a later point
|
|
redirty_page_for_writepage(state->context->wbc, page);
|
|
}
|
|
|
|
/* Note: as per the kernel documentation (since Linux 2.5.12), we could unlock the pages as
 * early as right after marking them with set_page_writeback(). We keep them locked until here
 * in case our own code still relies on the lock somewhere. */
|
|
if(state->context->unlockPages)
|
|
unlock_page(page);
|
|
|
|
end_page_writeback(page);
|
|
}
|
|
}
|
|
|
|
static void beegfs_writepages_work(struct beegfs_writepages_state* state)
|
|
{
|
|
if(state->nr_pages > 0)
|
|
__beegfs_writepages_work(state);
|
|
|
|
wps_free(state);
|
|
}
|
|
|
|
static void beegfs_writepages_work_wrapper(struct work_struct* w)
|
|
{
|
|
struct beegfs_writepages_state* state = container_of(w, struct beegfs_writepages_state, work);
|
|
SynchronizedCounter* barrier = &state->context->barrier;
|
|
|
|
beegfs_writepages_work(state);
|
|
SynchronizedCounter_incCount(barrier);
|
|
}
|
|
|
|
static void beegfs_writepages_submit(struct beegfs_writepages_context* context)
|
|
{
|
|
struct beegfs_writepages_state* state = context->currentState;
|
|
|
|
context->submitted += 1;
|
|
|
|
INIT_WORK(&state->work, beegfs_writepages_work_wrapper);
|
|
queue_work(remoting_io_queue, &state->work);
|
|
}
|
|
|
|
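/* Decide whether the pages gathered so far must be submitted before 'next' can be added: the
 * state is full, the page indices are not consecutive, or the partial-page dirty ranges would
 * not form one contiguous byte range across the page boundary. */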
static bool beegfs_wps_must_flush_before(struct beegfs_writepages_state* state, struct page* next)
|
|
{
|
|
if(state->nr_pages == 0)
|
|
return false;
|
|
|
|
if(state->nr_pages == writepages_block_size)
|
|
return true;
|
|
|
|
if(state->pages[state->nr_pages - 1]->index + 1 != next->index)
|
|
return true;
|
|
|
|
if(pvr_present(next) )
|
|
{
|
|
if(pvr_get_first(next) != 0)
|
|
return true;
|
|
|
|
if(pvr_get_last(state->pages[state->nr_pages - 1]) != PAGE_SIZE - 1)
|
|
return true;
|
|
|
|
if(!pvr_present(state->pages[state->nr_pages - 1]) )
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
#ifdef KERNEL_WRITEPAGE_HAS_FOLIO
|
|
static int beegfs_writepages_callback(struct folio *folio, struct writeback_control* wbc, void* data)
|
|
{
|
|
struct page *page = &folio->page;
|
|
#else
|
|
static int beegfs_writepages_callback(struct page* page, struct writeback_control* wbc, void* data)
|
|
{
|
|
#endif
|
|
struct beegfs_writepages_context* context = data;
|
|
struct beegfs_writepages_state* state = context->currentState;
|
|
|
|
BUG_ON(!pvr_present(page));
|
|
|
|
if(beegfs_wps_must_flush_before(state, page) )
|
|
{
|
|
beegfs_writepages_submit(context);
|
|
state = wps_alloc(context);
|
|
context->currentState = state;
|
|
}
|
|
|
|
state->pages[state->nr_pages] = page;
|
|
state->nr_pages += 1;
|
|
|
|
//XXX can't we defer this to later?
|
|
set_page_writeback(page);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int beegfs_do_write_pages(struct address_space* mapping, struct writeback_control* wbc,
|
|
struct page* page, bool unlockPages)
|
|
{
|
|
struct inode* inode = mapping->host;
|
|
App* app = FhgfsOps_getApp(inode->i_sb);
|
|
|
|
FhgfsOpsErr referenceRes;
|
|
FileHandleType handleType;
|
|
int err;
|
|
|
|
struct beegfs_writepages_context context = {
|
|
.unlockPages = unlockPages,
|
|
.wbc = wbc,
|
|
.submitted = 0,
|
|
};
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, NULL, inode, __func__, "page? %i %lu", page != NULL,
|
|
page ? page->index : 0);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
referenceRes = FhgfsInode_referenceHandle(BEEGFS_INODE(inode), NULL, OPENFILE_ACCESS_READWRITE,
|
|
true, NULL, &handleType, NULL);
|
|
if(referenceRes != FhgfsOpsErr_SUCCESS)
|
|
return FhgfsOpsErr_toSysErr(referenceRes);
|
|
|
|
context.currentState = wps_alloc(&context);
|
|
SynchronizedCounter_init(&context.barrier);
|
|
|
|
FhgfsInode_getRefIOInfo(BEEGFS_INODE(inode), handleType, OPENFILE_ACCESS_READWRITE,
|
|
&context.ioInfo);
|
|
|
|
FhgfsInode_incWriteBackCounter(BEEGFS_INODE(inode) );
|
|
|
|
if(page)
|
|
{
|
|
#ifdef KERNEL_WRITEPAGE_HAS_FOLIO
|
|
struct folio *folio = page_folio(page);
|
|
err = beegfs_writepages_callback(folio, wbc, &context);
|
|
#else
|
|
err = beegfs_writepages_callback(page, wbc, &context);
|
|
#endif
|
|
|
|
//XXX not sure if it's supposed to be like that
|
|
WARN_ON(wbc->nr_to_write != 1);
|
|
if (! err)
|
|
-- wbc->nr_to_write;
|
|
}
|
|
else
|
|
err = write_cache_pages(mapping, wbc, beegfs_writepages_callback, &context);
|
|
|
|
beegfs_writepages_submit(&context);
|
|
|
|
SynchronizedCounter_waitForCount(&context.barrier, context.submitted);
|
|
|
|
FhgfsInode_releaseHandle(BEEGFS_INODE(inode), handleType, NULL);
|
|
|
|
FhgfsInode_decWriteBackCounter(BEEGFS_INODE(inode) );
|
|
FhgfsInode_unsetNoIsizeDecrease(BEEGFS_INODE(inode) );
|
|
|
|
return err;
|
|
}
|
|
|
|
static int beegfs_writepage(struct page* page, struct writeback_control* wbc)
|
|
{
|
|
struct inode* inode = page->mapping->host;
|
|
App* app = FhgfsOps_getApp(inode->i_sb);
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, NULL, inode, __func__, "");
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
return beegfs_do_write_pages(page->mapping, wbc, page, true);
|
|
}
|
|
|
|
static int beegfs_writepages(struct address_space* mapping, struct writeback_control* wbc)
|
|
{
|
|
struct inode* inode = mapping->host;
|
|
App* app = FhgfsOps_getApp(inode->i_sb);
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, NULL, inode, __func__, "");
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
return beegfs_do_write_pages(mapping, wbc, NULL, true);
|
|
}
|
|
|
|
|
|
|
|
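/* Synchronously write back a single page (WB_SYNC_ALL); returns 0 immediately if the page is
 * not dirty. Used when a buffered read or a non-mergeable write range hits a page with cached
 * dirty data, and for launder_page/launder_folio. */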
static int beegfs_flush_page(struct page* page)
|
|
{
|
|
struct writeback_control wbc = {
|
|
.nr_to_write = 1,
|
|
.sync_mode = WB_SYNC_ALL,
|
|
};
|
|
|
|
if(!clear_page_dirty_for_io(page) )
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
return beegfs_do_write_pages(page->mapping, &wbc, page, false);
|
|
}
|
|
|
|
static int beegfs_readpage(struct file* filp, struct page* page)
|
|
{
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
|
|
struct inode* inode = filp->f_mapping->host;
|
|
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
|
|
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(filp);
|
|
RemotingIOInfo ioInfo;
|
|
ssize_t readRes = -EIO;
|
|
|
|
loff_t offset = page_offset(page);
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, file_dentry(filp), inode, __func__, "offset: %lld", offset);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
|
|
|
|
if (pvr_present(page))
|
|
{
|
|
readRes = beegfs_flush_page(page);
|
|
if(readRes)
|
|
goto out;
|
|
}
|
|
|
|
if(BEEGFS_SHOULD_FAIL(readpage, 1) )
|
|
goto out;
|
|
|
|
readRes = FhgfsOpsRemoting_readfile_kernel(page_address(page), PAGE_SIZE, offset, &ioInfo,
|
|
fhgfsInode);
|
|
|
|
if(readRes < 0)
|
|
{
|
|
readRes = FhgfsOpsErr_toSysErr(-readRes);
|
|
goto out;
|
|
}
|
|
|
|
if(readRes < PAGE_SIZE)
|
|
memset(page_address(page) + readRes, 0, PAGE_SIZE - readRes);
|
|
|
|
readRes = 0;
|
|
task_io_account_read(PAGE_SIZE);
|
|
|
|
out:
|
|
page_endio(page, READ, readRes);
|
|
return readRes;
|
|
}
|
|
|
|
#ifdef KERNEL_HAS_READ_FOLIO
|
|
static int beegfs_read_folio(struct file* filp, struct folio* folio)
|
|
{
|
|
return beegfs_readpage(filp, &folio->page);
|
|
}
|
|
#endif
|
|
|
|
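/* The readpages/readahead context is reference counted (kref) because the read work items
 * complete asynchronously on remoting_io_queue: the file handle referenced in rpc_create() may
 * only be released once the last outstanding beegfs_readpages_state has finished. */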
struct beegfs_readpages_context
|
|
{
|
|
RemotingIOInfo ioInfo;
|
|
FileHandleType handleType;
|
|
struct inode* inode;
|
|
|
|
struct beegfs_readpages_state* currentState;
|
|
|
|
struct kref refs;
|
|
};
|
|
|
|
static struct beegfs_readpages_context* rpc_create(struct inode* inode)
|
|
{
|
|
struct beegfs_readpages_context* context;
|
|
FhgfsOpsErr referenceRes;
|
|
|
|
context = kmalloc(sizeof(*context), GFP_NOFS);
|
|
if(!context)
|
|
return NULL;
|
|
|
|
context->inode = inode;
|
|
context->currentState = NULL;
|
|
kref_init(&context->refs);
|
|
|
|
referenceRes = FhgfsInode_referenceHandle(BEEGFS_INODE(inode), NULL, OPENFILE_ACCESS_READWRITE,
|
|
true, NULL, &context->handleType, NULL);
|
|
if(referenceRes != FhgfsOpsErr_SUCCESS)
|
|
goto fail_reference;
|
|
|
|
FhgfsInode_getRefIOInfo(BEEGFS_INODE(inode), context->handleType, OPENFILE_ACCESS_READWRITE,
|
|
&context->ioInfo);
|
|
|
|
return context;
|
|
|
|
fail_reference:
|
|
kfree(context);
|
|
return ERR_PTR(FhgfsOpsErr_toSysErr(referenceRes) );
|
|
}
|
|
|
|
static void __beegfs_readpages_context_free(struct kref* ref)
|
|
{
|
|
struct beegfs_readpages_context* context;
|
|
|
|
context = container_of(ref, struct beegfs_readpages_context, refs);
|
|
|
|
FhgfsInode_releaseHandle(BEEGFS_INODE(context->inode), context->handleType, NULL);
|
|
kfree(context);
|
|
}
|
|
|
|
static void rpc_get(struct beegfs_readpages_context* context)
|
|
{
|
|
kref_get(&context->refs);
|
|
}
|
|
|
|
static void rpc_put(struct beegfs_readpages_context* context)
|
|
{
|
|
kref_put(&context->refs, __beegfs_readpages_context_free);
|
|
}
|
|
|
|
|
|
struct beegfs_readpages_state
|
|
{
|
|
struct page** pages;
|
|
struct kvec *kvecs;
|
|
unsigned nr_pages;
|
|
|
|
struct beegfs_readpages_context* context;
|
|
|
|
struct work_struct work;
|
|
};
|
|
|
|
static int readpages_block_size __read_mostly;
|
|
|
|
static void readpages_init()
|
|
{
|
|
// much the same as writepages_block_size
|
|
readpages_block_size = PAGE_SIZE / MAX(sizeof(struct page*), sizeof(struct kvec) );
|
|
}
|
|
|
|
static struct beegfs_readpages_state* rps_alloc(struct beegfs_readpages_context* context)
|
|
{
|
|
struct beegfs_readpages_state* state;
|
|
|
|
state = kmalloc(sizeof(*state), GFP_NOFS);
|
|
if(!state)
|
|
goto fail_state;
|
|
|
|
// should use kmalloc_array, see __writepages_pool_alloc
|
|
state->pages = kmalloc(readpages_block_size * sizeof(struct page*), GFP_NOFS);
|
|
if(!state->pages)
|
|
goto fail_pages;
|
|
|
|
state->kvecs = kmalloc(readpages_block_size * sizeof(*state->kvecs), GFP_NOFS);
|
|
if(!state->kvecs)
|
|
goto fail_kvecs;
|
|
|
|
state->nr_pages = 0;
|
|
state->context = context;
|
|
rpc_get(context);
|
|
|
|
return state;
|
|
|
|
fail_kvecs:
|
|
kfree(state->pages);
|
|
fail_pages:
|
|
kfree(state);
|
|
fail_state:
|
|
return NULL;
|
|
}
|
|
|
|
static void rps_free(struct beegfs_readpages_state* state)
|
|
{
|
|
if(!state)
|
|
return;
|
|
|
|
rpc_put(state->context);
|
|
kfree(state->kvecs);
|
|
kfree(state->pages);
|
|
kfree(state);
|
|
}
|
|
|
|
static void beegfs_readpages_work(struct work_struct* w)
|
|
{
|
|
struct beegfs_readpages_state* state = container_of(w, struct beegfs_readpages_state, work);
|
|
|
|
App* app;
|
|
struct iov_iter iter;
|
|
ssize_t readRes;
|
|
unsigned validPages = 0;
|
|
int err = 0;
|
|
int i;
|
|
|
|
if(!state->nr_pages)
|
|
goto done;
|
|
|
|
app = FhgfsOps_getApp(state->pages[0]->mapping->host->i_sb);
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, NULL, state->pages[0]->mapping->host, __func__,
|
|
"first offset: %lld nr_pages %u", page_offset(state->pages[0]), state->nr_pages);
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
if(BEEGFS_SHOULD_FAIL(readpage, 1) )
|
|
{
|
|
err = -EIO;
|
|
goto endio;
|
|
}
|
|
|
|
BEEGFS_IOV_ITER_KVEC(&iter, READ, state->kvecs, state->nr_pages,
|
|
state->nr_pages * PAGE_SIZE);
|
|
|
|
readRes = FhgfsOpsRemoting_readfileVec(&iter, iov_iter_count(&iter), page_offset(state->pages[0]),
|
|
&state->context->ioInfo, BEEGFS_INODE(state->pages[0]->mapping->host) );
|
|
if(readRes < 0)
|
|
err = FhgfsOpsErr_toSysErr(-readRes);
|
|
|
|
if(err < 0)
|
|
goto endio;
|
|
|
|
validPages = readRes / PAGE_SIZE;
|
|
|
|
if(readRes % PAGE_SIZE != 0)
|
|
{
|
|
int start = readRes % PAGE_SIZE;
|
|
memset(page_address(state->pages[validPages]) + start, 0, PAGE_SIZE - start);
|
|
validPages += 1;
|
|
}
|
|
|
|
endio:
|
|
for(i = 0; i < validPages; i++)
|
|
page_endio(state->pages[i], READ, err);
|
|
|
|
for(i = validPages; i < state->nr_pages; i++)
|
|
{
|
|
ClearPageUptodate(state->pages[i]);
|
|
unlock_page(state->pages[i]);
|
|
}
|
|
|
|
done:
|
|
rps_free(state);
|
|
}
|
|
|
|
static void beegfs_readpages_submit(struct beegfs_readpages_context* context)
|
|
{
|
|
struct beegfs_readpages_state* state = context->currentState;
|
|
|
|
INIT_WORK(&state->work, beegfs_readpages_work);
|
|
queue_work(remoting_io_queue, &state->work);
|
|
}
|
|
|
|
static int beegfs_readpages_add_page(void* data, struct page* page)
|
|
{
|
|
struct beegfs_readpages_context* context = data;
|
|
struct beegfs_readpages_state* state = context->currentState;
|
|
bool mustFlush;
|
|
|
|
mustFlush = (state->nr_pages == readpages_block_size)
|
|
|| (state->nr_pages > 0 && state->pages[state->nr_pages - 1]->index + 1 != page->index);
|
|
|
|
if(mustFlush)
|
|
{
|
|
beegfs_readpages_submit(context);
|
|
state = rps_alloc(context);
|
|
if(!state)
|
|
return -ENOMEM;
|
|
|
|
context->currentState = state;
|
|
}
|
|
|
|
state->pages[state->nr_pages] = page;
|
|
state->kvecs[state->nr_pages].iov_base = page_address(page);
|
|
state->kvecs[state->nr_pages].iov_len = PAGE_SIZE;
|
|
state->nr_pages += 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
static void beegfs_readahead(struct readahead_control *ractl)
|
|
#else
|
|
static int beegfs_readpages(struct file* filp, struct address_space* mapping,
|
|
struct list_head* pages, unsigned nr_pages)
|
|
#endif
|
|
{
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
struct inode* inode = ractl->mapping->host;
|
|
struct file* filp = ractl->file;
|
|
struct page* page_ra;
|
|
#else
|
|
struct inode* inode = mapping->host;
|
|
#endif
|
|
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
|
|
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
|
|
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(filp);
|
|
RemotingIOInfo ioInfo;
|
|
int err;
|
|
|
|
struct beegfs_readpages_context* context;
|
|
|
|
context = rpc_create(inode);
|
|
if(IS_ERR(context) )
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
return;
|
|
#else
|
|
return PTR_ERR(context);
|
|
#endif
|
|
|
|
context->currentState = rps_alloc(context);
|
|
if(!context->currentState)
|
|
{
|
|
err = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
FhgfsOpsHelper_logOpDebug(app, file_dentry(filp), inode, __func__,
|
|
"first offset: %lld \n nr_pages %u \n no. of bytes in this readahead request: %zu\n",
|
|
readahead_pos(ractl), readahead_count(ractl), readahead_length(ractl));
|
|
#else
|
|
FhgfsOpsHelper_logOpDebug(app, file_dentry(filp), inode, __func__,
|
|
"first offset: %lld nr_pages %u", page_offset(list_entry(pages->prev, struct page, lru)),
|
|
nr_pages);
|
|
#endif
|
|
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
if (readahead_count(ractl))
|
|
{
|
|
while ((page_ra = readahead_page(ractl)) != NULL)
|
|
{
|
|
err = beegfs_readpages_add_page(context, page_ra);
|
|
put_page(page_ra);
|
|
if (err)
|
|
goto out;
|
|
}
|
|
}
|
|
#else
|
|
err = read_cache_pages(mapping, pages, beegfs_readpages_add_page, context);
|
|
#endif
|
|
|
|
beegfs_readpages_submit(context);
|
|
|
|
out:
|
|
rpc_put(context);
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
return;
|
|
#else
|
|
return err;
|
|
#endif
|
|
}
|
|
|
|
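/* If the target page already carries a PVR that cannot absorb the new write range (i.e. the
 * ranges would leave a hole), flush the page first so that a page never tracks two disjoint
 * dirty ranges. */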
static int __beegfs_write_begin(struct file* filp, loff_t pos, unsigned len, struct page* page)
|
|
{
|
|
int result = 0;
|
|
|
|
if(!pvr_present(page) )
|
|
goto success;
|
|
|
|
if(pvr_can_merge(page, pos & ~PAGE_MASK, (pos & ~PAGE_MASK) + len - 1))
|
|
goto success;
|
|
|
|
result = beegfs_flush_page(page);
|
|
if(result)
|
|
goto out_err;
|
|
|
|
success:
|
|
return result;
|
|
|
|
out_err:
|
|
unlock_page(page);
|
|
put_page(page);
|
|
return result;
|
|
}
|
|
|
|
static int __beegfs_write_end(struct file* filp, loff_t pos, unsigned len, unsigned copied,
|
|
struct page* page)
|
|
{
|
|
struct inode* inode = page->mapping->host;
|
|
int result = copied;
|
|
|
|
App* app = FhgfsOps_getApp(file_dentry(filp)->d_sb);
|
|
|
|
if(copied != len && pvr_present(page) )
|
|
{
|
|
FhgfsOpsHelper_logOpMsg(2, app, file_dentry(filp), inode, __func__, "short write!");
|
|
result = 0;
|
|
goto out;
|
|
}
|
|
|
|
if(i_size_read(inode) < pos + copied)
|
|
{
|
|
i_size_write(inode, pos + copied);
|
|
FhgfsInode_setPageWriteFlag(BEEGFS_INODE(inode) );
|
|
FhgfsInode_setLastWriteBackOrIsizeWriteTime(BEEGFS_INODE(inode) );
|
|
FhgfsInode_setNoIsizeDecrease(BEEGFS_INODE(inode) );
|
|
}
|
|
|
|
if(pvr_present(page) )
|
|
pvr_merge(page, pos & ~PAGE_MASK, (pos & ~PAGE_MASK) + copied - 1);
|
|
else
|
|
{
|
|
pvr_init(page);
|
|
pvr_set_first(page, pos & ~PAGE_MASK);
|
|
pvr_set_last(page, (pos & ~PAGE_MASK) + copied - 1);
|
|
}
|
|
|
|
out:
|
|
ClearPageUptodate(page);
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
filemap_dirty_folio(page->mapping, page_folio(page));
|
|
#else
|
|
__set_page_dirty_nobuffers(page);
|
|
#endif
|
|
|
|
unlock_page(page);
|
|
put_page(page);
|
|
|
|
return result;
|
|
}
|
|
|
|
static int beegfs_write_begin(struct file *filp, struct address_space *mapping,
|
|
loff_t pos, unsigned len,
|
|
#if BEEGFS_HAS_WRITE_FLAGS
|
|
unsigned flags,
|
|
#endif
|
|
beegfs_pgfol_t *pgfolp, void **fsdata)
|
|
{
|
|
pgoff_t index = pos >> PAGE_SHIFT;
|
|
|
|
struct page *page = beegfs_grab_cache_page(mapping, index,
|
|
#if BEEGFS_HAS_WRITE_FLAGS
|
|
flags
|
|
#else
|
|
0
|
|
#endif
|
|
);
|
|
|
|
// common to all kernel variants: bail out if the page-cache page could not be obtained
|
|
if (!page)
|
|
return -ENOMEM;
|
|
|
|
*pgfolp = beegfs_to_pgfol(page);
|
|
return __beegfs_write_begin(filp, pos, len, page);
|
|
|
|
}
|
|
|
|
static int beegfs_write_end(struct file *filp, struct address_space *mapping, loff_t pos,
|
|
unsigned len, unsigned copied, beegfs_pgfol_t pgfol, void *fsdata)
|
|
{
|
|
struct page *page = beegfs_get_page(pgfol);
|
|
return __beegfs_write_end(filp, pos, len, copied, page);
|
|
}
|
|
|
|
static int beegfs_releasepage(struct page* page, gfp_t gfp)
|
|
{
|
|
|
|
IGNORE_UNUSED_VARIABLE(gfp);
|
|
|
|
if(pvr_present(page) )
|
|
{
|
|
pvr_clear(page);
|
|
return 1;
|
|
}
|
|
|
|
// ARDs were deleted
|
|
BUG();
|
|
}
|
|
|
|
#ifdef KERNEL_HAS_READ_FOLIO
|
|
static bool beegfs_release_folio(struct folio* folio, gfp_t gfp)
|
|
{
|
|
return beegfs_releasepage(&folio->page, gfp) != 0;
|
|
}
|
|
#endif
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
static bool beegfs_set_dirty_folio(struct address_space *mapping, struct folio *folio)
|
|
{
|
|
struct page *page = &folio->page;
|
|
if (folio_test_dirty(folio))
|
|
{
|
|
printk_fhgfs_debug(KERN_INFO,"%s %p dirty_folio %p idx %lu -- already dirty\n", __func__,
|
|
mapping->host, folio, folio->index);
|
|
VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
|
|
return false;
|
|
}
|
|
|
|
#else
|
|
static int beegfs_set_page_dirty(struct page* page)
|
|
{
|
|
#endif
|
|
|
|
atomic_set(&BEEGFS_INODE(page->mapping->host)->modified, 1);
|
|
|
|
pvr_init(page);
|
|
pvr_set_first(page, 0);
|
|
pvr_set_last(page, PAGE_SIZE - 1);
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
return filemap_dirty_folio(mapping,folio);
|
|
#else
|
|
return __set_page_dirty_nobuffers(page);
|
|
#endif
|
|
}
|
|
|
|
static void __beegfs_invalidate_page(struct page* page, unsigned begin, unsigned end)
|
|
{
|
|
if(pvr_present(page) )
|
|
{
|
|
unsigned pvr_begin = pvr_get_first(page);
|
|
unsigned pvr_end = pvr_get_last(page);
|
|
|
|
if(begin == 0 && end == PAGE_SIZE)
|
|
{
|
|
pvr_clear(page);
|
|
ClearPageUptodate(page);
|
|
return;
|
|
}
|
|
|
|
if(begin < pvr_begin)
|
|
pvr_set_first(page, begin);
|
|
|
|
if(pvr_end < end)
|
|
pvr_set_last(page, end);
|
|
|
|
return;
|
|
}
|
|
|
|
// ARDs were deleted
|
|
BUG();
|
|
}
|
|
|
|
#if defined(KERNEL_HAS_FOLIO)
|
|
static void beegfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
|
|
{
|
|
if (offset != 0 || length < folio_size(folio))
|
|
return;
|
|
|
|
// FIXME: check whether a folio_wait_writeback(folio) is needed here, as the kernel documentation suggests
|
|
__beegfs_invalidate_page(&folio->page, offset, (offset+length));
|
|
|
|
}
|
|
|
|
#elif !defined(KERNEL_HAS_INVALIDATEPAGE_RANGE)
|
|
static void beegfs_invalidate_page(struct page* page, unsigned long begin)
|
|
{
|
|
__beegfs_invalidate_page(page, begin, PAGE_CACHE_SIZE);
|
|
}
|
|
#else
|
|
static void beegfs_invalidate_page(struct page* page, unsigned begin, unsigned end)
|
|
{
|
|
__beegfs_invalidate_page(page, begin, end);
|
|
}
|
|
#endif
|
|
|
|
static ssize_t beegfs_dIO_read(struct kiocb* iocb, struct iov_iter* iter, loff_t offset,
|
|
RemotingIOInfo* ioInfo)
|
|
{
|
|
struct file* filp = iocb->ki_filp;
|
|
struct inode* inode = file_inode(filp);
|
|
struct dentry* dentry = file_dentry(filp);
|
|
|
|
App* app = FhgfsOps_getApp(dentry->d_sb);
|
|
|
|
ssize_t result = 0;
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "pos: %lld, nr_segs: %lld",
|
|
offset, beegfs_iov_iter_nr_segs(iter));
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
|
|
result = FhgfsOpsRemoting_readfileVec(iter, iov_iter_count(iter), offset, ioInfo, BEEGFS_INODE(inode));
|
|
|
|
if(result < 0)
|
|
return FhgfsOpsErr_toSysErr(-result);
|
|
|
|
offset += result;
|
|
|
|
if(iov_iter_count(iter) > 0)
|
|
{
|
|
ssize_t readRes = __FhgfsOps_readSparse(filp, iter, iov_iter_count(iter), offset); // offset was already advanced past the remote read above
|
|
|
|
result += readRes;
|
|
}
|
|
|
|
task_io_account_read(result);
|
|
|
|
if(offset > i_size_read(inode) )
|
|
i_size_write(inode, offset);
|
|
|
|
return result;
|
|
}
|
|
|
|
static ssize_t beegfs_dIO_write(struct kiocb* iocb, struct iov_iter* iter, loff_t offset,
|
|
RemotingIOInfo* ioInfo)
|
|
{
|
|
struct file* filp = iocb->ki_filp;
|
|
struct inode* inode = file_inode(filp);
|
|
struct dentry* dentry = file_dentry(filp);
|
|
|
|
App* app = FhgfsOps_getApp(dentry->d_sb);
|
|
|
|
ssize_t result = 0;
|
|
|
|
FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "pos: %lld, nr_segs: %lld",
|
|
offset, beegfs_iov_iter_nr_segs(iter));
|
|
IGNORE_UNUSED_VARIABLE(app);
|
|
IGNORE_UNUSED_VARIABLE(inode);
|
|
|
|
result = FhgfsOpsRemoting_writefileVec(iter, offset, ioInfo, false);
|
|
|
|
if(result < 0)
|
|
return FhgfsOpsErr_toSysErr(-result);
|
|
|
|
offset += result;
|
|
|
|
task_io_account_write(result);
|
|
|
|
return result;
|
|
}
|
|
|
|
static ssize_t __beegfs_direct_IO(int rw, struct kiocb* iocb, struct iov_iter* iter, loff_t offset)
|
|
{
|
|
struct file* filp = iocb->ki_filp;
|
|
struct inode* inode = file_inode(filp);
|
|
RemotingIOInfo ioInfo;
|
|
|
|
FsFileInfo_getIOInfo(__FhgfsOps_getFileInfo(filp), BEEGFS_INODE(inode), &ioInfo);
|
|
|
|
{
|
|
ssize_t result;
|
|
|
|
switch(rw)
|
|
{
|
|
case READ:
|
|
result = beegfs_dIO_read(iocb, iter, offset, &ioInfo);
|
|
break;
|
|
|
|
case WRITE:
|
|
result = beegfs_dIO_write(iocb, iter, offset, &ioInfo);
|
|
break;
|
|
|
|
default:
|
|
BUG();
|
|
return -EINVAL;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
}
|
|
|
|
static ssize_t beegfs_direct_IO(struct kiocb* iocb, struct iov_iter* iter)
|
|
{
|
|
return __beegfs_direct_IO(iov_iter_rw(iter), iocb, iter, iocb->ki_pos);
|
|
}
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
static int beegfs_launder_folio(struct folio *folio)
|
|
{
|
|
return beegfs_flush_page(&folio->page);
|
|
}
|
|
#else
|
|
static int beegfs_launderpage(struct page* page)
|
|
{
|
|
return beegfs_flush_page(page);
|
|
}
|
|
#endif
|
|
|
|
const struct address_space_operations fhgfs_addrspace_native_ops = {
|
|
#ifdef KERNEL_HAS_READ_FOLIO
|
|
.read_folio = beegfs_read_folio,
|
|
.release_folio = beegfs_release_folio,
|
|
#else
|
|
.readpage = beegfs_readpage,
|
|
.releasepage = beegfs_releasepage,
|
|
#endif
|
|
|
|
.writepage = beegfs_writepage,
|
|
.direct_IO = beegfs_direct_IO,
|
|
|
|
#ifdef KERNEL_HAS_FOLIO
|
|
.readahead = beegfs_readahead,
|
|
.dirty_folio = beegfs_set_dirty_folio,
|
|
.invalidate_folio = beegfs_invalidate_folio,
|
|
.launder_folio = beegfs_launder_folio,
|
|
#else
|
|
.readpages = beegfs_readpages,
|
|
.set_page_dirty = beegfs_set_page_dirty,
|
|
.invalidatepage = beegfs_invalidate_page,
|
|
.launder_page = beegfs_launderpage,
|
|
#endif
|
|
|
|
.writepages = beegfs_writepages,
|
|
.write_begin = beegfs_write_begin,
|
|
.write_end = beegfs_write_end,
|
|
};
|