#include <app/log/Logger.h>
#include <app/App.h>
#include <app/config/Config.h>
#include <common/toolkit/vector/StrCpyVec.h>
#include <common/toolkit/LockingTk.h>
#include <common/storage/StorageErrors.h>
#include <common/toolkit/StringTk.h>
#include <filesystem/ProcFs.h>
#include <os/iov_iter.h>
#include <os/OsCompat.h>
#include <os/OsTypeConversion.h>
#include "FhgfsOpsHelper.h"
#include "FhgfsOpsFile.h"
#include "FhgfsOpsDir.h"
#include "FhgfsOpsInode.h"
#include "FhgfsOpsIoctl.h"
#include "FhgfsOpsSuper.h"
#include "FhgfsOps_versions.h"
#include "FhgfsOpsPages.h"
#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/mm.h>
#include <linux/mpage.h>
#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/delay.h>
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
#endif // CONFIG_COMPAT
static ssize_t FhgfsOps_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
static ssize_t FhgfsOps_buffered_read_iter(struct kiocb *iocb, struct iov_iter *to);
static int FhgfsOps_write_begin(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len,
#if BEEGFS_HAS_WRITE_FLAGS
unsigned flags,
#endif
beegfs_pgfol_t *pgfolp, void** fsdata);
static int FhgfsOps_write_end(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len, unsigned copied, beegfs_pgfol_t pgfol, void* fsdata);
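/* retry limits for acquiring the inode file-cache lock in FhgfsOps_mmap() (10ms delay per retry):
EASY when reads/writes are already in progress (they flush the cache anyway), HARD otherwise */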
#define MMAP_RETRY_LOCK_EASY 100
#define MMAP_RETRY_LOCK_HARD 500
/**
* Operations for files with cache type "buffered" and "none".
*/
struct file_operations fhgfs_file_buffered_ops =
{
.open = FhgfsOps_open,
.release = FhgfsOps_release,
.fsync = FhgfsOps_fsync,
.flush = FhgfsOps_flush,
.llseek = FhgfsOps_llseek,
.flock = FhgfsOps_flock,
.lock = FhgfsOps_lock,
.mmap = FhgfsOps_mmap,
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
#endif // CONFIG_COMPAT
#ifdef KERNEL_HAS_GENERIC_FILE_SPLICE_READ
.splice_read = generic_file_splice_read,
#else
.splice_read = filemap_splice_read,
#endif
#ifdef KERNEL_HAS_ITER_FILE_SPLICE_WRITE
.splice_write = iter_file_splice_write,
#else
.splice_write = generic_file_splice_write,
#endif
.read_iter = FhgfsOps_buffered_read_iter,
.write_iter = FhgfsOps_buffered_write_iter, // replacement for aio_write
#ifdef KERNEL_HAS_GENERIC_FILE_SENDFILE
.sendfile = generic_file_sendfile, // removed in 2.6.23 (now handled via splice)
#endif // LINUX_VERSION_CODE
};
/**
* Operations for files with cache type "paged".
*/
struct file_operations fhgfs_file_pagecache_ops =
{
.open = FhgfsOps_open,
.release = FhgfsOps_release,
.read_iter = FhgfsOps_read_iter,
.write_iter = FhgfsOps_write_iter,
.fsync = FhgfsOps_fsync,
.flush = FhgfsOps_flush,
.llseek = FhgfsOps_llseek,
.flock = FhgfsOps_flock,
.lock = FhgfsOps_lock,
.mmap = FhgfsOps_mmap,
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
#endif // CONFIG_COMPAT
#ifdef KERNEL_HAS_GENERIC_FILE_SPLICE_READ
.splice_read = generic_file_splice_read,
#else
.splice_read = filemap_splice_read,
#endif
#ifdef KERNEL_HAS_ITER_FILE_SPLICE_WRITE
.splice_write = iter_file_splice_write,
#else
.splice_write = generic_file_splice_write,
#endif
#ifdef KERNEL_HAS_GENERIC_FILE_SENDFILE
.sendfile = generic_file_sendfile, // removed in 2.6.23 (now handled via splice)
#endif // LINUX_VERSION_CODE
};
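/**
* Operations for directories.
*/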
struct file_operations fhgfs_dir_ops =
{
.open = FhgfsOps_opendirIncremental,
.release = FhgfsOps_releasedir,
#ifdef KERNEL_HAS_ITERATE_DIR
#if defined(KERNEL_HAS_FOPS_ITERATE)
.iterate = FhgfsOps_iterateIncremental, // linux 3.11 renamed readdir to iterate
#else
.iterate_shared = FhgfsOps_iterateIncremental, // linux 6.3 removed .iterate; .iterate_shared is its parallel (shared-lock) variant
#endif
#else
.readdir = FhgfsOps_readdirIncremental, // linux 3.11 renamed readdir to iterate
#endif // LINUX_VERSION_CODE
.read = generic_read_dir, // just returns the appropriate error code
.fsync = FhgfsOps_fsync,
.llseek = FhgfsOps_llseekdir,
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
#endif // CONFIG_COMPAT
};
/**
* Operations for files with cache type "buffered" and "none".
*/
struct address_space_operations fhgfs_address_ops =
{
#ifdef KERNEL_HAS_READ_FOLIO
.read_folio = FhgfsOps_read_folio,
#else
.readpage = FhgfsOpsPages_readpage,
#endif
#ifdef KERNEL_HAS_FOLIO
.readahead = FhgfsOpsPages_readahead,
.dirty_folio = filemap_dirty_folio,
#else
.readpages = FhgfsOpsPages_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
#endif
.writepage = FhgfsOpsPages_writepage,
.writepages = FhgfsOpsPages_writepages,
.direct_IO = FhgfsOps_directIO,
.write_begin = FhgfsOps_write_begin,
.write_end = FhgfsOps_write_end,
};
/**
* Operations for files with cache type "paged".
*/
struct address_space_operations fhgfs_address_pagecache_ops =
{
#ifdef KERNEL_HAS_READ_FOLIO
.read_folio = FhgfsOps_read_folio,
#else
.readpage = FhgfsOpsPages_readpage,
#endif
#ifdef KERNEL_HAS_FOLIO
.readahead = FhgfsOpsPages_readahead,
.dirty_folio = filemap_dirty_folio,
#else
.readpages = FhgfsOpsPages_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
#endif
.writepage = FhgfsOpsPages_writepage,
.writepages = FhgfsOpsPages_writepages,
.direct_IO = FhgfsOps_directIO,
.write_begin = FhgfsOps_write_begin,
.write_end = FhgfsOps_write_end,
};
/**
* note: rewinddir is seek to offset 0.
*
* @param origin dirs allow only SEEK_SET (via seekdir/rewinddir from userspace).
*/
loff_t FhgfsOps_llseekdir(struct file *file, loff_t offset, int origin)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
const char* logContext = "FhgfsOps_llseekDir";
struct inode* inode = file_inode(file);
loff_t retVal = 0;
FsDirInfo* dirInfo = __FhgfsOps_getDirInfo(file);
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"offset: %lld directive: %d", (long long)offset, origin);
if(origin != SEEK_SET)
{
if (origin == SEEK_CUR && offset == 0) {
// Some applications use lseek with SEEK_CUR and offset = 0 to get the current position in
// the file. To support that special case, we will translate the request into a SEEK_SET
// with the current file position as the offset.
offset = file->f_pos;
origin = SEEK_SET;
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"offset: %lld position: %lld directive: %d", (long long)offset, (long long)file->f_pos,
origin);
} else {
return -EINVAL;
}
}
retVal = generic_file_llseek_unlocked(file, offset, origin);
if(likely(retVal >= 0) )
{
// invalidate any retrieved contents to keep things in sync with server offset
StrCpyVec* contents = FsDirInfo_getDirContents(dirInfo);
StrCpyVec_clear(contents);
FsDirInfo_setCurrentContentsPos(dirInfo, 0);
FsDirInfo_setServerOffset(dirInfo, offset);
FsDirInfo_setEndOfDir(dirInfo, false);
}
return retVal;
}
loff_t FhgfsOps_llseek(struct file *file, loff_t offset, int origin)
{
const char* logContext = "FhgfsOps_llseek";
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
bool isGloballyLockedAppend =
FsFileInfo_getAppending(fileInfo) && Config_getTuneUseGlobalAppendLocks(cfg);
loff_t retVal = 0;
struct inode *inode = file->f_mapping->host;
FhgfsIsizeHints iSizeHints;
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"offset: %lld directive: %d", (long long)offset, origin);
/* note: globally locked append with SEEK_CUR is a special case, because we need to flush
the cache to find out the current offset (which is not required without append) */
if( (origin == SEEK_END) ||
(isGloballyLockedAppend && (origin == SEEK_CUR) ) )
{ // seek to position relative to end-of-file => flush cache and update current file size first
// (note: refreshInode() also flushes caches for correct file size)
retVal = __FhgfsOps_refreshInode(app, inode, NULL, &iSizeHints);
if(retVal)
goto clean_up;
spin_lock(&inode->i_lock); // L O C K
// SEEK_CUR reads (and modifies) f_pos, so in buffered append mode move to end first
if(origin == SEEK_CUR)
file->f_pos = inode->i_size;
retVal = generic_file_llseek_unlocked(file, offset, origin);
spin_unlock(&inode->i_lock); // U N L O C K
}
else
{ // absolute or relative-to-current-pos seeks => generic stuff
retVal = generic_file_llseek_unlocked(file, offset, origin);
}
clean_up:
// clean-up
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, logContext, "retVal: %lld",
retVal);
return retVal;
}
/**
* Note: Currently unused method, as we're using the kernel's generic_readlink function.
*/
int FhgfsOps_readlink(struct dentry* dentry, char __user* buf, int size)
{
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_readlink";
int retVal;
struct inode* inode = dentry->d_inode;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(5, app, dentry, inode, logContext);
// check user buffer
if(unlikely(!os_access_ok(VERIFY_WRITE, buf, size) ) )
return -EFAULT;
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
retVal = FhgfsOpsHelper_readlink_kernel(app, FhgfsInode_getEntryInfo(fhgfsInode), buf, size);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
return retVal;
}
/**
* Opens a directory and prepares the handle for incremental readdir().
*/
int FhgfsOps_opendirIncremental(struct inode* inode, struct file* file)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_opendirIncremental";
int retVal = 0;
//struct dentry* dentry = file_dentry(file);
FsDirInfo* dirInfo;
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
//retVal = __FhgfsOps_refreshInode(app, inode); // not necessary
if(!retVal)
{ // success
dirInfo = FsDirInfo_construct(app);
__FhgfsOps_setDirInfo(dirInfo, file);
}
#ifdef FMODE_KABI_ITERATE
file->f_mode |= FMODE_KABI_ITERATE;
#endif
return retVal;
}
#ifdef KERNEL_HAS_ITERATE_DIR
int FhgfsOps_iterateIncremental(struct file* file, struct dir_context* ctx)
#else
int FhgfsOps_readdirIncremental(struct file* file, void* buf, filldir_t filldir)
#endif // LINUX_VERSION_CODE
{
/* note: if the user seeks to a custom offset, llseekdir will invalidate any retrieved contents
and set the new offset in the dirinfo object */
struct dentry* dentry = file_dentry(file);
struct super_block* superBlock = dentry->d_sb;
App* app = FhgfsOps_getApp(superBlock);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_readdirIncremental";
int retVal = 0;
FsDirInfo* dirInfo = __FhgfsOps_getDirInfo(file);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
StrCpyVec* dirContents = FsDirInfo_getDirContents(dirInfo);
UInt8Vec* dirContentsTypes = FsDirInfo_getDirContentsTypes(dirInfo);
StrCpyVec* dirContentIDs = FsDirInfo_getEntryIDs(dirInfo);
Int64CpyVec* serverOffsets = FsDirInfo_getServerOffsets(dirInfo);
#ifdef KERNEL_HAS_ITERATE_DIR
loff_t* pos = &(ctx->pos); // used by dir_emit()
#else
loff_t* pos = &(file->f_pos);
#endif // LINUX_VERSION_CODE
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOp(Log_SPAM, app, dentry, inode, logContext);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
// loop as long as filldir (or dir_emit) swallows more entries (or end of dir contents reached)
for( ; ; )
{
int refreshRes;
size_t contentsPos;
size_t contentsLength;
char* currentName;
DirEntryType currentEntryType;
unsigned currentOSEntryType;
uint64_t currentIno;
refreshRes = FhgfsOpsHelper_refreshDirInfoIncremental(app,
FhgfsInode_getEntryInfo(fhgfsInode), dirInfo, false);
if(unlikely(refreshRes) )
{ // error occurred
retVal = refreshRes;
break;
}
contentsLength = StrCpyVec_length(dirContents);
/* refreshDirInfoIncremental() guarantees that we either have a valid range for current
dir offset or that dirContents list is empty */
if(!contentsLength)
{ // end of dir
LOG_DEBUG(log, Log_SPAM, logContext, "reached end of dir");
break;
}
contentsPos = FsDirInfo_getCurrentContentsPos(dirInfo);
currentName = StrCpyVec_at(dirContents, contentsPos);
currentEntryType = UInt8Vec_at(dirContentsTypes, contentsPos);
currentOSEntryType = OsTypeConv_dirEntryTypeToOS(currentEntryType);
LOG_DEBUG_FORMATTED(log, Log_SPAM, logContext,
"name: %s; pos: %lld; contentsPos: %lld/%lld; finalContents: %s",
currentName, (long long)*pos, (long long)contentsPos,
(long long)contentsLength, FsDirInfo_getEndOfDir(dirInfo) ? "yes" : "no");
if(!strcmp(".", currentName) )
currentIno = inode->i_ino;
else
if(!strcmp("..", currentName) )
#if defined(KERNEL_HAS_PARENT_INO)
currentIno = parent_ino(dentry);
#else
currentIno = d_parent_ino(dentry);
#endif
else
{ // generate inode number from entryID
const char* currentEntryID = StrCpyVec_at(dirContentIDs, contentsPos);
currentIno = FhgfsInode_generateInodeID(superBlock, currentEntryID,
strlen(currentEntryID) );
}
if(is_32bit_api() && (currentIno > UINT_MAX) )
currentIno = currentIno >> 32; // (32-bit apps would fail with EOVERFLOW)
#ifdef KERNEL_HAS_ITERATE_DIR
if(!dir_emit(
ctx, currentName, strlen(currentName), currentIno, currentOSEntryType) )
break;
#else
if(filldir(
buf, currentName, strlen(currentName), *pos, currentIno, currentOSEntryType) < 0)
break;
#endif // LINUX_VERSION_CODE
LOG_DEBUG_FORMATTED(log, Log_SPAM, logContext, "filled: %s", currentName);
// advance dir position (yes, it's alright to use the old contentsPos for the next round here)
(*pos) = Int64CpyVec_at(serverOffsets, contentsPos);
// increment contents vector offset
FsDirInfo_setCurrentContentsPos(dirInfo, contentsPos+1);
} // end of for-loop
// clean-up
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
return retVal;
}
/**
* Note: This works for _opendir() and for _opendirIncremental().
*/
int FhgfsOps_releasedir(struct inode* inode, struct file* file)
{
const char* logContext = "FhgfsOps_releasedir";
FsObjectInfo* fsObjectInfo = __FhgfsOps_getObjectInfo(file);
App* app = FsObjectInfo_getApp(fsObjectInfo);
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
FsObjectInfo_virtualDestruct(fsObjectInfo);
return 0;
}
/**
* Open a file, may be called from vfs or lookup/atomic open.
*
* @param lookupInfo is NULL if this is a direct open call from the vfs
*/
int FhgfsOps_openReferenceHandle(App* app, struct inode* inode, struct file* file,
unsigned openFlags, LookupIntentInfoOut* lookupInfo, uint32_t* outVersion)
{
Config* cfg = App_getConfig(app);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_openReferenceHandle";
struct super_block* sb = inode->i_sb;
struct dentry* dentry = file_dentry(file);
int retVal = 0;
int fhgfsOpenFlags;
FileHandleType handleType;
FhgfsOpsErr openRes;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
if(unlikely(Logger_getLogLevel(log) >= 4) )
FhgfsOpsHelper_logOp(Log_DEBUG, app, dentry, inode, logContext);
fhgfsOpenFlags = OsTypeConv_openFlagsOsToFhgfs(openFlags, __FhgfsOps_isPagedMode(sb) );
openRes = FhgfsInode_referenceHandle(fhgfsInode, file_dentry(file), fhgfsOpenFlags, false,
lookupInfo, &handleType, outVersion);
LOG_DEBUG_FORMATTED(log, 4, logContext, "remoting complete. result: %s",
FhgfsOpsErr_toErrString(openRes) );
if(openRes != FhgfsOpsErr_SUCCESS)
{ // error
retVal = FhgfsOpsErr_toSysErr(openRes);
}
else
{ // success => file is open (=> handle open flags)
FsFileInfo* fileInfo = FsFileInfo_construct(app, fhgfsOpenFlags, handleType);
// handle O_APPEND
if(file->f_flags & O_APPEND)
FsFileInfo_setAppending(fileInfo, true);
// handle O_DIRECT + disabled caching
if( (file->f_flags & O_DIRECT) ||
( (file->f_flags & O_APPEND) && !Config_getTuneUseBufferedAppend(cfg) ) ||
(Config_getTuneFileCacheTypeNum(cfg) == FILECACHETYPE_None) )
{ // disable caching
FsFileInfo_setAllowCaching(fileInfo, false);
}
__FhgfsOps_setFileInfo(fileInfo, file);
}
return retVal;
}
/**
* Open a file, vfs interface
*/
int FhgfsOps_open(struct inode* inode, struct file* file)
{
const char* logContext = "FhgfsOps_open";
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
struct dentry* dentry = file_dentry(file);
unsigned openFlags = file->f_flags;
LookupIntentInfoOut* lookupInfo = NULL; // not available for direct open
if(unlikely(Logger_getLogLevel(log) >= 4) )
FhgfsOpsHelper_logOp(4, app, dentry, inode, logContext);
return FhgfsOps_openReferenceHandle(app, inode, file, openFlags, lookupInfo, NULL);
}
/**
* Close a file.
*
* Note: We only get one shot, even in case of an error.
*/
int FhgfsOps_release(struct inode* inode, struct file* file)
{
const char* logContext = "FhgfsOps_release";
int retVal = 0;
FhgfsOpsErr closeRes;
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
FsObjectInfo* fsObjectInfo = __FhgfsOps_getObjectInfo(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FileHandleType handleType = FsFileInfo_getHandleType(fileInfo);
App* app = FsObjectInfo_getApp(fsObjectInfo);
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
if(unlikely(!fileInfo) )
{ // invalid file handle
return -EBADF;
}
FhgfsOps_releaseCancelLocks(inode, file); // cancel all locks that were not properly released yet
closeRes = FhgfsInode_releaseHandle(fhgfsInode, handleType, file_dentry(file));
if(closeRes != FhgfsOpsErr_SUCCESS)
{ // error
retVal = FhgfsOpsErr_toSysErr(closeRes);
}
// note: we free the fileInfo no matter whether the communication succeeded or not
// (because _release() won't be called again even if it didn't succeed)
FsObjectInfo_virtualDestruct( (FsObjectInfo*)fileInfo);
__FhgfsOps_setFileInfo( (FsFileInfo*)NULL, file);
// warning: linux vfs won't return this result to user apps. only flush() res is passed to apps.
return retVal;
}
/**
* Called during file close to unlock remaining entry locks and range locks that were not properly
* unlocked by the user-space application yet.
*/
int FhgfsOps_releaseCancelLocks(struct inode* inode, struct file* file)
{
int retVal = 0;
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
RemotingIOInfo ioInfo;
FhgfsOpsErr unlockRes;
/* (note: it is very unlikely that an application will use entry and range locking together on
the same file, so we have no special optimization regarding EntryMinInfoCopy for that case) */
if(FsFileInfo_getUsedEntryLocking(fileInfo) )
{ // entry locks were used with this file handle
int64_t clientFD = __FhgfsOps_getCurrentLockFD(file);
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
unlockRes = FhgfsOpsHelper_unlockEntryWithAsyncRetry(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, &ioInfo, clientFD);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
if(!retVal)
retVal = FhgfsOpsErr_toSysErr(unlockRes);
}
/* (note: FhgfsInode_getNumRangeLockPIDs() below is a shortcut to save the time for mutex locking
if no range locks were used for this inode.) */
if(FhgfsInode_getNumRangeLockPIDs(fhgfsInode) &&
FhgfsInode_removeRangeLockPID(fhgfsInode, __FhgfsOps_getCurrentLockPID() ) )
{ // current pid used range locking on this inode
int ownerPID = __FhgfsOps_getCurrentLockPID();
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
unlockRes = FhgfsOpsHelper_unlockRangeWithAsyncRetry(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, &ioInfo, ownerPID);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
if(!retVal)
retVal = FhgfsOpsErr_toSysErr(unlockRes);
}
return retVal;
}
/**
* Called by flock syscall.
*
* @return 0 on success, negative linux error code otherwise
*/
int FhgfsOps_flock(struct file* file, int cmd, struct file_lock* fileLock)
{
const char* logContext = __func__;
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
bool useGlobalFileLocks = Config_getTuneUseGlobalFileLocks(cfg);
FhgfsOpsErr globalLockRes = FhgfsOpsErr_SUCCESS;
int lockTypeFlags;
lockTypeFlags = OsTypeConv_flockTypeToFhgfs(fileLock);
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext, "lockType: %s",
LockingTk_lockTypeToStr(lockTypeFlags) );
// flush buffers before removing a global lock
if(useGlobalFileLocks && (lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true,
false);
/* note: can't return error here and must continue, because local unlock must always be done
to avoid BUG() statement being triggered in locks_remove_flock() on cleanup after kill */
if(unlikely(flushRes < 0) )
Logger_logFormatted(log, Log_NOTICE, logContext,
"Flushing before unlock failed. Continuing anyways. flushRes: %d", flushRes);
}
// global locking
if(useGlobalFileLocks)
{
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FsFileInfo_setUsedEntryLocking(fileInfo);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
globalLockRes = FhgfsOpsRemoting_flockEntryEx(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, app, ioInfo.fileHandleID, (size_t)FhgfsCommon_getFileLock(fileLock),
FhgfsCommon_getFileLockPID(fileLock), lockTypeFlags, true);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
LOG_DEBUG_FORMATTED(log, Log_DEBUG, logContext, "remoting complete. result: %s",
FhgfsOpsErr_toErrString(globalLockRes) );
/* note: local unlock must always be done for cleanup (otherwise e.g. killing a process
holding a lock results in the BUG() statement being triggered in locks_remove_flock() ) */
if( (globalLockRes != FhgfsOpsErr_SUCCESS) &&
!(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
return FhgfsOpsErr_toSysErr(globalLockRes);
}
// local locking
{
#if defined(KERNEL_HAS_LOCKS_FILELOCK_INODE_WAIT) || defined(KERNEL_HAS_LOCKS_LOCK_INODE_WAIT)
int localLockRes = locks_lock_inode_wait(file_inode(file), fileLock);
#else
int localLockRes = flock_lock_file_wait(file, fileLock);
#endif
if(!useGlobalFileLocks)
return localLockRes;
if(localLockRes &&
(globalLockRes == FhgfsOpsErr_SUCCESS) &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) )
{ // sanity check
Logger_logFormatted(log, Log_NOTICE, logContext,
"Unexpected: Global locking succeeded, but local locking failed. SysErr: %d",
localLockRes);
}
}
// flush buffers after we got a new global lock
if(useGlobalFileLocks &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) &&
(globalLockRes == FhgfsOpsErr_SUCCESS) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true,
false);
if(unlikely(flushRes < 0) )
return flushRes; // flush error occurred
}
return FhgfsOpsErr_toSysErr(globalLockRes);
}
/**
* Called by fcntl syscall (F_GETLK, F_SETLK, F_SETLKW) for file range locking.
*
* @return 0 on success, negative linux error code otherwise
*/
int FhgfsOps_lock(struct file* file, int cmd, struct file_lock* fileLock)
{
const char* logContext = "FhgfsOps_lock (fcntl)";
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
bool useGlobalFileLocks = Config_getTuneUseGlobalFileLocks(cfg);
FhgfsOpsErr globalLockRes = FhgfsOpsErr_SUCCESS;
int lockTypeFlags;
// handle user request for conflicting locks (always local-only currently, see notes below)
if(cmd == F_GETLK)
{ // get conflicting lock
/* note: it's questionable if returning remote locks makes sense (because the local app could
misinterpret the pid of the lock-holder), so we do local only for now. */
posix_test_lock(file, fileLock);
/* note: "fileLock->fl_type != F_UNLCK" would tell us now whether a conflicting local lock
was found */
return 0;
}
lockTypeFlags = OsTypeConv_flockTypeToFhgfs(fileLock);
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"lockType: %s; start: %lld; end: %lld", LockingTk_lockTypeToStr(lockTypeFlags),
(long long)fileLock->fl_start, (long long)fileLock->fl_end);
// flush buffers before removing a global lock
if(useGlobalFileLocks &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true, false);
/* note: can't return error here and must continue, because local unlock must always be done
to avoid BUG() statement being triggered in locks_remove_flock() on cleanup after kill */
if(unlikely(flushRes < 0) )
Logger_logFormatted(log, Log_NOTICE, logContext,
"Flushing before unlock failed. Continuing anyways. flushRes: %d", flushRes);
}
// global locking
if(useGlobalFileLocks)
{
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_addRangeLockPID(fhgfsInode, FhgfsCommon_getFileLockPID(fileLock));
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
globalLockRes = FhgfsOpsRemoting_flockRangeEx(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, ioInfo.app, ioInfo.fileHandleID, FhgfsCommon_getFileLockPID(fileLock),
lockTypeFlags, fileLock->fl_start, fileLock->fl_end, true);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
LOG_DEBUG_FORMATTED(log, Log_DEBUG, logContext, "remoting complete. result: %s",
FhgfsOpsErr_toErrString(globalLockRes) );
/* note: local unlock must always be done for cleanup (otherwise e.g. killing a process
holding a lock results in the BUG() statement being triggered in locks_remove_flock() ) */
if( (globalLockRes != FhgfsOpsErr_SUCCESS) &&
!(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
return FhgfsOpsErr_toSysErr(globalLockRes);
}
// local locking
{
/* note on local+global locking:
we need to call posix_lock_file_wait() even with global locks, because inode->i_flock needs
to be set, so that locks_remove_posix() gets active (via filp_close() ) and thus the
condition "Record locks are not inherited by a child created via fork(2), but are preserved
across an execve(2)." [man 2 fcntl] holds.
Otherwise we wouldn't be notified about an unlock on parent process exit, as there are
still references to the filp and thus our ->release() isn't invoked. (See trac #271) */
/* note on local/global locking order:
local locking needs to be done after global locking, because otherwise if global locking
failed we wouldn't know how to undo the local locking (e.g. if the process acquires a
shared lock for the second time or does a merge with existing ranges). */
#if defined(KERNEL_HAS_LOCKS_FILELOCK_INODE_WAIT) || defined(KERNEL_HAS_LOCKS_LOCK_INODE_WAIT)
int localLockRes = locks_lock_inode_wait(file_inode(file), fileLock);
#else
int localLockRes = posix_lock_file_wait(file, fileLock);
#endif
//printk_fhgfs_debug(KERN_WARNING, "posix_lock_file result=%d, cmd=%d\n", localLockRes, cmd);
if(!useGlobalFileLocks)
return localLockRes;
if(localLockRes &&
(globalLockRes == FhgfsOpsErr_SUCCESS) &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) )
{ // sanity check
Logger_logFormatted(log, Log_NOTICE, logContext,
"Unexpected: Global locking succeeded, but local locking failed. SysErr: %d",
localLockRes);
}
}
// flush buffers after we got a new global lock
if(useGlobalFileLocks &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) &&
(globalLockRes == FhgfsOpsErr_SUCCESS) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true, false);
if(unlikely(flushRes < 0) )
return flushRes; // flush error occurred
}
return FhgfsOpsErr_toSysErr(globalLockRes);
}
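/**
* Common read path for cache types "buffered" and "none": optionally flushes and invalidates the
* page cache (tuneCoherentBuffers), reads through the buffer cache helper and falls back to the
* sparse-file read path if the server returned less than requested although i_size says there is
* more; updates the given offset and accounts the read in /proc/<pid>/io.
*
* @param offsetPointer file offset; updated by the number of bytes read.
* @return number of bytes read or negative linux error code.
*/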
static ssize_t read_common(struct file *file, struct iov_iter *iter, size_t size, loff_t *offsetPointer)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
ssize_t readRes;
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)*offsetPointer, (long long)size);
IGNORE_UNUSED_VARIABLE(app);
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
if (app->cfg->tuneCoherentBuffers)
{
readRes = filemap_write_and_wait(file->f_mapping);
if (readRes < 0)
return readRes;
// ignore the -EBUSY we could receive here, because there is just *no* way we can keep caches
// coherent without locking everything all the time. if this produces inconsistent data,
// something must have been racy anyway.
invalidate_inode_pages2(file->f_mapping);
// Increment coherent read/write counter
atomic_inc(&fhgfsInode->coRWInProg);
}
readRes = FhgfsOpsHelper_readCached(iter, size, *offsetPointer, fhgfsInode, fileInfo, &ioInfo);
//readRes = FhgfsOpsRemoting_readfile(buf, size, *offsetPointer, &ioInfo);
if(readRes < 0)
{ // read error (=> transform negative fhgfs error code to system error code)
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return FhgfsOpsErr_toSysErr(-readRes);
}
*offsetPointer += readRes;
FsFileInfo_setLastReadOffset(fileInfo, *offsetPointer);
if( ( (size_t)readRes < size) && (i_size_read(inode) > *offsetPointer) )
{ // sparse file compatibility mode
ssize_t readSparseRes = __FhgfsOps_readSparse(
file, iter, size - readRes, *offsetPointer);
if(unlikely(readSparseRes < 0) )
{
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return readSparseRes;
}
*offsetPointer += readSparseRes;
readRes += readSparseRes;
FsFileInfo_setLastReadOffset(fileInfo, *offsetPointer);
}
// add to /proc/<pid>/io
task_io_account_read(readRes);
// Decrement coherent read/write counter
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return readRes;
}
/**
* Special reading mode that is slower (e.g. not parallel) but compatible with sparse files.
*
* Note: Intended to be just a helper for actual read methods (e.g. won't increase the offset
* pointer).
*
* @return negative Linux error code on error, read bytes otherwise
*/
ssize_t __FhgfsOps_readSparse(struct file* file, struct iov_iter *iter, size_t size, loff_t offset)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
ssize_t readRes;
loff_t i_size;
FhgfsOpsErr helperReadRes;
FhgfsIsizeHints iSizeHints;
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)offset, (long long)size);
readRes = __FhgfsOps_refreshInode(app, inode, NULL, &iSizeHints);
if(unlikely(readRes) )
return readRes;
i_size = i_size_read(inode);
if(i_size <= offset)
return 0; // EOF
// adapt read length to current file length
size = MIN(size, (unsigned long long)(i_size - offset) );
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
helperReadRes = FhgfsOpsHelper_readOrClearUser(app, iter, size, offset, fileInfo, &ioInfo);
if(unlikely(helperReadRes != FhgfsOpsErr_SUCCESS) )
return FhgfsOpsErr_toSysErr(helperReadRes);
return size;
}
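/**
* read_iter for cache type "paged": revalidates the page cache mapping and then delegates to
* generic_file_read_iter().
*/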
ssize_t FhgfsOps_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
size_t count = iov_iter_count(to);
loff_t pos = iocb->ki_pos;
struct file* file = iocb->ki_filp;
struct address_space* mapping = file->f_mapping;
struct inode* inode = mapping->host;
App* app = FhgfsOps_getApp(inode->i_sb);
ssize_t retVal;
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)pos, (long long)count);
IGNORE_UNUSED_VARIABLE(pos);
IGNORE_UNUSED_VARIABLE(count);
retVal = __FhgfsOps_revalidateMapping(app, inode);
if(unlikely(retVal) )
{ // error
return retVal;
}
return generic_file_read_iter(iocb, to);
}
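/**
* read_iter for cache types "buffered" and "none": delegates to read_common() with the iocb
* position and the full iter length.
*/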
static ssize_t FhgfsOps_buffered_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
return read_common(iocb->ki_filp, to, iov_iter_count(to), &iocb->ki_pos);
}
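/**
* Common write path for cache types "buffered" and "none": runs the generic write checks and
* removes suid/sgid privileges, optionally flushes and invalidates the page cache
* (tuneCoherentBuffers), moves to the remote end-of-file for locally locked append mode (for
* globally locked append the offset is left to the server via writeOffset -1), writes through the
* buffer cache helper, updates offset/i_size and accounts the write in /proc/<pid>/io.
*
* @param offsetPointer file offset; updated by the number of bytes written.
* @return number of bytes written or negative linux error code.
*/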
static ssize_t write_common(struct file *file, struct iov_iter *from, size_t size, loff_t *offsetPointer)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Config* cfg = App_getConfig(app);
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
loff_t writeOffset;
ssize_t writeRes;
loff_t newMinFileSize; // to update i_size after write
bool isLocallyLockedAppend =
FsFileInfo_getAppending(fileInfo) && !Config_getTuneUseGlobalAppendLocks(cfg);
bool isGloballyLockedAppend =
FsFileInfo_getAppending(fileInfo) && Config_getTuneUseGlobalAppendLocks(cfg);
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)*offsetPointer, (long long)size);
inode_lock(inode);
{
writeRes = os_generic_write_checks(file, offsetPointer, &size, S_ISBLK(inode->i_mode) );
if (likely(! writeRes)) // success
writeRes = file_remove_privs(file);
}
inode_unlock(inode);
if (unlikely(writeRes))
return writeRes;
if (app->cfg->tuneCoherentBuffers)
{
/* this flush is necessary to ensure that delayed flushing of the page cache does not
* overwrite the data written here, even though it was written to the file first. */
writeRes = filemap_write_and_wait(file->f_mapping);
if (writeRes < 0)
return writeRes;
/* ignore the -EBUSY we could receive here, because there is just *no* way we can keep caches
* coherent without locking everything all the time. if this produces inconsistent data,
* something must have been racy anyway. */
invalidate_inode_pages2(file->f_mapping);
//Increment coherent rw counter
atomic_inc(&fhgfsInode->coRWInProg);
}
if(isLocallyLockedAppend)
{ // appending without global locks => move file offset to end-of-file before writing
/* note on flush and lock: the flush here must be inside the local lock, but cannot happen at
the place where we take the global lock (because that might be called from a flush path
itself), that's why global and local locks are taken at different places. */
int flushRes;
FhgfsOpsErr statRes;
fhgfs_stat fhgfsStat;
Fhgfsinode_appendLock(fhgfsInode); // L O C K (append)
flushRes = __FhgfsOps_flush(app, file, false, false, true, false);
if(unlikely(flushRes < 0) )
{ // flush error
writeRes = flushRes;
goto unlockappend_and_exit;
}
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
/* note on stat here: we could pass -1 to _writeCached and remove the stat here, but the
disadvantage would be that we don't have the correct file offset for i_size then, so we
leave the stat here for now. */
statRes = FhgfsOpsRemoting_statDirect(app, FhgfsInode_getEntryInfo(fhgfsInode), &fhgfsStat);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
if(unlikely(statRes != FhgfsOpsErr_SUCCESS) )
{ // remote stat error
writeRes = FhgfsOpsErr_toSysErr(statRes);
goto unlockappend_and_exit;
}
*offsetPointer = fhgfsStat.size;
}
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
writeOffset = isGloballyLockedAppend ? -1 : *offsetPointer;
writeRes = FhgfsOpsHelper_writeCached(from, size, writeOffset, fhgfsInode, fileInfo, &ioInfo);
//writeRes = FhgfsOpsRemoting_writefile(from, size, *offsetPointer, &ioInfo);
if(unlikely(writeRes < 0) )
{ // write error (=> transform negative fhgfs error code to system error code)
writeRes = FhgfsOpsErr_toSysErr(-writeRes);
goto unlockappend_and_exit;
}
if(!isGloballyLockedAppend)
{ // for (buffered) global append locks, new offset/filesize would be unknown
newMinFileSize = *offsetPointer + writeRes; // update with old offset to avoid offset==0 check
*offsetPointer += writeRes;
FsFileInfo_setLastWriteOffset(fileInfo, *offsetPointer);
// check current file size and update if necessary (also important for sparse read heuristic)
spin_lock(&inode->i_lock);
if(inode->i_size < newMinFileSize)
i_size_write(inode, newMinFileSize);
spin_unlock(&inode->i_lock);
}
// add to /proc/<pid>/io
task_io_account_write(writeRes);
unlockappend_and_exit:
if(isLocallyLockedAppend)
Fhgfsinode_appendUnlock(fhgfsInode); // U N L O C K (append)
// Decrement coherent read/write counter
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return writeRes;
}
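/**
* write_iter for cache type "paged": refreshes the file size for O_APPEND, runs the generic write
* checks and delegates to generic_file_write_iter(); for sync writes (or after a previous
* writepage error) the page cache is flushed and waited for afterwards.
*/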
ssize_t FhgfsOps_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
size_t count = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
struct file* file = iocb->ki_filp;
struct dentry* dentry = file_dentry(file);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = __func__;
FhgfsIsizeHints iSizeHints;
ssize_t retVal;
int writeCheckRes;
FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "(offset: %lld; size: %lld)",
(long long)pos, (long long)count);
if (iocb->ki_pos != pos)
{ /* Similar to WARN_ON(iocb->ki_pos != pos), as fuse does */
Logger_logErrFormatted(log, logContext, "Bug: iocb->ki_pos != pos (%lld vs %lld)",
iocb->ki_pos, pos);
dump_stack();
}
if(iocb->ki_filp->f_flags & O_APPEND)
{ // O_APPEND => flush (for correct size) and refresh file size
retVal = __FhgfsOps_refreshInode(app, inode, NULL, &iSizeHints);
if(retVal)
return retVal;
}
writeCheckRes = os_generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode) );
if(unlikely(writeCheckRes) )
return writeCheckRes;
if(!count)
return 0;
if( (file->f_flags & O_APPEND) && (pos != iocb->ki_pos) )
{
/* pos was updated by generic_write_checks (append writes), so we also need to update
* iocb->ki_pos, otherwise the generic write path (generic_file_write_iter) may hit a BUG_ON */
iocb->ki_pos = pos;
}
iov_iter_truncate(from, count);
retVal = generic_file_write_iter(iocb, from);
if( (retVal >= 0)
&& ( (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC) )
|| unlikely(FhgfsInode_getHasWritePageError(fhgfsInode)) ) )
{ // sync I/O => flush and wait
struct address_space* mapping = inode->i_mapping;
if(mapping->nrpages)
{
int writeRes = filemap_fdatawrite(mapping);
if(writeRes >= 0)
{
int waitRes = filemap_fdatawait(mapping);
if(waitRes < 0)
retVal = waitRes;
}
else
retVal = writeRes;
}
if (unlikely(FhgfsInode_getHasWritePageError(fhgfsInode) ) && retVal >= 0)
FhgfsInode_clearWritePageError(fhgfsInode);
} // end of if(sync)
return retVal;
}
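/**
* write_iter for cache types "buffered" and "none": delegates to write_common() with the iocb
* position and the full iter length.
*/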
static ssize_t FhgfsOps_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
return write_common(iocb->ki_filp, from, iov_iter_count(from), &iocb->ki_pos);
}
#ifdef KERNEL_HAS_FSYNC_RANGE /* added in vanilla 3.1 */
int FhgfsOps_fsync(struct file* file, loff_t start, loff_t end, int datasync)
{
struct dentry* dentry = file_dentry(file);
#elif !defined(KERNEL_HAS_FSYNC_DENTRY)
int FhgfsOps_fsync(struct file* file, int datasync)
{
struct dentry* dentry = file_dentry(file);
#else /* LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34) */
int FhgfsOps_fsync(struct file* file, struct dentry* dentry, int datasync)
{
#endif // LINUX_VERSION_CODE
App* app = FhgfsOps_getApp(dentry->d_sb);
Config* cfg = App_getConfig(app);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_fsync";
int retVal = 0;
FsObjectInfo* fsObjectInfo;
struct inode* inode = file_inode(file);
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(5, app, dentry, inode, logContext);
fsObjectInfo = __FhgfsOps_getObjectInfo(file);
// syncing something other than a file?
if(FsObjectInfo_getObjectType(fsObjectInfo) != FsObjectType_FILE)
goto clean_up;
retVal = __FhgfsOps_flush(app, file, false, Config_getTuneRemoteFSync(cfg), true,
false);
if(retVal)
goto clean_up;
clean_up:
return retVal;
}
/**
* Flush data from local cache to servers.
* Note: This method is in fact a sync, as it waits until the data has arrived remotely.
*
* @param discardCacheOnError whether or not to discard the internal buffer cache in case of an
* error (typically you will only want to set true here during file close).
* @param forceRemoteFlush whether a remote fsync is executed depends on this value and on the
* corresponding config value (fsync passes true here, flush doesn't)
* @param checkSession whether or not a server crash detection must be done on the server
* @param isClose whether or not the method is called by a close
* @return negative linux error code on error
*/
int __FhgfsOps_flush(App* app, struct file *file, bool discardCacheOnError,
bool forceRemoteFlush, bool checkSession, bool isClose)
{
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
const char* logContext = __func__;
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
int filemapWaitRes;
int inodeWriteRes;
FhgfsOpsErr flushRes;
FhgfsOpsErr bumpRes;
int retVal = 0;
bool hasWriteHandle = FhgfsInode_hasWriteHandle(fhgfsInode);
bool doSyncOnClose = Config_getSysSyncOnClose(App_getConfig(app)) && isClose;
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
if (hasWriteHandle || FhgfsInode_getHasDirtyPages(fhgfsInode) )
{
// flush page cache
inodeWriteRes = write_inode_now(inode, 1);
filemapWaitRes = filemap_fdatawait(file->f_mapping);
if(unlikely(inodeWriteRes < 0 || filemapWaitRes < 0) )
{
retVal = (inodeWriteRes < 0) ? inodeWriteRes : filemapWaitRes;
goto clean_up;
}
}
// flush buffer cache
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
flushRes = FhgfsOpsHelper_flushCache(app, fhgfsInode, discardCacheOnError);
if(unlikely(flushRes != FhgfsOpsErr_SUCCESS) )
{ // error
retVal = FhgfsOpsErr_toSysErr(flushRes);
goto clean_up;
}
// remote fsync
if(forceRemoteFlush || (checkSession && Config_getSysSessionCheckOnClose(cfg)) || doSyncOnClose)
{
FhgfsOpsErr fsyncRes = FhgfsOpsRemoting_fsyncfile(&ioInfo, forceRemoteFlush, checkSession,
doSyncOnClose);
if(unlikely(fsyncRes != FhgfsOpsErr_SUCCESS) )
{
retVal = FhgfsOpsErr_toSysErr(fsyncRes);
goto clean_up;
}
}
if ((cfg->eventLogMask & EventLogMask_FLUSH) && (file->f_flags & (O_ACCMODE | O_TRUNC)))
{
struct FileEvent event;
FileEvent_init(&event, FileEventType_FLUSH, file_dentry(file));
FhgfsInode_entryInfoReadLock(fhgfsInode);
bumpRes = FhgfsOpsRemoting_bumpFileVersion(FhgfsOps_getApp(file_dentry(file)->d_sb),
&fhgfsInode->entryInfo, false, &event);
FhgfsInode_entryInfoReadUnlock(fhgfsInode);
FileEvent_uninit(&event);
if (bumpRes != FhgfsOpsErr_SUCCESS)
retVal = FhgfsOpsErr_toSysErr(bumpRes);
}
clean_up:
return retVal;
}
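/**
* Memory-map a file: with tuneCoherentBuffers the buffer cache is flushed first (see the retry
* notes below), then the mapping is created via generic_file_mmap() and the inode is refreshed.
*/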
int FhgfsOps_mmap(struct file* file, struct vm_area_struct* vma)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_mmap";
int locked;
int retry;
int max_retry;
FhgfsIsizeHints iSizeHints;
int retVal;
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(5, app, file_dentry(file), inode, logContext);
locked = 0;
retry = 0;
/*
* If reads/writes are already in progress, retry for the inode cache lock
* for up to MMAP_RETRY_LOCK_EASY iterations: the reads/writes will flush the
* cache anyway, so even if mmap cannot get the inode cache lock, it can
* proceed with the operation. If no reads/writes are in progress, wait for
* more iterations (MMAP_RETRY_LOCK_HARD) before giving up on the lock. But
* mmap must not block forever on the cache lock, to avoid a deadlock.
* If mmap has to proceed without getting the lock, we print a warning message
* indicating that the cache might not be coherent.
*/
max_retry = atomic_read(&fhgfsInode->coRWInProg) > 0 ?
MMAP_RETRY_LOCK_EASY : MMAP_RETRY_LOCK_HARD;
if (app->cfg->tuneCoherentBuffers)
{
FhgfsOpsErr flushRes;
do
{
locked = FhgfsInode_fileCacheExclusiveTryLock(fhgfsInode);
if (locked)
break;
// Sleep and retry for the lock
mdelay(10);
retry++;
} while (!locked && retry < max_retry);
if (locked)
{
flushRes = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, false);
if (flushRes != FhgfsOpsErr_SUCCESS)
{
FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode);
retVal = FhgfsOpsErr_toSysErr(flushRes);
goto exit;
}
}
else
printk_fhgfs_debug(KERN_WARNING,
"mmap couldn't flush the cache. Cache might not be coherent\n");
}
retVal = generic_file_mmap(file, vma);
if(!retVal)
retVal = __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true);
if (app->cfg->tuneCoherentBuffers && locked)
FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode);
exit:
LOG_DEBUG_FORMATTED(log, 5, logContext, "result: %d", retVal);
return retVal;
}
/**
* @param fsdata can be used to hand any data over to write_end() (but note that the old
* prepare_write() doesn't have this)
* @return 0 on success
*/
static int FhgfsOps_write_begin(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len,
#if BEEGFS_HAS_WRITE_FLAGS
unsigned flags,
#endif
beegfs_pgfol_t *pgfolp, void** fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
loff_t offset = pos & (PAGE_SIZE - 1);
loff_t page_start = pos & PAGE_MASK;
loff_t i_size;
struct page* page;
int retVal = 0;
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
// FhgfsOpsHelper_logOpDebug(app, file->f_dentry, __func__, "(offset: %lld; page_start: %lld; len: %u)",
// (long long)offset, (long long)page_start, len);
IGNORE_UNUSED_VARIABLE(app);
page = beegfs_grab_cache_page(mapping, index,
#if BEEGFS_HAS_WRITE_FLAGS
flags
#else
0
#endif
);
if(!page)
{
retVal = -ENOMEM;
goto clean_up;
}
if(PageUptodate(page) )
goto clean_up;
if(len == PAGE_SIZE)
{
// two possibilities:
// a) full page write => no need to read-update the page from the server
// b) short write (with offset) => will lead to sync write
goto clean_up;
}
i_size = i_size_read(mapping->host);
if( (page_start >= i_size) ||
(!offset && ( (pos + len) >= i_size) ) )
{
// we don't need to read data beyond the end of the file
// => zero it, and set the page up-to-date
zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
// note: PageChecked means the rest of the page (the part not being written to) is up-to-date,
// so once our data is written, the whole page is up-to-date.
SetPageChecked(page);
goto clean_up;
}
// it is read-modify-write, so update the page with the server content
retVal = FhgfsOpsPages_readpageSync(file, page);
clean_up:
// clean-up
*pgfolp = beegfs_to_pgfol(page);
return retVal;
}
/**
* @param copied the amount that was able to be copied ("copied==len" is always true if
* write_begin() was called with the AOP_FLAG_UNINTERRUPTIBLE flag)
* @param fsdata whatever write_begin() set here
* @return < 0 on failure, number of bytes copied into pagecache (<= 'copied') otherwise
**/
static int FhgfsOps_write_end(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len, unsigned copied, beegfs_pgfol_t pgfol, void* fsdata)
{
struct page* page = beegfs_get_page(pgfol);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
struct inode* inode = mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
int retVal;
struct dentry *dentry= file_dentry(file);
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = __func__;
// FhgfsOpsHelper_logOpDebug(app, dentry, logContext, "pos: %lld; len: %u; copied: %u",
// (long long)pos, len, copied);
IGNORE_UNUSED_VARIABLE(logContext);
if(PageChecked(page) )
{
// note: see write_begin() for meaning of PageChecked()
if(copied == len)
SetPageUptodate(page);
ClearPageChecked(page);
}
else
if(!PageUptodate(page) && (copied == PAGE_SIZE) )
SetPageUptodate(page);
if(!PageUptodate(page) )
{
unsigned offset = pos & (PAGE_SIZE - 1);
char* buf = kmap(page);
RemotingIOInfo ioInfo;
ssize_t writeRes;
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_incWriteBackCounter(fhgfsInode);
writeRes = FhgfsOpsRemoting_writefile(&buf[offset], copied, pos, &ioInfo);
spin_lock(&inode->i_lock);
FhgfsInode_setLastWriteBackOrIsizeWriteTime(fhgfsInode);
FhgfsInode_decWriteBackCounter(fhgfsInode);
spin_unlock(&inode->i_lock);
if(likely(writeRes > 0) )
{
retVal = writeRes;
pos += writeRes;
}
else
retVal = FhgfsOpsErr_toSysErr(-writeRes);
kunmap(page);
}
else
{
retVal = copied;
pos += copied;
if (!PageDirty(page) )
{ // Only add if the page is not dirty yet (don't add the same page twice...)
FhgfsInode_incNumDirtyPages(fhgfsInode);
}
set_page_dirty(page); // could be in the if-condition above, but for safety we set it here
}
if(likely(retVal > 0) )
{
spin_lock(&inode->i_lock);
if(pos > inode->i_size)
{
FhgfsInode_setPageWriteFlag(fhgfsInode);
FhgfsInode_setLastWriteBackOrIsizeWriteTime(fhgfsInode);
FhgfsInode_setNoIsizeDecrease(fhgfsInode);
i_size_write(inode, pos);
}
spin_unlock(&inode->i_lock);
}
unlock_page(page);
put_page(page);
// clean-up
// LOG_DEBUG_FORMATTED(log, 5, logContext, "complete. retVal: %d", retVal);
IGNORE_UNUSED_VARIABLE(log);
return retVal;
}
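/**
* Common direct IO path for reads and writes: forwards the iov_iter directly to the remoting
* read/write routines (no page cache involved); reads fall back to the sparse-file path if the
* server returned less than requested although i_size says there is more.
*
* @param rw READ or WRITE
* @param pos file offset
*/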
static ssize_t __FhgfsOps_directIO_common(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
struct iov_iter bgfsIter = *iter; // Was a wrapper copy. Now is just a defensive copy. Still needed?
struct file* file = iocb->ki_filp;
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
struct dentry* dentry = file_dentry(file);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
RemotingIOInfo ioInfo;
ssize_t remotingRes;
const char* logContext = __func__;
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
FhgfsOpsHelper_logOpDebug(app, dentry, inode, logContext, "(%s, pos: %lld)",
(rw == WRITE) ? "WRITE" : "READ", (long long)pos);
IGNORE_UNUSED_VARIABLE(logContext); // non-debug builds
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
if(rw == WRITE)
{ // write
remotingRes = FhgfsOpsRemoting_writefileVec(&bgfsIter, pos, &ioInfo, false);
}
else if(rw == READ)
{ // read
remotingRes = FhgfsOpsRemoting_readfileVec(&bgfsIter, iov_iter_count(&bgfsIter), pos, &ioInfo, fhgfsInode);
if( (remotingRes >= 0 && iov_iter_count(&bgfsIter))
&& ( i_size_read(inode) > (pos + remotingRes) ) )
{ // sparse file compatibility mode
ssize_t readSparseRes = __FhgfsOps_readSparse(file, &bgfsIter, iov_iter_count(&bgfsIter), pos + remotingRes);
if(unlikely(readSparseRes < 0) )
remotingRes = readSparseRes;
else
remotingRes += readSparseRes;
}
}
else
{
#ifdef WARN_ONCE
WARN_ONCE(1, "unexpected: rw value !=READ and !=WRITE. (int value: %d)\n", rw);
#endif
return -EINVAL;
}
//Write back wrapped iter.
*iter = bgfsIter;
if(unlikely(remotingRes < 0) )
{ // error occurred
LOG_DEBUG_FORMATTED(log, 1, logContext, "error: %s",
FhgfsOpsErr_toErrString(-remotingRes) );
IGNORE_UNUSED_VARIABLE(log);
return FhgfsOpsErr_toSysErr(-remotingRes);
}
if(rw == WRITE)
task_io_account_write(remotingRes);
else
task_io_account_read(remotingRes);
return remotingRes;
}
/**
* Note: This method must be defined because otherwise the kernel rejects open() with O_DIRECT in
* fs/open.c. However, it is only called indirectly through the generic file read/write routines
* (and swapping code), so it should actually never be called for buffered IO.
*
* @param iocb I/O control block with open file handle and file offset (ki_pos)
* @param iter I/O buffer vectors; the read/write direction is taken from iov_iter_rw(iter)
*/
ssize_t FhgfsOps_directIO(struct kiocb *iocb, struct iov_iter *iter)
{
int rw = iov_iter_rw(iter);
loff_t pos = iocb->ki_pos;
return __FhgfsOps_directIO_common(rw, iocb, iter, pos);
}