#include <app/log/Logger.h>
#include <app/App.h>
#include <app/config/Config.h>
#include <common/toolkit/vector/StrCpyVec.h>
#include <common/toolkit/LockingTk.h>
#include <common/storage/StorageErrors.h>
#include <common/toolkit/StringTk.h>
#include <filesystem/ProcFs.h>
#include <os/iov_iter.h>
#include <os/OsCompat.h>
#include <os/OsTypeConversion.h>
#include "FhgfsOpsHelper.h"
#include "FhgfsOpsFile.h"
#include "FhgfsOpsDir.h"
#include "FhgfsOpsInode.h"
#include "FhgfsOpsIoctl.h"
#include "FhgfsOpsSuper.h"
#include "FhgfsOps_versions.h"
#include "FhgfsOpsPages.h"
#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/mm.h>
#include <linux/mpage.h>
#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/delay.h>
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
#endif // CONFIG_COMPAT
static ssize_t FhgfsOps_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
static ssize_t FhgfsOps_buffered_read_iter(struct kiocb *iocb, struct iov_iter *to);
static int FhgfsOps_write_begin(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len,
#if BEEGFS_HAS_WRITE_FLAGS
unsigned flags,
#endif
beegfs_pgfol_t *pgfolp, void** fsdata);
static int FhgfsOps_write_end(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len, unsigned copied, beegfs_pgfol_t pgfol, void* fsdata);
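/* retry limits for acquiring the inode file-cache lock in FhgfsOps_mmap() (10ms delay per retry):
EASY when reads/writes are already in progress (they flush the cache anyway), HARD otherwise */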
#define MMAP_RETRY_LOCK_EASY 100
#define MMAP_RETRY_LOCK_HARD 500
/**
* Operations for files with cache type "buffered" and "none".
*/
struct file_operations fhgfs_file_buffered_ops =
{
.open = FhgfsOps_open,
.release = FhgfsOps_release,
.fsync = FhgfsOps_fsync,
.flush = FhgfsOps_flush,
.llseek = FhgfsOps_llseek,
.flock = FhgfsOps_flock,
.lock = FhgfsOps_lock,
.mmap = FhgfsOps_mmap,
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
#endif // CONFIG_COMPAT
#ifdef KERNEL_HAS_GENERIC_FILE_SPLICE_READ
.splice_read = generic_file_splice_read,
#else
.splice_read = filemap_splice_read,
#endif
#ifdef KERNEL_HAS_ITER_FILE_SPLICE_WRITE
.splice_write = iter_file_splice_write,
#else
.splice_write = generic_file_splice_write,
#endif
.read_iter = FhgfsOps_buffered_read_iter,
.write_iter = FhgfsOps_buffered_write_iter, // replacement for aio_write
#ifdef KERNEL_HAS_GENERIC_FILE_SENDFILE
.sendfile = generic_file_sendfile, // removed in 2.6.23 (now handled via splice)
#endif // LINUX_VERSION_CODE
};
/**
* Operations for files with cache type "paged".
*/
struct file_operations fhgfs_file_pagecache_ops =
{
.open = FhgfsOps_open,
.release = FhgfsOps_release,
.read_iter = FhgfsOps_read_iter,
.write_iter = FhgfsOps_write_iter,
.fsync = FhgfsOps_fsync,
.flush = FhgfsOps_flush,
.llseek = FhgfsOps_llseek,
.flock = FhgfsOps_flock,
.lock = FhgfsOps_lock,
.mmap = FhgfsOps_mmap,
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
#endif // CONFIG_COMPAT
#ifdef KERNEL_HAS_GENERIC_FILE_SPLICE_READ
.splice_read = generic_file_splice_read,
#else
.splice_read = filemap_splice_read,
#endif
#ifdef KERNEL_HAS_ITER_FILE_SPLICE_WRITE
.splice_write = iter_file_splice_write,
#else
.splice_write = generic_file_splice_write,
#endif
#ifdef KERNEL_HAS_GENERIC_FILE_SENDFILE
.sendfile = generic_file_sendfile, // removed in 2.6.23 (now handled via splice)
#endif // LINUX_VERSION_CODE
};
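/**
* Operations for directories.
*/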
struct file_operations fhgfs_dir_ops =
{
.open = FhgfsOps_opendirIncremental,
.release = FhgfsOps_releasedir,
#ifdef KERNEL_HAS_ITERATE_DIR
#if defined(KERNEL_HAS_FOPS_ITERATE)
.iterate = FhgfsOps_iterateIncremental, // linux 3.11 renamed readdir to iterate
#else
.iterate_shared = FhgfsOps_iterateIncremental, // linux 6.3 removed .iterate; .iterate_shared is its parallel (shared-lock) variant
#endif
#else
.readdir = FhgfsOps_readdirIncremental, // linux 3.11 renamed readdir to iterate
#endif // LINUX_VERSION_CODE
.read = generic_read_dir, // just returns the appropriate error code
.fsync = FhgfsOps_fsync,
.llseek = FhgfsOps_llseekdir,
.unlocked_ioctl = FhgfsOpsIoctl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = FhgfsOpsIoctl_compatIoctl,
#endif // CONFIG_COMPAT
};
/**
* Operations for files with cache type "buffered" and "none".
*/
struct address_space_operations fhgfs_address_ops =
{
#ifdef KERNEL_HAS_READ_FOLIO
.read_folio = FhgfsOps_read_folio,
#else
.readpage = FhgfsOpsPages_readpage,
#endif
#ifdef KERNEL_HAS_FOLIO
.readahead = FhgfsOpsPages_readahead,
.dirty_folio = filemap_dirty_folio,
#else
.readpages = FhgfsOpsPages_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
#endif
.writepage = FhgfsOpsPages_writepage,
.writepages = FhgfsOpsPages_writepages,
.direct_IO = FhgfsOps_directIO,
.write_begin = FhgfsOps_write_begin,
.write_end = FhgfsOps_write_end,
};
/**
* Operations for files with cache type "paged".
*/
struct address_space_operations fhgfs_address_pagecache_ops =
{
#ifdef KERNEL_HAS_READ_FOLIO
.read_folio = FhgfsOps_read_folio,
#else
.readpage = FhgfsOpsPages_readpage,
#endif
#ifdef KERNEL_HAS_FOLIO
.readahead = FhgfsOpsPages_readahead,
.dirty_folio = filemap_dirty_folio,
#else
.readpages = FhgfsOpsPages_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
#endif
.writepage = FhgfsOpsPages_writepage,
.writepages = FhgfsOpsPages_writepages,
.direct_IO = FhgfsOps_directIO,
.write_begin = FhgfsOps_write_begin,
.write_end = FhgfsOps_write_end,
};
/**
* note: rewinddir is seek to offset 0.
*
* @param origin dirs allow only SEEK_SET (via seekdir/rewinddir from userspace).
*/
loff_t FhgfsOps_llseekdir(struct file *file, loff_t offset, int origin)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
const char* logContext = "FhgfsOps_llseekDir";
struct inode* inode = file_inode(file);
loff_t retVal = 0;
FsDirInfo* dirInfo = __FhgfsOps_getDirInfo(file);
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"offset: %lld directive: %d", (long long)offset, origin);
if(origin != SEEK_SET)
{
if (origin == SEEK_CUR && offset == 0) {
// Some applications use lseek with SEEK_CUR and offset = 0 to get the current position in
// the file. To support that special case, we will translate the request into a SEEK_SET
// with the current file position as the offset.
offset = file->f_pos;
origin = SEEK_SET;
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"offset: %lld position: %lld directive: %d", (long long)offset, (long long)file->f_pos,
origin);
} else {
return -EINVAL;
}
}
retVal = generic_file_llseek_unlocked(file, offset, origin);
if(likely(retVal >= 0) )
{
// invalidate any retrieved contents to keep things in sync with server offset
StrCpyVec* contents = FsDirInfo_getDirContents(dirInfo);
StrCpyVec_clear(contents);
FsDirInfo_setCurrentContentsPos(dirInfo, 0);
FsDirInfo_setServerOffset(dirInfo, offset);
FsDirInfo_setEndOfDir(dirInfo, false);
}
return retVal;
}
loff_t FhgfsOps_llseek(struct file *file, loff_t offset, int origin)
{
const char* logContext = "FhgfsOps_llseek";
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
bool isGloballyLockedAppend =
FsFileInfo_getAppending(fileInfo) && Config_getTuneUseGlobalAppendLocks(cfg);
loff_t retVal = 0;
struct inode *inode = file->f_mapping->host;
FhgfsIsizeHints iSizeHints;
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"offset: %lld directive: %d", (long long)offset, origin);
/* note: globally locked append with SEEK_CUR is a special case, because we need to flush
the cache to find out the current offset (which is not required without append) */
if( (origin == SEEK_END) ||
(isGloballyLockedAppend && (origin == SEEK_CUR) ) )
{ // seek to position relative to end-of-file => flush cache and update current file size first
// (note: refreshInode() also flushes caches for correct file size)
retVal = __FhgfsOps_refreshInode(app, inode, NULL, &iSizeHints);
if(retVal)
goto clean_up;
spin_lock(&inode->i_lock); // L O C K
// SEEK_CUR reads (and modifies) f_pos, so in buffered append mode move to end first
if(origin == SEEK_CUR)
file->f_pos = inode->i_size;
retVal = generic_file_llseek_unlocked(file, offset, origin);
spin_unlock(&inode->i_lock); // U N L O C K
}
else
{ // absolute or relative-to-current-pos seeks => generic stuff
retVal = generic_file_llseek_unlocked(file, offset, origin);
}
clean_up:
// clean-up
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, logContext, "retVal: %lld",
retVal);
return retVal;
}
/**
* Note: Currently unused method, as we're using the kernel's generic_readlink function.
*/
int FhgfsOps_readlink(struct dentry* dentry, char __user* buf, int size)
{
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_readlink";
int retVal;
struct inode* inode = dentry->d_inode;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(5, app, dentry, inode, logContext);
// check user buffer
if(unlikely(!os_access_ok(VERIFY_WRITE, buf, size) ) )
return -EFAULT;
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
retVal = FhgfsOpsHelper_readlink_kernel(app, FhgfsInode_getEntryInfo(fhgfsInode), buf, size);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
return retVal;
}
/**
* Opens a directory and prepares the handle for incremental readdir().
*/
int FhgfsOps_opendirIncremental(struct inode* inode, struct file* file)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_opendirIncremental";
int retVal = 0;
//struct dentry* dentry = file_dentry(file);
FsDirInfo* dirInfo;
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
//retVal = __FhgfsOps_refreshInode(app, inode); // not necessary
if(!retVal)
{ // success
dirInfo = FsDirInfo_construct(app);
__FhgfsOps_setDirInfo(dirInfo, file);
}
#ifdef FMODE_KABI_ITERATE
file->f_mode |= FMODE_KABI_ITERATE;
#endif
return retVal;
}
#ifdef KERNEL_HAS_ITERATE_DIR
int FhgfsOps_iterateIncremental(struct file* file, struct dir_context* ctx)
#else
int FhgfsOps_readdirIncremental(struct file* file, void* buf, filldir_t filldir)
#endif // LINUX_VERSION_CODE
{
/* note: if the user seeks to a custom offset, llseekdir will invalidate any retrieved contents
and set the new offset in the dirinfo object */
struct dentry* dentry = file_dentry(file);
struct super_block* superBlock = dentry->d_sb;
App* app = FhgfsOps_getApp(superBlock);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_readdirIncremental";
int retVal = 0;
FsDirInfo* dirInfo = __FhgfsOps_getDirInfo(file);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
StrCpyVec* dirContents = FsDirInfo_getDirContents(dirInfo);
UInt8Vec* dirContentsTypes = FsDirInfo_getDirContentsTypes(dirInfo);
StrCpyVec* dirContentIDs = FsDirInfo_getEntryIDs(dirInfo);
Int64CpyVec* serverOffsets = FsDirInfo_getServerOffsets(dirInfo);
#ifdef KERNEL_HAS_ITERATE_DIR
loff_t* pos = &(ctx->pos); // used by dir_emit()
#else
loff_t* pos = &(file->f_pos);
#endif // LINUX_VERSION_CODE
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOp(Log_SPAM, app, dentry, inode, logContext);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
// loop as long as filldir (or dir_emit) swallows more entries (or end of dir contents reached)
for( ; ; )
{
int refreshRes;
size_t contentsPos;
size_t contentsLength;
char* currentName;
DirEntryType currentEntryType;
unsigned currentOSEntryType;
uint64_t currentIno;
refreshRes = FhgfsOpsHelper_refreshDirInfoIncremental(app,
FhgfsInode_getEntryInfo(fhgfsInode), dirInfo, false);
if(unlikely(refreshRes) )
{ // error occurred
retVal = refreshRes;
break;
}
contentsLength = StrCpyVec_length(dirContents);
/* refreshDirInfoIncremental() guarantees that we either have a valid range for current
dir offset or that dirContents list is empty */
if(!contentsLength)
{ // end of dir
LOG_DEBUG(log, Log_SPAM, logContext, "reached end of dir");
break;
}
contentsPos = FsDirInfo_getCurrentContentsPos(dirInfo);
currentName = StrCpyVec_at(dirContents, contentsPos);
currentEntryType = UInt8Vec_at(dirContentsTypes, contentsPos);
currentOSEntryType = OsTypeConv_dirEntryTypeToOS(currentEntryType);
LOG_DEBUG_FORMATTED(log, Log_SPAM, logContext,
"name: %s; pos: %lld; contentsPos: %lld/%lld; finalContents: %s",
currentName, (long long)*pos, (long long)contentsPos,
(long long)contentsLength, FsDirInfo_getEndOfDir(dirInfo) ? "yes" : "no");
if(!strcmp(".", currentName) )
currentIno = inode->i_ino;
else
if(!strcmp("..", currentName) )
#if defined(KERNEL_HAS_PARENT_INO)
currentIno = parent_ino(dentry);
#else
currentIno = d_parent_ino(dentry);
#endif
else
{ // generate inode number from entryID
const char* currentEntryID = StrCpyVec_at(dirContentIDs, contentsPos);
currentIno = FhgfsInode_generateInodeID(superBlock, currentEntryID,
strlen(currentEntryID) );
}
if(is_32bit_api() && (currentIno > UINT_MAX) )
currentIno = currentIno >> 32; // (32-bit apps would fail with EOVERFLOW)
#ifdef KERNEL_HAS_ITERATE_DIR
if(!dir_emit(
ctx, currentName, strlen(currentName), currentIno, currentOSEntryType) )
break;
#else
if(filldir(
buf, currentName, strlen(currentName), *pos, currentIno, currentOSEntryType) < 0)
break;
#endif // LINUX_VERSION_CODE
LOG_DEBUG_FORMATTED(log, Log_SPAM, logContext, "filled: %s", currentName);
// advance dir position (yes, it's alright to use the old contentsPos for the next round here)
(*pos) = Int64CpyVec_at(serverOffsets, contentsPos);
// increment contents vector offset
FsDirInfo_setCurrentContentsPos(dirInfo, contentsPos+1);
} // end of for-loop
// clean-up
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
return retVal;
}
/**
* Note: This works for _opendir() and for _opendirIncremental().
*/
int FhgfsOps_releasedir(struct inode* inode, struct file* file)
{
const char* logContext = "FhgfsOps_releasedir";
FsObjectInfo* fsObjectInfo = __FhgfsOps_getObjectInfo(file);
App* app = FsObjectInfo_getApp(fsObjectInfo);
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
FsObjectInfo_virtualDestruct(fsObjectInfo);
return 0;
}
/**
* Open a file, may be called from vfs or lookup/atomic open.
*
* @param lookupInfo is NULL if this is a direct open call from the vfs
*/
int FhgfsOps_openReferenceHandle(App* app, struct inode* inode, struct file* file,
unsigned openFlags, LookupIntentInfoOut* lookupInfo, uint32_t* outVersion)
{
Config* cfg = App_getConfig(app);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_openReferenceHandle";
struct super_block* sb = inode->i_sb;
struct dentry* dentry = file_dentry(file);
int retVal = 0;
int fhgfsOpenFlags;
FileHandleType handleType;
FhgfsOpsErr openRes;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
if(unlikely(Logger_getLogLevel(log) >= 4) )
FhgfsOpsHelper_logOp(Log_DEBUG, app, dentry, inode, logContext);
fhgfsOpenFlags = OsTypeConv_openFlagsOsToFhgfs(openFlags, __FhgfsOps_isPagedMode(sb) );
openRes = FhgfsInode_referenceHandle(fhgfsInode, file_dentry(file), fhgfsOpenFlags, false,
lookupInfo, &handleType, outVersion);
LOG_DEBUG_FORMATTED(log, 4, logContext, "remoting complete. result: %s",
FhgfsOpsErr_toErrString(openRes) );
if(openRes != FhgfsOpsErr_SUCCESS)
{ // error
retVal = FhgfsOpsErr_toSysErr(openRes);
}
else
{ // success => file is open (=> handle open flags)
FsFileInfo* fileInfo = FsFileInfo_construct(app, fhgfsOpenFlags, handleType);
// handle O_APPEND
if(file->f_flags & O_APPEND)
FsFileInfo_setAppending(fileInfo, true);
// handle O_DIRECT + disabled caching
if( (file->f_flags & O_DIRECT) ||
( (file->f_flags & O_APPEND) && !Config_getTuneUseBufferedAppend(cfg) ) ||
(Config_getTuneFileCacheTypeNum(cfg) == FILECACHETYPE_None) )
{ // disable caching
FsFileInfo_setAllowCaching(fileInfo, false);
}
__FhgfsOps_setFileInfo(fileInfo, file);
}
return retVal;
}
/**
* Open a file, vfs interface
*/
int FhgfsOps_open(struct inode* inode, struct file* file)
{
const char* logContext = "FhgfsOps_open";
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
struct dentry* dentry = file_dentry(file);
unsigned openFlags = file->f_flags;
LookupIntentInfoOut* lookupInfo = NULL; // not available for direct open
if(unlikely(Logger_getLogLevel(log) >= 4) )
FhgfsOpsHelper_logOp(4, app, dentry, inode, logContext);
return FhgfsOps_openReferenceHandle(app, inode, file, openFlags, lookupInfo, NULL);
}
/**
* Close a file.
*
* Note: We only get one shot, even in case of an error.
*/
int FhgfsOps_release(struct inode* inode, struct file* file)
{
const char* logContext = "FhgfsOps_release";
int retVal = 0;
FhgfsOpsErr closeRes;
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
FsObjectInfo* fsObjectInfo = __FhgfsOps_getObjectInfo(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FileHandleType handleType = FsFileInfo_getHandleType(fileInfo);
App* app = FsObjectInfo_getApp(fsObjectInfo);
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
if(unlikely(!fileInfo) )
{ // invalid file handle
return -EBADF;
}
FhgfsOps_releaseCancelLocks(inode, file); // cancel all locks that were not properly released yet
closeRes = FhgfsInode_releaseHandle(fhgfsInode, handleType, file_dentry(file));
if(closeRes != FhgfsOpsErr_SUCCESS)
{ // error
retVal = FhgfsOpsErr_toSysErr(closeRes);
}
// note: we free the fileInfo no matter whether the communication succeeded or not
// (because _release() won't be called again even if it didn't succeed)
FsObjectInfo_virtualDestruct( (FsObjectInfo*)fileInfo);
__FhgfsOps_setFileInfo( (FsFileInfo*)NULL, file);
// warning: linux vfs won't return this result to user apps. only flush() res is passed to apps.
return retVal;
}
/**
* Called during file close to unlock remaining entry locks and range locks that were not properly
* unlocked by the user-space application yet.
*/
int FhgfsOps_releaseCancelLocks(struct inode* inode, struct file* file)
{
int retVal = 0;
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
RemotingIOInfo ioInfo;
FhgfsOpsErr unlockRes;
/* (note: it is very unlikely that an application will use entry and range locking together on
the same file, so we have no special optimization regarding EntryMinInfoCopy for that case) */
if(FsFileInfo_getUsedEntryLocking(fileInfo) )
{ // entry locks were used with this file handle
int64_t clientFD = __FhgfsOps_getCurrentLockFD(file);
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
unlockRes = FhgfsOpsHelper_unlockEntryWithAsyncRetry(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, &ioInfo, clientFD);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
if(!retVal)
retVal = FhgfsOpsErr_toSysErr(unlockRes);
}
/* (note: FhgfsInode_getNumRangeLockPIDs() below is a shortcut to save the time for mutex locking
if no range locks were used for this inode.) */
if(FhgfsInode_getNumRangeLockPIDs(fhgfsInode) &&
FhgfsInode_removeRangeLockPID(fhgfsInode, __FhgfsOps_getCurrentLockPID() ) )
{ // current pid used range locking on this inode
int ownerPID = __FhgfsOps_getCurrentLockPID();
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
unlockRes = FhgfsOpsHelper_unlockRangeWithAsyncRetry(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, &ioInfo, ownerPID);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
if(!retVal)
retVal = FhgfsOpsErr_toSysErr(unlockRes);
}
return retVal;
}
/**
* Called by flock syscall.
*
* @return 0 on success, negative linux error code otherwise
*/
int FhgfsOps_flock(struct file* file, int cmd, struct file_lock* fileLock)
{
const char* logContext = __func__;
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
bool useGlobalFileLocks = Config_getTuneUseGlobalFileLocks(cfg);
FhgfsOpsErr globalLockRes = FhgfsOpsErr_SUCCESS;
int lockTypeFlags;
lockTypeFlags = OsTypeConv_flockTypeToFhgfs(fileLock);
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext, "lockType: %s",
LockingTk_lockTypeToStr(lockTypeFlags) );
// flush buffers before removing a global lock
if(useGlobalFileLocks && (lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true,
false);
/* note: can't return error here and must continue, because local unlock must always be done
to avoid BUG() statement being triggered in locks_remove_flock() on cleanup after kill */
if(unlikely(flushRes < 0) )
Logger_logFormatted(log, Log_NOTICE, logContext,
"Flushing before unlock failed. Continuing anyways. flushRes: %d", flushRes);
}
// global locking
if(useGlobalFileLocks)
{
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FsFileInfo_setUsedEntryLocking(fileInfo);
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
globalLockRes = FhgfsOpsRemoting_flockEntryEx(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, app, ioInfo.fileHandleID, (size_t)FhgfsCommon_getFileLock(fileLock),
FhgfsCommon_getFileLockPID(fileLock), lockTypeFlags, true);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
LOG_DEBUG_FORMATTED(log, Log_DEBUG, logContext, "remoting complete. result: %s",
FhgfsOpsErr_toErrString(globalLockRes) );
/* note: local unlock must always be done for cleanup (otherwise e.g. killing a process
holding a lock results in the BUG() statement being triggered in locks_remove_flock() ) */
if( (globalLockRes != FhgfsOpsErr_SUCCESS) &&
!(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
return FhgfsOpsErr_toSysErr(globalLockRes);
}
// local locking
{
#if defined(KERNEL_HAS_LOCKS_FILELOCK_INODE_WAIT) || defined(KERNEL_HAS_LOCKS_LOCK_INODE_WAIT)
int localLockRes = locks_lock_inode_wait(file_inode(file), fileLock);
#else
int localLockRes = flock_lock_file_wait(file, fileLock);
#endif
if(!useGlobalFileLocks)
return localLockRes;
if(localLockRes &&
(globalLockRes == FhgfsOpsErr_SUCCESS) &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) )
{ // sanity check
Logger_logFormatted(log, Log_NOTICE, logContext,
"Unexpected: Global locking succeeded, but local locking failed. SysErr: %d",
localLockRes);
}
}
// flush buffers after we got a new global lock
if(useGlobalFileLocks &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) &&
(globalLockRes == FhgfsOpsErr_SUCCESS) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true,
false);
if(unlikely(flushRes < 0) )
return flushRes; // flush error occurred
}
return FhgfsOpsErr_toSysErr(globalLockRes);
}
/**
* Called by fcntl syscall (F_GETLK, F_SETLK, F_SETLKW) for file range locking.
*
* @return 0 on success, negative linux error code otherwise
*/
int FhgfsOps_lock(struct file* file, int cmd, struct file_lock* fileLock)
{
const char* logContext = "FhgfsOps_lock (fcntl)";
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
bool useGlobalFileLocks = Config_getTuneUseGlobalFileLocks(cfg);
FhgfsOpsErr globalLockRes = FhgfsOpsErr_SUCCESS;
int lockTypeFlags;
// handle user request for conflicting locks (always local-only currently, see notes below)
if(cmd == F_GETLK)
{ // get conflicting lock
/* note: it's questionable if returning remote locks makes sense (because the local app could
misinterpret the pid of the lock-holder), so we do local only for now. */
posix_test_lock(file, fileLock);
/* note: "fileLock->fl_type != F_UNLCK" would tell us now whether a conflicting local lock
was found */
return 0;
}
lockTypeFlags = OsTypeConv_flockTypeToFhgfs(fileLock);
if(unlikely(Logger_getLogLevel(log) >= Log_SPAM) )
FhgfsOpsHelper_logOpMsg(Log_SPAM, app, file_dentry(file), inode, logContext,
"lockType: %s; start: %lld; end: %lld", LockingTk_lockTypeToStr(lockTypeFlags),
(long long)fileLock->fl_start, (long long)fileLock->fl_end);
// flush buffers before removing a global lock
if(useGlobalFileLocks &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true, false);
/* note: can't return error here and must continue, because local unlock must always be done
to avoid BUG() statement being triggered in locks_remove_flock() on cleanup after kill */
if(unlikely(flushRes < 0) )
Logger_logFormatted(log, Log_NOTICE, logContext,
"Flushing before unlock failed. Continuing anyways. flushRes: %d", flushRes);
}
// global locking
if(useGlobalFileLocks)
{
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_addRangeLockPID(fhgfsInode, FhgfsCommon_getFileLockPID(fileLock));
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
globalLockRes = FhgfsOpsRemoting_flockRangeEx(&fhgfsInode->entryInfo,
&fhgfsInode->entryInfoLock, ioInfo.app, ioInfo.fileHandleID, FhgfsCommon_getFileLockPID(fileLock),
lockTypeFlags, fileLock->fl_start, fileLock->fl_end, true);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
LOG_DEBUG_FORMATTED(log, Log_DEBUG, logContext, "remoting complete. result: %s",
FhgfsOpsErr_toErrString(globalLockRes) );
/* note: local unlock must always be done for cleanup (otherwise e.g. killing a process
holding a lock results in the BUG() statement being triggered in locks_remove_flock() ) */
if( (globalLockRes != FhgfsOpsErr_SUCCESS) &&
!(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_REMOVE) )
return FhgfsOpsErr_toSysErr(globalLockRes);
}
// local locking
{
/* note on local+global locking:
we need to call posix_lock_file_wait() even with global locks, because inode->i_flock needs
to be set, so that locks_remove_posix() gets active (via filp_close() ) and thus the
condition "Record locks are not inherited by a child created via fork(2), but are preserved
across an execve(2)." [man 2 fcntl] holds.
Otherwise we wouldn't be notified about an unlock on parent process exit, as there are
still references to the filp and thus our ->release() isn't invoked. (See trac #271) */
/* note on local/global locking order:
local locking needs to be done after global locking, because otherwise if global locking
failed we wouldn't know how to undo the local locking (e.g. if the process acquires a
shared lock for the second time or does a merge with existing ranges). */
#if defined(KERNEL_HAS_LOCKS_FILELOCK_INODE_WAIT) || defined(KERNEL_HAS_LOCKS_LOCK_INODE_WAIT)
int localLockRes = locks_lock_inode_wait(file_inode(file), fileLock);
#else
int localLockRes = posix_lock_file_wait(file, fileLock);
#endif
//printk_fhgfs_debug(KERN_WARNING, "posix_lock_file result=%d, cmd=%d\n", localLockRes, cmd);
if(!useGlobalFileLocks)
return localLockRes;
if(localLockRes &&
(globalLockRes == FhgfsOpsErr_SUCCESS) &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) )
{ // sanity check
Logger_logFormatted(log, Log_NOTICE, logContext,
"Unexpected: Global locking succeeded, but local locking failed. SysErr: %d",
localLockRes);
}
}
// flush buffers after we got a new global lock
if(useGlobalFileLocks &&
(lockTypeFlags & ENTRYLOCKTYPE_LOCKOPS_ADD) &&
(globalLockRes == FhgfsOpsErr_SUCCESS) )
{
int flushRes = __FhgfsOps_flush(app, file, false, false, true, false);
if(unlikely(flushRes < 0) )
return flushRes; // flush error occurred
}
return FhgfsOpsErr_toSysErr(globalLockRes);
}
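/**
* Common read path for cache types "buffered" and "none": optionally flushes and invalidates the
* page cache (tuneCoherentBuffers), reads through the buffer cache helper and falls back to the
* sparse-file read path if the server returned less than requested although i_size says there is
* more; updates the given offset and accounts the read in /proc/<pid>/io.
*
* @param offsetPointer file offset; updated by the number of bytes read.
* @return number of bytes read or negative linux error code.
*/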
static ssize_t read_common(struct file *file, struct iov_iter *iter, size_t size, loff_t *offsetPointer)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
ssize_t readRes;
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)*offsetPointer, (long long)size);
IGNORE_UNUSED_VARIABLE(app);
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
if (app->cfg->tuneCoherentBuffers)
{
readRes = filemap_write_and_wait(file->f_mapping);
if (readRes < 0)
return readRes;
// ignore the -EBUSY we could receive here, because there is just *no* way we can keep caches
// coherent without locking everything all the time. if this produces inconsistent data,
// something must have been racy anyway.
invalidate_inode_pages2(file->f_mapping);
// Increment coherent read/write counter
atomic_inc(&fhgfsInode->coRWInProg);
}
readRes = FhgfsOpsHelper_readCached(iter, size, *offsetPointer, fhgfsInode, fileInfo, &ioInfo);
//readRes = FhgfsOpsRemoting_readfile(buf, size, *offsetPointer, &ioInfo);
if(readRes < 0)
{ // read error (=> transform negative fhgfs error code to system error code)
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return FhgfsOpsErr_toSysErr(-readRes);
}
*offsetPointer += readRes;
FsFileInfo_setLastReadOffset(fileInfo, *offsetPointer);
if( ( (size_t)readRes < size) && (i_size_read(inode) > *offsetPointer) )
{ // sparse file compatibility mode
ssize_t readSparseRes = __FhgfsOps_readSparse(
file, iter, size - readRes, *offsetPointer);
if(unlikely(readSparseRes < 0) )
{
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return readSparseRes;
}
*offsetPointer += readSparseRes;
readRes += readSparseRes;
FsFileInfo_setLastReadOffset(fileInfo, *offsetPointer);
}
// add to /proc/<pid>/io
task_io_account_read(readRes);
// Decrement coherent read/write counter
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return readRes;
}
/**
* Special reading mode that is slower (e.g. not parallel) but compatible with sparse files.
*
* Note: Intended to be just a helper for actual read methods (e.g. won't increase the offset
* pointer).
*
* @return negative Linux error code on error, read bytes otherwise
*/
ssize_t __FhgfsOps_readSparse(struct file* file, struct iov_iter *iter, size_t size, loff_t offset)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
ssize_t readRes;
loff_t i_size;
FhgfsOpsErr helperReadRes;
FhgfsIsizeHints iSizeHints;
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)offset, (long long)size);
readRes = __FhgfsOps_refreshInode(app, inode, NULL, &iSizeHints);
if(unlikely(readRes) )
return readRes;
i_size = i_size_read(inode);
if(i_size <= offset)
return 0; // EOF
// adapt read length to current file length
size = MIN(size, (unsigned long long)(i_size - offset) );
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
helperReadRes = FhgfsOpsHelper_readOrClearUser(app, iter, size, offset, fileInfo, &ioInfo);
if(unlikely(helperReadRes != FhgfsOpsErr_SUCCESS) )
return FhgfsOpsErr_toSysErr(helperReadRes);
return size;
}
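/**
* read_iter for cache type "paged": revalidates the page cache mapping and then delegates to
* generic_file_read_iter().
*/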
ssize_t FhgfsOps_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
size_t count = iov_iter_count(to);
loff_t pos = iocb->ki_pos;
struct file* file = iocb->ki_filp;
struct address_space* mapping = file->f_mapping;
struct inode* inode = mapping->host;
App* app = FhgfsOps_getApp(inode->i_sb);
ssize_t retVal;
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)pos, (long long)count);
IGNORE_UNUSED_VARIABLE(pos);
IGNORE_UNUSED_VARIABLE(count);
retVal = __FhgfsOps_revalidateMapping(app, inode);
if(unlikely(retVal) )
{ // error
return retVal;
}
return generic_file_read_iter(iocb, to);
}
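/**
* read_iter for cache types "buffered" and "none": delegates to read_common() with the iocb
* position and the full iter length.
*/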
static ssize_t FhgfsOps_buffered_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
return read_common(iocb->ki_filp, to, iov_iter_count(to), &iocb->ki_pos);
}
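/**
* Common write path for cache types "buffered" and "none": runs the generic write checks and
* removes suid/sgid privileges, optionally flushes and invalidates the page cache
* (tuneCoherentBuffers), moves to the remote end-of-file for locally locked append mode (for
* globally locked append the offset is left to the server via writeOffset -1), writes through the
* buffer cache helper, updates offset/i_size and accounts the write in /proc/<pid>/io.
*
* @param offsetPointer file offset; updated by the number of bytes written.
* @return number of bytes written or negative linux error code.
*/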
static ssize_t write_common(struct file *file, struct iov_iter *from, size_t size, loff_t *offsetPointer)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Config* cfg = App_getConfig(app);
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
loff_t writeOffset;
ssize_t writeRes;
loff_t newMinFileSize; // to update i_size after write
bool isLocallyLockedAppend =
FsFileInfo_getAppending(fileInfo) && !Config_getTuneUseGlobalAppendLocks(cfg);
bool isGloballyLockedAppend =
FsFileInfo_getAppending(fileInfo) && Config_getTuneUseGlobalAppendLocks(cfg);
FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "(offset: %lld; size: %lld)",
(long long)*offsetPointer, (long long)size);
inode_lock(inode);
{
writeRes = os_generic_write_checks(file, offsetPointer, &size, S_ISBLK(inode->i_mode) );
if (likely(! writeRes)) // success
writeRes = file_remove_privs(file);
}
inode_unlock(inode);
if (unlikely(writeRes))
return writeRes;
if (app->cfg->tuneCoherentBuffers)
{
/* this flush is necessary to ensure that delayed flushing of the page cache does not
* overwrite the data written here, even though it was written to the file first. */
writeRes = filemap_write_and_wait(file->f_mapping);
if (writeRes < 0)
return writeRes;
/* ignore the -EBUSY we could receive here, because there is just *no* way we can keep caches
* coherent without locking everything all the time. if this produces inconsistent data,
* something must have been racy anyway. */
invalidate_inode_pages2(file->f_mapping);
//Increment coherent rw counter
atomic_inc(&fhgfsInode->coRWInProg);
}
if(isLocallyLockedAppend)
{ // appending without global locks => move file offset to end-of-file before writing
/* note on flush and lock: the flush here must be inside the local lock, but cannot happen at
the place where we take the global lock (because that might be called from a flush path
itself), that's why global and local locks are taken at different places. */
int flushRes;
FhgfsOpsErr statRes;
fhgfs_stat fhgfsStat;
Fhgfsinode_appendLock(fhgfsInode); // L O C K (append)
flushRes = __FhgfsOps_flush(app, file, false, false, true, false);
if(unlikely(flushRes < 0) )
{ // flush error
writeRes = flushRes;
goto unlockappend_and_exit;
}
FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo
/* note on stat here: we could pass -1 to _writeCached and remove the stat here, but the
disadvantage would be that we don't have the correct file offset for i_size then, so we
leave the stat here for now. */
statRes = FhgfsOpsRemoting_statDirect(app, FhgfsInode_getEntryInfo(fhgfsInode), &fhgfsStat);
FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo
if(unlikely(statRes != FhgfsOpsErr_SUCCESS) )
{ // remote stat error
writeRes = FhgfsOpsErr_toSysErr(statRes);
goto unlockappend_and_exit;
}
*offsetPointer = fhgfsStat.size;
}
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
writeOffset = isGloballyLockedAppend ? -1 : *offsetPointer;
writeRes = FhgfsOpsHelper_writeCached(from, size, writeOffset, fhgfsInode, fileInfo, &ioInfo);
//writeRes = FhgfsOpsRemoting_writefile(from, size, *offsetPointer, &ioInfo);
if(unlikely(writeRes < 0) )
{ // write error (=> transform negative fhgfs error code to system error code)
writeRes = FhgfsOpsErr_toSysErr(-writeRes);
goto unlockappend_and_exit;
}
if(!isGloballyLockedAppend)
{ // for (buffered) global append locks, new offset/filesize would be unknown
newMinFileSize = *offsetPointer + writeRes; // update with old offset to avoid offset==0 check
*offsetPointer += writeRes;
FsFileInfo_setLastWriteOffset(fileInfo, *offsetPointer);
// check current file size and update if necessary (also important for sparse read heuristic)
spin_lock(&inode->i_lock);
if(inode->i_size < newMinFileSize)
i_size_write(inode, newMinFileSize);
spin_unlock(&inode->i_lock);
}
// add to /proc/<pid>/io
task_io_account_write(writeRes);
unlockappend_and_exit:
if(isLocallyLockedAppend)
Fhgfsinode_appendUnlock(fhgfsInode); // U N L O C K (append)
// Decrement coherent read/write counter
if (app->cfg->tuneCoherentBuffers)
atomic_dec(&fhgfsInode->coRWInProg);
return writeRes;
}
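/**
* write_iter for cache type "paged": refreshes the file size for O_APPEND, runs the generic write
* checks and delegates to generic_file_write_iter(); for sync writes (or after a previous
* writepage error) the page cache is flushed and waited for afterwards.
*/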
ssize_t FhgfsOps_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
size_t count = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
struct file* file = iocb->ki_filp;
struct dentry* dentry = file_dentry(file);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = __func__;
FhgfsIsizeHints iSizeHints;
ssize_t retVal;
int writeCheckRes;
FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "(offset: %lld; size: %lld)",
(long long)pos, (long long)count);
if (iocb->ki_pos != pos)
{ /* Similar to WARN_ON(iocb->ki_pos != pos), as fuse does */
Logger_logErrFormatted(log, logContext, "Bug: iocb->ki_pos != pos (%lld vs %lld)",
iocb->ki_pos, pos);
dump_stack();
}
if(iocb->ki_filp->f_flags & O_APPEND)
{ // O_APPEND => flush (for correct size) and refresh file size
retVal = __FhgfsOps_refreshInode(app, inode, NULL, &iSizeHints);
if(retVal)
return retVal;
}
writeCheckRes = os_generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode) );
if(unlikely(writeCheckRes) )
return writeCheckRes;
if(!count)
return 0;
if( (file->f_flags & O_APPEND) && (pos != iocb->ki_pos) )
{
/* pos was updated by generic_write_checks (append writes), so we also need to update
* iocb->ki_pos, otherwise the generic write path (generic_file_write_iter) may hit a BUG_ON */
iocb->ki_pos = pos;
}
iov_iter_truncate(from, count);
retVal = generic_file_write_iter(iocb, from);
if( (retVal >= 0)
&& ( (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC) )
|| unlikely(FhgfsInode_getHasWritePageError(fhgfsInode)) ) )
{ // sync I/O => flush and wait
struct address_space* mapping = inode->i_mapping;
if(mapping->nrpages)
{
int writeRes = filemap_fdatawrite(mapping);
if(writeRes >= 0)
{
int waitRes = filemap_fdatawait(mapping);
if(waitRes < 0)
retVal = waitRes;
}
else
retVal = writeRes;
}
if (unlikely(FhgfsInode_getHasWritePageError(fhgfsInode) ) && retVal >= 0)
FhgfsInode_clearWritePageError(fhgfsInode);
} // end of if(sync)
return retVal;
}
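/**
* write_iter for cache types "buffered" and "none": delegates to write_common() with the iocb
* position and the full iter length.
*/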
static ssize_t FhgfsOps_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
return write_common(iocb->ki_filp, from, iov_iter_count(from), &iocb->ki_pos);
}
#ifdef KERNEL_HAS_FSYNC_RANGE /* added in vanilla 3.1 */
int FhgfsOps_fsync(struct file* file, loff_t start, loff_t end, int datasync)
{
struct dentry* dentry = file_dentry(file);
#elif !defined(KERNEL_HAS_FSYNC_DENTRY)
int FhgfsOps_fsync(struct file* file, int datasync)
{
struct dentry* dentry = file_dentry(file);
#else /* LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34) */
int FhgfsOps_fsync(struct file* file, struct dentry* dentry, int datasync)
{
#endif // LINUX_VERSION_CODE
App* app = FhgfsOps_getApp(dentry->d_sb);
Config* cfg = App_getConfig(app);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_fsync";
int retVal = 0;
FsObjectInfo* fsObjectInfo;
struct inode* inode = file_inode(file);
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(5, app, dentry, inode, logContext);
fsObjectInfo = __FhgfsOps_getObjectInfo(file);
// syncing something other than a file?
if(FsObjectInfo_getObjectType(fsObjectInfo) != FsObjectType_FILE)
goto clean_up;
retVal = __FhgfsOps_flush(app, file, false, Config_getTuneRemoteFSync(cfg), true,
false);
if(retVal)
goto clean_up;
clean_up:
return retVal;
}
/**
* Flush data from local cache to servers.
* Note: This method is in fact a sync, as it waits until the data has arrived remotely.
*
* @param discardCacheOnError whether or not to discard the internal buffer cache in case of an
* error (typically you will only want to set true here during file close).
* @param forceRemoteFlush whether a remote fsync is executed depends on this value and on the
* corresponding config value (fsync passes true here, flush doesn't)
* @param checkSession whether or not a server crash detection must be done on the server
* @param isClose whether or not the method is called by a close
* @return negative linux error code on error
*/
int __FhgfsOps_flush(App* app, struct file *file, bool discardCacheOnError,
bool forceRemoteFlush, bool checkSession, bool isClose)
{
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
const char* logContext = __func__;
struct inode* inode = file->f_mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
RemotingIOInfo ioInfo;
int filemapWaitRes;
int inodeWriteRes;
FhgfsOpsErr flushRes;
FhgfsOpsErr bumpRes;
int retVal = 0;
bool hasWriteHandle = FhgfsInode_hasWriteHandle(fhgfsInode);
bool doSyncOnClose = Config_getSysSyncOnClose(App_getConfig(app)) && isClose;
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(Log_SPAM, app, file_dentry(file), inode, logContext);
if (hasWriteHandle || FhgfsInode_getHasDirtyPages(fhgfsInode) )
{
// flush page cache
inodeWriteRes = write_inode_now(inode, 1);
filemapWaitRes = filemap_fdatawait(file->f_mapping);
if(unlikely(inodeWriteRes < 0 || filemapWaitRes < 0) )
{
retVal = (inodeWriteRes < 0) ? inodeWriteRes : filemapWaitRes;
goto clean_up;
}
}
// flush buffer cache
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
flushRes = FhgfsOpsHelper_flushCache(app, fhgfsInode, discardCacheOnError);
if(unlikely(flushRes != FhgfsOpsErr_SUCCESS) )
{ // error
retVal = FhgfsOpsErr_toSysErr(flushRes);
goto clean_up;
}
// remote fsync
if(forceRemoteFlush || (checkSession && Config_getSysSessionCheckOnClose(cfg)) || doSyncOnClose)
{
FhgfsOpsErr fsyncRes = FhgfsOpsRemoting_fsyncfile(&ioInfo, forceRemoteFlush, checkSession,
doSyncOnClose);
if(unlikely(fsyncRes != FhgfsOpsErr_SUCCESS) )
{
retVal = FhgfsOpsErr_toSysErr(fsyncRes);
goto clean_up;
}
}
if ((cfg->eventLogMask & EventLogMask_FLUSH) && (file->f_flags & (O_ACCMODE | O_TRUNC)))
{
struct FileEvent event;
FileEvent_init(&event, FileEventType_FLUSH, file_dentry(file));
FhgfsInode_entryInfoReadLock(fhgfsInode);
bumpRes = FhgfsOpsRemoting_bumpFileVersion(FhgfsOps_getApp(file_dentry(file)->d_sb),
&fhgfsInode->entryInfo, false, &event);
FhgfsInode_entryInfoReadUnlock(fhgfsInode);
FileEvent_uninit(&event);
if (bumpRes != FhgfsOpsErr_SUCCESS)
retVal = FhgfsOpsErr_toSysErr(bumpRes);
}
clean_up:
return retVal;
}
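/**
* Memory-map a file: with tuneCoherentBuffers the buffer cache is flushed first (see the retry
* notes below), then the mapping is created via generic_file_mmap() and the inode is refreshed.
*/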
int FhgfsOps_mmap(struct file* file, struct vm_area_struct* vma)
{
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = "FhgfsOps_mmap";
int locked;
int retry;
int max_retry;
FhgfsIsizeHints iSizeHints;
int retVal;
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
if(unlikely(Logger_getLogLevel(log) >= 5) )
FhgfsOpsHelper_logOp(5, app, file_dentry(file), inode, logContext);
locked = 0;
retry = 0;
/*
* If reads/writes are already in progress, retry for the inode cache lock
* for up to MMAP_RETRY_LOCK_EASY iterations: the reads/writes will flush the
* cache anyway, so even if mmap cannot get the inode cache lock, it can
* proceed with the operation. If no reads/writes are in progress, wait for
* more iterations (MMAP_RETRY_LOCK_HARD) before giving up on the lock. But
* mmap must not block forever on the cache lock, to avoid a deadlock.
* If mmap has to proceed without getting the lock, we print a warning message
* indicating that the cache might not be coherent.
*/
max_retry = atomic_read(&fhgfsInode->coRWInProg) > 0 ?
MMAP_RETRY_LOCK_EASY : MMAP_RETRY_LOCK_HARD;
if (app->cfg->tuneCoherentBuffers)
{
FhgfsOpsErr flushRes;
do
{
locked = FhgfsInode_fileCacheExclusiveTryLock(fhgfsInode);
if (locked)
break;
// Sleep and retry for the lock
mdelay(10);
retry++;
} while (!locked && retry < max_retry);
if (locked)
{
flushRes = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, false);
if (flushRes != FhgfsOpsErr_SUCCESS)
{
FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode);
retVal = FhgfsOpsErr_toSysErr(flushRes);
goto exit;
}
}
else
printk_fhgfs_debug(KERN_WARNING,
"mmap couldn't flush the cache. Cache might not be coherent\n");
}
retVal = generic_file_mmap(file, vma);
if(!retVal)
retVal = __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true);
if (app->cfg->tuneCoherentBuffers && locked)
FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode);
exit:
LOG_DEBUG_FORMATTED(log, 5, logContext, "result: %d", retVal);
return retVal;
}
/**
* @param fsdata can be used to hand any data over to write_end() (but note that the old
* prepare_write() doesn't have this)
* @return 0 on success
*/
static int FhgfsOps_write_begin(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len,
#if BEEGFS_HAS_WRITE_FLAGS
unsigned flags,
#endif
beegfs_pgfol_t *pgfolp, void** fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
loff_t offset = pos & (PAGE_SIZE - 1);
loff_t page_start = pos & PAGE_MASK;
loff_t i_size;
struct page* page;
int retVal = 0;
App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
// FhgfsOpsHelper_logOpDebug(app, file->f_dentry, __func__, "(offset: %lld; page_start: %lld; len: %u)",
// (long long)offset, (long long)page_start, len);
IGNORE_UNUSED_VARIABLE(app);
page = beegfs_grab_cache_page(mapping, index,
#if BEEGFS_HAS_WRITE_FLAGS
flags
#else
0
#endif
);
if(!page)
{
retVal = -ENOMEM;
goto clean_up;
}
if(PageUptodate(page) )
goto clean_up;
if(len == PAGE_SIZE)
{
// two possibilities:
// a) full page write => no need to read-update the page from the server
// b) short write (with offset) => will lead to sync write
goto clean_up;
}
i_size = i_size_read(mapping->host);
if( (page_start >= i_size) ||
(!offset && ( (pos + len) >= i_size) ) )
{
// we don't need to read data beyond the end of the file
// => zero it, and set the page up-to-date
zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
// note: PageChecked means the rest of the page (the part not being written to) is up-to-date,
// so once our data is written, the whole page is up-to-date.
SetPageChecked(page);
goto clean_up;
}
// it is read-modify-write, so update the page with the server content
retVal = FhgfsOpsPages_readpageSync(file, page);
clean_up:
// clean-up
*pgfolp = beegfs_to_pgfol(page);
return retVal;
}
/**
* @param copied the amount that was able to be copied ("copied==len" is always true if
* write_begin() was called with the AOP_FLAG_UNINTERRUPTIBLE flag)
* @param fsdata whatever write_begin() set here
* @return < 0 on failure, number of bytes copied into pagecache (<= 'copied') otherwise
**/
static int FhgfsOps_write_end(struct file* file, struct address_space* mapping,
loff_t pos, unsigned len, unsigned copied, beegfs_pgfol_t pgfol, void* fsdata)
{
struct page* page = beegfs_get_page(pgfol);
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
struct inode* inode = mapping->host;
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
int retVal;
struct dentry *dentry= file_dentry(file);
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
const char* logContext = __func__;
// FhgfsOpsHelper_logOpDebug(app, dentry, logContext, "pos: %lld; len: %u; copied: %u",
// (long long)pos, len, copied);
IGNORE_UNUSED_VARIABLE(logContext);
if(PageChecked(page) )
{
// note: see write_begin() for meaning of PageChecked()
if(copied == len)
SetPageUptodate(page);
ClearPageChecked(page);
}
else
if(!PageUptodate(page) && (copied == PAGE_SIZE) )
SetPageUptodate(page);
if(!PageUptodate(page) )
{
unsigned offset = pos & (PAGE_SIZE - 1);
char* buf = kmap(page);
RemotingIOInfo ioInfo;
ssize_t writeRes;
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
FhgfsInode_incWriteBackCounter(fhgfsInode);
writeRes = FhgfsOpsRemoting_writefile(&buf[offset], copied, pos, &ioInfo);
spin_lock(&inode->i_lock);
FhgfsInode_setLastWriteBackOrIsizeWriteTime(fhgfsInode);
FhgfsInode_decWriteBackCounter(fhgfsInode);
spin_unlock(&inode->i_lock);
if(likely(writeRes > 0) )
{
retVal = writeRes;
pos += writeRes;
}
else
retVal = FhgfsOpsErr_toSysErr(-writeRes);
kunmap(page);
}
else
{
retVal = copied;
pos += copied;
if (!PageDirty(page) )
{ // Only add if the page is not dirty yet (don't add the same page twice...)
FhgfsInode_incNumDirtyPages(fhgfsInode);
}
set_page_dirty(page); // could be in the if-condition above, but for safety we set it here
}
if(likely(retVal > 0) )
{
spin_lock(&inode->i_lock);
if(pos > inode->i_size)
{
FhgfsInode_setPageWriteFlag(fhgfsInode);
FhgfsInode_setLastWriteBackOrIsizeWriteTime(fhgfsInode);
FhgfsInode_setNoIsizeDecrease(fhgfsInode);
i_size_write(inode, pos);
}
spin_unlock(&inode->i_lock);
}
unlock_page(page);
put_page(page);
// clean-up
// LOG_DEBUG_FORMATTED(log, 5, logContext, "complete. retVal: %d", retVal);
IGNORE_UNUSED_VARIABLE(log);
return retVal;
}
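/**
* Common direct IO path for reads and writes: forwards the iov_iter directly to the remoting
* read/write routines (no page cache involved); reads fall back to the sparse-file path if the
* server returned less than requested although i_size says there is more.
*
* @param rw READ or WRITE
* @param pos file offset
*/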
static ssize_t __FhgfsOps_directIO_common(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
struct iov_iter bgfsIter = *iter; // Was a wrapper copy. Now is just a defensive copy. Still needed?
struct file* file = iocb->ki_filp;
FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
struct dentry* dentry = file_dentry(file);
struct inode* inode = file_inode(file);
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
RemotingIOInfo ioInfo;
ssize_t remotingRes;
const char* logContext = __func__;
App* app = FhgfsOps_getApp(dentry->d_sb);
Logger* log = App_getLogger(app);
FhgfsOpsHelper_logOpDebug(app, dentry, inode, logContext, "(%s, pos: %lld)",
(rw == WRITE) ? "WRITE" : "READ", (long long)pos);
IGNORE_UNUSED_VARIABLE(logContext); // non-debug builds
FsFileInfo_getIOInfo(fileInfo, fhgfsInode, &ioInfo);
if(rw == WRITE)
{ // write
remotingRes = FhgfsOpsRemoting_writefileVec(&bgfsIter, pos, &ioInfo, false);
}
else if(rw == READ)
{ // read
remotingRes = FhgfsOpsRemoting_readfileVec(&bgfsIter, iov_iter_count(&bgfsIter), pos, &ioInfo, fhgfsInode);
if( (remotingRes >= 0 && iov_iter_count(&bgfsIter))
&& ( i_size_read(inode) > (pos + remotingRes) ) )
{ // sparse file compatibility mode
ssize_t readSparseRes = __FhgfsOps_readSparse(file, &bgfsIter, iov_iter_count(&bgfsIter), pos + remotingRes);
if(unlikely(readSparseRes < 0) )
remotingRes = readSparseRes;
else
remotingRes += readSparseRes;
}
}
else
{
#ifdef WARN_ONCE
WARN_ONCE(1, "unexpected: rw value !=READ and !=WRITE. (int value: %d)\n", rw);
#endif
return -EINVAL;
}
//Write back wrapped iter.
*iter = bgfsIter;
if(unlikely(remotingRes < 0) )
{ // error occurred
LOG_DEBUG_FORMATTED(log, 1, logContext, "error: %s",
FhgfsOpsErr_toErrString(-remotingRes) );
IGNORE_UNUSED_VARIABLE(log);
return FhgfsOpsErr_toSysErr(-remotingRes);
}
if(rw == WRITE)
task_io_account_write(remotingRes);
else
task_io_account_read(remotingRes);
return remotingRes;
}
/**
* Note: This method must be defined because otherwise the kernel rejects open() with O_DIRECT in
* fs/open.c. However, it is only called indirectly through the generic file read/write routines
* (and swapping code), so it should actually never be called for buffered IO.
*
* @param iocb I/O control block with open file handle and file offset (ki_pos)
* @param iter I/O buffer vectors; the read/write direction is taken from iov_iter_rw(iter)
*/
ssize_t FhgfsOps_directIO(struct kiocb *iocb, struct iov_iter *iter)
{
int rw = iov_iter_rw(iter);
loff_t pos = iocb->ki_pos;
return __FhgfsOps_directIO_common(rw, iocb, iter, pos);
}