// beegfs/meta/source/storage/FileInode.cpp

#include <common/toolkit/serialization/Serialization.h>
#include <common/toolkit/MathTk.h>
#include <common/storage/striping/Raid0Pattern.h>
#include <common/storage/StorageDefinitions.h>
#include <toolkit/XAttrTk.h>
#include <program/Program.h>
#include "FileInode.h"
#include "Locking.h"
#include <sys/xattr.h>
#include <boost/lexical_cast.hpp>
// shorthand for the long init line of AppendLockQueuesContainer to create on stack
#define FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(varName) \
AppendLockQueuesContainer varName( \
&exclAppendLock, &waitersExclAppendLock, &waitersLockIDsAppendLock)
// shorthand for the long init line of EntryLockQueuesContainer to create on stack
#define FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(varName) \
EntryLockQueuesContainer varName( \
&exclFLock, &sharedFLocks, &waitersExclFLock, &waitersSharedFLock, &waitersLockIDsFLock)
/**
* Inode initialization. The preferred initializer. Used for loading an inode from disk
*/
FileInode::FileInode(std::string entryID, FileInodeStoreData* inodeDiskData,
DirEntryType entryType, unsigned dentryFeatureFlags) : inodeDiskData(entryID, inodeDiskData)
{
this->exclusiveTID = 0;
this->numSessionsRead = 0;
this->numSessionsWrite = 0;
initFileInfoVec();
this->dentryCompatData.entryType = entryType;
this->dentryCompatData.featureFlags = dentryFeatureFlags;
}
/**
* Note: This constructor does not perform the full initialization, so use it for
* metadata loading (or similar deserialization) only.
*
* Note: Don't forget to call initFileInfoVec() when using this (loadFromInodeFile() includes it).
*/
FileInode::FileInode()
{
this->exclusiveTID = 0;
this->numSessionsRead = 0;
this->numSessionsWrite = 0;
this->dentryCompatData.entryType = DirEntryType_INVALID;
this->dentryCompatData.featureFlags = 0;
}
/**
* Requires: init'ed stripe pattern, modification and last access time secs
*/
void FileInode::initFileInfoVec()
{
// create a fileInfo in the vector for each stripe node
StripePattern* pattern = inodeDiskData.getStripePattern();
size_t numTargets = pattern->getStripeTargetIDs()->size();
unsigned chunkSize = pattern->getChunkSize();
unsigned chunkSizeLog2 = MathTk::log2Int32(chunkSize);
uint64_t stripeSetSize = chunkSize * numTargets;
int64_t lastStripeSetSize; // =fileLength%stripeSetSize (remainder after stripeSetStart)
int64_t stripeSetStart; // =fileLength-stripeSetSize
int64_t fullLengthPerTarget; // =stripeSetStart/numTargets (without last stripe set remainder)
StatData* statData = this->inodeDiskData.getInodeStatData();
int64_t fileSize = statData->getFileSize();
/* compute stripeset start to get number of complete chunks on all nodes and stripeset remainder
to compute each target's remainder in the last stripe set. */
   /* note: chunkSize is definitely a power of two. if numTargets is also a power of two, then
      stripeSetSize is a power of two as well */
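   /* worked example (illustrative values): chunkSize=512KiB, numTargets=4, fileSize=5MiB
      => stripeSetSize=2MiB, lastStripeSetSize=1MiB, stripeSetStart=4MiB, fullLengthPerTarget=1MiB;
      in the loop below, targets 0 and 1 each get an extra 512KiB from the last (partial) stripe
      set, so the per-target chunk file lengths are 1.5MiB, 1.5MiB, 1MiB, 1MiB. */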
if(MathTk::isPowerOfTwo(numTargets) )
{ // quick path => optimized without division/modulo
lastStripeSetSize = fileSize & (stripeSetSize-1);
stripeSetStart = fileSize - lastStripeSetSize;
fullLengthPerTarget = stripeSetStart >> MathTk::log2Int32(numTargets);
}
else
{ // slow path => requires division/modulo
lastStripeSetSize = fileSize % stripeSetSize;
stripeSetStart = fileSize - lastStripeSetSize;
fullLengthPerTarget = stripeSetStart / numTargets;
}
// walk over all targets: compute their chunk file sizes and init timestamps
fileInfoVec.reserve(numTargets);
   // to subtract last stripe set length of previous targets in the for-loop below
int64_t remainingLastSetSize = lastStripeSetSize;
for(unsigned target=0; target < numTargets; target++) // iterate over all chunks / targets
{
int64_t targetFileLength = fullLengthPerTarget;
if(remainingLastSetSize > 0)
targetFileLength += BEEGFS_MIN(remainingLastSetSize, chunkSize);
int64_t modificationTimeSecs = statData->getModificationTimeSecs();
int64_t lastAccessTimeSecs = statData->getLastAccessTimeSecs();
uint64_t usedBlocks;
if (statData->getIsSparseFile() )
usedBlocks = statData->getTargetChunkBlocks(target);
else
{ // estimate the number of blocks by the file size
usedBlocks = targetFileLength >> StatData::BLOCK_SHIFT;
}
DynamicFileAttribs dynAttribs(0, targetFileLength, usedBlocks, modificationTimeSecs,
lastAccessTimeSecs);
ChunkFileInfo fileInfo(chunkSize, chunkSizeLog2, dynAttribs);
fileInfoVec.push_back(fileInfo);
remainingLastSetSize -= chunkSize;
}
}
/*
 * Validate and set remote storage targets (RSTs) for this FileInode and persist them on disk.
 */
FhgfsOpsErr FileInode::setRemoteStorageTarget(EntryInfo* entryInfo, const RemoteStorageTarget& rst)
{
const char* logContext = "Set Remote Storage Target (FileInode)";
FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE);
auto [isValid, details] = rst.validateWithDetails();
if (!isValid)
{
LogContext(logContext).log(Log_WARNING, "Invalid RST data: " + details);
retVal = FhgfsOpsErr_INTERNAL;
}
else
{
// set file's rst now
this->rstInfo.set(rst);
if (this->storeRemoteStorageTargetUnlocked(entryInfo))
{
if (!this->getIsRstAvailableUnlocked())
{
addFeatureFlagUnlocked(FILEINODE_FEATURE_HAS_RST);
if (!this->storeUpdatedInodeUnlocked(entryInfo))
retVal = FhgfsOpsErr_INTERNAL;
}
}
else
retVal = FhgfsOpsErr_INTERNAL;
}
safeLock.unlock();
return retVal;
}
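/**
 * Clear the remote storage target info: unset the RST inode feature flag, reset the in-memory
 * RST state and remove the RST xattr from the metadata file.
 */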
FhgfsOpsErr FileInode::clearRemoteStorageTarget(EntryInfo* entryInfo)
{
const char* logContext = "Clear Remote Storage Target (FileInode)";
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
if (!this->getIsRstAvailableUnlocked())
return FhgfsOpsErr_SUCCESS;
// Clear inode feature flag and store updated inode
unsigned flags = this->inodeDiskData.getInodeFeatureFlags();
flags &= ~FILEINODE_FEATURE_HAS_RST;
this->inodeDiskData.setInodeFeatureFlags(flags);
if (!this->storeUpdatedInodeUnlocked(entryInfo))
return FhgfsOpsErr_INTERNAL;
// Clear in-memory RST info
this->rstInfo.reset();
// Remove RST xattr from meta file
std::string metafile = this->getMetaFilePath(entryInfo);
int res = removexattr(metafile.c_str(), RST_XATTR_NAME);
if (unlikely(res == -1))
{
// Not reporting as error to caller because:
// 1. Feature flag is already cleared in metadata
// 2. In-memory state is reset
// 3. Future operations will ignore xattr due to cleared flag
// Just log warning msgs for any unexpected errors or missing xattr.
if (errno == ENODATA)
{
LogContext(logContext).log(Log_WARNING, "RST xattr not found. Path: " + metafile);
}
else
{
LogContext(logContext).log(Log_WARNING, "Failed to remove RST xattr; entryID: " +
entryInfo->getEntryID() + "; error: " + System::getErrString());
}
}
return FhgfsOpsErr_SUCCESS;
}
/**
* Decrease number of sessions for read or write (=> file close) and update persistent
* metadata.
* Note: This currently includes persistent metadata update for efficiency reasons (because
* we already hold the mutex lock here).
*
* @param accessFlags OPENFILE_ACCESS_... flags
*/
void FileInode::decNumSessionsAndStore(EntryInfo* entryInfo, unsigned accessFlags)
{
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE);
if(accessFlags & OPENFILE_ACCESS_READ)
{
if(unlikely(!numSessionsRead) )
{
LogContext log("File::decNumSessionsRead");
log.logErr(
std::string("Warning: numSessionsRead is already zero. " +
std::string("File: ") + getEntryIDUnlocked() ) );
}
else
this->numSessionsRead--;
}
else
{ // (includes read+write)
if(unlikely(!numSessionsWrite) )
{
LogContext log("File::decNumSessionsWrite");
log.logErr(
std::string("Warning: numSessionsWrite is already zero. " +
std::string("File: ") + getEntryIDUnlocked() ) );
}
else
this->numSessionsWrite--;
}
// dyn attribs have been updated during close, so we save them here
storeUpdatedInodeUnlocked(entryInfo);
safeLock.unlock();
}
/**
* Note: This version is compatible with sparse files.
*/
void FileInode::updateDynamicAttribs()
{
this->inodeDiskData.inodeStatData.updateDynamicFileAttribs(this->fileInfoVec,
this->inodeDiskData.getPattern() );
}
/*
* Note: Current object state is used for the serialization
*/
void FileInode::serializeMetaData(Serializer& ser)
{
// note: the total amount of serialized data may not be larger than META_SERBUF_SIZE
// get latest dyn attrib values
updateDynamicAttribs();
   NumNodeID ownerNodeID; /* irrelevant here; the serializer will set it to ourselves for
                           * inlined inodes */
DentryStoreData dentryDiskData(this->inodeDiskData.getEntryID(),
this->dentryCompatData.entryType, ownerNodeID, this->dentryCompatData.featureFlags);
DiskMetaData diskMetaData(&dentryDiskData, &this->inodeDiskData);
diskMetaData.serializeFileInode(ser);
}
/*
* Note: Applies deserialized data directly to the current object
*/
void FileInode::deserializeMetaData(Deserializer& des)
{
DentryStoreData dentryDiskData;
DiskMetaData diskMetaData(&dentryDiskData, &this->inodeDiskData);
diskMetaData.deserializeFileInode(des);
if (!des.good())
return;
{ // dentry compat data
// entryType
this->dentryCompatData.entryType = dentryDiskData.getDirEntryType();
// (dentry) feature flags
this->dentryCompatData.featureFlags = dentryDiskData.getDentryFeatureFlags();
}
}
/**
* Note: Wrapper/chooser for storeUpdatedMetaDataBufAsXAttr/Contents.
* Note: Unlocked, caller must hold write lock.
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBuf(char* buf, unsigned bufLen)
{
App* app = Program::getApp();
bool useXAttrs = app->getConfig()->getStoreUseExtendedAttribs();
const Path* inodesPath =
getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
bool result = useXAttrs
? storeUpdatedMetaDataBufAsXAttr(buf, bufLen, metaFilename)
: storeUpdatedMetaDataBufAsContents(buf, bufLen, metaFilename);
if (getIsBuddyMirroredUnlocked())
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addModification(metaFilename, MetaSyncFileType::Inode);
return result;
}
/**
* Note: Don't call this directly, use the wrapper storeUpdatedMetaDataBuf().
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBufAsXAttr(char* buf, unsigned bufLen, std::string metaFilename)
{
const char* logContext = "File (store updated xattr metadata)";
// open file (create file if not already present)
int openFlags = O_CREAT|O_TRUNC|O_WRONLY;
int fd = open(metaFilename.c_str(), openFlags, 0644);
if (unlikely(fd == -1))
{
LogContext(logContext).logErr("Unable to open/create inode metafile: " + metaFilename
+ ". " + "SysErr: " + System::getErrString());
return false;
}
// write data to file
int setRes = fsetxattr(fd, META_XATTR_NAME, buf, bufLen, 0);
if(unlikely(setRes == -1) )
{ // error
LogContext(logContext).logErr("Unable to write FileInode metadata update: " +
metaFilename + ". " + "SysErr: " + System::getErrString() );
close(fd);
return false;
}
LOG_DEBUG(logContext, 4, "File inode update stored: " + this->inodeDiskData.getEntryID() );
close(fd);
return true;
}
/**
 * Stores the update to a separate file first and then renames it, so the old inode file is
 * replaced atomically and a crash cannot leave a partially written inode behind.
*
* Note: Don't call this directly, use the wrapper storeUpdatedMetaDataBuf().
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBufAsContents(char* buf, unsigned bufLen,
std::string metaFilename)
{
const char* logContext = "File (store updated inode)";
std::string metaUpdateFilename(metaFilename + META_UPDATE_EXT_STR);
ssize_t writeRes;
int renameRes;
// open file (create it, but not O_EXCL because a former update could have failed)
int openFlags = O_CREAT|O_TRUNC|O_WRONLY;
int fd = open(metaUpdateFilename.c_str(), openFlags, 0644);
if(fd == -1)
{ // error
if(errno == ENOSPC)
{ // no free space => try again with update in-place
LogContext(logContext).log(Log_DEBUG, "No space left to create update file. Trying update "
"in-place: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() );
return storeUpdatedMetaDataBufAsContentsInPlace(buf, bufLen, metaFilename);
}
LogContext(logContext).logErr("Unable to create inode update file: " + metaUpdateFilename +
". " + "SysErr: " + System::getErrString() );
return false;
}
// metafile created => store meta data
writeRes = write(fd, buf, bufLen);
if(writeRes != (ssize_t)bufLen)
{
if( (writeRes >= 0) || (errno == ENOSPC) )
{ // no free space => try again with update in-place
LogContext(logContext).log(Log_DEBUG, "No space left to write update inode. Trying update "
"in-place: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() );
close(fd);
unlink(metaUpdateFilename.c_str() );
return storeUpdatedMetaDataBufAsContentsInPlace(buf, bufLen, metaFilename);
}
LogContext(logContext).logErr("Unable to write inode update: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
goto error_closefile;
}
close(fd);
renameRes = rename(metaUpdateFilename.c_str(), metaFilename.c_str() );
if(renameRes == -1)
{
LogContext(logContext).logErr("Unable to replace old inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
goto error_unlink;
}
LOG_DEBUG(logContext, 4, "Inode update stored: " + this->inodeDiskData.getEntryID() );
return true;
// error compensation
error_closefile:
close(fd);
error_unlink:
unlink(metaUpdateFilename.c_str() );
return false;
}
/**
* Stores the update directly to the current metadata file (instead of creating a separate file
* first and renaming it).
*
* Note: Don't call this directly, it is automatically called by storeUpdatedMetaDataBufAsContents()
* when necessary.
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBufAsContentsInPlace(char* buf, unsigned bufLen,
std::string metaFilename)
{
const char* logContext = "File (store updated inode in-place)";
int fallocRes;
ssize_t writeRes;
int truncRes;
// open file (create it, but not O_EXCL because a former update could have failed)
int openFlags = O_CREAT|O_WRONLY;
int fd = open(metaFilename.c_str(), openFlags, 0644);
if(fd == -1)
{ // error
LogContext(logContext).logErr("Unable to open inode file: " + metaFilename +
". " + "SysErr: " + System::getErrString() );
return false;
}
// make sure we have enough room to write our update
fallocRes = posix_fallocate(fd, 0, bufLen); // (note: posix_fallocate does not set errno)
if(fallocRes == EBADF)
{ // special case for XFS bug
struct stat statBuf;
int statRes = fstat(fd, &statBuf);
if (statRes == -1)
{
LogContext(logContext).log(Log_WARNING, "Unexpected error: fstat() failed with SysErr: "
+ System::getErrString(errno));
goto error_closefile;
}
if (statBuf.st_size < bufLen)
{
LogContext(logContext).log(Log_WARNING, "File space allocation ("
+ StringTk::intToStr(bufLen) + ") for inode update failed: " + metaFilename + ". " +
"SysErr: " + System::getErrString(fallocRes) + " "
"statRes: " + StringTk::intToStr(statRes) + " "
"oldSize: " + StringTk::intToStr(statBuf.st_size));
goto error_closefile;
}
else
      { // XFS bug: we only return an error if statBuf.st_size < bufLen; ignore fallocRes otherwise
LOG_DEBUG(logContext, Log_SPAM, "Ignoring kernel file system bug: "
"posix_fallocate() failed for len < filesize");
}
}
else
if (fallocRes != 0)
{ // default error handling if posix_fallocate() failed
LogContext(logContext).log(Log_WARNING, "File space allocation ("
+ StringTk::intToStr(bufLen) + ") for inode update failed: " + metaFilename + ". " +
"SysErr: " + System::getErrString(fallocRes) );
goto error_closefile;
}
// metafile created => store meta data
writeRes = write(fd, buf, bufLen);
if(writeRes != (ssize_t)bufLen)
{
LogContext(logContext).logErr("Unable to write inode update: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
goto error_closefile;
}
   // truncate in case the update led to a smaller file size
   truncRes = ftruncate(fd, bufLen);
   if(truncRes == -1)
   { // ignore trunc errors
      LogContext(logContext).log(Log_WARNING, "Unable to truncate inode file (strange, but "
         "proceeding anyways): " + metaFilename + ". " + "SysErr: " + System::getErrString() );
   }
   close(fd); // close only after the truncate, so ftruncate() still operates on a valid fd
LOG_DEBUG(logContext, 4, "File inode update stored: " + this->inodeDiskData.getEntryID() );
return true;
// error compensation
error_closefile:
close(fd);
return false;
}
/**
* Update the inode on disk
*
* Note: We already need to have a FileInode (WRITE) rwlock here
*/
bool FileInode::storeUpdatedInodeUnlocked(EntryInfo* entryInfo, StripePattern* updatedStripePattern)
{
const char* logContext = "FileInode (store updated Inode)";
bool saveRes;
bool isInLined = this->isInlined;
if (isInLined)
{
FhgfsOpsErr dentrySaveRes = storeUpdatedInlinedInodeUnlocked(entryInfo, updatedStripePattern);
if (dentrySaveRes == FhgfsOpsErr_SUCCESS)
return true;
// dentrySaveRes != FhgfsOpsErr_SUCCESS
std::string parentID = entryInfo->getParentEntryID();
std::string entryID = entryInfo->getEntryID();
std::string fileName = entryInfo->getFileName();
if (dentrySaveRes == FhgfsOpsErr_INODENOTINLINED)
{
         /* dentrySaveRes == FhgfsOpsErr_INODENOTINLINED. Our internal inode information says the
          * inode is inlined, but on writing it we find out that it is not. As we are holding a
          * write lock here, this should never happen. It probably indicates a locking bug, but it
          * is not critical here and we retry using the non-inlined way.
          */
LogContext(logContext).log(Log_WARNING, std::string("Inode unexpectedly not inlined: ") +
"parentID: "+ parentID + " entryID: " + entryID + " fileName: " + fileName );
this->isInlined = false;
}
else
{
LogContext(logContext).log(Log_WARNING, std::string("Failed to write inlined inode: ") +
"parentID: "+ parentID + " entryID: " + entryID + " fileName: " + fileName +
" Error: " + boost::lexical_cast<std::string>(dentrySaveRes));
#ifdef BEEGFS_DEBUG
LogContext(logContext).logBacktrace();
#endif
}
      // now fall through to the non-inlined handling below and hope that this works
}
// inode not inlined
// change the stripe pattern here before serializing;
if (unlikely(updatedStripePattern))
{
StripePattern* pattern = this->inodeDiskData.getPattern();
if (!pattern->updateStripeTargetIDs(updatedStripePattern))
LogContext(logContext).log(Log_WARNING, "Could not set requested new stripe pattern");
}
char buf[META_SERBUF_SIZE];
Serializer ser(buf, sizeof(buf));
serializeMetaData(ser);
if (ser.good())
saveRes = storeUpdatedMetaDataBuf(buf, ser.size());
else
saveRes = false;
if (!saveRes && isInlined)
{
LogContext(logContext).log(Log_WARNING, std::string("Trying to write as non-inlined inode "
"also failed.") );
}
return saveRes;
}
/**
* Update an inode, which is inlined into a dentry
*/
FhgfsOpsErr FileInode::storeUpdatedInlinedInodeUnlocked(EntryInfo* entryInfo,
StripePattern* updatedStripePattern)
{
const char* logContext = "DirEntry (storeUpdatedInode)";
App* app = Program::getApp();
// get latest dyn attrib vals...
updateDynamicAttribs();
std::string parentEntryID = entryInfo->getParentEntryID();
const Path* dentriesPath =
entryInfo->getIsBuddyMirrored() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath();
std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentriesPath->str(),
parentEntryID);
FileInodeStoreData* inodeDiskData = this->getInodeDiskData();
if (unlikely(updatedStripePattern))
{
// note: We do not set the complete stripe pattern here, but only the stripe target IDs
if (! inodeDiskData->getPattern()->updateStripeTargetIDs(updatedStripePattern))
LogContext(logContext).log(Log_WARNING, "Could not set new stripe target IDs.");
}
DirEntry dirEntry(entryInfo->getEntryType(), entryInfo->getFileName(),
entryInfo->getEntryID(), entryInfo->getOwnerNodeID() );
/* Note: As we are called from FileInode most data of this DirEntry are unknown and we need to
* load it from disk. */
bool loadRes = dirEntry.loadFromID(dirEntryPath, entryInfo->getEntryID() );
if (!loadRes)
return FhgfsOpsErr_INTERNAL;
FileInodeStoreData* entryInodeDiskData = dirEntry.getInodeStoreData();
entryInodeDiskData->setFileInodeStoreData(inodeDiskData);
FhgfsOpsErr retVal = dirEntry.storeUpdatedInode(dirEntryPath);
return retVal;
}
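/**
 * Get the on-disk path of the file that holds this inode's metadata: for inlined inodes this
 * is the dentry-by-ID file inside the parent's dentry directory, otherwise the dedicated
 * inode file in the inodes directory.
 */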
std::string FileInode::getMetaFilePath(EntryInfo* entryInfo)
{
App* app = Program::getApp();
if (isInlined)
{
const Path* dentriesPath = getIsBuddyMirroredUnlocked()
? app->getBuddyMirrorDentriesPath()
: app->getDentriesPath();
std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(
dentriesPath->str(), entryInfo->getParentEntryID());
return MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) + entryInfo->getEntryID();
}
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? app->getBuddyMirrorInodesPath()
: app->getInodesPath();
return MetaStorageTk::getMetaInodePath(inodesPath->str(), entryInfo->getEntryID());
}
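/**
 * Serialize the in-memory RST info and store it as an xattr of this inode's metadata file.
 *
 * Note: Unlocked, so caller must hold the write lock.
 *
 * @return false on serialization errors or if xattr-based metadata storage is disabled.
 */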
bool FileInode::storeRemoteStorageTargetUnlocked(EntryInfo* entryInfo)
{
std::string metafile = getMetaFilePath(entryInfo);
char buf[META_SERBUF_SIZE];
Serializer ser(buf, sizeof(buf));
ser % rstInfo;
if (!ser.good())
return false;
bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();
if (useXAttrs)
return storeRemoteStorageTargetBufAsXAttr(buf, ser.size(), metafile);
else
{
LOG(GENERAL, WARNING, "Storing RST info as file contents is unsupported. "
"Please check the 'storeUseExtendedAttribs' setting in the BeeGFS meta config");
return false;
}
}
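/**
 * Store the given serialized RST info as an xattr of the given metadata file.
 *
 * @param buf the serialized RST info that is to be stored
 */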
bool FileInode::storeRemoteStorageTargetBufAsXAttr(char* buf, unsigned bufLen, const std::string& metafilename)
{
const char* logContext = "FileInode (store remote storage target as xattr)";
int setRes = setxattr(metafilename.c_str(), RST_XATTR_NAME, buf, bufLen, 0);
if (unlikely(setRes == -1))
{
// error
LogContext(logContext).logErr("Unable to write remote storage target info to disk: "
+ metafilename + ". SysErr: " + System::getErrString());
return false;
}
return true;
}
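/**
 * Delete the non-inlined inode file of the given entryID from the inodes directory.
 *
 * @return true if the inode file was deleted or did not exist anymore; false on other errors.
 */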
bool FileInode::removeStoredMetaData(const std::string& id, bool isBuddyMirrored)
{
const char* logContext = "FileInode (remove stored metadata)";
App* app = Program::getApp();
std::string inodeFilename = MetaStorageTk::getMetaInodePath(
isBuddyMirrored
? app->getBuddyMirrorInodesPath()->str()
: app->getInodesPath()->str(),
id);
// delete metadata file
int unlinkRes = unlink(inodeFilename.c_str() );
   /* ignore errno == ENOENT: the file does not exist anymore, for whatever reason. Although
    * unlink() failed, we do not care, because our goal is still reached. The same applies to
    * inode removal: if the dir-entry does not exist either, the application will still get
    * the right error code */
if(unlinkRes == -1 && errno != ENOENT)
{ // error
LogContext(logContext).logErr("Unable to delete inode file: " + inodeFilename + ". " +
"SysErr: " + System::getErrString() );
return false;
}
LOG_DEBUG(logContext, 4, "Inode file deleted: " + inodeFilename);
if (isBuddyMirrored)
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addDeletion(inodeFilename, MetaSyncFileType::Inode);
return true;
}
/**
* Note: Wrapper/chooser for loadFromFileXAttr/Contents.
* Note: This also (indirectly) calls initFileInfoVec()
*/
bool FileInode::loadFromInodeFile(EntryInfo* entryInfo)
{
bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();
if(useXAttrs)
return loadFromFileXAttr(entryInfo->getEntryID(), entryInfo->getIsBuddyMirrored() );
return loadFromFileContents(entryInfo->getEntryID(), entryInfo->getIsBuddyMirrored() );
}
/**
* Note: Don't call this directly, use the wrapper loadFromInodeFile().
* Note: This also calls initFileInfoVec()
*/
bool FileInode::loadFromFileXAttr(const std::string& id, bool isBuddyMirrored)
{
const char* logContext = "File inode (load from xattr file)";
App* app = Program::getApp();
const Path* inodePath = isBuddyMirrored ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);
bool retVal = false;
char buf[META_SERBUF_SIZE];
ssize_t getRes = getxattr(metaFilename.c_str(), META_XATTR_NAME, buf, META_SERBUF_SIZE);
if(getRes > 0)
{ // we got something => deserialize it
Deserializer des(buf, getRes);
deserializeMetaData(des);
if(unlikely(!des.good()))
{ // deserialization failed
LogContext(logContext).logErr("Unable to deserialize metadata in file: " + metaFilename);
goto error_exit;
}
// deserialization successful => init dyn attribs
initFileInfoVec(); /* note: this can only be done after the stripePattern
has been initialized, that's why we do it here at this "unusual" place. */
retVal = true;
}
else
if( (getRes == -1) && (errno == ENOENT) )
   { // file does not exist
      LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, "Inode file does not exist: " +
         metaFilename + ". " + "SysErr: " + System::getErrString() );
}
else
{ // unhandled error
LogContext(logContext).logErr("Unable to open/read inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
}
error_exit:
return retVal;
}
/**
* Note: Don't call this directly, use the wrapper loadFromInodeFile().
* Note: This also calls initFileInfoVec()
*/
bool FileInode::loadFromFileContents(const std::string& id, bool isBuddyMirrored)
{
const char* logContext = "File inode (load from file)";
App* app = Program::getApp();
const Path* inodePath = isBuddyMirrored ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);
bool retVal = false;
int openFlags = O_NOATIME | O_RDONLY;
int fd = open(metaFilename.c_str(), openFlags, 0);
if(fd == -1)
{ // open failed
if(errno != ENOENT)
LogContext(logContext).logErr("Unable to open inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
return false;
}
char buf[META_SERBUF_SIZE];
int readRes = read(fd, buf, META_SERBUF_SIZE);
if(readRes <= 0)
{ // reading failed
LogContext(logContext).logErr("Unable to read inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
}
else
{
Deserializer des(buf, readRes);
deserializeMetaData(des);
if(!des.good())
{ // deserialization failed
LogContext(logContext).logErr("Unable to deserialize inode in file: " + metaFilename);
}
else
{ // deserialization successful => init dyn attribs
initFileInfoVec(); // note: this can only be done after the stripePattern
// has been initialized, that's why we do it here at this "unusual" place
retVal = true;
}
}
close(fd);
return retVal;
}
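/**
 * Load the remote storage target (RST) info from this inode's metadata file. RST info is only
 * stored as an xattr, so this always returns false if xattr-based metadata storage is disabled.
 */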
bool FileInode::loadRstFromInodeFile(EntryInfo* entryInfo)
{
bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();
if (useXAttrs)
return loadRstFromFileXAttr(entryInfo);
return false;
}
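/**
 * Load the RST info from the RST xattr of this inode's metadata file (inlined or non-inlined).
 *
 * Note: Don't call this directly, use the wrapper loadRstFromInodeFile().
 */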
bool FileInode::loadRstFromFileXAttr(EntryInfo* entryInfo)
{
const char* logContext = "File inode RST (load from xattr file)";
App* app = Program::getApp();
std::string metafile;
if (isInlined)
{
const Path* dentriesPath =
getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath();
std::string dirEntryPath =
MetaStorageTk::getMetaDirEntryPath(dentriesPath->str(), entryInfo->getParentEntryID());
metafile = MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) + entryInfo->getEntryID();
}
else
{
const Path* inodesPath =
getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath(): app->getInodesPath();
metafile = MetaStorageTk::getMetaInodePath(inodesPath->str(), entryInfo->getEntryID());
}
char buf[META_SERBUF_SIZE];
ssize_t getRes = getxattr(metafile.c_str(), RST_XATTR_NAME, buf, META_SERBUF_SIZE);
if (getRes > 0)
{
// we got something => deserialize it
Deserializer des(buf, getRes);
des % this->rstInfo;
if (unlikely(!des.good()))
{
// deserialization failed
LogContext(logContext).logErr("Unable to deserialize remote storage targets"
", file: " + metafile);
return false;
}
return true;
}
else
if( (getRes == -1) && (errno == ENOENT) )
   { // file does not exist
      LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, "Inode file does not exist: " +
         metafile + ". " + "SysErr: " + System::getErrString() );
}
else
{ // unhandled error
LogContext(logContext).logErr("Unable to open/read inode file: " + metafile + ". " +
"SysErr: " + System::getErrString() );
}
return false;
}
/**
* Create an inode from an entryInfo.
*
 * Note: The entryInfo indicates whether the inode is inlined or not. However, this information
 * might be outdated, so we try both inlined and file-inode access if the first attempt to
 * create the inode fails.
 * We rely on kernel lookup calls to update the client-side entryInfo data.
*/
FileInode* FileInode::createFromEntryInfo(EntryInfo* entryInfo)
{
FileInode* inode;
if (entryInfo->getIsInlined() )
{
      /* entryInfo indicates the inode is inlined. So first try to get the inode from the
       * dir-entry's inlined inode and if that fails try again with an inode-file. */
inode = createFromInlinedInode(entryInfo);
if (!inode)
inode = createFromInodeFile(entryInfo);
}
else
{
/* entryInfo indicates the inode is not inlined, but a separate inode-file. So first
* try to get the inode by inode-file and only if that fails try again with the dir-entry,
* maybe the inode was re-inlined. */
inode = createFromInodeFile(entryInfo);
if (!inode)
inode = createFromInlinedInode(entryInfo);
}
if (likely(inode) && inode->getIsRstAvailableUnlocked())
inode->loadRstFromInodeFile(entryInfo);
return inode;
}
/**
* Inode from inode file (inode is not inlined)
*
* Note: Do not call directly, but use FileInode::createFromEntryInfo()
*/
FileInode* FileInode::createFromInodeFile(EntryInfo* entryInfo)
{
FileInode* newInode = new FileInode();
bool loadRes = newInode->loadFromInodeFile(entryInfo);
if(!loadRes)
{
delete(newInode);
return NULL;
}
newInode->setIsInlinedUnlocked(false);
return newInode;
}
/**
* Inode from dir-entry with inlined inode.
*
* Note: Do not call directly, but use FileInode::createFromEntryInfo()
*/
FileInode* FileInode::createFromInlinedInode(EntryInfo* entryInfo)
{
App* app = Program::getApp();
std::string parentEntryID = entryInfo->getParentEntryID();
const Path* dentryPath =
entryInfo->getIsBuddyMirrored() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath();
std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentryPath->str(),
parentEntryID);
DirEntry dirEntry(entryInfo->getEntryType(), entryInfo->getFileName(),
entryInfo->getEntryID(), entryInfo->getOwnerNodeID() );
FileInode* newInode = dirEntry.createInodeByID(dirEntryPath, entryInfo);
if (newInode)
newInode->setIsInlinedUnlocked(true);
return newInode;
}
/**
 * Update entry attributes as chmod(), chown(), utime() etc. would do it.
 *
 * Note: modificationTimeSecs and lastAccessTimeSecs are dynamic attribs, so they require
 * special handling by the caller (but we also set the static attribs here).
 * @param validAttribs SETATTR_CHANGE_...-Flags; may be 0 if we only want to update
 * AttribChangeTimeSecs.
* @param attribs new attributes, but may be NULL if validAttribs == 0
*/
bool FileInode::setAttrData(EntryInfo * entryInfo, int validAttribs, SettableFileAttribs* attribs)
{
bool success = true;
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); // L O C K
   /* note: modificationTimeSecs and lastAccessTimeSecs are dynamic attribs, so they require
      special handling by the caller (i.e. to also update chunk files) */
// save old attribs
StatData* statData = this->inodeDiskData.getInodeStatData();
SettableFileAttribs oldAttribs = *(statData->getSettableFileAttribs() );
statData->setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
if(validAttribs)
{
// apply new attribs wrt flags...
if(validAttribs & SETATTR_CHANGE_MODE)
statData->setMode(attribs->mode);
if(validAttribs & SETATTR_CHANGE_MODIFICATIONTIME)
{
/* only static value update required for storeUpdatedInodeUnlocked() */
statData->setModificationTimeSecs(attribs->modificationTimeSecs);
}
if(validAttribs & SETATTR_CHANGE_LASTACCESSTIME)
{
/* only static value update required for storeUpdatedInodeUnlocked() */
statData->setLastAccessTimeSecs(attribs->lastAccessTimeSecs);
}
if(validAttribs & SETATTR_CHANGE_USERID)
{
statData->setUserID(attribs->userID);
if ((attribs->userID != this->inodeDiskData.getOrigUID() ) &&
(this->inodeDiskData.getOrigFeature() == FileInodeOrigFeature_TRUE) )
addFeatureFlagUnlocked(FILEINODE_FEATURE_HAS_ORIG_UID);
}
if(validAttribs & SETATTR_CHANGE_GROUPID)
statData->setGroupID(attribs->groupID);
}
bool storeRes = storeUpdatedInodeUnlocked(entryInfo); // store on disk
if(!storeRes)
{ // failed to update metadata on disk => restore old values
statData->setSettableFileAttribs(oldAttribs);
success = false;
goto err_unlock;
}
// persistent update succeeded
// update attribs vec (wasn't done earlier because of backup overhead for restore on error)
if(validAttribs & SETATTR_CHANGE_MODIFICATIONTIME)
{
for(size_t i=0; i < fileInfoVec.size(); i++)
fileInfoVec[i].getRawDynAttribs()->modificationTimeSecs = attribs->modificationTimeSecs;
}
if(validAttribs & SETATTR_CHANGE_LASTACCESSTIME)
{
for(size_t i=0; i < fileInfoVec.size(); i++)
fileInfoVec[i].getRawDynAttribs()->lastAccessTimeSecs = attribs->lastAccessTimeSecs;
}
err_unlock:
safeLock.unlock(); // U N L O C K
return success;
}
/**
* General wrapper for append lock and unlock operations.
*
* Append supports exclusive locking only, no shared locks.
*
* Note: Unlocks are always immediately granted (=> they always return "true").
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockEntryNotifyList> FileInode::flockAppend(EntryLockDetails& lockDetails)
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
return flockEntryUnlocked(lockDetails, &lockQs);
}
/**
* General wrapper for flock lock and unlock operations.
*
* Note: Unlocks are always immediately granted (=> they always return "true").
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockEntryNotifyList> FileInode::flockEntry(EntryLockDetails& lockDetails)
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
return flockEntryUnlocked(lockDetails, &lockQs);
}
/**
* General wrapper for flock lock and unlock operations.
*
* Note: Unlocks are always immediately granted (=> they always return "true").
* Note: Unlocked version => caller must hold write lock.
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockEntryNotifyList> FileInode::flockEntryUnlocked(EntryLockDetails& lockDetails,
EntryLockQueuesContainer* lockQs)
{
bool tryNextWaiters = false;
bool immediatelyGranted = false; // return value
if(lockDetails.isCancel() )
{
// C A N C E L request
/* note: this is typically used when a client closes a file, so we remove all granted and
pending locks for the given handle here */
if(flockEntryCancelByHandle(lockDetails, lockQs) )
tryNextWaiters = true;
immediatelyGranted = true;
}
else
if(lockDetails.isUnlock() )
{
// U N L O C K request
tryNextWaiters = flockEntryUnlock(lockDetails, lockQs);
immediatelyGranted = true;
}
else
{
// L O C K request
// check waiters to filter duplicate requests
StringSetIter iterWaiters = lockQs->waitersLockIDs->find(lockDetails.lockAckID);
if(iterWaiters != lockQs->waitersLockIDs->end() )
return {false, {}}; // re-request from waiter, but still in the queue => keep on waiting
// not in waiters queue => is it granted already?
bool isGrantedAlready = flockEntryIsGranted(lockDetails, lockQs);
if(isGrantedAlready)
return {true, {}}; // request was granted already
// not waiting, not granted => we have a new request
bool hasConflicts = flockEntryCheckConflicts(lockDetails, lockQs, NULL);
if(!hasConflicts || lockDetails.allowsWaiting() )
tryNextWaiters = flockEntryUnlock(lockDetails, lockQs); // unlock (for lock up-/downgrades)
if(lockDetails.isShared() )
{
// S H A R E D lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockEntryShared(lockDetails, lockQs);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
lockQs->waitersSharedLock->push_back(lockDetails);
lockQs->waitersLockIDs->insert(lockDetails.lockAckID);
}
}
else
{
// E X C L U S I V E lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockEntryExclusive(lockDetails, lockQs);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
lockQs->waitersExclLock->push_back(lockDetails);
lockQs->waitersLockIDs->insert(lockDetails.lockAckID);
}
}
}
if (tryNextWaiters)
return {immediatelyGranted, flockEntryTryNextWaiters(lockQs)};
return {immediatelyGranted, {}};
}
/**
* Remove all waiters from the queues.
*/
void FileInode::flockAppendCancelAllWaiters()
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
flockEntryGenericCancelAllWaiters(&lockQs);
}
/**
* Remove all waiters from the queues.
*/
void FileInode::flockEntryCancelAllWaiters()
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
flockEntryGenericCancelAllWaiters(&lockQs);
}
/**
* Remove all waiters from the queues.
*
* Generic version shared by append and flock locking.
*/
void FileInode::flockEntryGenericCancelAllWaiters(EntryLockQueuesContainer* lockQs)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
lockQs->waitersLockIDs->clear();
lockQs->waitersExclLock->clear();
lockQs->waitersSharedLock->clear();
}
/**
* Unlock all locks and wait entries of the given clientID.
*/
LockEntryNotifyList FileInode::flockAppendCancelByClientID(NumNodeID clientID)
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericCancelByClientID(clientID, &lockQs);
}
/**
* Unlock all locks and wait entries of the given clientID.
*/
LockEntryNotifyList FileInode::flockEntryCancelByClientID(NumNodeID clientID)
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericCancelByClientID(clientID, &lockQs);
}
/**
* Unlock all locks and wait entries of the given clientID.
*
* Generic version shared by append and flock locking.
*/
LockEntryNotifyList FileInode::flockEntryGenericCancelByClientID(NumNodeID clientNumID,
EntryLockQueuesContainer* lockQs)
{
/* note: this code is in many aspects similar to flockEntryCancelByHandle(), so if you change
* something here, you probably want to change it there, too. */
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
bool tryNextWaiters = false;
// exclusive lock
if(lockQs->exclLock->isSet() && (lockQs->exclLock->clientNumID == clientNumID) )
{
*lockQs->exclLock = {};
tryNextWaiters = true;
}
// shared locks
for(EntryLockDetailsSetIter iter = lockQs->sharedLocks->begin();
iter != lockQs->sharedLocks->end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
EntryLockDetailsSetIter iterNext = iter;
iterNext++;
lockQs->sharedLocks->erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(EntryLockDetailsListIter iter = lockQs->waitersExclLock->begin();
iter != lockQs->waitersExclLock->end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersExclLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(EntryLockDetailsListIter iter = lockQs->waitersSharedLock->begin();
iter != lockQs->waitersSharedLock->end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersSharedLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
if (tryNextWaiters)
return flockEntryTryNextWaiters(lockQs);
return {};
}
/**
* Remove all granted and pending locks that match the given handle.
* (This is typically called by clients during file close.)
*
* Note: unlocked, so hold the mutex when calling this.
*
* @return true if locks were removed and next waiters should be tried.
*/
bool FileInode::flockEntryCancelByHandle(EntryLockDetails& lockDetails,
EntryLockQueuesContainer* lockQs)
{
/* note: this code is in many aspects similar to flockEntryCancelByClientID(), so if you change
* something here, you probably want to change it there, too. */
bool tryNextWaiters = false;
// exclusive lock
if(lockQs->exclLock->isSet() && lockDetails.equalsHandle(*lockQs->exclLock) )
{
*lockQs->exclLock = {};
tryNextWaiters = true;
}
// shared locks
for(EntryLockDetailsSetIter iter = lockQs->sharedLocks->begin();
iter != lockQs->sharedLocks->end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
EntryLockDetailsSetIter iterNext = iter;
iterNext++;
lockQs->sharedLocks->erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(EntryLockDetailsListIter iter = lockQs->waitersExclLock->begin();
iter != lockQs->waitersExclLock->end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersExclLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(EntryLockDetailsListIter iter = lockQs->waitersSharedLock->begin();
iter != lockQs->waitersSharedLock->end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersSharedLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
return tryNextWaiters;
}
/**
* Note: Automatically ignores self-conflicts (locks that could be up- or downgraded).
* Note: Make sure to remove lock duplicates before calling this.
* Note: unlocked, so hold the mutex when calling this.
*
* @param outConflictor first identified conflicting lock (only set if true is returned; can be
* NULL if caller is not interested)
* @return true if there is a conflict with a lock that is not owned by the current lock requestor,
 * false if the request can definitely be granted immediately without waiting
*/
bool FileInode::flockEntryCheckConflicts(EntryLockDetails& lockDetails,
EntryLockQueuesContainer* lockQs, EntryLockDetails* outConflictor)
{
// note: we also check waiting writers here, because we have writer preference and so we don't
// want to grant access for a new reader if we have a waiting writer
   // check conflicting exclusive lock (for shared & exclusive requests)
if(lockQs->exclLock->isSet() && !lockQs->exclLock->equalsHandle(lockDetails) )
{
SAFE_ASSIGN(outConflictor, *lockQs->exclLock);
return true;
}
// no exclusive lock exists
if(lockDetails.isExclusive() )
{ // exclusive lock request: check conflicting shared lock
for(EntryLockDetailsSetCIter iterShared = lockQs->sharedLocks->begin();
iterShared != lockQs->sharedLocks->end();
iterShared++)
{
if(!iterShared->equalsHandle(lockDetails) )
{ // found a conflicting lock
SAFE_ASSIGN(outConflictor, *iterShared);
return true;
}
}
}
else
{ // non-exclusive lock: check for waiting writers to enforce writer preference
if(!lockQs->waitersExclLock->empty() )
{
SAFE_ASSIGN(outConflictor, *lockQs->waitersExclLock->begin() );
return true;
}
}
return false;
}
/**
* Find out whether a given lock is currently being held by the given owner.
*
* Note: unlocked, hold the read lock when calling this.
*
* @return true if the given lock is being held by the given owner.
*/
bool FileInode::flockEntryIsGranted(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
if(lockDetails.isExclusive() )
{
if(lockQs->exclLock->equalsHandle(lockDetails) )
{ // was an exclusive lock
return true;
}
}
else
if(lockDetails.isShared() )
{
EntryLockDetailsSetIter iterShared = lockQs->sharedLocks->find(lockDetails);
if(iterShared != lockQs->sharedLocks->end() )
{ // was a shared lock
return true;
}
}
return false;
}
/**
* Note: unlocked, so hold the write lock when calling this.
*
* @return true if an existing lock was released
*/
bool FileInode::flockEntryUnlock(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
if(lockQs->exclLock->equalsHandle(lockDetails) )
{ // was an exclusive lock
*lockQs->exclLock = {};
return true;
}
EntryLockDetailsSetIter iterShared = lockQs->sharedLocks->find(lockDetails);
if(iterShared != lockQs->sharedLocks->end() )
{ // was a shared lock
lockQs->sharedLocks->erase(iterShared);
return true;
}
return false;
}
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockEntryShared(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
lockQs->sharedLocks->insert(lockDetails);
}
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockEntryExclusive(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
*lockQs->exclLock = lockDetails;
}
/**
* Remove next requests from waiters queue and try to grant it - until we reach an entry that
* cannot be granted immediately.
*
* Note: We assume that duplicate waiters and duplicate granted locks (up-/downgrades) have been
* removed before a lock request is enqueued, so we don't check for that.
*
* Note: FileInode must be already write-locked by the caller!
*/
LockEntryNotifyList FileInode::flockEntryTryNextWaiters(EntryLockQueuesContainer* lockQs)
{
/* note: we have writer preference, so we don't grant any new readers while we have waiting
writers */
if(lockQs->exclLock->isSet() )
      return {}; // exclusive lock => there's nothing we can do right now
// no exclusive lock set
if(!lockQs->waitersSharedLock->empty() && lockQs->waitersExclLock->empty() )
{ // shared locks waiting and no exclusive locks waiting => grant all
LockEntryNotifyList notifyList;
while(!lockQs->waitersSharedLock->empty() )
{
flockEntryShared(*lockQs->waitersSharedLock->begin(), lockQs);
notifyList.push_back(*lockQs->waitersSharedLock->begin() );
lockQs->waitersLockIDs->erase(lockQs->waitersSharedLock->begin()->lockAckID);
lockQs->waitersSharedLock->pop_front();
}
return notifyList;
}
// no exclusive and no shared locks set => we can grant an exclusive lock
if(!lockQs->waitersExclLock->empty() )
{ // exclusive locks waiting => grant first one of them
flockEntryExclusive(*lockQs->waitersExclLock->begin(), lockQs);
LockEntryNotifyList notifyList;
notifyList.push_back(*lockQs->waitersExclLock->begin() );
lockQs->waitersLockIDs->erase(lockQs->waitersExclLock->begin()->lockAckID);
lockQs->waitersExclLock->pop_front();
return notifyList;
}
return {};
}
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*/
std::string FileInode::flockAppendGetAllAsStr()
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericGetAllAsStr(&lockQs);
}
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*/
std::string FileInode::flockEntryGetAllAsStr()
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericGetAllAsStr(&lockQs);
}
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*
* Generic version shared by append and flock locking.
*/
std::string FileInode::flockEntryGenericGetAllAsStr(EntryLockQueuesContainer* lockQs)
{
UniqueRWLock lock(rwlock, SafeRWLock_READ);
std::ostringstream outStream;
outStream << "Exclusive" << std::endl;
outStream << "=========" << std::endl;
if(lockQs->exclLock->isSet() )
outStream << lockQs->exclLock->toString() << std::endl;
outStream << std::endl;
outStream << "Shared" << std::endl;
outStream << "=========" << std::endl;
for(EntryLockDetailsSetCIter iter = lockQs->sharedLocks->begin();
iter != lockQs->sharedLocks->end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Exclusive Waiters" << std::endl;
outStream << "=========" << std::endl;
for(EntryLockDetailsListCIter iter = lockQs->waitersExclLock->begin();
iter != lockQs->waitersExclLock->end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Shared Waiters" << std::endl;
outStream << "=========" << std::endl;
for(EntryLockDetailsListCIter iter = lockQs->waitersSharedLock->begin();
iter != lockQs->waitersSharedLock->end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Waiters lockIDs" << std::endl;
outStream << "=========" << std::endl;
for(StringSetCIter iter = lockQs->waitersLockIDs->begin();
iter != lockQs->waitersLockIDs->end();
iter++)
{
outStream << *iter << std::endl;
}
outStream << std::endl;
return outStream.str();
}
/**
* General wrapper for flock lock and unlock operations.
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockRangeNotifyList> FileInode::flockRange(RangeLockDetails& lockDetails)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
return flockRangeUnlocked(lockDetails);
}
/**
* General wrapper for flock lock and unlock operations.
*
* Note: Unlocked, so caller must hold the write lock.
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockRangeNotifyList> FileInode::flockRangeUnlocked(RangeLockDetails& lockDetails)
{
bool tryNextWaiters = false;
bool immediatelyGranted = false; // return value
if(lockDetails.isCancel() )
{
// C A N C E L request
/* note: this is typically used when a client closes a file, so we remove all granted and
pending locks for the given handle here */
if(flockRangeCancelByHandle(lockDetails) )
tryNextWaiters = true;
immediatelyGranted = true;
}
else
if(lockDetails.isUnlock() )
{
// U N L O C K request
tryNextWaiters = flockRangeUnlock(lockDetails);
immediatelyGranted = true;
}
else
{
// L O C K request
// check waiters to filter duplicate requests
StringSetIter iterWaiters = waitersLockIDsRangeFLock.find(lockDetails.lockAckID);
if(iterWaiters != waitersLockIDsRangeFLock.end() )
return {false, {}}; // re-request from waiter, but still in the queue => keep on waiting
// not in waiters queue => is it granted already?
bool isGrantedAlready = flockRangeIsGranted(lockDetails);
if(isGrantedAlready)
return {true, {}}; // request was granted already
// not waiting, not granted => we have a new request
bool hasConflicts = flockRangeCheckConflicts(lockDetails, NULL);
if(!hasConflicts || lockDetails.allowsWaiting() )
tryNextWaiters = flockRangeUnlock(lockDetails); // unlock range (for lock up-/downgrades)
if(lockDetails.isShared() )
{
// S H A R E D lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockRangeShared(lockDetails);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
waitersSharedRangeFLock.push_back(lockDetails);
waitersLockIDsRangeFLock.insert(lockDetails.lockAckID);
}
}
else
{
// E X C L U S I V E lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockRangeExclusive(lockDetails);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
waitersExclRangeFLock.push_back(lockDetails);
waitersLockIDsRangeFLock.insert(lockDetails.lockAckID);
}
}
}
if (tryNextWaiters)
return {immediatelyGranted, flockRangeTryNextWaiters()};
return {immediatelyGranted, {}};
}
/**
* Remove all waiters from the queues.
*/
void FileInode::flockRangeCancelAllWaiters()
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
waitersLockIDsRangeFLock.clear();
waitersExclRangeFLock.clear();
waitersSharedRangeFLock.clear();
}
/**
* Unlock all locks and wait entries of the given clientID.
*/
LockRangeNotifyList FileInode::flockRangeCancelByClientID(NumNodeID clientNumID)
{
/* note: this code is in many aspects similar to flockRangeCancelByHandle(), so if you change
* something here, you probably want to change it there, too. */
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
bool tryNextWaiters = false;
// exclusive locks
for(RangeLockExclSetIter iter = exclRangeFLocks.begin();
iter != exclRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
RangeLockExclSetIter iterNext = iter;
iterNext++;
exclRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
// shared locks
for(RangeLockSharedSetIter iter = sharedRangeFLocks.begin();
iter != sharedRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
RangeLockSharedSetIter iterNext = iter;
iterNext++;
sharedRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersExclRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersSharedRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
if(tryNextWaiters)
return flockRangeTryNextWaiters();
return {};
}
/**
* Remove all granted and pending locks that match the given handle.
* (This is typically called by clients during file close.)
*
* Note: unlocked, so hold the mutex when calling this.
*
* @return true if locks were removed and next waiters should be tried.
*/
bool FileInode::flockRangeCancelByHandle(RangeLockDetails& lockDetails)
{
/* note: this code is in many aspects similar to flockRangeCancelByClientID(), so if you change
* something here, you probably want to change it there, too. */
bool tryNextWaiters = false;
// exclusive locks
for(RangeLockExclSetIter iter = exclRangeFLocks.begin();
iter != exclRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
RangeLockExclSetIter iterNext = iter;
iterNext++;
exclRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
// shared locks
for(RangeLockSharedSetIter iter = sharedRangeFLocks.begin();
iter != sharedRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
RangeLockSharedSetIter iterNext = iter;
iterNext++;
sharedRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersExclRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersSharedRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
return tryNextWaiters;
}
/**
* Checks if there is a conflict for the given lock (but does not actually place lock).
*
 * @param outConflictor the conflicting lock (or one of them) in case we return true.
* @return true if there is a conflict for the given lock request.
*/
bool FileInode::flockRangeGetConflictor(RangeLockDetails& lockDetails, RangeLockDetails* outConflictor)
{
UniqueRWLock lock(rwlock, SafeRWLock_READ);
return flockRangeCheckConflicts(lockDetails, outConflictor);
}
/**
* Note: see flockRangeCheckConflictsEx() for comments (this is just the simple version which
* checks the whole excl waiters queue and hence is inappropriate for tryNextWaiters() ).
*/
bool FileInode::flockRangeCheckConflicts(RangeLockDetails& lockDetails, RangeLockDetails* outConflictor)
{
return flockRangeCheckConflictsEx(lockDetails, -1, outConflictor);
}
/**
* Note: Automatically ignores self-conflicts (locks that could be up- or downgraded)
* Note: unlocked, so hold the mutex when calling this
*
* @param outConflictor first identified conflicting lock (only set if true is returned; can be
* NULL if caller is not interested)
 * @param maxExclWaitersCheckNum only needed by tryNextWaiters() to limit how many pending
 * exclusive waiters ahead of the checked element are tested for conflicts (i.e. for the 5th
 * queue element you would pass 4 here); -1 checks the whole queue, which is what all other
 * callers probably want to do.
* @return true if there is a conflict with a lock that is not owned by the current lock requestor
*/
bool FileInode::flockRangeCheckConflictsEx(RangeLockDetails& lockDetails, int maxExclWaitersCheckNum,
RangeLockDetails* outConflictor)
{
// note: we also check waiting writers here, because we have writer preference and so we don't
// want to grant access for a new reader if we have a waiting writer
// ...and we also don't want to starve writers by other writers, so we also check for
// overlapping waiting writer requests before granting a write lock
// check conflicting exclusive locks (for shared & exclusive requests)
for(RangeLockExclSetCIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end);
iterExcl++)
{
if(lockDetails.overlaps(*iterExcl) &&
!lockDetails.equalsHandle(*iterExcl) )
{
SAFE_ASSIGN(outConflictor, *iterExcl);
return true;
}
}
// no conflicting exclusive lock exists
if(lockDetails.isExclusive() )
{ // exclusive lock request: check conflicting shared locks
// check granted shared locks
for(RangeLockSharedSetCIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
iterShared++)
{
if(lockDetails.overlaps(*iterShared) &&
!lockDetails.equalsHandle(*iterShared) )
{
SAFE_ASSIGN(outConflictor, *iterShared);
return true;
}
}
}
// no conflicting shared lock exists
// check waiting writers (for shared reqs to prefer writers and for excl reqs to avoid
// writer starvation of partially overlapping waiting writers)
// (note: keep in mind that maxExclWaitersCheckNum can also be -1 for infinite checks)
for(RangeLockDetailsListCIter iter = waitersExclRangeFLock.begin();
(iter != waitersExclRangeFLock.end() ) && (maxExclWaitersCheckNum != 0);
iter++, maxExclWaitersCheckNum--)
{
if(lockDetails.overlaps(*iter) &&
!lockDetails.equalsHandle(*iter) )
{
SAFE_ASSIGN(outConflictor, *iter);
return true;
}
}
return false;
}
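/*
 * Worked example for maxExclWaitersCheckNum (comment only): assume the excl waiters queue holds
 * W0, W1, W2 in that order and flockRangeTryNextWaiters() is currently testing W2. It passes 2,
 * so only W0 and W1 (the excl waiters queued before W2) can count as conflictors; W2 itself and
 * anything queued behind it are ignored. All other callers pass -1, which never reaches 0 in the
 * loop condition above and thus checks the whole queue.
 */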
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockRangeShared(RangeLockDetails& lockDetails)
{
// insert shared lock request...
// (avoid duplicates and side-by-side locks for same file handles by merging)
for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
/* conditional iter increment inside loop */ )
{
bool incIterAtEnd = true;
if(lockDetails.equalsHandle(*iterShared) && lockDetails.isMergeable(*iterShared) )
{ // same handle => merge with existing lock
// note: all overlaps will be merged into lockDetails, so every other overlapping entry
// can be removed here
lockDetails.merge(*iterShared);
         RangeLockSharedSetIter iterSharedNext(iterShared);
iterSharedNext++;
sharedRangeFLocks.erase(iterShared);
iterShared = iterSharedNext;
incIterAtEnd = false;
}
if(incIterAtEnd)
iterShared++;
}
// actually insert the new lock
sharedRangeFLocks.insert(lockDetails);
}
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockRangeExclusive(RangeLockDetails& lockDetails)
{
// insert excl lock request...
// (avoid duplicates and side-by-side locks for same file handles by merging)
// (note: lockDetails.end+1: because we're also looking for extensions, not only overlaps)
for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= (lockDetails.end+1) );
/* conditional iter increment inside loop */ )
{
bool incIterAtEnd = true;
if(lockDetails.equalsHandle(*iterExcl) && lockDetails.isMergeable(*iterExcl) )
{ // same handle => merge with existing lock
// note: all overlaps will be merged into lockDetails, so every other overlapping entry
// can be removed here
lockDetails.merge(*iterExcl);
RangeLockExclSetIter iterExclNext(iterExcl);
iterExclNext++;
exclRangeFLocks.erase(iterExcl);
iterExcl = iterExclNext;
incIterAtEnd = false;
}
if(incIterAtEnd)
iterExcl++;
}
// actually insert the new lock
exclRangeFLocks.insert(lockDetails);
}
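/*
 * Illustrative merge behavior of flockRangeShared()/flockRangeExclusive() (comment only, offsets
 * hypothetical): if the same handle already holds an exclusive lock on bytes [0..99] and now
 * requests [100..199], the existing entry is erased and merged into the request (presumably
 * isMergeable() also accepts this adjacent "extension" case, which is why the loop above scans
 * up to lockDetails.end+1), so a single lock [0..199] gets inserted. The shared variant merges
 * the same way, but iterates the whole shared set instead of using the end+1 bound.
 */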
/**
* Find out whether a given range lock is currently being held by the given owner.
*
* Note: unlocked, hold the read lock when calling this.
*
* @return true if the range is locked by the given owner
*/
bool FileInode::flockRangeIsGranted(RangeLockDetails& lockDetails)
{
if(lockDetails.isExclusive() )
{
for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end);
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterExcl) )
{ // lock owned by another client/process
iterExcl++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with given lock
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterExcl);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ /* given range is fully contained in a greater locked area => don't need to look any
further */
return true;
} break;
case RangeOverlapType_CONTAINS:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterExcl++;
}
} // end of exclusive locks check
else
if(lockDetails.isShared() )
{
for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterShared) )
{ // lock owned by another client/process
iterShared++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with given lock
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterShared);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ /* given lock is fully contained in a greater locked area => don't need to look any
further */
return true;
} break;
case RangeOverlapType_CONTAINS:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterShared++;
}
} // end of shared locks check
return false;
}
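/*
 * Illustrative outcomes of flockRangeIsGranted() (comment only, ranges hypothetical): assume the
 * calling handle holds a single exclusive lock on bytes [0..99] and asks about an exclusive lock.
 *    query [0..99]   -> RangeOverlapType_EQUALS      -> granted (true)
 *    query [10..20]  -> RangeOverlapType_ISCONTAINED -> granted (true)
 *    query [0..199]  -> RangeOverlapType_CONTAINS    -> not granted (false); merging guarantees
 *                       the owner cannot hold the missing part in another entry
 *    query [50..150] -> partial overlap (START-/ENDOVERLAP) -> not granted (false)
 */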
/**
* Note: unlocked, so hold the mutex when calling this.
*
* @return true if an existing lock has been removed
*/
bool FileInode::flockRangeUnlock(RangeLockDetails& lockDetails)
{
bool lockRemoved = false; // return value
// check exclusive locks...
// (quick path: if the whole unlock is entirely covered by an exclusive range, then we don't need
// to look any further)
for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end);
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterExcl) )
{ // lock owned by another client/process
iterExcl++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with unlock request
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterExcl);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
exclRangeFLocks.erase(iterExcl);
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ // unlock is fully contained in a greater locked area => don't need to look any further
// check if 1 or 2 locked areas remain (=> shrink or split)
if( (lockDetails.start == iterExcl->start) ||
(lockDetails.end == iterExcl->end) )
{ // only one locked area remains
RangeLockDetails oldExcl(*iterExcl);
oldExcl.trim(lockDetails);
exclRangeFLocks.erase(iterExcl);
exclRangeFLocks.insert(oldExcl);
}
else
{ // two locked areas remain
RangeLockDetails oldExcl(*iterExcl);
RangeLockDetails newExcl;
oldExcl.split(lockDetails, newExcl);
exclRangeFLocks.erase(iterExcl);
exclRangeFLocks.insert(oldExcl);
exclRangeFLocks.insert(newExcl);
}
return true;
} break;
case RangeOverlapType_CONTAINS:
{ // full removal of this lock, but there may still be some others that need to be removed
RangeLockExclSetIter iterExclNext(iterExcl);
iterExclNext++;
exclRangeFLocks.erase(iterExcl);
lockRemoved = true;
iterExcl = iterExclNext;
incIterAtEnd = false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ // partial removal of this lock and there may still be others that need to be removed
// note: might change start and consequently map position => re-insert excl lock
RangeLockExclSetIter iterExclNext(iterExcl);
iterExclNext++;
RangeLockDetails oldExcl(*iterExcl);
oldExcl.trim(lockDetails);
exclRangeFLocks.erase(iterExcl);
exclRangeFLocks.insert(oldExcl);
lockRemoved = true;
iterExcl = iterExclNext;
incIterAtEnd = false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterExcl++;
}
// check shared locks...
// (similar to exclusive locks, we can stop here if unlock is entirely covered by one of our
// owned shared ranges, because there cannot be another overlapping range which we also own)
for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterShared) )
{ // lock owned by another client/process
iterShared++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with unlock request
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterShared);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
sharedRangeFLocks.erase(iterShared);
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ // unlock is fully contained in a greater locked area => don't need to look any further
// check if 1 or 2 locked areas remain...
if( (lockDetails.start == iterShared->start) ||
(lockDetails.end == iterShared->end) )
{ // only one locked area remains
RangeLockDetails oldShared(*iterShared);
oldShared.trim(lockDetails);
sharedRangeFLocks.erase(iterShared);
sharedRangeFLocks.insert(oldShared);
}
else
{ // two locked areas remain
RangeLockDetails oldShared(*iterShared);
RangeLockDetails newShared;
oldShared.split(lockDetails, newShared);
sharedRangeFLocks.erase(iterShared);
sharedRangeFLocks.insert(oldShared);
sharedRangeFLocks.insert(newShared);
}
return true;
} break;
case RangeOverlapType_CONTAINS:
{ // full removal of this lock, but there may still be some others that need to be removed
            RangeLockSharedSetIter iterSharedNext(iterShared);
            iterSharedNext++;
            sharedRangeFLocks.erase(iterShared);
            lockRemoved = true;
            iterShared = iterSharedNext;
incIterAtEnd = false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ // partial removal of this lock and there may still be others that need to be removed
            // note: might change start and consequently map position => re-insert shared lock
            RangeLockSharedSetIter iterSharedNext(iterShared);
iterSharedNext++;
RangeLockDetails oldShared(*iterShared);
oldShared.trim(lockDetails);
sharedRangeFLocks.erase(iterShared);
sharedRangeFLocks.insert(oldShared);
lockRemoved = true;
iterShared = iterSharedNext;
incIterAtEnd = false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterShared++;
}
return lockRemoved;
}
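/*
 * Illustrative shrink/split behavior of flockRangeUnlock() (comment only, ranges hypothetical):
 * assume the handle holds one exclusive lock on bytes [0..99].
 *    unlock [0..99]   -> EQUALS      -> lock removed completely
 *    unlock [0..19]   -> ISCONTAINED with a shared boundary -> lock shrinks to [20..99]
 *    unlock [20..29]  -> ISCONTAINED without shared boundary -> lock splits into [0..19], [30..99]
 *    unlock [50..150] -> partial overlap -> lock is trimmed to [0..49] and the scan continues,
 *                        because further locks of the same owner might also overlap the unlock
 */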
/**
 * Walk the waiters queues and grant every queued request that no longer conflicts; requests that
 * still conflict remain in their queues.
 *
 * Note: unlocked, so hold the mutex when calling this.
 *
 * @return the granted lock requests, so that the caller can notify the corresponding waiters
*/
LockRangeNotifyList FileInode::flockRangeTryNextWaiters()
{
int numWaitersBefore = 0; // number of waiters in the queue before the current checked element
LockRangeNotifyList notifyList; // quick stack version to speed up the no waiter granted path
for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
/* conditional iter inc inside loop */)
{
bool hasConflict = flockRangeCheckConflictsEx(*iter, numWaitersBefore, NULL);
if(hasConflict)
{
iter++;
numWaitersBefore++;
continue;
}
// no conflict => grant lock
flockRangeExclusive(*iter);
notifyList.push_back(*iter);
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersExclRangeFLock.erase(iter);
}
for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
/* conditional iter inc inside loop */)
{
bool hasConflict = flockRangeCheckConflicts(*iter, NULL);
if(hasConflict)
{
iter++;
continue;
}
// no conflict => grant lock
flockRangeShared(*iter);
notifyList.push_back(*iter);
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersSharedRangeFLock.erase(iter);
}
return notifyList;
}
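/*
 * Illustrative scenario for flockRangeTryNextWaiters() (comment only, ranges hypothetical):
 * excl waiters = [W0: 0..99, W1: 50..59], shared waiters = [S0: 200..299], and the lock that
 * previously covered 0..99 was just released. W0 no longer conflicts and is granted; W1 still
 * overlaps the now-granted W0 and stays queued; S0 overlaps neither a granted excl lock nor a
 * queued excl waiter and is granted too. W0 and S0 end up in the returned notify list so the
 * caller can inform the corresponding waiting clients.
 */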
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*/
std::string FileInode::flockRangeGetAllAsStr()
{
UniqueRWLock lock(rwlock, SafeRWLock_READ);
std::ostringstream outStream;
outStream << "Exclusive" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockExclSetCIter iter = exclRangeFLocks.begin();
iter != exclRangeFLocks.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Shared" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockSharedSetCIter iter = sharedRangeFLocks.begin();
iter != sharedRangeFLocks.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Exclusive Waiters" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockDetailsListCIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Shared Waiters" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockDetailsListCIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Waiters lockIDs" << std::endl;
outStream << "=========" << std::endl;
for(StringSetCIter iter = waitersLockIDsRangeFLock.begin();
iter != waitersLockIDsRangeFLock.end();
iter++)
{
outStream << *iter << std::endl;
}
outStream << std::endl;
return outStream.str();
}
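/*
 * Shape of the string produced by flockRangeGetAllAsStr() (comment only; the individual lock
 * lines depend on RangeLockDetails::toString() and are omitted here):
 *
 *    Exclusive
 *    =========
 *    <one line per granted exclusive range lock>
 *
 *    Shared
 *    =========
 *    <one line per granted shared range lock>
 *
 * "Exclusive Waiters", "Shared Waiters" and "Waiters lockIDs" follow in the same style, each
 * section separated by a blank line.
 */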
/**
 * Increase/decrease the link count of this inode.
*/
bool FileInode::incDecNumHardLinks(EntryInfo* entryInfo, int value)
{
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); // L O C K
incDecNumHardlinksUnpersistentUnlocked(value);
// update ctime
StatData* statData = this->inodeDiskData.getInodeStatData();
statData->setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
bool retVal = storeUpdatedInodeUnlocked(entryInfo); // store on disk
if(!retVal)
{ // failed to update metadata on disk => restore old values
incDecNumHardlinksUnpersistentUnlocked(-value);
}
safeLock.unlock(); // U N L O C K
return retVal;
}
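/*
 * Illustrative call of incDecNumHardLinks() (comment only): a caller that removes one hard link
 * to this inode would pass a negative value, e.g.
 *
 *    bool ok = fileInode->incDecNumHardLinks(entryInfo, -1);
 *    if(!ok)
 *    { // disk update failed; the in-memory link count was already rolled back by the method
 *    }
 *
 * so callers only need to react to the return value, not undo the counter themselves.
 */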
bool FileInode::operator==(const FileInode& other) const
{
return inodeDiskData == other.inodeDiskData
&& fileInfoVec == other.fileInfoVec
&& exclusiveTID == other.exclusiveTID
&& numSessionsRead == other.numSessionsRead
&& numSessionsWrite == other.numSessionsWrite
&& exclAppendLock == other.exclAppendLock
&& waitersExclAppendLock == other.waitersExclAppendLock
&& waitersLockIDsAppendLock == other.waitersLockIDsAppendLock
&& exclFLock == other.exclFLock
&& sharedFLocks == other.sharedFLocks
&& waitersExclFLock == other.waitersExclFLock
&& waitersSharedFLock == other.waitersSharedFLock
&& waitersLockIDsFLock == other.waitersLockIDsFLock
&& exclRangeFLocks == other.exclRangeFLocks
&& sharedRangeFLocks == other.sharedRangeFLocks
&& waitersExclRangeFLock == other.waitersExclRangeFLock
&& waitersSharedRangeFLock == other.waitersSharedRangeFLock
&& waitersLockIDsRangeFLock == other.waitersLockIDsRangeFLock
&& dentryCompatData == other.dentryCompatData
&& numParentRefs.read() == other.numParentRefs.read()
&& referenceParentID == other.referenceParentID
&& isInlined == other.isInlined;
}
std::pair<FhgfsOpsErr, StringVector> FileInode::listXAttr()
{
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
return XAttrTk::listUserXAttrs(metaFilename);
}
std::tuple<FhgfsOpsErr, std::vector<char>, ssize_t> FileInode::getXAttr(
const std::string& xAttrName, size_t maxSize)
{
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
return XAttrTk::getUserXAttr(metaFilename, xAttrName, maxSize);
}
FhgfsOpsErr FileInode::removeXAttr(EntryInfo* entryInfo, const std::string& xAttrName)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
FhgfsOpsErr result = XAttrTk::removeUserXAttr(metaFilename, xAttrName);
if (result == FhgfsOpsErr_SUCCESS)
{
inodeDiskData.inodeStatData.setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
storeUpdatedInodeUnlocked(entryInfo, nullptr);
}
// FIXME: should resync only this xattr ON THE INODE
if (getIsBuddyMirroredUnlocked())
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addModification(metaFilename, MetaSyncFileType::Inode);
return result;
}
FhgfsOpsErr FileInode::setXAttr(EntryInfo* entryInfo, const std::string& xAttrName,
const CharVector& xAttrValue, int flags)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
FhgfsOpsErr result = XAttrTk::setUserXAttr(metaFilename, xAttrName, &xAttrValue[0],
xAttrValue.size(), flags);
if (result == FhgfsOpsErr_SUCCESS)
{
inodeDiskData.inodeStatData.setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
storeUpdatedInodeUnlocked(entryInfo, nullptr);
}
// FIXME: should resync only this xattr ON THE INODE
if (getIsBuddyMirroredUnlocked())
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addModification(metaFilename, MetaSyncFileType::Inode);
return result;
}
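/*
 * Illustrative xattr round trip on a non-inlined file inode (comment only; attribute name, value
 * and size limit are hypothetical, error handling trimmed):
 *
 *    CharVector val = {'4', '2'};
 *    fileInode->setXAttr(entryInfo, "user.demo", val, 0); // flags as for setxattr(2)
 *    auto getRes  = fileInode->getXAttr("user.demo", 64*1024); // (error, value, size)
 *    auto listRes = fileInode->listXAttr();                    // (error, names)
 *    fileInode->removeXAttr(entryInfo, "user.demo");
 *
 * All four resolve the inode's own metadata file via MetaStorageTk::getMetaInodePath(), which is
 * why they must not be called for inlined inodes (see the BEEGFS_BUG_ON_DEBUG checks above).
 */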
void FileInode::initLocksRandomForSerializationTests()
{
Random rand;
this->exclusiveTID = rand.getNextInt();
this->numSessionsRead = rand.getNextInt();
this->numSessionsWrite = rand.getNextInt();
this->exclAppendLock.initRandomForSerializationTests();
int max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersExclAppendLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
std::string id;
StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) );
this->waitersLockIDsAppendLock.insert(id);
}
this->exclFLock.initRandomForSerializationTests();
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->sharedFLocks.insert(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersExclFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersSharedFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
std::string id;
StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) );
this->waitersLockIDsFLock.insert(id);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->exclRangeFLocks.insert(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->sharedRangeFLocks.insert(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersExclRangeFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersSharedRangeFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
std::string id;
StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) );
      this->waitersLockIDsRangeFLock.insert(id);
}
StringTk::genRandomAlphaNumericString(this->referenceParentID, rand.getNextInRange(2, 30) );
this->numParentRefs.set(rand.getNextInt() );
}
/**
 * Checks whether the current file state allows the requested access and increments the
 * appropriate session counter if permitted. The entire operation occurs under a single write
 * lock to prevent races between open() operations and state validate-and-update operations.
*
* @param accessFlags OPENFILE_ACCESS_... flags
* @param bypassAccessCheck if true, skip all file state-based access checks
* @return FhgfsOpsErr_SUCCESS if file opened successfully
* FhgfsOpsErr_FILEACCESS_DENIED if file state restricts the requested access
*/
FhgfsOpsErr FileInode::checkAccessAndOpen(unsigned accessFlags, bool bypassAccessCheck)
{
RWLockGuard lock(rwlock, SafeRWLock_WRITE);
if (!bypassAccessCheck)
{
FileState state(getFileStateUnlocked());
// Fast path: Check if file is unlocked (common case)
if (unlikely(!state.isUnlocked()))
{
// File has active state restrictions - determine what access types are being requested
bool readRequested = accessFlags & (OPENFILE_ACCESS_READ | OPENFILE_ACCESS_READWRITE);
bool writeRequested = accessFlags & (OPENFILE_ACCESS_WRITE | OPENFILE_ACCESS_READWRITE |
OPENFILE_ACCESS_TRUNC);
// Access not allowed if: state implies fully locked, or
// read requested when read-locked, or write requested when write-locked
bool blockOpenRequest = state.isFullyLocked() ||
(state.isReadLocked() && readRequested) ||
(state.isWriteLocked() && writeRequested);
if (blockOpenRequest)
return FhgfsOpsErr_FILEACCESS_DENIED;
}
}
// Access allowed - increment session counter
incNumSessionsUnlocked(accessFlags);
return FhgfsOpsErr_SUCCESS;
}
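/*
 * Illustrative open-path usage of checkAccessAndOpen() (comment only; the caller-side flow is an
 * assumption, only the method itself is defined above):
 *
 *    FhgfsOpsErr openRes = fileInode->checkAccessAndOpen(OPENFILE_ACCESS_READ, false);
 *    if(openRes == FhgfsOpsErr_FILEACCESS_DENIED)
 *    { // current file state forbids reading => reject the open request
 *    }
 *    else
 *    { // open allowed; the read session counter has already been incremented
 *    }
 *
 * Whoever closes the handle later is responsible for decrementing the session counter again;
 * that is not part of this method.
 */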