// beegfs/meta/source/storage/FileInode.cpp

#include <common/toolkit/serialization/Serialization.h>
#include <common/toolkit/MathTk.h>
#include <common/storage/striping/Raid0Pattern.h>
#include <common/storage/StorageDefinitions.h>
#include <toolkit/XAttrTk.h>
#include <program/Program.h>
#include "FileInode.h"
#include "Locking.h"
#include <sys/xattr.h>
#include <boost/lexical_cast.hpp>
// shorthand for the long init line of AppendLockQueuesContainer to create on stack
#define FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(varName) \
AppendLockQueuesContainer varName( \
&exclAppendLock, &waitersExclAppendLock, &waitersLockIDsAppendLock)
// shorthand for the long init line of EntryLockQueuesContainer to create on stack
#define FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(varName) \
EntryLockQueuesContainer varName( \
&exclFLock, &sharedFLocks, &waitersExclFLock, &waitersSharedFLock, &waitersLockIDsFLock)
/**
* Inode initialization. The preferred initializer. Used for loading an inode from disk
*/
FileInode::FileInode(std::string entryID, FileInodeStoreData* inodeDiskData,
DirEntryType entryType, unsigned dentryFeatureFlags) : inodeDiskData(entryID, inodeDiskData)
{
this->exclusiveTID = 0;
this->numSessionsRead = 0;
this->numSessionsWrite = 0;
initFileInfoVec();
this->dentryCompatData.entryType = entryType;
this->dentryCompatData.featureFlags = dentryFeatureFlags;
}
/**
* Note: This constructor does not perform the full initialization, so use it for
* metadata loading (or similar deserialization) only.
*
* Note: Don't forget to call initFileInfoVec() when using this (loadFromInodeFile() includes it).
*/
FileInode::FileInode()
{
this->exclusiveTID = 0;
this->numSessionsRead = 0;
this->numSessionsWrite = 0;
this->dentryCompatData.entryType = DirEntryType_INVALID;
this->dentryCompatData.featureFlags = 0;
}
/**
* Requires: init'ed stripe pattern, modification and last access time secs
*/
void FileInode::initFileInfoVec()
{
// create a fileInfo in the vector for each stripe node
StripePattern* pattern = inodeDiskData.getStripePattern();
size_t numTargets = pattern->getStripeTargetIDs()->size();
unsigned chunkSize = pattern->getChunkSize();
unsigned chunkSizeLog2 = MathTk::log2Int32(chunkSize);
uint64_t stripeSetSize = chunkSize * numTargets;
int64_t lastStripeSetSize; // =fileLength%stripeSetSize (remainder after stripeSetStart)
int64_t stripeSetStart; // =fileLength-stripeSetSize
int64_t fullLengthPerTarget; // =stripeSetStart/numTargets (without last stripe set remainder)
StatData* statData = this->inodeDiskData.getInodeStatData();
int64_t fileSize = statData->getFileSize();
/* compute stripeset start to get number of complete chunks on all nodes and stripeset remainder
to compute each target's remainder in the last stripe set. */
   /* note: chunkSize is definitely a power of two. if numTargets is also a power of two, then
      stripeSetSize is a power of two as well */
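   /* worked example (illustrative values): chunkSize=512KiB, numTargets=4, fileSize=5MiB
      => stripeSetSize=2MiB, lastStripeSetSize=1MiB, stripeSetStart=4MiB, fullLengthPerTarget=1MiB;
      in the loop below, targets 0 and 1 each get an extra 512KiB from the last (partial) stripe
      set, so the per-target chunk file lengths are 1.5MiB, 1.5MiB, 1MiB, 1MiB. */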
if(MathTk::isPowerOfTwo(numTargets) )
{ // quick path => optimized without division/modulo
lastStripeSetSize = fileSize & (stripeSetSize-1);
stripeSetStart = fileSize - lastStripeSetSize;
fullLengthPerTarget = stripeSetStart >> MathTk::log2Int32(numTargets);
}
else
{ // slow path => requires division/modulo
lastStripeSetSize = fileSize % stripeSetSize;
stripeSetStart = fileSize - lastStripeSetSize;
fullLengthPerTarget = stripeSetStart / numTargets;
}
// walk over all targets: compute their chunk file sizes and init timestamps
fileInfoVec.reserve(numTargets);
   // to subtract last stripe set length of previous targets in the for-loop below
int64_t remainingLastSetSize = lastStripeSetSize;
for(unsigned target=0; target < numTargets; target++) // iterate over all chunks / targets
{
int64_t targetFileLength = fullLengthPerTarget;
if(remainingLastSetSize > 0)
targetFileLength += BEEGFS_MIN(remainingLastSetSize, chunkSize);
int64_t modificationTimeSecs = statData->getModificationTimeSecs();
int64_t lastAccessTimeSecs = statData->getLastAccessTimeSecs();
uint64_t usedBlocks;
if (statData->getIsSparseFile() )
usedBlocks = statData->getTargetChunkBlocks(target);
else
{ // estimate the number of blocks by the file size
usedBlocks = targetFileLength >> StatData::BLOCK_SHIFT;
}
DynamicFileAttribs dynAttribs(0, targetFileLength, usedBlocks, modificationTimeSecs,
lastAccessTimeSecs);
ChunkFileInfo fileInfo(chunkSize, chunkSizeLog2, dynAttribs);
fileInfoVec.push_back(fileInfo);
remainingLastSetSize -= chunkSize;
}
}
/*
 * Validate and set remote storage targets (RSTs) for this FileInode and persist them on disk.
 */
FhgfsOpsErr FileInode::setRemoteStorageTarget(EntryInfo* entryInfo, const RemoteStorageTarget& rst)
{
const char* logContext = "Set Remote Storage Target (FileInode)";
FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE);
auto [isValid, details] = rst.validateWithDetails();
if (!isValid)
{
LogContext(logContext).log(Log_WARNING, "Invalid RST data: " + details);
retVal = FhgfsOpsErr_INTERNAL;
}
else
{
// set file's rst now
this->rstInfo.set(rst);
if (this->storeRemoteStorageTargetUnlocked(entryInfo))
{
if (!this->getIsRstAvailableUnlocked())
{
addFeatureFlagUnlocked(FILEINODE_FEATURE_HAS_RST);
if (!this->storeUpdatedInodeUnlocked(entryInfo))
retVal = FhgfsOpsErr_INTERNAL;
}
}
else
retVal = FhgfsOpsErr_INTERNAL;
}
safeLock.unlock();
return retVal;
}
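/**
 * Clear the remote storage target info: unset the RST inode feature flag, reset the in-memory
 * RST state and remove the RST xattr from the metadata file.
 */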
FhgfsOpsErr FileInode::clearRemoteStorageTarget(EntryInfo* entryInfo)
{
const char* logContext = "Clear Remote Storage Target (FileInode)";
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
if (!this->getIsRstAvailableUnlocked())
return FhgfsOpsErr_SUCCESS;
// Clear inode feature flag and store updated inode
unsigned flags = this->inodeDiskData.getInodeFeatureFlags();
flags &= ~FILEINODE_FEATURE_HAS_RST;
this->inodeDiskData.setInodeFeatureFlags(flags);
if (!this->storeUpdatedInodeUnlocked(entryInfo))
return FhgfsOpsErr_INTERNAL;
// Clear in-memory RST info
this->rstInfo.reset();
// Remove RST xattr from meta file
std::string metafile = this->getMetaFilePath(entryInfo);
int res = removexattr(metafile.c_str(), RST_XATTR_NAME);
if (unlikely(res == -1))
{
// Not reporting as error to caller because:
// 1. Feature flag is already cleared in metadata
// 2. In-memory state is reset
// 3. Future operations will ignore xattr due to cleared flag
// Just log warning msgs for any unexpected errors or missing xattr.
if (errno == ENODATA)
{
LogContext(logContext).log(Log_WARNING, "RST xattr not found. Path: " + metafile);
}
else
{
LogContext(logContext).log(Log_WARNING, "Failed to remove RST xattr; entryID: " +
entryInfo->getEntryID() + "; error: " + System::getErrString());
}
}
return FhgfsOpsErr_SUCCESS;
}
/**
* Decrease number of sessions for read or write (=> file close) and update persistent
* metadata.
* Note: This currently includes persistent metadata update for efficiency reasons (because
* we already hold the mutex lock here).
*
* @param accessFlags OPENFILE_ACCESS_... flags
*/
void FileInode::decNumSessionsAndStore(EntryInfo* entryInfo, unsigned accessFlags)
{
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE);
if(accessFlags & OPENFILE_ACCESS_READ)
{
if(unlikely(!numSessionsRead) )
{
LogContext log("File::decNumSessionsRead");
log.logErr(
std::string("Warning: numSessionsRead is already zero. " +
std::string("File: ") + getEntryIDUnlocked() ) );
}
else
this->numSessionsRead--;
}
else
{ // (includes read+write)
if(unlikely(!numSessionsWrite) )
{
LogContext log("File::decNumSessionsWrite");
log.logErr(
std::string("Warning: numSessionsWrite is already zero. " +
std::string("File: ") + getEntryIDUnlocked() ) );
}
else
this->numSessionsWrite--;
}
// dyn attribs have been updated during close, so we save them here
storeUpdatedInodeUnlocked(entryInfo);
safeLock.unlock();
}
/**
* Note: This version is compatible with sparse files.
*/
void FileInode::updateDynamicAttribs()
{
this->inodeDiskData.inodeStatData.updateDynamicFileAttribs(this->fileInfoVec,
this->inodeDiskData.getPattern() );
}
/*
* Note: Current object state is used for the serialization
*/
void FileInode::serializeMetaData(Serializer& ser)
{
// note: the total amount of serialized data may not be larger than META_SERBUF_SIZE
// get latest dyn attrib values
updateDynamicAttribs();
   NumNodeID ownerNodeID; /* irrelevant here; the serializer will set it to ourselves for
                           * inlined inodes */
DentryStoreData dentryDiskData(this->inodeDiskData.getEntryID(),
this->dentryCompatData.entryType, ownerNodeID, this->dentryCompatData.featureFlags);
DiskMetaData diskMetaData(&dentryDiskData, &this->inodeDiskData);
diskMetaData.serializeFileInode(ser);
}
/*
* Note: Applies deserialized data directly to the current object
*/
void FileInode::deserializeMetaData(Deserializer& des)
{
DentryStoreData dentryDiskData;
DiskMetaData diskMetaData(&dentryDiskData, &this->inodeDiskData);
diskMetaData.deserializeFileInode(des);
if (!des.good())
return;
{ // dentry compat data
// entryType
this->dentryCompatData.entryType = dentryDiskData.getDirEntryType();
// (dentry) feature flags
this->dentryCompatData.featureFlags = dentryDiskData.getDentryFeatureFlags();
}
}
/**
* Note: Wrapper/chooser for storeUpdatedMetaDataBufAsXAttr/Contents.
* Note: Unlocked, caller must hold write lock.
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBuf(char* buf, unsigned bufLen)
{
App* app = Program::getApp();
bool useXAttrs = app->getConfig()->getStoreUseExtendedAttribs();
const Path* inodesPath =
getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
bool result = useXAttrs
? storeUpdatedMetaDataBufAsXAttr(buf, bufLen, metaFilename)
: storeUpdatedMetaDataBufAsContents(buf, bufLen, metaFilename);
if (getIsBuddyMirroredUnlocked())
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addModification(metaFilename, MetaSyncFileType::Inode);
return result;
}
/**
* Note: Don't call this directly, use the wrapper storeUpdatedMetaDataBuf().
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBufAsXAttr(char* buf, unsigned bufLen, std::string metaFilename)
{
const char* logContext = "File (store updated xattr metadata)";
// open file (create file if not already present)
int openFlags = O_CREAT|O_TRUNC|O_WRONLY;
int fd = open(metaFilename.c_str(), openFlags, 0644);
if (unlikely(fd == -1))
{
LogContext(logContext).logErr("Unable to open/create inode metafile: " + metaFilename
+ ". " + "SysErr: " + System::getErrString());
return false;
}
// write data to file
int setRes = fsetxattr(fd, META_XATTR_NAME, buf, bufLen, 0);
if(unlikely(setRes == -1) )
{ // error
LogContext(logContext).logErr("Unable to write FileInode metadata update: " +
metaFilename + ". " + "SysErr: " + System::getErrString() );
close(fd);
return false;
}
LOG_DEBUG(logContext, 4, "File inode update stored: " + this->inodeDiskData.getEntryID() );
close(fd);
return true;
}
/**
 * Stores the update to a separate file first and then renames it, so the old inode file is
 * replaced atomically and a crash cannot leave a partially written inode behind.
*
* Note: Don't call this directly, use the wrapper storeUpdatedMetaDataBuf().
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBufAsContents(char* buf, unsigned bufLen,
std::string metaFilename)
{
const char* logContext = "File (store updated inode)";
std::string metaUpdateFilename(metaFilename + META_UPDATE_EXT_STR);
ssize_t writeRes;
int renameRes;
// open file (create it, but not O_EXCL because a former update could have failed)
int openFlags = O_CREAT|O_TRUNC|O_WRONLY;
int fd = open(metaUpdateFilename.c_str(), openFlags, 0644);
if(fd == -1)
{ // error
if(errno == ENOSPC)
{ // no free space => try again with update in-place
LogContext(logContext).log(Log_DEBUG, "No space left to create update file. Trying update "
"in-place: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() );
return storeUpdatedMetaDataBufAsContentsInPlace(buf, bufLen, metaFilename);
}
LogContext(logContext).logErr("Unable to create inode update file: " + metaUpdateFilename +
". " + "SysErr: " + System::getErrString() );
return false;
}
// metafile created => store meta data
writeRes = write(fd, buf, bufLen);
if(writeRes != (ssize_t)bufLen)
{
if( (writeRes >= 0) || (errno == ENOSPC) )
{ // no free space => try again with update in-place
LogContext(logContext).log(Log_DEBUG, "No space left to write update inode. Trying update "
"in-place: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() );
close(fd);
unlink(metaUpdateFilename.c_str() );
return storeUpdatedMetaDataBufAsContentsInPlace(buf, bufLen, metaFilename);
}
LogContext(logContext).logErr("Unable to write inode update: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
goto error_closefile;
}
close(fd);
renameRes = rename(metaUpdateFilename.c_str(), metaFilename.c_str() );
if(renameRes == -1)
{
LogContext(logContext).logErr("Unable to replace old inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
goto error_unlink;
}
LOG_DEBUG(logContext, 4, "Inode update stored: " + this->inodeDiskData.getEntryID() );
return true;
// error compensation
error_closefile:
close(fd);
error_unlink:
unlink(metaUpdateFilename.c_str() );
return false;
}
/**
* Stores the update directly to the current metadata file (instead of creating a separate file
* first and renaming it).
*
* Note: Don't call this directly, it is automatically called by storeUpdatedMetaDataBufAsContents()
* when necessary.
*
* @param buf the serialized object state that is to be stored
*/
bool FileInode::storeUpdatedMetaDataBufAsContentsInPlace(char* buf, unsigned bufLen,
std::string metaFilename)
{
const char* logContext = "File (store updated inode in-place)";
int fallocRes;
ssize_t writeRes;
int truncRes;
// open file (create it, but not O_EXCL because a former update could have failed)
int openFlags = O_CREAT|O_WRONLY;
int fd = open(metaFilename.c_str(), openFlags, 0644);
if(fd == -1)
{ // error
LogContext(logContext).logErr("Unable to open inode file: " + metaFilename +
". " + "SysErr: " + System::getErrString() );
return false;
}
// make sure we have enough room to write our update
fallocRes = posix_fallocate(fd, 0, bufLen); // (note: posix_fallocate does not set errno)
if(fallocRes == EBADF)
{ // special case for XFS bug
struct stat statBuf;
int statRes = fstat(fd, &statBuf);
if (statRes == -1)
{
LogContext(logContext).log(Log_WARNING, "Unexpected error: fstat() failed with SysErr: "
+ System::getErrString(errno));
goto error_closefile;
}
if (statBuf.st_size < bufLen)
{
LogContext(logContext).log(Log_WARNING, "File space allocation ("
+ StringTk::intToStr(bufLen) + ") for inode update failed: " + metaFilename + ". " +
"SysErr: " + System::getErrString(fallocRes) + " "
"statRes: " + StringTk::intToStr(statRes) + " "
"oldSize: " + StringTk::intToStr(statBuf.st_size));
goto error_closefile;
}
else
      { // XFS bug: we only return an error if statBuf.st_size < bufLen; ignore fallocRes otherwise
LOG_DEBUG(logContext, Log_SPAM, "Ignoring kernel file system bug: "
"posix_fallocate() failed for len < filesize");
}
}
else
if (fallocRes != 0)
{ // default error handling if posix_fallocate() failed
LogContext(logContext).log(Log_WARNING, "File space allocation ("
+ StringTk::intToStr(bufLen) + ") for inode update failed: " + metaFilename + ". " +
"SysErr: " + System::getErrString(fallocRes) );
goto error_closefile;
}
// metafile created => store meta data
writeRes = write(fd, buf, bufLen);
if(writeRes != (ssize_t)bufLen)
{
LogContext(logContext).logErr("Unable to write inode update: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
goto error_closefile;
}
   // truncate in case the update led to a smaller file size
   truncRes = ftruncate(fd, bufLen);
   if(truncRes == -1)
   { // ignore trunc errors
      LogContext(logContext).log(Log_WARNING, "Unable to truncate inode file (strange, but "
         "proceeding anyways): " + metaFilename + ". " + "SysErr: " + System::getErrString() );
   }
   close(fd); // close only after the truncate, so ftruncate() still operates on a valid fd
LOG_DEBUG(logContext, 4, "File inode update stored: " + this->inodeDiskData.getEntryID() );
return true;
// error compensation
error_closefile:
close(fd);
return false;
}
/**
* Update the inode on disk
*
* Note: We already need to have a FileInode (WRITE) rwlock here
*/
bool FileInode::storeUpdatedInodeUnlocked(EntryInfo* entryInfo, StripePattern* updatedStripePattern)
{
const char* logContext = "FileInode (store updated Inode)";
bool saveRes;
bool isInLined = this->isInlined;
if (isInLined)
{
FhgfsOpsErr dentrySaveRes = storeUpdatedInlinedInodeUnlocked(entryInfo, updatedStripePattern);
if (dentrySaveRes == FhgfsOpsErr_SUCCESS)
return true;
// dentrySaveRes != FhgfsOpsErr_SUCCESS
std::string parentID = entryInfo->getParentEntryID();
std::string entryID = entryInfo->getEntryID();
std::string fileName = entryInfo->getFileName();
if (dentrySaveRes == FhgfsOpsErr_INODENOTINLINED)
{
         /* dentrySaveRes == FhgfsOpsErr_INODENOTINLINED. Our internal inode information says the
          * inode is inlined, but on writing it we find out that it is not. As we are holding a
          * write lock here, this should never happen. It probably indicates a locking bug, but it
          * is not critical here and we retry using the non-inlined way.
          */
LogContext(logContext).log(Log_WARNING, std::string("Inode unexpectedly not inlined: ") +
"parentID: "+ parentID + " entryID: " + entryID + " fileName: " + fileName );
this->isInlined = false;
}
else
{
LogContext(logContext).log(Log_WARNING, std::string("Failed to write inlined inode: ") +
"parentID: "+ parentID + " entryID: " + entryID + " fileName: " + fileName +
" Error: " + boost::lexical_cast<std::string>(dentrySaveRes));
#ifdef BEEGFS_DEBUG
LogContext(logContext).logBacktrace();
#endif
}
      // now fall through to the non-inlined handling below and hope that this works
}
// inode not inlined
// change the stripe pattern here before serializing;
if (unlikely(updatedStripePattern))
{
StripePattern* pattern = this->inodeDiskData.getPattern();
if (!pattern->updateStripeTargetIDs(updatedStripePattern))
LogContext(logContext).log(Log_WARNING, "Could not set requested new stripe pattern");
}
char buf[META_SERBUF_SIZE];
Serializer ser(buf, sizeof(buf));
serializeMetaData(ser);
if (ser.good())
saveRes = storeUpdatedMetaDataBuf(buf, ser.size());
else
saveRes = false;
if (!saveRes && isInlined)
{
LogContext(logContext).log(Log_WARNING, std::string("Trying to write as non-inlined inode "
"also failed.") );
}
return saveRes;
}
/**
* Update an inode, which is inlined into a dentry
*/
FhgfsOpsErr FileInode::storeUpdatedInlinedInodeUnlocked(EntryInfo* entryInfo,
StripePattern* updatedStripePattern)
{
const char* logContext = "DirEntry (storeUpdatedInode)";
App* app = Program::getApp();
// get latest dyn attrib vals...
updateDynamicAttribs();
std::string parentEntryID = entryInfo->getParentEntryID();
const Path* dentriesPath =
entryInfo->getIsBuddyMirrored() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath();
std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentriesPath->str(),
parentEntryID);
FileInodeStoreData* inodeDiskData = this->getInodeDiskData();
if (unlikely(updatedStripePattern))
{
// note: We do not set the complete stripe pattern here, but only the stripe target IDs
if (! inodeDiskData->getPattern()->updateStripeTargetIDs(updatedStripePattern))
LogContext(logContext).log(Log_WARNING, "Could not set new stripe target IDs.");
}
DirEntry dirEntry(entryInfo->getEntryType(), entryInfo->getFileName(),
entryInfo->getEntryID(), entryInfo->getOwnerNodeID() );
/* Note: As we are called from FileInode most data of this DirEntry are unknown and we need to
* load it from disk. */
bool loadRes = dirEntry.loadFromID(dirEntryPath, entryInfo->getEntryID() );
if (!loadRes)
return FhgfsOpsErr_INTERNAL;
FileInodeStoreData* entryInodeDiskData = dirEntry.getInodeStoreData();
entryInodeDiskData->setFileInodeStoreData(inodeDiskData);
FhgfsOpsErr retVal = dirEntry.storeUpdatedInode(dirEntryPath);
return retVal;
}
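/**
 * Get the on-disk path of the file that holds this inode's metadata: for inlined inodes this
 * is the dentry-by-ID file inside the parent's dentry directory, otherwise the dedicated
 * inode file in the inodes directory.
 */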
std::string FileInode::getMetaFilePath(EntryInfo* entryInfo)
{
App* app = Program::getApp();
if (isInlined)
{
const Path* dentriesPath = getIsBuddyMirroredUnlocked()
? app->getBuddyMirrorDentriesPath()
: app->getDentriesPath();
std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(
dentriesPath->str(), entryInfo->getParentEntryID());
return MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) + entryInfo->getEntryID();
}
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? app->getBuddyMirrorInodesPath()
: app->getInodesPath();
return MetaStorageTk::getMetaInodePath(inodesPath->str(), entryInfo->getEntryID());
}
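/**
 * Serialize the in-memory RST info and store it as an xattr of this inode's metadata file.
 *
 * Note: Unlocked, so caller must hold the write lock.
 *
 * @return false on serialization errors or if xattr-based metadata storage is disabled.
 */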
bool FileInode::storeRemoteStorageTargetUnlocked(EntryInfo* entryInfo)
{
std::string metafile = getMetaFilePath(entryInfo);
char buf[META_SERBUF_SIZE];
Serializer ser(buf, sizeof(buf));
ser % rstInfo;
if (!ser.good())
return false;
bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();
if (useXAttrs)
return storeRemoteStorageTargetBufAsXAttr(buf, ser.size(), metafile);
else
{
LOG(GENERAL, WARNING, "Storing RST info as file contents is unsupported. "
"Please check the 'storeUseExtendedAttribs' setting in the BeeGFS meta config");
return false;
}
}
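/**
 * Store the given serialized RST info as an xattr of the given metadata file.
 *
 * @param buf the serialized RST info that is to be stored
 */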
bool FileInode::storeRemoteStorageTargetBufAsXAttr(char* buf, unsigned bufLen, const std::string& metafilename)
{
const char* logContext = "FileInode (store remote storage target as xattr)";
int setRes = setxattr(metafilename.c_str(), RST_XATTR_NAME, buf, bufLen, 0);
if (unlikely(setRes == -1))
{
// error
LogContext(logContext).logErr("Unable to write remote storage target info to disk: "
+ metafilename + ". SysErr: " + System::getErrString());
return false;
}
return true;
}
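/**
 * Delete the non-inlined inode file of the given entryID from the inodes directory.
 *
 * @return true if the inode file was deleted or did not exist anymore; false on other errors.
 */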
bool FileInode::removeStoredMetaData(const std::string& id, bool isBuddyMirrored)
{
const char* logContext = "FileInode (remove stored metadata)";
App* app = Program::getApp();
std::string inodeFilename = MetaStorageTk::getMetaInodePath(
isBuddyMirrored
? app->getBuddyMirrorInodesPath()->str()
: app->getInodesPath()->str(),
id);
// delete metadata file
int unlinkRes = unlink(inodeFilename.c_str() );
   /* ignore errno == ENOENT: the file does not exist anymore, for whatever reason. Although
    * unlink() failed, we do not care, because our goal is still reached. The same applies to
    * inode removal: if the dir-entry does not exist either, the application will still get
    * the right error code */
if(unlinkRes == -1 && errno != ENOENT)
{ // error
LogContext(logContext).logErr("Unable to delete inode file: " + inodeFilename + ". " +
"SysErr: " + System::getErrString() );
return false;
}
LOG_DEBUG(logContext, 4, "Inode file deleted: " + inodeFilename);
if (isBuddyMirrored)
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addDeletion(inodeFilename, MetaSyncFileType::Inode);
return true;
}
/**
* Note: Wrapper/chooser for loadFromFileXAttr/Contents.
* Note: This also (indirectly) calls initFileInfoVec()
*/
bool FileInode::loadFromInodeFile(EntryInfo* entryInfo)
{
bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();
if(useXAttrs)
return loadFromFileXAttr(entryInfo->getEntryID(), entryInfo->getIsBuddyMirrored() );
return loadFromFileContents(entryInfo->getEntryID(), entryInfo->getIsBuddyMirrored() );
}
/**
* Note: Don't call this directly, use the wrapper loadFromInodeFile().
* Note: This also calls initFileInfoVec()
*/
bool FileInode::loadFromFileXAttr(const std::string& id, bool isBuddyMirrored)
{
const char* logContext = "File inode (load from xattr file)";
App* app = Program::getApp();
const Path* inodePath = isBuddyMirrored ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);
bool retVal = false;
char buf[META_SERBUF_SIZE];
ssize_t getRes = getxattr(metaFilename.c_str(), META_XATTR_NAME, buf, META_SERBUF_SIZE);
if(getRes > 0)
{ // we got something => deserialize it
Deserializer des(buf, getRes);
deserializeMetaData(des);
if(unlikely(!des.good()))
{ // deserialization failed
LogContext(logContext).logErr("Unable to deserialize metadata in file: " + metaFilename);
goto error_exit;
}
// deserialization successful => init dyn attribs
initFileInfoVec(); /* note: this can only be done after the stripePattern
has been initialized, that's why we do it here at this "unusual" place. */
retVal = true;
}
else
if( (getRes == -1) && (errno == ENOENT) )
   { // file does not exist
      LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, "Inode file does not exist: " +
         metaFilename + ". " + "SysErr: " + System::getErrString() );
}
else
{ // unhandled error
LogContext(logContext).logErr("Unable to open/read inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
}
error_exit:
return retVal;
}
/**
* Note: Don't call this directly, use the wrapper loadFromInodeFile().
* Note: This also calls initFileInfoVec()
*/
bool FileInode::loadFromFileContents(const std::string& id, bool isBuddyMirrored)
{
const char* logContext = "File inode (load from file)";
App* app = Program::getApp();
const Path* inodePath = isBuddyMirrored ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);
bool retVal = false;
int openFlags = O_NOATIME | O_RDONLY;
int fd = open(metaFilename.c_str(), openFlags, 0);
if(fd == -1)
{ // open failed
if(errno != ENOENT)
LogContext(logContext).logErr("Unable to open inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
return false;
}
char buf[META_SERBUF_SIZE];
int readRes = read(fd, buf, META_SERBUF_SIZE);
if(readRes <= 0)
{ // reading failed
LogContext(logContext).logErr("Unable to read inode file: " + metaFilename + ". " +
"SysErr: " + System::getErrString() );
}
else
{
Deserializer des(buf, readRes);
deserializeMetaData(des);
if(!des.good())
{ // deserialization failed
LogContext(logContext).logErr("Unable to deserialize inode in file: " + metaFilename);
}
else
{ // deserialization successful => init dyn attribs
initFileInfoVec(); // note: this can only be done after the stripePattern
// has been initialized, that's why we do it here at this "unusual" place
retVal = true;
}
}
close(fd);
return retVal;
}
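/**
 * Load the remote storage target (RST) info from this inode's metadata file. RST info is only
 * stored as an xattr, so this always returns false if xattr-based metadata storage is disabled.
 */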
bool FileInode::loadRstFromInodeFile(EntryInfo* entryInfo)
{
bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();
if (useXAttrs)
return loadRstFromFileXAttr(entryInfo);
return false;
}
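/**
 * Load the RST info from the RST xattr of this inode's metadata file (inlined or non-inlined).
 *
 * Note: Don't call this directly, use the wrapper loadRstFromInodeFile().
 */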
bool FileInode::loadRstFromFileXAttr(EntryInfo* entryInfo)
{
const char* logContext = "File inode RST (load from xattr file)";
App* app = Program::getApp();
std::string metafile;
if (isInlined)
{
const Path* dentriesPath =
getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath();
std::string dirEntryPath =
MetaStorageTk::getMetaDirEntryPath(dentriesPath->str(), entryInfo->getParentEntryID());
metafile = MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) + entryInfo->getEntryID();
}
else
{
const Path* inodesPath =
getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath(): app->getInodesPath();
metafile = MetaStorageTk::getMetaInodePath(inodesPath->str(), entryInfo->getEntryID());
}
char buf[META_SERBUF_SIZE];
ssize_t getRes = getxattr(metafile.c_str(), RST_XATTR_NAME, buf, META_SERBUF_SIZE);
if (getRes > 0)
{
// we got something => deserialize it
Deserializer des(buf, getRes);
des % this->rstInfo;
if (unlikely(!des.good()))
{
// deserialization failed
LogContext(logContext).logErr("Unable to deserialize remote storage targets"
", file: " + metafile);
return false;
}
return true;
}
else
if( (getRes == -1) && (errno == ENOENT) )
   { // file does not exist
      LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, "Inode file does not exist: " +
         metafile + ". " + "SysErr: " + System::getErrString() );
}
else
{ // unhandled error
LogContext(logContext).logErr("Unable to open/read inode file: " + metafile + ". " +
"SysErr: " + System::getErrString() );
}
return false;
}
/**
* Create an inode from an entryInfo.
*
 * Note: The entryInfo indicates whether the inode is inlined or not. However, this information
 * might be outdated, so we try both inlined and file-inode access if the first attempt to
 * create the inode fails.
 * We rely on kernel lookup calls to update the client-side entryInfo data.
*/
FileInode* FileInode::createFromEntryInfo(EntryInfo* entryInfo)
{
FileInode* inode;
if (entryInfo->getIsInlined() )
{
      /* entryInfo indicates the inode is inlined. So first try to get the inode from the
       * dir-entry's inlined inode and if that fails try again with an inode-file. */
inode = createFromInlinedInode(entryInfo);
if (!inode)
inode = createFromInodeFile(entryInfo);
}
else
{
/* entryInfo indicates the inode is not inlined, but a separate inode-file. So first
* try to get the inode by inode-file and only if that fails try again with the dir-entry,
* maybe the inode was re-inlined. */
inode = createFromInodeFile(entryInfo);
if (!inode)
inode = createFromInlinedInode(entryInfo);
}
if (likely(inode) && inode->getIsRstAvailableUnlocked())
inode->loadRstFromInodeFile(entryInfo);
return inode;
}
/**
* Inode from inode file (inode is not inlined)
*
* Note: Do not call directly, but use FileInode::createFromEntryInfo()
*/
FileInode* FileInode::createFromInodeFile(EntryInfo* entryInfo)
{
FileInode* newInode = new FileInode();
bool loadRes = newInode->loadFromInodeFile(entryInfo);
if(!loadRes)
{
delete(newInode);
return NULL;
}
newInode->setIsInlinedUnlocked(false);
return newInode;
}
/**
* Inode from dir-entry with inlined inode.
*
* Note: Do not call directly, but use FileInode::createFromEntryInfo()
*/
FileInode* FileInode::createFromInlinedInode(EntryInfo* entryInfo)
{
App* app = Program::getApp();
std::string parentEntryID = entryInfo->getParentEntryID();
const Path* dentryPath =
entryInfo->getIsBuddyMirrored() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath();
std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentryPath->str(),
parentEntryID);
DirEntry dirEntry(entryInfo->getEntryType(), entryInfo->getFileName(),
entryInfo->getEntryID(), entryInfo->getOwnerNodeID() );
FileInode* newInode = dirEntry.createInodeByID(dirEntryPath, entryInfo);
if (newInode)
newInode->setIsInlinedUnlocked(true);
return newInode;
}
/**
 * Update entry attributes as chmod(), chown(), utime() etc. would do it.
 *
 * Note: modificationTimeSecs and lastAccessTimeSecs are dynamic attribs, so they require
 * special handling by the caller (but we also set the static attribs here).
 * @param validAttribs SETATTR_CHANGE_...-Flags; may be 0 if we only want to update
 * AttribChangeTimeSecs.
* @param attribs new attributes, but may be NULL if validAttribs == 0
*/
bool FileInode::setAttrData(EntryInfo * entryInfo, int validAttribs, SettableFileAttribs* attribs)
{
bool success = true;
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); // L O C K
   /* note: modificationTimeSecs and lastAccessTimeSecs are dynamic attribs, so they require
      special handling by the caller (i.e. to also update chunk files) */
// save old attribs
StatData* statData = this->inodeDiskData.getInodeStatData();
SettableFileAttribs oldAttribs = *(statData->getSettableFileAttribs() );
statData->setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
if(validAttribs)
{
// apply new attribs wrt flags...
if(validAttribs & SETATTR_CHANGE_MODE)
statData->setMode(attribs->mode);
if(validAttribs & SETATTR_CHANGE_MODIFICATIONTIME)
{
/* only static value update required for storeUpdatedInodeUnlocked() */
statData->setModificationTimeSecs(attribs->modificationTimeSecs);
}
if(validAttribs & SETATTR_CHANGE_LASTACCESSTIME)
{
/* only static value update required for storeUpdatedInodeUnlocked() */
statData->setLastAccessTimeSecs(attribs->lastAccessTimeSecs);
}
if(validAttribs & SETATTR_CHANGE_USERID)
{
statData->setUserID(attribs->userID);
if ((attribs->userID != this->inodeDiskData.getOrigUID() ) &&
(this->inodeDiskData.getOrigFeature() == FileInodeOrigFeature_TRUE) )
addFeatureFlagUnlocked(FILEINODE_FEATURE_HAS_ORIG_UID);
}
if(validAttribs & SETATTR_CHANGE_GROUPID)
statData->setGroupID(attribs->groupID);
}
bool storeRes = storeUpdatedInodeUnlocked(entryInfo); // store on disk
if(!storeRes)
{ // failed to update metadata on disk => restore old values
statData->setSettableFileAttribs(oldAttribs);
success = false;
goto err_unlock;
}
// persistent update succeeded
// update attribs vec (wasn't done earlier because of backup overhead for restore on error)
if(validAttribs & SETATTR_CHANGE_MODIFICATIONTIME)
{
for(size_t i=0; i < fileInfoVec.size(); i++)
fileInfoVec[i].getRawDynAttribs()->modificationTimeSecs = attribs->modificationTimeSecs;
}
if(validAttribs & SETATTR_CHANGE_LASTACCESSTIME)
{
for(size_t i=0; i < fileInfoVec.size(); i++)
fileInfoVec[i].getRawDynAttribs()->lastAccessTimeSecs = attribs->lastAccessTimeSecs;
}
err_unlock:
safeLock.unlock(); // U N L O C K
return success;
}
/**
* General wrapper for append lock and unlock operations.
*
* Append supports exclusive locking only, no shared locks.
*
* Note: Unlocks are always immediately granted (=> they always return "true").
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockEntryNotifyList> FileInode::flockAppend(EntryLockDetails& lockDetails)
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
return flockEntryUnlocked(lockDetails, &lockQs);
}
/**
* General wrapper for flock lock and unlock operations.
*
* Note: Unlocks are always immediately granted (=> they always return "true").
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockEntryNotifyList> FileInode::flockEntry(EntryLockDetails& lockDetails)
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
return flockEntryUnlocked(lockDetails, &lockQs);
}
/**
* General wrapper for flock lock and unlock operations.
*
* Note: Unlocks are always immediately granted (=> they always return "true").
* Note: Unlocked version => caller must hold write lock.
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockEntryNotifyList> FileInode::flockEntryUnlocked(EntryLockDetails& lockDetails,
EntryLockQueuesContainer* lockQs)
{
bool tryNextWaiters = false;
bool immediatelyGranted = false; // return value
if(lockDetails.isCancel() )
{
// C A N C E L request
/* note: this is typically used when a client closes a file, so we remove all granted and
pending locks for the given handle here */
if(flockEntryCancelByHandle(lockDetails, lockQs) )
tryNextWaiters = true;
immediatelyGranted = true;
}
else
if(lockDetails.isUnlock() )
{
// U N L O C K request
tryNextWaiters = flockEntryUnlock(lockDetails, lockQs);
immediatelyGranted = true;
}
else
{
// L O C K request
// check waiters to filter duplicate requests
StringSetIter iterWaiters = lockQs->waitersLockIDs->find(lockDetails.lockAckID);
if(iterWaiters != lockQs->waitersLockIDs->end() )
return {false, {}}; // re-request from waiter, but still in the queue => keep on waiting
// not in waiters queue => is it granted already?
bool isGrantedAlready = flockEntryIsGranted(lockDetails, lockQs);
if(isGrantedAlready)
return {true, {}}; // request was granted already
// not waiting, not granted => we have a new request
bool hasConflicts = flockEntryCheckConflicts(lockDetails, lockQs, NULL);
if(!hasConflicts || lockDetails.allowsWaiting() )
tryNextWaiters = flockEntryUnlock(lockDetails, lockQs); // unlock (for lock up-/downgrades)
if(lockDetails.isShared() )
{
// S H A R E D lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockEntryShared(lockDetails, lockQs);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
lockQs->waitersSharedLock->push_back(lockDetails);
lockQs->waitersLockIDs->insert(lockDetails.lockAckID);
}
}
else
{
// E X C L U S I V E lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockEntryExclusive(lockDetails, lockQs);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
lockQs->waitersExclLock->push_back(lockDetails);
lockQs->waitersLockIDs->insert(lockDetails.lockAckID);
}
}
}
if (tryNextWaiters)
return {immediatelyGranted, flockEntryTryNextWaiters(lockQs)};
return {immediatelyGranted, {}};
}
/**
* Remove all waiters from the queues.
*/
void FileInode::flockAppendCancelAllWaiters()
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
flockEntryGenericCancelAllWaiters(&lockQs);
}
/**
* Remove all waiters from the queues.
*/
void FileInode::flockEntryCancelAllWaiters()
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
flockEntryGenericCancelAllWaiters(&lockQs);
}
/**
* Remove all waiters from the queues.
*
* Generic version shared by append and flock locking.
*/
void FileInode::flockEntryGenericCancelAllWaiters(EntryLockQueuesContainer* lockQs)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
lockQs->waitersLockIDs->clear();
lockQs->waitersExclLock->clear();
lockQs->waitersSharedLock->clear();
}
/**
* Unlock all locks and wait entries of the given clientID.
*/
LockEntryNotifyList FileInode::flockAppendCancelByClientID(NumNodeID clientID)
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericCancelByClientID(clientID, &lockQs);
}
/**
* Unlock all locks and wait entries of the given clientID.
*/
LockEntryNotifyList FileInode::flockEntryCancelByClientID(NumNodeID clientID)
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericCancelByClientID(clientID, &lockQs);
}
/**
* Unlock all locks and wait entries of the given clientID.
*
* Generic version shared by append and flock locking.
*/
LockEntryNotifyList FileInode::flockEntryGenericCancelByClientID(NumNodeID clientNumID,
EntryLockQueuesContainer* lockQs)
{
/* note: this code is in many aspects similar to flockEntryCancelByHandle(), so if you change
* something here, you probably want to change it there, too. */
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
bool tryNextWaiters = false;
// exclusive lock
if(lockQs->exclLock->isSet() && (lockQs->exclLock->clientNumID == clientNumID) )
{
*lockQs->exclLock = {};
tryNextWaiters = true;
}
// shared locks
for(EntryLockDetailsSetIter iter = lockQs->sharedLocks->begin();
iter != lockQs->sharedLocks->end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
EntryLockDetailsSetIter iterNext = iter;
iterNext++;
lockQs->sharedLocks->erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(EntryLockDetailsListIter iter = lockQs->waitersExclLock->begin();
iter != lockQs->waitersExclLock->end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersExclLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(EntryLockDetailsListIter iter = lockQs->waitersSharedLock->begin();
iter != lockQs->waitersSharedLock->end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersSharedLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
if (tryNextWaiters)
return flockEntryTryNextWaiters(lockQs);
return {};
}
/**
* Remove all granted and pending locks that match the given handle.
* (This is typically called by clients during file close.)
*
* Note: unlocked, so hold the mutex when calling this.
*
* @return true if locks were removed and next waiters should be tried.
*/
bool FileInode::flockEntryCancelByHandle(EntryLockDetails& lockDetails,
EntryLockQueuesContainer* lockQs)
{
/* note: this code is in many aspects similar to flockEntryCancelByClientID(), so if you change
* something here, you probably want to change it there, too. */
bool tryNextWaiters = false;
// exclusive lock
if(lockQs->exclLock->isSet() && lockDetails.equalsHandle(*lockQs->exclLock) )
{
*lockQs->exclLock = {};
tryNextWaiters = true;
}
// shared locks
for(EntryLockDetailsSetIter iter = lockQs->sharedLocks->begin();
iter != lockQs->sharedLocks->end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
EntryLockDetailsSetIter iterNext = iter;
iterNext++;
lockQs->sharedLocks->erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(EntryLockDetailsListIter iter = lockQs->waitersExclLock->begin();
iter != lockQs->waitersExclLock->end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersExclLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(EntryLockDetailsListIter iter = lockQs->waitersSharedLock->begin();
iter != lockQs->waitersSharedLock->end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
lockQs->waitersLockIDs->erase(iter->lockAckID);
iter = lockQs->waitersSharedLock->erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
return tryNextWaiters;
}
/**
* Note: Automatically ignores self-conflicts (locks that could be up- or downgraded).
* Note: Make sure to remove lock duplicates before calling this.
* Note: unlocked, so hold the mutex when calling this.
*
* @param outConflictor first identified conflicting lock (only set if true is returned; can be
* NULL if caller is not interested)
* @return true if there is a conflict with a lock that is not owned by the current lock requestor,
 * false if the request can definitely be granted immediately without waiting
*/
bool FileInode::flockEntryCheckConflicts(EntryLockDetails& lockDetails,
EntryLockQueuesContainer* lockQs, EntryLockDetails* outConflictor)
{
// note: we also check waiting writers here, because we have writer preference and so we don't
// want to grant access for a new reader if we have a waiting writer
   // check conflicting exclusive lock (for shared & exclusive requests)
if(lockQs->exclLock->isSet() && !lockQs->exclLock->equalsHandle(lockDetails) )
{
SAFE_ASSIGN(outConflictor, *lockQs->exclLock);
return true;
}
// no exclusive lock exists
if(lockDetails.isExclusive() )
{ // exclusive lock request: check conflicting shared lock
for(EntryLockDetailsSetCIter iterShared = lockQs->sharedLocks->begin();
iterShared != lockQs->sharedLocks->end();
iterShared++)
{
if(!iterShared->equalsHandle(lockDetails) )
{ // found a conflicting lock
SAFE_ASSIGN(outConflictor, *iterShared);
return true;
}
}
}
else
{ // non-exclusive lock: check for waiting writers to enforce writer preference
if(!lockQs->waitersExclLock->empty() )
{
SAFE_ASSIGN(outConflictor, *lockQs->waitersExclLock->begin() );
return true;
}
}
return false;
}
/**
* Find out whether a given lock is currently being held by the given owner.
*
* Note: unlocked, hold the read lock when calling this.
*
* @return true if the given lock is being held by the given owner.
*/
bool FileInode::flockEntryIsGranted(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
if(lockDetails.isExclusive() )
{
if(lockQs->exclLock->equalsHandle(lockDetails) )
{ // was an exclusive lock
return true;
}
}
else
if(lockDetails.isShared() )
{
EntryLockDetailsSetIter iterShared = lockQs->sharedLocks->find(lockDetails);
if(iterShared != lockQs->sharedLocks->end() )
{ // was a shared lock
return true;
}
}
return false;
}
/**
* Note: unlocked, so hold the write lock when calling this.
*
* @return true if an existing lock was released
*/
bool FileInode::flockEntryUnlock(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
if(lockQs->exclLock->equalsHandle(lockDetails) )
{ // was an exclusive lock
*lockQs->exclLock = {};
return true;
}
EntryLockDetailsSetIter iterShared = lockQs->sharedLocks->find(lockDetails);
if(iterShared != lockQs->sharedLocks->end() )
{ // was a shared lock
lockQs->sharedLocks->erase(iterShared);
return true;
}
return false;
}
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockEntryShared(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
lockQs->sharedLocks->insert(lockDetails);
}
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockEntryExclusive(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs)
{
*lockQs->exclLock = lockDetails;
}
/**
* Remove next requests from waiters queue and try to grant it - until we reach an entry that
* cannot be granted immediately.
*
* Note: We assume that duplicate waiters and duplicate granted locks (up-/downgrades) have been
* removed before a lock request is enqueued, so we don't check for that.
*
* Note: FileInode must be already write-locked by the caller!
*/
LockEntryNotifyList FileInode::flockEntryTryNextWaiters(EntryLockQueuesContainer* lockQs)
{
/* note: we have writer preference, so we don't grant any new readers while we have waiting
writers */
if(lockQs->exclLock->isSet() )
      return {}; // exclusive lock => there's nothing we can do right now
// no exclusive lock set
if(!lockQs->waitersSharedLock->empty() && lockQs->waitersExclLock->empty() )
{ // shared locks waiting and no exclusive locks waiting => grant all
LockEntryNotifyList notifyList;
while(!lockQs->waitersSharedLock->empty() )
{
flockEntryShared(*lockQs->waitersSharedLock->begin(), lockQs);
notifyList.push_back(*lockQs->waitersSharedLock->begin() );
lockQs->waitersLockIDs->erase(lockQs->waitersSharedLock->begin()->lockAckID);
lockQs->waitersSharedLock->pop_front();
}
return notifyList;
}
// no exclusive and no shared locks set => we can grant an exclusive lock
if(!lockQs->waitersExclLock->empty() )
{ // exclusive locks waiting => grant first one of them
flockEntryExclusive(*lockQs->waitersExclLock->begin(), lockQs);
LockEntryNotifyList notifyList;
notifyList.push_back(*lockQs->waitersExclLock->begin() );
lockQs->waitersLockIDs->erase(lockQs->waitersExclLock->begin()->lockAckID);
lockQs->waitersExclLock->pop_front();
return notifyList;
}
return {};
}
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*/
std::string FileInode::flockAppendGetAllAsStr()
{
FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericGetAllAsStr(&lockQs);
}
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*/
std::string FileInode::flockEntryGetAllAsStr()
{
FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs);
return flockEntryGenericGetAllAsStr(&lockQs);
}
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*
* Generic version shared by append and flock locking.
*/
std::string FileInode::flockEntryGenericGetAllAsStr(EntryLockQueuesContainer* lockQs)
{
UniqueRWLock lock(rwlock, SafeRWLock_READ);
std::ostringstream outStream;
outStream << "Exclusive" << std::endl;
outStream << "=========" << std::endl;
if(lockQs->exclLock->isSet() )
outStream << lockQs->exclLock->toString() << std::endl;
outStream << std::endl;
outStream << "Shared" << std::endl;
outStream << "=========" << std::endl;
for(EntryLockDetailsSetCIter iter = lockQs->sharedLocks->begin();
iter != lockQs->sharedLocks->end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Exclusive Waiters" << std::endl;
outStream << "=========" << std::endl;
for(EntryLockDetailsListCIter iter = lockQs->waitersExclLock->begin();
iter != lockQs->waitersExclLock->end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Shared Waiters" << std::endl;
outStream << "=========" << std::endl;
for(EntryLockDetailsListCIter iter = lockQs->waitersSharedLock->begin();
iter != lockQs->waitersSharedLock->end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Waiters lockIDs" << std::endl;
outStream << "=========" << std::endl;
for(StringSetCIter iter = lockQs->waitersLockIDs->begin();
iter != lockQs->waitersLockIDs->end();
iter++)
{
outStream << *iter << std::endl;
}
outStream << std::endl;
return outStream.str();
}
/**
* General wrapper for flock lock and unlock operations.
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockRangeNotifyList> FileInode::flockRange(RangeLockDetails& lockDetails)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
return flockRangeUnlocked(lockDetails);
}
/**
* General wrapper for flock lock and unlock operations.
*
* Note: Unlocked, so caller must hold the write lock.
*
 * @return first: true if the operation succeeded immediately, false if registered for waiting
 * (or failed in case of NOWAIT-flag); second: waiters that became grantable and must be notified.
*/
std::pair<bool, LockRangeNotifyList> FileInode::flockRangeUnlocked(RangeLockDetails& lockDetails)
{
bool tryNextWaiters = false;
bool immediatelyGranted = false; // return value
if(lockDetails.isCancel() )
{
// C A N C E L request
/* note: this is typically used when a client closes a file, so we remove all granted and
pending locks for the given handle here */
if(flockRangeCancelByHandle(lockDetails) )
tryNextWaiters = true;
immediatelyGranted = true;
}
else
if(lockDetails.isUnlock() )
{
// U N L O C K request
tryNextWaiters = flockRangeUnlock(lockDetails);
immediatelyGranted = true;
}
else
{
// L O C K request
// check waiters to filter duplicate requests
StringSetIter iterWaiters = waitersLockIDsRangeFLock.find(lockDetails.lockAckID);
if(iterWaiters != waitersLockIDsRangeFLock.end() )
return {false, {}}; // re-request from waiter, but still in the queue => keep on waiting
// not in waiters queue => is it granted already?
bool isGrantedAlready = flockRangeIsGranted(lockDetails);
if(isGrantedAlready)
return {true, {}}; // request was granted already
// not waiting, not granted => we have a new request
bool hasConflicts = flockRangeCheckConflicts(lockDetails, NULL);
if(!hasConflicts || lockDetails.allowsWaiting() )
tryNextWaiters = flockRangeUnlock(lockDetails); // unlock range (for lock up-/downgrades)
if(lockDetails.isShared() )
{
// S H A R E D lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockRangeShared(lockDetails);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
waitersSharedRangeFLock.push_back(lockDetails);
waitersLockIDsRangeFLock.insert(lockDetails.lockAckID);
}
}
else
{
// E X C L U S I V E lock request
if(!hasConflicts)
         { // no conflictors for this lock => can be immediately granted
flockRangeExclusive(lockDetails);
immediatelyGranted = true;
}
else
if(lockDetails.allowsWaiting() )
{ // we have conflictors and locker wants to wait
waitersExclRangeFLock.push_back(lockDetails);
waitersLockIDsRangeFLock.insert(lockDetails.lockAckID);
}
}
}
if (tryNextWaiters)
return {immediatelyGranted, flockRangeTryNextWaiters()};
return {immediatelyGranted, {}};
}
/**
* Remove all waiters from the queues.
*/
void FileInode::flockRangeCancelAllWaiters()
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
waitersLockIDsRangeFLock.clear();
waitersExclRangeFLock.clear();
waitersSharedRangeFLock.clear();
}
/**
* Unlock all locks and wait entries of the given clientID.
*/
LockRangeNotifyList FileInode::flockRangeCancelByClientID(NumNodeID clientNumID)
{
/* note: this code is in many aspects similar to flockRangeCancelByHandle(), so if you change
* something here, you probably want to change it there, too. */
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
bool tryNextWaiters = false;
// exclusive locks
for(RangeLockExclSetIter iter = exclRangeFLocks.begin();
iter != exclRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
RangeLockExclSetIter iterNext = iter;
iterNext++;
exclRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
// shared locks
for(RangeLockSharedSetIter iter = sharedRangeFLocks.begin();
iter != sharedRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
RangeLockSharedSetIter iterNext = iter;
iterNext++;
sharedRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersExclRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(iter->clientNumID == clientNumID)
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersSharedRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
if(tryNextWaiters)
return flockRangeTryNextWaiters();
return {};
}
/**
* Remove all granted and pending locks that match the given handle.
* (This is typically called by clients during file close.)
*
* Note: unlocked, so hold the mutex when calling this.
*
* @return true if locks were removed and next waiters should be tried.
*/
bool FileInode::flockRangeCancelByHandle(RangeLockDetails& lockDetails)
{
/* note: this code is in many aspects similar to flockRangeCancelByClientID(), so if you change
* something here, you probably want to change it there, too. */
bool tryNextWaiters = false;
// exclusive locks
for(RangeLockExclSetIter iter = exclRangeFLocks.begin();
iter != exclRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
RangeLockExclSetIter iterNext = iter;
iterNext++;
exclRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
// shared locks
for(RangeLockSharedSetIter iter = sharedRangeFLocks.begin();
iter != sharedRangeFLocks.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
RangeLockSharedSetIter iterNext = iter;
iterNext++;
sharedRangeFLocks.erase(iter);
iter = iterNext;
tryNextWaiters = true;
continue;
}
iter++;
}
   // waiters exclusive
for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersExclRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
// waiters shared
for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
/* iter inc'ed inside loop */ )
{
if(lockDetails.equalsHandle(*iter) )
{
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersSharedRangeFLock.erase(iter);
tryNextWaiters = true;
continue;
}
iter++;
}
return tryNextWaiters;
}
/**
* Checks if there is a conflict for the given lock (but does not actually place lock).
*
 * @param outConflictor the conflicting lock (or one of them) in case we return true.
* @return true if there is a conflict for the given lock request.
*/
bool FileInode::flockRangeGetConflictor(RangeLockDetails& lockDetails, RangeLockDetails* outConflictor)
{
UniqueRWLock lock(rwlock, SafeRWLock_READ);
return flockRangeCheckConflicts(lockDetails, outConflictor);
}
/**
* Note: see flockRangeCheckConflictsEx() for comments (this is just the simple version which
* checks the whole excl waiters queue and hence is inappropriate for tryNextWaiters() ).
*/
bool FileInode::flockRangeCheckConflicts(RangeLockDetails& lockDetails, RangeLockDetails* outConflictor)
{
return flockRangeCheckConflictsEx(lockDetails, -1, outConflictor);
}
/**
* Note: Automatically ignores self-conflicts (locks that could be up- or downgraded)
* Note: unlocked, so hold the mutex when calling this
*
* @param outConflictor first identified conflicting lock (only set if true is returned; can be
* NULL if caller is not interested)
 * @param maxExclWaitersCheckNum only needed by tryNextWaiters() to limit how many pending
 * exclusive waiters ahead of the checked element are tested for conflicts (i.e. for the 5th
 * queue element you would pass 4 here); -1 checks the whole queue, which is what all other
 * callers probably want to do.
* @return true if there is a conflict with a lock that is not owned by the current lock requestor
*/
bool FileInode::flockRangeCheckConflictsEx(RangeLockDetails& lockDetails, int maxExclWaitersCheckNum,
RangeLockDetails* outConflictor)
{
// note: we also check waiting writers here, because we have writer preference and so we don't
// want to grant access for a new reader if we have a waiting writer
// ...and we also don't want to starve writers by other writers, so we also check for
// overlapping waiting writer requests before granting a write lock
// check conflicting exclusive locks (for shared & exclusive requests)
for(RangeLockExclSetCIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end);
iterExcl++)
{
if(lockDetails.overlaps(*iterExcl) &&
!lockDetails.equalsHandle(*iterExcl) )
{
SAFE_ASSIGN(outConflictor, *iterExcl);
return true;
}
}
// no conflicting exclusive lock exists
if(lockDetails.isExclusive() )
{ // exclusive lock request: check conflicting shared locks
// check granted shared locks
for(RangeLockSharedSetCIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
iterShared++)
{
if(lockDetails.overlaps(*iterShared) &&
!lockDetails.equalsHandle(*iterShared) )
{
SAFE_ASSIGN(outConflictor, *iterShared);
return true;
}
}
}
// no conflicting shared lock exists
// check waiting writers (for shared reqs to prefer writers and for excl reqs to avoid
// writer starvation of partially overlapping waiting writers)
// (note: keep in mind that maxExclWaitersCheckNum can also be -1 for infinite checks)
for(RangeLockDetailsListCIter iter = waitersExclRangeFLock.begin();
(iter != waitersExclRangeFLock.end() ) && (maxExclWaitersCheckNum != 0);
iter++, maxExclWaitersCheckNum--)
{
if(lockDetails.overlaps(*iter) &&
!lockDetails.equalsHandle(*iter) )
{
SAFE_ASSIGN(outConflictor, *iter);
return true;
}
}
return false;
}
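/*
 * Worked example for maxExclWaitersCheckNum (comment only): assume the excl waiters queue holds
 * W0, W1, W2 in that order and flockRangeTryNextWaiters() is currently testing W2. It passes 2,
 * so only W0 and W1 (the excl waiters queued before W2) can count as conflictors; W2 itself and
 * anything queued behind it are ignored. All other callers pass -1, which never reaches 0 in the
 * loop condition above and thus checks the whole queue.
 */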
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockRangeShared(RangeLockDetails& lockDetails)
{
// insert shared lock request...
// (avoid duplicates and side-by-side locks for same file handles by merging)
for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
/* conditional iter increment inside loop */ )
{
bool incIterAtEnd = true;
if(lockDetails.equalsHandle(*iterShared) && lockDetails.isMergeable(*iterShared) )
{ // same handle => merge with existing lock
// note: all overlaps will be merged into lockDetails, so every other overlapping entry
// can be removed here
lockDetails.merge(*iterShared);
         RangeLockSharedSetIter iterSharedNext(iterShared);
iterSharedNext++;
sharedRangeFLocks.erase(iterShared);
iterShared = iterSharedNext;
incIterAtEnd = false;
}
if(incIterAtEnd)
iterShared++;
}
// actually insert the new lock
sharedRangeFLocks.insert(lockDetails);
}
/**
* Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or
* duplicates.
* Note: unlocked, so hold the mutex when calling this
*/
void FileInode::flockRangeExclusive(RangeLockDetails& lockDetails)
{
// insert excl lock request...
// (avoid duplicates and side-by-side locks for same file handles by merging)
// (note: lockDetails.end+1: because we're also looking for extensions, not only overlaps)
for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= (lockDetails.end+1) );
/* conditional iter increment inside loop */ )
{
bool incIterAtEnd = true;
if(lockDetails.equalsHandle(*iterExcl) && lockDetails.isMergeable(*iterExcl) )
{ // same handle => merge with existing lock
// note: all overlaps will be merged into lockDetails, so every other overlapping entry
// can be removed here
lockDetails.merge(*iterExcl);
RangeLockExclSetIter iterExclNext(iterExcl);
iterExclNext++;
exclRangeFLocks.erase(iterExcl);
iterExcl = iterExclNext;
incIterAtEnd = false;
}
if(incIterAtEnd)
iterExcl++;
}
// actually insert the new lock
exclRangeFLocks.insert(lockDetails);
}
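/*
 * Illustrative merge behavior of flockRangeShared()/flockRangeExclusive() (comment only, offsets
 * hypothetical): if the same handle already holds an exclusive lock on bytes [0..99] and now
 * requests [100..199], the existing entry is erased and merged into the request (presumably
 * isMergeable() also accepts this adjacent "extension" case, which is why the loop above scans
 * up to lockDetails.end+1), so a single lock [0..199] gets inserted. The shared variant merges
 * the same way, but iterates the whole shared set instead of using the end+1 bound.
 */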
/**
* Find out whether a given range lock is currently being held by the given owner.
*
* Note: unlocked, hold the read lock when calling this.
*
* @return true if the range is locked by the given owner
*/
bool FileInode::flockRangeIsGranted(RangeLockDetails& lockDetails)
{
if(lockDetails.isExclusive() )
{
for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end);
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterExcl) )
{ // lock owned by another client/process
iterExcl++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with given lock
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterExcl);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ /* given range is fully contained in a greater locked area => don't need to look any
further */
return true;
} break;
case RangeOverlapType_CONTAINS:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterExcl++;
}
} // end of exclusive locks check
else
if(lockDetails.isShared() )
{
for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterShared) )
{ // lock owned by another client/process
iterShared++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with given lock
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterShared);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ /* given lock is fully contained in a greater locked area => don't need to look any
further */
return true;
} break;
case RangeOverlapType_CONTAINS:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ /* found a range which is part of the given lock => given owner cannot currently hold
the lock for the whole given range, otherwise we wouldn't find a partial match
because of our merging => don't need to look any further */
return false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterShared++;
}
} // end of shared locks check
return false;
}
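/*
 * Illustrative outcomes of flockRangeIsGranted() (comment only, ranges hypothetical): assume the
 * calling handle holds a single exclusive lock on bytes [0..99] and asks about an exclusive lock.
 *    query [0..99]   -> RangeOverlapType_EQUALS      -> granted (true)
 *    query [10..20]  -> RangeOverlapType_ISCONTAINED -> granted (true)
 *    query [0..199]  -> RangeOverlapType_CONTAINS    -> not granted (false); merging guarantees
 *                       the owner cannot hold the missing part in another entry
 *    query [50..150] -> partial overlap (START-/ENDOVERLAP) -> not granted (false)
 */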
/**
* Note: unlocked, so hold the mutex when calling this.
*
* @return true if an existing lock has been removed
*/
bool FileInode::flockRangeUnlock(RangeLockDetails& lockDetails)
{
bool lockRemoved = false; // return value
// check exclusive locks...
// (quick path: if the whole unlock is entirely covered by an exclusive range, then we don't need
// to look any further)
for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin();
(iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end);
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterExcl) )
{ // lock owned by another client/process
iterExcl++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with unlock request
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterExcl);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
exclRangeFLocks.erase(iterExcl);
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ // unlock is fully contained in a greater locked area => don't need to look any further
// check if 1 or 2 locked areas remain (=> shrink or split)
if( (lockDetails.start == iterExcl->start) ||
(lockDetails.end == iterExcl->end) )
{ // only one locked area remains
RangeLockDetails oldExcl(*iterExcl);
oldExcl.trim(lockDetails);
exclRangeFLocks.erase(iterExcl);
exclRangeFLocks.insert(oldExcl);
}
else
{ // two locked areas remain
RangeLockDetails oldExcl(*iterExcl);
RangeLockDetails newExcl;
oldExcl.split(lockDetails, newExcl);
exclRangeFLocks.erase(iterExcl);
exclRangeFLocks.insert(oldExcl);
exclRangeFLocks.insert(newExcl);
}
return true;
} break;
case RangeOverlapType_CONTAINS:
{ // full removal of this lock, but there may still be some others that need to be removed
RangeLockExclSetIter iterExclNext(iterExcl);
iterExclNext++;
exclRangeFLocks.erase(iterExcl);
lockRemoved = true;
iterExcl = iterExclNext;
incIterAtEnd = false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ // partial removal of this lock and there may still be others that need to be removed
// note: might change start and consequently map position => re-insert excl lock
RangeLockExclSetIter iterExclNext(iterExcl);
iterExclNext++;
RangeLockDetails oldExcl(*iterExcl);
oldExcl.trim(lockDetails);
exclRangeFLocks.erase(iterExcl);
exclRangeFLocks.insert(oldExcl);
lockRemoved = true;
iterExcl = iterExclNext;
incIterAtEnd = false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterExcl++;
}
// check shared locks...
// (similar to exclusive locks, we can stop here if unlock is entirely covered by one of our
// owned shared ranges, because there cannot be another overlapping range which we also own)
for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin();
iterShared != sharedRangeFLocks.end();
/* conditional iter increment at end of loop */ )
{
if(!lockDetails.equalsHandle(*iterShared) )
{ // lock owned by another client/process
iterShared++;
continue;
}
// found a lock that is owned by the same client/process => check overlap with unlock request
bool incIterAtEnd = true;
RangeOverlapType overlap = lockDetails.overlapsEx(*iterShared);
switch(overlap)
{
case RangeOverlapType_EQUALS:
{ // found an exact match => don't need to look any further
sharedRangeFLocks.erase(iterShared);
return true;
} break;
case RangeOverlapType_ISCONTAINED:
{ // unlock is fully contained in a greater locked area => don't need to look any further
// check if 1 or 2 locked areas remain...
if( (lockDetails.start == iterShared->start) ||
(lockDetails.end == iterShared->end) )
{ // only one locked area remains
RangeLockDetails oldShared(*iterShared);
oldShared.trim(lockDetails);
sharedRangeFLocks.erase(iterShared);
sharedRangeFLocks.insert(oldShared);
}
else
{ // two locked areas remain
RangeLockDetails oldShared(*iterShared);
RangeLockDetails newShared;
oldShared.split(lockDetails, newShared);
sharedRangeFLocks.erase(iterShared);
sharedRangeFLocks.insert(oldShared);
sharedRangeFLocks.insert(newShared);
}
return true;
} break;
case RangeOverlapType_CONTAINS:
{ // full removal of this lock, but there may still be some others that need to be removed
            RangeLockSharedSetIter iterSharedNext(iterShared);
            iterSharedNext++;
            sharedRangeFLocks.erase(iterShared);
            lockRemoved = true;
            iterShared = iterSharedNext;
incIterAtEnd = false;
} break;
case RangeOverlapType_STARTOVERLAP:
case RangeOverlapType_ENDOVERLAP:
{ // partial removal of this lock and there may still be others that need to be removed
            // note: might change start and consequently map position => re-insert shared lock
            RangeLockSharedSetIter iterSharedNext(iterShared);
iterSharedNext++;
RangeLockDetails oldShared(*iterShared);
oldShared.trim(lockDetails);
sharedRangeFLocks.erase(iterShared);
sharedRangeFLocks.insert(oldShared);
lockRemoved = true;
iterShared = iterSharedNext;
incIterAtEnd = false;
} break;
default: break; // no overlap
} // end of switch(overlap)
if(incIterAtEnd)
iterShared++;
}
return lockRemoved;
}
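/*
 * Illustrative shrink/split behavior of flockRangeUnlock() (comment only, ranges hypothetical):
 * assume the handle holds one exclusive lock on bytes [0..99].
 *    unlock [0..99]   -> EQUALS      -> lock removed completely
 *    unlock [0..19]   -> ISCONTAINED with a shared boundary -> lock shrinks to [20..99]
 *    unlock [20..29]  -> ISCONTAINED without shared boundary -> lock splits into [0..19], [30..99]
 *    unlock [50..150] -> partial overlap -> lock is trimmed to [0..49] and the scan continues,
 *                        because further locks of the same owner might also overlap the unlock
 */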
/**
 * Walk the waiters queues and grant every queued request that no longer conflicts; requests that
 * still conflict remain in their queues.
 *
 * Note: unlocked, so hold the mutex when calling this.
 *
 * @return the granted lock requests, so that the caller can notify the corresponding waiters
*/
LockRangeNotifyList FileInode::flockRangeTryNextWaiters()
{
int numWaitersBefore = 0; // number of waiters in the queue before the current checked element
LockRangeNotifyList notifyList; // quick stack version to speed up the no waiter granted path
for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
/* conditional iter inc inside loop */)
{
bool hasConflict = flockRangeCheckConflictsEx(*iter, numWaitersBefore, NULL);
if(hasConflict)
{
iter++;
numWaitersBefore++;
continue;
}
// no conflict => grant lock
flockRangeExclusive(*iter);
notifyList.push_back(*iter);
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersExclRangeFLock.erase(iter);
}
for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
/* conditional iter inc inside loop */)
{
bool hasConflict = flockRangeCheckConflicts(*iter, NULL);
if(hasConflict)
{
iter++;
continue;
}
// no conflict => grant lock
flockRangeShared(*iter);
notifyList.push_back(*iter);
waitersLockIDsRangeFLock.erase(iter->lockAckID);
iter = waitersSharedRangeFLock.erase(iter);
}
return notifyList;
}
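/*
 * Illustrative scenario for flockRangeTryNextWaiters() (comment only, ranges hypothetical):
 * excl waiters = [W0: 0..99, W1: 50..59], shared waiters = [S0: 200..299], and the lock that
 * previously covered 0..99 was just released. W0 no longer conflicts and is granted; W1 still
 * overlaps the now-granted W0 and stays queued; S0 overlaps neither a granted excl lock nor a
 * queued excl waiter and is granted too. W0 and S0 end up in the returned notify list so the
 * caller can inform the corresponding waiting clients.
 */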
/**
* Generate a complete locking status overview (all granted and waiters) as human-readable string.
*/
std::string FileInode::flockRangeGetAllAsStr()
{
UniqueRWLock lock(rwlock, SafeRWLock_READ);
std::ostringstream outStream;
outStream << "Exclusive" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockExclSetCIter iter = exclRangeFLocks.begin();
iter != exclRangeFLocks.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Shared" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockSharedSetCIter iter = sharedRangeFLocks.begin();
iter != sharedRangeFLocks.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Exclusive Waiters" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockDetailsListCIter iter = waitersExclRangeFLock.begin();
iter != waitersExclRangeFLock.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Shared Waiters" << std::endl;
outStream << "=========" << std::endl;
for(RangeLockDetailsListCIter iter = waitersSharedRangeFLock.begin();
iter != waitersSharedRangeFLock.end();
iter++)
{
outStream << iter->toString() << std::endl;
}
outStream << std::endl;
outStream << "Waiters lockIDs" << std::endl;
outStream << "=========" << std::endl;
for(StringSetCIter iter = waitersLockIDsRangeFLock.begin();
iter != waitersLockIDsRangeFLock.end();
iter++)
{
outStream << *iter << std::endl;
}
outStream << std::endl;
return outStream.str();
}
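/*
 * Shape of the string produced by flockRangeGetAllAsStr() (comment only; the individual lock
 * lines depend on RangeLockDetails::toString() and are omitted here):
 *
 *    Exclusive
 *    =========
 *    <one line per granted exclusive range lock>
 *
 *    Shared
 *    =========
 *    <one line per granted shared range lock>
 *
 * "Exclusive Waiters", "Shared Waiters" and "Waiters lockIDs" follow in the same style, each
 * section separated by a blank line.
 */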
/**
 * Increase/decrease the link count of this inode.
*/
bool FileInode::incDecNumHardLinks(EntryInfo* entryInfo, int value)
{
SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); // L O C K
incDecNumHardlinksUnpersistentUnlocked(value);
// update ctime
StatData* statData = this->inodeDiskData.getInodeStatData();
statData->setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
bool retVal = storeUpdatedInodeUnlocked(entryInfo); // store on disk
if(!retVal)
{ // failed to update metadata on disk => restore old values
incDecNumHardlinksUnpersistentUnlocked(-value);
}
safeLock.unlock(); // U N L O C K
return retVal;
}
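/*
 * Illustrative call of incDecNumHardLinks() (comment only): a caller that removes one hard link
 * to this inode would pass a negative value, e.g.
 *
 *    bool ok = fileInode->incDecNumHardLinks(entryInfo, -1);
 *    if(!ok)
 *    { // disk update failed; the in-memory link count was already rolled back by the method
 *    }
 *
 * so callers only need to react to the return value, not undo the counter themselves.
 */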
bool FileInode::operator==(const FileInode& other) const
{
return inodeDiskData == other.inodeDiskData
&& fileInfoVec == other.fileInfoVec
&& exclusiveTID == other.exclusiveTID
&& numSessionsRead == other.numSessionsRead
&& numSessionsWrite == other.numSessionsWrite
&& exclAppendLock == other.exclAppendLock
&& waitersExclAppendLock == other.waitersExclAppendLock
&& waitersLockIDsAppendLock == other.waitersLockIDsAppendLock
&& exclFLock == other.exclFLock
&& sharedFLocks == other.sharedFLocks
&& waitersExclFLock == other.waitersExclFLock
&& waitersSharedFLock == other.waitersSharedFLock
&& waitersLockIDsFLock == other.waitersLockIDsFLock
&& exclRangeFLocks == other.exclRangeFLocks
&& sharedRangeFLocks == other.sharedRangeFLocks
&& waitersExclRangeFLock == other.waitersExclRangeFLock
&& waitersSharedRangeFLock == other.waitersSharedRangeFLock
&& waitersLockIDsRangeFLock == other.waitersLockIDsRangeFLock
&& dentryCompatData == other.dentryCompatData
&& numParentRefs.read() == other.numParentRefs.read()
&& referenceParentID == other.referenceParentID
&& isInlined == other.isInlined;
}
std::pair<FhgfsOpsErr, StringVector> FileInode::listXAttr()
{
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
return XAttrTk::listUserXAttrs(metaFilename);
}
std::tuple<FhgfsOpsErr, std::vector<char>, ssize_t> FileInode::getXAttr(
const std::string& xAttrName, size_t maxSize)
{
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
return XAttrTk::getUserXAttr(metaFilename, xAttrName, maxSize);
}
FhgfsOpsErr FileInode::removeXAttr(EntryInfo* entryInfo, const std::string& xAttrName)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
FhgfsOpsErr result = XAttrTk::removeUserXAttr(metaFilename, xAttrName);
if (result == FhgfsOpsErr_SUCCESS)
{
inodeDiskData.inodeStatData.setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
storeUpdatedInodeUnlocked(entryInfo, nullptr);
}
// FIXME: should resync only this xattr ON THE INODE
if (getIsBuddyMirroredUnlocked())
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addModification(metaFilename, MetaSyncFileType::Inode);
return result;
}
FhgfsOpsErr FileInode::setXAttr(EntryInfo* entryInfo, const std::string& xAttrName,
const CharVector& xAttrValue, int flags)
{
UniqueRWLock lock(rwlock, SafeRWLock_WRITE);
BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs");
const Path* inodesPath = getIsBuddyMirroredUnlocked()
? Program::getApp()->getBuddyMirrorInodesPath()
: Program::getApp()->getInodesPath();
std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(),
inodeDiskData.getEntryID());
FhgfsOpsErr result = XAttrTk::setUserXAttr(metaFilename, xAttrName, &xAttrValue[0],
xAttrValue.size(), flags);
if (result == FhgfsOpsErr_SUCCESS)
{
inodeDiskData.inodeStatData.setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec);
storeUpdatedInodeUnlocked(entryInfo, nullptr);
}
// FIXME: should resync only this xattr ON THE INODE
if (getIsBuddyMirroredUnlocked())
if (auto* resync = BuddyResyncer::getSyncChangeset())
resync->addModification(metaFilename, MetaSyncFileType::Inode);
return result;
}
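/*
 * Illustrative xattr round trip on a non-inlined file inode (comment only; attribute name, value
 * and size limit are hypothetical, error handling trimmed):
 *
 *    CharVector val = {'4', '2'};
 *    fileInode->setXAttr(entryInfo, "user.demo", val, 0); // flags as for setxattr(2)
 *    auto getRes  = fileInode->getXAttr("user.demo", 64*1024); // (error, value, size)
 *    auto listRes = fileInode->listXAttr();                    // (error, names)
 *    fileInode->removeXAttr(entryInfo, "user.demo");
 *
 * All four resolve the inode's own metadata file via MetaStorageTk::getMetaInodePath(), which is
 * why they must not be called for inlined inodes (see the BEEGFS_BUG_ON_DEBUG checks above).
 */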
void FileInode::initLocksRandomForSerializationTests()
{
Random rand;
this->exclusiveTID = rand.getNextInt();
this->numSessionsRead = rand.getNextInt();
this->numSessionsWrite = rand.getNextInt();
this->exclAppendLock.initRandomForSerializationTests();
int max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersExclAppendLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
std::string id;
StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) );
this->waitersLockIDsAppendLock.insert(id);
}
this->exclFLock.initRandomForSerializationTests();
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->sharedFLocks.insert(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersExclFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
EntryLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersSharedFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
std::string id;
StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) );
this->waitersLockIDsFLock.insert(id);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->exclRangeFLocks.insert(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->sharedRangeFLocks.insert(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersExclRangeFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
RangeLockDetails lock;
lock.initRandomForSerializationTests();
this->waitersSharedRangeFLock.push_back(lock);
}
max = rand.getNextInRange(0, 1024);
for(int i = 0; i < max; i++)
{
std::string id;
StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) );
      this->waitersLockIDsRangeFLock.insert(id);
}
StringTk::genRandomAlphaNumericString(this->referenceParentID, rand.getNextInRange(2, 30) );
this->numParentRefs.set(rand.getNextInt() );
}
/**
 * Checks whether the current file state allows the requested access and increments the
 * appropriate session counter if permitted. The entire operation occurs under a single write
 * lock to prevent races between open() operations and state validate-and-update operations.
*
* @param accessFlags OPENFILE_ACCESS_... flags
* @param bypassAccessCheck if true, skip all file state-based access checks
* @return FhgfsOpsErr_SUCCESS if file opened successfully
* FhgfsOpsErr_FILEACCESS_DENIED if file state restricts the requested access
*/
FhgfsOpsErr FileInode::checkAccessAndOpen(unsigned accessFlags, bool bypassAccessCheck)
{
RWLockGuard lock(rwlock, SafeRWLock_WRITE);
if (!bypassAccessCheck)
{
FileState state(getFileStateUnlocked());
// Fast path: Check if file is unlocked (common case)
if (unlikely(!state.isUnlocked()))
{
// File has active state restrictions - determine what access types are being requested
bool readRequested = accessFlags & (OPENFILE_ACCESS_READ | OPENFILE_ACCESS_READWRITE);
bool writeRequested = accessFlags & (OPENFILE_ACCESS_WRITE | OPENFILE_ACCESS_READWRITE |
OPENFILE_ACCESS_TRUNC);
// Access not allowed if: state implies fully locked, or
// read requested when read-locked, or write requested when write-locked
bool blockOpenRequest = state.isFullyLocked() ||
(state.isReadLocked() && readRequested) ||
(state.isWriteLocked() && writeRequested);
if (blockOpenRequest)
return FhgfsOpsErr_FILEACCESS_DENIED;
}
}
// Access allowed - increment session counter
incNumSessionsUnlocked(accessFlags);
return FhgfsOpsErr_SUCCESS;
}
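/*
 * Illustrative open-path usage of checkAccessAndOpen() (comment only; the caller-side flow is an
 * assumption, only the method itself is defined above):
 *
 *    FhgfsOpsErr openRes = fileInode->checkAccessAndOpen(OPENFILE_ACCESS_READ, false);
 *    if(openRes == FhgfsOpsErr_FILEACCESS_DENIED)
 *    { // current file state forbids reading => reject the open request
 *    }
 *    else
 *    { // open allowed; the read session counter has already been incremented
 *    }
 *
 * Whoever closes the handle later is responsible for decrementing the session counter again;
 * that is not part of this method.
 */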