#include #include #include #include #include #include #include "FileInode.h" #include "Locking.h" #include #include // shorthand for the long init line of AppendLockQueuesContainer to create on stack #define FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(varName) \ AppendLockQueuesContainer varName( \ &exclAppendLock, &waitersExclAppendLock, &waitersLockIDsAppendLock) // shorthand for the long init line of EntryLockQueuesContainer to create on stack #define FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(varName) \ EntryLockQueuesContainer varName( \ &exclFLock, &sharedFLocks, &waitersExclFLock, &waitersSharedFLock, &waitersLockIDsFLock) /** * Inode initialization. The preferred initializer. Used for loading an inode from disk */ FileInode::FileInode(std::string entryID, FileInodeStoreData* inodeDiskData, DirEntryType entryType, unsigned dentryFeatureFlags) : inodeDiskData(entryID, inodeDiskData) { this->exclusiveTID = 0; this->numSessionsRead = 0; this->numSessionsWrite = 0; initFileInfoVec(); this->dentryCompatData.entryType = entryType; this->dentryCompatData.featureFlags = dentryFeatureFlags; } /** * Note: This constructor does not perform the full initialization, so use it for * metadata loading (or similar deserialization) only. * * Note: Don't forget to call initFileInfoVec() when using this (loadFromInodeFile() includes it). */ FileInode::FileInode() { this->exclusiveTID = 0; this->numSessionsRead = 0; this->numSessionsWrite = 0; this->dentryCompatData.entryType = DirEntryType_INVALID; this->dentryCompatData.featureFlags = 0; } /** * Requires: init'ed stripe pattern, modification and last access time secs */ void FileInode::initFileInfoVec() { // create a fileInfo in the vector for each stripe node StripePattern* pattern = inodeDiskData.getStripePattern(); size_t numTargets = pattern->getStripeTargetIDs()->size(); unsigned chunkSize = pattern->getChunkSize(); unsigned chunkSizeLog2 = MathTk::log2Int32(chunkSize); uint64_t stripeSetSize = chunkSize * numTargets; int64_t lastStripeSetSize; // =fileLength%stripeSetSize (remainder after stripeSetStart) int64_t stripeSetStart; // =fileLength-stripeSetSize int64_t fullLengthPerTarget; // =stripeSetStart/numTargets (without last stripe set remainder) StatData* statData = this->inodeDiskData.getInodeStatData(); int64_t fileSize = statData->getFileSize(); /* compute stripeset start to get number of complete chunks on all nodes and stripeset remainder to compute each target's remainder in the last stripe set. */ /* note: chunkSize is definitely power of two. if numTargets is also power of two, then stripeSetSize is also power of two */ if(MathTk::isPowerOfTwo(numTargets) ) { // quick path => optimized without division/modulo lastStripeSetSize = fileSize & (stripeSetSize-1); stripeSetStart = fileSize - lastStripeSetSize; fullLengthPerTarget = stripeSetStart >> MathTk::log2Int32(numTargets); } else { // slow path => requires division/modulo lastStripeSetSize = fileSize % stripeSetSize; stripeSetStart = fileSize - lastStripeSetSize; fullLengthPerTarget = stripeSetStart / numTargets; } // walk over all targets: compute their chunk file sizes and init timestamps fileInfoVec.reserve(numTargets); // to subtract last stripe set length of pevious targets in for-loop below int64_t remainingLastSetSize = lastStripeSetSize; for(unsigned target=0; target < numTargets; target++) // iterate over all chunks / targets { int64_t targetFileLength = fullLengthPerTarget; if(remainingLastSetSize > 0) targetFileLength += BEEGFS_MIN(remainingLastSetSize, chunkSize); int64_t modificationTimeSecs = statData->getModificationTimeSecs(); int64_t lastAccessTimeSecs = statData->getLastAccessTimeSecs(); uint64_t usedBlocks; if (statData->getIsSparseFile() ) usedBlocks = statData->getTargetChunkBlocks(target); else { // estimate the number of blocks by the file size usedBlocks = targetFileLength >> StatData::BLOCK_SHIFT; } DynamicFileAttribs dynAttribs(0, targetFileLength, usedBlocks, modificationTimeSecs, lastAccessTimeSecs); ChunkFileInfo fileInfo(chunkSize, chunkSizeLog2, dynAttribs); fileInfoVec.push_back(fileInfo); remainingLastSetSize -= chunkSize; } } /* * set remote targets for FileInode */ FhgfsOpsErr FileInode::setRemoteStorageTarget(EntryInfo* entryInfo, const RemoteStorageTarget& rst) { const char* logContext = "Set Remote Storage Target (FileInode)"; FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS; SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); auto [isValid, details] = rst.validateWithDetails(); if (!isValid) { LogContext(logContext).log(Log_WARNING, "Invalid RST data: " + details); retVal = FhgfsOpsErr_INTERNAL; } else { // set file's rst now this->rstInfo.set(rst); if (this->storeRemoteStorageTargetUnlocked(entryInfo)) { if (!this->getIsRstAvailableUnlocked()) { addFeatureFlagUnlocked(FILEINODE_FEATURE_HAS_RST); if (!this->storeUpdatedInodeUnlocked(entryInfo)) retVal = FhgfsOpsErr_INTERNAL; } } else retVal = FhgfsOpsErr_INTERNAL; } safeLock.unlock(); return retVal; } FhgfsOpsErr FileInode::clearRemoteStorageTarget(EntryInfo* entryInfo) { const char* logContext = "Clear Remote Storage Target (FileInode)"; UniqueRWLock lock(rwlock, SafeRWLock_WRITE); if (!this->getIsRstAvailableUnlocked()) return FhgfsOpsErr_SUCCESS; // Clear inode feature flag and store updated inode unsigned flags = this->inodeDiskData.getInodeFeatureFlags(); flags &= ~FILEINODE_FEATURE_HAS_RST; this->inodeDiskData.setInodeFeatureFlags(flags); if (!this->storeUpdatedInodeUnlocked(entryInfo)) return FhgfsOpsErr_INTERNAL; // Clear in-memory RST info this->rstInfo.reset(); // Remove RST xattr from meta file std::string metafile = this->getMetaFilePath(entryInfo); int res = removexattr(metafile.c_str(), RST_XATTR_NAME); if (unlikely(res == -1)) { // Not reporting as error to caller because: // 1. Feature flag is already cleared in metadata // 2. In-memory state is reset // 3. Future operations will ignore xattr due to cleared flag // Just log warning msgs for any unexpected errors or missing xattr. if (errno == ENODATA) { LogContext(logContext).log(Log_WARNING, "RST xattr not found. Path: " + metafile); } else { LogContext(logContext).log(Log_WARNING, "Failed to remove RST xattr; entryID: " + entryInfo->getEntryID() + "; error: " + System::getErrString()); } } return FhgfsOpsErr_SUCCESS; } /** * Decrease number of sessions for read or write (=> file close) and update persistent * metadata. * Note: This currently includes persistent metadata update for efficiency reasons (because * we already hold the mutex lock here). * * @param accessFlags OPENFILE_ACCESS_... flags */ void FileInode::decNumSessionsAndStore(EntryInfo* entryInfo, unsigned accessFlags) { SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); if(accessFlags & OPENFILE_ACCESS_READ) { if(unlikely(!numSessionsRead) ) { LogContext log("File::decNumSessionsRead"); log.logErr( std::string("Warning: numSessionsRead is already zero. " + std::string("File: ") + getEntryIDUnlocked() ) ); } else this->numSessionsRead--; } else { // (includes read+write) if(unlikely(!numSessionsWrite) ) { LogContext log("File::decNumSessionsWrite"); log.logErr( std::string("Warning: numSessionsWrite is already zero. " + std::string("File: ") + getEntryIDUnlocked() ) ); } else this->numSessionsWrite--; } // dyn attribs have been updated during close, so we save them here storeUpdatedInodeUnlocked(entryInfo); safeLock.unlock(); } /** * Note: This version is compatible with sparse files. */ void FileInode::updateDynamicAttribs() { this->inodeDiskData.inodeStatData.updateDynamicFileAttribs(this->fileInfoVec, this->inodeDiskData.getPattern() ); } /* * Note: Current object state is used for the serialization */ void FileInode::serializeMetaData(Serializer& ser) { // note: the total amount of serialized data may not be larger than META_SERBUF_SIZE // get latest dyn attrib values updateDynamicAttribs(); NumNodeID ownerNodeID ; /* irrelevant here. The serialize will set it to ourselves for inlined * inodes */ DentryStoreData dentryDiskData(this->inodeDiskData.getEntryID(), this->dentryCompatData.entryType, ownerNodeID, this->dentryCompatData.featureFlags); DiskMetaData diskMetaData(&dentryDiskData, &this->inodeDiskData); diskMetaData.serializeFileInode(ser); } /* * Note: Applies deserialized data directly to the current object */ void FileInode::deserializeMetaData(Deserializer& des) { DentryStoreData dentryDiskData; DiskMetaData diskMetaData(&dentryDiskData, &this->inodeDiskData); diskMetaData.deserializeFileInode(des); if (!des.good()) return; { // dentry compat data // entryType this->dentryCompatData.entryType = dentryDiskData.getDirEntryType(); // (dentry) feature flags this->dentryCompatData.featureFlags = dentryDiskData.getDentryFeatureFlags(); } } /** * Note: Wrapper/chooser for storeUpdatedMetaDataBufAsXAttr/Contents. * Note: Unlocked, caller must hold write lock. * * @param buf the serialized object state that is to be stored */ bool FileInode::storeUpdatedMetaDataBuf(char* buf, unsigned bufLen) { App* app = Program::getApp(); bool useXAttrs = app->getConfig()->getStoreUseExtendedAttribs(); const Path* inodesPath = getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath() : app->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(), inodeDiskData.getEntryID()); bool result = useXAttrs ? storeUpdatedMetaDataBufAsXAttr(buf, bufLen, metaFilename) : storeUpdatedMetaDataBufAsContents(buf, bufLen, metaFilename); if (getIsBuddyMirroredUnlocked()) if (auto* resync = BuddyResyncer::getSyncChangeset()) resync->addModification(metaFilename, MetaSyncFileType::Inode); return result; } /** * Note: Don't call this directly, use the wrapper storeUpdatedMetaDataBuf(). * * @param buf the serialized object state that is to be stored */ bool FileInode::storeUpdatedMetaDataBufAsXAttr(char* buf, unsigned bufLen, std::string metaFilename) { const char* logContext = "File (store updated xattr metadata)"; // open file (create file if not already present) int openFlags = O_CREAT|O_TRUNC|O_WRONLY; int fd = open(metaFilename.c_str(), openFlags, 0644); if (unlikely(fd == -1)) { LogContext(logContext).logErr("Unable to open/create inode metafile: " + metaFilename + ". " + "SysErr: " + System::getErrString()); return false; } // write data to file int setRes = fsetxattr(fd, META_XATTR_NAME, buf, bufLen, 0); if(unlikely(setRes == -1) ) { // error LogContext(logContext).logErr("Unable to write FileInode metadata update: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); close(fd); return false; } LOG_DEBUG(logContext, 4, "File inode update stored: " + this->inodeDiskData.getEntryID() ); close(fd); return true; } /** * Stores the update to a sparate file first and then renames it. * * Note: Don't call this directly, use the wrapper storeUpdatedMetaDataBuf(). * * @param buf the serialized object state that is to be stored */ bool FileInode::storeUpdatedMetaDataBufAsContents(char* buf, unsigned bufLen, std::string metaFilename) { const char* logContext = "File (store updated inode)"; std::string metaUpdateFilename(metaFilename + META_UPDATE_EXT_STR); ssize_t writeRes; int renameRes; // open file (create it, but not O_EXCL because a former update could have failed) int openFlags = O_CREAT|O_TRUNC|O_WRONLY; int fd = open(metaUpdateFilename.c_str(), openFlags, 0644); if(fd == -1) { // error if(errno == ENOSPC) { // no free space => try again with update in-place LogContext(logContext).log(Log_DEBUG, "No space left to create update file. Trying update " "in-place: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() ); return storeUpdatedMetaDataBufAsContentsInPlace(buf, bufLen, metaFilename); } LogContext(logContext).logErr("Unable to create inode update file: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() ); return false; } // metafile created => store meta data writeRes = write(fd, buf, bufLen); if(writeRes != (ssize_t)bufLen) { if( (writeRes >= 0) || (errno == ENOSPC) ) { // no free space => try again with update in-place LogContext(logContext).log(Log_DEBUG, "No space left to write update inode. Trying update " "in-place: " + metaUpdateFilename + ". " + "SysErr: " + System::getErrString() ); close(fd); unlink(metaUpdateFilename.c_str() ); return storeUpdatedMetaDataBufAsContentsInPlace(buf, bufLen, metaFilename); } LogContext(logContext).logErr("Unable to write inode update: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); goto error_closefile; } close(fd); renameRes = rename(metaUpdateFilename.c_str(), metaFilename.c_str() ); if(renameRes == -1) { LogContext(logContext).logErr("Unable to replace old inode file: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); goto error_unlink; } LOG_DEBUG(logContext, 4, "Inode update stored: " + this->inodeDiskData.getEntryID() ); return true; // error compensation error_closefile: close(fd); error_unlink: unlink(metaUpdateFilename.c_str() ); return false; } /** * Stores the update directly to the current metadata file (instead of creating a separate file * first and renaming it). * * Note: Don't call this directly, it is automatically called by storeUpdatedMetaDataBufAsContents() * when necessary. * * @param buf the serialized object state that is to be stored */ bool FileInode::storeUpdatedMetaDataBufAsContentsInPlace(char* buf, unsigned bufLen, std::string metaFilename) { const char* logContext = "File (store updated inode in-place)"; int fallocRes; ssize_t writeRes; int truncRes; // open file (create it, but not O_EXCL because a former update could have failed) int openFlags = O_CREAT|O_WRONLY; int fd = open(metaFilename.c_str(), openFlags, 0644); if(fd == -1) { // error LogContext(logContext).logErr("Unable to open inode file: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); return false; } // make sure we have enough room to write our update fallocRes = posix_fallocate(fd, 0, bufLen); // (note: posix_fallocate does not set errno) if(fallocRes == EBADF) { // special case for XFS bug struct stat statBuf; int statRes = fstat(fd, &statBuf); if (statRes == -1) { LogContext(logContext).log(Log_WARNING, "Unexpected error: fstat() failed with SysErr: " + System::getErrString(errno)); goto error_closefile; } if (statBuf.st_size < bufLen) { LogContext(logContext).log(Log_WARNING, "File space allocation (" + StringTk::intToStr(bufLen) + ") for inode update failed: " + metaFilename + ". " + "SysErr: " + System::getErrString(fallocRes) + " " "statRes: " + StringTk::intToStr(statRes) + " " "oldSize: " + StringTk::intToStr(statBuf.st_size)); goto error_closefile; } else { // // XFS bug! We only return an error if statBuf.st_size < bufLen. Ingore fallocRes then LOG_DEBUG(logContext, Log_SPAM, "Ignoring kernel file system bug: " "posix_fallocate() failed for len < filesize"); } } else if (fallocRes != 0) { // default error handling if posix_fallocate() failed LogContext(logContext).log(Log_WARNING, "File space allocation (" + StringTk::intToStr(bufLen) + ") for inode update failed: " + metaFilename + ". " + "SysErr: " + System::getErrString(fallocRes) ); goto error_closefile; } // metafile created => store meta data writeRes = write(fd, buf, bufLen); if(writeRes != (ssize_t)bufLen) { LogContext(logContext).logErr("Unable to write inode update: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); goto error_closefile; } close(fd); // truncate in case the update lead to a smaller file size truncRes = ftruncate(fd, bufLen); if(truncRes == -1) { // ignore trunc errors LogContext(logContext).log(Log_WARNING, "Unable to truncate inode file (strange, but " "proceeding anyways): " + metaFilename + ". " + "SysErr: " + System::getErrString() ); } LOG_DEBUG(logContext, 4, "File inode update stored: " + this->inodeDiskData.getEntryID() ); return true; // error compensation error_closefile: close(fd); return false; } /** * Update the inode on disk * * Note: We already need to have a FileInode (WRITE) rwlock here */ bool FileInode::storeUpdatedInodeUnlocked(EntryInfo* entryInfo, StripePattern* updatedStripePattern) { const char* logContext = "FileInode (store updated Inode)"; bool saveRes; bool isInLined = this->isInlined; if (isInLined) { FhgfsOpsErr dentrySaveRes = storeUpdatedInlinedInodeUnlocked(entryInfo, updatedStripePattern); if (dentrySaveRes == FhgfsOpsErr_SUCCESS) return true; // dentrySaveRes != FhgfsOpsErr_SUCCESS std::string parentID = entryInfo->getParentEntryID(); std::string entryID = entryInfo->getEntryID(); std::string fileName = entryInfo->getFileName(); if (dentrySaveRes == FhgfsOpsErr_INODENOTINLINED) { /* dentrySaveRes == FhgfsOpsErr_INODENOTINLINED. Our internal inode information says the * inode is inlined, but on writing it we figure out it is not. As we we are holding a * write lock here, that never should have happened. So probably a locking bug, but not * critical here and we retry using the non-inlined way. */ LogContext(logContext).log(Log_WARNING, std::string("Inode unexpectedly not inlined: ") + "parentID: "+ parentID + " entryID: " + entryID + " fileName: " + fileName ); this->isInlined = false; } else { LogContext(logContext).log(Log_WARNING, std::string("Failed to write inlined inode: ") + "parentID: "+ parentID + " entryID: " + entryID + " fileName: " + fileName + " Error: " + boost::lexical_cast(dentrySaveRes)); #ifdef BEEGFS_DEBUG LogContext(logContext).logBacktrace(); #endif } // it now falls through to the not-inlined handling, hopefully this is goint to work } // inode not inlined // change the stripe pattern here before serializing; if (unlikely(updatedStripePattern)) { StripePattern* pattern = this->inodeDiskData.getPattern(); if (!pattern->updateStripeTargetIDs(updatedStripePattern)) LogContext(logContext).log(Log_WARNING, "Could not set requested new stripe pattern"); } char buf[META_SERBUF_SIZE]; Serializer ser(buf, sizeof(buf)); serializeMetaData(ser); if (ser.good()) saveRes = storeUpdatedMetaDataBuf(buf, ser.size()); else saveRes = false; if (!saveRes && isInlined) { LogContext(logContext).log(Log_WARNING, std::string("Trying to write as non-inlined inode " "also failed.") ); } return saveRes; } /** * Update an inode, which is inlined into a dentry */ FhgfsOpsErr FileInode::storeUpdatedInlinedInodeUnlocked(EntryInfo* entryInfo, StripePattern* updatedStripePattern) { const char* logContext = "DirEntry (storeUpdatedInode)"; App* app = Program::getApp(); // get latest dyn attrib vals... updateDynamicAttribs(); std::string parentEntryID = entryInfo->getParentEntryID(); const Path* dentriesPath = entryInfo->getIsBuddyMirrored() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath(); std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentriesPath->str(), parentEntryID); FileInodeStoreData* inodeDiskData = this->getInodeDiskData(); if (unlikely(updatedStripePattern)) { // note: We do not set the complete stripe pattern here, but only the stripe target IDs if (! inodeDiskData->getPattern()->updateStripeTargetIDs(updatedStripePattern)) LogContext(logContext).log(Log_WARNING, "Could not set new stripe target IDs."); } DirEntry dirEntry(entryInfo->getEntryType(), entryInfo->getFileName(), entryInfo->getEntryID(), entryInfo->getOwnerNodeID() ); /* Note: As we are called from FileInode most data of this DirEntry are unknown and we need to * load it from disk. */ bool loadRes = dirEntry.loadFromID(dirEntryPath, entryInfo->getEntryID() ); if (!loadRes) return FhgfsOpsErr_INTERNAL; FileInodeStoreData* entryInodeDiskData = dirEntry.getInodeStoreData(); entryInodeDiskData->setFileInodeStoreData(inodeDiskData); FhgfsOpsErr retVal = dirEntry.storeUpdatedInode(dirEntryPath); return retVal; } std::string FileInode::getMetaFilePath(EntryInfo* entryInfo) { App* app = Program::getApp(); if (isInlined) { const Path* dentriesPath = getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath(); std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath( dentriesPath->str(), entryInfo->getParentEntryID()); return MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) + entryInfo->getEntryID(); } const Path* inodesPath = getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath() : app->getInodesPath(); return MetaStorageTk::getMetaInodePath(inodesPath->str(), entryInfo->getEntryID()); } bool FileInode::storeRemoteStorageTargetUnlocked(EntryInfo* entryInfo) { std::string metafile = getMetaFilePath(entryInfo); char buf[META_SERBUF_SIZE]; Serializer ser(buf, sizeof(buf)); ser % rstInfo; if (!ser.good()) return false; bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs(); if (useXAttrs) return storeRemoteStorageTargetBufAsXAttr(buf, ser.size(), metafile); else { LOG(GENERAL, WARNING, "Storing RST info as file contents is unsupported. " "Please check the 'storeUseExtendedAttribs' setting in the BeeGFS meta config"); return false; } } bool FileInode::storeRemoteStorageTargetBufAsXAttr(char* buf, unsigned bufLen, const std::string& metafilename) { const char* logContext = "FileInode (store remote storage target as xattr)"; int setRes = setxattr(metafilename.c_str(), RST_XATTR_NAME, buf, bufLen, 0); if (unlikely(setRes == -1)) { // error LogContext(logContext).logErr("Unable to write remote storage target info to disk: " + metafilename + ". SysErr: " + System::getErrString()); return false; } return true; } bool FileInode::removeStoredMetaData(const std::string& id, bool isBuddyMirrored) { const char* logContext = "FileInode (remove stored metadata)"; App* app = Program::getApp(); std::string inodeFilename = MetaStorageTk::getMetaInodePath( isBuddyMirrored ? app->getBuddyMirrorInodesPath()->str() : app->getInodesPath()->str(), id); // delete metadata file int unlinkRes = unlink(inodeFilename.c_str() ); /* ignore errno == ENOENT as the file does not exist anymore for whatever reasons. Although * unlink() failed, we do not have to care, as our goal is still reached. This is also about * inode removal, if the dir-entry also does not exist, the application still will get * the right error code */ if(unlinkRes == -1 && errno != ENOENT) { // error LogContext(logContext).logErr("Unable to delete inode file: " + inodeFilename + ". " + "SysErr: " + System::getErrString() ); return false; } LOG_DEBUG(logContext, 4, "Inode file deleted: " + inodeFilename); if (isBuddyMirrored) if (auto* resync = BuddyResyncer::getSyncChangeset()) resync->addDeletion(inodeFilename, MetaSyncFileType::Inode); return true; } /** * Note: Wrapper/chooser for loadFromFileXAttr/Contents. * Note: This also (indirectly) calls initFileInfoVec() */ bool FileInode::loadFromInodeFile(EntryInfo* entryInfo) { bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs(); if(useXAttrs) return loadFromFileXAttr(entryInfo->getEntryID(), entryInfo->getIsBuddyMirrored() ); return loadFromFileContents(entryInfo->getEntryID(), entryInfo->getIsBuddyMirrored() ); } /** * Note: Don't call this directly, use the wrapper loadFromInodeFile(). * Note: This also calls initFileInfoVec() */ bool FileInode::loadFromFileXAttr(const std::string& id, bool isBuddyMirrored) { const char* logContext = "File inode (load from xattr file)"; App* app = Program::getApp(); const Path* inodePath = isBuddyMirrored ? app->getBuddyMirrorInodesPath() : app->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id); bool retVal = false; char buf[META_SERBUF_SIZE]; ssize_t getRes = getxattr(metaFilename.c_str(), META_XATTR_NAME, buf, META_SERBUF_SIZE); if(getRes > 0) { // we got something => deserialize it Deserializer des(buf, getRes); deserializeMetaData(des); if(unlikely(!des.good())) { // deserialization failed LogContext(logContext).logErr("Unable to deserialize metadata in file: " + metaFilename); goto error_exit; } // deserialization successful => init dyn attribs initFileInfoVec(); /* note: this can only be done after the stripePattern has been initialized, that's why we do it here at this "unusual" place. */ retVal = true; } else if( (getRes == -1) && (errno == ENOENT) ) { // file not exists LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, "Inode file not exists: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); } else { // unhandled error LogContext(logContext).logErr("Unable to open/read inode file: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); } error_exit: return retVal; } /** * Note: Don't call this directly, use the wrapper loadFromInodeFile(). * Note: This also calls initFileInfoVec() */ bool FileInode::loadFromFileContents(const std::string& id, bool isBuddyMirrored) { const char* logContext = "File inode (load from file)"; App* app = Program::getApp(); const Path* inodePath = isBuddyMirrored ? app->getBuddyMirrorInodesPath() : app->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id); bool retVal = false; int openFlags = O_NOATIME | O_RDONLY; int fd = open(metaFilename.c_str(), openFlags, 0); if(fd == -1) { // open failed if(errno != ENOENT) LogContext(logContext).logErr("Unable to open inode file: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); return false; } char buf[META_SERBUF_SIZE]; int readRes = read(fd, buf, META_SERBUF_SIZE); if(readRes <= 0) { // reading failed LogContext(logContext).logErr("Unable to read inode file: " + metaFilename + ". " + "SysErr: " + System::getErrString() ); } else { Deserializer des(buf, readRes); deserializeMetaData(des); if(!des.good()) { // deserialization failed LogContext(logContext).logErr("Unable to deserialize inode in file: " + metaFilename); } else { // deserialization successful => init dyn attribs initFileInfoVec(); // note: this can only be done after the stripePattern // has been initialized, that's why we do it here at this "unusual" place retVal = true; } } close(fd); return retVal; } bool FileInode::loadRstFromInodeFile(EntryInfo* entryInfo) { bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs(); if (useXAttrs) return loadRstFromFileXAttr(entryInfo); return false; } bool FileInode::loadRstFromFileXAttr(EntryInfo* entryInfo) { const char* logContext = "File inode RST (load from xattr file)"; App* app = Program::getApp(); std::string metafile; if (isInlined) { const Path* dentriesPath = getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath(); std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentriesPath->str(), entryInfo->getParentEntryID()); metafile = MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) + entryInfo->getEntryID(); } else { const Path* inodesPath = getIsBuddyMirroredUnlocked() ? app->getBuddyMirrorInodesPath(): app->getInodesPath(); metafile = MetaStorageTk::getMetaInodePath(inodesPath->str(), entryInfo->getEntryID()); } char buf[META_SERBUF_SIZE]; ssize_t getRes = getxattr(metafile.c_str(), RST_XATTR_NAME, buf, META_SERBUF_SIZE); if (getRes > 0) { // we got something => deserialize it Deserializer des(buf, getRes); des % this->rstInfo; if (unlikely(!des.good())) { // deserialization failed LogContext(logContext).logErr("Unable to deserialize remote storage targets" ", file: " + metafile); return false; } return true; } else if( (getRes == -1) && (errno == ENOENT) ) { // file not exists LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, "Inode file not exists: " + metafile + ". " + "SysErr: " + System::getErrString() ); } else { // unhandled error LogContext(logContext).logErr("Unable to open/read inode file: " + metafile + ". " + "SysErr: " + System::getErrString() ); } return false; } /** * Create an inode from an entryInfo. * * Note: The entryInfo indicates if the inode is inlined or not. However, this information * might be outdated and so we need to try inlined and file-inode access, if creating * the inode failed. * We here rely on kernel lookup calls, to update client side entryInfo data. */ FileInode* FileInode::createFromEntryInfo(EntryInfo* entryInfo) { FileInode* inode; if (entryInfo->getIsInlined() ) { /* entryInfo indicates the inode is inlined. So first try to get the inode by * dir-entry inlined inode and if that failes try again with an inode-file. */ inode = createFromInlinedInode(entryInfo); if (!inode) inode = createFromInodeFile(entryInfo); } else { /* entryInfo indicates the inode is not inlined, but a separate inode-file. So first * try to get the inode by inode-file and only if that fails try again with the dir-entry, * maybe the inode was re-inlined. */ inode = createFromInodeFile(entryInfo); if (!inode) inode = createFromInlinedInode(entryInfo); } if (likely(inode) && inode->getIsRstAvailableUnlocked()) inode->loadRstFromInodeFile(entryInfo); return inode; } /** * Inode from inode file (inode is not inlined) * * Note: Do not call directly, but use FileInode::createFromEntryInfo() */ FileInode* FileInode::createFromInodeFile(EntryInfo* entryInfo) { FileInode* newInode = new FileInode(); bool loadRes = newInode->loadFromInodeFile(entryInfo); if(!loadRes) { delete(newInode); return NULL; } newInode->setIsInlinedUnlocked(false); return newInode; } /** * Inode from dir-entry with inlined inode. * * Note: Do not call directly, but use FileInode::createFromEntryInfo() */ FileInode* FileInode::createFromInlinedInode(EntryInfo* entryInfo) { App* app = Program::getApp(); std::string parentEntryID = entryInfo->getParentEntryID(); const Path* dentryPath = entryInfo->getIsBuddyMirrored() ? app->getBuddyMirrorDentriesPath() : app->getDentriesPath(); std::string dirEntryPath = MetaStorageTk::getMetaDirEntryPath(dentryPath->str(), parentEntryID); DirEntry dirEntry(entryInfo->getEntryType(), entryInfo->getFileName(), entryInfo->getEntryID(), entryInfo->getOwnerNodeID() ); FileInode* newInode = dirEntry.createInodeByID(dirEntryPath, entryInfo); if (newInode) newInode->setIsInlinedUnlocked(true); return newInode; } /** * Update entry attributes like chmod() etc. do it. * * Note: modificationTimeSecs and lastAccessTimeSecs are dynamic attribs, so they require a special handling by the caller (but we also set the static attribs here). * @param validAttribs SETATTR_CHANGE_...-Flags, but maybe 0, if we only want to update * AttribChangeTimeSecs. * @param attribs new attributes, but may be NULL if validAttribs == 0 */ bool FileInode::setAttrData(EntryInfo * entryInfo, int validAttribs, SettableFileAttribs* attribs) { bool success = true; SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); // L O C K /* note: modificationTimeSecs and lastAccessTimeSecs are dynamic attribs, so they require a special handling by the caller (i.e. to also update chunk files) */ // save old attribs StatData* statData = this->inodeDiskData.getInodeStatData(); SettableFileAttribs oldAttribs = *(statData->getSettableFileAttribs() ); statData->setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec); if(validAttribs) { // apply new attribs wrt flags... if(validAttribs & SETATTR_CHANGE_MODE) statData->setMode(attribs->mode); if(validAttribs & SETATTR_CHANGE_MODIFICATIONTIME) { /* only static value update required for storeUpdatedInodeUnlocked() */ statData->setModificationTimeSecs(attribs->modificationTimeSecs); } if(validAttribs & SETATTR_CHANGE_LASTACCESSTIME) { /* only static value update required for storeUpdatedInodeUnlocked() */ statData->setLastAccessTimeSecs(attribs->lastAccessTimeSecs); } if(validAttribs & SETATTR_CHANGE_USERID) { statData->setUserID(attribs->userID); if ((attribs->userID != this->inodeDiskData.getOrigUID() ) && (this->inodeDiskData.getOrigFeature() == FileInodeOrigFeature_TRUE) ) addFeatureFlagUnlocked(FILEINODE_FEATURE_HAS_ORIG_UID); } if(validAttribs & SETATTR_CHANGE_GROUPID) statData->setGroupID(attribs->groupID); } bool storeRes = storeUpdatedInodeUnlocked(entryInfo); // store on disk if(!storeRes) { // failed to update metadata on disk => restore old values statData->setSettableFileAttribs(oldAttribs); success = false; goto err_unlock; } // persistent update succeeded // update attribs vec (wasn't done earlier because of backup overhead for restore on error) if(validAttribs & SETATTR_CHANGE_MODIFICATIONTIME) { for(size_t i=0; i < fileInfoVec.size(); i++) fileInfoVec[i].getRawDynAttribs()->modificationTimeSecs = attribs->modificationTimeSecs; } if(validAttribs & SETATTR_CHANGE_LASTACCESSTIME) { for(size_t i=0; i < fileInfoVec.size(); i++) fileInfoVec[i].getRawDynAttribs()->lastAccessTimeSecs = attribs->lastAccessTimeSecs; } err_unlock: safeLock.unlock(); // U N L O C K return success; } /** * General wrapper for append lock and unlock operations. * * Append supports exclusive locking only, no shared locks. * * Note: Unlocks are always immediately granted (=> they always return "true"). * * @return true if operation succeeded immediately; false if registered for waiting (or failed in * case of NOWAIT-flag) */ std::pair FileInode::flockAppend(EntryLockDetails& lockDetails) { FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs); UniqueRWLock lock(rwlock, SafeRWLock_WRITE); return flockEntryUnlocked(lockDetails, &lockQs); } /** * General wrapper for flock lock and unlock operations. * * Note: Unlocks are always immediately granted (=> they always return "true"). * * @return true if operation succeeded immediately; false if registered for waiting (or failed in * case of NOWAIT-flag) */ std::pair FileInode::flockEntry(EntryLockDetails& lockDetails) { FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs); UniqueRWLock lock(rwlock, SafeRWLock_WRITE); return flockEntryUnlocked(lockDetails, &lockQs); } /** * General wrapper for flock lock and unlock operations. * * Note: Unlocks are always immediately granted (=> they always return "true"). * Note: Unlocked version => caller must hold write lock. * * @return true if operation succeeded immediately; false if registered for waiting (or failed in * case of NOWAIT-flag) */ std::pair FileInode::flockEntryUnlocked(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs) { bool tryNextWaiters = false; bool immediatelyGranted = false; // return value if(lockDetails.isCancel() ) { // C A N C E L request /* note: this is typically used when a client closes a file, so we remove all granted and pending locks for the given handle here */ if(flockEntryCancelByHandle(lockDetails, lockQs) ) tryNextWaiters = true; immediatelyGranted = true; } else if(lockDetails.isUnlock() ) { // U N L O C K request tryNextWaiters = flockEntryUnlock(lockDetails, lockQs); immediatelyGranted = true; } else { // L O C K request // check waiters to filter duplicate requests StringSetIter iterWaiters = lockQs->waitersLockIDs->find(lockDetails.lockAckID); if(iterWaiters != lockQs->waitersLockIDs->end() ) return {false, {}}; // re-request from waiter, but still in the queue => keep on waiting // not in waiters queue => is it granted already? bool isGrantedAlready = flockEntryIsGranted(lockDetails, lockQs); if(isGrantedAlready) return {true, {}}; // request was granted already // not waiting, not granted => we have a new request bool hasConflicts = flockEntryCheckConflicts(lockDetails, lockQs, NULL); if(!hasConflicts || lockDetails.allowsWaiting() ) tryNextWaiters = flockEntryUnlock(lockDetails, lockQs); // unlock (for lock up-/downgrades) if(lockDetails.isShared() ) { // S H A R E D lock request if(!hasConflicts) { // no confictors for this lock => can be immediately granted flockEntryShared(lockDetails, lockQs); immediatelyGranted = true; } else if(lockDetails.allowsWaiting() ) { // we have conflictors and locker wants to wait lockQs->waitersSharedLock->push_back(lockDetails); lockQs->waitersLockIDs->insert(lockDetails.lockAckID); } } else { // E X C L U S I V E lock request if(!hasConflicts) { // no confictors for this lock => can be immediately granted flockEntryExclusive(lockDetails, lockQs); immediatelyGranted = true; } else if(lockDetails.allowsWaiting() ) { // we have conflictors and locker wants to wait lockQs->waitersExclLock->push_back(lockDetails); lockQs->waitersLockIDs->insert(lockDetails.lockAckID); } } } if (tryNextWaiters) return {immediatelyGranted, flockEntryTryNextWaiters(lockQs)}; return {immediatelyGranted, {}}; } /** * Remove all waiters from the queues. */ void FileInode::flockAppendCancelAllWaiters() { FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs); flockEntryGenericCancelAllWaiters(&lockQs); } /** * Remove all waiters from the queues. */ void FileInode::flockEntryCancelAllWaiters() { FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs); flockEntryGenericCancelAllWaiters(&lockQs); } /** * Remove all waiters from the queues. * * Generic version shared by append and flock locking. */ void FileInode::flockEntryGenericCancelAllWaiters(EntryLockQueuesContainer* lockQs) { UniqueRWLock lock(rwlock, SafeRWLock_WRITE); lockQs->waitersLockIDs->clear(); lockQs->waitersExclLock->clear(); lockQs->waitersSharedLock->clear(); } /** * Unlock all locks and wait entries of the given clientID. */ LockEntryNotifyList FileInode::flockAppendCancelByClientID(NumNodeID clientID) { FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs); return flockEntryGenericCancelByClientID(clientID, &lockQs); } /** * Unlock all locks and wait entries of the given clientID. */ LockEntryNotifyList FileInode::flockEntryCancelByClientID(NumNodeID clientID) { FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs); return flockEntryGenericCancelByClientID(clientID, &lockQs); } /** * Unlock all locks and wait entries of the given clientID. * * Generic version shared by append and flock locking. */ LockEntryNotifyList FileInode::flockEntryGenericCancelByClientID(NumNodeID clientNumID, EntryLockQueuesContainer* lockQs) { /* note: this code is in many aspects similar to flockEntryCancelByHandle(), so if you change * something here, you probably want to change it there, too. */ UniqueRWLock lock(rwlock, SafeRWLock_WRITE); bool tryNextWaiters = false; // exclusive lock if(lockQs->exclLock->isSet() && (lockQs->exclLock->clientNumID == clientNumID) ) { *lockQs->exclLock = {}; tryNextWaiters = true; } // shared locks for(EntryLockDetailsSetIter iter = lockQs->sharedLocks->begin(); iter != lockQs->sharedLocks->end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { EntryLockDetailsSetIter iterNext = iter; iterNext++; lockQs->sharedLocks->erase(iter); iter = iterNext; tryNextWaiters = true; continue; } iter++; } // waiters exlusive for(EntryLockDetailsListIter iter = lockQs->waitersExclLock->begin(); iter != lockQs->waitersExclLock->end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { lockQs->waitersLockIDs->erase(iter->lockAckID); iter = lockQs->waitersExclLock->erase(iter); tryNextWaiters = true; continue; } iter++; } // waiters shared for(EntryLockDetailsListIter iter = lockQs->waitersSharedLock->begin(); iter != lockQs->waitersSharedLock->end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { lockQs->waitersLockIDs->erase(iter->lockAckID); iter = lockQs->waitersSharedLock->erase(iter); tryNextWaiters = true; continue; } iter++; } if (tryNextWaiters) return flockEntryTryNextWaiters(lockQs); return {}; } /** * Remove all granted and pending locks that match the given handle. * (This is typically called by clients during file close.) * * Note: unlocked, so hold the mutex when calling this. * * @return true if locks were removed and next waiters should be tried. */ bool FileInode::flockEntryCancelByHandle(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs) { /* note: this code is in many aspects similar to flockEntryCancelByClientID(), so if you change * something here, you probably want to change it there, too. */ bool tryNextWaiters = false; // exclusive lock if(lockQs->exclLock->isSet() && lockDetails.equalsHandle(*lockQs->exclLock) ) { *lockQs->exclLock = {}; tryNextWaiters = true; } // shared locks for(EntryLockDetailsSetIter iter = lockQs->sharedLocks->begin(); iter != lockQs->sharedLocks->end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { EntryLockDetailsSetIter iterNext = iter; iterNext++; lockQs->sharedLocks->erase(iter); iter = iterNext; tryNextWaiters = true; continue; } iter++; } // waiters exlusive for(EntryLockDetailsListIter iter = lockQs->waitersExclLock->begin(); iter != lockQs->waitersExclLock->end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { lockQs->waitersLockIDs->erase(iter->lockAckID); iter = lockQs->waitersExclLock->erase(iter); tryNextWaiters = true; continue; } iter++; } // waiters shared for(EntryLockDetailsListIter iter = lockQs->waitersSharedLock->begin(); iter != lockQs->waitersSharedLock->end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { lockQs->waitersLockIDs->erase(iter->lockAckID); iter = lockQs->waitersSharedLock->erase(iter); tryNextWaiters = true; continue; } iter++; } return tryNextWaiters; } /** * Note: Automatically ignores self-conflicts (locks that could be up- or downgraded). * Note: Make sure to remove lock duplicates before calling this. * Note: unlocked, so hold the mutex when calling this. * * @param outConflictor first identified conflicting lock (only set if true is returned; can be * NULL if caller is not interested) * @return true if there is a conflict with a lock that is not owned by the current lock requestor, * false if the request can defintely be granted immediately without waiting */ bool FileInode::flockEntryCheckConflicts(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs, EntryLockDetails* outConflictor) { // note: we also check waiting writers here, because we have writer preference and so we don't // want to grant access for a new reader if we have a waiting writer // check conflicting exclusive lock (for shared & eclusive requests) if(lockQs->exclLock->isSet() && !lockQs->exclLock->equalsHandle(lockDetails) ) { SAFE_ASSIGN(outConflictor, *lockQs->exclLock); return true; } // no exclusive lock exists if(lockDetails.isExclusive() ) { // exclusive lock request: check conflicting shared lock for(EntryLockDetailsSetCIter iterShared = lockQs->sharedLocks->begin(); iterShared != lockQs->sharedLocks->end(); iterShared++) { if(!iterShared->equalsHandle(lockDetails) ) { // found a conflicting lock SAFE_ASSIGN(outConflictor, *iterShared); return true; } } } else { // non-exclusive lock: check for waiting writers to enforce writer preference if(!lockQs->waitersExclLock->empty() ) { SAFE_ASSIGN(outConflictor, *lockQs->waitersExclLock->begin() ); return true; } } return false; } /** * Find out whether a given lock is currently being held by the given owner. * * Note: unlocked, hold the read lock when calling this. * * @return true if the given lock is being held by the given owner. */ bool FileInode::flockEntryIsGranted(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs) { if(lockDetails.isExclusive() ) { if(lockQs->exclLock->equalsHandle(lockDetails) ) { // was an exclusive lock return true; } } else if(lockDetails.isShared() ) { EntryLockDetailsSetIter iterShared = lockQs->sharedLocks->find(lockDetails); if(iterShared != lockQs->sharedLocks->end() ) { // was a shared lock return true; } } return false; } /** * Note: unlocked, so hold the write lock when calling this. * * @return true if an existing lock was released */ bool FileInode::flockEntryUnlock(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs) { if(lockQs->exclLock->equalsHandle(lockDetails) ) { // was an exclusive lock *lockQs->exclLock = {}; return true; } EntryLockDetailsSetIter iterShared = lockQs->sharedLocks->find(lockDetails); if(iterShared != lockQs->sharedLocks->end() ) { // was a shared lock lockQs->sharedLocks->erase(iterShared); return true; } return false; } /** * Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or * duplicates. * Note: unlocked, so hold the mutex when calling this */ void FileInode::flockEntryShared(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs) { lockQs->sharedLocks->insert(lockDetails); } /** * Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or * duplicates. * Note: unlocked, so hold the mutex when calling this */ void FileInode::flockEntryExclusive(EntryLockDetails& lockDetails, EntryLockQueuesContainer* lockQs) { *lockQs->exclLock = lockDetails; } /** * Remove next requests from waiters queue and try to grant it - until we reach an entry that * cannot be granted immediately. * * Note: We assume that duplicate waiters and duplicate granted locks (up-/downgrades) have been * removed before a lock request is enqueued, so we don't check for that. * * Note: FileInode must be already write-locked by the caller! */ LockEntryNotifyList FileInode::flockEntryTryNextWaiters(EntryLockQueuesContainer* lockQs) { /* note: we have writer preference, so we don't grant any new readers while we have waiting writers */ if(lockQs->exclLock->isSet() ) return {}; // eclusive lock => there's nothing we can do right now // no exclusive lock set if(!lockQs->waitersSharedLock->empty() && lockQs->waitersExclLock->empty() ) { // shared locks waiting and no exclusive locks waiting => grant all LockEntryNotifyList notifyList; while(!lockQs->waitersSharedLock->empty() ) { flockEntryShared(*lockQs->waitersSharedLock->begin(), lockQs); notifyList.push_back(*lockQs->waitersSharedLock->begin() ); lockQs->waitersLockIDs->erase(lockQs->waitersSharedLock->begin()->lockAckID); lockQs->waitersSharedLock->pop_front(); } return notifyList; } // no exclusive and no shared locks set => we can grant an exclusive lock if(!lockQs->waitersExclLock->empty() ) { // exclusive locks waiting => grant first one of them flockEntryExclusive(*lockQs->waitersExclLock->begin(), lockQs); LockEntryNotifyList notifyList; notifyList.push_back(*lockQs->waitersExclLock->begin() ); lockQs->waitersLockIDs->erase(lockQs->waitersExclLock->begin()->lockAckID); lockQs->waitersExclLock->pop_front(); return notifyList; } return {}; } /** * Generate a complete locking status overview (all granted and waiters) as human-readable string. */ std::string FileInode::flockAppendGetAllAsStr() { FILEINODE_APPEND_LOCK_QUEUES_CONTAINER(lockQs); return flockEntryGenericGetAllAsStr(&lockQs); } /** * Generate a complete locking status overview (all granted and waiters) as human-readable string. */ std::string FileInode::flockEntryGetAllAsStr() { FILEINODE_ENTRY_LOCK_QUEUES_CONTAINER(lockQs); return flockEntryGenericGetAllAsStr(&lockQs); } /** * Generate a complete locking status overview (all granted and waiters) as human-readable string. * * Generic version shared by append and flock locking. */ std::string FileInode::flockEntryGenericGetAllAsStr(EntryLockQueuesContainer* lockQs) { UniqueRWLock lock(rwlock, SafeRWLock_READ); std::ostringstream outStream; outStream << "Exclusive" << std::endl; outStream << "=========" << std::endl; if(lockQs->exclLock->isSet() ) outStream << lockQs->exclLock->toString() << std::endl; outStream << std::endl; outStream << "Shared" << std::endl; outStream << "=========" << std::endl; for(EntryLockDetailsSetCIter iter = lockQs->sharedLocks->begin(); iter != lockQs->sharedLocks->end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Exclusive Waiters" << std::endl; outStream << "=========" << std::endl; for(EntryLockDetailsListCIter iter = lockQs->waitersExclLock->begin(); iter != lockQs->waitersExclLock->end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Shared Waiters" << std::endl; outStream << "=========" << std::endl; for(EntryLockDetailsListCIter iter = lockQs->waitersSharedLock->begin(); iter != lockQs->waitersSharedLock->end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Waiters lockIDs" << std::endl; outStream << "=========" << std::endl; for(StringSetCIter iter = lockQs->waitersLockIDs->begin(); iter != lockQs->waitersLockIDs->end(); iter++) { outStream << *iter << std::endl; } outStream << std::endl; return outStream.str(); } /** * General wrapper for flock lock and unlock operations. * * @return true if operation succeeded immediately; false if registered for waiting (or failed in * case of NOWAIT-flag) */ std::pair FileInode::flockRange(RangeLockDetails& lockDetails) { UniqueRWLock lock(rwlock, SafeRWLock_WRITE); return flockRangeUnlocked(lockDetails); } /** * General wrapper for flock lock and unlock operations. * * Note: Unlocked, so caller must hold the write lock. * * @return true if operation succeeded immediately; false if registered for waiting (or failed in * case of NOWAIT-flag) */ std::pair FileInode::flockRangeUnlocked(RangeLockDetails& lockDetails) { bool tryNextWaiters = false; bool immediatelyGranted = false; // return value if(lockDetails.isCancel() ) { // C A N C E L request /* note: this is typically used when a client closes a file, so we remove all granted and pending locks for the given handle here */ if(flockRangeCancelByHandle(lockDetails) ) tryNextWaiters = true; immediatelyGranted = true; } else if(lockDetails.isUnlock() ) { // U N L O C K request tryNextWaiters = flockRangeUnlock(lockDetails); immediatelyGranted = true; } else { // L O C K request // check waiters to filter duplicate requests StringSetIter iterWaiters = waitersLockIDsRangeFLock.find(lockDetails.lockAckID); if(iterWaiters != waitersLockIDsRangeFLock.end() ) return {false, {}}; // re-request from waiter, but still in the queue => keep on waiting // not in waiters queue => is it granted already? bool isGrantedAlready = flockRangeIsGranted(lockDetails); if(isGrantedAlready) return {true, {}}; // request was granted already // not waiting, not granted => we have a new request bool hasConflicts = flockRangeCheckConflicts(lockDetails, NULL); if(!hasConflicts || lockDetails.allowsWaiting() ) tryNextWaiters = flockRangeUnlock(lockDetails); // unlock range (for lock up-/downgrades) if(lockDetails.isShared() ) { // S H A R E D lock request if(!hasConflicts) { // no confictors for this lock => can be immediately granted flockRangeShared(lockDetails); immediatelyGranted = true; } else if(lockDetails.allowsWaiting() ) { // we have conflictors and locker wants to wait waitersSharedRangeFLock.push_back(lockDetails); waitersLockIDsRangeFLock.insert(lockDetails.lockAckID); } } else { // E X C L U S I V E lock request if(!hasConflicts) { // no confictors for this lock => can be immediately granted flockRangeExclusive(lockDetails); immediatelyGranted = true; } else if(lockDetails.allowsWaiting() ) { // we have conflictors and locker wants to wait waitersExclRangeFLock.push_back(lockDetails); waitersLockIDsRangeFLock.insert(lockDetails.lockAckID); } } } if (tryNextWaiters) return {immediatelyGranted, flockRangeTryNextWaiters()}; return {immediatelyGranted, {}}; } /** * Remove all waiters from the queues. */ void FileInode::flockRangeCancelAllWaiters() { UniqueRWLock lock(rwlock, SafeRWLock_WRITE); waitersLockIDsRangeFLock.clear(); waitersExclRangeFLock.clear(); waitersSharedRangeFLock.clear(); } /** * Unlock all locks and wait entries of the given clientID. */ LockRangeNotifyList FileInode::flockRangeCancelByClientID(NumNodeID clientNumID) { /* note: this code is in many aspects similar to flockRangeCancelByHandle(), so if you change * something here, you probably want to change it there, too. */ UniqueRWLock lock(rwlock, SafeRWLock_WRITE); bool tryNextWaiters = false; // exclusive locks for(RangeLockExclSetIter iter = exclRangeFLocks.begin(); iter != exclRangeFLocks.end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { RangeLockExclSetIter iterNext = iter; iterNext++; exclRangeFLocks.erase(iter); iter = iterNext; tryNextWaiters = true; continue; } iter++; } // shared locks for(RangeLockSharedSetIter iter = sharedRangeFLocks.begin(); iter != sharedRangeFLocks.end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { RangeLockSharedSetIter iterNext = iter; iterNext++; sharedRangeFLocks.erase(iter); iter = iterNext; tryNextWaiters = true; continue; } iter++; } // waiters exlusive for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin(); iter != waitersExclRangeFLock.end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { waitersLockIDsRangeFLock.erase(iter->lockAckID); iter = waitersExclRangeFLock.erase(iter); tryNextWaiters = true; continue; } iter++; } // waiters shared for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin(); iter != waitersSharedRangeFLock.end(); /* iter inc'ed inside loop */ ) { if(iter->clientNumID == clientNumID) { waitersLockIDsRangeFLock.erase(iter->lockAckID); iter = waitersSharedRangeFLock.erase(iter); tryNextWaiters = true; continue; } iter++; } if(tryNextWaiters) return flockRangeTryNextWaiters(); return {}; } /** * Remove all granted and pending locks that match the given handle. * (This is typically called by clients during file close.) * * Note: unlocked, so hold the mutex when calling this. * * @return true if locks were removed and next waiters should be tried. */ bool FileInode::flockRangeCancelByHandle(RangeLockDetails& lockDetails) { /* note: this code is in many aspects similar to flockRangeCancelByClientID(), so if you change * something here, you probably want to change it there, too. */ bool tryNextWaiters = false; // exclusive locks for(RangeLockExclSetIter iter = exclRangeFLocks.begin(); iter != exclRangeFLocks.end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { RangeLockExclSetIter iterNext = iter; iterNext++; exclRangeFLocks.erase(iter); iter = iterNext; tryNextWaiters = true; continue; } iter++; } // shared locks for(RangeLockSharedSetIter iter = sharedRangeFLocks.begin(); iter != sharedRangeFLocks.end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { RangeLockSharedSetIter iterNext = iter; iterNext++; sharedRangeFLocks.erase(iter); iter = iterNext; tryNextWaiters = true; continue; } iter++; } // waiters exlusive for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin(); iter != waitersExclRangeFLock.end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { waitersLockIDsRangeFLock.erase(iter->lockAckID); iter = waitersExclRangeFLock.erase(iter); tryNextWaiters = true; continue; } iter++; } // waiters shared for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin(); iter != waitersSharedRangeFLock.end(); /* iter inc'ed inside loop */ ) { if(lockDetails.equalsHandle(*iter) ) { waitersLockIDsRangeFLock.erase(iter->lockAckID); iter = waitersSharedRangeFLock.erase(iter); tryNextWaiters = true; continue; } iter++; } return tryNextWaiters; } /** * Checks if there is a conflict for the given lock (but does not actually place lock). * * @param outConflictor the conflicting lock (or of of them) in case we return true. * @return true if there is a conflict for the given lock request. */ bool FileInode::flockRangeGetConflictor(RangeLockDetails& lockDetails, RangeLockDetails* outConflictor) { UniqueRWLock lock(rwlock, SafeRWLock_READ); return flockRangeCheckConflicts(lockDetails, outConflictor); } /** * Note: see flockRangeCheckConflictsEx() for comments (this is just the simple version which * checks the whole excl waiters queue and hence is inappropriate for tryNextWaiters() ). */ bool FileInode::flockRangeCheckConflicts(RangeLockDetails& lockDetails, RangeLockDetails* outConflictor) { return flockRangeCheckConflictsEx(lockDetails, -1, outConflictor); } /** * Note: Automatically ignores self-conflicts (locks that could be up- or downgraded) * Note: unlocked, so hold the mutex when calling this * * @param outConflictor first identified conflicting lock (only set if true is returned; can be * NULL if caller is not interested) * @param maxExclWaitersCheckNum only required by tryNextWaiters to find out how many pending excls * in the queue before the checked element should be tested for conflicts (ie for the 5th queue * element you will pass 4 here); -1 will check the whole queue, which is what all other callers * probably want to do. * @return true if there is a conflict with a lock that is not owned by the current lock requestor */ bool FileInode::flockRangeCheckConflictsEx(RangeLockDetails& lockDetails, int maxExclWaitersCheckNum, RangeLockDetails* outConflictor) { // note: we also check waiting writers here, because we have writer preference and so we don't // want to grant access for a new reader if we have a waiting writer // ...and we also don't want to starve writers by other writers, so we also check for // overlapping waiting writer requests before granting a write lock // check conflicting exclusive locks (for shared & exclusive requests) for(RangeLockExclSetCIter iterExcl = exclRangeFLocks.begin(); (iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end); iterExcl++) { if(lockDetails.overlaps(*iterExcl) && !lockDetails.equalsHandle(*iterExcl) ) { SAFE_ASSIGN(outConflictor, *iterExcl); return true; } } // no conflicting exclusive lock exists if(lockDetails.isExclusive() ) { // exclusive lock request: check conflicting shared locks // check granted shared locks for(RangeLockSharedSetCIter iterShared = sharedRangeFLocks.begin(); iterShared != sharedRangeFLocks.end(); iterShared++) { if(lockDetails.overlaps(*iterShared) && !lockDetails.equalsHandle(*iterShared) ) { SAFE_ASSIGN(outConflictor, *iterShared); return true; } } } // no conflicting shared lock exists // check waiting writers (for shared reqs to prefer writers and for excl reqs to avoid // writer starvation of partially overlapping waiting writers) // (note: keep in mind that maxExclWaitersCheckNum can also be -1 for infinite checks) for(RangeLockDetailsListCIter iter = waitersExclRangeFLock.begin(); (iter != waitersExclRangeFLock.end() ) && (maxExclWaitersCheckNum != 0); iter++, maxExclWaitersCheckNum--) { if(lockDetails.overlaps(*iter) && !lockDetails.equalsHandle(*iter) ) { SAFE_ASSIGN(outConflictor, *iter); return true; } } return false; } /** * Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or * duplicates. * Note: unlocked, so hold the mutex when calling this */ void FileInode::flockRangeShared(RangeLockDetails& lockDetails) { // insert shared lock request... // (avoid duplicates and side-by-side locks for same file handles by merging) for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin(); iterShared != sharedRangeFLocks.end(); /* conditional iter increment inside loop */ ) { bool incIterAtEnd = true; if(lockDetails.equalsHandle(*iterShared) && lockDetails.isMergeable(*iterShared) ) { // same handle => merge with existing lock // note: all overlaps will be merged into lockDetails, so every other overlapping entry // can be removed here lockDetails.merge(*iterShared); RangeLockExclSetIter iterSharedNext(iterShared); iterSharedNext++; sharedRangeFLocks.erase(iterShared); iterShared = iterSharedNext; incIterAtEnd = false; } if(incIterAtEnd) iterShared++; } // actually insert the new lock sharedRangeFLocks.insert(lockDetails); } /** * Note: We assume that unlock() has been called before, so we don't check for up-/downgrades or * duplicates. * Note: unlocked, so hold the mutex when calling this */ void FileInode::flockRangeExclusive(RangeLockDetails& lockDetails) { // insert excl lock request... // (avoid duplicates and side-by-side locks for same file handles by merging) // (note: lockDetails.end+1: because we're also looking for extensions, not only overlaps) for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin(); (iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= (lockDetails.end+1) ); /* conditional iter increment inside loop */ ) { bool incIterAtEnd = true; if(lockDetails.equalsHandle(*iterExcl) && lockDetails.isMergeable(*iterExcl) ) { // same handle => merge with existing lock // note: all overlaps will be merged into lockDetails, so every other overlapping entry // can be removed here lockDetails.merge(*iterExcl); RangeLockExclSetIter iterExclNext(iterExcl); iterExclNext++; exclRangeFLocks.erase(iterExcl); iterExcl = iterExclNext; incIterAtEnd = false; } if(incIterAtEnd) iterExcl++; } // actually insert the new lock exclRangeFLocks.insert(lockDetails); } /** * Find out whether a given range lock is currently being held by the given owner. * * Note: unlocked, hold the read lock when calling this. * * @return true if the range is locked by the given owner */ bool FileInode::flockRangeIsGranted(RangeLockDetails& lockDetails) { if(lockDetails.isExclusive() ) { for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin(); (iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end); /* conditional iter increment at end of loop */ ) { if(!lockDetails.equalsHandle(*iterExcl) ) { // lock owned by another client/process iterExcl++; continue; } // found a lock that is owned by the same client/process => check overlap with given lock bool incIterAtEnd = true; RangeOverlapType overlap = lockDetails.overlapsEx(*iterExcl); switch(overlap) { case RangeOverlapType_EQUALS: { // found an exact match => don't need to look any further return true; } break; case RangeOverlapType_ISCONTAINED: { /* given range is fully contained in a greater locked area => don't need to look any further */ return true; } break; case RangeOverlapType_CONTAINS: { /* found a range which is part of the given lock => given owner cannot currently hold the lock for the whole given range, otherwise we wouldn't find a partial match because of our merging => don't need to look any further */ return false; } break; case RangeOverlapType_STARTOVERLAP: case RangeOverlapType_ENDOVERLAP: { /* found a range which is part of the given lock => given owner cannot currently hold the lock for the whole given range, otherwise we wouldn't find a partial match because of our merging => don't need to look any further */ return false; } break; default: break; // no overlap } // end of switch(overlap) if(incIterAtEnd) iterExcl++; } } // end of exclusive locks check else if(lockDetails.isShared() ) { for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin(); iterShared != sharedRangeFLocks.end(); /* conditional iter increment at end of loop */ ) { if(!lockDetails.equalsHandle(*iterShared) ) { // lock owned by another client/process iterShared++; continue; } // found a lock that is owned by the same client/process => check overlap with given lock bool incIterAtEnd = true; RangeOverlapType overlap = lockDetails.overlapsEx(*iterShared); switch(overlap) { case RangeOverlapType_EQUALS: { // found an exact match => don't need to look any further return true; } break; case RangeOverlapType_ISCONTAINED: { /* given lock is fully contained in a greater locked area => don't need to look any further */ return true; } break; case RangeOverlapType_CONTAINS: { /* found a range which is part of the given lock => given owner cannot currently hold the lock for the whole given range, otherwise we wouldn't find a partial match because of our merging => don't need to look any further */ return false; } break; case RangeOverlapType_STARTOVERLAP: case RangeOverlapType_ENDOVERLAP: { /* found a range which is part of the given lock => given owner cannot currently hold the lock for the whole given range, otherwise we wouldn't find a partial match because of our merging => don't need to look any further */ return false; } break; default: break; // no overlap } // end of switch(overlap) if(incIterAtEnd) iterShared++; } } // end of shared locks check return false; } /** * Note: unlocked, so hold the mutex when calling this. * * @return true if an existing lock has been removed */ bool FileInode::flockRangeUnlock(RangeLockDetails& lockDetails) { bool lockRemoved = false; // return value // check exclusive locks... // (quick path: if the whole unlock is entirely covered by an exclusive range, then we don't need // to look any further) for(RangeLockExclSetIter iterExcl = exclRangeFLocks.begin(); (iterExcl != exclRangeFLocks.end() ) && (iterExcl->start <= lockDetails.end); /* conditional iter increment at end of loop */ ) { if(!lockDetails.equalsHandle(*iterExcl) ) { // lock owned by another client/process iterExcl++; continue; } // found a lock that is owned by the same client/process => check overlap with unlock request bool incIterAtEnd = true; RangeOverlapType overlap = lockDetails.overlapsEx(*iterExcl); switch(overlap) { case RangeOverlapType_EQUALS: { // found an exact match => don't need to look any further exclRangeFLocks.erase(iterExcl); return true; } break; case RangeOverlapType_ISCONTAINED: { // unlock is fully contained in a greater locked area => don't need to look any further // check if 1 or 2 locked areas remain (=> shrink or split) if( (lockDetails.start == iterExcl->start) || (lockDetails.end == iterExcl->end) ) { // only one locked area remains RangeLockDetails oldExcl(*iterExcl); oldExcl.trim(lockDetails); exclRangeFLocks.erase(iterExcl); exclRangeFLocks.insert(oldExcl); } else { // two locked areas remain RangeLockDetails oldExcl(*iterExcl); RangeLockDetails newExcl; oldExcl.split(lockDetails, newExcl); exclRangeFLocks.erase(iterExcl); exclRangeFLocks.insert(oldExcl); exclRangeFLocks.insert(newExcl); } return true; } break; case RangeOverlapType_CONTAINS: { // full removal of this lock, but there may still be some others that need to be removed RangeLockExclSetIter iterExclNext(iterExcl); iterExclNext++; exclRangeFLocks.erase(iterExcl); lockRemoved = true; iterExcl = iterExclNext; incIterAtEnd = false; } break; case RangeOverlapType_STARTOVERLAP: case RangeOverlapType_ENDOVERLAP: { // partial removal of this lock and there may still be others that need to be removed // note: might change start and consequently map position => re-insert excl lock RangeLockExclSetIter iterExclNext(iterExcl); iterExclNext++; RangeLockDetails oldExcl(*iterExcl); oldExcl.trim(lockDetails); exclRangeFLocks.erase(iterExcl); exclRangeFLocks.insert(oldExcl); lockRemoved = true; iterExcl = iterExclNext; incIterAtEnd = false; } break; default: break; // no overlap } // end of switch(overlap) if(incIterAtEnd) iterExcl++; } // check shared locks... // (similar to exclusive locks, we can stop here if unlock is entirely covered by one of our // owned shared ranges, because there cannot be another overlapping range which we also own) for(RangeLockSharedSetIter iterShared = sharedRangeFLocks.begin(); iterShared != sharedRangeFLocks.end(); /* conditional iter increment at end of loop */ ) { if(!lockDetails.equalsHandle(*iterShared) ) { // lock owned by another client/process iterShared++; continue; } // found a lock that is owned by the same client/process => check overlap with unlock request bool incIterAtEnd = true; RangeOverlapType overlap = lockDetails.overlapsEx(*iterShared); switch(overlap) { case RangeOverlapType_EQUALS: { // found an exact match => don't need to look any further sharedRangeFLocks.erase(iterShared); return true; } break; case RangeOverlapType_ISCONTAINED: { // unlock is fully contained in a greater locked area => don't need to look any further // check if 1 or 2 locked areas remain... if( (lockDetails.start == iterShared->start) || (lockDetails.end == iterShared->end) ) { // only one locked area remains RangeLockDetails oldShared(*iterShared); oldShared.trim(lockDetails); sharedRangeFLocks.erase(iterShared); sharedRangeFLocks.insert(oldShared); } else { // two locked areas remain RangeLockDetails oldShared(*iterShared); RangeLockDetails newShared; oldShared.split(lockDetails, newShared); sharedRangeFLocks.erase(iterShared); sharedRangeFLocks.insert(oldShared); sharedRangeFLocks.insert(newShared); } return true; } break; case RangeOverlapType_CONTAINS: { // full removal of this lock, but there may still be some others that need to be removed RangeLockExclSetIter iterExclNext(iterShared); iterExclNext++; sharedRangeFLocks.erase(iterShared); lockRemoved = true; iterShared = iterExclNext; incIterAtEnd = false; } break; case RangeOverlapType_STARTOVERLAP: case RangeOverlapType_ENDOVERLAP: { // partial removal of this lock and there may still be others that need to be removed // note: might change start and consequently map position => re-insert excl lock RangeLockExclSetIter iterSharedNext(iterShared); iterSharedNext++; RangeLockDetails oldShared(*iterShared); oldShared.trim(lockDetails); sharedRangeFLocks.erase(iterShared); sharedRangeFLocks.insert(oldShared); lockRemoved = true; iterShared = iterSharedNext; incIterAtEnd = false; } break; default: break; // no overlap } // end of switch(overlap) if(incIterAtEnd) iterShared++; } return lockRemoved; } /** * Remove next requests from waiters queue and try to grant it - until we reach an entry that * cannot be granted immediately. * * Note: unlocked, so hold the mutex when calling this. */ LockRangeNotifyList FileInode::flockRangeTryNextWaiters() { int numWaitersBefore = 0; // number of waiters in the queue before the current checked element LockRangeNotifyList notifyList; // quick stack version to speed up the no waiter granted path for(RangeLockDetailsListIter iter = waitersExclRangeFLock.begin(); iter != waitersExclRangeFLock.end(); /* conditional iter inc inside loop */) { bool hasConflict = flockRangeCheckConflictsEx(*iter, numWaitersBefore, NULL); if(hasConflict) { iter++; numWaitersBefore++; continue; } // no conflict => grant lock flockRangeExclusive(*iter); notifyList.push_back(*iter); waitersLockIDsRangeFLock.erase(iter->lockAckID); iter = waitersExclRangeFLock.erase(iter); } for(RangeLockDetailsListIter iter = waitersSharedRangeFLock.begin(); iter != waitersSharedRangeFLock.end(); /* conditional iter inc inside loop */) { bool hasConflict = flockRangeCheckConflicts(*iter, NULL); if(hasConflict) { iter++; continue; } // no conflict => grant lock flockRangeShared(*iter); notifyList.push_back(*iter); waitersLockIDsRangeFLock.erase(iter->lockAckID); iter = waitersSharedRangeFLock.erase(iter); } return notifyList; } /** * Generate a complete locking status overview (all granted and waiters) as human-readable string. */ std::string FileInode::flockRangeGetAllAsStr() { UniqueRWLock lock(rwlock, SafeRWLock_READ); std::ostringstream outStream; outStream << "Exclusive" << std::endl; outStream << "=========" << std::endl; for(RangeLockExclSetCIter iter = exclRangeFLocks.begin(); iter != exclRangeFLocks.end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Shared" << std::endl; outStream << "=========" << std::endl; for(RangeLockSharedSetCIter iter = sharedRangeFLocks.begin(); iter != sharedRangeFLocks.end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Exclusive Waiters" << std::endl; outStream << "=========" << std::endl; for(RangeLockDetailsListCIter iter = waitersExclRangeFLock.begin(); iter != waitersExclRangeFLock.end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Shared Waiters" << std::endl; outStream << "=========" << std::endl; for(RangeLockDetailsListCIter iter = waitersSharedRangeFLock.begin(); iter != waitersSharedRangeFLock.end(); iter++) { outStream << iter->toString() << std::endl; } outStream << std::endl; outStream << "Waiters lockIDs" << std::endl; outStream << "=========" << std::endl; for(StringSetCIter iter = waitersLockIDsRangeFLock.begin(); iter != waitersLockIDsRangeFLock.end(); iter++) { outStream << *iter << std::endl; } outStream << std::endl; return outStream.str(); } /** * Increase/decreas the link count of this inode */ bool FileInode::incDecNumHardLinks(EntryInfo* entryInfo, int value) { SafeRWLock safeLock(&rwlock, SafeRWLock_WRITE); // L O C K incDecNumHardlinksUnpersistentUnlocked(value); // update ctime StatData* statData = this->inodeDiskData.getInodeStatData(); statData->setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec); bool retVal = storeUpdatedInodeUnlocked(entryInfo); // store on disk if(!retVal) { // failed to update metadata on disk => restore old values incDecNumHardlinksUnpersistentUnlocked(-value); } safeLock.unlock(); // U N L O C K return retVal; } bool FileInode::operator==(const FileInode& other) const { return inodeDiskData == other.inodeDiskData && fileInfoVec == other.fileInfoVec && exclusiveTID == other.exclusiveTID && numSessionsRead == other.numSessionsRead && numSessionsWrite == other.numSessionsWrite && exclAppendLock == other.exclAppendLock && waitersExclAppendLock == other.waitersExclAppendLock && waitersLockIDsAppendLock == other.waitersLockIDsAppendLock && exclFLock == other.exclFLock && sharedFLocks == other.sharedFLocks && waitersExclFLock == other.waitersExclFLock && waitersSharedFLock == other.waitersSharedFLock && waitersLockIDsFLock == other.waitersLockIDsFLock && exclRangeFLocks == other.exclRangeFLocks && sharedRangeFLocks == other.sharedRangeFLocks && waitersExclRangeFLock == other.waitersExclRangeFLock && waitersSharedRangeFLock == other.waitersSharedRangeFLock && waitersLockIDsRangeFLock == other.waitersLockIDsRangeFLock && dentryCompatData == other.dentryCompatData && numParentRefs.read() == other.numParentRefs.read() && referenceParentID == other.referenceParentID && isInlined == other.isInlined; } std::pair FileInode::listXAttr() { BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs"); const Path* inodesPath = getIsBuddyMirroredUnlocked() ? Program::getApp()->getBuddyMirrorInodesPath() : Program::getApp()->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(), inodeDiskData.getEntryID()); return XAttrTk::listUserXAttrs(metaFilename); } std::tuple, ssize_t> FileInode::getXAttr( const std::string& xAttrName, size_t maxSize) { BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs"); const Path* inodesPath = getIsBuddyMirroredUnlocked() ? Program::getApp()->getBuddyMirrorInodesPath() : Program::getApp()->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(), inodeDiskData.getEntryID()); return XAttrTk::getUserXAttr(metaFilename, xAttrName, maxSize); } FhgfsOpsErr FileInode::removeXAttr(EntryInfo* entryInfo, const std::string& xAttrName) { UniqueRWLock lock(rwlock, SafeRWLock_WRITE); BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs"); const Path* inodesPath = getIsBuddyMirroredUnlocked() ? Program::getApp()->getBuddyMirrorInodesPath() : Program::getApp()->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(), inodeDiskData.getEntryID()); FhgfsOpsErr result = XAttrTk::removeUserXAttr(metaFilename, xAttrName); if (result == FhgfsOpsErr_SUCCESS) { inodeDiskData.inodeStatData.setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec); storeUpdatedInodeUnlocked(entryInfo, nullptr); } // FIXME: should resync only this xattr ON THE INODE if (getIsBuddyMirroredUnlocked()) if (auto* resync = BuddyResyncer::getSyncChangeset()) resync->addModification(metaFilename, MetaSyncFileType::Inode); return result; } FhgfsOpsErr FileInode::setXAttr(EntryInfo* entryInfo, const std::string& xAttrName, const CharVector& xAttrValue, int flags) { UniqueRWLock lock(rwlock, SafeRWLock_WRITE); BEEGFS_BUG_ON_DEBUG(isInlined, "inlined file inode cannot access its own xattrs"); const Path* inodesPath = getIsBuddyMirroredUnlocked() ? Program::getApp()->getBuddyMirrorInodesPath() : Program::getApp()->getInodesPath(); std::string metaFilename = MetaStorageTk::getMetaInodePath(inodesPath->str(), inodeDiskData.getEntryID()); FhgfsOpsErr result = XAttrTk::setUserXAttr(metaFilename, xAttrName, &xAttrValue[0], xAttrValue.size(), flags); if (result == FhgfsOpsErr_SUCCESS) { inodeDiskData.inodeStatData.setAttribChangeTimeSecs(TimeAbs().getTimeval()->tv_sec); storeUpdatedInodeUnlocked(entryInfo, nullptr); } // FIXME: should resync only this xattr ON THE INODE if (getIsBuddyMirroredUnlocked()) if (auto* resync = BuddyResyncer::getSyncChangeset()) resync->addModification(metaFilename, MetaSyncFileType::Inode); return result; } void FileInode::initLocksRandomForSerializationTests() { Random rand; this->exclusiveTID = rand.getNextInt(); this->numSessionsRead = rand.getNextInt(); this->numSessionsWrite = rand.getNextInt(); this->exclAppendLock.initRandomForSerializationTests(); int max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { EntryLockDetails lock; lock.initRandomForSerializationTests(); this->waitersExclAppendLock.push_back(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { std::string id; StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) ); this->waitersLockIDsAppendLock.insert(id); } this->exclFLock.initRandomForSerializationTests(); max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { EntryLockDetails lock; lock.initRandomForSerializationTests(); this->sharedFLocks.insert(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { EntryLockDetails lock; lock.initRandomForSerializationTests(); this->waitersExclFLock.push_back(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { EntryLockDetails lock; lock.initRandomForSerializationTests(); this->waitersSharedFLock.push_back(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { std::string id; StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) ); this->waitersLockIDsFLock.insert(id); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { RangeLockDetails lock; lock.initRandomForSerializationTests(); this->exclRangeFLocks.insert(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { RangeLockDetails lock; lock.initRandomForSerializationTests(); this->sharedRangeFLocks.insert(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { RangeLockDetails lock; lock.initRandomForSerializationTests(); this->waitersExclRangeFLock.push_back(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { RangeLockDetails lock; lock.initRandomForSerializationTests(); this->waitersSharedRangeFLock.push_back(lock); } max = rand.getNextInRange(0, 1024); for(int i = 0; i < max; i++) { std::string id; StringTk::genRandomAlphaNumericString(id, rand.getNextInRange(2, 30) ); this->waitersLockIDsFLock.insert(id); } StringTk::genRandomAlphaNumericString(this->referenceParentID, rand.getNextInRange(2, 30) ); this->numParentRefs.set(rand.getNextInt() ); } /** * Checks whether current file state allows the requested access and increments appropriate * session counter if permitted. The entire operation occurs under a single write lock * to prevent races between open() operation and state validate-and-update operations. * * @param accessFlags OPENFILE_ACCESS_... flags * @param bypassAccessCheck if true, skip all file state-based access checks * @return FhgfsOpsErr_SUCCESS if file opened successfully * FhgfsOpsErr_FILEACCESS_DENIED if file state restricts the requested access */ FhgfsOpsErr FileInode::checkAccessAndOpen(unsigned accessFlags, bool bypassAccessCheck) { RWLockGuard lock(rwlock, SafeRWLock_WRITE); if (!bypassAccessCheck) { FileState state(getFileStateUnlocked()); // Fast path: Check if file is unlocked (common case) if (unlikely(!state.isUnlocked())) { // File has active state restrictions - determine what access types are being requested bool readRequested = accessFlags & (OPENFILE_ACCESS_READ | OPENFILE_ACCESS_READWRITE); bool writeRequested = accessFlags & (OPENFILE_ACCESS_WRITE | OPENFILE_ACCESS_READWRITE | OPENFILE_ACCESS_TRUNC); // Access not allowed if: state implies fully locked, or // read requested when read-locked, or write requested when write-locked bool blockOpenRequest = state.isFullyLocked() || (state.isReadLocked() && readRequested) || (state.isWriteLocked() && writeRequested); if (blockOpenRequest) return FhgfsOpsErr_FILEACCESS_DENIED; } } // Access allowed - increment session counter incNumSessionsUnlocked(accessFlags); return FhgfsOpsErr_SUCCESS; }