#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "FhgfsOpsDir.h" #include "FhgfsOpsHelper.h" /** * Log file system operations and optionally additional messages. * Used to trace function calls and to print the path the function is to operate on. * * @param level The log level * @param dentry Common vfs directory entry * @param logContext Usually the name of the calling function * @param msgStr Optional message string, may be NULL * @param ... Optional arguments according to format given in msgStr */ void FhgfsOpsHelper_logOpMsg(int level, App* app, struct dentry* dentry, struct inode* inode, const char *logContext, const char *msgStr, ...) { Logger* log = App_getLogger(app); NoAllocBufferStore* bufStore = App_getPathBufStore(app); char* pathStoreBuf = NULL; // NoAllocBufferStore_addBuf() can detect a wrong NULL... char* path = NULL; const char* entryID = NULL; const char* noPath = "n/a (no dentry)"; const char* noEntryID = "n/a (no inode)"; if(level > Logger_getLogLevel(log) ) return; if(dentry) { path = __FhgfsOps_pathResolveToStoreBuf(bufStore, dentry, &pathStoreBuf); if(IS_ERR(path) ) path = NULL; } if(inode) { // get entryInfo lock for entryID FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); const EntryInfo* entryInfo; FhgfsInode_entryInfoReadLock(fhgfsInode); // L O C K entryInfo entryInfo = FhgfsInode_getEntryInfo(fhgfsInode); entryID = EntryInfo_getEntryID(entryInfo); } if(msgStr) { // generate new msg string for given msg formatting va_list ap; char* newMsg; newMsg = kmalloc(LOGGER_LOGBUF_SIZE, GFP_NOFS); if(newMsg) { int prefixLen = snprintf(newMsg, LOGGER_LOGBUF_SIZE, "called. Path: %s; EntryID: %s; %s", path ? path : noPath, entryID ? 
entryID : noEntryID, msgStr); va_start(ap, msgStr); vsnprintf(newMsg + prefixLen, LOGGER_LOGBUF_SIZE - prefixLen, msgStr, ap); va_end(ap); Logger_logFormatted(log, level, logContext, "%s", newMsg); kfree(newMsg); } else { // alloc failed, still try to log the operation at least Logger_logFormatted(log, level, logContext, "called. Path: %s; EntryID: %s; (msg n/a)", path ? path : noPath, entryID ? entryID : noEntryID); } } else Logger_logFormatted(log, level, logContext, "called. Path: %s; EntryID: %s", path ? path : noPath, entryID ? entryID : noEntryID); if(inode) { // release entryInfo lock FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); FhgfsInode_entryInfoReadUnlock(fhgfsInode); // U N L O C K entryInfo } if(pathStoreBuf) NoAllocBufferStore_addBuf(bufStore, pathStoreBuf); } ssize_t FhgfsOpsHelper_appendfileVecOffset(FhgfsInode* fhgfsInode, struct iov_iter *iter, size_t count, RemotingIOInfo* ioInfo, loff_t offsetFromEnd, loff_t* outNewOffset) { App* app = ioInfo->app; ssize_t writeRes = 0; FhgfsOpsErr lockRes; FhgfsOpsErr statRes; fhgfs_stat fhgfsStat; // get MDS append lock... lockRes = FhgfsOpsHelper_getAppendLock(fhgfsInode, ioInfo); if(unlikely(lockRes != FhgfsOpsErr_SUCCESS) ) return -lockRes; // get current file size from servers... FhgfsInode_entryInfoReadLock(fhgfsInode); // LOCK EntryInfo statRes = FhgfsOpsRemoting_statDirect(app, FhgfsInode_getEntryInfo(fhgfsInode), &fhgfsStat); FhgfsInode_entryInfoReadUnlock(fhgfsInode); // UNLOCK EntryInfo if(unlikely(statRes != FhgfsOpsErr_SUCCESS) ) { // remote stat error writeRes = -statRes; goto unlock_and_exit; } // the actual remote write... 
fhgfsStat.size += offsetFromEnd; (void) count; // count currently not used, writefileVec looks at iter->count BUG_ON(count != iov_iter_count(iter)); writeRes = FhgfsOpsRemoting_writefileVec(iter, fhgfsStat.size, ioInfo, false); if(writeRes >= 0) fhgfsStat.size += writeRes; *outNewOffset = fhgfsStat.size; unlock_and_exit: FhgfsOpsHelper_releaseAppendLock(fhgfsInode, ioInfo); return writeRes; } /** * Append data to a file, protected by MDS locking. * * Note: This method does not try to flush local file buffers after acquiring the MDS lock, because * this method might be called during a file buffer flush (so callers must ensure that there are * no conflicing local file buffers). * * @param size buffer length to be appended * @param outNewOffset new file offset after append completes (only valid if no error returned) * @return number of bytes written or negative fhgfs error code */ static ssize_t FhgfsOpsHelper_appendfile_kernel(FhgfsInode* fhgfsInode, const char *buf, size_t size, RemotingIOInfo* ioInfo, loff_t* outNewOffset) { struct iov_iter *iter = STACK_ALLOC_BEEGFS_ITER_KVEC(buf, size, WRITE); return FhgfsOpsHelper_appendfileVecOffset(fhgfsInode, iter, size, ioInfo, 0, outNewOffset); } /** * Wrapper for FhgfsOpsRemoting_writefile, but this automatically calls _appendfile() if -1 offset * is given. 
* * @param offset offset in file, -1 for append */ static ssize_t FhgfsOpsHelper_writefileEx(FhgfsInode* fhgfsInode, struct iov_iter *iter, size_t size, loff_t offset, RemotingIOInfo* ioInfo) { if(offset == -1) return FhgfsOpsHelper_appendfileVecOffset(fhgfsInode, iter, size, ioInfo, 0, &offset); else return FhgfsOpsRemoting_writefileVec(iter, offset, ioInfo, false); } static ssize_t FhgfsOpsHelper_writefileEx_kernel(FhgfsInode* fhgfsInode, const char *buf, size_t size, loff_t offset, RemotingIOInfo* ioInfo) { if(offset == -1) return FhgfsOpsHelper_appendfile_kernel(fhgfsInode, buf, size, ioInfo, &offset); else return FhgfsOpsRemoting_writefile_kernel(buf, size, offset, ioInfo); } /** * Refreshes the dirInfo by retrieving a new version from the nodes (but only if required or * forced). * * Note: This method guarantees either valid local names range for currentServerOffset or empty * dirContents list on completion. * * @param dirInfo holds the relevant serverOffset for the refresh * @param forceUpdate use this to force an update (e.g. when the user seeked) * @return negative linux error code on error, 0 otherwise */ int FhgfsOpsHelper_refreshDirInfoIncremental(App* app, const EntryInfo* entryInfo, FsDirInfo* dirInfo, bool forceUpdate) { Logger* log = App_getLogger(app); const char* logContext = "FhgfsOpsHelper (refresh dir info incremental)"; const unsigned maxNames = 100; // max number of retrieved names int retVal = 0; FhgfsOpsErr listRes = FhgfsOpsErr_SUCCESS; StrCpyVec* dirContents = FsDirInfo_getDirContents(dirInfo); size_t dirContentsLen = StrCpyVec_length(dirContents); size_t currentContentsPos = FsDirInfo_getCurrentContentsPos(dirInfo); // pos inside contents vec LOG_DEBUG_FORMATTED(log, Log_SPAM, logContext, "called. 
EntryID: %s, Owner: %hu", entryInfo->entryID, entryInfo->owner.node.value); IGNORE_UNUSED_VARIABLE(log); IGNORE_UNUSED_VARIABLE(logContext); if(forceUpdate) { // user seeked backwards (or something else makes an update necessary) LOG_DEBUG(log, Log_SPAM, logContext, "forced update of dir contents"); listRes = FhgfsOpsRemoting_listdirFromOffset(entryInfo, dirInfo, maxNames); } else if(FsDirInfo_getEndOfDir(dirInfo) && (currentContentsPos >= dirContentsLen) ) { // we reached the end of the dir contents => do nothing LOG_DEBUG(log, Log_SPAM, logContext, "reached end of dir contents by offset"); StrCpyVec_clear(FsDirInfo_getDirContents(dirInfo) ); } else if(currentContentsPos < dirContentsLen) { // inside the local contents region => do nothing LOG_DEBUG_FORMATTED(log, Log_SPAM, logContext, "offset inside local contents region: %lld <= %lld", (long long)currentContentsPos, (long long)dirContentsLen); } else { // initial retrieval or offset outside current local contents region LOG_DEBUG(log, Log_SPAM, logContext, "retrieving by offset"); listRes = FhgfsOpsRemoting_listdirFromOffset(entryInfo, dirInfo, maxNames); } if(unlikely(listRes != FhgfsOpsErr_SUCCESS) ) { // error StrCpyVec_clear(FsDirInfo_getDirContents(dirInfo) ); FsDirInfo_setCurrentContentsPos(dirInfo, 0); Logger_logFormatted(log, Log_DEBUG, logContext, "result: %s", FhgfsOpsErr_toErrString(listRes) ); retVal = FhgfsOpsErr_toSysErr(listRes); } return retVal; } /** * Flush cache buffer and return it to the store, but return immediately if the cache lock cannot * be acquired immediately. * * @param discardCacheOnError true to discard a write cache if the remote write * was not successful; false to just release it to the store in this case * @return FhgfsOpsErr_INUSE if cache lock is contended and nothing has been flushed. 
 */
FhgfsOpsErr FhgfsOpsHelper_flushCacheNoWait(App* app, FhgfsInode* fhgfsInode,
   bool discardCacheOnError)
{
   int gotLock; // 1 if we got the lock, 0 otherwise
   FhgfsOpsErr retVal;

   // non-blocking attempt only: bail out instead of waiting on a contended cache lock
   gotLock = FhgfsInode_fileCacheExclusiveTryLock(fhgfsInode); // (T R Y) L O C K
   if(gotLock != 1)
      return FhgfsOpsErr_INUSE;

   retVal = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, discardCacheOnError);

   FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode); // U N L O C K

   return retVal;
}

/**
 * Buffered writing.
 * Tries to add written data to an existing cache or flushes the cache (if any) and delegates to
 * writeCacheFlushed().
 *
 * Note: This method also updates FsFileInfo cache hits counter.
 *
 * @param offset offset in file, -1 for append
 * @return number of bytes written or negative fhgfs error code
 */
ssize_t FhgfsOpsHelper_writeCached(struct iov_iter *iter, size_t size, loff_t offset,
   FhgfsInode* fhgfsInode, FsFileInfo* fileInfo, RemotingIOInfo* ioInfo)
{
   App* app = ioInfo->app;
   NoAllocBufferStore* cacheStore = App_getCacheBufStore(app);

   ssize_t retVal;
   bool isAppendWrite = (offset == -1); // true if this is an append write
   CacheBuffer* cacheBuffer;
   enum FileBufferType cacheType;
   bool isAppendCacheBuf; // true if the current cache buffer is an append write buffer
   size_t bufLenLeft; // remaining buffer length
   loff_t currentCacheEndOffset;
   int userBufCopyRes;

   if (app->cfg->tuneCoherentBuffers)
   {
      // if the file is mmap'ed, bypass the write cache to keep mappings coherent
      i_mmap_lock_read(fhgfsInode->vfs_inode.i_mapping);
      if (beegfs_hasMappings(&fhgfsInode->vfs_inode))
      {
         FsFileInfo_decCacheHits(fileInfo);
         i_mmap_unlock_read(fhgfsInode->vfs_inode.i_mapping);
         return FhgfsOpsHelper_writefileEx(fhgfsInode, iter, size, offset, ioInfo);
      }
      i_mmap_unlock_read(fhgfsInode->vfs_inode.i_mapping);
   }

   FhgfsInode_fileCacheExclusiveLock(fhgfsInode); // L O C K

   // all exits below this point must go through unlock_and_exit

   cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode);
   cacheType = cacheBuffer->bufType;

   if(cacheType == FileBufferType_NONE)
   { // file doesn't have any cache buffer currently
      // update cache hits by guessing whether this would have been a cache hit if we had a buffer
      if(isAppendWrite)
         FsFileInfo_incCacheHits(fileInfo); // could have been a cache hit
      else
      {
         loff_t virtualCacheStart = FsFileInfo_getLastWriteOffset(fileInfo);
         loff_t virtualCacheEnd = virtualCacheStart + NoAllocBufferStore_getBufSize(cacheStore) - 1;

         if( (offset >= virtualCacheStart) && (offset <= virtualCacheEnd) )
            FsFileInfo_incCacheHits(fileInfo); // could have been a cache hit
         else
            FsFileInfo_decCacheHits(fileInfo); // would probably not have been a cache hit
      }

      retVal = __FhgfsOpsHelper_writeCacheFlushed(iter, size, offset, fhgfsInode, fileInfo, ioInfo);
      goto unlock_and_exit;
   }

   if(cacheType == FileBufferType_READ)
   { // file has a read cache => just discard the cached data
      FsFileInfo_decCacheHits(fileInfo);

      __FhgfsOpsHelper_discardCache(app, fhgfsInode);

      retVal = __FhgfsOpsHelper_writeCacheFlushed(iter, size, offset, fhgfsInode, fileInfo, ioInfo);
      goto unlock_and_exit;
   }

   // file has a write cache => can we extend it or do we have to flush it?

   isAppendCacheBuf = cacheBuffer->fileOffset == -1; // -1 offset means append buf
   currentCacheEndOffset = cacheBuffer->fileOffset + cacheBuffer->bufUsageLen;
   bufLenLeft = cacheBuffer->bufUsageMaxLen - cacheBuffer->bufUsageLen;

   // the write can only extend the cache if it is the same kind (append vs. positional),
   // starts exactly at the current cache end, and fits into the remaining room
   if( (isAppendWrite != isAppendCacheBuf) ||
       (!isAppendWrite && (offset != currentCacheEndOffset) ) ||
       (size > bufLenLeft) )
   { // offset doesn't fit or not enough cache room left => flush cache
      FhgfsOpsErr flushRes;

      FsFileInfo_decCacheHits(fileInfo);

      flushRes = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, false);
      if(unlikely(flushRes != FhgfsOpsErr_SUCCESS) )
      { // flush failed
         retVal = -flushRes;
         goto unlock_and_exit;
      }

      retVal = __FhgfsOpsHelper_writeCacheFlushed(iter, size, offset, fhgfsInode, fileInfo, ioInfo);
      goto unlock_and_exit;
   }

   // offset fits and we have enough room left for the write => copy data to cache buf (and update)

   FsFileInfo_incCacheHits(fileInfo);

   userBufCopyRes = copy_from_iter(&(cacheBuffer->buf)[cacheBuffer->bufUsageLen], size, iter);
   if(unlikely(userBufCopyRes != size))
   { // copy failed
      Logger* log = App_getLogger(app);
      Logger_log(log, Log_DEBUG, __func__, "Buffer copy from userspace failed (invalid buffer)");

      retVal = -FhgfsOpsErr_ADDRESSFAULT;
      goto unlock_and_exit;
   }

   cacheBuffer->bufUsageLen += size;
   bufLenLeft = cacheBuffer->bufUsageMaxLen - cacheBuffer->bufUsageLen; // re-calc remaining bufLen

   // write has been completely cached => check whether the cache is full now
   if(!bufLenLeft)
   { // cache buf used up => flush it
      FhgfsOpsErr flushRes = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, false);
      if(unlikely(flushRes != FhgfsOpsErr_SUCCESS) )
      { // flush failed
         retVal = -flushRes;
         goto unlock_and_exit;
      }
   }

   // write was successful if we got here
   retVal = size;

unlock_and_exit:
   FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode); // U N L O C K

   return retVal;
}

/**
 * Buffered reading.
 * Tries to read data from an existing cache or flushes the cache and delegates to
 * readCacheFlushed().
 *
 * @return number of bytes read or negative fhgfs error code
 */
ssize_t FhgfsOpsHelper_readCached(struct iov_iter *iter, size_t size, loff_t offset,
   FhgfsInode* fhgfsInode, FsFileInfo* fileInfo, RemotingIOInfo* ioInfo)
{
   App* app = ioInfo->app;
   NoAllocBufferStore* cacheStore = App_getCacheBufStore(app);

   ssize_t retVal;
   CacheBuffer* cacheBuffer;
   enum FileBufferType cacheType;
   loff_t readEndOffset = offset + size - 1; // inclusive last byte of the requested range
   loff_t cacheEndOffset; // inclusive last byte of the cached range
   ssize_t remoteReadSize;
   loff_t cacheCopyOffset;
   size_t cacheCopySize;

   if (app->cfg->tuneCoherentBuffers)
   {
      // if the file is mmap'ed, bypass the cache to keep mappings coherent
      i_mmap_lock_read(fhgfsInode->vfs_inode.i_mapping);
      if (beegfs_hasMappings(&fhgfsInode->vfs_inode))
      {
         FsFileInfo_decCacheHits(fileInfo);
         i_mmap_unlock_read(fhgfsInode->vfs_inode.i_mapping);
         return FhgfsOpsRemoting_readfileVec(iter, size, offset, ioInfo, fhgfsInode);
      }
      i_mmap_unlock_read(fhgfsInode->vfs_inode.i_mapping);
   }

   // fast path for parallel readers (other path takes exclusive lock)
   if(FhgfsInode_getIsFileOpenByMultipleReaders(fhgfsInode) ||
      !FsFileInfo_getAllowCaching(fileInfo) )
   { // no caching needed
      ssize_t readRes;

      FhgfsInode_fileCacheSharedLock(fhgfsInode); // L O C K (shared)

      cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode);
      cacheType = cacheBuffer->bufType;
      if(cacheType != FileBufferType_NONE)
      { // file already has buffer attached, so we need exclusive lock
         FhgfsInode_fileCacheSharedUnlock(fhgfsInode); // U N L O C K (shared)
         goto exclusive_lock_path;
      }

      readRes = FhgfsOpsRemoting_readfileVec(iter, size, offset, ioInfo, fhgfsInode);

      FhgfsInode_fileCacheSharedUnlock(fhgfsInode); // U N L O C K (shared)

      return readRes;
   }

exclusive_lock_path:
   FhgfsInode_fileCacheExclusiveLock(fhgfsInode); // L O C K (exclusive)

   // all exits below this point must go through unlock_and_exit

   cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode);
   cacheType = cacheBuffer->bufType;

   if(cacheType == FileBufferType_NONE)
   { // file has no buffer attached
      // update cache hits by guessing whether this would have been a cache hit if we had a buffer
      loff_t virtualCacheStart = FsFileInfo_getLastReadOffset(fileInfo);
      loff_t virtualCacheEnd = virtualCacheStart + NoAllocBufferStore_getBufSize(cacheStore) - 1;

      if( (virtualCacheEnd >= offset) && (virtualCacheStart <= readEndOffset) )
         FsFileInfo_incCacheHits(fileInfo); // range overlap => could have been a cache hit
      else
         FsFileInfo_decCacheHits(fileInfo); // would probably not have been a cache hit

      retVal = __FhgfsOpsHelper_readCacheFlushed(iter, size, offset, fhgfsInode, fileInfo, ioInfo);
      goto unlock_and_exit;
   }

   if(cacheType == FileBufferType_WRITE)
   { // file has a write cache => flush it
      FhgfsOpsErr flushRes;

      FsFileInfo_decCacheHits(fileInfo);

      flushRes = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, false);
      if(unlikely(flushRes != FhgfsOpsErr_SUCCESS) )
      { // flush failed
         retVal = -flushRes;
         goto unlock_and_exit;
      }

      retVal = __FhgfsOpsHelper_readCacheFlushed(iter, size, offset, fhgfsInode, fileInfo, ioInfo);
      goto unlock_and_exit;
   }

   // file has a read cache => does it overlap with the read range?

   cacheEndOffset = cacheBuffer->fileOffset + cacheBuffer->bufUsageLen - 1;

   if( (cacheEndOffset < offset) || (cacheBuffer->fileOffset > readEndOffset) )
   { // cache range and read range do not overlap => flush
      FhgfsOpsErr flushRes;

      FsFileInfo_decCacheHits(fileInfo);

      flushRes = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, false);
      if(unlikely(flushRes != FhgfsOpsErr_SUCCESS) )
      { // flush failed
         retVal = -flushRes;
         goto unlock_and_exit;
      }

      retVal = __FhgfsOpsHelper_readCacheFlushed(iter, size, offset, fhgfsInode, fileInfo, ioInfo);
      goto unlock_and_exit;
   }

   // cache range and read range are overlapping. three possible cases to be handled:
   // read range before/inside/behind the cache range

   FsFileInfo_incCacheHits(fileInfo);

   remoteReadSize = 0;

   if(offset < cacheBuffer->fileOffset)
   { // read begins before cache => remote-read and copy the rest
      ssize_t readRes;

      remoteReadSize = cacheBuffer->fileOffset - offset;

      readRes = FhgfsOpsRemoting_readfileVec(iter, remoteReadSize, offset, ioInfo, fhgfsInode);
      if(readRes < remoteReadSize)
      { // error or end-of-file => invalidate cache
         __FhgfsOpsHelper_discardCache(app, fhgfsInode);
         retVal = readRes;
         goto unlock_and_exit;
      }
   }

   // remote read (if any) succeeded => copy part from cache
   // cacheCopyOffset: where inside the cache buffer the requested range starts;
   // cacheCopySize: overlap of [cache start, cache end] with [offset, offset+size)
   cacheCopyOffset = (offset <= cacheBuffer->fileOffset) ? 0 : (offset - cacheBuffer->fileOffset);
   cacheCopySize = MIN(cacheBuffer->fileOffset + cacheBuffer->bufUsageLen, offset + size) -
      (cacheBuffer->fileOffset + cacheCopyOffset);

   {
      size_t numCopied = copy_to_iter(&(cacheBuffer->buf)[cacheCopyOffset], cacheCopySize, iter);
      if (unlikely(numCopied != cacheCopySize))
      { // copy failed
         Logger* log = App_getLogger(app);
         Logger_log(log, Log_DEBUG, __func__, "Buffer copy to userspace failed (invalid buffer)");

         retVal = -FhgfsOpsErr_ADDRESSFAULT;
         goto unlock_and_exit;
      }
   }

   // has everything been read or is there a remainder behind the cacheBuf?
   if(readEndOffset > cacheEndOffset)
   { // there is a remainder behind the cacheBuf => discard cache (to allow new caching)
      //loff_t remainderBufOffset = remoteReadSize + cacheCopySize;
      loff_t remainderFileOffset = cacheEndOffset + 1;
      size_t remainderSize = readEndOffset - cacheEndOffset;
      ssize_t remainderRes;

      __FhgfsOpsHelper_discardCache(app, fhgfsInode);

      remainderRes = __FhgfsOpsHelper_readCacheFlushed(iter, remainderSize, remainderFileOffset,
         fhgfsInode, fileInfo, ioInfo);
      if(unlikely(remainderRes < 0) )
      { // reading failed
         retVal = remainderRes;
         goto unlock_and_exit;
      }

      retVal = remoteReadSize + cacheCopySize + remainderRes;
      goto unlock_and_exit;
   }

   // read was successful if we got here
   retVal = size;

unlock_and_exit:
   FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode); // U N L O C K (exclusive)

   return retVal;
}

/**
 * Try to cache the write data or delegate to the normal remoting writefile().
 *
 * Note: Use this only when definitely no file cache entry exists.
 * Note: Unlocked, so caller must hold inode cache lock.
 *
 * @param offset offset in file, -1 for append
 * @return number of bytes written or negative fhgfs error code
 */
ssize_t __FhgfsOpsHelper_writeCacheFlushed(struct iov_iter *iter, size_t size, loff_t offset,
   FhgfsInode* fhgfsInode, FsFileInfo* fileInfo, RemotingIOInfo* ioInfo)
{
   App* app = ioInfo->app;
   NoAllocBufferStore* cacheStore = App_getCacheBufStore(app);
   InodeRefStore* refStore = App_getInodeRefStore(app);
   bool isAppendWrite = (offset == -1); // true if this is an append write
   StripePattern* pattern = ioInfo->pattern;

   size_t maxCacheLen;
   CacheBuffer* cacheBuffer;
   int userBufCopyRes;

   // caching disabled?
   if(!FsFileInfo_getAllowCaching(fileInfo) ||
      (!isAppendWrite && (FsFileInfo_getCacheHits(fileInfo) <= 0) ) )
      return FhgfsOpsHelper_writefileEx(fhgfsInode, iter, size, offset, ioInfo);

   // check whether the write size is larger than cacheBuf or would span multiple chunks
   if(isAppendWrite)
      maxCacheLen = NoAllocBufferStore_getBufSize(cacheStore);
   else
   { // normal write (not append): cache at most up to the end of the current chunk
      size_t currentChunkSize = StripePattern_getChunkEnd(pattern, offset) - offset + 1;
      maxCacheLen = MIN(currentChunkSize, NoAllocBufferStore_getBufSize(cacheStore) );
   }

   if(size >= maxCacheLen) // (Note: '=' because we don't want a completely filled up buffer)
   { // scenario not allowed => write through
      return FhgfsOpsHelper_writefileEx(fhgfsInode, iter, size, offset, ioInfo);
   }

   // we got something to cache here (data fits completely into a cacheBuf)

   // create new cache (if buffer available)

   cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode);

#ifdef BEEGFS_DEBUG
   BEEGFS_BUG_ON(cacheBuffer->buf, "Looks like we're about to leak a cache buffer");
#endif // BEEGFS_DEBUG

   cacheBuffer->buf = NoAllocBufferStore_instantBuf(cacheStore);
   if(!cacheBuffer->buf)
   { // no cache buffer left in the store => write through
      return FhgfsOpsHelper_writefileEx(fhgfsInode, iter, size, offset, ioInfo);
   }

   cacheBuffer->bufType = FileBufferType_WRITE; // (needed here for _discardCache() below)

   // init cache entry fields and copy data to cacheBuf

   userBufCopyRes = copy_from_iter(cacheBuffer->buf, size, iter);
   if(unlikely(userBufCopyRes != size))
   { // copy failed
      Logger* log = App_getLogger(app);
      Logger_log(log, Log_DEBUG, __func__, "Buffer copy from userspace failed (invalid buffer)");

      __FhgfsOpsHelper_discardCache(app, fhgfsInode);

      return -FhgfsOpsErr_ADDRESSFAULT;
   }

   cacheBuffer->bufUsageLen = size;
   cacheBuffer->bufUsageMaxLen = maxCacheLen;
   cacheBuffer->fileOffset = offset; // (note: can be -1 in case of append)

   // add to inode ref store for async flush
   InodeRefStore_addAndReferenceInode(refStore, BEEGFS_VFSINODE(fhgfsInode) );

   return size;
}

/**
 * Try to read-ahead data or just do a plain normal remoting readfile().
 *
 * Note: Use this only when definitely no file cache entry exists.
 * Note: Unlocked, so caller must hold inode cache lock.
 *
 * @return number of bytes read or negative fhgfs error code
 */
ssize_t __FhgfsOpsHelper_readCacheFlushed(struct iov_iter *iter, size_t size, loff_t offset,
   FhgfsInode* fhgfsInode, FsFileInfo* fileInfo, RemotingIOInfo* ioInfo)
{
   App* app = ioInfo->app;
   NoAllocBufferStore* cacheStore = App_getCacheBufStore(app);
   InodeRefStore* refStore = App_getInodeRefStore(app);
   StripePattern* pattern = ioInfo->pattern;

   size_t currentChunkSize;
   size_t maxCacheLen;
   CacheBuffer* cacheBuffer;
   ssize_t readRes;
   ssize_t numIterCopy; // number of bytes to hand to the caller (read-ahead stays in the cache)

   // caching disabled?
   if(!FsFileInfo_getAllowCaching(fileInfo) ||
      (FsFileInfo_getCacheHits(fileInfo) <= 0) ||
      FhgfsInode_getIsFileOpenByMultipleReaders(fhgfsInode) )
      return FhgfsOpsRemoting_readfileVec(iter, size, offset, ioInfo, fhgfsInode);

   // check whether the read size is larger than a single cacheBuf or would span multiple chunks
   currentChunkSize = StripePattern_getChunkEnd(pattern, offset) - offset + 1;
   maxCacheLen = MIN(currentChunkSize, NoAllocBufferStore_getBufSize(cacheStore) );

   if(size >= maxCacheLen) // (Note: '=' because we don't want a completely used up buffer)
   { // scenario not allowed => read direct
      return FhgfsOpsRemoting_readfileVec(iter, size, offset, ioInfo, fhgfsInode);
   }

   // looks like we got something to cache here (read size is smaller than a single cacheBuf)

   // create new cache (if buffer available)

   cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode);

#ifdef BEEGFS_DEBUG
   BEEGFS_BUG_ON(cacheBuffer->buf, "Looks like we're about to leak a cache buffer");
#endif // BEEGFS_DEBUG

   cacheBuffer->buf = NoAllocBufferStore_instantBuf(cacheStore);
   if(!cacheBuffer->buf)
   { // no cache buffer left in the store => read direct
      return FhgfsOpsRemoting_readfileVec(iter, size, offset, ioInfo, fhgfsInode);
   }

   cacheBuffer->bufType = FileBufferType_READ; // (needed here for _discardCache() below)

   // we got a buffer => read as much as possible into the cache buffer

   if(!offset && (size < FSFILEINFO_CACHE_SLOWSTART_READLEN) )
      maxCacheLen = FSFILEINFO_CACHE_SLOWSTART_READLEN; /* reduce read-ahead at file start for
         small reads. good if e.g. a process is only looking at file starts */

   readRes = FhgfsOpsRemoting_readfile_kernel(cacheBuffer->buf, maxCacheLen, offset, ioInfo,
      fhgfsInode);
   if(readRes <= 0)
   { // error or immediate end of file
      __FhgfsOpsHelper_discardCache(app, fhgfsInode);
      return readRes;
   }

   numIterCopy = MIN((ssize_t) size, readRes);

   // init cache entry fields and copy data from cache buffer to client iter
   {
      size_t numCopied = copy_to_iter(cacheBuffer->buf, numIterCopy, iter);
      if (unlikely(numCopied != numIterCopy))
      { // copy failed
         Logger* log = App_getLogger(app);
         Logger_log(log, Log_DEBUG, __func__, "Buffer copy to userspace failed (invalid buffer)");

         __FhgfsOpsHelper_discardCache(app, fhgfsInode);

         return -FhgfsOpsErr_ADDRESSFAULT;
      }
   }

   cacheBuffer->bufUsageLen = readRes;
   cacheBuffer->bufUsageMaxLen = maxCacheLen;
   cacheBuffer->fileOffset = offset;

   // add to inode ref store for async flush
   InodeRefStore_addAndReferenceInode(refStore, BEEGFS_VFSINODE(fhgfsInode) );

   return numIterCopy;
}

/**
 * Discard the current cache buffer and return it to the store.
 *
 * Note: Unlocked, so caller must hold the inode cache lock.
 */
void __FhgfsOpsHelper_discardCache(App* app, FhgfsInode* fhgfsInode)
{
   NoAllocBufferStore* cacheStore = App_getCacheBufStore(app);
   InodeRefStore* refStore = App_getInodeRefStore(app);
   CacheBuffer* cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode);

#ifdef BEEGFS_DEBUG
   if( (cacheBuffer->buf == NULL) || (cacheBuffer->bufType == FileBufferType_NONE) )
   {
      BEEGFS_BUG_ON(1, "Attempting to discard an invalid cache buffer");
      return;
   }
#endif // BEEGFS_DEBUG

   NoAllocBufferStore_addBuf(cacheStore, cacheBuffer->buf);

   cacheBuffer->buf = NULL; // (NULL'ing required for debug sanity checks)
   cacheBuffer->bufType = FileBufferType_NONE;

   // remove inode from async flush store
   InodeRefStore_removeAndReleaseInode(refStore, BEEGFS_VFSINODE(fhgfsInode) );
}

/**
 * Acquire the exclusive MDS append lock for the given inode.
 *
 * On failure, ioInfo->needsAppendLockCleanup is set so a later cleanup can release a
 * possibly half-acquired lock.
 *
 * @return FhgfsOpsErr_SUCCESS on success, other FhgfsOpsErr code otherwise
 */
FhgfsOpsErr FhgfsOpsHelper_getAppendLock(FhgfsInode* inode, RemotingIOInfo* ioInfo)
{
   FhgfsOpsErr lockRes;

   FhgfsInode_entryInfoReadLock(inode); // LOCK EntryInfo

   lockRes = FhgfsOpsRemoting_flockAppendEx(&inode->entryInfo, &inode->entryInfoLock,
      ioInfo->app, ioInfo->fileHandleID, 0, current->pid, ENTRYLOCKTYPE_EXCLUSIVE, true);

   FhgfsInode_entryInfoReadUnlock(inode); // UNLOCK EntryInfo

   if(unlikely(lockRes != FhgfsOpsErr_SUCCESS) )
   {
      LOG_DEBUG_FORMATTED(App_getLogger(ioInfo->app), Log_DEBUG, __func__,
         "Append lock error: %s", FhgfsOpsErr_toErrString(lockRes) );
      SAFE_ASSIGN(ioInfo->needsAppendLockCleanup, true);
   }

   return lockRes;
}

/**
 * Release the MDS append lock previously acquired via _getAppendLock().
 *
 * On failure, ioInfo->needsAppendLockCleanup is set so a later cleanup can retry the unlock.
 *
 * @return FhgfsOpsErr_SUCCESS on success, other FhgfsOpsErr code otherwise
 */
FhgfsOpsErr FhgfsOpsHelper_releaseAppendLock(FhgfsInode* inode, RemotingIOInfo* ioInfo)
{
   FhgfsOpsErr unlockRes;

   FhgfsInode_entryInfoReadLock(inode); // LOCK EntryInfo

   unlockRes = FhgfsOpsRemoting_flockAppendEx(&inode->entryInfo, &inode->entryInfoLock,
      ioInfo->app, ioInfo->fileHandleID, 0, current->pid, ENTRYLOCKTYPE_UNLOCK, true);

   FhgfsInode_entryInfoReadUnlock(inode); // UNLOCK EntryInfo

   if(unlikely(unlockRes != FhgfsOpsErr_SUCCESS) )
      SAFE_ASSIGN(ioInfo->needsAppendLockCleanup, true);

   return unlockRes;
}

/**
 * Reads chunk by chunk from the servers and zero-fills the buffer if a server returns EOF.
 * So the caller must make sure that the file actually has at least the requested size.
 *
 * Note: Intended for sparse file reading.
 * Note: There is also a similar version for kernel buffers.
 */
FhgfsOpsErr FhgfsOpsHelper_readOrClearUser(App* app, struct iov_iter *iter,
   size_t size, loff_t offset, FsFileInfo* fileInfo, RemotingIOInfo* ioInfo)
{
   StripePattern* pattern = ioInfo->pattern;

   while(size)
   {
      // read at most up to the end of the current stripe chunk
      size_t currentChunkSize = StripePattern_getChunkEnd(pattern, offset) - offset + 1;
      size_t currentReadSize = MIN(currentChunkSize, size);
      ssize_t currentReadRes;

      currentReadRes = FhgfsOpsRemoting_readfileVec(iter, currentReadSize, offset, ioInfo, NULL);

      if(unlikely(currentReadRes < 0) )
         return -currentReadRes;

      if( (size_t)currentReadRes < currentReadSize)
      { // zero-fill the remainder (server returned EOF before the requested length)
         long clearVal;
         ssize_t nclear = currentReadSize - currentReadRes;
         clearVal = iov_iter_zero(nclear, iter);
         if (clearVal != nclear)
            return FhgfsOpsErr_ADDRESSFAULT;
      }

      offset += currentReadSize;
      size -= currentReadSize;
   }

   return FhgfsOpsErr_SUCCESS;
}

/**
 * Takes two paths that are relative to the mount point and creates a new path that is relative
 * from pathFromStr to pathToStr.
 *
 * @param outPathRelativeToStr will be kalloc'ed and needs to be kfree'd by the caller
 */
void FhgfsOpsHelper_getRelativeLinkStr(const char* pathFromStr, const char* pathToStr,
   char** outPathRelativeToStr)
{
   // the idea here is to prepend a "../" to pathTo for every parent dir of pathFrom
   int i;
   int pathFromLen; // (must be signed, because it might be negative for certain paths)
   Path pathFrom;
   Path pathTo; // will be modified to become the relative result path
   StrCpyList* pathFromElems;
   StrCpyList* pathToElems;

   Path_initFromString(&pathFrom, pathFromStr);
   Path_initFromString(&pathTo, pathToStr);

   pathFromElems = Path_getPathElems(&pathFrom);
   pathToElems = Path_getPathElems(&pathTo);

   pathFromLen = StrCpyList_length(pathFromElems);

   // insert a ".." for every parent dir in pathFrom
   // (note: -1 to exclude the final path element)
   for(i = 0; i < (pathFromLen-1); i++)
   {
      StrCpyList_addHead(pathToElems, "..");
   }

   Path_setAbsolute(&pathTo, false);

   *outPathRelativeToStr = Path_getPathAsStrCopy(&pathTo);

   Path_uninit(&pathFrom);
   Path_uninit(&pathTo);
}

/**
 * Note: creates the symlink as a normal file and writes the target to it
 *
 * @param mode access mode (permission flags)
 * @return 0 on success, negative linux error code otherwise
 */
int FhgfsOpsHelper_symlink(App* app, const EntryInfo* parentInfo, const char* to,
   struct CreateInfo* createInfo, EntryInfo* outEntryInfo)
{
   int retVal = 0;
   size_t toStrLen = strlen(to);
   FhgfsOpsErr mkRes;
   AtomicInt maxUsedTargetIndex;
   RemotingIOInfo ioInfo;
   FhgfsOpsErr openRes;
   size_t numTargets;
   ssize_t writeRes;
   FhgfsOpsErr closeRes;
   BitStore firstWriteDone;
   PathInfo pathInfo;
   const struct FileEvent* event;

   AtomicInt_init(&maxUsedTargetIndex, -1);

   // create the file

   // stash createInfo->fileEvent. we want to send it only during close (when the symlink is fully
   // created) - mkfile would generate its own event if createInfo->fileEvent was set.
   event = createInfo->fileEvent;
   createInfo->fileEvent = NULL;

   mkRes = FhgfsOpsRemoting_mkfile(app, parentInfo, createInfo, outEntryInfo);

   createInfo->fileEvent = event; // restore the stashed event for the close below

   if(mkRes != FhgfsOpsErr_SUCCESS)
   { // error
      retVal = FhgfsOpsErr_toSysErr(mkRes);
      goto err_exit;
   }

   // open the file

   memset(&pathInfo, 0, sizeof(pathInfo) );
   RemotingIOInfo_initOpen(app, OPENFILE_ACCESS_WRITE, &maxUsedTargetIndex, &pathInfo, &ioInfo);

   openRes = FhgfsOpsRemoting_openfile(outEntryInfo, &ioInfo, NULL, NULL);
   if(openRes != FhgfsOpsErr_SUCCESS)
   { // error => undo the mkfile
      FhgfsOpsRemoting_unlinkfile(app, parentInfo, createInfo->entryName, NULL);
      retVal = FhgfsOpsErr_toSysErr(openRes);
      goto err_cleanup_open;
   }

   numTargets = UInt16Vec_length(ioInfo.pattern->getStripeTargetIDs(ioInfo.pattern));
   BitStore_initWithSizeAndReset(&firstWriteDone, numTargets);
   ioInfo.firstWriteDone = &firstWriteDone;
   ioInfo.userID = createInfo->userID;
   ioInfo.groupID = createInfo->groupID;
#ifdef BEEGFS_NVFS
   ioInfo.nvfs = false;
#endif

   // write link-destination to the file
   writeRes = FhgfsOpsRemoting_writefile_kernel(to, toStrLen, 0, &ioInfo);
   if(writeRes < (ssize_t)toStrLen)
   { // error (or short write) => close and undo the mkfile
      FhgfsOpsHelper_closefileWithAsyncRetry(outEntryInfo, &ioInfo, NULL);
      FhgfsOpsRemoting_unlinkfile(app, parentInfo, createInfo->entryName, NULL);
      retVal = (writeRes < 0) ? FhgfsOpsErr_toSysErr(-writeRes) : FhgfsOpsErr_INTERNAL;
      goto err_cleanup_open;
   }

   // close the file

   // callee frees fileEvent and wants a non-const pointer to signify this. callers of _symlink
   // must be aware of this and not free the fileEvent themselves
   closeRes = FhgfsOpsHelper_closefileWithAsyncRetry(outEntryInfo, &ioInfo,
      (struct FileEvent*) createInfo->fileEvent);
   if(closeRes != FhgfsOpsErr_SUCCESS)
   { // error
      retVal = FhgfsOpsErr_toSysErr(closeRes);
      // createInfo->fileEvent has been taken care of, don't free it again
      createInfo->fileEvent = NULL;
      goto err_cleanup_open;
   }

   // clean-up
   BitStore_uninit(&firstWriteDone);
   RemotingIOInfo_freeVals(&ioInfo);

   return retVal;

err_cleanup_open:
   if (ioInfo.firstWriteDone)
      BitStore_uninit(ioInfo.firstWriteDone);
   RemotingIOInfo_freeVals(&ioInfo);
   EntryInfo_uninit(outEntryInfo);

   if (createInfo->fileEvent)
      FileEvent_uninit((struct FileEvent*) createInfo->fileEvent);

err_exit:
   return retVal;
}

/**
 * Opens a file (the file must exist), reads data from it and closes it.
 *
 * Note: This is really slow currently, because of the open/close overhead.
 *
 * @return number of read bytes or negative linux error code
 */
ssize_t FhgfsOpsHelper_readStateless(App* app, const EntryInfo* entryInfo,
   struct iov_iter *iter, size_t size, loff_t offset)
{
   /* NOTE(review): retVal is declared int although the function returns ssize_t and is
    * assigned readRes (ssize_t) — presumably fine for the sizes read here, but would
    * truncate reads larger than INT_MAX; confirm against callers. */
   int retVal = -EREMOTEIO;
   AtomicInt maxUsedTargetIndex;
   RemotingIOInfo ioInfo;
   FhgfsOpsErr openRes;
   ssize_t readRes;
   size_t numTargets;
   FhgfsOpsErr closeRes;
   BitStore firstWriteDone;
   PathInfo pathInfo;

   AtomicInt_init(&maxUsedTargetIndex, -1);

   // open file

   memset(&pathInfo, 0, sizeof(pathInfo) );
   RemotingIOInfo_initOpen(app, OPENFILE_ACCESS_READ, &maxUsedTargetIndex, &pathInfo, &ioInfo);

   openRes = FhgfsOpsRemoting_openfile(entryInfo, &ioInfo, NULL, NULL);
   if(openRes != FhgfsOpsErr_SUCCESS)
   { // error
      retVal = FhgfsOpsErr_toSysErr(openRes);
      goto clean_up_open;
   }

   numTargets = UInt16Vec_length(ioInfo.pattern->getStripeTargetIDs(ioInfo.pattern));
   BitStore_initWithSizeAndReset(&firstWriteDone, numTargets);
   ioInfo.firstWriteDone = &firstWriteDone;

   // read file
   readRes = FhgfsOpsRemoting_readfileVec(iter, size, offset, &ioInfo, NULL);
   if(readRes < 0)
   { // error => still close the remote file before bailing out
      FhgfsOpsHelper_closefileWithAsyncRetry(entryInfo, &ioInfo, NULL);
      retVal = FhgfsOpsErr_toSysErr(-readRes);
      goto clean_up_open;
   }

   retVal = readRes;

   // close the file
   closeRes = FhgfsOpsHelper_closefileWithAsyncRetry(entryInfo, &ioInfo, NULL);
   if(closeRes != FhgfsOpsErr_SUCCESS)
   { // error
      retVal = FhgfsOpsErr_toSysErr(closeRes);
   }

   // clean-up

clean_up_open:
   if (ioInfo.firstWriteDone)
      BitStore_uninit(ioInfo.firstWriteDone);
   RemotingIOInfo_freeVals(&ioInfo);

   return retVal;
}

/**
 * Writes the given buffer by getting a reference handle from the inode and releasing that handle
 * immediately after the write.
 *
 * Note: This can save the overhead for remote open/close if the file is already open for writing
 * (or reading+writing) by any other process, so it is way better than the _writeStateless()
 * method.
 *
 * @param offset offset in file, -1 for append
 * @return bytes written or negative fhgfs error code
 */
static ssize_t FhgfsOpsHelper_writeStatelessInode(FhgfsInode* fhgfsInode, const char *buf,
   size_t size, loff_t offset)
{
   FileHandleType handleType;
   RemotingIOInfo ioInfo;
   FhgfsOpsErr referenceRes;
   ssize_t writeRes;
   FhgfsOpsErr releaseRes;

   // open file

   /* referenceHandle needs a dentry only for possible TRUNC operations.
   */
   referenceRes = FhgfsInode_referenceHandle(fhgfsInode, NULL, OPENFILE_ACCESS_WRITE, true,
      NULL, &handleType, NULL);
   if(unlikely(referenceRes != FhgfsOpsErr_SUCCESS) )
   { // error: return negated FhgfsOpsErr (see function doc: negative fhgfs error code)
      return -referenceRes;
   }

   // write file
   FhgfsInode_getRefIOInfo(fhgfsInode, handleType, FhgfsInode_handleTypeToOpenFlags(handleType),
      &ioInfo);
   writeRes = FhgfsOpsHelper_writefileEx_kernel(fhgfsInode, buf, size, offset, &ioInfo);
   if(unlikely(writeRes < 0) )
   { // error: drop the handle reference before propagating the write error
      FhgfsInode_releaseHandle(fhgfsInode, handleType, NULL);
      return writeRes;
   }

   // close file (i.e. release the referenced handle)
   releaseRes = FhgfsInode_releaseHandle(fhgfsInode, handleType, NULL);
   if(unlikely(releaseRes != FhgfsOpsErr_SUCCESS) )
   { // error: release failure overrides the (successful) write result
      return -releaseRes;
   }

   return writeRes;
}

/**
 * Opens a file (the file must exist), writes the data to it and closes it.
 *
 * Note: This is really slow, because of the open/close overhead - use _writeStatelessInode()
 * method instead, if possible.
 *
 * @param app app context used for the remote open
 * @param entryInfo entry of the existing file to write to
 * @param iter source of the data to be written
 * @param size number of bytes to write
 * @param offset absolute offset in the file
 * @param uid user ID to be set in the ioInfo for the write
 * @param gid group ID to be set in the ioInfo for the write
 * @return number of written bytes or negative linux error code
 */
ssize_t FhgfsOpsHelper_writeStateless(App* app, const EntryInfo* entryInfo,
   struct iov_iter *iter, size_t size, loff_t offset, unsigned uid, unsigned gid)
{
   // NOTE(review): retVal is int while writeRes is ssize_t; a write larger than INT_MAX would
   // truncate on 64-bit — confirm callers bound 'size' accordingly
   int retVal = -EREMOTEIO;
   AtomicInt maxUsedTargetIndex;
   RemotingIOInfo ioInfo;
   FhgfsOpsErr openRes;
   ssize_t writeRes;
   FhgfsOpsErr closeRes;
   BitStore firstWriteDone;
   size_t numTargets;
   PathInfo pathInfo;

   AtomicInt_init(&maxUsedTargetIndex, -1);

   // open file
   memset(&pathInfo, 0, sizeof(pathInfo) );
   RemotingIOInfo_initOpen(app, OPENFILE_ACCESS_WRITE, &maxUsedTargetIndex, &pathInfo, &ioInfo);
   openRes = FhgfsOpsRemoting_openfile(entryInfo, &ioInfo, NULL, NULL);
   if(openRes != FhgfsOpsErr_SUCCESS)
   { // error (firstWriteDone not initialized yet; clean_up_open checks the pointer)
      retVal = FhgfsOpsErr_toSysErr(openRes);
      goto clean_up_open;
   }

   // firstWriteDone bitmap is sized to the number of stripe targets of the open pattern
   numTargets = UInt16Vec_length(ioInfo.pattern->getStripeTargetIDs(ioInfo.pattern));
   BitStore_initWithSizeAndReset(&firstWriteDone, numTargets);
   ioInfo.firstWriteDone = &firstWriteDone;
   ioInfo.userID = uid;
   ioInfo.groupID = gid;
#ifdef BEEGFS_NVFS
   ioInfo.nvfs = false;
#endif

   // write file
   // NOTE(review): unlike the read path, 'size' is not passed here — presumably the iov_iter
   // carries the length for writefileVec; confirm against FhgfsOpsRemoting_writefileVec
   writeRes = FhgfsOpsRemoting_writefileVec(iter, offset, &ioInfo, false);
   if(writeRes < 0)
   { // error: still close the remote file (best effort), then map the negated FhgfsOpsErr
      FhgfsOpsHelper_closefileWithAsyncRetry(entryInfo, &ioInfo, NULL);
      retVal = FhgfsOpsErr_toSysErr(-writeRes);
      goto clean_up_open;
   }

   retVal = writeRes;

   // close file
   closeRes = FhgfsOpsHelper_closefileWithAsyncRetry(entryInfo, &ioInfo, NULL);
   if(closeRes != FhgfsOpsErr_SUCCESS)
   { // error: close failure overrides the (successful) write result
      retVal = FhgfsOpsErr_toSysErr(closeRes);
   }

   // clean-up
clean_up_open:
   if (ioInfo.firstWriteDone)
      BitStore_uninit(ioInfo.firstWriteDone);
   RemotingIOInfo_freeVals(&ioInfo);
   return retVal;
}

/**
 * Flush cache buffer and return it to the store.
 *
 * Note: Unlocked, so caller must hold inode cache lock.
 *
 * @param app app context (passed through to the cache discard helper)
 * @param fhgfsInode inode whose file cache buffer is to be flushed
 * @param discardCacheOnError true to discard a write cache if the remote write
 *    was not successful; false to just release it to the store in this case
 * @return FhgfsOpsErr_SUCCESS on success (and for an empty or read-only cache)
 */
FhgfsOpsErr __FhgfsOpsHelper_flushCacheUnlocked(App* app, FhgfsInode* fhgfsInode,
   bool discardCacheOnError)
{
   /* note: we don't take a handle to an open file here, because we wouldn't know whether the given
      handle is a read-only handle that just needs to flush the write cache to establish a new read
      cache. that's why we use the _writeStatelessInode() method here.
*/ CacheBuffer* cacheBuffer = Fhgfsinode_getFileCacheBuffer(fhgfsInode); enum FileBufferType cacheType = cacheBuffer->bufType; ssize_t writeRes; if(cacheType == FileBufferType_NONE) return FhgfsOpsErr_SUCCESS; if(cacheType == FileBufferType_READ) { // file has a read cache => just discard the cached data __FhgfsOpsHelper_discardCache(app, fhgfsInode); return FhgfsOpsErr_SUCCESS; } // file has a write cache => send the cached data to the storage nodes writeRes = FhgfsOpsHelper_writeStatelessInode(fhgfsInode, cacheBuffer->buf, cacheBuffer->bufUsageLen, cacheBuffer->fileOffset); if(unlikely(writeRes < (ssize_t)cacheBuffer->bufUsageLen) ) { // write did not succeed if(discardCacheOnError) __FhgfsOpsHelper_discardCache(app, fhgfsInode); if(writeRes > 0) return FhgfsOpsErr_NOSPACE; return -writeRes; } // write succeeded __FhgfsOpsHelper_discardCache(app, fhgfsInode); return FhgfsOpsErr_SUCCESS; } /** * Flush cache buffer and return it to the store. * * @param discardCacheOnError true to discard a write cache if the remote write * was not successful; false to just release it to the store in this case */ FhgfsOpsErr FhgfsOpsHelper_flushCache(App* app, FhgfsInode* fhgfsInode, bool discardCacheOnError) { FhgfsOpsErr retVal; FhgfsInode_fileCacheExclusiveLock(fhgfsInode); // L O C K retVal = __FhgfsOpsHelper_flushCacheUnlocked(app, fhgfsInode, discardCacheOnError); FhgfsInode_fileCacheExclusiveUnlock(fhgfsInode); // U N L O C K return retVal; }