/* * fhgfs page cache methods * */ #include #include #include #include #include #include #include #include #include #include #include #include #include "FhgfsOpsDir.h" #include "FhgfsOpsFile.h" #include "FhgfsOpsHelper.h" #include "FhgfsOpsInode.h" #include "FhgfsOpsPages.h" #include "FhgfsOpsSuper.h" #include "FhgfsOps_versions.h" #include #include #include #include #include #include #include #ifdef CONFIG_COMPAT #include #endif #define INITIAL_FIND_PAGES (16) // search initially for this number of pages #define FHGFSOPSPAGES_pageVecListCacheName BEEGFS_MODULE_NAME_STR "-pageListVec" #define BEEGFS_PAGE_VEC_LIST_POOL_SIZE 8 // number of reserve page list-vecs struct FhgfsPageData; typedef struct FhgfsPageData FhgfsPageData; #if ((INITIAL_FIND_PAGES) > (BEEGFS_MAX_PAGE_LIST_SIZE)) #error // trigger a compilation error as we would end up with memory corruption in real live #endif static struct kmem_cache* FhgfsOpsPages_pageListVecCache = NULL; static mempool_t* FhgfsOpsPages_pageListVecPool = NULL; // forward declarations struct fhgfsWritePageHelper; typedef struct fhgfsWritePageHelper fhgfsWrPgHelper; static int _FhgfsOpsPages_sendPageVec(FhgfsPageData* pageData, struct inode* inode, bool isFinal, Fhgfs_RWType rwType); static FhgfsChunkPageVec* _FhgfsOpsPages_allocNewPageVec(FhgfsPageData* pageData); static inline FhgfsOpsErr _FhgfsOpsPages_referenceReadFileHandle(FhgfsPageData* writePageData, struct file* file); static inline FhgfsOpsErr _FhgfsOpsPages_referenceWriteFileHandle(FhgfsPageData* writePageData); static FhgfsOpsErr _FhgfsOpsPages_referenceFileHandle(FhgfsPageData* writePageData, unsigned openFlags); static int _FhgfsOpsPages_writepages(struct address_space* mapping, struct writeback_control* wbc, struct page* page); #ifdef KERNEL_HAS_FOLIO int _FhgfsOpsPages_readahead(struct readahead_control *ractl, struct page* page); #else int _FhgfsOpsPages_readpages(struct file* file, struct address_space* mapping, struct list_head* pageList, struct page* page); #endif #ifdef KERNEL_WRITEPAGE_HAS_FOLIO static int FhgfsOpsPages_writePageCallBack(struct folio *folio, struct writeback_control *wbc, void *data); #else static int FhgfsOpsPages_writePageCallBack(struct page *page, struct writeback_control *wbc, void *data); #endif static int FhgfsOpsPages_readPageCallBack(void *dataPtr, struct page *page); /** * A struct with variables to be exchanged between fhgfs writepages functions */ struct FhgfsPageData { struct inode* inode; FhgfsChunkPageVec *chunkPageVec; bool isReferenced; FileHandleType handleType; RemotingIOInfo ioInfo; }; /** * Initialize the pageListVecCache and pageListVec mempool */ bool FhgfsOpsPages_initPageListVecCache(void) { size_t cacheSize = sizeof(FhgfsPageListVec); FhgfsOpsPages_pageListVecCache = OsCompat_initKmemCache(FHGFSOPSPAGES_pageVecListCacheName, cacheSize, NULL); // create a kmem PageVecList allocation cache if (!FhgfsOpsPages_pageListVecCache) return false; FhgfsOpsPages_pageListVecPool = mempool_create(BEEGFS_PAGE_VEC_LIST_POOL_SIZE, mempool_alloc_slab, mempool_free_slab, FhgfsOpsPages_pageListVecCache); // create a mempool as last reserve for the PageVecList allocation cache if (!FhgfsOpsPages_pageListVecPool) { kmem_cache_destroy(FhgfsOpsPages_pageListVecCache); FhgfsOpsPages_pageListVecCache = NULL; return false; } return true; } /** * Destroy the pageListVecCache and pageListVec mempool */ void FhgfsOpsPages_destroyPageListVecCache(void) { // first destroy the pool, then the cache, as the pool uses cached objects if (FhgfsOpsPages_pageListVecPool) { mempool_destroy(FhgfsOpsPages_pageListVecPool); FhgfsOpsPages_pageListVecPool = NULL; } if (FhgfsOpsPages_pageListVecCache) { kmem_cache_destroy(FhgfsOpsPages_pageListVecCache); FhgfsOpsPages_pageListVecCache = NULL; } } /** * If the meta server has told us for some reason a wrong file-size (i_size) the caller would * wrongly discard data beyond i_size. So we are going to correct i_size here. * * This could be removed once we are sure the meta server *always* has the correct i_size * (which might never be the case, due to concurrent writes and truncates). * * Note: This is the non-inlined version. Only call it from * FhgfsOpsPages_incInodeFileSizeOnPagedRead() */ void __FhgfsOpsPages_incInodeFileSizeOnPagedRead(struct inode* inode, loff_t offset, size_t readRes) { App* app = FhgfsOps_getApp(inode->i_sb); Logger* log = App_getLogger(app); const char* logContext = "Paged-read"; loff_t i_size; FhgfsIsizeHints iSizeHints; FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); const EntryInfo* entryInfo = FhgfsInode_getEntryInfo(fhgfsInode); /* Refresh the inode first, hopefully that is sufficient * Note: The inode MUST NOT be flused from this functions, as the caller hold locked pages. * But on flushing the inode, mm/vfs will wait for page unlocks - it would deadlock! */ __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true); i_size = i_size_read(inode); if (unlikely(readRes && (offset + (loff_t)readRes > i_size) ) ) { // _refreshInode was not sufficient, force a meta-update FhgfsOpsErr refreshRes = FhgfsOpsRemoting_refreshEntry(app, entryInfo); if (refreshRes != FhgfsOpsErr_SUCCESS) { Logger_logErr(log, logContext, "Meta Refresh failed."); } // again try to refresh the inode, again the inode must not be flushed to avoid deadlocks __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true); /* note on i_lock/i_size: make sure we only increase i_size and do not decrease it (e.g. in case we're racing with a concurrent writer at a higher offset) */ spin_lock(&inode->i_lock); // L O C K i_size = i_size_read(inode); if (unlikely(offset + (loff_t)readRes > i_size) ) { // All attempts to update the remote inode size to the position that we read failed. /* i_size_write() sugguests to lock i_mutex, but we might end up here from write_begin() * (FhgfsOps_write_begin), which aready has i_mutex locked. The more common callers * readpages() and readpage() do not have i_mutex, though. */ i_size_write(inode, offset + readRes); spin_unlock(&inode->i_lock); // U N L O C K /* note: this situation can also be "normal" with a concurrent trunc, so we have to be careful regarding user warnings and error return values. */ Logger_logFormatted(log, Log_DEBUG, logContext, "Failed to increase MDS inode size to " "the expected value. (Application might read less data than expected or file was " "truncated during read operation. Expected size: %lld isSize: %lld)", offset + (loff_t)readRes, i_size); } else spin_unlock(&inode->i_lock); // U N L O C K } Logger_logFormatted(log, Log_DEBUG, logContext, "EntryID: %s Correcting inode size from %lld to %lld", entryInfo->entryID, i_size, offset + readRes); } /** * Just a small wrapper to write a pageVec * * Note: no-op if writePageData->chunkPageVec is NULL or chunkPageVec has a size of 0 * * Note: Will destroy writePageData->pageVec and create a new pageVec * * @param isFinalWrite If true no new pageVec will be allocated * * @return 0 on success, negative linux error code on error */ int _FhgfsOpsPages_sendPageVec(FhgfsPageData* pageData, struct inode* inode, bool isFinal, Fhgfs_RWType rwType) { const char* logContext = __func__; App* app = FhgfsOps_getApp(inode->i_sb); int retVal = 0; bool queueSuccess; if (!pageData->chunkPageVec) return retVal; // nothing to do if (FhgfsChunkPageVec_getSize(pageData->chunkPageVec) == 0) { // pageVec is empty if (isFinal) { FhgfsChunkPageVec_destroy(pageData->chunkPageVec); pageData->chunkPageVec = NULL; } return retVal; } queueSuccess = RWPagesWork_createQueue(app, pageData->chunkPageVec, inode, rwType); pageData->chunkPageVec = NULL; if (unlikely(!queueSuccess) ) { Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "Creating the async queue failed"); retVal = -ENOMEM; goto out; } if (!isFinal) { FhgfsChunkPageVec* newPageVec; // allocate the next chunkPageVec newPageVec = _FhgfsOpsPages_allocNewPageVec(pageData); if (unlikely(!newPageVec) ) retVal = -ENOMEM; } out: return retVal; } /** * Just allocate a pageVector */ FhgfsChunkPageVec* _FhgfsOpsPages_allocNewPageVec(FhgfsPageData* pageData) { struct inode* inode = pageData->inode; App* app = FhgfsOps_getApp(inode->i_sb); unsigned chunkPages = RemotingIOInfo_getNumPagesPerChunk(&pageData->ioInfo); FhgfsChunkPageVec* newPageVec; newPageVec = FhgfsChunkPageVec_create(app, inode, FhgfsOpsPages_pageListVecCache, FhgfsOpsPages_pageListVecPool ,chunkPages); pageData->chunkPageVec = newPageVec; // assign new pagevec return newPageVec; } FhgfsOpsErr _FhgfsOpsPages_referenceReadFileHandle(FhgfsPageData* writePageData, struct file* file) { FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file); unsigned openFlags = FsFileInfo_getAccessFlags(fileInfo) & OPENFILE_ACCESS_MASK_RW; return _FhgfsOpsPages_referenceFileHandle(writePageData, openFlags); } FhgfsOpsErr _FhgfsOpsPages_referenceWriteFileHandle(FhgfsPageData* writePageData) { unsigned openFlags = OPENFILE_ACCESS_WRITE; return _FhgfsOpsPages_referenceFileHandle(writePageData, openFlags); } /** * Reference the file if not already referenced and get the handleType * * @param file may be NULL (from writepages) */ FhgfsOpsErr _FhgfsOpsPages_referenceFileHandle(FhgfsPageData* writePageData, unsigned openFlags) { const char* logContext = "OpsPages_writeReferenceFileHandle"; FhgfsOpsErr referenceRes; struct inode* inode = writePageData->inode; FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); if (writePageData->isReferenced) return FhgfsOpsErr_SUCCESS; // already referenced, just return /* we will never truncate the file here, so referenceHandle will not send an event. with * no event to send, we don't need to supply a dentry. */ referenceRes = FhgfsInode_referenceHandle(fhgfsInode, NULL, openFlags, true, NULL, &(writePageData->handleType), NULL); if (referenceRes != FhgfsOpsErr_SUCCESS) { // failure Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "Referencing the file handle failed! Error: %s", FhgfsOpsErr_toErrString(referenceRes) ); } else { // success FhgfsInode_getRefIOInfo(fhgfsInode, writePageData->handleType, FhgfsInode_handleTypeToOpenFlags(writePageData->handleType), &writePageData->ioInfo); writePageData->isReferenced = true; } return referenceRes; } /** * Callback for the mm/vfs write_cache_pages function. * * Collect the given pages into data->pageVec. If data->pageVec cannot take more pages * (chunk is full) a write request will be send immediately * * @return 0 on success, negative linux error code on error */ #ifdef KERNEL_WRITEPAGE_HAS_FOLIO int FhgfsOpsPages_writePageCallBack(struct folio *folio, struct writeback_control *wbc, void *dataPtr) { struct page *page = &folio->page; #else int FhgfsOpsPages_writePageCallBack(struct page *page, struct writeback_control *wbc, void *dataPtr) { #endif const char* logContext = __func__; int retVal = 0; FhgfsPageData* writePageData = (FhgfsPageData*) dataPtr; struct inode* inode = writePageData->inode; loff_t fileSize = i_size_read(inode); pgoff_t endIndex = fileSize >> PAGE_SHIFT; int usedPageLen; const bool finalWrite = false; /* When called from this callBack method, finalWrite * is always false. */ bool pageVecWasSent = false; int referenceRes = _FhgfsOpsPages_referenceWriteFileHandle(writePageData); if (referenceRes != FhgfsOpsErr_SUCCESS) { retVal = FhgfsOpsErr_toSysErr(referenceRes); goto outWriteErr; } // note, only allocate the pageVec after referencing the file! if (!writePageData->chunkPageVec) { FhgfsChunkPageVec* pageVec = _FhgfsOpsPages_allocNewPageVec(writePageData); if (unlikely(!pageVec) ) { printk_fhgfs_debug(KERN_INFO, "%s:%d ENOMEM\n", __func__, __LINE__); retVal = -ENOMEM; goto outAgain; } } if (page->index < endIndex) /* in this case, the page is within the limits of the file */ usedPageLen = PAGE_SIZE; else { // the page does not entirely fit into the file size limit IGNORE_UNUSED_VARIABLE(logContext); usedPageLen = fileSize & ~PAGE_MASK; if (page->index > endIndex || !usedPageLen) { // Page is outside the file size limit, probably truncate in progess, ignore this page int writeRes; #ifdef BEEGFS_DEBUG { Logger_logFormattedWithEntryID(inode, Log_NOTICE, logContext, "Page outside file size limit. file-size: %llu page-offset: %llu, usedPageLen: %d " "pg-Idx: %lu endIdx: %lu", fileSize, page_offset(page), usedPageLen, page->index, endIndex); } #endif writeRes = _FhgfsOpsPages_sendPageVec(writePageData, inode, finalWrite, BEEGFS_RWTYPE_WRITE); if (unlikely(writeRes) ) { retVal = writeRes; goto outWriteErr; } // set- and end page-writeback to remove the page from dirty-page-tree set_page_writeback(page); end_page_writeback(page); // invalidate the page (for reads) as there is a truncate in process ClearPageUptodate(page); goto outUnlock; // don't re-dirty the page to avoid further write attempts } } while(1) // repeats only once, until pageVecWasSent == true { int pushSucces; pushSucces = FhgfsChunkPageVec_pushPage(writePageData->chunkPageVec, page, usedPageLen); if (!pushSucces) { // pageVec probably full, send it and create a new one int writeRes; if (unlikely(pageVecWasSent) ) { /* We already send the pageVec once, no need to do it again for an empty vec. * Probably of out memory */ Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "pageVec push failed, page-index: %ld", writePageData->chunkPageVec->firstPageIdx); retVal = -ENOMEM; goto outAgain; } else { #ifdef BEEGFS_DEBUG if (writePageData->chunkPageVec->size == 0) Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "initial push failed, index: %ld", writePageData->chunkPageVec->firstPageIdx); #endif } writeRes = _FhgfsOpsPages_sendPageVec(writePageData, inode, finalWrite, BEEGFS_RWTYPE_WRITE); pageVecWasSent = true; if (unlikely(writeRes) ) { Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "ChunkPageVec writing failed."); retVal = writeRes; goto outWriteErr; } if (unlikely(!writePageData->chunkPageVec) ) { Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "Warning, chunkPageVec is NULL."); goto outAgain; } else { continue; // try again to push the page } } else { // push success } break; // break on success } BUG_ON(PageWriteback(page)); set_page_writeback(page); return retVal; outWriteErr: set_bit(AS_EIO, &page->mapping->flags); unlock_page(page); return retVal; outAgain: // redirty and unlock the page, it will be handled again set_page_dirty(page); outUnlock: unlock_page(page); return retVal; } /** * address_space_operations.writepages method * * @retVal 0 on success, otherwise negative linux error code */ int _FhgfsOpsPages_writepages(struct address_space* mapping, struct writeback_control* wbc, struct page* page) { struct inode* inode = mapping->host; FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); int retVal; FhgfsPageData pageData = { .inode = inode, .chunkPageVec = NULL, .isReferenced = false, }; #ifdef LOG_DEBUG_MESSAGES { App* app = FhgfsOps_getApp(inode->i_sb); struct dentry* dentry = d_find_alias(inode); // calls dget_locked FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "(nr_to_write: %ld = %lluKiB)", wbc->nr_to_write, (long long) (wbc->nr_to_write << PAGE_SHIFT) / 1024); if(dentry) dput(dentry); } #endif // LOG_DEBUG_MESSAGES if (!page) { // writepages retVal = write_cache_pages(mapping, wbc, FhgfsOpsPages_writePageCallBack, &pageData); } else { // Called with a single page only, so we are called from ->writepage #ifdef KERNEL_WRITEPAGE_HAS_FOLIO struct folio *folio = page_folio(page); retVal = FhgfsOpsPages_writePageCallBack(folio, wbc, &pageData); #else retVal = FhgfsOpsPages_writePageCallBack(page, wbc, &pageData); #endif if (unlikely(retVal < 0) ) { // some kind of error if (unlikely(pageData.chunkPageVec) ) { printk_fhgfs(KERN_ERR, "Bug: pageData.chunkPageVec set, but error code returned\n"); dump_stack(); // try to clean it up FhgfsChunkPageVec_destroy(pageData.chunkPageVec); pageData.chunkPageVec = NULL; } } } if (pageData.chunkPageVec) { int writeRes; if (unlikely(!pageData.isReferenced) ) { // This would be a big bug and should be impossible at all! int referenceRes; printk_fhgfs(KERN_ERR, "%s: Bug: File is not referenced! ", __func__); dump_stack(); referenceRes = _FhgfsOpsPages_referenceWriteFileHandle(&pageData); if (unlikely(referenceRes) != FhgfsOpsErr_SUCCESS) { retVal = FhgfsOpsErr_toSysErr(referenceRes); FhgfsChunkPageVec_iterateAllHandleWritePages(pageData.chunkPageVec, referenceRes); FhgfsChunkPageVec_destroy(pageData.chunkPageVec); pageData.chunkPageVec = NULL; goto outReferenceErr; } } writeRes = _FhgfsOpsPages_sendPageVec(&pageData, inode, true, BEEGFS_RWTYPE_WRITE); if (unlikely(writeRes)) retVal = writeRes; } if (pageData.isReferenced) FhgfsInode_releaseHandle(fhgfsInode, pageData.handleType, NULL); outReferenceErr: #ifdef LOG_DEBUG_MESSAGES { App* app = FhgfsOps_getApp(inode->i_sb); struct dentry* dentry = d_find_alias(inode); // calls dget_locked FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "retVal: %d", retVal); if(dentry) dput(dentry); } #endif // LOG_DEBUG_MESSAGES return retVal; } /** * @return 0 on success, negative linux error code otherwise * * NOTE: page is already locked here */ int FhgfsOpsPages_writepage(struct page *page, struct writeback_control *wbc) { // Note: this is a writeback method, so we have no file handle here! (only the inode) // note: make sure that write-mapping is enabled in fhgfsops_mmap!!! #ifdef BEEGFS_DEBUG if(!PageUptodate(page) ) printk_fhgfs_debug(KERN_WARNING, "%s: Bug: page not up-to-date!\n", __func__); #endif return _FhgfsOpsPages_writepages(page->mapping, wbc, page); } int FhgfsOpsPages_writepages(struct address_space* mapping, struct writeback_control* wbc) { #ifdef LOG_DEBUG_MESSAGES { struct inode* inode = mapping->host; App* app = FhgfsOps_getApp(inode->i_sb); struct dentry* dentry = d_find_alias(inode); // calls dget_locked FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "(nr_to_write: %ld = %lluKiB)", wbc->nr_to_write, (long long) (wbc->nr_to_write << PAGE_SHIFT) / 1024); if(dentry) dput(dentry); } #endif // LOG_DEBUG_MESSAGES return _FhgfsOpsPages_writepages(mapping, wbc, NULL); } /** * Handle a written page * * Note: This must be only called once the server acknowledged it received the page * (successful write) * * @param writeRes positive number means the write succeeded, negative is standard error code */ void FhgfsOpsPages_endWritePage(struct page* page, int writeRes, struct inode* inode) { FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); kunmap(page); // page data access done, so unmap it if (unlikely(writeRes <= 0) ) { // write error if (writeRes == -EAGAIN || writeRes == 0) set_page_dirty(page); else { fhgfs_set_wb_error(page, writeRes); FhgfsInode_decNumDirtyPages(fhgfsInode); } } else FhgfsInode_decNumDirtyPages(fhgfsInode); end_page_writeback(page); unlock_page(page); } /** * We don't have data for this page, check if the page exceeds the file size limit * * Handle a short (sparse) read, which might have 2 two reasons * 1) Another client truncated the file, as this client has another inode-size * the kernel read-ahead would try forever to read missing data * 2) Sparse files, so one or more chunks have less data than following chunks * * As we do not know which applies, we always update the inode size first, but calling code * shall do this only once per chunk. * * */ bool FhgfsOpsPages_isShortRead(struct inode* inode, pgoff_t pageIndex, bool needInodeRefresh) { bool retVal = false; App* app = FhgfsOps_getApp(inode->i_sb); off_t iSize = i_size_read(inode); pgoff_t fileEndIndex = iSize >> PAGE_SHIFT; FhgfsIsizeHints iSizeHints; if (needInodeRefresh && pageIndex > fileEndIndex) { // page is outside the file size limit /* don't flush here, this called before FhgfsOpsPages_endReadPage(), so a page lock * is being held and if flush would try to lock this page, it deadlocks. */ __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true); needInodeRefresh = false; iSize = i_size_read(inode); fileEndIndex = iSize >> PAGE_SHIFT; } if (pageIndex < fileEndIndex) retVal = true; return retVal; } /** * Note: Needs to be called for all pages added to pageVec in FhgfsOps_readpagesVec() in order to * set the status and to unlock it. * * @param readRes the number of read bytes or negative linux error code */ void FhgfsOpsPages_endReadPage(Logger* log, struct inode* inode, struct FhgfsPage* fhgfsPage, int readRes) { struct page* page = fhgfsPage->page; loff_t offset = FhgfsPage_getFileOffset(fhgfsPage); const char* logContext = __func__; /* LogTopic_COMMKIT here, as the message belongs better to CommKitVec, although it is a page * relate method */ Logger_logTopFormatted(log, LogTopic_COMMKIT, Log_SPAM, logContext, "Page-index: %lu readRes: %d", page->index, readRes); FhgfsOpsPages_incInodeFileSizeOnPagedRead(inode, offset, readRes); if (readRes < 0) { fhgfs_set_wb_error(page, readRes); } else { /* note: There is no way to mark a page outside the filesize, so even with readRes == 0 * we need to SetPageUptodate(page) */ if (readRes && readRes < PAGE_SIZE) { // zero the remainder of the page for which we don't have data // zero_user_segment() would be optimal, but not available in older kernels // zero_user_segment(page, zeroOffset, BEEGFS_PAGE_SIZE); // BUT we can use our kmapped Fhgfs_Page directly for zeroing memset(fhgfsPage->data + readRes, 0, PAGE_SIZE - readRes); } flush_dcache_page(page); SetPageUptodate(page); } FhgfsPage_unmapUnlockReleasePage(page); } /** * Callback for the mm/vfs read_cache_pages function. * * Collect the given pages into data->pageVec. If data->pageVec cannot take more pages * (chunk is full) a read request will be send immediately. * * @return 0 on success, negative linux error code on error */ int FhgfsOpsPages_readPageCallBack(void *dataPtr, struct page *page) { const char* logContext = __func__; int retVal = 0; FhgfsPageData* pageData = (FhgfsPageData*) dataPtr; struct inode* inode = pageData->inode; bool pageVecWasSent = false; const bool finalRead = false; // is not the final read from this function // note, only allocate the pageVec after referencing the file! if (!pageData->chunkPageVec) { FhgfsChunkPageVec* pageVec = _FhgfsOpsPages_allocNewPageVec(pageData); if (unlikely(!pageVec) ) { printk_fhgfs_debug(KERN_INFO, "%s:%d ENOMEM\n", __func__, __LINE__); retVal = -ENOMEM; goto outErr; } } while(1) // repeats only once, until pageVecWasSent == true { bool pushSucces; pushSucces = FhgfsChunkPageVec_pushPage(pageData->chunkPageVec, page, PAGE_SIZE); if (!pushSucces) { // pageVec probably full, send it and create a new one int readRes; if (unlikely(pageVecWasSent) ) { /* We already send the pageVec once, no need to do it again for an empty vec. * Probably of out memory */ Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "pageVec push failed, page-index: %ld", pageData->chunkPageVec->firstPageIdx); retVal = -ENOMEM; goto outErr; } else { #ifdef BEEGFS_DEBUG if (pageData->chunkPageVec->size == 0) Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "initial push failed, index: %ld", pageData->chunkPageVec->firstPageIdx); #endif } readRes = _FhgfsOpsPages_sendPageVec(pageData, inode, finalRead, BEEGFS_RWTYPE_READ); pageVecWasSent = true; if (unlikely(readRes) ) { Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "ChunkPageVec writing failed."); retVal = readRes; goto outErr; } if (unlikely(!pageData->chunkPageVec) ) { Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "Warning, chunkPageVec is NULL."); retVal = -ENOMEM; goto outErr; } else { continue; // try again to push the page } } else { // push success } break; // break on success } get_page(page); // reference page to avoid it is re-used while we read it return retVal; outErr: return retVal; } /** * Read a page synchronously * * Note: Reads a locked page and returns it locked again (page will be unlocked in between) * */ int FhgfsOpsPages_readpageSync(struct file* file, struct page* page) { int retVal; ClearPageUptodate(page); #ifdef KERNEL_HAS_READ_FOLIO retVal = FhgfsOps_read_folio(file, page_folio(page)); // note: async read will unlock the page #else retVal = FhgfsOpsPages_readpage(file, page); // note: async read will unlock the page #endif if (retVal) { lock_page(page); // re-lock it return retVal; // some kind of error } lock_page(page); // re-lock it if (!PageUptodate(page) ) { /* The uptodate flag is not set, which means reading the page failed with an io-error. * Note: FhgfsOpsPages_readpage *must* SetPageUptoDate even if the page does not exist on * the server (i.e. file size too small), as other mm/vfs code requires that. * So if it is not set there was some kind of IO error. */ retVal = -EIO; } return retVal; } /* * Read a single page * Note: Called with the page locked and we need to unlock it when done. * * Note: Caller holds a reference to this page * Note: Reading the page is done asynchronously from another thread * * @return 0 on success, negative linux error code otherwise */ #ifdef KERNEL_HAS_READ_FOLIO int FhgfsOps_read_folio(struct file *file, struct folio *folio) #else int FhgfsOpsPages_readpage(struct file* file, struct page* page) #endif { App* app = FhgfsOps_getApp(file_dentry(file)->d_sb); struct inode* inode = file_inode(file); #ifdef KERNEL_HAS_READ_FOLIO struct page* page = &folio->page; DEFINE_READAHEAD(ractl, file, &file->f_ra, file->f_mapping, folio->index); #elif defined(KERNEL_HAS_FOLIO) DEFINE_READAHEAD(ractl, file, &file->f_ra, file->f_mapping, page->index); #endif int writeBackRes; int retVal; #ifdef KERNEL_HAS_READ_FOLIO FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "folio-index: %lu", folio->index); #else FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu", page->index); #endif IGNORE_UNUSED_VARIABLE(app); writeBackRes = FhgfsOpsPages_writeBackPage(inode, page); if (writeBackRes) { retVal = writeBackRes; goto outErr; } #if defined(KERNEL_HAS_FOLIO) retVal = _FhgfsOpsPages_readahead(&ractl, page); #if defined(KERNEL_HAS_READ_FOLIO) FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "folio-index: %lu retVal: %d", folio->index, retVal); #else FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu retVal: %d", page->index, retVal); #endif #else retVal = _FhgfsOpsPages_readpages(file, file->f_mapping, NULL, page); FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu retVal: %d", page->index, retVal); #endif return retVal; outErr: #ifdef KERNEL_HAS_READ_FOLIO folio_unlock(folio); FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "folio-index: %lu retVal: %d", folio->index, retVal); #else unlock_page(page); FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu retVal: %d", page->index, retVal); #endif return retVal; } #ifdef KERNEL_HAS_FOLIO int _FhgfsOpsPages_readahead(struct readahead_control *ractl, struct page *page) { struct inode* inode = ractl->mapping->host; struct file *file = ractl->file; #else int _FhgfsOpsPages_readpages(struct file* file, struct address_space* mapping, struct list_head* pageList, struct page* page) { struct inode* inode = mapping->host; #endif int referenceRes; FhgfsInode* fhgfsInode = BEEGFS_INODE(inode); int retVal = 0; FhgfsPageData pageData = { .inode = inode, .chunkPageVec = NULL, .isReferenced = false, }; referenceRes = _FhgfsOpsPages_referenceReadFileHandle(&pageData, file); if (unlikely(referenceRes) != FhgfsOpsErr_SUCCESS) { retVal = FhgfsOpsErr_toSysErr(referenceRes); return retVal; } ihold(inode); // make sure the inode is not released #ifdef KERNEL_HAS_FOLIO if (readahead_count(ractl)) { while ((page = readahead_page(ractl)) != NULL) { retVal = FhgfsOpsPages_readPageCallBack(&pageData, page); put_page(page); if (retVal) break; } } #else if (pageList) { // classical readpages, we get a list pages, which are not in the page cache yet retVal = read_cache_pages(mapping, pageList, FhgfsOpsPages_readPageCallBack, &pageData); } #endif else { /* Called with a single page only, that does not need to be added to the cache, * we are called from ->readpage */ retVal = FhgfsOpsPages_readPageCallBack(&pageData, page); if (unlikely(retVal < 0) ) { // some kind of error if (unlikely(pageData.chunkPageVec) ) { printk_fhgfs(KERN_ERR, "Bug: pageData.chunkPageVec set, but error code returned\n"); dump_stack(); // try to clean it up FhgfsChunkPageVec_destroy(pageData.chunkPageVec); pageData.chunkPageVec = NULL; } // unlock the page as it cannot be handled. unlock_page(page); } } if (pageData.chunkPageVec) { int readRes; if (unlikely(!pageData.isReferenced) ) { // This would be a big bug and should be impossible at all! printk_fhgfs(KERN_ERR, "%s: Bug: File is not referenced! ", __func__); dump_stack(); } readRes = _FhgfsOpsPages_sendPageVec(&pageData, inode, true, BEEGFS_RWTYPE_READ); if (unlikely(readRes)) retVal = readRes; } if (likely(pageData.isReferenced) ) FhgfsInode_releaseHandle(fhgfsInode, pageData.handleType, NULL); iput(inode); return retVal; } /** * address_space_operations.readahead method */ #ifdef KERNEL_HAS_FOLIO void FhgfsOpsPages_readahead(struct readahead_control *ractl) { struct file *file = ractl->file; struct dentry* dentry = file_dentry(file); App* app = FhgfsOps_getApp(dentry->d_sb); FhgfsOpsHelper_logOpDebug(app, dentry, ractl->mapping->host, __func__, "(nr_pages: %u)", readahead_count(ractl)); IGNORE_UNUSED_VARIABLE(app); _FhgfsOpsPages_readahead(ractl, NULL); return; } #else /** * address_space_operations.readpages method */ int FhgfsOpsPages_readpages(struct file* file, struct address_space* mapping, struct list_head* pageList, unsigned numPages) { struct dentry* dentry = file_dentry(file); struct inode* inode = mapping->host; App* app = FhgfsOps_getApp(dentry->d_sb); const char* logContext = __func__; FhgfsOpsHelper_logOpDebug(app, dentry, inode, logContext, "(num_pages: %u)", numPages); IGNORE_UNUSED_VARIABLE(logContext); IGNORE_UNUSED_VARIABLE(app); IGNORE_UNUSED_VARIABLE(inode); return _FhgfsOpsPages_readpages(file, mapping, pageList, NULL); } #endif /* * Write back all requests on one page - we do this before reading it. * * Note: page is locked and must stay locked */ int FhgfsOpsPages_writeBackPage(struct inode *inode, struct page *page) { struct writeback_control wbc; int ret; loff_t range_start = page_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); memset(&wbc, 0, sizeof(wbc) ); wbc.sync_mode = WB_SYNC_ALL; wbc.nr_to_write = 0; wbc.range_start = range_start; wbc.range_end = range_end; IGNORE_UNUSED_VARIABLE(range_start); IGNORE_UNUSED_VARIABLE(range_end); for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { ret = FhgfsOpsPages_writepage(page, &wbc); // note: unlocks the page lock_page(page); // re-lock the page if (ret < 0) goto out_error; continue; } ret = 0; if (!PagePrivate(page)) break; } out_error: return ret; }