/*
 * fhgfs page cache methods
 *
 */

#include <app/log/Logger.h>
#include <app/App.h>
#include <app/config/Config.h>
#include <common/toolkit/vector/StrCpyVec.h>
#include <common/toolkit/LockingTk.h>
#include <common/storage/StorageErrors.h>
#include <common/toolkit/StringTk.h>
#include <common/storage/striping/StripePattern.h>
#include <components/worker/RWPagesWork.h>

#include <filesystem/ProcFs.h>
#include <os/OsTypeConversion.h>
#include <os/OsCompat.h>
#include "FhgfsOpsDir.h"
#include "FhgfsOpsFile.h"
#include "FhgfsOpsHelper.h"
#include "FhgfsOpsInode.h"
#include "FhgfsOpsPages.h"
#include "FhgfsOpsSuper.h"
#include "FhgfsOps_versions.h"

#include <linux/writeback.h>
#include <linux/mm.h>
#include <linux/mpage.h>
#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/mempool.h>

#ifdef CONFIG_COMPAT
#include <asm/compat.h>
#endif

#define INITIAL_FIND_PAGES (16) // search initially for this number of pages

#define FHGFSOPSPAGES_pageVecListCacheName BEEGFS_MODULE_NAME_STR "-pageListVec"
#define BEEGFS_PAGE_VEC_LIST_POOL_SIZE 8 // number of reserve page list-vecs


struct FhgfsPageData;
typedef struct FhgfsPageData FhgfsPageData;


#if ((INITIAL_FIND_PAGES) > (BEEGFS_MAX_PAGE_LIST_SIZE))
#error // trigger a compilation error, as we would end up with memory corruption in real life
#endif
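
/* The check above guards the assumption that an initial page lookup batch (INITIAL_FIND_PAGES
 * pages) always fits into a single page list vec of BEEGFS_MAX_PAGE_LIST_SIZE entries; a larger
 * batch would write past the end of the vec. */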

static struct kmem_cache* FhgfsOpsPages_pageListVecCache = NULL;
static mempool_t* FhgfsOpsPages_pageListVecPool = NULL;

// forward declarations
struct fhgfsWritePageHelper;
typedef struct fhgfsWritePageHelper fhgfsWrPgHelper;


static int _FhgfsOpsPages_sendPageVec(FhgfsPageData* pageData,
   struct inode* inode, bool isFinal, Fhgfs_RWType rwType);
static FhgfsChunkPageVec* _FhgfsOpsPages_allocNewPageVec(FhgfsPageData* pageData);
static inline FhgfsOpsErr _FhgfsOpsPages_referenceReadFileHandle(FhgfsPageData* writePageData,
   struct file* file);
static inline FhgfsOpsErr _FhgfsOpsPages_referenceWriteFileHandle(FhgfsPageData* writePageData);
static FhgfsOpsErr _FhgfsOpsPages_referenceFileHandle(FhgfsPageData* writePageData,
   unsigned openFlags);

static int _FhgfsOpsPages_writepages(struct address_space* mapping, struct writeback_control* wbc,
   struct page* page);
#ifdef KERNEL_HAS_FOLIO
int _FhgfsOpsPages_readahead(struct readahead_control *ractl, struct page* page);
#else
int _FhgfsOpsPages_readpages(struct file* file, struct address_space* mapping,
   struct list_head* pageList, struct page* page);
#endif
#ifdef KERNEL_WRITEPAGE_HAS_FOLIO
static int FhgfsOpsPages_writePageCallBack(struct folio *folio, struct writeback_control *wbc,
   void *data);
#else
static int FhgfsOpsPages_writePageCallBack(struct page *page, struct writeback_control *wbc,
   void *data);
#endif
static int FhgfsOpsPages_readPageCallBack(void *dataPtr, struct page *page);


/**
 * A struct with variables to be exchanged between fhgfs writepages functions
 */
struct FhgfsPageData
{
   struct inode* inode;

   FhgfsChunkPageVec *chunkPageVec;

   bool isReferenced;
   FileHandleType handleType;
   RemotingIOInfo ioInfo;
};
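
/* Roughly: inode is the file the collected pages belong to; chunkPageVec gathers the pages of
 * the current chunk and is handed off via _FhgfsOpsPages_sendPageVec() once it is full (or on
 * the final send); isReferenced, handleType and ioInfo are filled in by
 * _FhgfsOpsPages_referenceFileHandle() and released again by the callers through
 * FhgfsInode_releaseHandle(). */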

/**
 * Initialize the pageListVecCache and pageListVec mempool
 */
bool FhgfsOpsPages_initPageListVecCache(void)
{
   size_t cacheSize = sizeof(FhgfsPageListVec);

   FhgfsOpsPages_pageListVecCache =
      OsCompat_initKmemCache(FHGFSOPSPAGES_pageVecListCacheName, cacheSize, NULL);

   // create a kmem PageVecList allocation cache
   if (!FhgfsOpsPages_pageListVecCache)
      return false;

   FhgfsOpsPages_pageListVecPool = mempool_create(BEEGFS_PAGE_VEC_LIST_POOL_SIZE,
      mempool_alloc_slab, mempool_free_slab, FhgfsOpsPages_pageListVecCache);

   // create a mempool as last reserve for the PageVecList allocation cache
   if (!FhgfsOpsPages_pageListVecPool)
   {
      kmem_cache_destroy(FhgfsOpsPages_pageListVecCache);
      FhgfsOpsPages_pageListVecCache = NULL;

      return false;
   }

   return true;
}
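
/* Typical pairing (sketch; the exact call sites live in the module init/shutdown code, not here):
 *
 *    if (!FhgfsOpsPages_initPageListVecCache() )
 *       return false; // fail initialization
 *    ...
 *    FhgfsOpsPages_destroyPageListVecCache(); // on shutdown, after all page I/O has completed
 */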

/**
 * Destroy the pageListVecCache and pageListVec mempool
 */
void FhgfsOpsPages_destroyPageListVecCache(void)
{
   // first destroy the pool, then the cache, as the pool uses cached objects
   if (FhgfsOpsPages_pageListVecPool)
   {
      mempool_destroy(FhgfsOpsPages_pageListVecPool);
      FhgfsOpsPages_pageListVecPool = NULL;
   }

   if (FhgfsOpsPages_pageListVecCache)
   {
      kmem_cache_destroy(FhgfsOpsPages_pageListVecCache);
      FhgfsOpsPages_pageListVecCache = NULL;
   }
}

/**
 * If the meta server has for some reason told us a wrong file size (i_size), the caller would
 * wrongly discard data beyond i_size. So we are going to correct i_size here.
 *
 * This could be removed once we are sure the meta server *always* has the correct i_size
 * (which might never be the case, due to concurrent writes and truncates).
 *
 * Note: This is the non-inlined version. Only call it from
 * FhgfsOpsPages_incInodeFileSizeOnPagedRead()
 */
void __FhgfsOpsPages_incInodeFileSizeOnPagedRead(struct inode* inode, loff_t offset, size_t readRes)
{
   App* app = FhgfsOps_getApp(inode->i_sb);
   Logger* log = App_getLogger(app);
   const char* logContext = "Paged-read";
   loff_t i_size;

   FhgfsIsizeHints iSizeHints;

   FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
   const EntryInfo* entryInfo = FhgfsInode_getEntryInfo(fhgfsInode);

   /* Refresh the inode first, hopefully that is sufficient.
    * Note: The inode MUST NOT be flushed from this function, as the caller holds locked pages.
    * But on flushing the inode, mm/vfs will wait for page unlocks - it would deadlock!
    */
   __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true);

   i_size = i_size_read(inode);

   if (unlikely(readRes && (offset + (loff_t)readRes > i_size) ) )
   { // _refreshInode was not sufficient, force a meta-update
      FhgfsOpsErr refreshRes = FhgfsOpsRemoting_refreshEntry(app, entryInfo);

      if (refreshRes != FhgfsOpsErr_SUCCESS)
      {
         Logger_logErr(log, logContext, "Meta Refresh failed.");
      }

      // again try to refresh the inode; again the inode must not be flushed to avoid deadlocks
      __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true);


      /* note on i_lock/i_size: make sure we only increase i_size and do not decrease it
         (e.g. in case we're racing with a concurrent writer at a higher offset) */
      spin_lock(&inode->i_lock); // L O C K

      i_size = i_size_read(inode);

      if (unlikely(offset + (loff_t)readRes > i_size) )
      { // All attempts to update the remote inode size to the position that we read failed.

         /* i_size_write() suggests to lock i_mutex, but we might end up here from write_begin()
          * (FhgfsOps_write_begin), which already has i_mutex locked. The more common callers
          * readpages() and readpage() do not have i_mutex, though. */

         i_size_write(inode, offset + readRes);

         spin_unlock(&inode->i_lock); // U N L O C K

         /* note: this situation can also be "normal" with a concurrent trunc, so we have to be
            careful regarding user warnings and error return values. */
         Logger_logFormatted(log, Log_DEBUG, logContext, "Failed to increase MDS inode size to "
            "the expected value. (Application might read less data than expected or file was "
            "truncated during read operation. Expected size: %lld i_size: %lld)",
            offset + (loff_t)readRes, i_size);
      }
      else
         spin_unlock(&inode->i_lock); // U N L O C K
   }

   Logger_logFormatted(log, Log_DEBUG, logContext,
      "EntryID: %s Correcting inode size from %lld to %lld",
      entryInfo->entryID, i_size, offset + readRes);
}
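
/* Example of the correction above: a paged read at offset 8192 returns readRes 4096 while the
 * cached i_size is still 8192 (stale metadata); offset + readRes = 12288 > 8192, so after the
 * refresh attempts i_size is bumped to 12288 and the data just read is not discarded. */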

/**
 * Just a small wrapper to send a pageVec (for reading or writing, depending on rwType)
 *
 * Note: no-op if pageData->chunkPageVec is NULL or chunkPageVec has a size of 0
 *
 * Note: Hands off pageData->chunkPageVec and allocates a new pageVec
 *
 * @param isFinal If true, no new pageVec will be allocated
 *
 * @return 0 on success, negative linux error code on error
 */
int _FhgfsOpsPages_sendPageVec(FhgfsPageData* pageData, struct inode* inode, bool isFinal,
   Fhgfs_RWType rwType)
{
   const char* logContext = __func__;
   App* app = FhgfsOps_getApp(inode->i_sb);

   int retVal = 0;
   bool queueSuccess;

   if (!pageData->chunkPageVec)
      return retVal; // nothing to do

   if (FhgfsChunkPageVec_getSize(pageData->chunkPageVec) == 0)
   { // pageVec is empty
      if (isFinal)
      {
         FhgfsChunkPageVec_destroy(pageData->chunkPageVec);
         pageData->chunkPageVec = NULL;
      }

      return retVal;
   }

   queueSuccess = RWPagesWork_createQueue(app, pageData->chunkPageVec, inode, rwType);
   pageData->chunkPageVec = NULL;

   if (unlikely(!queueSuccess) )
   {
      Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "Creating the async queue failed");

      retVal = -ENOMEM;
      goto out;
   }

   if (!isFinal)
   {
      FhgfsChunkPageVec* newPageVec;

      // allocate the next chunkPageVec
      newPageVec = _FhgfsOpsPages_allocNewPageVec(pageData);

      if (unlikely(!newPageVec) )
         retVal = -ENOMEM;
   }

out:
   return retVal;
}
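
/* Note on ownership: RWPagesWork_createQueue() presumably takes over the chunkPageVec and
 * performs the remote I/O asynchronously from worker threads, which is why chunkPageVec is set
 * to NULL here right after the call instead of being destroyed by the sender. */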

/**
 * Just allocate a pageVector
 */
FhgfsChunkPageVec* _FhgfsOpsPages_allocNewPageVec(FhgfsPageData* pageData)
{
   struct inode* inode = pageData->inode;
   App* app = FhgfsOps_getApp(inode->i_sb);
   unsigned chunkPages = RemotingIOInfo_getNumPagesPerChunk(&pageData->ioInfo);
   FhgfsChunkPageVec* newPageVec;

   newPageVec = FhgfsChunkPageVec_create(app, inode, FhgfsOpsPages_pageListVecCache,
      FhgfsOpsPages_pageListVecPool, chunkPages);

   pageData->chunkPageVec = newPageVec; // assign new pagevec

   return newPageVec;
}

FhgfsOpsErr _FhgfsOpsPages_referenceReadFileHandle(FhgfsPageData* writePageData, struct file* file)
{
   FsFileInfo* fileInfo = __FhgfsOps_getFileInfo(file);
   unsigned openFlags = FsFileInfo_getAccessFlags(fileInfo) & OPENFILE_ACCESS_MASK_RW;

   return _FhgfsOpsPages_referenceFileHandle(writePageData, openFlags);
}

FhgfsOpsErr _FhgfsOpsPages_referenceWriteFileHandle(FhgfsPageData* writePageData)
{
   unsigned openFlags = OPENFILE_ACCESS_WRITE;

   return _FhgfsOpsPages_referenceFileHandle(writePageData, openFlags);
}

/**
 * Reference the file if not already referenced and get the handleType
 *
 * Note: Called without a struct file when coming from writepages, so only the inode is used.
 */
FhgfsOpsErr _FhgfsOpsPages_referenceFileHandle(FhgfsPageData* writePageData, unsigned openFlags)
{
   const char* logContext = "OpsPages_writeReferenceFileHandle";
   FhgfsOpsErr referenceRes;

   struct inode* inode = writePageData->inode;
   FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);

   if (writePageData->isReferenced)
      return FhgfsOpsErr_SUCCESS; // already referenced, just return

   /* we will never truncate the file here, so referenceHandle will not send an event. With
    * no event to send, we don't need to supply a dentry. */
   referenceRes = FhgfsInode_referenceHandle(fhgfsInode, NULL, openFlags, true, NULL,
      &(writePageData->handleType), NULL);

   if (referenceRes != FhgfsOpsErr_SUCCESS)
   { // failure
      Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
         "Referencing the file handle failed! Error: %s", FhgfsOpsErr_toErrString(referenceRes) );
   }
   else
   { // success
      FhgfsInode_getRefIOInfo(fhgfsInode, writePageData->handleType,
         FhgfsInode_handleTypeToOpenFlags(writePageData->handleType), &writePageData->ioInfo);

      writePageData->isReferenced = true;
   }

   return referenceRes;
}
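
/* Every successful reference taken here must be paired with a FhgfsInode_releaseHandle() on
 * pageData.handleType once the pageVec has been handed off, as done at the end of
 * _FhgfsOpsPages_writepages() and _FhgfsOpsPages_readpages()/_readahead(). */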

/**
 * Callback for the mm/vfs write_cache_pages function.
 *
 * Collect the given pages into data->chunkPageVec. If data->chunkPageVec cannot take more pages
 * (chunk is full) a write request will be sent immediately.
 *
 * @return 0 on success, negative linux error code on error
 */
#ifdef KERNEL_WRITEPAGE_HAS_FOLIO
int FhgfsOpsPages_writePageCallBack(struct folio *folio, struct writeback_control *wbc, void *dataPtr)
{
   struct page *page = &folio->page;
#else
int FhgfsOpsPages_writePageCallBack(struct page *page, struct writeback_control *wbc, void *dataPtr)
{
#endif
   const char* logContext = __func__;
   int retVal = 0;

   FhgfsPageData* writePageData = (FhgfsPageData*) dataPtr;

   struct inode* inode = writePageData->inode;

   loff_t fileSize = i_size_read(inode);
   pgoff_t endIndex = fileSize >> PAGE_SHIFT;

   int usedPageLen;

   const bool finalWrite = false; /* When called from this callBack method, finalWrite
                                   * is always false. */

   bool pageVecWasSent = false;

   int referenceRes = _FhgfsOpsPages_referenceWriteFileHandle(writePageData);


   if (referenceRes != FhgfsOpsErr_SUCCESS)
   {
      retVal = FhgfsOpsErr_toSysErr(referenceRes);
      goto outWriteErr;
   }

   // note, only allocate the pageVec after referencing the file!
   if (!writePageData->chunkPageVec)
   {
      FhgfsChunkPageVec* pageVec = _FhgfsOpsPages_allocNewPageVec(writePageData);
      if (unlikely(!pageVec) )
      {
         printk_fhgfs_debug(KERN_INFO, "%s:%d ENOMEM\n", __func__, __LINE__);
         retVal = -ENOMEM;
         goto outAgain;
      }
   }

   if (page->index < endIndex)
      /* in this case, the page is within the limits of the file */
      usedPageLen = PAGE_SIZE;
   else
   { // the page does not entirely fit into the file size limit
      IGNORE_UNUSED_VARIABLE(logContext);

      usedPageLen = fileSize & ~PAGE_MASK;

      if (page->index > endIndex || !usedPageLen)
      { // Page is outside the file size limit, probably a truncate is in progress; ignore this page
         int writeRes;

#ifdef BEEGFS_DEBUG
         {
            Logger_logFormattedWithEntryID(inode, Log_NOTICE, logContext,
               "Page outside file size limit. file-size: %llu page-offset: %llu, usedPageLen: %d "
               "pg-Idx: %lu endIdx: %lu",
               fileSize, page_offset(page), usedPageLen, page->index, endIndex);
         }
#endif

         writeRes = _FhgfsOpsPages_sendPageVec(writePageData, inode, finalWrite,
            BEEGFS_RWTYPE_WRITE);
         if (unlikely(writeRes) )
         {
            retVal = writeRes;
            goto outWriteErr;
         }

         // set- and end page-writeback to remove the page from the dirty-page-tree
         set_page_writeback(page);
         end_page_writeback(page);

         // invalidate the page (for reads) as there is a truncate in progress
         ClearPageUptodate(page);

         goto outUnlock; // don't re-dirty the page to avoid further write attempts
      }
   }


   while(1) // repeats only once, until pageVecWasSent == true
   {
      int pushSucces;

      pushSucces = FhgfsChunkPageVec_pushPage(writePageData->chunkPageVec, page, usedPageLen);

      if (!pushSucces)
      { // pageVec probably full, send it and create a new one
         int writeRes;

         if (unlikely(pageVecWasSent) )
         { /* We already sent the pageVec once, no need to do it again for an empty vec.
            * Probably out of memory. */
            Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
               "pageVec push failed, page-index: %ld", writePageData->chunkPageVec->firstPageIdx);

            retVal = -ENOMEM;
            goto outAgain;
         }
         else
         {
#ifdef BEEGFS_DEBUG
            if (writePageData->chunkPageVec->size == 0)
               Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
                  "initial push failed, index: %ld", writePageData->chunkPageVec->firstPageIdx);
#endif
         }


         writeRes = _FhgfsOpsPages_sendPageVec(writePageData, inode, finalWrite,
            BEEGFS_RWTYPE_WRITE);

         pageVecWasSent = true;

         if (unlikely(writeRes) )
         {
            Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "ChunkPageVec writing failed.");

            retVal = writeRes;
            goto outWriteErr;
         }

         if (unlikely(!writePageData->chunkPageVec) )
         {
            Logger_logFormattedWithEntryID(inode, Log_ERR, logContext, "Warning, chunkPageVec is NULL.");

            goto outAgain;
         }
         else
         {
            continue; // try again to push the page
         }
      }
      else
      { // push success
      }

      break; // break on success
   }

   BUG_ON(PageWriteback(page));
   set_page_writeback(page);

   return retVal;


outWriteErr:
   set_bit(AS_EIO, &page->mapping->flags);
   unlock_page(page);
   return retVal;

outAgain: // redirty and unlock the page, it will be handled again
   set_page_dirty(page);

outUnlock:
   unlock_page(page);
   return retVal;
}
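
/* Flow summary: pages handed in by write_cache_pages() (or by ->writepage via
 * _FhgfsOpsPages_writepages) are collected into the chunkPageVec; once a page no longer fits the
 * current chunk, the vec is queued for asynchronous writing and the push is retried with a fresh
 * vec. Pages are marked for writeback here and completed later in FhgfsOpsPages_endWritePage(). */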

/**
 * address_space_operations.writepages method
 *
 * @return 0 on success, otherwise negative linux error code
 */
int _FhgfsOpsPages_writepages(struct address_space* mapping, struct writeback_control* wbc,
   struct page* page)
{
   struct inode* inode = mapping->host;
   FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
   int retVal;

   FhgfsPageData pageData =
   {
      .inode = inode,
      .chunkPageVec = NULL,
      .isReferenced = false,
   };

#ifdef LOG_DEBUG_MESSAGES
   {
      App* app = FhgfsOps_getApp(inode->i_sb);
      struct dentry* dentry = d_find_alias(inode); // calls dget_locked

      FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "(nr_to_write: %ld = %lluKiB)",
         wbc->nr_to_write, (long long) (wbc->nr_to_write << PAGE_SHIFT) / 1024);

      if(dentry)
         dput(dentry);
   }
#endif // LOG_DEBUG_MESSAGES


   if (!page)
   { // writepages
      retVal = write_cache_pages(mapping, wbc, FhgfsOpsPages_writePageCallBack, &pageData);
   }
   else
   { // Called with a single page only, so we are called from ->writepage
#ifdef KERNEL_WRITEPAGE_HAS_FOLIO
      struct folio *folio = page_folio(page);
      retVal = FhgfsOpsPages_writePageCallBack(folio, wbc, &pageData);
#else
      retVal = FhgfsOpsPages_writePageCallBack(page, wbc, &pageData);
#endif
      if (unlikely(retVal < 0) )
      { // some kind of error
         if (unlikely(pageData.chunkPageVec) )
         {
            printk_fhgfs(KERN_ERR,
               "Bug: pageData.chunkPageVec set, but error code returned\n");
            dump_stack();

            // try to clean it up
            FhgfsChunkPageVec_destroy(pageData.chunkPageVec);
            pageData.chunkPageVec = NULL;
         }
      }
   }

   if (pageData.chunkPageVec)
   {
      int writeRes;

      if (unlikely(!pageData.isReferenced) )
      { // This would be a serious bug and should be impossible!
         int referenceRes;

         printk_fhgfs(KERN_ERR, "%s: Bug: File is not referenced! ", __func__);
         dump_stack();

         referenceRes = _FhgfsOpsPages_referenceWriteFileHandle(&pageData);
         if (unlikely(referenceRes != FhgfsOpsErr_SUCCESS) )
         {
            retVal = FhgfsOpsErr_toSysErr(referenceRes);
            FhgfsChunkPageVec_iterateAllHandleWritePages(pageData.chunkPageVec, referenceRes);

            FhgfsChunkPageVec_destroy(pageData.chunkPageVec);
            pageData.chunkPageVec = NULL;
            goto outReferenceErr;
         }
      }

      writeRes = _FhgfsOpsPages_sendPageVec(&pageData, inode, true, BEEGFS_RWTYPE_WRITE);

      if (unlikely(writeRes))
         retVal = writeRes;
   }

   if (pageData.isReferenced)
      FhgfsInode_releaseHandle(fhgfsInode, pageData.handleType, NULL);

outReferenceErr:

#ifdef LOG_DEBUG_MESSAGES
   {
      App* app = FhgfsOps_getApp(inode->i_sb);
      struct dentry* dentry = d_find_alias(inode); // calls dget_locked

      FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "retVal: %d", retVal);

      if(dentry)
         dput(dentry);
   }
#endif // LOG_DEBUG_MESSAGES

   return retVal;
}

/**
 * @return 0 on success, negative linux error code otherwise
 *
 * NOTE: page is already locked here
 */
int FhgfsOpsPages_writepage(struct page *page, struct writeback_control *wbc)
{
   // Note: this is a writeback method, so we have no file handle here! (only the inode)

   // note: make sure that write-mapping is enabled in fhgfsops_mmap!!!

#ifdef BEEGFS_DEBUG
   if(!PageUptodate(page) )
      printk_fhgfs_debug(KERN_WARNING, "%s: Bug: page not up-to-date!\n", __func__);
#endif

   return _FhgfsOpsPages_writepages(page->mapping, wbc, page);
}

int FhgfsOpsPages_writepages(struct address_space* mapping, struct writeback_control* wbc)
{
#ifdef LOG_DEBUG_MESSAGES
   {
      struct inode* inode = mapping->host;
      App* app = FhgfsOps_getApp(inode->i_sb);
      struct dentry* dentry = d_find_alias(inode); // calls dget_locked

      FhgfsOpsHelper_logOpDebug(app, dentry, inode, __func__, "(nr_to_write: %ld = %lluKiB)",
         wbc->nr_to_write, (long long) (wbc->nr_to_write << PAGE_SHIFT) / 1024);

      if(dentry)
         dput(dentry);
   }
#endif // LOG_DEBUG_MESSAGES

   return _FhgfsOpsPages_writepages(mapping, wbc, NULL);
}


/**
 * Handle a written page
 *
 * Note: This must only be called once the server has acknowledged that it received the page
 * (successful write)
 *
 * @param writeRes a positive number means the write succeeded, negative is a standard error code
 */
void FhgfsOpsPages_endWritePage(struct page* page, int writeRes, struct inode* inode)
{
   FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);

   kunmap(page); // page data access done, so unmap it

   if (unlikely(writeRes <= 0) )
   { // write error
      if (writeRes == -EAGAIN || writeRes == 0)
         set_page_dirty(page);
      else
      {
         fhgfs_set_wb_error(page, writeRes);
         FhgfsInode_decNumDirtyPages(fhgfsInode);
      }
   }
   else
      FhgfsInode_decNumDirtyPages(fhgfsInode);

   end_page_writeback(page);
   unlock_page(page);
}


/**
 * We don't have data for this page, check if the page exceeds the file size limit.
 *
 * Handle a short (sparse) read, which might have two reasons:
 * 1) Another client truncated the file; as this client has a different inode size,
 *    the kernel read-ahead would try forever to read the missing data
 * 2) Sparse files, so one or more chunks have less data than the following chunks
 *
 * As we do not know which applies, we always update the inode size first, but calling code
 * shall do this only once per chunk.
 */
bool FhgfsOpsPages_isShortRead(struct inode* inode, pgoff_t pageIndex,
   bool needInodeRefresh)
{
   bool retVal = false;
   App* app = FhgfsOps_getApp(inode->i_sb);

   off_t iSize = i_size_read(inode);
   pgoff_t fileEndIndex = iSize >> PAGE_SHIFT;

   FhgfsIsizeHints iSizeHints;

   if (needInodeRefresh && pageIndex > fileEndIndex)
   { // page is outside the file size limit

      /* don't flush here, this is called before FhgfsOpsPages_endReadPage(), so a page lock
       * is being held and if the flush tried to lock this page, it would deadlock. */
      __FhgfsOps_doRefreshInode(app, inode, NULL, &iSizeHints, true);
      needInodeRefresh = false;

      iSize = i_size_read(inode);
      fileEndIndex = iSize >> PAGE_SHIFT;
   }

   if (pageIndex < fileEndIndex)
      retVal = true;

   return retVal;
}
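
/* Example with 4 KiB pages: for a file of 10000 bytes, fileEndIndex is 2; a short read on
 * pageIndex 1 returns true (treated as a sparse hole within the file), while pageIndex 2 (the
 * partial last page) or anything beyond returns false, i.e. the page really ends at or after EOF. */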

/**
 * Note: Needs to be called for all pages added to pageVec in FhgfsOps_readpagesVec() in order to
 * set the status and to unlock it.
 *
 * @param readRes the number of bytes read or a negative linux error code
 */
void FhgfsOpsPages_endReadPage(Logger* log, struct inode* inode, struct FhgfsPage* fhgfsPage,
   int readRes)
{
   struct page* page = fhgfsPage->page;
   loff_t offset = FhgfsPage_getFileOffset(fhgfsPage);

   const char* logContext = __func__;

   /* LogTopic_COMMKIT here, as the message belongs better to CommKitVec, although it is a page
    * related method */
   Logger_logTopFormatted(log, LogTopic_COMMKIT, Log_SPAM, logContext,
      "Page-index: %lu readRes: %d", page->index, readRes);

   FhgfsOpsPages_incInodeFileSizeOnPagedRead(inode, offset, readRes);


   if (readRes < 0)
   {
      fhgfs_set_wb_error(page, readRes);
   }
   else
   { /* note: There is no way to mark a page outside the filesize, so even with readRes == 0
      * we need to SetPageUptodate(page) */

      if (readRes && readRes < PAGE_SIZE)
      {
         // zero the remainder of the page for which we don't have data

         // zero_user_segment() would be optimal, but not available in older kernels
         // zero_user_segment(page, zeroOffset, BEEGFS_PAGE_SIZE);
         // BUT we can use our kmapped Fhgfs_Page directly for zeroing

         memset(fhgfsPage->data + readRes, 0, PAGE_SIZE - readRes);
      }

      flush_dcache_page(page);
      SetPageUptodate(page);
   }

   FhgfsPage_unmapUnlockReleasePage(page);
}
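
/* Example of the zero-fill above: a short read that returns 1000 bytes into a 4096-byte page
 * leaves bytes 1000..4095 undefined, so they are memset to zero before the page is marked
 * uptodate; anyone reading the page past the short-read boundary then sees zeros instead of
 * stale data. */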


/**
 * Callback for the mm/vfs read_cache_pages function.
 *
 * Collect the given pages into data->chunkPageVec. If data->chunkPageVec cannot take more pages
 * (chunk is full) a read request will be sent immediately.
 *
 * @return 0 on success, negative linux error code on error
 */
int FhgfsOpsPages_readPageCallBack(void *dataPtr, struct page *page)
{
   const char* logContext = __func__;
   int retVal = 0;

   FhgfsPageData* pageData = (FhgfsPageData*) dataPtr;

   struct inode* inode = pageData->inode;

   bool pageVecWasSent = false;

   const bool finalRead = false; // is not the final read from this function

   // note, only allocate the pageVec after referencing the file!
   if (!pageData->chunkPageVec)
   {
      FhgfsChunkPageVec* pageVec = _FhgfsOpsPages_allocNewPageVec(pageData);
      if (unlikely(!pageVec) )
      {
         printk_fhgfs_debug(KERN_INFO, "%s:%d ENOMEM\n", __func__, __LINE__);
         retVal = -ENOMEM;
         goto outErr;
      }
   }

   while(1) // repeats only once, until pageVecWasSent == true
   {
      bool pushSucces;

      pushSucces = FhgfsChunkPageVec_pushPage(pageData->chunkPageVec, page, PAGE_SIZE);

      if (!pushSucces)
      { // pageVec probably full, send it and create a new one
         int readRes;

         if (unlikely(pageVecWasSent) )
         { /* We already sent the pageVec once, no need to do it again for an empty vec.
            * Probably out of memory. */
            Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
               "pageVec push failed, page-index: %ld", pageData->chunkPageVec->firstPageIdx);

            retVal = -ENOMEM;
            goto outErr;
         }
         else
         {
#ifdef BEEGFS_DEBUG
            if (pageData->chunkPageVec->size == 0)
               Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
                  "initial push failed, index: %ld", pageData->chunkPageVec->firstPageIdx);
#endif
         }


         readRes = _FhgfsOpsPages_sendPageVec(pageData, inode, finalRead, BEEGFS_RWTYPE_READ);

         pageVecWasSent = true;

         if (unlikely(readRes) )
         {
            Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
               "ChunkPageVec reading failed.");

            retVal = readRes;
            goto outErr;
         }

         if (unlikely(!pageData->chunkPageVec) )
         {
            Logger_logFormattedWithEntryID(inode, Log_ERR, logContext,
               "Warning, chunkPageVec is NULL.");

            retVal = -ENOMEM;
            goto outErr;
         }
         else
         {
            continue; // try again to push the page
         }
      }
      else
      { // push success
      }

      break; // break on success
   }

   get_page(page); // reference the page to avoid it being re-used while we read it
   return retVal;


outErr:
   return retVal;
}

/**
 * Read a page synchronously
 *
 * Note: Reads a locked page and returns it locked again (page will be unlocked in between)
 */
int FhgfsOpsPages_readpageSync(struct file* file, struct page* page)
{
   int retVal;

   ClearPageUptodate(page);

#ifdef KERNEL_HAS_READ_FOLIO
   retVal = FhgfsOps_read_folio(file, page_folio(page)); // note: async read will unlock the page
#else
   retVal = FhgfsOpsPages_readpage(file, page); // note: async read will unlock the page
#endif

   if (retVal)
   {
      lock_page(page); // re-lock it
      return retVal; // some kind of error
   }

   lock_page(page); // re-lock it

   if (!PageUptodate(page) )
   {
      /* The uptodate flag is not set, which means reading the page failed with an io-error.
       * Note: FhgfsOpsPages_readpage *must* call SetPageUptodate() even if the page does not exist
       * on the server (i.e. file size too small), as other mm/vfs code requires that.
       * So if it is not set there was some kind of IO error. */
      retVal = -EIO;
   }

   return retVal;
}

/*
 * Read a single page
 * Note: Called with the page locked and we need to unlock it when done.
 *
 * Note: Caller holds a reference to this page
 * Note: Reading the page is done asynchronously from another thread
 *
 * @return 0 on success, negative linux error code otherwise
 */
#ifdef KERNEL_HAS_READ_FOLIO
int FhgfsOps_read_folio(struct file *file, struct folio *folio)
#else
int FhgfsOpsPages_readpage(struct file* file, struct page* page)
#endif
{
   App* app = FhgfsOps_getApp(file_dentry(file)->d_sb);
   struct inode* inode = file_inode(file);

#ifdef KERNEL_HAS_READ_FOLIO
   struct page* page = &folio->page;
   DEFINE_READAHEAD(ractl, file, &file->f_ra, file->f_mapping, folio->index);
#elif defined(KERNEL_HAS_FOLIO)
   DEFINE_READAHEAD(ractl, file, &file->f_ra, file->f_mapping, page->index);
#endif

   int writeBackRes;

   int retVal;

#ifdef KERNEL_HAS_READ_FOLIO
   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__,
      "folio-index: %lu", folio->index);
#else
   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__,
      "page-index: %lu", page->index);
#endif

   IGNORE_UNUSED_VARIABLE(app);

   writeBackRes = FhgfsOpsPages_writeBackPage(inode, page);
   if (writeBackRes)
   {
      retVal = writeBackRes;
      goto outErr;
   }

#if defined(KERNEL_HAS_FOLIO)
   retVal = _FhgfsOpsPages_readahead(&ractl, page);
#if defined(KERNEL_HAS_READ_FOLIO)
   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "folio-index: %lu retVal: %d",
      folio->index, retVal);
#else
   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu retVal: %d",
      page->index, retVal);
#endif
#else
   retVal = _FhgfsOpsPages_readpages(file, file->f_mapping, NULL, page);

   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu retVal: %d",
      page->index, retVal);
#endif

   return retVal;


outErr:

#ifdef KERNEL_HAS_READ_FOLIO
   folio_unlock(folio);

   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "folio-index: %lu retVal: %d",
      folio->index, retVal);
#else
   unlock_page(page);

   FhgfsOpsHelper_logOpDebug(app, file_dentry(file), inode, __func__, "page-index: %lu retVal: %d",
      page->index, retVal);
#endif

   return retVal;
}

#ifdef KERNEL_HAS_FOLIO
int _FhgfsOpsPages_readahead(struct readahead_control *ractl, struct page *page)
{
   struct inode* inode = ractl->mapping->host;
   struct file *file = ractl->file;
#else
int _FhgfsOpsPages_readpages(struct file* file, struct address_space* mapping,
   struct list_head* pageList, struct page* page)
{
   struct inode* inode = mapping->host;
#endif

   int referenceRes;

   FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
   int retVal = 0;

   FhgfsPageData pageData =
   {
      .inode = inode,
      .chunkPageVec = NULL,
      .isReferenced = false,
   };


   referenceRes = _FhgfsOpsPages_referenceReadFileHandle(&pageData, file);
   if (unlikely(referenceRes != FhgfsOpsErr_SUCCESS) )
   {
      retVal = FhgfsOpsErr_toSysErr(referenceRes);

      return retVal;
   }


   ihold(inode); // make sure the inode is not released

#ifdef KERNEL_HAS_FOLIO
   if (readahead_count(ractl))
   {
      while ((page = readahead_page(ractl)) != NULL)
      {
         retVal = FhgfsOpsPages_readPageCallBack(&pageData, page);
         put_page(page);
         if (retVal)
            break;
      }
   }
#else
   if (pageList)
   { // classical readpages, we get a list of pages, which are not in the page cache yet
      retVal = read_cache_pages(mapping, pageList, FhgfsOpsPages_readPageCallBack, &pageData);
   }
#endif
   else
   { /* Called with a single page only, that does not need to be added to the cache,
      * we are called from ->readpage */

      retVal = FhgfsOpsPages_readPageCallBack(&pageData, page);

      if (unlikely(retVal < 0) )
      { // some kind of error
         if (unlikely(pageData.chunkPageVec) )
         {
            printk_fhgfs(KERN_ERR,
               "Bug: pageData.chunkPageVec set, but error code returned\n");
            dump_stack();

            // try to clean it up
            FhgfsChunkPageVec_destroy(pageData.chunkPageVec);
            pageData.chunkPageVec = NULL;
         }

         // unlock the page as it cannot be handled.
         unlock_page(page);
      }
   }

   if (pageData.chunkPageVec)
   {
      int readRes;

      if (unlikely(!pageData.isReferenced) )
      { // This would be a serious bug and should be impossible!
         printk_fhgfs(KERN_ERR, "%s: Bug: File is not referenced! ", __func__);
         dump_stack();
      }

      readRes = _FhgfsOpsPages_sendPageVec(&pageData, inode, true, BEEGFS_RWTYPE_READ);

      if (unlikely(readRes))
         retVal = readRes;
   }

   if (likely(pageData.isReferenced) )
      FhgfsInode_releaseHandle(fhgfsInode, pageData.handleType, NULL);

   iput(inode);

   return retVal;
}

/**
 * address_space_operations.readahead method
 */
#ifdef KERNEL_HAS_FOLIO
void FhgfsOpsPages_readahead(struct readahead_control *ractl)
{
   struct file *file = ractl->file;

   struct dentry* dentry = file_dentry(file);
   App* app = FhgfsOps_getApp(dentry->d_sb);

   FhgfsOpsHelper_logOpDebug(app, dentry, ractl->mapping->host, __func__, "(nr_pages: %u)",
      readahead_count(ractl));
   IGNORE_UNUSED_VARIABLE(app);
   _FhgfsOpsPages_readahead(ractl, NULL);
   return;
}
#else

/**
 * address_space_operations.readpages method
 */
int FhgfsOpsPages_readpages(struct file* file, struct address_space* mapping,
   struct list_head* pageList, unsigned numPages)
{
   struct dentry* dentry = file_dentry(file);
   struct inode* inode = mapping->host;
   App* app = FhgfsOps_getApp(dentry->d_sb);

   const char* logContext = __func__;

   FhgfsOpsHelper_logOpDebug(app, dentry, inode, logContext, "(num_pages: %u)", numPages);
   IGNORE_UNUSED_VARIABLE(logContext);
   IGNORE_UNUSED_VARIABLE(app);
   IGNORE_UNUSED_VARIABLE(inode);

   return _FhgfsOpsPages_readpages(file, mapping, pageList, NULL);
}
#endif

/*
 * Write back all requests on one page - we do this before reading it.
 *
 * Note: page is locked and must stay locked
 */
int FhgfsOpsPages_writeBackPage(struct inode *inode, struct page *page)
{
   struct writeback_control wbc;
   int ret;

   loff_t range_start = page_offset(page);
   loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);


   memset(&wbc, 0, sizeof(wbc) );
   wbc.sync_mode = WB_SYNC_ALL;
   wbc.nr_to_write = 0;

   wbc.range_start = range_start;
   wbc.range_end = range_end;

   IGNORE_UNUSED_VARIABLE(range_start);
   IGNORE_UNUSED_VARIABLE(range_end);

   for (;;) {
      wait_on_page_writeback(page);
      if (clear_page_dirty_for_io(page))
      {
         ret = FhgfsOpsPages_writepage(page, &wbc); // note: unlocks the page
         lock_page(page); // re-lock the page
         if (ret < 0)
            goto out_error;
         continue;
      }
      ret = 0;
      if (!PagePrivate(page))
         break;
   }

out_error:
   return ret;
}