beegfs/client_module/source/net/filesystem/FhgfsOpsCommKitVec.c
2025-08-10 01:34:16 +02:00

1194 lines
39 KiB
C

#include <app/App.h>
#include <common/net/message/session/rw/ReadLocalFileV2Msg.h>
#include <common/net/message/session/rw/WriteLocalFileMsg.h>
#include <common/net/message/session/rw/WriteLocalFileRespMsg.h>
#include <common/nodes/MirrorBuddyGroupMapper.h>
#include <common/FhgfsTypes.h>
#include <common/toolkit/MessagingTk.h>
#include <common/storage/StorageErrors.h>
#include <common/threading/Thread.h>
#include <filesystem/FhgfsInode.h>
#include <filesystem/FhgfsOpsPages.h>
#include "FhgfsOpsCommKitVec.h"
static void __FhgfsOpsCommKitVec_readfileStagePREPARE(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_readfileStageRECVHEADER(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_readfileStageRECVDATA(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_handleReadError(CommKitVecHelper* commHelper, FhgfsCommKitVec* comm,
FhgfsPage* fhgfsPage, ssize_t recvRes, size_t missingPgLen);
static void __FhgfsOpsCommKitVec_readfileStageSOCKETEXCEPTION(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_readfileStageCLEANUP(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_readfileStageHandlePages(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStagePREPARE(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStageSENDHEADER(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStageSENDDATA(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStageRECV(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStageSOCKETEXCEPTION(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStageCLEANUP(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static void __FhgfsOpsCommKitVec_writefileStageHandlePages(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static int64_t __FhgfsOpsCommKitVec_writefileCommunicate(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
static int64_t __FhgfsOpsCommKitVec_readfileCommunicate(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm);
/**
* Prepare a read request
*
* @return false on error (means 'break' for the surrounding switch statement)
*/
void __FhgfsOpsCommKitVec_readfileStagePREPARE(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
Config* cfg = App_getConfig(commHelper->app);
TargetMapper* targetMapper = App_getTargetMapper(commHelper->app);
NodeStoreEx* storageNodes = App_getStorageNodes(commHelper->app);
Node* localNode = App_getLocalNode(commHelper->app);
const NumNodeID localNodeNumID = Node_getNumID(localNode);
FhgfsOpsErr resolveErr;
NodeConnPool* connPool;
ReadLocalFileV2Msg readMsg;
ssize_t sendRes;
bool needCleanup = false;
bool isSocketException = false;
uint16_t nodeReferenceTargetID = comm->targetID; /* to avoid overriding original targetID
in case of buddy-mirroring, because we need the mirror group ID for the msg and for retries */
loff_t offset = FhgfsOpsCommKitVec_getOffset(comm);
// nodeAndType is not set until later after calling NodeStoreEx_referenceNodeByTargetID.
// Don't use it before then!
NodeString nodeAndType;
comm->read.reqLen = FhgfsOpsCommKitVec_getRemainingDataSize(comm);
comm->sock = NULL;
comm->nodeResult = -FhgfsOpsErr_COMMUNICATION;
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
// select the right targetID
if(StripePattern_getPatternType(commHelper->ioInfo->pattern) == STRIPEPATTERN_BuddyMirror)
{ // given targetID refers to a buddy mirror group
MirrorBuddyGroupMapper* mirrorBuddies = App_getStorageBuddyGroupMapper(commHelper->app);
nodeReferenceTargetID = comm->useBuddyMirrorSecond ?
MirrorBuddyGroupMapper_getSecondaryTargetID(mirrorBuddies, comm->targetID) :
MirrorBuddyGroupMapper_getPrimaryTargetID(mirrorBuddies, comm->targetID);
// note: only log message here, error handling below through invalid node reference
if(unlikely(!nodeReferenceTargetID) )
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Invalid mirror buddy group ID: %hu", comm->targetID);
}
// get the target-node reference
comm->node = NodeStoreEx_referenceNodeByTargetID(storageNodes,
nodeReferenceTargetID, targetMapper, &resolveErr);
if(unlikely(!comm->node) )
{ // unable to resolve targetID
comm->nodeResult = -resolveErr;
needCleanup = true;
goto outErr;
}
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
connPool = Node_getConnPool(comm->node);
// connect
comm->sock = NodeConnPool_acquireStreamSocketEx(connPool, true, NULL);
if(!comm->sock)
{ // connection error
if (fatal_signal_pending(current))
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_DEBUG,
commHelper->logContext, "Connect to server canceled by pending signal: %s",
nodeAndType.buf);
comm->nodeResult = FhgfsOpsErr_INTERRUPTED;
}
else
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING,
commHelper->logContext, "Unable to connect to server: %s",
nodeAndType.buf);
}
needCleanup = true;
goto outErr;
}
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_SPAM, __func__,
"Acquire read node: %s offset: %lli size: %zu",
nodeAndType.buf, offset, comm->read.reqLen);
// prepare message
ReadLocalFileV2Msg_initFromSession(&readMsg, localNodeNumID,
commHelper->ioInfo->fileHandleID, comm->targetID, commHelper->ioInfo->pathInfo,
commHelper->ioInfo->accessFlags, offset, comm->read.reqLen);
NetMessage_setMsgHeaderTargetID( (NetMessage*)&readMsg, nodeReferenceTargetID);
if (comm->firstWriteDoneForTarget && Config_getSysSessionChecksEnabled(cfg))
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&readMsg,
READLOCALFILEMSG_FLAG_SESSION_CHECK);
if(StripePattern_getPatternType(commHelper->ioInfo->pattern) == STRIPEPATTERN_BuddyMirror)
{
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&readMsg, READLOCALFILEMSG_FLAG_BUDDYMIRROR);
if(comm->useBuddyMirrorSecond)
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&readMsg,
READLOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND);
}
if(App_getNetBenchModeEnabled(commHelper->app) )
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&readMsg, READLOCALFILEMSG_FLAG_DISABLE_IO);
NetMessage_serialize( (NetMessage*)&readMsg, comm->msgBuf, BEEGFS_COMMKIT_MSGBUF_SIZE);
comm->hdrLen = NetMessage_getMsgLength( (NetMessage*)&readMsg);
comm->nodeResult = 0; // ready to read, so set this variable to 0
sendRes = Socket_send_kernel(comm->sock, comm->msgBuf, comm->hdrLen, 0);
#if (BEEGFS_COMMKIT_DEBUG & COMMKIT_DEBUG_READ_SEND )
if (sendRes > 0 && jiffies % CommKitErrorInjectRate == 0)
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, __func__,
"Injecting timeout error after (successfully) sending data.");
sendRes = -ETIMEDOUT;
}
#endif
if(unlikely(sendRes <= 0) )
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, commHelper->logContext,
"Failed to send message to %s: %s", nodeAndType.buf,
Socket_getPeername(comm->sock) );
isSocketException = true;
goto outErr;
}
__FhgfsOpsCommKitVec_readfileStageRECVHEADER(commHelper, comm);
return;
outErr:
if (needCleanup)
__FhgfsOpsCommKitVec_readfileStageCLEANUP(commHelper, comm);
if (isSocketException)
__FhgfsOpsCommKitVec_readfileStageRECVHEADER(commHelper, comm);
}
/**
* Receive how many data the server is going to send next, this is also called after each
* *complete* read request
*/
void __FhgfsOpsCommKitVec_readfileStageRECVHEADER(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
ssize_t recvRes;
char dataLenBuf[sizeof(int64_t)]; // length info in fhgfs network byte order
int64_t lengthInfo; // length info in fhgfs host byte order
DeserializeCtx ctx = { dataLenBuf, sizeof(int64_t) };
bool isSocketException = false;
bool needCleanup = false;
Config* cfg = App_getConfig(commHelper->app);
NodeString alias;
Node_copyAlias(comm->node, &alias);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
recvRes = Socket_recvExactT_kernel(comm->sock, &dataLenBuf, sizeof(int64_t), 0, cfg->connMsgLongTimeout);
if(unlikely(recvRes <= 0) )
{ // error
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, commHelper->logContext,
"Failed to receive length info from %s: %s",
nodeAndType.buf, Socket_getPeername(comm->sock) );
isSocketException = true;
goto outErr;
}
// got the length info response
Serialization_deserializeInt64(&ctx, &lengthInfo);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, commHelper->logContext,
"Received lengthInfo from %s: %lld", alias.buf, (long long)lengthInfo);
#if (BEEGFS_COMMKIT_DEBUG & COMMKIT_DEBUG_READ_HEADER)
if (recvRes > 0 && jiffies % CommKitErrorInjectRate == 0)
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, __func__,
"Injecting timeout error after (successfully) receiving data.");
isSocketException = true;
goto outErr;
}
#endif
if(lengthInfo <= 0)
{ // end of file data transmission
if(unlikely(lengthInfo < 0) )
{
comm->nodeResult = lengthInfo; // error occurred
isSocketException = true;
}
else
{ // EOF
needCleanup = true;
}
goto outErr;
}
if (unlikely(lengthInfo > comm->read.reqLen) )
{
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Bug: Server wants to send more than we requested (%zu vs. %llu)",
comm->read.reqLen, lengthInfo);
isSocketException = true;
goto outErr;
}
// positive result => node is going to send some file data
comm->read.serverSize = lengthInfo;
__FhgfsOpsCommKitVec_readfileStageRECVDATA(commHelper, comm);
return;
outErr:
if (needCleanup)
__FhgfsOpsCommKitVec_readfileStageCLEANUP(commHelper, comm);
if (isSocketException)
__FhgfsOpsCommKitVec_readfileStageSOCKETEXCEPTION(commHelper, comm);
}
/**
* Read the data from the server
*/
void __FhgfsOpsCommKitVec_readfileStageRECVDATA(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
size_t receiveSum = 0; // sum of recvRes
FhgfsPage *fhgfsPage;
size_t pageLen; // length of a page
ssize_t recvRes;
Config* cfg = App_getConfig(commHelper->app);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
do
{ // walk over all pages of this state up to the size the server promise to give us
FhgfsChunkPageVec* pageVec = comm->pageVec;
void* pageDataPtr;
size_t requestLength;
#ifdef BEEGFS_DEBUG
size_t vecIdx = FhgfsChunkPageVec_getCurrentListVecIterIdx(pageVec);
loff_t offset = FhgfsOpsCommKitVec_getOffset(comm);
#endif
fhgfsPage = FhgfsChunkPageVec_iterateGetNextPage(pageVec);
if (unlikely(!fhgfsPage) )
{
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"BUG: Iterator does not have more pages!" );
comm->nodeResult = -FhgfsOpsErr_INTERNAL;
goto outSocketException;
}
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_SPAM, __func__,
"vecSize: %zu offset: %lld; Vec-Idx: %zu receivedData: %ld",
FhgfsChunkPageVec_getDataSize(pageVec), offset,
vecIdx, receiveSum);
pageLen = fhgfsPage->length;
pageDataPtr = fhgfsPage->data;
requestLength = MIN(pageLen, comm->read.serverSize - receiveSum);
// receive available dataPart
recvRes = Socket_recvExactT_kernel(comm->sock, pageDataPtr, requestLength, 0, cfg->connMsgLongTimeout);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__,
"requested: %lld; received: %lld",
(long long)requestLength, (long long)recvRes);
#if (BEEGFS_COMMKIT_DEBUG & COMMKIT_DEBUG_RECV_DATA)
if (recvRes > 0 && jiffies % CommKitErrorInjectRate == 0)
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, __func__,
"Injecting timeout error after (successfully) receiving data.");
recvRes = -ETIMEDOUT;
}
#endif
if(unlikely(recvRes <= 0) )
{ // something went wrong
goto outReadErr;
}
receiveSum += recvRes;
if (unlikely(requestLength < pageLen && receiveSum < comm->read.serverSize) )
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING,
commHelper->logContext, "Bug/Warning: requestLength < pageLength (%zu, %zu)",
requestLength, pageLen);
goto outReadErr;
}
comm->numSuccessPages++;
} while (receiveSum < comm->read.serverSize);
// all data for this state and round received, add it up
comm->nodeResult += receiveSum;
// all of the data has been received => receive the next lengthInfo in the header stage
comm->doHeader = true;
return;
outSocketException:
__FhgfsOpsCommKitVec_readfileStageSOCKETEXCEPTION(commHelper, comm);
return;
outReadErr:
__FhgfsOpsCommKitVec_handleReadError(commHelper, comm, fhgfsPage, recvRes, pageLen);
return;
}
/**
* Take the right action based on the error type
*/
void __FhgfsOpsCommKitVec_handleReadError(CommKitVecHelper* commHelper, FhgfsCommKitVec* comm,
FhgfsPage* fhgfsPage, ssize_t recvRes, size_t missingPgLen)
{
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
if(recvRes == -EFAULT)
{ // bad buffer address given
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, commHelper->logContext,
"Bad buffer address");
comm->nodeResult = -FhgfsOpsErr_ADDRESSFAULT;
}
else
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
if (recvRes > 0)
{ /* requestLength < pageLength && comm->read.receivedLength < comm->read.serverSize */
comm->nodeResult = -FhgfsOpsErr_INTERNAL;
}
if(recvRes == -ETIMEDOUT)
{ // timeout
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Communication timeout in RECVDATA stage. Node: %s",
nodeAndType.buf);
comm->nodeResult = -FhgfsOpsErr_COMMUNICATION;
}
else
{ // error
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Communication error in RECVDATA stage. Node: %s (recv result: %lld)",
nodeAndType.buf, (long long)recvRes);
comm->nodeResult = -FhgfsOpsErr_INTERNAL;
}
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_SPAM, commHelper->logContext,
"Request details: missingLength of %s: %lld", nodeAndType.buf,
(long long)missingPgLen);
}
__FhgfsOpsCommKitVec_readfileStageSOCKETEXCEPTION(commHelper, comm);
}
/**
* Socket exception happened
*/
void __FhgfsOpsCommKitVec_readfileStageSOCKETEXCEPTION(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
const char* logCommErrTxt = "Communication error";
size_t remainingDataSize = FhgfsOpsCommKitVec_getRemainingDataSize(comm);
loff_t offset = FhgfsOpsCommKitVec_getOffset(comm);
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, commHelper->logContext,
"%s: Sent request: node: %s; fileHandleID: %s; offset: %lld; size: %zd",
logCommErrTxt, nodeAndType.buf, commHelper->ioInfo->fileHandleID,
(long long) offset, remainingDataSize );
NodeConnPool_invalidateStreamSocket(Node_getConnPool(comm->node), comm->sock);
comm->sock = NULL;
if (comm->nodeResult >= 0)
{ // make sure not to overwrite already assigned error codes
comm->nodeResult = -FhgfsOpsErr_COMMUNICATION;
}
__FhgfsOpsCommKitVec_readfileStageCLEANUP(commHelper, comm);
}
/**
* Clean up the current state after an error or too few data.
*/
void __FhgfsOpsCommKitVec_readfileStageCLEANUP(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
// actual clean-up
if (comm->sock)
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_SPAM, __func__,
"Release socket, read node: %s", nodeAndType.buf);
NodeConnPool_releaseStreamSocket(Node_getConnPool(comm->node), comm->sock);
comm->sock = NULL;
}
if(likely(comm->node) )
Node_put(comm->node);
// prepare next stage
}
/**
* Handle all remaining pages of this state.
* Note: Must be always called - on read success and read-error!
*/
void __FhgfsOpsCommKitVec_readfileStageHandlePages(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
// const char* logContext = "CommKitVec_readfileStageHandlePages";
FhgfsChunkPageVec* fhgfsPageVec = comm->pageVec;
struct inode* inode = FhgfsChunkPageVec_getInode(fhgfsPageVec);
bool needInodeRefresh = true;
FhgfsPage* fhgfsPage;
size_t pgCount = 0;
int64_t readRes = comm->nodeResult;
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__,
"nodeResult: %lli (%s)", comm->nodeResult,
(comm->nodeResult < 0) ? FhgfsOpsErr_toErrString(-comm->nodeResult):
FhgfsOpsErr_toErrString(FhgfsOpsErr_SUCCESS));
if (unlikely(comm->nodeResult == -FhgfsOpsErr_COMMUNICATION ||
comm->nodeResult == -FhgfsOpsErr_AGAIN))
{
// don't do anything, IO can be possibly re-tried
goto out;
}
FhgfsChunkPageVec_resetIterator(fhgfsPageVec);
// walk over all successfully read pages
while (pgCount < comm->numSuccessPages && readRes > 0)
{
fhgfsPage = FhgfsChunkPageVec_iterateGetNextPage(fhgfsPageVec);
if (!fhgfsPage)
break;
FhgfsOpsPages_endReadPage(commHelper->log, inode, fhgfsPage, readRes);
readRes -= PAGE_SIZE;
pgCount++;
}
if (unlikely(readRes > 0)) {
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"%s: Bug: readRes too large!", __func__);
comm->nodeResult = -FhgfsOpsErr_INTERNAL;
}
// walk over the remaining pages
while (1)
{
bool isShortRead;
int pageReadRes = (comm->nodeResult >= 0) ? comm->nodeResult : -1;
fhgfsPage = FhgfsChunkPageVec_iterateGetNextPage(fhgfsPageVec);
if (!fhgfsPage)
break;
if (pageReadRes >= 0)
{
isShortRead = FhgfsOpsPages_isShortRead(inode, FhgfsPage_getPageIndex(fhgfsPage),
needInodeRefresh);
needInodeRefresh = false; // a single inode refresh is sufficient
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_SPAM, __func__,
"nodeResult: %lld pageIdx: %ld is-short-read: %d",
comm->nodeResult, FhgfsPage_getPageIndex(fhgfsPage), isShortRead);
if (isShortRead)
{ // file size includes this page, but we don't have data, so zero it
FhgfsPage_zeroPage(fhgfsPage);
}
else
{ // the file is smaller than this page, nothing to be done
}
}
FhgfsOpsPages_endReadPage(commHelper->log, inode, fhgfsPage, pageReadRes);
}
out:
return;
}
/**
* @return false on error (means 'break' for the surrounding switch statement)
*/
void __FhgfsOpsCommKitVec_writefileStagePREPARE(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
Config* cfg = App_getConfig(commHelper->app);
TargetMapper* targetMapper = App_getTargetMapper(commHelper->app);
NodeStoreEx* storageNodes = App_getStorageNodes(commHelper->app);
Node* localNode = App_getLocalNode(commHelper->app);
const NumNodeID localNodeNumID = Node_getNumID(localNode);
FhgfsOpsErr resolveErr;
NodeConnPool* connPool;
WriteLocalFileMsg writeMsg;
uint16_t nodeReferenceTargetID = comm->targetID; /* to avoid overriding original targetID
in case of buddy-mirroring, because we need the mirror group ID for the msg and for retries */
size_t remainingDataSize = FhgfsOpsCommKitVec_getRemainingDataSize(comm);
loff_t offset = FhgfsOpsCommKitVec_getOffset(comm);
// nodeAndType is not set until later after calling NodeStoreEx_referenceNodeByTargetID.
// Don't use it before then!
NodeString nodeAndType;
comm->sock = NULL;
comm->nodeResult = -FhgfsOpsErr_COMMUNICATION;
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
if(StripePattern_getPatternType(commHelper->ioInfo->pattern) == STRIPEPATTERN_BuddyMirror)
{ // given targetID refers to a buddy mirror group
MirrorBuddyGroupMapper* mirrorBuddies = App_getStorageBuddyGroupMapper(commHelper->app);
nodeReferenceTargetID = comm->useBuddyMirrorSecond ?
MirrorBuddyGroupMapper_getSecondaryTargetID(mirrorBuddies, comm->targetID) :
MirrorBuddyGroupMapper_getPrimaryTargetID(mirrorBuddies, comm->targetID);
// note: only log message here, error handling below through invalid node reference
if(unlikely(!nodeReferenceTargetID) )
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Invalid mirror buddy group ID: %hu", comm->targetID);
}
// get the target-node reference
comm->node = NodeStoreEx_referenceNodeByTargetID(storageNodes,
nodeReferenceTargetID, targetMapper, &resolveErr);
if(unlikely(!comm->node) )
{ // unable to resolve targetID
comm->nodeResult = -resolveErr;
goto outErr;
}
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
connPool = Node_getConnPool(comm->node);
// connect
comm->sock = NodeConnPool_acquireStreamSocketEx(connPool, true, NULL);
if(!comm->sock)
{ // connection error
if (fatal_signal_pending(current)) {
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_DEBUG,
commHelper->logContext, "Connect to server canceled by pending signal: %s",
nodeAndType.buf);
}
else {
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING,
commHelper->logContext, "Unable to connect to server: %s",
nodeAndType.buf);
}
goto outErr;
}
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_SPAM, __func__,
"Acquire write state: node: %s offset: %lld size: %zd",
nodeAndType.buf, (long long) offset, remainingDataSize);
// prepare message
WriteLocalFileMsg_initFromSession(&writeMsg, localNodeNumID,
commHelper->ioInfo->fileHandleID, comm->targetID, commHelper->ioInfo->pathInfo,
commHelper->ioInfo->accessFlags, offset, remainingDataSize);
NetMessage_setMsgHeaderTargetID( (NetMessage*)&writeMsg, nodeReferenceTargetID);
if (Config_getQuotaEnabled(cfg) )
WriteLocalFileMsg_setUserdataForQuota(&writeMsg, commHelper->ioInfo->userID,
commHelper->ioInfo->groupID);
if (comm->firstWriteDoneForTarget && Config_getSysSessionChecksEnabled(cfg))
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&writeMsg,
WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
if(StripePattern_getPatternType(commHelper->ioInfo->pattern) == STRIPEPATTERN_BuddyMirror)
{
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&writeMsg,
WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
if(comm->useServersideMirroring)
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&writeMsg,
WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_FORWARD);
if(comm->useBuddyMirrorSecond)
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&writeMsg,
WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND);
}
if(App_getNetBenchModeEnabled(commHelper->app) )
NetMessage_addMsgHeaderFeatureFlag( (NetMessage*)&writeMsg,
WRITELOCALFILEMSG_FLAG_DISABLE_IO);
NetMessage_serialize( (NetMessage*)&writeMsg, comm->msgBuf, BEEGFS_COMMKIT_MSGBUF_SIZE);
comm->hdrLen = NetMessage_getMsgLength( (NetMessage*)&writeMsg);
comm->doHeader = true;
return;
outErr:
__FhgfsOpsCommKitVec_writefileStageCLEANUP(commHelper, comm);
}
/**
* Tell the server how many data we are going to send
*/
void __FhgfsOpsCommKitVec_writefileStageSENDHEADER(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
ssize_t sendRes;
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
sendRes = Socket_send_kernel(comm->sock, comm->msgBuf, comm->hdrLen, 0);
#if (BEEGFS_COMMKIT_DEBUG & COMMKIT_DEBUG_WRITE_HEADER )
if (sendRes == FhgfsOpsErr_SUCCESS && jiffies % CommKitErrorInjectRate == 0)
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, __func__,
"Injecting timeout error after (successfully) sending data.");
sendRes = -ETIMEDOUT;
}
#endif
if(unlikely(sendRes <= 0) )
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, commHelper->logContext,
"Failed to send message to %s: %s", nodeAndType.buf,
Socket_getPeername(comm->sock) );
goto outErr;
}
__FhgfsOpsCommKitVec_writefileStageSENDDATA(commHelper, comm);
return;
outErr:
__FhgfsOpsCommKitVec_writefileStageSOCKETEXCEPTION(commHelper, comm);
}
/**
* Send the data (of our current state)
*
* @param state - current state to send data for
*/
void __FhgfsOpsCommKitVec_writefileStageSENDDATA(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
ssize_t sendDataPartRes;
FhgfsChunkPageVec *pageVec = comm->pageVec;
FhgfsPage *fhgfsPage;
size_t vecIdx = FhgfsChunkPageVec_getCurrentListVecIterIdx(pageVec);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
{
loff_t offset = FhgfsOpsCommKitVec_getOffset(comm);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_SPAM, __func__,
"targetID: %hu offset: %lld vecSize: %u, current-pgIdx: %zu",
comm->targetID, (long long) offset, FhgfsChunkPageVec_getSize(pageVec), vecIdx);
IGNORE_UNUSED_VARIABLE(offset);
}
// walk over all pages
while (true)
{
void* data;
size_t dataLength;
vecIdx = FhgfsChunkPageVec_getCurrentListVecIterIdx(pageVec);
fhgfsPage = FhgfsChunkPageVec_iterateGetNextPage(pageVec);
if (!fhgfsPage)
break;
dataLength = fhgfsPage->length;
data = fhgfsPage->data;
// send dataPart blocking
sendDataPartRes = Socket_send_kernel(comm->sock, data, dataLength, 0);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__,
"VecIdx: %zu; size: %lld; PgLen: %d, sendRes: %zd",
vecIdx, (long long)dataLength, fhgfsPage->length, sendDataPartRes);
#if (BEEGFS_COMMKIT_DEBUG & COMMKIT_DEBUG_WRITE_SEND)
if (sendDataPartRes > 0 && jiffies % CommKitErrorInjectRate == 0)
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, __func__,
"Injecting timeout error after (successfully) sending data.");
sendDataPartRes = -ETIMEDOUT;
}
#endif
if (unlikely(sendDataPartRes <= 0) )
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
if(sendDataPartRes == -EFAULT)
{ // bad buffer address given
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_DEBUG,
commHelper->logContext, "Bad buffer address");
comm->nodeResult = -FhgfsOpsErr_ADDRESSFAULT;
goto outErr;
}
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Communication error in SENDDATA stage. "
"Node: %s; Request details: %zu/%u pages",
nodeAndType.buf, vecIdx, FhgfsChunkPageVec_getSize(pageVec) );
goto outErr;
}
if (sendDataPartRes < dataLength)
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"Error/Bug: Node: %s; Less data sent than requested; Request details: %zu/%u pages",
nodeAndType.buf, vecIdx, FhgfsChunkPageVec_getSize(pageVec) );
goto outErr;
}
}
__FhgfsOpsCommKitVec_writefileStageRECV(commHelper, comm);
return;
outErr:
__FhgfsOpsCommKitVec_writefileStageSOCKETEXCEPTION(commHelper, comm);
}
/**
* Ask the server side for errors
*/
void __FhgfsOpsCommKitVec_writefileStageRECV(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
ssize_t respRes;
bool isSocketException = false;
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
respRes = MessagingTk_recvMsgBuf(commHelper->app, comm->sock,
comm->msgBuf, BEEGFS_COMMKIT_MSGBUF_SIZE);
#if (BEEGFS_COMMKIT_DEBUG & COMMKIT_DEBUG_WRITE_RECV)
if (respRes == FhgfsOpsErr_SUCCESS && jiffies % CommKitErrorInjectRate == 0)
{
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING, __func__,
"Injecting timeout error after (successfully) sending data.");
respRes = -ETIMEDOUT;
}
#endif
if(unlikely(respRes <= 0) )
{ // receive failed
// note: signal pending log msg will be printed in stage SOCKETEXCEPTION, so no need here
if (!fatal_signal_pending(current)) {
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING,
commHelper->logContext,
"Receive failed from: %s @ %s", nodeAndType.buf,
Socket_getPeername(comm->sock) );
}
isSocketException = true;
goto outErr;
}
else
{ // got response => deserialize it
bool deserRes;
WriteLocalFileRespMsg_init(&comm->write.respMsg);
deserRes = NetMessageFactory_deserializeFromBuf(commHelper->app, comm->msgBuf, respRes,
(NetMessage*)&comm->write.respMsg, NETMSGTYPE_WriteLocalFileResp);
if(unlikely(!deserRes) )
{ // response invalid
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_WARNING,
commHelper->logContext,
"Received invalid response from %s. Expected type: %d. Disconnecting: %s",
nodeAndType.buf, NETMSGTYPE_WriteLocalFileResp,
Socket_getPeername(comm->sock) );
isSocketException = true;
goto outErr;
}
else
{ // correct response
int64_t writeRespValue = WriteLocalFileRespMsg_getValue(&comm->write.respMsg);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, 5, __func__,
"writeRespValue: %lld", (long long) writeRespValue);
comm->nodeResult = writeRespValue;
}
}
__FhgfsOpsCommKitVec_writefileStageCLEANUP(commHelper, comm);
return;
outErr:
if (isSocketException)
__FhgfsOpsCommKitVec_writefileStageSOCKETEXCEPTION(commHelper, comm);
}
/**
* Socket exception happend
*/
void __FhgfsOpsCommKitVec_writefileStageSOCKETEXCEPTION(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
const char* logCommInterruptedTxt = "Communication interrupted by signal";
const char* logCommErrTxt = "Communication error";
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
if (fatal_signal_pending(current))
{ // interrupted by signal
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_NOTICE, commHelper->logContext,
"%s. Node: %s", logCommInterruptedTxt, nodeAndType.buf);
}
else
{ // "normal" connection error
size_t remainingDataSize = FhgfsOpsCommKitVec_getRemainingDataSize(comm);
loff_t offset = FhgfsOpsCommKitVec_getOffset(comm);
Logger_logTopErrFormatted(commHelper->log, LogTopic_COMMKIT, commHelper->logContext,
"%s. Node: %s", logCommErrTxt, nodeAndType.buf);
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, commHelper->logContext,
"Sent request: node: %s; fileHandleID: %s; offset: %lld; size: %zu",
nodeAndType.buf, commHelper->ioInfo->fileHandleID,
(long long)offset, remainingDataSize);
}
if (comm->nodeResult >= 0)
{ // make sure not to overwrite already assigned error codes
comm->nodeResult = -FhgfsOpsErr_COMMUNICATION;
}
NodeConnPool_invalidateStreamSocket(Node_getConnPool(comm->node), comm->sock);
comm->sock = NULL;
__FhgfsOpsCommKitVec_writefileStageCLEANUP(commHelper, comm);
}
/**
* Clean up a connection, if there was an error
*/
void __FhgfsOpsCommKitVec_writefileStageCLEANUP(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
// actual clean-up
if (comm->sock)
{
NodeString nodeAndType;
Node_copyAliasWithTypeStr(comm->node, &nodeAndType);
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, 5, __func__,
"Release write node: %s", nodeAndType.buf);
NodeConnPool_releaseStreamSocket(Node_getConnPool(comm->node), comm->sock);
comm->sock = NULL;
}
if(likely(comm->node) )
Node_put(comm->node);
// prepare next stage
}
/**
* Handle all remaining pages of this state.
* Note: Must be always called - on read success and read-error!
*/
void __FhgfsOpsCommKitVec_writefileStageHandlePages(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
FhgfsChunkPageVec *pageVec = comm->pageVec;
struct inode* inode = FhgfsChunkPageVec_getInode(pageVec);
FhgfsPage* lastInProgressPage = NULL;
FhgfsPage* firstUnhandledPage;
ssize_t remainingServerWriteRes = comm->nodeResult;
LOG_DEBUG_TOP_FORMATTED(commHelper->log, LogTopic_COMMKIT, Log_DEBUG, __func__, "enter");
if (unlikely(comm->nodeResult == -FhgfsOpsErr_AGAIN ||
comm->nodeResult == -FhgfsOpsErr_COMMUNICATION) )
return; // try again
firstUnhandledPage = FhgfsChunkPageVec_iterateGetNextPage(pageVec); // might be NULL
FhgfsChunkPageVec_resetIterator(pageVec);
// walk over all pages in the page list vec
while(1)
{
int writeRes;
FhgfsPage* fhgfsPage = FhgfsChunkPageVec_iterateGetNextPage(pageVec);
if (!fhgfsPage)
break;
if (fhgfsPage == lastInProgressPage)
continue; // we already handled it above
if (remainingServerWriteRes > 0)
{
writeRes = fhgfsPage->length;
remainingServerWriteRes -= fhgfsPage->length;
}
else
writeRes = -EREMOTEIO;
if (unlikely(fhgfsPage == firstUnhandledPage && remainingServerWriteRes > 0) )
{
printk_fhgfs(KERN_INFO,
"Bug: Server claims to have written more pages than we counted internally!");
writeRes = -EIO;
}
if (unlikely(writeRes < 0) )
{
FhgfsInode* fhgfsInode = BEEGFS_INODE(inode);
spin_lock(&inode->i_lock);
FhgfsInode_setWritePageError(fhgfsInode);
spin_unlock(&inode->i_lock);
}
FhgfsOpsPages_endWritePage(fhgfsPage->page, writeRes, inode);
}
if (unlikely(remainingServerWriteRes < 0) )
Logger_logTopFormatted(commHelper->log, LogTopic_COMMKIT, Log_ERR, __func__,
"Write error detected (check server logs for details)!");
}
/**
* Start the remote read IO
*/
static int64_t __FhgfsOpsCommKitVec_readfileCommunicate(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
comm->doHeader = false;
__FhgfsOpsCommKitVec_readfileStagePREPARE(commHelper, comm);
while (comm->doHeader)
{
comm->doHeader = false;
// set doHeader to true if required
__FhgfsOpsCommKitVec_readfileStageRECVHEADER(commHelper, comm);
}
return comm->nodeResult;
}
/**
* Start the remote write IO
*/
static int64_t __FhgfsOpsCommKitVec_writefileCommunicate(CommKitVecHelper* commHelper,
FhgfsCommKitVec* comm)
{
FhgfsInode* inode = BEEGFS_INODE(comm->pageVec->inode);
FhgfsInode_incWriteBackCounter(inode);
comm->doHeader = false;
__FhgfsOpsCommKitVec_writefileStagePREPARE(commHelper, comm);
while (comm->doHeader)
{
comm->doHeader = false;
// set doHeader to true if required
__FhgfsOpsCommKitVec_writefileStageSENDHEADER(commHelper, comm);
}
spin_lock(&inode->vfs_inode.i_lock);
FhgfsInode_setLastWriteBackOrIsizeWriteTime(inode);
FhgfsInode_decWriteBackCounter(inode);
FhgfsInode_unsetNoIsizeDecrease(inode);
spin_unlock(&inode->vfs_inode.i_lock);
return comm->nodeResult;
}
int64_t FhgfsOpsCommKitVec_rwFileCommunicate(App* app, RemotingIOInfo* ioInfo,
FhgfsCommKitVec* comm, Fhgfs_RWType rwType)
{
Logger* log = App_getLogger(app);
Config* cfg = App_getConfig(app);
int64_t retVal;
unsigned retries = 0;
unsigned maxRetries = Config_getConnNumCommRetries(cfg);
bool isSignalPending = false;
const char* logCommInterruptedTxt = "Communication interrupted by signal";
CommKitVecHelper commHelper =
{
.app = app,
.log = App_getLogger(app),
.logContext = (rwType == BEEGFS_RWTYPE_READ) ?
"readfile (vec communication)" : "writefile (vec communication)",
.ioInfo = ioInfo,
};
do
{
FhgfsChunkPageVec_resetIterator(comm->pageVec);
if (rwType == BEEGFS_RWTYPE_READ)
retVal = __FhgfsOpsCommKitVec_readfileCommunicate(&commHelper, comm);
else
retVal = __FhgfsOpsCommKitVec_writefileCommunicate(&commHelper, comm);
if (unlikely(retVal == -FhgfsOpsErr_AGAIN || retVal == -FhgfsOpsErr_COMMUNICATION) )
{
if (fatal_signal_pending(current))
{ // interrupted by signal
Logger_logTopFormatted(log, LogTopic_COMMKIT, Log_NOTICE, __func__,
"%s.", logCommInterruptedTxt);
isSignalPending = true;
}
else
MessagingTk_waitBeforeRetry(retries);
retries++;
}
if (unlikely(retVal == -FhgfsOpsErr_AGAIN ) )
{
retVal = -FhgfsOpsErr_COMMUNICATION;
retries = 0;
}
} while (retVal == -FhgfsOpsErr_COMMUNICATION && retries <= maxRetries && !isSignalPending);
if (rwType == BEEGFS_RWTYPE_READ)
__FhgfsOpsCommKitVec_readfileStageHandlePages(&commHelper, comm);
else
__FhgfsOpsCommKitVec_writefileStageHandlePages(&commHelper, comm);
return retVal;
}