New upstream version 8.1.0

This commit is contained in:
geos_one
2025-08-10 01:34:16 +02:00
commit c891bb7105
4398 changed files with 838833 additions and 0 deletions

View File

@@ -0,0 +1,94 @@
#include <program/Program.h>
#include <common/net/message/session/FSyncLocalFileRespMsg.h>
#include <common/storage/StorageErrors.h>
#include <net/msghelpers/MsgHelperIO.h>
#include "FSyncLocalFileMsgEx.h"
/**
 * Handles an incoming fsync request: runs fsync() and sends its result code back to the
 * requestor in a FSyncLocalFileRespMsg.
 *
 * @return always true
 */
bool FSyncLocalFileMsgEx::processIncoming(ResponseContext& ctx)
{
   // do the work first, then report the outcome
   const FhgfsOpsErr syncRes = fsync();

   ctx.sendResponse(FSyncLocalFileRespMsg(syncRes) );

   return true;
}
/**
 * Looks up the session file for the file handle of this message and, if it has an open file
 * descriptor, syncs it (unless the NO_SYNC flag is set).
 *
 * If the SESSION_CHECK flag is set (and this is not a mirror session), a missing session file
 * or a session file that is marked as "server crashed" is reported to the client.
 *
 * @return FhgfsOpsErr_SUCCESS, FhgfsOpsErr_INTERNAL if the fsync call failed, or
 *    FhgfsOpsErr_STORAGE_SRV_CRASHED if the session check failed.
 */
FhgfsOpsErr FSyncLocalFileMsgEx::fsync()
{
   const char* logContext = "FSyncLocalFileMsg incoming";

   const bool mirrored = isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_BUDDYMIRROR);

   // the session check is never done for mirror sessions
   const bool checkSession =
      !mirrored && isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_SESSION_CHECK);

   App* app = Program::getApp();

   auto session = app->getSessions()->referenceOrAddSession(getSessionID() );
   SessionLocalFileStore* localFiles = session->getLocalFiles();

   // figure out which target is meant
   uint16_t targetID = getTargetID();

   if (mirrored)
   { // the ID from the message is a buddy mirror group ID => map it to a target ID
      MirrorBuddyGroupMapper* groupMapper = app->getMirrorBuddyGroupMapper();

      if (isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) )
         targetID = groupMapper->getSecondaryTargetID(targetID);
      else
         targetID = groupMapper->getPrimaryTargetID(targetID);

      // note: this is only logged; the session file lookup below still runs with the mapped ID
      if (unlikely(!targetID) )
         LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
            StringTk::uintToStr(getTargetID() ) );
   }

   auto localFile = localFiles->referenceSession(getFileHandleID(), targetID, mirrored);

   if (!localFile)
   { // no session file for this handle
      if (!checkSession)
         return FhgfsOpsErr_SUCCESS;

      // the server crashed during a write or before the close was successful
      LogContext log(logContext);
      log.log(Log_WARNING, "Potential cache loss for open file handle. (Server crash detected.) "
         "No session for file available. "
         "FileHandleID: " + std::string(getFileHandleID()) );

      return FhgfsOpsErr_STORAGE_SRV_CRASHED;
   }

   // session file exists => sync it if it is open
   FhgfsOpsErr result = FhgfsOpsErr_SUCCESS;

   if (!isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_NO_SYNC) )
   {
      auto& fd = localFile->getFD();

      if (fd.valid() && (MsgHelperIO::fsync(*fd) != 0) )
      { // file is open, but syncing it failed
         LogContext log(logContext);
         log.log(Log_WARNING, std::string("fsync of chunk file failed. ") +
            std::string("SessionID: ") + getSessionID().str() +
            std::string(". SysErr: ") + System::getErrString() );

         result = FhgfsOpsErr_INTERNAL;
      }
   }

   if (checkSession && localFile->isServerCrashed() )
   { // server crashed during the write, some data may be lost => tell the client
      LogContext log(logContext);
      log.log(Log_SPAM, "Potential cache loss for open file handle. (Server crash detected.) "
         "The session is marked as dirty.");

      result = FhgfsOpsErr_STORAGE_SRV_CRASHED;
   }

   return result;
}

View File

@@ -0,0 +1,13 @@
#pragma once
#include <common/net/message/session/FSyncLocalFileMsg.h>
/**
 * Handler for incoming FSyncLocalFileMsg requests.
 */
class FSyncLocalFileMsgEx : public FSyncLocalFileMsg
{
public:
// Runs fsync() and sends its result to the requestor as a FSyncLocalFileRespMsg.
virtual bool processIncoming(ResponseContext& ctx);
private:
// Does the actual work for the request; the returned code is what gets sent in the response.
FhgfsOpsErr fsync();
};

View File

@@ -0,0 +1,252 @@
#include <common/net/message/control/GenericResponseMsg.h>
#include <common/net/message/session/opening/CloseChunkFileRespMsg.h>
#include <common/toolkit/SessionTk.h>
#include <net/msghelpers/MsgHelperIO.h>
#include <program/Program.h>
#include <toolkit/StorageTkEx.h>
#include "CloseChunkFileMsgEx.h"
#include <boost/lexical_cast.hpp>
/**
 * Handles an incoming close request: closes the chunk file via close(), sends the response
 * (unless close() answered already) and updates the per-node operation counters.
 *
 * @return always true
 */
bool CloseChunkFileMsgEx::processIncoming(ResponseContext& ctx)
{
   App* app = Program::getApp();

   auto closeOutcome = close(ctx);
   const FhgfsOpsErr closeRes = closeOutcome.first;
   const DynamicAttribs& attribs = closeOutcome.second;

   // FhgfsOpsErr_COMMUNICATION means that a GenericResponseMsg has been sent already
   const bool responsePending = (closeRes != FhgfsOpsErr_COMMUNICATION);

   if (responsePending)
   {
      ctx.sendResponse(
         CloseChunkFileRespMsg(closeRes, attribs.filesize, attribs.allocedBlocks,
            attribs.modificationTimeSecs, attribs.lastAccessTimeSecs,
            attribs.storageVersion) );
   }

   // account for this operation in the per-node stats
   app->getNodeOpStats()->updateNodeOp(ctx.getSocket()->getPeerIP(), StorageOpCounter_CLOSELOCAL,
      getMsgHeaderUserID() );

   return true;
}
/**
 * Resolves the target, forwards the request to the secondary (if appropriate), removes the
 * session file for the file handle and closes it if it is no longer in use. Unless the
 * NODYNAMICATTRIBS flag is set, the dynamic attribs of the chunk file are retrieved as well.
 *
 * @return pair of result code and dynamic attribs; the attribs are all zero if they were not
 *    retrieved. A result of FhgfsOpsErr_COMMUNICATION comes from forwardToSecondary().
 */
std::pair<FhgfsOpsErr, CloseChunkFileMsgEx::DynamicAttribs> CloseChunkFileMsgEx::close(
   ResponseContext& ctx)
{
   const char* logContext = "CloseChunkFileMsg incoming";

   App* app = Program::getApp();
   Config* cfg = app->getConfig();

   DynamicAttribs attribs = {0, 0, 0, 0, 0};
   std::string handleID(getFileHandleID() );

   const bool mirrored = isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR);

   // figure out which target is meant
   uint16_t targetID = getTargetID();

   if (mirrored)
   { // the ID from the message is a buddy mirror group ID => map it to a target ID
      MirrorBuddyGroupMapper* groupMapper = app->getMirrorBuddyGroupMapper();

      if (isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND) )
         targetID = groupMapper->getSecondaryTargetID(targetID);
      else
         targetID = groupMapper->getPrimaryTargetID(targetID);

      if (unlikely(!targetID) )
      { // no target known for this group
         LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
            StringTk::uintToStr(getTargetID() ) );

         return {FhgfsOpsErr_UNKNOWNTARGET, {}};
      }
   }

   // let the secondary process the request first (no-op unless we are a mirror primary)
   const FhgfsOpsErr forwardRes = forwardToSecondary(ctx);
   if (unlikely(forwardRes != FhgfsOpsErr_SUCCESS) )
      return {forwardRes, attribs};

   auto session = app->getSessions()->referenceOrAddSession(getSessionID() );
   SessionLocalFileStore* localFiles = session->getLocalFiles();

   auto closedFile = localFiles->removeSession(handleID, targetID, mirrored);

   const bool wantAttribs =
      !isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_NODYNAMICATTRIBS);

   FhgfsOpsErr closeRes = FhgfsOpsErr_SUCCESS; // the result that goes to the requestor

   if (closedFile)
   { // file no longer in use => refresh attribs and close the fd
      auto& fd = closedFile->getFD();

      /* "early stat": get the attribs through the fd before closing it. Depending on the
         underlying file system, st_blocks might be too large at this point (pre-allocated
         blocks that are only released on close() ), but we have the fd at hand already. */
      if (wantAttribs && cfg->getTuneEarlyStat() )
         getDynamicAttribsByFD(*fd, handleID, targetID, attribs);

      if (!closedFile->close() )
         closeRes = FhgfsOpsErr_INTERNAL;

      // "late stat": get the attribs after close, so that xfs has released pre-allocated blocks
      if (wantAttribs && !cfg->getTuneEarlyStat() )
         getDynamicAttribsByPath(handleID, targetID, attribs);
   }
   else if (wantAttribs)
   { // file still in use by other threads => get the attribs by path (result is not checked)
      getDynamicAttribsByPath(handleID, targetID, attribs);
   }

   // note: "file not exists" is not an error. there is just nothing to do in that case.

   return {closeRes, attribs};
}
/**
 * Forwards this msg to the secondary if this is a buddy mirror msg and the "buddymirror second"
 * flag is not set (i.e. we are the primary).
 *
 * @return FhgfsOpsErr_SUCCESS if there was nothing to forward, if the secondary reported
 *    success, or if the secondary is offline (local processing shall go ahead then);
 *    FhgfsOpsErr_COMMUNICATION if forwarding failed otherwise (a GenericResponseMsg has been
 *    sent to the requestor in that case); else the error reported by the secondary.
 * @throw SocketException if sending of GenericResponseMsg fails.
 */
FhgfsOpsErr CloseChunkFileMsgEx::forwardToSecondary(ResponseContext& ctx)
{
   const char* logContext = "CloseChunkFileMsg incoming (forward to secondary)";

   const bool mustForward =
      isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR) &&
      !isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND);

   if (!mustForward)
      return FhgfsOpsErr_SUCCESS; // nothing to do

   App* app = Program::getApp();

   /* "this" is sent to the secondary instead of a new msg object, so the "buddymirror second"
      flag is set only for the duration of the request and removed again afterwards */
   addMsgHeaderFeatureFlag(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND);

   RequestResponseArgs rrArgs(NULL, this, NETMSGTYPE_CloseChunkFileResp);
   RequestResponseTarget rrTarget(getTargetID(), app->getTargetMapper(), app->getStorageNodes(),
      app->getTargetStateStore(), app->getMirrorBuddyGroupMapper(), true);

   const FhgfsOpsErr commRes = MessagingTk::requestResponseTarget(&rrTarget, &rrArgs);

   unsetMsgHeaderFeatureFlag(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND);

   if (unlikely(commRes != FhgfsOpsErr_SUCCESS) )
   {
      const bool secondaryOffline = (commRes == FhgfsOpsErr_COMMUNICATION) &&
         (rrTarget.outTargetReachabilityState == TargetReachabilityState_OFFLINE);

      if (secondaryOffline)
      {
         LOG_DEBUG(logContext, Log_DEBUG, std::string("Secondary is offline and will need resync. ") +
            "mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );

         return FhgfsOpsErr_SUCCESS; // go ahead with local msg processing
      }

      // any other failure => tell the requestor that indirect communication failed
      LogContext(logContext).log(Log_DEBUG, "Forwarding failed. "
         "mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) + "; "
         "error: " + boost::lexical_cast<std::string>(commRes));

      std::string genericRespStr = "Communication with secondary failed. "
         "mirror buddy group ID: " + StringTk::uintToStr(getTargetID() );

      ctx.sendResponse(
         GenericResponseMsg(GenericRespMsgCode_INDIRECTCOMMERR, std::move(genericRespStr)));

      return FhgfsOpsErr_COMMUNICATION;
   }

   // communication worked => pass on whatever the secondary reported
   CloseChunkFileRespMsg* respMsg = static_cast<CloseChunkFileRespMsg*>(rrArgs.outRespMsg.get() );
   const FhgfsOpsErr secondaryRes = respMsg->getResult();

   if (unlikely(secondaryRes != FhgfsOpsErr_SUCCESS) )
   {
      LogContext(logContext).log(Log_NOTICE, std::string("Secondary reported error: ") +
         boost::lexical_cast<std::string>(secondaryRes) + "; "
         "mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
   }

   return secondaryRes;
}
bool CloseChunkFileMsgEx::getDynamicAttribsByFD(const int fd, std::string fileHandleID,
uint16_t targetID, DynamicAttribs& outDynAttribs)
{
SyncedStoragePaths* syncedPaths = Program::getApp()->getSyncedStoragePaths();
std::string fileID(SessionTk::fileIDFromHandleID(fileHandleID) );
uint64_t storageVersion = syncedPaths->lockPath(fileID, targetID); // LOCK
// note: this is locked because we need to get the filesize together with the storageVersion
bool getDynAttribsRes = StorageTkEx::getDynamicFileAttribs(fd, &outDynAttribs.filesize,
&outDynAttribs.allocedBlocks, &outDynAttribs.modificationTimeSecs,
&outDynAttribs.lastAccessTimeSecs);
if(getDynAttribsRes)
outDynAttribs.storageVersion = storageVersion;
syncedPaths->unlockPath(fileID, targetID); // UNLOCK
return getDynAttribsRes;
}
bool CloseChunkFileMsgEx::getDynamicAttribsByPath(std::string fileHandleID, uint16_t targetID,
DynamicAttribs& outDynAttribs)
{
const char* logContext = "CloseChunkFileMsg (attribs by path)";
App* app = Program::getApp();
SyncedStoragePaths* syncedPaths = app->getSyncedStoragePaths();
auto* const target = app->getStorageTargets()->getTarget(targetID);
if (!target)
{ // unknown targetID
LogContext(logContext).logErr("Unknown targetID: " + StringTk::uintToStr(targetID) );
return false;
}
const int targetFD = isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR)
? *target->getMirrorFD()
: *target->getChunkFD();
std::string fileID = SessionTk::fileIDFromHandleID(fileHandleID);
std::string pathStr = StorageTk::getFileChunkPath(getPathInfo(), fileID);
uint64_t storageVersion = syncedPaths->lockPath(fileID, targetID); // L O C K path
// note: this is locked because we need to get the filesize together with the storageVersion
bool getDynAttribsRes = StorageTkEx::getDynamicFileAttribs(targetFD, pathStr.c_str(),
&outDynAttribs.filesize, &outDynAttribs.allocedBlocks, &outDynAttribs.modificationTimeSecs,
&outDynAttribs.lastAccessTimeSecs);
if(getDynAttribsRes)
outDynAttribs.storageVersion = storageVersion;
syncedPaths->unlockPath(fileID, targetID); // U N L O C K path
return getDynAttribsRes;
}

View File

@@ -0,0 +1,29 @@
#pragma once
#include <common/net/message/session/opening/CloseChunkFileMsg.h>
/**
 * Handler for incoming CloseChunkFileMsg requests.
 */
class CloseChunkFileMsgEx : public CloseChunkFileMsg
{
private:
// Attributes of a chunk file that are sent back to the requestor in the close response.
struct DynamicAttribs
{
int64_t filesize;
int64_t allocedBlocks; // allocated 512byte blocks (relevant for sparse files)
int64_t modificationTimeSecs;
int64_t lastAccessTimeSecs;
uint64_t storageVersion; // set from the value returned by SyncedStoragePaths::lockPath()
};
public:
// Closes the chunk file, sends the response and updates the op counters.
virtual bool processIncoming(ResponseContext& ctx);
private:
// Forwards this msg to the secondary if this is a buddy mirror msg and we are the primary.
FhgfsOpsErr forwardToSecondary(ResponseContext& ctx);
// Retrieves the dynamic attribs through an open file descriptor of the chunk file.
bool getDynamicAttribsByFD(int fd, std::string fileHandleID, uint16_t targetID,
DynamicAttribs& outDynAttribs);
// Retrieves the dynamic attribs through the chunk file path; returns false for unknown targets.
bool getDynamicAttribsByPath(std::string fileHandleID, uint16_t targetID,
DynamicAttribs& outDynAttribs);
// Does the actual close work; returns the result code together with the dynamic attribs.
std::pair<FhgfsOpsErr, DynamicAttribs> close(ResponseContext& ctx);
};

View File

@@ -0,0 +1,114 @@
#pragma once
#ifdef BEEGFS_NVFS
#include <string>
#include <typeinfo>
#include <common/net/message/session/rw/ReadLocalFileRDMAMsg.h>
#include <common/storage/StorageErrors.h>
#include <common/components/worker/Worker.h>
#include <session/SessionLocalFileStore.h>
#include "ReadLocalFileV2MsgEx.h"
/**
 * Implements RDMA write protocol.
 *
 * This class provides the sender hooks (sendLengthInfo, readStateInit, readStateSendData,
 * readStateNext, getReadLength, getBuffers) that ReadLocalFileMsgExBase calls while reading
 * a chunk file; ReadLocalFileMsgExBase is a friend so that it can reach these private members.
 */
class ReadLocalFileRDMAMsgSender : public ReadLocalFileRDMAMsg
{
public:
/**
 * Read state extended by the remote buffer that is currently being filled.
 * rdma/rBuf/rLen/rOff are not set by the constructor; they are set in readStateInit().
 */
struct ReadState : public ReadStateBase
{
RdmaInfo* rdma; // taken from getRdmaInfo() in readStateInit()
uint64_t rBuf; // remote buffer; used together with rOff as the write destination
size_t rLen; // compared against rOff to detect that the current remote buffer is used up
uint64_t rOff; // offset within the current remote buffer; advanced by readRes after each write
ReadState(const char* logContext, uint64_t toBeRead,
SessionLocalFile* sessionLocalFile) :
ReadStateBase(logContext, toBeRead, sessionLocalFile) {}
};
private:
friend class ReadLocalFileMsgExBase<ReadLocalFileRDMAMsgSender, ReadState>;
static std::string logContextPref;
/**
 * Sends the given length info as a 64bit value, converted with HOST_TO_LE_64.
 */
inline void sendLengthInfo(Socket* sock, int64_t lengthInfo)
{
lengthInfo = HOST_TO_LE_64(lengthInfo);
sock->send(&lengthInfo, sizeof(int64_t), 0);
}
/**
 * RDMA write data to the remote buffer.
 *
 * Writes rs.readRes bytes from buf to rs.rBuf + rs.rOff. If isFinal is set and the write
 * succeeded, the length info (getCount() - rs.toBeRead) is sent afterwards.
 *
 * @return number of bytes written, or -1 if not exactly rs.readRes bytes were written
 */
inline ssize_t readStateSendData(Socket* sock, ReadState& rs, char* buf, bool isFinal)
{
ssize_t writeRes = sock->write(buf, rs.readRes, 0, rs.rBuf + rs.rOff, rs.rdma->key);
LOG_DEBUG(rs.logContext, Log_DEBUG,
"buf: " + StringTk::uint64ToHexStr((uint64_t)buf) + "; "
"bufLen: " + StringTk::int64ToStr(rs.readRes) + "; "
"rbuf: " + StringTk::uint64ToHexStr(rs.rBuf) + "; "
"rkey: " + StringTk::uintToHexStr(rs.rdma->key) + "; "
"writeRes: " + StringTk::int64ToStr(writeRes));
if (unlikely(writeRes != rs.readRes))
{
LogContext(rs.logContext).logErr("Unable to write file data to client. "
"FileID: " + rs.sessionLocalFile->getFileID() + "; "
"SysErr: " + System::getErrString());
writeRes = -1;
}
if (isFinal && likely(writeRes >= 0))
sendLengthInfo(sock, getCount() - rs.toBeRead);
return writeRes;
}
/**
 * Limits the given length to the space left in the current remote buffer and to
 * WORKER_BUFOUT_SIZE.
 */
inline ssize_t getReadLength(ReadState& rs, ssize_t len)
{
// Cannot RDMA anything larger than WORKER_BUFOUT_SIZE in a single operation
// because that is the size of the buffer passed in by the Worker.
// TODO: pass around a Buffer with a length instead of unqualified char*.
return BEEGFS_MIN(BEEGFS_MIN(len, ssize_t(rs.rLen - rs.rOff)), WORKER_BUFOUT_SIZE);
}
/**
 * Stores the RDMA info of this message in the read state and fetches the first remote buffer.
 *
 * @return false if rdma->next() did not deliver a buffer
 */
inline bool readStateInit(ReadState& rs)
{
rs.rdma = getRdmaInfo();
if (unlikely(!rs.rdma->next(rs.rBuf, rs.rLen, rs.rOff)))
{
LogContext(rs.logContext).logErr("No entities in RDMA buffers.");
return false;
}
return true;
}
/**
 * Advances the remote offset by the size of the last read and switches to the next remote
 * buffer when the current one is completely used.
 *
 * @return false if another buffer is needed but rdma->next() did not deliver one
 */
inline bool readStateNext(ReadState& rs)
{
rs.rOff += rs.readRes;
if (rs.rOff == rs.rLen)
{
if (unlikely(!rs.rdma->next(rs.rBuf, rs.rLen, rs.rOff)))
{
LogContext(rs.logContext).logErr("RDMA buffers exhausted");
return false;
}
}
return true;
}
/**
 * Sets both dataBuf and sendBuf to the start of the context buffer (no space is reserved for
 * length info here).
 *
 * @return length of the context buffer
 */
inline size_t getBuffers(ResponseContext& ctx, char** dataBuf, char** sendBuf)
{
*dataBuf = ctx.getBuffer();
*sendBuf = *dataBuf;
return ctx.getBufferLength();
}
};
// The RDMA read handler type: the generic read implementation instantiated with the RDMA
// sender class and its read state.
typedef ReadLocalFileMsgExBase<ReadLocalFileRDMAMsgSender,
ReadLocalFileRDMAMsgSender::ReadState> ReadLocalFileRDMAMsgEx;
#endif /* BEEGFS_NVFS */

View File

@@ -0,0 +1,466 @@
#include <program/Program.h>
#include <common/storage/StorageErrors.h>
#include <common/toolkit/SessionTk.h>
#include <net/msghelpers/MsgHelperIO.h>
#include <toolkit/StorageTkEx.h>
#include "ReadLocalFileV2MsgEx.h"
#ifdef BEEGFS_NVFS
#include "ReadLocalFileRDMAMsgEx.h"
#endif
#include <sys/sendfile.h>
#include <sys/mman.h>
#define READ_USE_TUNEFILEREAD_TRIGGER (4*1024*1024) /* seq IO trigger for tuneFileReadSize */
#define READ_BUF_OFFSET_PROTO_MIN (sizeof(int64_t) ) /* for prepended length info */
#define READ_BUF_END_PROTO_MIN (sizeof(int64_t) ) /* for appended length info */
/* note: the three constants below depend on sysconf(), so they are initialized at program
start and not at compile time */
/* reserve more than necessary at buf start to achieve page cache alignment */
const size_t READ_BUF_OFFSET =
BEEGFS_MAX( (long)READ_BUF_OFFSET_PROTO_MIN, sysconf(_SC_PAGESIZE) );
/* reserve more than necessary at buf end to achieve page cache alignment */
const size_t READ_BUF_END_RESERVE =
BEEGFS_MAX( (long)READ_BUF_END_PROTO_MIN, sysconf(_SC_PAGESIZE) );
/* read buffer size cutoff for protocol data */
const size_t READ_BUF_LEN_PROTOCOL_CUTOFF =
READ_BUF_OFFSET + READ_BUF_END_RESERVE;
// A linker error occurs for processIncoming without having this forced linkage.
// (the static objects below are only there for that purpose)
static ReadLocalFileV2MsgEx forcedLinkageV2;
#ifdef BEEGFS_NVFS
static ReadLocalFileRDMAMsgEx forcedLinkageRDMA;
#endif
// log context prefixes of the sender classes (used to build the log contexts in this file)
std::string ReadLocalFileV2MsgSender::logContextPref = "ReadChunkFileV2Msg";
#ifdef BEEGFS_NVFS
std::string ReadLocalFileRDMAMsgSender::logContextPref = "ReadChunkFileRDMAMsg";
#endif
/**
 * Handles an incoming read request: resolves the target, references (or creates) the session
 * file for the file handle, opens the chunk file if necessary and sends the requested data
 * through incrementalReadStatefulAndSendV2().
 *
 * Errors are reported to the client as a negative FhgfsOpsErr value via sendLengthInfo().
 *
 * @return false if a SocketException was caught, true otherwise (also if an error code was
 *    sent to the client).
 */
template <class Msg, typename ReadState>
bool ReadLocalFileMsgExBase<Msg, ReadState>::processIncoming(NetMessage::ResponseContext& ctx)
{
std::string logContext = Msg::logContextPref + " incoming";
bool retVal = true; // return value
int64_t readRes = 0; // only values >0 are counted in the op stats at the end
std::string fileHandleID(getFileHandleID() );
bool isMirrorSession = isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_BUDDYMIRROR);
// do session check only when it is not a mirror session
bool useSessionCheck = isMirrorSession ? false :
isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_SESSION_CHECK);
App* app = Program::getApp();
SessionStore* sessions = app->getSessions();
auto session = sessions->referenceOrAddSession(getClientNumID());
this->sessionLocalFiles = session->getLocalFiles();
// select the right targetID
uint16_t targetID = getTargetID();
if(isMirrorSession )
{ // given targetID refers to a buddy mirror group
MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();
targetID = isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) ?
mirrorBuddies->getSecondaryTargetID(targetID) :
mirrorBuddies->getPrimaryTargetID(targetID);
// note: only log message here, error handling will happen below through the target lookup
if(unlikely(!targetID) )
LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
StringTk::uintToStr(getTargetID() ) );
}
auto* const target = app->getStorageTargets()->getTarget(targetID);
if (!target)
{
if (isMirrorSession)
{ /* buddy mirrored file => fail with Err_COMMUNICATION to make the requestor retry.
mgmt will mark this target as (p)offline in a few moments. */
LOG(GENERAL, NOTICE, "Unknown target ID, refusing request.", targetID);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
return true;
}
LOG(GENERAL, ERR, "Unknown target ID.", targetID);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_UNKNOWNTARGET);
return true;
}
// check if we already have a session for this file...
auto sessionLocalFile = sessionLocalFiles->referenceSession(
fileHandleID, targetID, isMirrorSession);
if(!sessionLocalFile)
{ // sessionLocalFile does not exist yet => create, insert, re-get it
if(useSessionCheck)
{ // server crashed during the write, maybe lost some data send error to client
LogContext log(logContext);
log.log(Log_WARNING, "Potential cache loss for open file handle. (Server crash detected.) "
"No session for file available. "
"FileHandleID: " + fileHandleID);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_STORAGE_SRV_CRASHED);
goto release_session;
}
std::string fileID = SessionTk::fileIDFromHandleID(fileHandleID);
int openFlags = SessionTk::sysOpenFlagsFromFhgfsAccessFlags(getAccessFlags() );
auto newFile = boost::make_unique<SessionLocalFile>(fileHandleID, targetID, fileID, openFlags,
false);
if(isMirrorSession)
newFile->setIsMirrorSession(true);
sessionLocalFile = sessionLocalFiles->addAndReferenceSession(std::move(newFile));
}
else
{ // session file exists
if(useSessionCheck && sessionLocalFile->isServerCrashed() )
{ // server crashed during the write, maybe lost some data send error to client
LogContext log(logContext);
log.log(Log_SPAM, "Potential cache loss for open file handle. (Server crash detected.) "
"The session is marked as dirty. "
"FileHandleID: " + fileHandleID);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_STORAGE_SRV_CRASHED);
goto release_session;
}
}
/* Note: the session file must be unlocked/released before we send the finalizing info,
because otherwise we have a race when the client assumes the read is complete and tries
to close the file (while the handle is actually still referenced on the server). */
/* Note: we also must be careful to update the current offset before sending the final length
info because otherwise the session file might have been released already and we have no
longer access to the offset. */
readRes = -1; // stays negative unless the read below completes
try
{
// prepare file descriptor (if file not open yet then open it if it exists already)
FhgfsOpsErr openRes = openFile(*target, sessionLocalFile.get());
if(openRes != FhgfsOpsErr_SUCCESS)
{
sendLengthInfo(ctx.getSocket(), -openRes);
goto release_session;
}
// check if file exists
if(!sessionLocalFile->getFD().valid())
{ // file didn't exist (not an error) => send EOF
sendLengthInfo(ctx.getSocket(), 0);
goto release_session;
}
// the actual read workhorse...
readRes = incrementalReadStatefulAndSendV2(ctx, sessionLocalFile.get());
LOG_DEBUG(logContext, Log_SPAM, "sending completed. "
"readRes: " + StringTk::int64ToStr(readRes) );
IGNORE_UNUSED_VARIABLE(readRes);
}
catch(SocketException& e)
{
LogContext(logContext).logErr(std::string("SocketException occurred: ") + e.what() );
LogContext(logContext).log(Log_WARNING, "Details: "
"sessionID: " + getClientNumID().str() + "; "
"fileHandle: " + fileHandleID + "; "
"offset: " + StringTk::int64ToStr(getOffset() ) + "; "
"count: " + StringTk::int64ToStr(getCount() ) );
sessionLocalFile->setOffset(-1); /* invalidate offset (we can only do this if still locked,
but that's not a prob if we update offset correctly before send - see notes above) */
retVal = false;
goto release_session;
}
release_session:
// update operation counters (only if some data was actually read)
if(likely(readRes > 0) )
app->getNodeOpStats()->updateNodeOp(
ctx.getSocket()->getPeerIP(), StorageOpCounter_READOPS, readRes, getMsgHeaderUserID() );
return retVal;
}
/**
 * Splits the context buffer into the area where file data is read to and the area that is sent
 * to the client. The send area starts READ_BUF_OFFSET_PROTO_MIN bytes before the data area, so
 * that there is room for the prepended length info.
 *
 * @param dataBuf set to the start of the data area (READ_BUF_OFFSET into the context buffer)
 * @param sendBuf set to the start of the send area
 * @return context buffer length minus READ_BUF_LEN_PROTOCOL_CUTOFF
 */
inline size_t ReadLocalFileV2MsgSender::getBuffers(ResponseContext& ctx, char** dataBuf, char** sendBuf)
{
   char* const dataStart = ctx.getBuffer() + READ_BUF_OFFSET;

   *dataBuf = dataStart;
   *sendBuf = dataStart - READ_BUF_OFFSET_PROTO_MIN;

   // cutoff for prepended and finalizing length info
   return ctx.getBufferLength() - READ_BUF_LEN_PROTOCOL_CUTOFF;
}
/**
 * Reads the requested range of the chunk file piece by piece and hands each piece to the
 * sender (readStateSendData) until the requested count is reached, the end of the file is hit
 * or an error occurs.
 *
 * Note: This is similar to incrementalReadAndSend, but uses the offset from sessionLocalFile
 * to avoid calling seek every time.
 *
 * Warning: Do not use the returned value to set the new offset, as there might be other threads
 * that also did something with the file (i.e. the io-lock is released somewhere within this
 * method).
 *
 * @param ctx provides the socket, the buffer and the stats for this request
 * @param sessionLocalFile session file with a valid fd; its offset and read counter are
 *    updated here (the offset is set to -1 on errors)
 * @return number of bytes read or some arbitrary negative value otherwise
 */
template <class Msg, typename ReadState>
int64_t ReadLocalFileMsgExBase<Msg, ReadState>::incrementalReadStatefulAndSendV2(NetMessage::ResponseContext& ctx,
SessionLocalFile* sessionLocalFile)
{
/* note on session offset: the session offset must always be set before sending the data to the
client (otherwise the client could send the next request before we updated the offset, which
would lead to a race condition) */
std::string logContext = Msg::logContextPref + " (read incremental)";
Config* cfg = Program::getApp()->getConfig();
char* dataBuf;
char* sendBuf;
if (READ_BUF_LEN_PROTOCOL_CUTOFF >= ctx.getBufferLength())
{ // buffer too small. That shouldn't happen and is an error
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_INTERNAL);
return -1;
}
const ssize_t dataBufLen = getBuffers(ctx, &dataBuf, &sendBuf);
auto& fd = sessionLocalFile->getFD();
int64_t oldOffset = sessionLocalFile->getOffset();
int64_t newOffset = getOffset();
// no read-ahead if IO is disabled for this msg or if the file uses direct IO
bool skipReadAhead =
unlikely(isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_DISABLE_IO) ||
sessionLocalFile->getIsDirectIO());
ssize_t readAheadSize = skipReadAhead ? 0 : cfg->getTuneFileReadAheadSize();
ssize_t readAheadTriggerSize = cfg->getTuneFileReadAheadTriggerSize();
if( (oldOffset < 0) || (oldOffset != newOffset) )
{ // session offset is invalid or this read does not continue where the last one ended
sessionLocalFile->resetReadCounter(); // reset sequential read counter
sessionLocalFile->resetLastReadAheadTrigger();
}
else
{ // read continues at previous offset
LOG_DEBUG(logContext, Log_SPAM,
"fileID: " + sessionLocalFile->getFileID() + "; "
"offset: " + StringTk::int64ToStr(getOffset() ) );
}
size_t maxReadAtOnceLen = dataBufLen;
// reduce maxReadAtOnceLen to achieve better read/send async overlap
/* (note: reducing makes only sense if we can rely on the kernel to do some read-ahead, so don't
reduce for direct IO and for random IO) */
if( (sessionLocalFile->getReadCounter() >= READ_USE_TUNEFILEREAD_TRIGGER) &&
!sessionLocalFile->getIsDirectIO() )
maxReadAtOnceLen = BEEGFS_MIN(dataBufLen, cfg->getTuneFileReadSize() );
off_t readOffset = getOffset();
ReadState readState(logContext.c_str(), getCount(), sessionLocalFile);
if (!isMsgValid() || !readStateInit(readState))
{
LogContext(logContext).logErr("Invalid read message.");
sessionLocalFile->setOffset(-1);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_INVAL);
return -1;
}
for( ; ; )
{
ssize_t readLength = getReadLength(readState, BEEGFS_MIN(maxReadAtOnceLen, readState.toBeRead));
// with the DISABLE_IO flag, nothing is read from disk and the read is treated as complete
readState.readRes = unlikely(isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_DISABLE_IO) ) ?
readLength : MsgHelperIO::pread(*fd, dataBuf, readLength, readOffset);
LOG_DEBUG(logContext, Log_SPAM,
"toBeRead: " + StringTk::int64ToStr(readState.toBeRead) + "; "
"readLength: " + StringTk::int64ToStr(readLength) + "; "
"readRes: " + StringTk::int64ToStr(readState.readRes) );
if(readState.readRes == readLength)
{ // simple success case
readState.toBeRead -= readState.readRes;
readOffset += readState.readRes;
// note: this local newOffset shadows the newOffset declared at the top of the function
int64_t newOffset = getOffset() + getCount() - readState.toBeRead;
sessionLocalFile->setOffset(newOffset); // update offset
sessionLocalFile->incReadCounter(readState.readRes); // update sequential read length
ctx.getStats()->incVals.diskReadBytes += readState.readRes; // update stats
bool isFinal = !readState.toBeRead;
if (readStateSendData(ctx.getSocket(), readState, sendBuf, isFinal) < 0)
{
LogContext(logContext).logErr("readStateSendData failed.");
sessionLocalFile->setOffset(-1);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
return -1;
}
checkAndStartReadAhead(sessionLocalFile, readAheadTriggerSize, newOffset, readAheadSize);
if(isFinal)
{ // we reached the end of the requested data
return getCount();
}
if (!readStateNext(readState))
{
LogContext(logContext).logErr("readStateNext failed.");
sessionLocalFile->setOffset(-1);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
return -1;
}
}
else
{ // readRes not as it should be => might be an error or just an end-of-file
if(readState.readRes == -1)
{ // read error occurred
LogContext(logContext).log(Log_WARNING, "Unable to read file data. "
"FileID: " + sessionLocalFile->getFileID() + "; "
"SysErr: " + System::getErrString() );
sessionLocalFile->setOffset(-1);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_INTERNAL);
return -1;
}
else
{ // just an end of file
LOG_DEBUG(logContext, Log_DEBUG,
"Unable to read all of the requested data (=> end of file)");
LOG_DEBUG(logContext, Log_DEBUG,
"offset: " + StringTk::int64ToStr(getOffset() ) + "; "
"count: " + StringTk::int64ToStr(getCount() ) + "; "
"readLength: " + StringTk::int64ToStr(readLength) + "; " +
"readRes: " + StringTk::int64ToStr(readState.readRes) + "; " +
"toBeRead: " + StringTk::int64ToStr(readState.toBeRead) );
readOffset += readState.readRes;
readState.toBeRead -= readState.readRes;
sessionLocalFile->setOffset(getOffset() + getCount() - readState.toBeRead); // update offset
sessionLocalFile->incReadCounter(readState.readRes); // update sequential read length
ctx.getStats()->incVals.diskReadBytes += readState.readRes; // update stats
// send the partial data as the final piece, or just the final length info if there is none
if(readState.readRes > 0)
{
if (readStateSendData(ctx.getSocket(), readState, sendBuf, true) < 0)
{
LogContext(logContext).logErr("readStateSendData failed.");
sessionLocalFile->setOffset(-1);
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
return -1;
}
}
else
sendLengthInfo(ctx.getSocket(), 0);
return(getCount() - readState.toBeRead);
}
}
} // end of for-loop
}
/**
 * Triggers read-ahead when the sequential read counter of the session file has reached the
 * next trigger point.
 *
 * The first trigger point is readAheadTriggerSize; after read-ahead was triggered once, the
 * next trigger point is the last one plus readAheadSize.
 *
 * Note: the IO-disabled case is not checked here; the caller is expected to pass
 * readAheadSize==0 in that case.
 *
 * @param sessionLocalFile its last read-ahead trigger is updated if read-ahead was started
 * @param readAheadTriggerSize the length of sequential IO that triggers the first read-ahead
 * @param currentOffset current file offset (where read-ahead starts)
 * @param readAheadSize length of the read-ahead; 0 disables read-ahead
 */
template <class Msg, typename ReadState>
void ReadLocalFileMsgExBase<Msg, ReadState>::checkAndStartReadAhead(SessionLocalFile* sessionLocalFile,
   ssize_t readAheadTriggerSize, off_t currentOffset, off_t readAheadSize)
{
   std::string logContext = Msg::logContextPref + " (read-ahead)";

   if (readAheadSize == 0)
      return; // read-ahead is turned off

   const int64_t seqBytesRead = sessionLocalFile->getReadCounter();

   int64_t triggerPoint;
   if (sessionLocalFile->getLastReadAheadTrigger() )
      triggerPoint = sessionLocalFile->getLastReadAheadTrigger() + readAheadSize;
   else
      triggerPoint = readAheadTriggerSize;

   if (seqBytesRead < triggerPoint)
      return; // not enough sequential data read yet

   /* kick off the read-ahead
      (it is supposed to be non-blocking if there are free slots in the device IO queue) */

   LOG_DEBUG(logContext, Log_SPAM,
      std::string("Starting read-ahead... ") +
      "offset: " + StringTk::int64ToStr(currentOffset) + "; "
      "size: " + StringTk::int64ToStr(readAheadSize) );

   MsgHelperIO::readAhead(*sessionLocalFile->getFD(), currentOffset, readAheadSize);

   // remember where we triggered, so that the next trigger point can be derived from it
   sessionLocalFile->setLastReadAheadTrigger(seqBytesRead);
}
/**
 * Opens the chunk file of the given session file, unless it has a valid file descriptor
 * already. Requests for buddy mirrored chunks are refused if the consistency state of the
 * target is not good.
 *
 * @return FhgfsOpsErr_SUCCESS if the file was open already; the special value
 *    FhgfsOpsErr_COMMUNICATION to indirectly ask the client for a retry (if target consistency
 *    is not good for a buddy mirrored chunk); otherwise the result of
 *    SessionLocalFile::openFile().
 */
template <class Msg, typename ReadState>
FhgfsOpsErr ReadLocalFileMsgExBase<Msg, ReadState>::openFile(const StorageTarget& target,
   SessionLocalFile* sessionLocalFile)
{
   std::string logContext = Msg::logContextPref + " (open)";

   const bool mirroredChunk = sessionLocalFile->getIsMirrorSession();

   if (sessionLocalFile->getFD().valid() )
      return FhgfsOpsErr_SUCCESS; // already open => nothing to do

   // not open yet => check the consistency state before opening
   const auto consistencyState = target.getConsistencyState();
   const bool targetNotGood = unlikely(consistencyState != TargetConsistencyState_GOOD);

   if (targetNotGood && mirroredChunk)
   { // buddy mirrored chunk on a non-good target => make the client retry
      LogContext(logContext).log(Log_NOTICE, "Refusing request. Target consistency is not good. "
         "targetID: " + StringTk::uintToStr(target.getID()));

      return FhgfsOpsErr_COMMUNICATION;
   }

   // buddy mirrored chunks are opened relative to the mirror fd of the target
   int targetFD;
   if (mirroredChunk)
      targetFD = *target.getMirrorFD();
   else
      targetFD = *target.getChunkFD();

   return sessionLocalFile->openFile(targetFD, getPathInfo(), false, NULL);
}

View File

@@ -0,0 +1,216 @@
#pragma once
#include <common/net/message/session/rw/ReadLocalFileV2Msg.h>
#include <common/storage/StorageErrors.h>
#include <session/SessionLocalFileStore.h>
class StorageTarget;
/**
* Contains common data needed by implementations of the network protocol
* that send data to the client.
*/
struct ReadStateBase
{
const char* logContext;
uint64_t toBeRead;
SessionLocalFile* sessionLocalFile;
ssize_t readRes;
ReadStateBase(const char* logContext, uint64_t toBeRead,
SessionLocalFile* sessionLocalFile)
{
this->logContext = logContext;
this->toBeRead = toBeRead;
this->sessionLocalFile = sessionLocalFile;
}
};
/**
 * Common read-request handling, parameterized over the message class that it derives from.
 *
 * Apart from processIncoming() and the helpers declared here, every method below simply
 * forwards to the method of the same name in the Msg base class.
 *
 * @tparam Msg the message class to derive from; the forwarding methods are invoked on it.
 * @tparam ReadState the state type that is passed to the readState* methods of Msg.
 */
template <class Msg, typename ReadState>
class ReadLocalFileMsgExBase : public Msg
{
   public:
      bool processIncoming(NetMessage::ResponseContext& ctx);

   private:
      SessionLocalFileStore* sessionLocalFiles;

      // this object seen as its Msg base class (mutable and read-only flavour)
      Msg& asMsg()
      {
         return static_cast<Msg&>(*this);
      }

      const Msg& asMsg() const
      {
         return static_cast<const Msg&>(*this);
      }

      FhgfsOpsErr openFile(const StorageTarget& target, SessionLocalFile* sessionLocalFile);

      void checkAndStartReadAhead(SessionLocalFile* sessionLocalFile, ssize_t readAheadTriggerSize,
         off_t currentOffset, off_t readAheadSize);

      int64_t incrementalReadStatefulAndSendV2(NetMessage::ResponseContext& ctx,
         SessionLocalFile* sessionLocalFile);

      // protocol hooks, forwarded to Msg

      void sendLengthInfo(Socket* sock, int64_t lengthInfo)
      {
         asMsg().sendLengthInfo(sock, lengthInfo);
      }

      bool readStateInit(ReadState& rs)
      {
         return asMsg().readStateInit(rs);
      }

      ssize_t readStateSendData(Socket* sock, ReadState& rs, char* buf, bool isFinal)
      {
         return asMsg().readStateSendData(sock, rs, buf, isFinal);
      }

      bool readStateNext(ReadState& rs)
      {
         return asMsg().readStateNext(rs);
      }

      ssize_t getReadLength(ReadState& rs, ssize_t len)
      {
         return asMsg().getReadLength(rs, len);
      }

      size_t getBuffers(NetMessage::ResponseContext& ctx, char** dataBuf, char** sendBuf)
      {
         return asMsg().getBuffers(ctx, dataBuf, sendBuf);
      }

   public:
      // message getters, forwarded to Msg

      unsigned getMsgHeaderUserID() const
      {
         return asMsg().getMsgHeaderUserID();
      }

      bool isMsgHeaderFeatureFlagSet(unsigned flag) const
      {
         return asMsg().isMsgHeaderFeatureFlagSet(flag);
      }

      uint16_t getTargetID() const
      {
         return asMsg().getTargetID();
      }

      int64_t getOffset() const
      {
         return asMsg().getOffset();
      }

      int64_t getCount() const
      {
         return asMsg().getCount();
      }

      const char* getFileHandleID()
      {
         return asMsg().getFileHandleID();
      }

      NumNodeID getClientNumID() const
      {
         return asMsg().getClientNumID();
      }

      unsigned getAccessFlags() const
      {
         return asMsg().getAccessFlags();
      }

      PathInfo* getPathInfo ()
      {
         return asMsg().getPathInfo();
      }

      bool isMsgValid() const
      {
         return asMsg().isMsgValid();
      }
};
/**
* Implements the Version 2 send protocol. It uses a preceding length info for each chunk.
*/
class ReadLocalFileV2MsgSender : public ReadLocalFileV2Msg
{
/* note on protocol: this works by sending an int64 before each data chunk, which contains the
length of the next data chunk; or a zero if no more data can be read; or a negative fhgfs
error code in case of an error */
public:
struct ReadState : public ReadStateBase
{
ReadState(const char* logContext, uint64_t toBeRead,
SessionLocalFile* sessionLocalFile) :
ReadStateBase(logContext, toBeRead, sessionLocalFile) {}
};
private:
friend class ReadLocalFileMsgExBase<ReadLocalFileV2MsgSender, ReadState>;
static std::string logContextPref;
/**
* Send only length information without a data packet. Typically used for the final length
* info at the end of the requested data.
*/
inline void sendLengthInfo(Socket* sock, int64_t lengthInfo)
{
lengthInfo = HOST_TO_LE_64(lengthInfo);
sock->send(&lengthInfo, sizeof(int64_t), 0);
}
/**
* No-op for this implementation.
*/
inline bool readStateInit(ReadState& rs)
{
return true;
}
/**
* Send length information and the corresponding data packet buffer.
*
* Note: rs.readRes is used to compute buf length for send()
*
* @param rs.readRes must not be negative
* @param buf the buffer with a preceding gap for the length info
* @param isFinal true if this is the last send, i.e. we have read all data
*/
inline ssize_t readStateSendData(Socket* sock, ReadState& rs, char* buf, bool isFinal)
{
ssize_t sendRes;
{
Serializer ser(buf, sizeof(int64_t));
ser % rs.readRes;
}
if (isFinal)
{
Serializer ser(buf + sizeof(int64_t) + rs.readRes, sizeof(int64_t));
ser % int64_t(0);
sendRes = sock->send(buf, (2*sizeof(int64_t) ) + rs.readRes, 0);
}
else
{
sendRes = sock->send(buf, sizeof(int64_t) + rs.readRes, 0);
}
return sendRes;
}
/**
* No-op for this implementation.
*/
inline bool readStateNext(ReadState& rs)
{
return true;
}
inline ssize_t getReadLength(ReadState& rs, ssize_t len)
{
return len;
}
size_t getBuffers(ResponseContext& ctx, char** dataBuf, char** sendBuf);
};
// the read handler instantiated for the version 2 send protocol
using ReadLocalFileV2MsgEx =
   ReadLocalFileMsgExBase<ReadLocalFileV2MsgSender, ReadLocalFileV2MsgSender::ReadState>;

View File

@@ -0,0 +1,926 @@
#include <program/Program.h>
#include <common/toolkit/MessagingTk.h>
#include <common/toolkit/SessionTk.h>
#include <common/toolkit/StorageTk.h>
#include <net/msghelpers/MsgHelperIO.h>
#include <storage/StorageTargets.h>
#include <toolkit/StorageTkEx.h>
#include "WriteLocalFileMsgEx.h"
#ifdef BEEGFS_NVFS
#include "WriteLocalFileRDMAMsgEx.h"
#endif
#include <boost/lexical_cast.hpp>
// File-local instances of the handler types that are never referenced elsewhere in this file.
// NOTE(review): going by the name, these presumably exist to force instantiation of the
// templates defined below in this translation unit - confirm.
static WriteLocalFileMsgEx forcedLinkage;
#ifdef BEEGFS_NVFS
static WriteLocalFileRDMAMsgEx forcedLinkageRDMA;
#endif
// Prefix of the log context strings used by the handlers below (read as Msg::logContextPref).
const std::string WriteLocalFileMsgSender::logContextPref = "WriteChunkFileMsg";
#ifdef BEEGFS_NVFS
const std::string WriteLocalFileRDMAMsgSender::logContextPref = "WriteChunkFileRDMAMsg";
#endif
/**
 * Entry point for an incoming write request.
 *
 * An invalid message is answered with FhgfsOpsErr_INVAL. Otherwise write() does the work and,
 * only if it reports success, its result is sent to the client. The per-node write operation
 * counter is updated when the result is positive.
 *
 * @return false if the message was invalid or write() reported failure, true otherwise.
 */
template <class Msg, typename WriteState>
bool WriteLocalFileMsgExBase<Msg, WriteState>::processIncoming(NetMessage::ResponseContext& ctx)
{
   App* app = Program::getApp();

   if (!isMsgValid())
   {
      sendResponse(ctx, FhgfsOpsErr_INVAL);
      return false;
   }

   std::pair<bool, int64_t> outcome = write(ctx);
   bool handled = outcome.first;
   int64_t clientResult = outcome.second;

   if (!handled)
      return false; // no response is sent in this case

   sendResponse(ctx, clientResult);

   // update operation counters
   if (likely(clientResult > 0))
   {
      app->getNodeOpStats()->updateNodeOp(ctx.getSocket()->getPeerIP(),
         StorageOpCounter_WRITEOPS, clientResult, getMsgHeaderUserID());
   }

   return true;
}
/**
 * Handle a write request: resolve the target, reference (or create) the file session, check the
 * size quota, open the chunk file, set up mirroring, receive and write the data and finally
 * collect the mirror result.
 *
 * Once the file session exists, every error path first drains the pending payload via
 * incrementalRecvPadding() and then jumps to the common cleanup label.
 *
 * @return pair of (success, result). "success" is false if the target is unknown or a
 *    SocketException occurred; processIncoming() sends no response then and ignores "result".
 *    Otherwise "result" is the number of bytes written or a negative fhgfs error code. If a
 *    server crash was detected for this session, "result" is always
 *    -FhgfsOpsErr_STORAGE_SRV_CRASHED.
 */
template <class Msg, typename WriteState>
std::pair<bool, int64_t> WriteLocalFileMsgExBase<Msg, WriteState>::write(NetMessage::ResponseContext& ctx)
{
std::string logContext = Msg::logContextPref + " incoming";
App* app = Program::getApp();
int64_t writeClientRes = -(int64_t)FhgfsOpsErr_INTERNAL; // bytes written or negative fhgfs err
FhgfsOpsErr finishMirroringRes = FhgfsOpsErr_INTERNAL;
std::string fileHandleID(getFileHandleID() );
bool isMirrorSession = isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
bool serverCrashed = false;
QuotaExceededErrorType quotaExceeded = QuotaExceededErrorType_NOT_EXCEEDED;
SessionStore* sessions = Program::getApp()->getSessions();
auto session = sessions->referenceOrAddSession(getClientNumID());
SessionLocalFileStore* sessionLocalFiles = session->getLocalFiles();
ChunkLockStore* chunkLockStore = app->getChunkLockStore();
bool chunkLocked = false; // true while this request holds the chunk lock (released below)
// select the right targetID
uint16_t targetID = getTargetID();
if(isMirrorSession)
{ // given targetID refers to a buddy mirror group
MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();
targetID = isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) ?
mirrorBuddies->getSecondaryTargetID(targetID) :
mirrorBuddies->getPrimaryTargetID(targetID);
// note: only log message here, error handling will happen below through invalid targetFD
if(unlikely(!targetID) )
LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
StringTk::uintToStr(getTargetID() ) );
}
auto* const target = app->getStorageTargets()->getTarget(targetID);
if (!target)
{
// note: both returns below report "no success", so the caller sends no response and the
// error code in the second pair element is not evaluated by processIncoming()
if (isMirrorSession)
{ /* buddy mirrored file => fail with Err_COMMUNICATION to make the requestor retry.
mgmt will mark this target as (p)offline in a few moments. */
LOG(GENERAL, NOTICE, "Unknown target ID, refusing request.", targetID);
return {false, FhgfsOpsErr_COMMUNICATION};
}
LOG(GENERAL, ERR, "Unknown target ID.", targetID);
return {false, FhgfsOpsErr_UNKNOWNTARGET};
}
// check if we already have session for this file...
auto sessionLocalFile = sessionLocalFiles->referenceSession(
fileHandleID, targetID, isMirrorSession);
if(!sessionLocalFile)
{ // sessionLocalFile not exists yet => create, insert, re-get it
if(doSessionCheck() )
{ // server crashed during the write, maybe lost some data send error to client
LogContext log(logContext);
log.log(Log_WARNING, "Potential cache loss for open file handle. (Server crash detected.) "
"No session for file available. "
"FileHandleID: " + fileHandleID);
serverCrashed = true;
}
std::string fileID = SessionTk::fileIDFromHandleID(fileHandleID);
int openFlags = SessionTk::sysOpenFlagsFromFhgfsAccessFlags(getAccessFlags() );
// the crash flag is handed to the new session (read back later via isServerCrashed() )
auto newFile = boost::make_unique<SessionLocalFile>(fileHandleID, targetID, fileID, openFlags,
serverCrashed);
if(isMirrorSession)
newFile->setIsMirrorSession(true);
sessionLocalFile = sessionLocalFiles->addAndReferenceSession(std::move(newFile));
}
else
{ // session file exists
if(doSessionCheck() && sessionLocalFile->isServerCrashed() )
{ // server crashed during the write, maybe lost some data send error to client
LogContext log(logContext);
log.log(Log_SPAM, "Potential cache loss for open file handle. (Server crash detected.)"
"The session is marked as dirty. "
"FileHandleID: " + fileHandleID);
serverCrashed = true;
}
}
// check if the size quota is exceeded for the user or group
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA) &&
app->getConfig()->getQuotaEnableEnforcement() )
{
quotaExceeded = app->getExceededQuotaStores()->get(targetID)->isQuotaExceeded(getUserID(),
getGroupID(), QuotaLimitType_SIZE);
if(quotaExceeded != QuotaExceededErrorType_NOT_EXCEEDED)
{
LogContext(logContext).log(Log_NOTICE,
QuotaData::QuotaExceededErrorTypeToString(quotaExceeded) + " "
"UID: " + StringTk::uintToStr(this->getUserID()) + "; "
"GID: " + StringTk::uintToStr(this->getGroupID() ) );
// receive the message content before return with error
incrementalRecvPadding(ctx, getCount(), sessionLocalFile.get());
writeClientRes = -(int64_t) FhgfsOpsErr_DQUOT;
goto cleanup;
}
}
try
{
if(isMirrorSession && target->getBuddyResyncInProgress())
{
// mirrored chunk should be modified, check if resync is in progress and lock chunk
std::string chunkID = sessionLocalFile->getFileID();
chunkLockStore->lockChunk(targetID, chunkID);
chunkLocked = true;
}
// prepare file descriptor (if file not open yet then create/open it)
FhgfsOpsErr openRes = openFile(*target, sessionLocalFile.get());
if(unlikely(openRes != FhgfsOpsErr_SUCCESS) )
{
incrementalRecvPadding(ctx, getCount(), sessionLocalFile.get());
writeClientRes = -(int64_t)openRes;
goto cleanup;
}
// store mirror node reference in session and init mirrorToSock member
FhgfsOpsErr prepMirrorRes = prepareMirroring(ctx.getBuffer(), ctx.getBufferLength(),
sessionLocalFile.get(), *target);
if(unlikely(prepMirrorRes != FhgfsOpsErr_SUCCESS) )
{ // mirroring failed
incrementalRecvPadding(ctx, getCount(), sessionLocalFile.get());
writeClientRes = -(int64_t)prepMirrorRes;
goto cleanup;
}
// the actual write workhorse
int64_t writeLocalRes = incrementalRecvAndWriteStateful(ctx, sessionLocalFile.get());
// update client result, offset etc.
int64_t newOffset;
if(unlikely(writeLocalRes < 0) )
newOffset = -1; // writing failed
else
{ // writing succeeded
newOffset = getOffset() + writeLocalRes;
ctx.getStats()->incVals.diskWriteBytes += writeLocalRes; // update stats
}
sessionLocalFile->setOffset(newOffset);
writeClientRes = writeLocalRes;
}
catch(SocketException& e)
{
// the exception path does its own cleanup (mirror socket, chunk lock) and reports
// "no success", so no response is sent to the client
LogContext(logContext).logErr(std::string("SocketException occurred: ") + e.what() );
LogContext(logContext).log(Log_WARNING, std::string("Details: ") +
"sessionID: " + getClientNumID().str() + "; "
"fileHandle: " + std::string(sessionLocalFile->getFileHandleID() ) + "; "
"offset: " + StringTk::int64ToStr(getOffset() ) + "; "
"count: " + StringTk::int64ToStr(getCount() ) );
sessionLocalFile->setOffset(-1); // invalidate offset
finishMirroring(sessionLocalFile.get(), *target);
if (chunkLocked)
{
std::string chunkID = sessionLocalFile->getFileID();
chunkLockStore->unlockChunk(targetID, chunkID);
}
return {false, -1};
}
// common exit for the regular path and for all error paths that jumped here via goto
cleanup:
finishMirroringRes = finishMirroring(sessionLocalFile.get(), *target);
// check mirroring result (don't overwrite local error code, if any)
// note: a local result of exactly 0 also skips the mirroring result check
if(likely(writeClientRes > 0) )
{ // no local error => check mirroring result
if(unlikely(finishMirroringRes != FhgfsOpsErr_SUCCESS) )
writeClientRes = -finishMirroringRes; // mirroring failed => use err code as client result
}
if (chunkLocked)
{
std::string chunkID = sessionLocalFile->getFileID();
chunkLockStore->unlockChunk(targetID, chunkID);
}
// a detected server crash overrides any other result (including a successful write)
if (serverCrashed)
writeClientRes = -(int64_t) FhgfsOpsErr_STORAGE_SRV_CRASHED;
return {true, writeClientRes};
}
/**
 * Receive the next piece of pending data from the context's socket into the context's buffer.
 *
 * @param toBeReceived number of bytes that are still outstanding; a single call receives at
 *    most one buffer length of them.
 * @return the result of the socket receive call
 */
ssize_t WriteLocalFileMsgSender::recvPadding(ResponseContext& ctx, int64_t toBeReceived)
{
   Config* cfg = Program::getApp()->getConfig();

   // never ask for more than fits into the buffer
   const auto pieceLen = BEEGFS_MIN(toBeReceived, ctx.getBufferLength());
   const auto recvTimeout = cfg->getConnMsgMediumTimeout();

   return ctx.getSocket()->recvT(ctx.getBuffer(), pieceLen, 0, recvTimeout);
}
#ifdef BEEGFS_NVFS
/**
 * RDMA flavour of recvPadding: fetch the next entry from the message's RdmaInfo and read from it
 * into the context's buffer.
 *
 * @param toBeReceived number of bytes that are still outstanding; a single call reads at most
 *    one buffer length of them and not more than what is left in the current entry.
 * @return -1 if the RdmaInfo has no further entry, otherwise the result of the socket read call
 */
ssize_t WriteLocalFileRDMAMsgSender::recvPadding(ResponseContext& ctx, int64_t toBeReceived)
{
   uint64_t entryBuf;
   size_t entryLen;
   uint64_t entryOff;

   RdmaInfo* rdma = getRdmaInfo();

   if (!rdma->next(entryBuf, entryLen, entryOff))
      return -1;

   // limit by buffer size and outstanding bytes first, then by what is left in this entry
   ssize_t readLen = BEEGFS_MIN(ctx.getBufferLength(), toBeReceived);
   readLen = BEEGFS_MIN(readLen, (ssize_t)(entryLen - entryOff));

   return ctx.getSocket()->read(ctx.getBuffer(), readLen, 0, entryBuf+entryOff, rdma->key);
}
#endif /* BEEGFS_NVFS */
/**
 * Receive the file contents piece by piece, forward each piece to the mirror (if mirroring is
 * active) and write it to the chunk file.
 *
 * Note: New offset is saved in the session by the caller afterwards (to make life easier).
 *
 * @param ctx provides the socket, the receive buffer and its length.
 * @param sessionLocalFile the file session with the (already opened) file descriptor.
 * @return number of written bytes or negative fhgfs error code. If only a part of a received
 *    piece could be written, the number of bytes written up to that point is returned. A
 *    non-zero result of writeStateNext() is returned as-is.
 */
template <class Msg, typename WriteState>
int64_t WriteLocalFileMsgExBase<Msg, WriteState>::incrementalRecvAndWriteStateful(NetMessage::ResponseContext& ctx,
SessionLocalFile* sessionLocalFile)
{
std::string logContext = Msg::logContextPref + " (write incremental)";
Config* cfg = Program::getApp()->getConfig();
// we can securely cast getTuneFileWriteSize to size_t below to make a comparison possible, as
// it can technically never be negative and will therefore always fit into size_t
// (for direct I/O sessions the full buffer length is used instead)
const ssize_t exactStaticRecvSize = sessionLocalFile->getIsDirectIO()
? ctx.getBufferLength()
: BEEGFS_MIN(ctx.getBufferLength(), (size_t)cfg->getTuneFileWriteSize() );
auto& fd = sessionLocalFile->getFD();
int64_t oldOffset = sessionLocalFile->getOffset();
int64_t newOffset = getOffset();
bool useSyncRange = false; // true if sync_file_range should be called
// the write counter only keeps growing while requests continue exactly at the offset that
// was stored in the session; anything else restarts the count
if( (oldOffset < 0) || (oldOffset != newOffset) )
sessionLocalFile->resetWriteCounter(); // reset sequential write counter
else
{ // continue at previous offset => increase sequential write counter
LOG_DEBUG(logContext, Log_SPAM, "Offset: " + StringTk::int64ToStr(getOffset() ) );
sessionLocalFile->incWriteCounter(getCount() );
// a sync size of 0 (configured or because I/O is disabled for this msg) turns syncing off
ssize_t syncSize = unlikely(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) ) ?
0 : cfg->getTuneFileWriteSyncSize();
if (syncSize && (sessionLocalFile->getWriteCounter() >= syncSize) )
useSyncRange = true;
}
// incrementally receive file contents...
WriteState writeState(logContext.c_str(), exactStaticRecvSize,
getCount(), getOffset(), sessionLocalFile);
if (!writeStateInit(writeState))
return -FhgfsOpsErr_COMMUNICATION;
do
{
// receive some bytes...
LOG_DEBUG(logContext, Log_SPAM,
"receiving... (remaining: " + StringTk::intToStr(writeState.toBeReceived) + ")");
ssize_t recvRes = writeStateRecvData(ctx, writeState);
if (recvRes < 0)
{
LogContext(logContext).log(Log_WARNING, "Socket data transfer error occurred. ");
return -FhgfsOpsErr_COMMUNICATION;
}
// forward to mirror...
FhgfsOpsErr mirrorRes = sendToMirror(ctx.getBuffer(), recvRes,
writeState.writeOffset, writeState.toBeReceived, sessionLocalFile);
if(unlikely(mirrorRes != FhgfsOpsErr_SUCCESS) )
{ // mirroring failed
// (drain the remaining payload before reporting the error)
incrementalRecvPadding(ctx, writeState.toBeReceived, sessionLocalFile);
return -FhgfsOpsErr_COMMUNICATION;
}
// write to underlying file system...
// (with the DISABLE_IO flag the data is not written and counts as completely written)
int errCode = 0;
ssize_t writeRes = unlikely(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) )
? recvRes
: doWrite(*fd, ctx.getBuffer(), recvRes, writeState.writeOffset, errCode);
writeState.toBeReceived -= recvRes;
// handle write errors...
if(unlikely(writeRes != recvRes) )
{ // didn't write all of the received data
if(writeRes == -1)
{ // write error occurred
LogContext(logContext).log(Log_WARNING, "Write error occurred. "
"FileHandleID: " + sessionLocalFile->getFileHandleID() + "."
"Target: " + StringTk::uintToStr(sessionLocalFile->getTargetID() ) + ". "
"File: " + sessionLocalFile->getFileID() + ". "
"SysErr: " + System::getErrString(errCode) );
LogContext(logContext).log(Log_NOTICE, std::string("Additional info: "
"FD: ") + StringTk::intToStr(*fd) + " " +
"OpenFlags: " + StringTk::intToStr(sessionLocalFile->getOpenFlags() ) + " " +
"received: " + StringTk::intToStr(recvRes) + ".");
incrementalRecvPadding(ctx, writeState.toBeReceived, sessionLocalFile);
return -FhgfsOpsErrTk::fromSysErr(errCode);
}
else
{ // wrote only a part of the data, not all of it
LogContext(logContext).log(Log_WARNING,
"Unable to write all of the received data. "
"target: " + StringTk::uintToStr(sessionLocalFile->getTargetID() ) + "; "
"file: " + sessionLocalFile->getFileID() + "; "
"sysErr: " + System::getErrString(errCode) );
incrementalRecvPadding(ctx, writeState.toBeReceived, sessionLocalFile);
// return bytes received so far minus num bytes that were not written with last write
return (getCount() - writeState.toBeReceived) - (recvRes - writeRes);
}
}
writeState.writeOffset += writeRes;
// let the protocol implementation advance its state; non-zero ends the transfer
recvRes = writeStateNext(writeState, writeRes);
if (recvRes != 0)
return recvRes;
} while(writeState.toBeReceived);
LOG_DEBUG(logContext, Log_SPAM,
std::string("Received and wrote all the data") );
// commit to storage device queue...
if (useSyncRange)
{
// advise kernel to commit written data to storage device in max_sectors_kb chunks.
/* note: this is async if there are free slots in the request queue
/sys/block/<...>/nr_requests. (optimal_io_size is not honoured as of linux-3.4) */
// the synced range ends at the end of this request and spans the counted bytes
off64_t syncSize = sessionLocalFile->getWriteCounter();
off64_t syncOffset = getOffset() + getCount() - syncSize;
MsgHelperIO::syncFileRange(*fd, syncOffset, syncSize);
sessionLocalFile->resetWriteCounter();
}
return getCount();
}
/**
 * Write until everything was written (handle short-writes) or an error occurred.
 *
 * @param fd file descriptor to write to.
 * @param buf the data to be written.
 * @param count number of bytes in buf.
 * @param offset file offset of the first byte.
 * @param outErrno set to errno if a pwrite call failed; left untouched otherwise.
 * @return count if everything was written; the number of bytes written so far if a later
 *    pwrite call failed or made no progress; -1 if the very first pwrite call failed.
 */
template <class Msg, typename WriteState>
ssize_t WriteLocalFileMsgExBase<Msg, WriteState>::doWrite(int fd, char* buf, size_t count, off_t offset, int& outErrno)
{
   size_t written = 0;

   do
   {
      const ssize_t writeRes =
         MsgHelperIO::pwrite(fd, buf + written, count - written, offset + written);

      if (unlikely(writeRes == -1) )
      {
         outErrno = errno;

         // report the partial progress if there is any, otherwise the error itself
         // (the error value is returned directly instead of being stored in an unsigned sum)
         return (written > 0) ? static_cast<ssize_t>(written) : -1;
      }

      written += writeRes;

      // a write that makes no progress would otherwise repeat forever; report the short count
      if (unlikely(writeRes == 0) )
         break;

   } while (written != count);

   return static_cast<ssize_t>(written);
}
/**
 * Receive and discard data.
 *
 * The received pieces are still forwarded to the mirror (if mirroring is active), but a failure
 * to do so is deliberately ignored: this method is only used on error paths.
 *
 * @param padLen number of bytes that are still pending from the client; nothing is done for
 *    values that are not positive.
 * @param sessionLocalFile file session, passed through to sendToMirror().
 */
template <class Msg, typename WriteState>
void WriteLocalFileMsgExBase<Msg, WriteState>::incrementalRecvPadding(NetMessage::ResponseContext& ctx,
   int64_t padLen, SessionLocalFile* sessionLocalFile)
{
   if (padLen <= 0)
      return; // nothing pending (and a negative length must not wrap around below)

   uint64_t toBeReceived = padLen;

   while(toBeReceived)
   {
      ssize_t recvRes = recvPadding(ctx, toBeReceived);

      /* stop on any error (the regular receive loop also treats every negative value as an
         error) and on a receive without progress, which would otherwise loop forever */
      if (recvRes <= 0)
         break;

      // forward to mirror...
      /* ... but if we are in this method, then something went wrong anyways, so don't set
         needs-resync here or report any error to caller. */
      (void)sendToMirror(ctx.getBuffer(), recvRes,
         getOffset() + padLen - toBeReceived, toBeReceived, sessionLocalFile);

      toBeReceived -= recvRes;
   }
}
/**
 * Open the file if a filedescriptor is not already set in sessionLocalFile.
 * If the file needs to be opened, this method will check the target consistency state before
 * opening.
 *
 * @param target the storage target that holds the chunk file.
 * @param sessionLocalFile the file session that will own the opened descriptor.
 * @return FhgfsOpsErr_SUCCESS if the file was already open, otherwise the result of the open
 *    call. FhgfsOpsErr_COMMUNICATION is returned if this is a request for a buddymirrored chunk
 *    that was not flagged as going to the secondary and the target consistency is not good.
 */
template <class Msg, typename WriteState>
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::openFile(const StorageTarget& target,
   SessionLocalFile* sessionLocalFile)
{
   // log context names the open step (matches the context used by the read handler's openFile)
   std::string logContext = Msg::logContextPref + " (open)";

   bool useQuota = isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA);
   bool enforceQuota = Program::getApp()->getConfig()->getQuotaEnableEnforcement();

   bool isBuddyMirrorChunk = sessionLocalFile->getIsMirrorSession();

   if (sessionLocalFile->getFD().valid())
      return FhgfsOpsErr_SUCCESS; // file already open => nothing to be done here

   // file not open yet => get targetFD and check consistency state

   const auto consistencyState = target.getConsistencyState();
   const int targetFD = isBuddyMirrorChunk ? *target.getMirrorFD() : *target.getChunkFD();

   if(unlikely(consistencyState != TargetConsistencyState_GOOD) &&
      isBuddyMirrorChunk &&
      !isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) )
   { // this is a request for a buddymirrored chunk on a non-good primary
      LogContext(logContext).log(Log_NOTICE, "Refusing request. Target consistency is not good. "
         "targetID: " + StringTk::uintToStr(target.getID()));

      return FhgfsOpsErr_COMMUNICATION;
   }

   // quota settings and owner IDs are handed to the session's open call
   SessionQuotaInfo quotaInfo(useQuota, enforceQuota, getUserID(), getGroupID() );

   FhgfsOpsErr openChunkRes = sessionLocalFile->openFile(targetFD, getPathInfo(), true, &quotaInfo);

   return openChunkRes;
}
/**
 * Prepares mirroring by storing mirrorNode reference in file session and setting the mirrorToSock
 * member variable.
 *
 * Nothing is done (and success is returned) if the message does not carry the
 * WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_FORWARD flag.
 *
 * Note: Mirror node reference needs to be released on file session close.
 *
 * @param buf used to send initial write msg header to mirror.
 * @param bufLen length of buf.
 * @param sessionLocalFile file session in which the mirror node reference is stored.
 * @param target local target; gets marked as "buddy needs resync" if the secondary is offline.
 * @return FhgfsOpsErr_SUCCESS if forwarding is not requested, if the secondary is offline (local
 *    processing continues without mirror) or if the write msg was sent to the mirror;
 *    FhgfsOpsErr_UNKNOWNTARGET if no secondary target is known for the buddy group;
 *    FhgfsOpsErr_COMMUNICATION if the secondary state is unavailable or unclear or if
 *    communication with mirror failed; or the error from referencing the mirror node.
 */
template <class Msg, typename WriteState>
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::prepareMirroring(char* buf, size_t bufLen,
SessionLocalFile* sessionLocalFile, StorageTarget& target)
{
std::string logContext = Msg::logContextPref + " (prepare mirroring)";
// check if mirroring is enabled
if(!isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_FORWARD) )
return FhgfsOpsErr_SUCCESS;
App* app = Program::getApp();
MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();
TargetStateStore* targetStates = app->getTargetStateStore();
// check if secondary is offline or in unclear state
// (the target ID of the message is used as the buddy group ID here)
uint16_t secondaryTargetID = mirrorBuddies->getSecondaryTargetID(getTargetID() );
if(unlikely(!secondaryTargetID) )
{
LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
StringTk::uintToStr(getTargetID() ) );
return FhgfsOpsErr_UNKNOWNTARGET;
}
CombinedTargetState secondaryState;
bool getSecondaryStateRes = targetStates->getState(secondaryTargetID, secondaryState);
if(unlikely(!getSecondaryStateRes) )
{
LOG_DEBUG(logContext, Log_DEBUG,
"Refusing request. Secondary target has invalid state. "
"targetID: " + StringTk::uintToStr(secondaryTargetID) );
return FhgfsOpsErr_COMMUNICATION;
}
// secondary is not "online and good": offline => go on locally; needs-resync (and not
// offline) => still forward to it (falls through); everything else => caller must retry
if( (secondaryState.reachabilityState != TargetReachabilityState_ONLINE) ||
(secondaryState.consistencyState != TargetConsistencyState_GOOD) )
{
if(secondaryState.reachabilityState == TargetReachabilityState_OFFLINE)
{ // buddy is offline => mark needed resync and continue with local operation
LOG_DEBUG(logContext, Log_DEBUG,
"Secondary is offline and will need resync. "
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
// buddy is marked offline, so local msg processing will be done and buddy needs resync
target.setBuddyNeedsResync(true);
return FhgfsOpsErr_SUCCESS;
}
if(secondaryState.consistencyState != TargetConsistencyState_NEEDS_RESYNC)
{ // unclear buddy state => client must try again
LOG_DEBUG(logContext, Log_DEBUG,
"Unclear secondary state, caller will have to try again later. "
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
return FhgfsOpsErr_COMMUNICATION;
}
}
// store mirror node reference in session...
// (only looked up if the session does not hold a mirror node yet)
NodeHandle mirrorToNode = sessionLocalFile->getMirrorNode();
if(!mirrorToNode)
{
NodeStoreServers* storageNodes = app->getStorageNodes();
TargetMapper* targetMapper = app->getTargetMapper();
FhgfsOpsErr referenceErr;
mirrorToNode = storageNodes->referenceNodeByTargetID(secondaryTargetID, targetMapper,
&referenceErr);
if(unlikely(referenceErr != FhgfsOpsErr_SUCCESS) )
{
LogContext(logContext).logErr(
"Unable to forward to mirror target: " + StringTk::uintToStr(secondaryTargetID) + "; "
"Error: " + boost::lexical_cast<std::string>(referenceErr));
return referenceErr;
}
// continue with whatever node handle the session returns from the store call
mirrorToNode = sessionLocalFile->setMirrorNodeExclusive(mirrorToNode);
}
// send initial write msg header to mirror (retry loop)...
for( ; ; )
{
try
{
// acquire connection to mirror node and send write msg...
mirrorToSock = mirrorToNode->getConnPool()->acquireStreamSocket();
WriteLocalFileMsg mirrorWriteMsg(getClientNumID(), getFileHandleID(), getTargetID(),
getPathInfo(), getAccessFlags(), getOffset(), getCount());
// carry over the relevant flags of the incoming msg
if(doSessionCheck() )
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) )
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_DISABLE_IO);
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA) )
mirrorWriteMsg.setUserdataForQuota(getUserID(), getGroupID() );
// mark the forwarded msg as a buddy mirror write that goes to the secondary
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND);
unsigned msgLength = mirrorWriteMsg.serializeMessage(buf, bufLen).second;
mirrorToSock->send(buf, msgLength, 0);
return FhgfsOpsErr_SUCCESS;
}
catch(SocketConnectException& e)
{
LogContext(logContext).log(Log_CRITICAL, "Unable to connect to mirror node: " +
mirrorToNode->getNodeIDWithTypeStr() + "; "
"Msg: " + e.what() );
}
catch(SocketException& e)
{
LogContext(logContext).log(Log_CRITICAL, "Communication with mirror node failed: " +
mirrorToNode->getNodeIDWithTypeStr() + "; "
"Msg: " + e.what() );
// the socket is unusable now; clearing the member also disables sendToMirror() and
// finishMirroring() for this request
if(mirrorToSock)
mirrorToNode->getConnPool()->invalidateStreamSocket(mirrorToSock);
mirrorToSock = NULL;
}
// error occurred if we got here
if(!mirrorRetriesLeft)
break;
mirrorRetriesLeft--;
// next round will be a retry
LogContext(logContext).log(Log_NOTICE, "Retrying mirror communication: " +
mirrorToNode->getNodeIDWithTypeStr() );
} // end of retry for-loop
// all retries exhausted if we got here
return FhgfsOpsErr_COMMUNICATION;
}
/**
 * Send file contents to mirror.
 *
 * Does nothing (and returns success) if the mirrorToSock member is not set.
 *
 * Note: Supports retries only at beginning of write msg.
 *
 * @param buf the buffer that should be sent to the mirror.
 * @param bufLen number of bytes in buf.
 * @param offset the offset within the chunk file (only used if communication fails and we need to
 * start over with a new WriteMsg to the mirror).
 * @param toBeMirrored total remaining mirror data including given bufLen (only used for retries).
 * @param sessionLocalFile file session that holds the mirror node reference.
 * @return FhgfsOpsErr_COMMUNICATION if mirroring fails.
 */
template <class Msg, typename WriteState>
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::sendToMirror(const char* buf, size_t bufLen,
int64_t offset, int64_t toBeMirrored, SessionLocalFile* sessionLocalFile)
{
std::string logContext = Msg::logContextPref + " (send to mirror)";
// check if mirroring enabled
if(!mirrorToSock)
return FhgfsOpsErr_SUCCESS; // either no mirroring enabled or all retries exhausted
bool isRetryRound = false;
// send raw data (retry loop)...
// (note: if sending fails, retrying requires sending of a new WriteMsg)
for( ; ; )
{
try
{
if(unlikely(isRetryRound) )
{ // retry requires reconnect and resend of write msg with current offset
auto mirrorToNode = sessionLocalFile->getMirrorNode();
mirrorToSock = mirrorToNode->getConnPool()->acquireStreamSocket();
// the new msg covers only what is still outstanding (given offset and toBeMirrored)
WriteLocalFileMsg mirrorWriteMsg(getClientNumID(), getFileHandleID(),
getTargetID(), getPathInfo(), getAccessFlags(), offset, toBeMirrored);
if(doSessionCheck() )
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) )
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_DISABLE_IO);
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA) )
mirrorWriteMsg.setUserdataForQuota(getUserID(), getGroupID() );
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND);
// unlike in prepareMirroring(), the msg gets its own buffer here, because buf holds
// the data that still has to be sent
const auto mirrorBuf = MessagingTk::createMsgVec(mirrorWriteMsg);
mirrorToSock->send(&mirrorBuf[0], mirrorBuf.size(), 0);
}
mirrorToSock->send(buf, bufLen, 0);
return FhgfsOpsErr_SUCCESS;
}
catch(SocketConnectException& e)
{
auto mirrorToNode = sessionLocalFile->getMirrorNode();
LogContext(logContext).log(Log_CRITICAL, "Unable to connect to mirror node: " +
mirrorToNode->getNodeIDWithTypeStr() + "; "
"Msg: " + e.what() );
}
catch(SocketException& e)
{
LogContext(logContext).log(Log_CRITICAL, "Communication with mirror node failed: " +
sessionLocalFile->getMirrorNode()->getNodeIDWithTypeStr() + "; "
"Msg: " + e.what() );
// drop the broken socket; with the member cleared, later calls for this request return
// success right at the top without sending anything
if(mirrorToSock)
sessionLocalFile->getMirrorNode()->getConnPool()->invalidateStreamSocket(mirrorToSock);
mirrorToSock = NULL;
}
// error occurred if we got here
if(!mirrorRetriesLeft)
break;
// only allow retries if we're still at the beginning of the write msg.
/* (this is because later we don't have all the client data available; and without the mirror
response we don't know for sure whether previously sent data was really written or not.) */
if(toBeMirrored != getCount() )
break;
mirrorRetriesLeft--;
// next round will be a retry
LogContext(logContext).log(Log_NOTICE, "Retrying mirror communication: " +
sessionLocalFile->getMirrorNode()->getNodeIDWithTypeStr() );
isRetryRound = true;
} // end of retry for-loop
// all retries exhausted if we got here
return FhgfsOpsErr_COMMUNICATION;
}
/**
 * Receive response from mirror node, check result, clean up (release mirror sock).
 *
 * Does nothing (and returns success) if the mirrorToSock member is not set.
 *
 * Note: Does not do retries on communication errors
 *
 * @param sessionLocalFile file session that holds the mirror node reference.
 * @param target local target; gets marked as "buddy needs resync" if the mirror reports an
 *    unknown target.
 * @return FhgfsOpsErr_SUCCESS if the mirror wrote all of the data or reported an unknown target;
 *    FhgfsOpsErr_NOSPACE if the mirror wrote only a part of the data;
 *    FhgfsOpsErr_COMMUNICATION if no valid response could be received; otherwise the error code
 *    reported by the mirror.
 */
template <class Msg, typename WriteState>
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::finishMirroring(SessionLocalFile* sessionLocalFile,
StorageTarget& target)
{
std::string logContext = Msg::logContextPref + " (finish mirroring)";
// check if mirroring enabled
if(!mirrorToSock)
return FhgfsOpsErr_SUCCESS; // mirroring disabled
App* app = Program::getApp();
auto mirrorToNode = sessionLocalFile->getMirrorNode();
WriteLocalFileRespMsg* writeRespMsg;
int64_t mirrorWriteRes;
// receive write msg response from mirror...
/* note: we don't have the file contents that were sent by the client anymore at this point, so
we cannot do retries here with a new WriteMsg. */
try
{
// receive write msg response...
auto resp = MessagingTk::recvMsgBuf(*mirrorToSock);
if (resp.empty())
{ // error
LogContext(logContext).log(Log_WARNING,
"Failed to receive response from mirror: " + mirrorToSock->getPeername() );
goto cleanup_commerr;
}
// got response => deserialize it...
auto respMsg = app->getNetMessageFactory()->createFromBuf(std::move(resp));
if(unlikely(respMsg->getMsgType() != NETMSGTYPE_WriteLocalFileResp) )
{ // response invalid (wrong msgType)
LogContext(logContext).logErr(
"Received invalid response type: " + StringTk::intToStr(respMsg->getMsgType() ) +"; "
"expected type: " + StringTk::intToStr(NETMSGTYPE_WriteLocalFileResp) + ". "
"Disconnecting: " + mirrorToSock->getPeername() );
goto cleanup_commerr;
}
// check mirror result and release mirror socket...
// NOTE(review): the mirrorToSock member is not reset after the release here or after the
// invalidate below - confirm that this method cannot run twice for one request
mirrorToNode->getConnPool()->releaseStreamSocket(mirrorToSock);
writeRespMsg = (WriteLocalFileRespMsg*)respMsg.get();
// response value: number of bytes written by the mirror or a negative fhgfs error code
mirrorWriteRes = writeRespMsg->getValue();
if(likely(mirrorWriteRes == getCount() ) )
return FhgfsOpsErr_SUCCESS; // mirror successfully wrote all of the data
if(mirrorWriteRes >= 0)
{ // mirror only wrote a part of the data
LogContext(logContext).log(Log_WARNING,
"Mirror did not write all of the data (no space left); "
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) + "; "
"fileHandle: " + sessionLocalFile->getFileHandleID() );
return FhgfsOpsErr_NOSPACE;
}
if(mirrorWriteRes == -FhgfsOpsErr_UNKNOWNTARGET)
{
/* local msg processing shall be done and buddy needs resync
(this is normal when a storage is restarted without a broken secondary target, so we
report success to a client in this case) */
LogContext(logContext).log(Log_DEBUG,
"Secondary reports unknown target error and will need resync. "
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
target.setBuddyNeedsResync(true);
return FhgfsOpsErr_SUCCESS;
}
// a crash reported by the mirror is only logged here; the code is passed on below
if(mirrorWriteRes == -FhgfsOpsErr_STORAGE_SRV_CRASHED)
LogContext(logContext).log(Log_NOTICE, "Potential cache loss for open file handle. "
"(Mirror server crash detected.) "
"FileHandleID: " + sessionLocalFile->getFileHandleID() + "; "
"Mirror: " + mirrorToNode->getNodeIDWithTypeStr() );
// mirror encountered an error
return (FhgfsOpsErr)-mirrorWriteRes; // write response contains negative fhgfs error code
}
catch(SocketException& e)
{
// only logging here; execution continues with the communication error cleanup below
LogContext(logContext).logErr(std::string("SocketException: ") + e.what() );
LogContext(logContext).log(Log_WARNING, "Additional info: "
"mirror node: " + mirrorToNode->getNodeIDWithTypeStr() + "; "
"fileHandle: " + sessionLocalFile->getFileHandleID() );
}
// cleanup after communication error...
// (reached via goto from the try block or by falling out of the catch handler)
cleanup_commerr:
mirrorToNode->getConnPool()->invalidateStreamSocket(mirrorToSock);
return FhgfsOpsErr_COMMUNICATION;
}
/**
 * Tells whether the session check applies to this message.
 *
 * @return false whenever WRITELOCALFILEMSG_FLAG_BUDDYMIRROR is set (mirror sessions never do
 *    the check); otherwise the state of WRITELOCALFILEMSG_FLAG_SESSION_CHECK.
 */
template <class Msg, typename WriteState>
bool WriteLocalFileMsgExBase<Msg, WriteState>::doSessionCheck()
{
   // mirror session => session check is skipped regardless of the session-check flag
   if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR) )
      return false;

   return isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
}

View File

@@ -0,0 +1,213 @@
#pragma once
#include <common/net/message/session/rw/WriteLocalFileMsg.h>
#include <common/net/message/session/rw/WriteLocalFileRespMsg.h>
#include <session/SessionLocalFile.h>
#include <common/storage/StorageErrors.h>
// Initial value of WriteLocalFileMsgExBase::mirrorRetriesLeft (assigned in its constructor).
#define WRITEMSG_MIRROR_RETRIES_NUM 1
// Forward declaration: this header only uses StorageTarget by reference in method signatures.
class StorageTarget;
/**
* Contains common data needed by implementations of the network protocol
* that receive data from the client.
*/
struct WriteStateBase
{
const char* logContext;
ssize_t exactStaticRecvSize;
ssize_t recvLength;
int64_t toBeReceived;
off_t writeOffset;
SessionLocalFile* sessionLocalFile;
WriteStateBase(const char* logContext, ssize_t exactStaticRecvSize,
int64_t toBeReceived, off_t writeOffset, SessionLocalFile* sessionLocalFile)
{
this->logContext = logContext;
this->exactStaticRecvSize = exactStaticRecvSize;
this->toBeReceived = toBeReceived;
this->writeOffset = writeOffset;
this->sessionLocalFile = sessionLocalFile;
recvLength = BEEGFS_MIN(exactStaticRecvSize, toBeReceived);
}
};
template <class Msg, typename WriteState>
class WriteLocalFileMsgExBase : public Msg
{
private:
Socket* mirrorToSock;
unsigned mirrorRetriesLeft;
public:
bool processIncoming(NetMessage::ResponseContext& ctx);
WriteLocalFileMsgExBase() : Msg()
{
mirrorToSock = NULL;
mirrorRetriesLeft = WRITEMSG_MIRROR_RETRIES_NUM;
}
private:
std::pair<bool, int64_t> write(NetMessage::ResponseContext& ctx);
ssize_t doWrite(int fd, char* buf, size_t count, off_t offset, int& outErrno);
FhgfsOpsErr openFile(const StorageTarget& target, SessionLocalFile* sessionLocalFile);
FhgfsOpsErr prepareMirroring(char* buf, size_t bufLen,
SessionLocalFile* sessionLocalFile, StorageTarget& target);
FhgfsOpsErr sendToMirror(const char* buf, size_t bufLen, int64_t offset, int64_t toBeMirrored,
SessionLocalFile* sessionLocalFile);
FhgfsOpsErr finishMirroring(SessionLocalFile* sessionLocalFile, StorageTarget& target);
bool doSessionCheck();
int64_t incrementalRecvAndWriteStateful(NetMessage::ResponseContext& ctx,
SessionLocalFile* sessionLocalFile);
void incrementalRecvPadding(NetMessage::ResponseContext& ctx, int64_t padLen,
SessionLocalFile* sessionLocalFile);
inline ssize_t recvPadding(NetMessage::ResponseContext& ctx, int64_t toBeReceived)
{
return static_cast<Msg&>(*this).recvPadding(ctx, toBeReceived);
}
inline void sendResponse(NetMessage::ResponseContext& ctx, int err)
{
return static_cast<Msg&>(*this).sendResponse(ctx, err);
}
inline bool writeStateInit(WriteState& ws)
{
return static_cast<Msg&>(*this).writeStateInit(ws);
}
inline ssize_t writeStateRecvData(NetMessage::ResponseContext& ctx, WriteState& ws)
{
return static_cast<Msg&>(*this).writeStateRecvData(ctx, ws);
}
inline size_t writeStateNext(WriteState& ws, ssize_t writeRes)
{
return static_cast<Msg&>(*this).writeStateNext(ws, writeRes);
}
public:
inline bool isMsgValid() const
{
return static_cast<const Msg&>(*this).isMsgValid();
}
inline bool isMsgHeaderFeatureFlagSet(unsigned flag) const
{
return static_cast<const Msg&>(*this).isMsgHeaderFeatureFlagSet(flag);
}
inline unsigned getMsgHeaderUserID() const
{
return static_cast<const Msg&>(*this).getMsgHeaderUserID();
}
inline uint16_t getTargetID() const
{
return static_cast<const Msg&>(*this).getTargetID();
}
inline int64_t getOffset() const
{
return static_cast<const Msg&>(*this).getOffset();
}
inline unsigned getUserID() const
{
return static_cast<const Msg&>(*this).getUserID();
}
inline unsigned getGroupID() const
{
return static_cast<const Msg&>(*this).getGroupID();
}
inline int64_t getCount() const
{
return static_cast<const Msg&>(*this).getCount();
}
inline const char* getFileHandleID()
{
return static_cast<Msg&>(*this).getFileHandleID();
}
inline NumNodeID getClientNumID() const
{
return static_cast<const Msg&>(*this).getClientNumID();
}
inline unsigned getAccessFlags() const
{
return static_cast<const Msg&>(*this).getAccessFlags();
}
inline PathInfo* getPathInfo ()
{
return static_cast<Msg&>(*this).getPathInfo();
}
};
/**
 * Provides the recv protocol hooks for WriteLocalFileMsgExBase: the payload is received from
 * the connection socket into the context buffer.
 */
class WriteLocalFileMsgSender : public WriteLocalFileMsg
{
   public:
      /**
       * The recv protocol needs no state beyond WriteStateBase.
       */
      struct WriteState : public WriteStateBase
      {
         WriteState(const char* logContext, ssize_t exactStaticRecvSize,
            int64_t toBeReceived, off_t writeOffset, SessionLocalFile* sessionLocalFile) :
            WriteStateBase(logContext, exactStaticRecvSize, toBeReceived, writeOffset,
               sessionLocalFile)
         {
         }
      };

   private:
      // the base template calls the private hooks below
      friend class WriteLocalFileMsgExBase<WriteLocalFileMsgSender, WriteState>;

      static const std::string logContextPref;

      ssize_t recvPadding(ResponseContext& ctx, int64_t toBeReceived);

      /**
       * Answers the request with a WriteLocalFileRespMsg carrying err.
       */
      void sendResponse(ResponseContext& ctx, int err)
      {
         ctx.sendResponse(WriteLocalFileRespMsg(err) );
      }

      /**
       * Nothing to set up for this protocol.
       *
       * @return always true
       */
      bool writeStateInit(WriteState&)
      {
         return true;
      }

      /**
       * Receives the next piece of data into the context buffer.
       *
       * Updates ws.recvLength to min(ws.exactStaticRecvSize, ws.toBeReceived) and receives
       * exactly that many bytes, using the configured connMsgMediumTimeout.
       *
       * @return result of Socket::recvExactT()
       */
      ssize_t writeStateRecvData(ResponseContext& ctx, WriteState& ws)
      {
         const int recvTimeout =
            PThread::getCurrentThreadApp()->getCommonConfig()->getConnMsgMediumTimeout();

         ws.recvLength = BEEGFS_MIN(ws.exactStaticRecvSize, ws.toBeReceived);

         return ctx.getSocket()->recvExactT(ctx.getBuffer(), ws.recvLength, 0, recvTimeout);
      }

      /**
       * No per-write bookkeeping for this protocol.
       *
       * @return always 0
       */
      size_t writeStateNext(WriteState&, ssize_t)
      {
         return 0;
      }
};
// The write message handler for the recv protocol.
using WriteLocalFileMsgEx =
   WriteLocalFileMsgExBase<WriteLocalFileMsgSender, WriteLocalFileMsgSender::WriteState>;

View File

@@ -0,0 +1,94 @@
#pragma once
#ifdef BEEGFS_NVFS
#include <common/net/message/session/rw/WriteLocalFileRDMAMsg.h>
#include <common/net/message/session/rw/WriteLocalFileRDMARespMsg.h>
#include <common/components/worker/Worker.h>
#include <session/SessionLocalFile.h>
#include <common/storage/StorageErrors.h>
#include "WriteLocalFileMsgEx.h"
/**
 * Provides the RDMA read protocol hooks for WriteLocalFileMsgExBase: the payload is fetched
 * via Socket::read() using the RDMA buffer description returned by getRdmaInfo().
 */
class WriteLocalFileRDMAMsgSender : public WriteLocalFileRDMAMsg
{
   public:
      /**
       * WriteStateBase plus the position within the RDMA buffers.
       *
       * Note: the constructor leaves rdma, rBuf, rLen and rOff unset; they get their values
       * in writeStateInit().
       */
      struct WriteState : public WriteStateBase
      {
         RdmaInfo* rdma; // assigned from getRdmaInfo() in writeStateInit()
         uint64_t rBuf; // rBuf, rLen, rOff are filled in by rdma->next()
         size_t rLen;
         uint64_t rOff;
         int64_t recvSize; // toBeReceived as given to the constructor

         WriteState(const char* logContext, ssize_t exactStaticRecvSize,
            int64_t toBeReceived, off_t writeOffset, SessionLocalFile* sessionLocalFile) :
            WriteStateBase(logContext, exactStaticRecvSize, toBeReceived, writeOffset,
               sessionLocalFile),
            recvSize(toBeReceived)
         {
         }
      };

   private:
      // the base template calls the private hooks below
      friend class WriteLocalFileMsgExBase<WriteLocalFileRDMAMsgSender, WriteState>;

      static const std::string logContextPref;

      ssize_t recvPadding(ResponseContext& ctx, int64_t toBeReceived);

      /**
       * Answers the request with a WriteLocalFileRDMARespMsg carrying err.
       */
      void sendResponse(ResponseContext& ctx, int err)
      {
         ctx.sendResponse(WriteLocalFileRDMARespMsg(err) );
      }

      /**
       * Attaches the message's RDMA info to ws and fetches the first buffer entry.
       *
       * @return false (after logging an error) if rdma->next() yields no entry
       */
      bool writeStateInit(WriteState& ws)
      {
         ws.rdma = getRdmaInfo();

         const bool haveEntry = ws.rdma->next(ws.rBuf, ws.rLen, ws.rOff);
         if (unlikely(!haveEntry))
            LogContext(ws.logContext).logErr("No entities in RDMA buffers.");

         return haveEntry;
      }

      /**
       * Reads the next piece of data into the context buffer.
       *
       * ws.recvLength becomes the smallest of ws.exactStaticRecvSize, ws.toBeReceived, the
       * remainder of the current RDMA buffer entry (rLen - rOff) and WORKER_BUFIN_SIZE.
       *
       * @return result of Socket::read()
       */
      ssize_t writeStateRecvData(ResponseContext& ctx, WriteState& ws)
      {
         const ssize_t leftInEntry = static_cast<ssize_t>(ws.rLen - ws.rOff);
         const auto wanted = BEEGFS_MIN(ws.exactStaticRecvSize, ws.toBeReceived);
         const auto fitsEntry = BEEGFS_MIN(wanted, leftInEntry);

         // Cannot RDMA anything larger than WORKER_BUFIN_SIZE in a single operation
         // because that is the size of the buffer passed in by the Worker.
         // TODO: pass around a Buffer with a length instead of unqualified char*.
         ws.recvLength = BEEGFS_MIN(fitsEntry, WORKER_BUFIN_SIZE);

         return ctx.getSocket()->read(ctx.getBuffer(), ws.recvLength, 0, ws.rBuf + ws.rOff,
            ws.rdma->key);
      }

      /**
       * Advances the offset within the current RDMA buffer entry by writeRes and moves on to
       * the next entry once the current one is used up while data is still outstanding.
       *
       * @return 0 normally; recvSize - toBeReceived (after logging an error) if there is no
       *    further RDMA buffer entry although data is still outstanding
       */
      size_t writeStateNext(WriteState& ws, ssize_t writeRes)
      {
         ws.rOff += writeRes;

         // all data received or current entry not used up yet => nothing to do
         if (ws.toBeReceived <= 0 || ws.rOff != ws.rLen)
            return 0;

         const bool haveEntry = ws.rdma->next(ws.rBuf, ws.rLen, ws.rOff);
         if (unlikely(!haveEntry))
         {
            LogContext(ws.logContext).logErr(
               "RDMA buffers expended but not all data received. toBeReceived=" +
               StringTk::uint64ToStr(ws.toBeReceived) + "; "
               "target: " + StringTk::uintToStr(ws.sessionLocalFile->getTargetID() ) + "; "
               "file: " + ws.sessionLocalFile->getFileID() + "; ");

            return ws.recvSize - ws.toBeReceived;
         }

         return 0;
      }
};
// The write message handler for the RDMA read protocol.
using WriteLocalFileRDMAMsgEx =
   WriteLocalFileMsgExBase<WriteLocalFileRDMAMsgSender, WriteLocalFileRDMAMsgSender::WriteState>;
#endif /* BEEGFS_NVFS */