New upstream version 8.1.0
This commit is contained in:
94
storage/source/net/message/session/FSyncLocalFileMsgEx.cpp
Normal file
94
storage/source/net/message/session/FSyncLocalFileMsgEx.cpp
Normal file
@@ -0,0 +1,94 @@
|
||||
#include <program/Program.h>
|
||||
#include <common/net/message/session/FSyncLocalFileRespMsg.h>
|
||||
#include <common/storage/StorageErrors.h>
|
||||
#include <net/msghelpers/MsgHelperIO.h>
|
||||
#include "FSyncLocalFileMsgEx.h"
|
||||
|
||||
|
||||
/**
 * Handles an incoming fsync request: runs the local fsync logic and sends the resulting error
 * code back to the requestor in a FSyncLocalFileRespMsg.
 *
 * @return always true.
 */
bool FSyncLocalFileMsgEx::processIncoming(ResponseContext& ctx)
{
   const FhgfsOpsErr fsyncResult = fsync();

   ctx.sendResponse(FSyncLocalFileRespMsg(fsyncResult));

   return true;
}
|
||||
|
||||
/**
 * Syncs the chunk file that belongs to the file handle of this message (unless the NO_SYNC
 * feature flag is set) and evaluates the optional session check.
 *
 * @return FhgfsOpsErr_SUCCESS if nothing failed; FhgfsOpsErr_INTERNAL if the fsync call failed;
 *    FhgfsOpsErr_STORAGE_SRV_CRASHED if the session check is enabled and either no local file
 *    session exists for the file handle or the existing one is marked as server-crashed (the
 *    crash result is assigned last, so it overrides FhgfsOpsErr_INTERNAL).
 */
FhgfsOpsErr FSyncLocalFileMsgEx::fsync()
{
   const char* logContext = "FSyncLocalFileMsg incoming";

   FhgfsOpsErr clientRes = FhgfsOpsErr_SUCCESS;
   bool isMirrorSession = isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_BUDDYMIRROR);

   // do session check only when it is not a mirror session
   bool useSessionCheck = isMirrorSession ? false :
      isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_SESSION_CHECK);

   App* app = Program::getApp();
   SessionStore* sessions = app->getSessions();
   auto session = sessions->referenceOrAddSession(getSessionID());
   SessionLocalFileStore* sessionLocalFiles = session->getLocalFiles();

   // select the right targetID

   uint16_t targetID = getTargetID();

   if(isMirrorSession)
   { // given targetID refers to a buddy mirror group
      MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();

      targetID = isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) ?
         mirrorBuddies->getSecondaryTargetID(targetID) :
         mirrorBuddies->getPrimaryTargetID(targetID);

      /* note: only a message is logged here. There is no dedicated error path for an unresolved
         group in this method; processing continues with the resulting targetID. */
      if(unlikely(!targetID) )
         LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
            StringTk::uintToStr(getTargetID() ) );
   }

   auto sessionLocalFile =
      sessionLocalFiles->referenceSession(getFileHandleID(), targetID, isMirrorSession);

   if(sessionLocalFile)
   { // sessionLocalFile exists => check if open and perform fsync
      if (!isMsgHeaderFeatureFlagSet(FSYNCLOCALFILEMSG_FLAG_NO_SYNC) )
      {
         auto& fd = sessionLocalFile->getFD();

         if (fd.valid())
         { // file open => sync
            int fsyncRes = MsgHelperIO::fsync(*fd);

            if(fsyncRes)
            {
               /* fetch the error text right away: System::getErrString() takes no argument, so it
                  presumably reports the current errno, which later calls (logger construction,
                  string allocations) are allowed to modify. */
               const std::string sysErrStr = System::getErrString();

               LogContext log(logContext);
               log.log(Log_WARNING, std::string("fsync of chunk file failed. ") +
                  std::string("SessionID: ") + getSessionID().str() +
                  std::string(". SysErr: ") + sysErrStr);

               clientRes = FhgfsOpsErr_INTERNAL;
            }
         }
      }

      if(useSessionCheck && sessionLocalFile->isServerCrashed() )
      { // server crashed during the write, maybe lost some data send error to client
         LogContext log(logContext);
         log.log(Log_SPAM, "Potential cache loss for open file handle. (Server crash detected.) "
            "The session is marked as dirty.");

         clientRes = FhgfsOpsErr_STORAGE_SRV_CRASHED;
      }
   }
   else
   if (useSessionCheck)
   { // the server crashed during a write or before the close was successful
      LogContext log(logContext);
      log.log(Log_WARNING, "Potential cache loss for open file handle. (Server crash detected.) "
         "No session for file available. "
         "FileHandleID: " + std::string(getFileHandleID()) );

      clientRes = FhgfsOpsErr_STORAGE_SRV_CRASHED;
   }

   return clientRes;
}
|
||||
13
storage/source/net/message/session/FSyncLocalFileMsgEx.h
Normal file
13
storage/source/net/message/session/FSyncLocalFileMsgEx.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include <common/net/message/session/FSyncLocalFileMsg.h>
|
||||
|
||||
class FSyncLocalFileMsgEx : public FSyncLocalFileMsg
|
||||
{
|
||||
public:
|
||||
virtual bool processIncoming(ResponseContext& ctx);
|
||||
|
||||
private:
|
||||
FhgfsOpsErr fsync();
|
||||
};
|
||||
|
||||
@@ -0,0 +1,252 @@
|
||||
#include <common/net/message/control/GenericResponseMsg.h>
|
||||
#include <common/net/message/session/opening/CloseChunkFileRespMsg.h>
|
||||
#include <common/toolkit/SessionTk.h>
|
||||
#include <net/msghelpers/MsgHelperIO.h>
|
||||
#include <program/Program.h>
|
||||
#include <toolkit/StorageTkEx.h>
|
||||
#include "CloseChunkFileMsgEx.h"
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
/**
 * Closes the chunk file referenced by this message, sends the response (unless close() has sent
 * a GenericResponseMsg already) and updates the per-node operation counters.
 *
 * @return always true.
 */
bool CloseChunkFileMsgEx::processIncoming(ResponseContext& ctx)
{
   App* app = Program::getApp();

   const std::pair<FhgfsOpsErr, DynamicAttribs> closeOutcome = close(ctx);
   const FhgfsOpsErr closeMsgRes = closeOutcome.first;
   const DynamicAttribs& attribs = closeOutcome.second;

   // FhgfsOpsErr_COMMUNICATION means that a GenericResponseMsg has been sent already
   const bool responseAlreadySent = (closeMsgRes == FhgfsOpsErr_COMMUNICATION);

   if (!responseAlreadySent)
   {
      ctx.sendResponse(
         CloseChunkFileRespMsg(closeMsgRes, attribs.filesize, attribs.allocedBlocks,
            attribs.modificationTimeSecs, attribs.lastAccessTimeSecs,
            attribs.storageVersion) );
   }

   // op counters are updated for every request, no matter which result was sent
   app->getNodeOpStats()->updateNodeOp(ctx.getSocket()->getPeerIP(), StorageOpCounter_CLOSELOCAL,
      getMsgHeaderUserID() );

   return true;
}
|
||||
|
||||
/**
 * Resolves the target, forwards the request to the secondary (if appropriate), removes the
 * local file session and collects the dynamic attributes of the chunk file.
 *
 * Note: a missing chunk file is not an error; there is just nothing to do in that case.
 *
 * @return pair of the result code for the requestor and the collected dynamic attributes (all
 *    zero if they were not requested or could not be retrieved). A result of
 *    FhgfsOpsErr_COMMUNICATION comes from forwardToSecondary(), which has already sent a
 *    GenericResponseMsg in that case.
 */
std::pair<FhgfsOpsErr, CloseChunkFileMsgEx::DynamicAttribs> CloseChunkFileMsgEx::close(
   ResponseContext& ctx)
{
   const char* logContext = "CloseChunkFileMsg incoming";

   App* app = Program::getApp();
   Config* config = app->getConfig();
   SessionStore* sessions = app->getSessions();

   DynamicAttribs dynAttribs = {0, 0, 0, 0, 0};

   std::string fileHandleID(getFileHandleID() );
   const bool isMirrorSession = isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR);

   // select the right targetID

   uint16_t targetID = getTargetID();

   if (isMirrorSession)
   { // given targetID refers to a buddy mirror group
      MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();

      if (isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND) )
         targetID = mirrorBuddies->getSecondaryTargetID(targetID);
      else
         targetID = mirrorBuddies->getPrimaryTargetID(targetID);

      if (unlikely(!targetID) )
      { // unknown target
         LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
            StringTk::uintToStr(getTargetID() ) );
         return {FhgfsOpsErr_UNKNOWNTARGET, {}};
      }
   }

   // forward to secondary (if appropriate)

   FhgfsOpsErr closeMsgRes = forwardToSecondary(ctx); // the result that will be sent to requestor
   if (unlikely(closeMsgRes != FhgfsOpsErr_SUCCESS) )
      return {closeMsgRes, dynAttribs};

   auto session = sessions->referenceOrAddSession(getSessionID());
   SessionLocalFileStore* sessionLocalFiles = session->getLocalFiles();

   auto fsState = sessionLocalFiles->removeSession(fileHandleID, targetID, isMirrorSession);

   const bool wantDynAttribs =
      !isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_NODYNAMICATTRIBS);

   if (!fsState)
   { // file still in use by other threads => get dynamic attribs by path (if requested)
      if (wantDynAttribs)
         getDynamicAttribsByPath(fileHandleID, targetID, dynAttribs);

      return {closeMsgRes, dynAttribs};
   }

   // file no longer in use => refresh filesize and close file fd

   const bool statBeforeClose = config->getTuneEarlyStat();

   /* early stat: get the dynamic attribs before closing the file.
      Note: Depending on the underlying file system the returned st_blocks might be too large
      (pre-allocated blocks, which are only released on close() ). Advantage here is that we
      already have the file descriptor. */
   if (statBeforeClose && wantDynAttribs)
      getDynamicAttribsByFD(*fsState->getFD(), fileHandleID, targetID, dynAttribs);

   // close fd

   if (!fsState->close() )
      closeMsgRes = FhgfsOpsErr_INTERNAL;

   // late stat: only get the attributes now, in order to make xfs release pre-allocated blocks
   if (!statBeforeClose && wantDynAttribs)
      getDynamicAttribsByPath(fileHandleID, targetID, dynAttribs);

   return {closeMsgRes, dynAttribs};
}
|
||||
|
||||
/**
|
||||
* If this is a buddy mirror msg and we are the primary, forward this msg to secondary.
|
||||
*
|
||||
* @return _COMMUNICATION if forwarding to buddy failed and buddy is not marked offline (in which
|
||||
* case *outChunkLocked==false is guaranteed).
|
||||
* @throw SocketException if sending of GenericResponseMsg fails.
|
||||
*/
|
||||
FhgfsOpsErr CloseChunkFileMsgEx::forwardToSecondary(ResponseContext& ctx)
|
||||
{
|
||||
const char* logContext = "CloseChunkFileMsg incoming (forward to secondary)";
|
||||
|
||||
App* app = Program::getApp();
|
||||
|
||||
if(!isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR) ||
|
||||
isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND) )
|
||||
return FhgfsOpsErr_SUCCESS; // nothing to do
|
||||
|
||||
// instead of creating a new msg object, we just re-use "this" with "buddymirror second" flag
|
||||
addMsgHeaderFeatureFlag(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND);
|
||||
|
||||
RequestResponseArgs rrArgs(NULL, this, NETMSGTYPE_CloseChunkFileResp);
|
||||
RequestResponseTarget rrTarget(getTargetID(), app->getTargetMapper(), app->getStorageNodes(),
|
||||
app->getTargetStateStore(), app->getMirrorBuddyGroupMapper(), true);
|
||||
|
||||
FhgfsOpsErr commRes = MessagingTk::requestResponseTarget(&rrTarget, &rrArgs);
|
||||
|
||||
// remove the flag that we just added for secondary
|
||||
unsetMsgHeaderFeatureFlag(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR_SECOND);
|
||||
|
||||
if(unlikely(
|
||||
(commRes == FhgfsOpsErr_COMMUNICATION) &&
|
||||
(rrTarget.outTargetReachabilityState == TargetReachabilityState_OFFLINE) ) )
|
||||
{
|
||||
LOG_DEBUG(logContext, Log_DEBUG, std::string("Secondary is offline and will need resync. ") +
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );;
|
||||
return FhgfsOpsErr_SUCCESS; // go ahead with local msg processing
|
||||
}
|
||||
|
||||
if(unlikely(commRes != FhgfsOpsErr_SUCCESS) )
|
||||
{
|
||||
LogContext(logContext).log(Log_DEBUG, "Forwarding failed. "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) + "; "
|
||||
"error: " + boost::lexical_cast<std::string>(commRes));
|
||||
|
||||
std::string genericRespStr = "Communication with secondary failed. "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() );
|
||||
|
||||
ctx.sendResponse(
|
||||
GenericResponseMsg(GenericRespMsgCode_INDIRECTCOMMERR, std::move(genericRespStr)));
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
CloseChunkFileRespMsg* respMsg = (CloseChunkFileRespMsg*)rrArgs.outRespMsg.get();
|
||||
FhgfsOpsErr secondaryRes = respMsg->getResult();
|
||||
if(unlikely(secondaryRes != FhgfsOpsErr_SUCCESS) )
|
||||
{
|
||||
LogContext(logContext).log(Log_NOTICE, std::string("Secondary reported error: ") +
|
||||
boost::lexical_cast<std::string>(secondaryRes) + "; "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
|
||||
|
||||
return secondaryRes;
|
||||
}
|
||||
|
||||
|
||||
return FhgfsOpsErr_SUCCESS;
|
||||
}
|
||||
|
||||
bool CloseChunkFileMsgEx::getDynamicAttribsByFD(const int fd, std::string fileHandleID,
|
||||
uint16_t targetID, DynamicAttribs& outDynAttribs)
|
||||
{
|
||||
SyncedStoragePaths* syncedPaths = Program::getApp()->getSyncedStoragePaths();
|
||||
|
||||
std::string fileID(SessionTk::fileIDFromHandleID(fileHandleID) );
|
||||
|
||||
uint64_t storageVersion = syncedPaths->lockPath(fileID, targetID); // LOCK
|
||||
|
||||
// note: this is locked because we need to get the filesize together with the storageVersion
|
||||
bool getDynAttribsRes = StorageTkEx::getDynamicFileAttribs(fd, &outDynAttribs.filesize,
|
||||
&outDynAttribs.allocedBlocks, &outDynAttribs.modificationTimeSecs,
|
||||
&outDynAttribs.lastAccessTimeSecs);
|
||||
|
||||
if(getDynAttribsRes)
|
||||
outDynAttribs.storageVersion = storageVersion;
|
||||
|
||||
syncedPaths->unlockPath(fileID, targetID); // UNLOCK
|
||||
|
||||
return getDynAttribsRes;
|
||||
}
|
||||
|
||||
bool CloseChunkFileMsgEx::getDynamicAttribsByPath(std::string fileHandleID, uint16_t targetID,
|
||||
DynamicAttribs& outDynAttribs)
|
||||
{
|
||||
const char* logContext = "CloseChunkFileMsg (attribs by path)";
|
||||
|
||||
App* app = Program::getApp();
|
||||
SyncedStoragePaths* syncedPaths = app->getSyncedStoragePaths();
|
||||
|
||||
auto* const target = app->getStorageTargets()->getTarget(targetID);
|
||||
if (!target)
|
||||
{ // unknown targetID
|
||||
LogContext(logContext).logErr("Unknown targetID: " + StringTk::uintToStr(targetID) );
|
||||
return false;
|
||||
}
|
||||
|
||||
const int targetFD = isMsgHeaderFeatureFlagSet(CLOSECHUNKFILEMSG_FLAG_BUDDYMIRROR)
|
||||
? *target->getMirrorFD()
|
||||
: *target->getChunkFD();
|
||||
|
||||
std::string fileID = SessionTk::fileIDFromHandleID(fileHandleID);
|
||||
std::string pathStr = StorageTk::getFileChunkPath(getPathInfo(), fileID);
|
||||
|
||||
uint64_t storageVersion = syncedPaths->lockPath(fileID, targetID); // L O C K path
|
||||
|
||||
// note: this is locked because we need to get the filesize together with the storageVersion
|
||||
bool getDynAttribsRes = StorageTkEx::getDynamicFileAttribs(targetFD, pathStr.c_str(),
|
||||
&outDynAttribs.filesize, &outDynAttribs.allocedBlocks, &outDynAttribs.modificationTimeSecs,
|
||||
&outDynAttribs.lastAccessTimeSecs);
|
||||
|
||||
if(getDynAttribsRes)
|
||||
outDynAttribs.storageVersion = storageVersion;
|
||||
|
||||
syncedPaths->unlockPath(fileID, targetID); // U N L O C K path
|
||||
|
||||
return getDynAttribsRes;
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
|
||||
#include <common/net/message/session/opening/CloseChunkFileMsg.h>
|
||||
|
||||
class CloseChunkFileMsgEx : public CloseChunkFileMsg
|
||||
{
|
||||
private:
|
||||
struct DynamicAttribs
|
||||
{
|
||||
int64_t filesize;
|
||||
int64_t allocedBlocks; // allocated 512byte blocks (relevant for sparse files)
|
||||
int64_t modificationTimeSecs;
|
||||
int64_t lastAccessTimeSecs;
|
||||
uint64_t storageVersion;
|
||||
};
|
||||
|
||||
public:
|
||||
virtual bool processIncoming(ResponseContext& ctx);
|
||||
|
||||
private:
|
||||
FhgfsOpsErr forwardToSecondary(ResponseContext& ctx);
|
||||
bool getDynamicAttribsByFD(int fd, std::string fileHandleID, uint16_t targetID,
|
||||
DynamicAttribs& outDynAttribs);
|
||||
bool getDynamicAttribsByPath(std::string fileHandleID, uint16_t targetID,
|
||||
DynamicAttribs& outDynAttribs);
|
||||
|
||||
std::pair<FhgfsOpsErr, DynamicAttribs> close(ResponseContext& ctx);
|
||||
};
|
||||
|
||||
114
storage/source/net/message/session/rw/ReadLocalFileRDMAMsgEx.h
Normal file
114
storage/source/net/message/session/rw/ReadLocalFileRDMAMsgEx.h
Normal file
@@ -0,0 +1,114 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef BEEGFS_NVFS
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <common/net/message/session/rw/ReadLocalFileRDMAMsg.h>
|
||||
#include <common/storage/StorageErrors.h>
|
||||
#include <common/components/worker/Worker.h>
|
||||
#include <session/SessionLocalFileStore.h>
|
||||
#include "ReadLocalFileV2MsgEx.h"
|
||||
|
||||
/**
|
||||
* Implements RDMA write protocol.
|
||||
*/
|
||||
class ReadLocalFileRDMAMsgSender : public ReadLocalFileRDMAMsg
|
||||
{
|
||||
public:
|
||||
struct ReadState : public ReadStateBase
|
||||
{
|
||||
RdmaInfo* rdma;
|
||||
uint64_t rBuf;
|
||||
size_t rLen;
|
||||
uint64_t rOff;
|
||||
|
||||
ReadState(const char* logContext, uint64_t toBeRead,
|
||||
SessionLocalFile* sessionLocalFile) :
|
||||
ReadStateBase(logContext, toBeRead, sessionLocalFile) {}
|
||||
|
||||
};
|
||||
|
||||
private:
|
||||
friend class ReadLocalFileMsgExBase<ReadLocalFileRDMAMsgSender, ReadState>;
|
||||
|
||||
static std::string logContextPref;
|
||||
|
||||
inline void sendLengthInfo(Socket* sock, int64_t lengthInfo)
|
||||
{
|
||||
lengthInfo = HOST_TO_LE_64(lengthInfo);
|
||||
sock->send(&lengthInfo, sizeof(int64_t), 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* RDMA write data to the remote buffer.
|
||||
*/
|
||||
inline ssize_t readStateSendData(Socket* sock, ReadState& rs, char* buf, bool isFinal)
|
||||
{
|
||||
ssize_t writeRes = sock->write(buf, rs.readRes, 0, rs.rBuf + rs.rOff, rs.rdma->key);
|
||||
LOG_DEBUG(rs.logContext, Log_DEBUG,
|
||||
"buf: " + StringTk::uint64ToHexStr((uint64_t)buf) + "; "
|
||||
"bufLen: " + StringTk::int64ToStr(rs.readRes) + "; "
|
||||
"rbuf: " + StringTk::uint64ToHexStr(rs.rBuf) + "; "
|
||||
"rkey: " + StringTk::uintToHexStr(rs.rdma->key) + "; "
|
||||
"writeRes: " + StringTk::int64ToStr(writeRes));
|
||||
|
||||
if (unlikely(writeRes != rs.readRes))
|
||||
{
|
||||
LogContext(rs.logContext).logErr("Unable to write file data to client. "
|
||||
"FileID: " + rs.sessionLocalFile->getFileID() + "; "
|
||||
"SysErr: " + System::getErrString());
|
||||
writeRes = -1;
|
||||
}
|
||||
|
||||
if (isFinal && likely(writeRes >= 0))
|
||||
sendLengthInfo(sock, getCount() - rs.toBeRead);
|
||||
|
||||
return writeRes;
|
||||
}
|
||||
|
||||
inline ssize_t getReadLength(ReadState& rs, ssize_t len)
|
||||
{
|
||||
// Cannot RDMA anything larger than WORKER_BUFOUT_SIZE in a single operation
|
||||
// because that is the size of the buffer passed in by the Worker.
|
||||
// TODO: pass around a Buffer with a length instead of unqualified char*.
|
||||
return BEEGFS_MIN(BEEGFS_MIN(len, ssize_t(rs.rLen - rs.rOff)), WORKER_BUFOUT_SIZE);
|
||||
}
|
||||
|
||||
inline bool readStateInit(ReadState& rs)
|
||||
{
|
||||
rs.rdma = getRdmaInfo();
|
||||
if (unlikely(!rs.rdma->next(rs.rBuf, rs.rLen, rs.rOff)))
|
||||
{
|
||||
LogContext(rs.logContext).logErr("No entities in RDMA buffers.");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool readStateNext(ReadState& rs)
|
||||
{
|
||||
rs.rOff += rs.readRes;
|
||||
if (rs.rOff == rs.rLen)
|
||||
{
|
||||
if (unlikely(!rs.rdma->next(rs.rBuf, rs.rLen, rs.rOff)))
|
||||
{
|
||||
LogContext(rs.logContext).logErr("RDMA buffers exhausted");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline size_t getBuffers(ResponseContext& ctx, char** dataBuf, char** sendBuf)
|
||||
{
|
||||
*dataBuf = ctx.getBuffer();
|
||||
*sendBuf = *dataBuf;
|
||||
return ctx.getBufferLength();
|
||||
}
|
||||
};
|
||||
|
||||
// the complete RDMA read message handler: generic read logic combined with the RDMA sender
using ReadLocalFileRDMAMsgEx =
   ReadLocalFileMsgExBase<ReadLocalFileRDMAMsgSender, ReadLocalFileRDMAMsgSender::ReadState>;
|
||||
|
||||
#endif /* BEEGFS_NVFS */
|
||||
|
||||
466
storage/source/net/message/session/rw/ReadLocalFileV2MsgEx.cpp
Normal file
466
storage/source/net/message/session/rw/ReadLocalFileV2MsgEx.cpp
Normal file
@@ -0,0 +1,466 @@
|
||||
#include <program/Program.h>
|
||||
#include <common/storage/StorageErrors.h>
|
||||
#include <common/toolkit/SessionTk.h>
|
||||
#include <net/msghelpers/MsgHelperIO.h>
|
||||
#include <toolkit/StorageTkEx.h>
|
||||
#include "ReadLocalFileV2MsgEx.h"
|
||||
#ifdef BEEGFS_NVFS
|
||||
#include "ReadLocalFileRDMAMsgEx.h"
|
||||
#endif
|
||||
#include <sys/sendfile.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#define READ_USE_TUNEFILEREAD_TRIGGER (4*1024*1024) /* seq IO trigger for tuneFileReadSize */
|
||||
|
||||
#define READ_BUF_OFFSET_PROTO_MIN (sizeof(int64_t) ) /* for prepended length info */
|
||||
#define READ_BUF_END_PROTO_MIN (sizeof(int64_t) ) /* for appended length info */
|
||||
|
||||
|
||||
/* reserve more than necessary at buf start to achieve page cache alignment */
|
||||
const size_t READ_BUF_OFFSET =
|
||||
BEEGFS_MAX( (long)READ_BUF_OFFSET_PROTO_MIN, sysconf(_SC_PAGESIZE) );
|
||||
/* reserve more than necessary at buf end to achieve page cache alignment */
|
||||
const size_t READ_BUF_END_RESERVE =
|
||||
BEEGFS_MAX( (long)READ_BUF_END_PROTO_MIN, sysconf(_SC_PAGESIZE) );
|
||||
/* read buffer size cutoff for protocol data */
|
||||
const size_t READ_BUF_LEN_PROTOCOL_CUTOFF =
|
||||
READ_BUF_OFFSET + READ_BUF_END_RESERVE;
|
||||
|
||||
|
||||
// A linker error occurs for processIncoming without having this forced linkage.
|
||||
static ReadLocalFileV2MsgEx forcedLinkageV2;
|
||||
#ifdef BEEGFS_NVFS
|
||||
static ReadLocalFileRDMAMsgEx forcedLinkageRDMA;
|
||||
#endif
|
||||
|
||||
std::string ReadLocalFileV2MsgSender::logContextPref = "ReadChunkFileV2Msg";
|
||||
#ifdef BEEGFS_NVFS
|
||||
std::string ReadLocalFileRDMAMsgSender::logContextPref = "ReadChunkFileRDMAMsg";
|
||||
#endif
|
||||
|
||||
template <class Msg, typename ReadState>
|
||||
bool ReadLocalFileMsgExBase<Msg, ReadState>::processIncoming(NetMessage::ResponseContext& ctx)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " incoming";
|
||||
|
||||
bool retVal = true; // return value
|
||||
|
||||
int64_t readRes = 0;
|
||||
|
||||
std::string fileHandleID(getFileHandleID() );
|
||||
bool isMirrorSession = isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_BUDDYMIRROR);
|
||||
|
||||
// do session check only when it is not a mirror session
|
||||
bool useSessionCheck = isMirrorSession ? false :
|
||||
isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_SESSION_CHECK);
|
||||
|
||||
App* app = Program::getApp();
|
||||
SessionStore* sessions = app->getSessions();
|
||||
auto session = sessions->referenceOrAddSession(getClientNumID());
|
||||
this->sessionLocalFiles = session->getLocalFiles();
|
||||
|
||||
// select the right targetID
|
||||
|
||||
uint16_t targetID = getTargetID();
|
||||
|
||||
if(isMirrorSession )
|
||||
{ // given targetID refers to a buddy mirror group
|
||||
MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();
|
||||
|
||||
targetID = isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) ?
|
||||
mirrorBuddies->getSecondaryTargetID(targetID) :
|
||||
mirrorBuddies->getPrimaryTargetID(targetID);
|
||||
|
||||
// note: only log message here, error handling will happen below through invalid targetFD
|
||||
if(unlikely(!targetID) )
|
||||
LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
|
||||
StringTk::uintToStr(getTargetID() ) );
|
||||
}
|
||||
|
||||
auto* const target = app->getStorageTargets()->getTarget(targetID);
|
||||
if (!target)
|
||||
{
|
||||
if (isMirrorSession)
|
||||
{ /* buddy mirrored file => fail with Err_COMMUNICATION to make the requestor retry.
|
||||
mgmt will mark this target as (p)offline in a few moments. */
|
||||
LOG(GENERAL, NOTICE, "Unknown target ID, refusing request.", targetID);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
|
||||
return true;
|
||||
}
|
||||
|
||||
LOG(GENERAL, ERR, "Unknown target ID.", targetID);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_UNKNOWNTARGET);
|
||||
return true;
|
||||
}
|
||||
|
||||
// check if we already have a session for this file...
|
||||
|
||||
auto sessionLocalFile = sessionLocalFiles->referenceSession(
|
||||
fileHandleID, targetID, isMirrorSession);
|
||||
if(!sessionLocalFile)
|
||||
{ // sessionLocalFile not exists yet => create, insert, re-get it
|
||||
if(useSessionCheck)
|
||||
{ // server crashed during the write, maybe lost some data send error to client
|
||||
LogContext log(logContext);
|
||||
log.log(Log_WARNING, "Potential cache loss for open file handle. (Server crash detected.) "
|
||||
"No session for file available. "
|
||||
"FileHandleID: " + fileHandleID);
|
||||
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_STORAGE_SRV_CRASHED);
|
||||
goto release_session;
|
||||
}
|
||||
|
||||
std::string fileID = SessionTk::fileIDFromHandleID(fileHandleID);
|
||||
int openFlags = SessionTk::sysOpenFlagsFromFhgfsAccessFlags(getAccessFlags() );
|
||||
|
||||
auto newFile = boost::make_unique<SessionLocalFile>(fileHandleID, targetID, fileID, openFlags,
|
||||
false);
|
||||
|
||||
if(isMirrorSession)
|
||||
newFile->setIsMirrorSession(true);
|
||||
|
||||
sessionLocalFile = sessionLocalFiles->addAndReferenceSession(std::move(newFile));
|
||||
}
|
||||
else
|
||||
{ // session file exists
|
||||
if(useSessionCheck && sessionLocalFile->isServerCrashed() )
|
||||
{ // server crashed during the write, maybe lost some data send error to client
|
||||
LogContext log(logContext);
|
||||
log.log(Log_SPAM, "Potential cache loss for open file handle. (Server crash detected.) "
|
||||
"The session is marked as dirty. "
|
||||
"FileHandleID: " + fileHandleID);
|
||||
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_STORAGE_SRV_CRASHED);
|
||||
goto release_session;
|
||||
}
|
||||
}
|
||||
|
||||
/* Note: the session file must be unlocked/released before we send the finalizing info,
|
||||
because otherwise we have a race when the client assumes the read is complete and tries
|
||||
to close the file (while the handle is actually still referenced on the server). */
|
||||
/* Note: we also must be careful to update the current offset before sending the final length
|
||||
info because otherwise the session file might have been released already and we have no
|
||||
longer access to the offset. */
|
||||
|
||||
readRes = -1;
|
||||
try
|
||||
{
|
||||
// prepare file descriptor (if file not open yet then open it if it exists already)
|
||||
FhgfsOpsErr openRes = openFile(*target, sessionLocalFile.get());
|
||||
if(openRes != FhgfsOpsErr_SUCCESS)
|
||||
{
|
||||
sendLengthInfo(ctx.getSocket(), -openRes);
|
||||
goto release_session;
|
||||
}
|
||||
|
||||
// check if file exists
|
||||
if(!sessionLocalFile->getFD().valid())
|
||||
{ // file didn't exist (not an error) => send EOF
|
||||
sendLengthInfo(ctx.getSocket(), 0);
|
||||
goto release_session;
|
||||
}
|
||||
|
||||
// the actual read workhorse...
|
||||
|
||||
readRes = incrementalReadStatefulAndSendV2(ctx, sessionLocalFile.get());
|
||||
|
||||
LOG_DEBUG(logContext, Log_SPAM, "sending completed. "
|
||||
"readRes: " + StringTk::int64ToStr(readRes) );
|
||||
IGNORE_UNUSED_VARIABLE(readRes);
|
||||
|
||||
}
|
||||
catch(SocketException& e)
|
||||
{
|
||||
LogContext(logContext).logErr(std::string("SocketException occurred: ") + e.what() );
|
||||
LogContext(logContext).log(Log_WARNING, "Details: "
|
||||
"sessionID: " + getClientNumID().str() + "; "
|
||||
"fileHandle: " + fileHandleID + "; "
|
||||
"offset: " + StringTk::int64ToStr(getOffset() ) + "; "
|
||||
"count: " + StringTk::int64ToStr(getCount() ) );
|
||||
|
||||
sessionLocalFile->setOffset(-1); /* invalidate offset (we can only do this if still locked,
|
||||
but that's not a prob if we update offset correctly before send - see notes above) */
|
||||
|
||||
retVal = false;
|
||||
goto release_session;
|
||||
}
|
||||
|
||||
release_session:
|
||||
|
||||
// update operation counters
|
||||
|
||||
if(likely(readRes > 0) )
|
||||
app->getNodeOpStats()->updateNodeOp(
|
||||
ctx.getSocket()->getPeerIP(), StorageOpCounter_READOPS, readRes, getMsgHeaderUserID() );
|
||||
|
||||
return retVal;
|
||||
}
|
||||
|
||||
/**
 * Splits the context buffer into the area that file data is read into and the area that is
 * sent to the client.
 *
 * @param dataBuf out: start of the file data, READ_BUF_OFFSET bytes into the context buffer.
 * @param sendBuf out: start of the data to send, READ_BUF_OFFSET_PROTO_MIN bytes before
 *    *dataBuf (room for the prepended length info).
 * @return context buffer length reduced by READ_BUF_LEN_PROTOCOL_CUTOFF (cutoff for prepended
 *    and finalizing length info).
 */
inline size_t ReadLocalFileV2MsgSender::getBuffers(ResponseContext& ctx, char** dataBuf, char** sendBuf)
{
   char* const fileDataStart = ctx.getBuffer() + READ_BUF_OFFSET;

   *dataBuf = fileDataStart;
   *sendBuf = fileDataStart - READ_BUF_OFFSET_PROTO_MIN;

   return ctx.getBufferLength() - READ_BUF_LEN_PROTOCOL_CUTOFF;
}
|
||||
|
||||
/**
|
||||
* Note: This is similar to incrementalReadAndSend, but uses the offset from sessionLocalFile
|
||||
* to avoid calling seek every time.
|
||||
*
|
||||
* Warning: Do not use the returned value to set the new offset, as there might be other threads
|
||||
* that also did something with the file (i.e. the io-lock is released somewhere within this
|
||||
* method).
|
||||
*
|
||||
* @return number of bytes read or some arbitrary negative value otherwise
|
||||
*/
|
||||
template <class Msg, typename ReadState>
|
||||
int64_t ReadLocalFileMsgExBase<Msg, ReadState>::incrementalReadStatefulAndSendV2(NetMessage::ResponseContext& ctx,
|
||||
SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
/* note on session offset: the session offset must always be set before sending the data to the
|
||||
client (otherwise the client could send the next request before we updated the offset, which
|
||||
would lead to a race condition) */
|
||||
|
||||
std::string logContext = Msg::logContextPref + " (read incremental)";
|
||||
Config* cfg = Program::getApp()->getConfig();
|
||||
|
||||
char* dataBuf;
|
||||
char* sendBuf;
|
||||
|
||||
if (READ_BUF_LEN_PROTOCOL_CUTOFF >= ctx.getBufferLength())
|
||||
{ // buffer too small. That shouldn't happen and is an error
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_INTERNAL);
|
||||
return -1;
|
||||
}
|
||||
|
||||
const ssize_t dataBufLen = getBuffers(ctx, &dataBuf, &sendBuf);
|
||||
|
||||
auto& fd = sessionLocalFile->getFD();
|
||||
int64_t oldOffset = sessionLocalFile->getOffset();
|
||||
int64_t newOffset = getOffset();
|
||||
|
||||
bool skipReadAhead =
|
||||
unlikely(isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_DISABLE_IO) ||
|
||||
sessionLocalFile->getIsDirectIO());
|
||||
|
||||
ssize_t readAheadSize = skipReadAhead ? 0 : cfg->getTuneFileReadAheadSize();
|
||||
ssize_t readAheadTriggerSize = cfg->getTuneFileReadAheadTriggerSize();
|
||||
|
||||
if( (oldOffset < 0) || (oldOffset != newOffset) )
|
||||
{
|
||||
sessionLocalFile->resetReadCounter(); // reset sequential read counter
|
||||
sessionLocalFile->resetLastReadAheadTrigger();
|
||||
}
|
||||
else
|
||||
{ // read continues at previous offset
|
||||
LOG_DEBUG(logContext, Log_SPAM,
|
||||
"fileID: " + sessionLocalFile->getFileID() + "; "
|
||||
"offset: " + StringTk::int64ToStr(getOffset() ) );
|
||||
}
|
||||
|
||||
size_t maxReadAtOnceLen = dataBufLen;
|
||||
|
||||
// reduce maxReadAtOnceLen to achieve better read/send aync overlap
|
||||
/* (note: reducing makes only sense if we can rely on the kernel to do some read-ahead, so don't
|
||||
reduce for direct IO and for random IO) */
|
||||
if( (sessionLocalFile->getReadCounter() >= READ_USE_TUNEFILEREAD_TRIGGER) &&
|
||||
!sessionLocalFile->getIsDirectIO() )
|
||||
maxReadAtOnceLen = BEEGFS_MIN(dataBufLen, cfg->getTuneFileReadSize() );
|
||||
|
||||
off_t readOffset = getOffset();
|
||||
ReadState readState(logContext.c_str(), getCount(), sessionLocalFile);
|
||||
|
||||
if (!isMsgValid() || !readStateInit(readState))
|
||||
{
|
||||
LogContext(logContext).logErr("Invalid read message.");
|
||||
sessionLocalFile->setOffset(-1);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_INVAL);
|
||||
return -1;
|
||||
}
|
||||
|
||||
for( ; ; )
|
||||
{
|
||||
ssize_t readLength = getReadLength(readState, BEEGFS_MIN(maxReadAtOnceLen, readState.toBeRead));
|
||||
|
||||
readState.readRes = unlikely(isMsgHeaderFeatureFlagSet(READLOCALFILEMSG_FLAG_DISABLE_IO) ) ?
|
||||
readLength : MsgHelperIO::pread(*fd, dataBuf, readLength, readOffset);
|
||||
|
||||
LOG_DEBUG(logContext, Log_SPAM,
|
||||
"toBeRead: " + StringTk::int64ToStr(readState.toBeRead) + "; "
|
||||
"readLength: " + StringTk::int64ToStr(readLength) + "; "
|
||||
"readRes: " + StringTk::int64ToStr(readState.readRes) );
|
||||
|
||||
if(readState.readRes == readLength)
|
||||
{ // simple success case
|
||||
readState.toBeRead -= readState.readRes;
|
||||
|
||||
readOffset += readState.readRes;
|
||||
|
||||
int64_t newOffset = getOffset() + getCount() - readState.toBeRead;
|
||||
sessionLocalFile->setOffset(newOffset); // update offset
|
||||
|
||||
sessionLocalFile->incReadCounter(readState.readRes); // update sequential read length
|
||||
|
||||
ctx.getStats()->incVals.diskReadBytes += readState.readRes; // update stats
|
||||
|
||||
bool isFinal = !readState.toBeRead;
|
||||
|
||||
if (readStateSendData(ctx.getSocket(), readState, sendBuf, isFinal) < 0)
|
||||
{
|
||||
LogContext(logContext).logErr("readStateSendData failed.");
|
||||
sessionLocalFile->setOffset(-1);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
|
||||
return -1;
|
||||
}
|
||||
|
||||
checkAndStartReadAhead(sessionLocalFile, readAheadTriggerSize, newOffset, readAheadSize);
|
||||
|
||||
if(isFinal)
|
||||
{ // we reached the end of the requested data
|
||||
return getCount();
|
||||
}
|
||||
|
||||
if (!readStateNext(readState))
|
||||
{
|
||||
LogContext(logContext).logErr("readStateNext failed.");
|
||||
sessionLocalFile->setOffset(-1);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{ // readRes not as it should be => might be an error or just an end-of-file
|
||||
|
||||
if(readState.readRes == -1)
|
||||
{ // read error occurred
|
||||
LogContext(logContext).log(Log_WARNING, "Unable to read file data. "
|
||||
"FileID: " + sessionLocalFile->getFileID() + "; "
|
||||
"SysErr: " + System::getErrString() );
|
||||
|
||||
sessionLocalFile->setOffset(-1);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_INTERNAL);
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{ // just an end of file
|
||||
LOG_DEBUG(logContext, Log_DEBUG,
|
||||
"Unable to read all of the requested data (=> end of file)");
|
||||
LOG_DEBUG(logContext, Log_DEBUG,
|
||||
"offset: " + StringTk::int64ToStr(getOffset() ) + "; "
|
||||
"count: " + StringTk::int64ToStr(getCount() ) + "; "
|
||||
"readLength: " + StringTk::int64ToStr(readLength) + "; " +
|
||||
"readRes: " + StringTk::int64ToStr(readState.readRes) + "; " +
|
||||
"toBeRead: " + StringTk::int64ToStr(readState.toBeRead) );
|
||||
|
||||
readOffset += readState.readRes;
|
||||
readState.toBeRead -= readState.readRes;
|
||||
|
||||
sessionLocalFile->setOffset(getOffset() + getCount() - readState.toBeRead); // update offset
|
||||
|
||||
sessionLocalFile->incReadCounter(readState.readRes); // update sequential read length
|
||||
|
||||
ctx.getStats()->incVals.diskReadBytes += readState.readRes; // update stats
|
||||
|
||||
if(readState.readRes > 0)
|
||||
{
|
||||
if (readStateSendData(ctx.getSocket(), readState, sendBuf, true) < 0)
|
||||
{
|
||||
LogContext(logContext).logErr("readStateSendData failed.");
|
||||
sessionLocalFile->setOffset(-1);
|
||||
sendLengthInfo(ctx.getSocket(), -FhgfsOpsErr_COMMUNICATION);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
sendLengthInfo(ctx.getSocket(), 0);
|
||||
|
||||
return(getCount() - readState.toBeRead);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // end of for-loop
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Starts read-ahead if enough sequential data has been read.
|
||||
*
|
||||
* Note: if getDisableIO() is true, we assume the caller sets readAheadSize==0, so getDisableIO()
|
||||
* is not checked explicitly within this function.
|
||||
*
|
||||
* @sessionLocalFile lastReadAheadOffset will be updated if read-head was triggered
|
||||
* @param readAheadTriggerSize the length of sequential IO that triggers read-ahead
|
||||
* @param currentOffset current file offset (where read-ahead would start)
|
||||
*/
|
||||
template <class Msg, typename ReadState>
|
||||
void ReadLocalFileMsgExBase<Msg, ReadState>::checkAndStartReadAhead(SessionLocalFile* sessionLocalFile,
|
||||
ssize_t readAheadTriggerSize, off_t currentOffset, off_t readAheadSize)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (read-ahead)";
|
||||
|
||||
if(!readAheadSize)
|
||||
return;
|
||||
|
||||
int64_t readCounter = sessionLocalFile->getReadCounter();
|
||||
int64_t nextReadAheadTrigger = sessionLocalFile->getLastReadAheadTrigger() ?
|
||||
sessionLocalFile->getLastReadAheadTrigger() + readAheadSize : readAheadTriggerSize;
|
||||
|
||||
if(readCounter < nextReadAheadTrigger)
|
||||
return; // we're not at the trigger point yet
|
||||
|
||||
/* start read-head...
|
||||
(read-ahead is supposed to be non-blocking if there are free slots in the device IO queue) */
|
||||
|
||||
LOG_DEBUG(logContext, Log_SPAM,
|
||||
std::string("Starting read-ahead... ") +
|
||||
"offset: " + StringTk::int64ToStr(currentOffset) + "; "
|
||||
"size: " + StringTk::int64ToStr(readAheadSize) );
|
||||
|
||||
MsgHelperIO::readAhead(*sessionLocalFile->getFD(), currentOffset, readAheadSize);
|
||||
|
||||
// update trigger
|
||||
|
||||
sessionLocalFile->setLastReadAheadTrigger(readCounter);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Open the file if a filedescriptor is not already set in sessionLocalFile.
|
||||
* If the file needs to be opened, this method will check the target consistency state before
|
||||
* opening.
|
||||
*
|
||||
* @return we return the special value FhgfsOpsErr_COMMUNICATION here in some cases to indirectly
|
||||
* ask the client for a retry (e.g. if target consistency is not good for buddymirrored chunks).
|
||||
*/
|
||||
template <class Msg, typename ReadState>
|
||||
FhgfsOpsErr ReadLocalFileMsgExBase<Msg, ReadState>::openFile(const StorageTarget& target,
|
||||
SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (open)";
|
||||
|
||||
bool isBuddyMirrorChunk = sessionLocalFile->getIsMirrorSession();
|
||||
|
||||
|
||||
if (sessionLocalFile->getFD().valid())
|
||||
return FhgfsOpsErr_SUCCESS; // file already open => nothing to be done here
|
||||
|
||||
|
||||
// file not open yet => get targetFD and check consistency state
|
||||
|
||||
const auto consistencyState = target.getConsistencyState();
|
||||
const int targetFD = isBuddyMirrorChunk ? *target.getMirrorFD() : *target.getChunkFD();
|
||||
|
||||
if(unlikely(consistencyState != TargetConsistencyState_GOOD) && isBuddyMirrorChunk)
|
||||
{ // this is a request for a buddymirrored chunk on a non-good target
|
||||
LogContext(logContext).log(Log_NOTICE, "Refusing request. Target consistency is not good. "
|
||||
"targetID: " + StringTk::uintToStr(target.getID()));
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
FhgfsOpsErr openChunkRes = sessionLocalFile->openFile(targetFD, getPathInfo(), false, NULL);
|
||||
|
||||
return openChunkRes;
|
||||
}
|
||||
216
storage/source/net/message/session/rw/ReadLocalFileV2MsgEx.h
Normal file
216
storage/source/net/message/session/rw/ReadLocalFileV2MsgEx.h
Normal file
@@ -0,0 +1,216 @@
|
||||
#pragma once
|
||||
|
||||
#include <common/net/message/session/rw/ReadLocalFileV2Msg.h>
|
||||
#include <common/storage/StorageErrors.h>
|
||||
#include <session/SessionLocalFileStore.h>
|
||||
|
||||
class StorageTarget;
|
||||
|
||||
/**
 * Contains common data needed by implementations of the network protocol
 * that send data to the client.
 */
struct ReadStateBase
{
   const char* logContext;             // not owned; must outlive this object
   uint64_t toBeRead;                  // number of requested bytes that are still to be read
   SessionLocalFile* sessionLocalFile; // not owned
   ssize_t readRes;                    // result of the most recent read; 0 until the first read

   /**
    * @param logContext log context string; only the pointer is stored
    * @param toBeRead total number of bytes requested
    * @param sessionLocalFile file session that is being read; only the pointer is stored
    */
   ReadStateBase(const char* logContext, uint64_t toBeRead,
      SessionLocalFile* sessionLocalFile) :
      logContext(logContext),
      toBeRead(toBeRead),
      sessionLocalFile(sessionLocalFile),
      readRes(0) // was left indeterminate before; readers would have seen garbage
   {
   }
};
|
||||
|
||||
template <class Msg, typename ReadState>
|
||||
class ReadLocalFileMsgExBase : public Msg
|
||||
{
|
||||
public:
|
||||
bool processIncoming(NetMessage::ResponseContext& ctx);
|
||||
|
||||
private:
|
||||
SessionLocalFileStore* sessionLocalFiles;
|
||||
|
||||
FhgfsOpsErr openFile(const StorageTarget& target, SessionLocalFile* sessionLocalFile);
|
||||
|
||||
void checkAndStartReadAhead(SessionLocalFile* sessionLocalFile, ssize_t readAheadTriggerSize,
|
||||
off_t currentOffset, off_t readAheadSize);
|
||||
|
||||
int64_t incrementalReadStatefulAndSendV2(NetMessage::ResponseContext& ctx,
|
||||
SessionLocalFile* sessionLocalFile);
|
||||
|
||||
inline void sendLengthInfo(Socket* sock, int64_t lengthInfo)
|
||||
{
|
||||
static_cast<Msg&>(*this).sendLengthInfo(sock, lengthInfo);
|
||||
}
|
||||
|
||||
inline bool readStateInit(ReadState& rs)
|
||||
{
|
||||
return static_cast<Msg&>(*this).readStateInit(rs);
|
||||
}
|
||||
|
||||
inline ssize_t readStateSendData(Socket* sock, ReadState& rs, char* buf, bool isFinal)
|
||||
{
|
||||
return static_cast<Msg&>(*this).readStateSendData(sock, rs, buf, isFinal);
|
||||
}
|
||||
|
||||
inline bool readStateNext(ReadState& rs)
|
||||
{
|
||||
return static_cast<Msg&>(*this).readStateNext(rs);
|
||||
}
|
||||
|
||||
inline ssize_t getReadLength(ReadState& rs, ssize_t len)
|
||||
{
|
||||
return static_cast<Msg&>(*this).getReadLength(rs, len);
|
||||
}
|
||||
|
||||
inline size_t getBuffers(NetMessage::ResponseContext& ctx, char** dataBuf, char** sendBuf)
|
||||
{
|
||||
return static_cast<Msg&>(*this).getBuffers(ctx, dataBuf, sendBuf);
|
||||
}
|
||||
|
||||
public:
|
||||
inline unsigned getMsgHeaderUserID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getMsgHeaderUserID();
|
||||
}
|
||||
|
||||
inline bool isMsgHeaderFeatureFlagSet(unsigned flag) const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).isMsgHeaderFeatureFlagSet(flag);
|
||||
}
|
||||
|
||||
inline uint16_t getTargetID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getTargetID();
|
||||
}
|
||||
|
||||
inline int64_t getOffset() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getOffset();
|
||||
}
|
||||
|
||||
inline int64_t getCount() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getCount();
|
||||
}
|
||||
|
||||
inline const char* getFileHandleID()
|
||||
{
|
||||
return static_cast<Msg&>(*this).getFileHandleID();
|
||||
}
|
||||
|
||||
inline NumNodeID getClientNumID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getClientNumID();
|
||||
}
|
||||
|
||||
inline unsigned getAccessFlags() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getAccessFlags();
|
||||
}
|
||||
|
||||
inline PathInfo* getPathInfo ()
|
||||
{
|
||||
return static_cast<Msg&>(*this).getPathInfo();
|
||||
}
|
||||
|
||||
inline bool isMsgValid() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).isMsgValid();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
* Implements the Version 2 send protocol. It uses a preceding length info for each chunk.
|
||||
*/
|
||||
class ReadLocalFileV2MsgSender : public ReadLocalFileV2Msg
|
||||
{
|
||||
/* note on protocol: this works by sending an int64 before each data chunk, which contains the
|
||||
length of the next data chunk; or a zero if no more data can be read; or a negative fhgfs
|
||||
error code in case of an error */
|
||||
public:
|
||||
struct ReadState : public ReadStateBase
|
||||
{
|
||||
ReadState(const char* logContext, uint64_t toBeRead,
|
||||
SessionLocalFile* sessionLocalFile) :
|
||||
ReadStateBase(logContext, toBeRead, sessionLocalFile) {}
|
||||
};
|
||||
|
||||
private:
|
||||
friend class ReadLocalFileMsgExBase<ReadLocalFileV2MsgSender, ReadState>;
|
||||
|
||||
static std::string logContextPref;
|
||||
|
||||
/**
|
||||
* Send only length information without a data packet. Typically used for the final length
|
||||
* info at the end of the requested data.
|
||||
*/
|
||||
inline void sendLengthInfo(Socket* sock, int64_t lengthInfo)
|
||||
{
|
||||
lengthInfo = HOST_TO_LE_64(lengthInfo);
|
||||
sock->send(&lengthInfo, sizeof(int64_t), 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* No-op for this implementation.
|
||||
*/
|
||||
inline bool readStateInit(ReadState& rs)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send length information and the corresponding data packet buffer.
|
||||
*
|
||||
* Note: rs.readRes is used to compute buf length for send()
|
||||
*
|
||||
* @param rs.readRes must not be negative
|
||||
* @param buf the buffer with a preceding gap for the length info
|
||||
* @param isFinal true if this is the last send, i.e. we have read all data
|
||||
*/
|
||||
inline ssize_t readStateSendData(Socket* sock, ReadState& rs, char* buf, bool isFinal)
|
||||
{
|
||||
ssize_t sendRes;
|
||||
{
|
||||
Serializer ser(buf, sizeof(int64_t));
|
||||
ser % rs.readRes;
|
||||
}
|
||||
|
||||
if (isFinal)
|
||||
{
|
||||
Serializer ser(buf + sizeof(int64_t) + rs.readRes, sizeof(int64_t));
|
||||
ser % int64_t(0);
|
||||
sendRes = sock->send(buf, (2*sizeof(int64_t) ) + rs.readRes, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
sendRes = sock->send(buf, sizeof(int64_t) + rs.readRes, 0);
|
||||
}
|
||||
return sendRes;
|
||||
}
|
||||
|
||||
/**
|
||||
* No-op for this implementation.
|
||||
*/
|
||||
inline bool readStateNext(ReadState& rs)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
inline ssize_t getReadLength(ReadState& rs, ssize_t len)
|
||||
{
|
||||
return len;
|
||||
}
|
||||
|
||||
size_t getBuffers(ResponseContext& ctx, char** dataBuf, char** sendBuf);
|
||||
};
|
||||
|
||||
// the concrete read message handler: generic read logic combined with the V2 send protocol
using ReadLocalFileV2MsgEx =
   ReadLocalFileMsgExBase<ReadLocalFileV2MsgSender, ReadLocalFileV2MsgSender::ReadState>;
|
||||
|
||||
926
storage/source/net/message/session/rw/WriteLocalFileMsgEx.cpp
Normal file
926
storage/source/net/message/session/rw/WriteLocalFileMsgEx.cpp
Normal file
@@ -0,0 +1,926 @@
|
||||
#include <program/Program.h>
|
||||
#include <common/toolkit/MessagingTk.h>
|
||||
#include <common/toolkit/SessionTk.h>
|
||||
#include <common/toolkit/StorageTk.h>
|
||||
#include <net/msghelpers/MsgHelperIO.h>
|
||||
#include <storage/StorageTargets.h>
|
||||
#include <toolkit/StorageTkEx.h>
|
||||
#include "WriteLocalFileMsgEx.h"
|
||||
#ifdef BEEGFS_NVFS
|
||||
#include "WriteLocalFileRDMAMsgEx.h"
|
||||
#endif
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
static WriteLocalFileMsgEx forcedLinkage;
|
||||
#ifdef BEEGFS_NVFS
|
||||
static WriteLocalFileRDMAMsgEx forcedLinkageRDMA;
|
||||
#endif
|
||||
|
||||
const std::string WriteLocalFileMsgSender::logContextPref = "WriteChunkFileMsg";
|
||||
#ifdef BEEGFS_NVFS
|
||||
const std::string WriteLocalFileRDMAMsgSender::logContextPref = "WriteChunkFileRDMAMsg";
|
||||
#endif
|
||||
|
||||
template <class Msg, typename WriteState>
|
||||
bool WriteLocalFileMsgExBase<Msg, WriteState>::processIncoming(NetMessage::ResponseContext& ctx)
|
||||
{
|
||||
App* app = Program::getApp();
|
||||
|
||||
bool success;
|
||||
int64_t writeClientRes;
|
||||
|
||||
if (!isMsgValid())
|
||||
{
|
||||
sendResponse(ctx, FhgfsOpsErr_INVAL);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::tie(success, writeClientRes) = write(ctx);
|
||||
|
||||
if (success)
|
||||
{
|
||||
sendResponse(ctx, writeClientRes);
|
||||
|
||||
// update operation counters
|
||||
|
||||
if (likely(writeClientRes > 0))
|
||||
app->getNodeOpStats()->updateNodeOp(ctx.getSocket()->getPeerIP(),
|
||||
StorageOpCounter_WRITEOPS, writeClientRes, getMsgHeaderUserID());
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class Msg, typename WriteState>
|
||||
std::pair<bool, int64_t> WriteLocalFileMsgExBase<Msg, WriteState>::write(NetMessage::ResponseContext& ctx)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " incoming";
|
||||
|
||||
App* app = Program::getApp();
|
||||
|
||||
int64_t writeClientRes = -(int64_t)FhgfsOpsErr_INTERNAL; // bytes written or negative fhgfs err
|
||||
FhgfsOpsErr finishMirroringRes = FhgfsOpsErr_INTERNAL;
|
||||
std::string fileHandleID(getFileHandleID() );
|
||||
bool isMirrorSession = isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
|
||||
|
||||
bool serverCrashed = false;
|
||||
QuotaExceededErrorType quotaExceeded = QuotaExceededErrorType_NOT_EXCEEDED;
|
||||
|
||||
SessionStore* sessions = Program::getApp()->getSessions();
|
||||
auto session = sessions->referenceOrAddSession(getClientNumID());
|
||||
SessionLocalFileStore* sessionLocalFiles = session->getLocalFiles();
|
||||
|
||||
ChunkLockStore* chunkLockStore = app->getChunkLockStore();
|
||||
bool chunkLocked = false;
|
||||
|
||||
// select the right targetID
|
||||
|
||||
uint16_t targetID = getTargetID();
|
||||
|
||||
if(isMirrorSession)
|
||||
{ // given targetID refers to a buddy mirror group
|
||||
MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();
|
||||
|
||||
targetID = isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) ?
|
||||
mirrorBuddies->getSecondaryTargetID(targetID) :
|
||||
mirrorBuddies->getPrimaryTargetID(targetID);
|
||||
|
||||
// note: only log message here, error handling will happen below through invalid targetFD
|
||||
if(unlikely(!targetID) )
|
||||
LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
|
||||
StringTk::uintToStr(getTargetID() ) );
|
||||
}
|
||||
|
||||
auto* const target = app->getStorageTargets()->getTarget(targetID);
|
||||
if (!target)
|
||||
{
|
||||
if (isMirrorSession)
|
||||
{ /* buddy mirrored file => fail with Err_COMMUNICATION to make the requestor retry.
|
||||
mgmt will mark this target as (p)offline in a few moments. */
|
||||
LOG(GENERAL, NOTICE, "Unknown target ID, refusing request.", targetID);
|
||||
return {false, FhgfsOpsErr_COMMUNICATION};
|
||||
}
|
||||
|
||||
LOG(GENERAL, ERR, "Unknown target ID.", targetID);
|
||||
return {false, FhgfsOpsErr_UNKNOWNTARGET};
|
||||
}
|
||||
|
||||
// check if we already have session for this file...
|
||||
|
||||
auto sessionLocalFile = sessionLocalFiles->referenceSession(
|
||||
fileHandleID, targetID, isMirrorSession);
|
||||
|
||||
if(!sessionLocalFile)
|
||||
{ // sessionLocalFile not exists yet => create, insert, re-get it
|
||||
|
||||
if(doSessionCheck() )
|
||||
{ // server crashed during the write, maybe lost some data send error to client
|
||||
LogContext log(logContext);
|
||||
log.log(Log_WARNING, "Potential cache loss for open file handle. (Server crash detected.) "
|
||||
"No session for file available. "
|
||||
"FileHandleID: " + fileHandleID);
|
||||
|
||||
serverCrashed = true;
|
||||
}
|
||||
|
||||
std::string fileID = SessionTk::fileIDFromHandleID(fileHandleID);
|
||||
int openFlags = SessionTk::sysOpenFlagsFromFhgfsAccessFlags(getAccessFlags() );
|
||||
|
||||
auto newFile = boost::make_unique<SessionLocalFile>(fileHandleID, targetID, fileID, openFlags,
|
||||
serverCrashed);
|
||||
|
||||
if(isMirrorSession)
|
||||
newFile->setIsMirrorSession(true);
|
||||
|
||||
sessionLocalFile = sessionLocalFiles->addAndReferenceSession(std::move(newFile));
|
||||
}
|
||||
else
|
||||
{ // session file exists
|
||||
|
||||
if(doSessionCheck() && sessionLocalFile->isServerCrashed() )
|
||||
{ // server crashed during the write, maybe lost some data send error to client
|
||||
LogContext log(logContext);
|
||||
log.log(Log_SPAM, "Potential cache loss for open file handle. (Server crash detected.)"
|
||||
"The session is marked as dirty. "
|
||||
"FileHandleID: " + fileHandleID);
|
||||
|
||||
serverCrashed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// check if the size quota is exceeded for the user or group
|
||||
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA) &&
|
||||
app->getConfig()->getQuotaEnableEnforcement() )
|
||||
{
|
||||
quotaExceeded = app->getExceededQuotaStores()->get(targetID)->isQuotaExceeded(getUserID(),
|
||||
getGroupID(), QuotaLimitType_SIZE);
|
||||
|
||||
if(quotaExceeded != QuotaExceededErrorType_NOT_EXCEEDED)
|
||||
{
|
||||
LogContext(logContext).log(Log_NOTICE,
|
||||
QuotaData::QuotaExceededErrorTypeToString(quotaExceeded) + " "
|
||||
"UID: " + StringTk::uintToStr(this->getUserID()) + "; "
|
||||
"GID: " + StringTk::uintToStr(this->getGroupID() ) );
|
||||
|
||||
// receive the message content before return with error
|
||||
incrementalRecvPadding(ctx, getCount(), sessionLocalFile.get());
|
||||
writeClientRes = -(int64_t) FhgfsOpsErr_DQUOT;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
if(isMirrorSession && target->getBuddyResyncInProgress())
|
||||
{
|
||||
// mirrored chunk should be modified, check if resync is in progress and lock chunk
|
||||
std::string chunkID = sessionLocalFile->getFileID();
|
||||
chunkLockStore->lockChunk(targetID, chunkID);
|
||||
chunkLocked = true;
|
||||
}
|
||||
|
||||
// prepare file descriptor (if file not open yet then create/open it)
|
||||
FhgfsOpsErr openRes = openFile(*target, sessionLocalFile.get());
|
||||
if(unlikely(openRes != FhgfsOpsErr_SUCCESS) )
|
||||
{
|
||||
incrementalRecvPadding(ctx, getCount(), sessionLocalFile.get());
|
||||
writeClientRes = -(int64_t)openRes;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// store mirror node reference in session and init mirrorToSock member
|
||||
FhgfsOpsErr prepMirrorRes = prepareMirroring(ctx.getBuffer(), ctx.getBufferLength(),
|
||||
sessionLocalFile.get(), *target);
|
||||
if(unlikely(prepMirrorRes != FhgfsOpsErr_SUCCESS) )
|
||||
{ // mirroring failed
|
||||
incrementalRecvPadding(ctx, getCount(), sessionLocalFile.get());
|
||||
writeClientRes = -(int64_t)prepMirrorRes;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
// the actual write workhorse
|
||||
|
||||
int64_t writeLocalRes = incrementalRecvAndWriteStateful(ctx, sessionLocalFile.get());
|
||||
|
||||
// update client result, offset etc.
|
||||
|
||||
int64_t newOffset;
|
||||
|
||||
if(unlikely(writeLocalRes < 0) )
|
||||
newOffset = -1; // writing failed
|
||||
else
|
||||
{ // writing succeeded
|
||||
newOffset = getOffset() + writeLocalRes;
|
||||
ctx.getStats()->incVals.diskWriteBytes += writeLocalRes; // update stats
|
||||
}
|
||||
|
||||
sessionLocalFile->setOffset(newOffset);
|
||||
|
||||
writeClientRes = writeLocalRes;
|
||||
|
||||
}
|
||||
catch(SocketException& e)
|
||||
{
|
||||
LogContext(logContext).logErr(std::string("SocketException occurred: ") + e.what() );
|
||||
LogContext(logContext).log(Log_WARNING, std::string("Details: ") +
|
||||
"sessionID: " + getClientNumID().str() + "; "
|
||||
"fileHandle: " + std::string(sessionLocalFile->getFileHandleID() ) + "; "
|
||||
"offset: " + StringTk::int64ToStr(getOffset() ) + "; "
|
||||
"count: " + StringTk::int64ToStr(getCount() ) );
|
||||
|
||||
sessionLocalFile->setOffset(-1); // invalidate offset
|
||||
|
||||
finishMirroring(sessionLocalFile.get(), *target);
|
||||
|
||||
if (chunkLocked)
|
||||
{
|
||||
std::string chunkID = sessionLocalFile->getFileID();
|
||||
chunkLockStore->unlockChunk(targetID, chunkID);
|
||||
}
|
||||
|
||||
return {false, -1};
|
||||
}
|
||||
|
||||
|
||||
cleanup:
|
||||
finishMirroringRes = finishMirroring(sessionLocalFile.get(), *target);
|
||||
|
||||
// check mirroring result (don't overwrite local error code, if any)
|
||||
if(likely(writeClientRes > 0) )
|
||||
{ // no local error => check mirroring result
|
||||
if(unlikely(finishMirroringRes != FhgfsOpsErr_SUCCESS) )
|
||||
writeClientRes = -finishMirroringRes; // mirroring failed => use err code as client result
|
||||
}
|
||||
|
||||
if (chunkLocked)
|
||||
{
|
||||
std::string chunkID = sessionLocalFile->getFileID();
|
||||
chunkLockStore->unlockChunk(targetID, chunkID);
|
||||
}
|
||||
|
||||
if (serverCrashed)
|
||||
writeClientRes = -(int64_t) FhgfsOpsErr_STORAGE_SRV_CRASHED;
|
||||
|
||||
return {true, writeClientRes};
|
||||
}
|
||||
|
||||
/**
 * Receives at most one message buffer full of the remaining data from the client socket.
 *
 * @param toBeReceived number of bytes that are still outstanding
 * @return result of the socket recvT() call
 */
ssize_t WriteLocalFileMsgSender::recvPadding(ResponseContext& ctx, int64_t toBeReceived)
{
   Config* cfg = Program::getApp()->getConfig();

   // never receive more than fits into the message buffer
   const auto recvLength = BEEGFS_MIN(toBeReceived, ctx.getBufferLength());

   return ctx.getSocket()->recvT(ctx.getBuffer(), recvLength, 0,
      cfg->getConnMsgMediumTimeout());
}
|
||||
|
||||
#ifdef BEEGFS_NVFS
|
||||
|
||||
/**
 * Reads the next piece of remaining data from the client via the socket's read() call, using the
 * buffer address, length and offset that the RdmaInfo iterator provides.
 *
 * @param toBeReceived number of bytes that are still outstanding
 * @return -1 if RdmaInfo::next() fails; otherwise the result of the socket read() call
 */
ssize_t WriteLocalFileRDMAMsgSender::recvPadding(ResponseContext& ctx, int64_t toBeReceived)
{
   RdmaInfo* rdma = getRdmaInfo();

   // filled in by rdma->next()
   uint64_t remoteBuf = 0;
   size_t remoteLen = 0;
   uint64_t remoteOff = 0;

   if (!rdma->next(remoteBuf, remoteLen, remoteOff))
      return -1;

   // limit by local buffer size, by outstanding bytes and by what is left in the remote buffer
   ssize_t recvLength = BEEGFS_MIN(ctx.getBufferLength(), toBeReceived);
   const ssize_t remoteRemaining = static_cast<ssize_t>(remoteLen - remoteOff);
   recvLength = BEEGFS_MIN(recvLength, remoteRemaining);

   return ctx.getSocket()->read(ctx.getBuffer(), recvLength, 0, remoteBuf+remoteOff, rdma->key);
}
|
||||
|
||||
#endif /* BEEGFS_NVFS */
|
||||
|
||||
/**
|
||||
* Note: New offset is saved in the session by the caller afterwards (to make life easier).
|
||||
* @return number of written bytes or negative fhgfs error code
|
||||
*/
|
||||
template <class Msg, typename WriteState>
|
||||
int64_t WriteLocalFileMsgExBase<Msg, WriteState>::incrementalRecvAndWriteStateful(NetMessage::ResponseContext& ctx,
|
||||
SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (write incremental)";
|
||||
Config* cfg = Program::getApp()->getConfig();
|
||||
|
||||
// we can securely cast getTuneFileWriteSize to size_t below to make a comparision possible, as
|
||||
// it can technically never be negative and will therefore always fit into size_t
|
||||
const ssize_t exactStaticRecvSize = sessionLocalFile->getIsDirectIO()
|
||||
? ctx.getBufferLength()
|
||||
: BEEGFS_MIN(ctx.getBufferLength(), (size_t)cfg->getTuneFileWriteSize() );
|
||||
|
||||
auto& fd = sessionLocalFile->getFD();
|
||||
|
||||
int64_t oldOffset = sessionLocalFile->getOffset();
|
||||
int64_t newOffset = getOffset();
|
||||
bool useSyncRange = false; // true if sync_file_range should be called
|
||||
|
||||
if( (oldOffset < 0) || (oldOffset != newOffset) )
|
||||
sessionLocalFile->resetWriteCounter(); // reset sequential write counter
|
||||
else
|
||||
{ // continue at previous offset => increase sequential write counter
|
||||
LOG_DEBUG(logContext, Log_SPAM, "Offset: " + StringTk::int64ToStr(getOffset() ) );
|
||||
|
||||
sessionLocalFile->incWriteCounter(getCount() );
|
||||
|
||||
ssize_t syncSize = unlikely(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) ) ?
|
||||
0 : cfg->getTuneFileWriteSyncSize();
|
||||
if (syncSize && (sessionLocalFile->getWriteCounter() >= syncSize) )
|
||||
useSyncRange = true;
|
||||
}
|
||||
|
||||
// incrementally receive file contents...
|
||||
|
||||
WriteState writeState(logContext.c_str(), exactStaticRecvSize,
|
||||
getCount(), getOffset(), sessionLocalFile);
|
||||
if (!writeStateInit(writeState))
|
||||
return -FhgfsOpsErr_COMMUNICATION;
|
||||
|
||||
do
|
||||
{
|
||||
// receive some bytes...
|
||||
|
||||
LOG_DEBUG(logContext, Log_SPAM,
|
||||
"receiving... (remaining: " + StringTk::intToStr(writeState.toBeReceived) + ")");
|
||||
|
||||
ssize_t recvRes = writeStateRecvData(ctx, writeState);
|
||||
if (recvRes < 0)
|
||||
{
|
||||
LogContext(logContext).log(Log_WARNING, "Socket data transfer error occurred. ");
|
||||
return -FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
// forward to mirror...
|
||||
|
||||
FhgfsOpsErr mirrorRes = sendToMirror(ctx.getBuffer(), recvRes,
|
||||
writeState.writeOffset, writeState.toBeReceived, sessionLocalFile);
|
||||
if(unlikely(mirrorRes != FhgfsOpsErr_SUCCESS) )
|
||||
{ // mirroring failed
|
||||
incrementalRecvPadding(ctx, writeState.toBeReceived, sessionLocalFile);
|
||||
|
||||
return -FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
// write to underlying file system...
|
||||
|
||||
int errCode = 0;
|
||||
ssize_t writeRes = unlikely(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) )
|
||||
? recvRes
|
||||
: doWrite(*fd, ctx.getBuffer(), recvRes, writeState.writeOffset, errCode);
|
||||
|
||||
writeState.toBeReceived -= recvRes;
|
||||
|
||||
// handle write errors...
|
||||
|
||||
if(unlikely(writeRes != recvRes) )
|
||||
{ // didn't write all of the received data
|
||||
|
||||
if(writeRes == -1)
|
||||
{ // write error occurred
|
||||
LogContext(logContext).log(Log_WARNING, "Write error occurred. "
|
||||
"FileHandleID: " + sessionLocalFile->getFileHandleID() + "."
|
||||
"Target: " + StringTk::uintToStr(sessionLocalFile->getTargetID() ) + ". "
|
||||
"File: " + sessionLocalFile->getFileID() + ". "
|
||||
"SysErr: " + System::getErrString(errCode) );
|
||||
LogContext(logContext).log(Log_NOTICE, std::string("Additional info: "
|
||||
"FD: ") + StringTk::intToStr(*fd) + " " +
|
||||
"OpenFlags: " + StringTk::intToStr(sessionLocalFile->getOpenFlags() ) + " " +
|
||||
"received: " + StringTk::intToStr(recvRes) + ".");
|
||||
|
||||
incrementalRecvPadding(ctx, writeState.toBeReceived, sessionLocalFile);
|
||||
|
||||
return -FhgfsOpsErrTk::fromSysErr(errCode);
|
||||
}
|
||||
else
|
||||
{ // wrote only a part of the data, not all of it
|
||||
LogContext(logContext).log(Log_WARNING,
|
||||
"Unable to write all of the received data. "
|
||||
"target: " + StringTk::uintToStr(sessionLocalFile->getTargetID() ) + "; "
|
||||
"file: " + sessionLocalFile->getFileID() + "; "
|
||||
"sysErr: " + System::getErrString(errCode) );
|
||||
|
||||
incrementalRecvPadding(ctx, writeState.toBeReceived, sessionLocalFile);
|
||||
|
||||
// return bytes received so far minus num bytes that were not written with last write
|
||||
return (getCount() - writeState.toBeReceived) - (recvRes - writeRes);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
writeState.writeOffset += writeRes;
|
||||
recvRes = writeStateNext(writeState, writeRes);
|
||||
if (recvRes != 0)
|
||||
return recvRes;
|
||||
} while(writeState.toBeReceived);
|
||||
|
||||
LOG_DEBUG(logContext, Log_SPAM,
|
||||
std::string("Received and wrote all the data") );
|
||||
|
||||
// commit to storage device queue...
|
||||
|
||||
if (useSyncRange)
|
||||
{
|
||||
// advise kernel to commit written data to storage device in max_sectors_kb chunks.
|
||||
|
||||
/* note: this is async if there are free slots in the request queue
|
||||
/sys/block/<...>/nr_requests. (optimal_io_size is not honoured as of linux-3.4) */
|
||||
|
||||
off64_t syncSize = sessionLocalFile->getWriteCounter();
|
||||
off64_t syncOffset = getOffset() + getCount() - syncSize;
|
||||
|
||||
MsgHelperIO::syncFileRange(*fd, syncOffset, syncSize);
|
||||
sessionLocalFile->resetWriteCounter();
|
||||
}
|
||||
|
||||
return getCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Write until everything was written (handle short-writes) or an error occured
|
||||
*/
|
||||
template <class Msg, typename WriteState>
|
||||
ssize_t WriteLocalFileMsgExBase<Msg, WriteState>::doWrite(int fd, char* buf, size_t count, off_t offset, int& outErrno)
|
||||
{
|
||||
size_t sumWriteRes = 0;
|
||||
|
||||
do
|
||||
{
|
||||
ssize_t writeRes =
|
||||
MsgHelperIO::pwrite(fd, buf + sumWriteRes, count - sumWriteRes, offset + sumWriteRes);
|
||||
|
||||
if (unlikely(writeRes == -1) )
|
||||
{
|
||||
sumWriteRes = (sumWriteRes > 0) ? sumWriteRes : writeRes;
|
||||
outErrno = errno;
|
||||
break;
|
||||
}
|
||||
|
||||
sumWriteRes += writeRes;
|
||||
|
||||
} while (sumWriteRes != count);
|
||||
|
||||
return sumWriteRes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Receive and discard data.
|
||||
*/
|
||||
template <class Msg, typename WriteState>
|
||||
void WriteLocalFileMsgExBase<Msg, WriteState>::incrementalRecvPadding(NetMessage::ResponseContext& ctx,
|
||||
int64_t padLen, SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
uint64_t toBeReceived = padLen;
|
||||
|
||||
while(toBeReceived)
|
||||
{
|
||||
ssize_t recvRes = recvPadding(ctx, toBeReceived);
|
||||
if (recvRes == -1)
|
||||
break;
|
||||
// forward to mirror...
|
||||
|
||||
FhgfsOpsErr mirrorRes = sendToMirror(ctx.getBuffer(), recvRes,
|
||||
getOffset() + padLen - toBeReceived, toBeReceived, sessionLocalFile);
|
||||
if(unlikely(mirrorRes != FhgfsOpsErr_SUCCESS) )
|
||||
{ // mirroring failed
|
||||
/* ... but if we are in this method, then something went wrong anyways, so don't set
|
||||
needs-resync here or report any error to caller. */
|
||||
}
|
||||
|
||||
toBeReceived -= recvRes;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Msg, typename WriteState>
|
||||
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::openFile(const StorageTarget& target,
|
||||
SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (write incremental)";
|
||||
|
||||
bool useQuota = isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA);
|
||||
bool enforceQuota = Program::getApp()->getConfig()->getQuotaEnableEnforcement();
|
||||
|
||||
bool isBuddyMirrorChunk = sessionLocalFile->getIsMirrorSession();
|
||||
|
||||
|
||||
if (sessionLocalFile->getFD().valid())
|
||||
return FhgfsOpsErr_SUCCESS; // file already open => nothing to be done here
|
||||
|
||||
|
||||
// file not open yet => get targetFD and check consistency state
|
||||
|
||||
const auto consistencyState = target.getConsistencyState();
|
||||
const int targetFD = isBuddyMirrorChunk ? *target.getMirrorFD() : *target.getChunkFD();
|
||||
|
||||
if(unlikely(consistencyState != TargetConsistencyState_GOOD) &&
|
||||
isBuddyMirrorChunk &&
|
||||
!isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND) )
|
||||
{ // this is a request for a buddymirrored chunk on a non-good primary
|
||||
LogContext(logContext).log(Log_NOTICE, "Refusing request. Target consistency is not good. "
|
||||
"targetID: " + StringTk::uintToStr(target.getID()));
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
SessionQuotaInfo quotaInfo(useQuota, enforceQuota, getUserID(), getGroupID() );
|
||||
|
||||
FhgfsOpsErr openChunkRes = sessionLocalFile->openFile(targetFD, getPathInfo(), true, "aInfo);
|
||||
|
||||
return openChunkRes;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Prepares mirroring by storing mirrorNode reference in file session and setting the mirrorToSock
|
||||
* member variable.
|
||||
*
|
||||
* Note: Mirror node reference needs to be released on file session close.
|
||||
*
|
||||
* @param buf used to send initial write msg header to mirror.
|
||||
* @param requestorSock used to receive padding if mirroring fails.
|
||||
* @return FhgfsOpsErr_COMMUNICATION if communication with mirror failed.
|
||||
*/
|
||||
template <class Msg, typename WriteState>
|
||||
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::prepareMirroring(char* buf, size_t bufLen,
|
||||
SessionLocalFile* sessionLocalFile, StorageTarget& target)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (prepare mirroring)";
|
||||
|
||||
// check if mirroring is enabled
|
||||
|
||||
if(!isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_FORWARD) )
|
||||
return FhgfsOpsErr_SUCCESS;
|
||||
|
||||
App* app = Program::getApp();
|
||||
MirrorBuddyGroupMapper* mirrorBuddies = app->getMirrorBuddyGroupMapper();
|
||||
TargetStateStore* targetStates = app->getTargetStateStore();
|
||||
|
||||
// check if secondary is offline or in unclear state
|
||||
|
||||
uint16_t secondaryTargetID = mirrorBuddies->getSecondaryTargetID(getTargetID() );
|
||||
if(unlikely(!secondaryTargetID) )
|
||||
{
|
||||
LogContext(logContext).logErr("Invalid mirror buddy group ID: " +
|
||||
StringTk::uintToStr(getTargetID() ) );
|
||||
|
||||
return FhgfsOpsErr_UNKNOWNTARGET;
|
||||
}
|
||||
|
||||
CombinedTargetState secondaryState;
|
||||
|
||||
bool getSecondaryStateRes = targetStates->getState(secondaryTargetID, secondaryState);
|
||||
if(unlikely(!getSecondaryStateRes) )
|
||||
{
|
||||
LOG_DEBUG(logContext, Log_DEBUG,
|
||||
"Refusing request. Secondary target has invalid state. "
|
||||
"targetID: " + StringTk::uintToStr(secondaryTargetID) );
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
if( (secondaryState.reachabilityState != TargetReachabilityState_ONLINE) ||
|
||||
(secondaryState.consistencyState != TargetConsistencyState_GOOD) )
|
||||
{
|
||||
if(secondaryState.reachabilityState == TargetReachabilityState_OFFLINE)
|
||||
{ // buddy is offline => mark needed resync and continue with local operation
|
||||
LOG_DEBUG(logContext, Log_DEBUG,
|
||||
"Secondary is offline and will need resync. "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
|
||||
|
||||
// buddy is marked offline, so local msg processing will be done and buddy needs resync
|
||||
|
||||
target.setBuddyNeedsResync(true);
|
||||
|
||||
return FhgfsOpsErr_SUCCESS;
|
||||
}
|
||||
|
||||
if(secondaryState.consistencyState != TargetConsistencyState_NEEDS_RESYNC)
|
||||
{ // unclear buddy state => client must try again
|
||||
LOG_DEBUG(logContext, Log_DEBUG,
|
||||
"Unclear secondary state, caller will have to try again later. "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// store mirror node reference in session...
|
||||
|
||||
NodeHandle mirrorToNode = sessionLocalFile->getMirrorNode();
|
||||
|
||||
if(!mirrorToNode)
|
||||
{
|
||||
NodeStoreServers* storageNodes = app->getStorageNodes();
|
||||
TargetMapper* targetMapper = app->getTargetMapper();
|
||||
FhgfsOpsErr referenceErr;
|
||||
|
||||
mirrorToNode = storageNodes->referenceNodeByTargetID(secondaryTargetID, targetMapper,
|
||||
&referenceErr);
|
||||
|
||||
if(unlikely(referenceErr != FhgfsOpsErr_SUCCESS) )
|
||||
{
|
||||
LogContext(logContext).logErr(
|
||||
"Unable to forward to mirror target: " + StringTk::uintToStr(secondaryTargetID) + "; "
|
||||
"Error: " + boost::lexical_cast<std::string>(referenceErr));
|
||||
return referenceErr;
|
||||
}
|
||||
|
||||
mirrorToNode = sessionLocalFile->setMirrorNodeExclusive(mirrorToNode);
|
||||
}
|
||||
|
||||
// send initial write msg header to mirror (retry loop)...
|
||||
|
||||
for( ; ; )
|
||||
{
|
||||
try
|
||||
{
|
||||
// acquire connection to mirror node and send write msg...
|
||||
|
||||
mirrorToSock = mirrorToNode->getConnPool()->acquireStreamSocket();
|
||||
|
||||
WriteLocalFileMsg mirrorWriteMsg(getClientNumID(), getFileHandleID(), getTargetID(),
|
||||
getPathInfo(), getAccessFlags(), getOffset(), getCount());
|
||||
|
||||
if(doSessionCheck() )
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
|
||||
|
||||
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) )
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_DISABLE_IO);
|
||||
|
||||
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA) )
|
||||
mirrorWriteMsg.setUserdataForQuota(getUserID(), getGroupID() );
|
||||
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND);
|
||||
|
||||
unsigned msgLength = mirrorWriteMsg.serializeMessage(buf, bufLen).second;
|
||||
mirrorToSock->send(buf, msgLength, 0);
|
||||
|
||||
return FhgfsOpsErr_SUCCESS;
|
||||
}
|
||||
catch(SocketConnectException& e)
|
||||
{
|
||||
LogContext(logContext).log(Log_CRITICAL, "Unable to connect to mirror node: " +
|
||||
mirrorToNode->getNodeIDWithTypeStr() + "; "
|
||||
"Msg: " + e.what() );
|
||||
}
|
||||
catch(SocketException& e)
|
||||
{
|
||||
LogContext(logContext).log(Log_CRITICAL, "Communication with mirror node failed: " +
|
||||
mirrorToNode->getNodeIDWithTypeStr() + "; "
|
||||
"Msg: " + e.what() );
|
||||
|
||||
if(mirrorToSock)
|
||||
mirrorToNode->getConnPool()->invalidateStreamSocket(mirrorToSock);
|
||||
|
||||
mirrorToSock = NULL;
|
||||
}
|
||||
|
||||
// error occurred if we got here
|
||||
|
||||
if(!mirrorRetriesLeft)
|
||||
break;
|
||||
|
||||
mirrorRetriesLeft--;
|
||||
|
||||
// next round will be a retry
|
||||
LogContext(logContext).log(Log_NOTICE, "Retrying mirror communication: " +
|
||||
mirrorToNode->getNodeIDWithTypeStr() );
|
||||
|
||||
} // end of retry for-loop
|
||||
|
||||
|
||||
// all retries exhausted if we got here
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send file contents to mirror.
|
||||
*
|
||||
* Note: Supports retries only at beginning of write msg.
|
||||
*
|
||||
* @param buf the buffer that should be sent to the mirror.
|
||||
* @param offset the offset within the chunk file (only used if communication fails and we need to
|
||||
* start over with a new WriteMsg to the mirror).
|
||||
* @param toBeMirrored total remaining mirror data including given bufLen (only used for retries).
|
||||
* @return FhgfsOpsErr_COMMUNICATION if mirroring fails.
|
||||
*/
|
||||
template <class Msg, typename WriteState>
|
||||
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::sendToMirror(const char* buf, size_t bufLen,
|
||||
int64_t offset, int64_t toBeMirrored, SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (send to mirror)";
|
||||
|
||||
// check if mirroring enabled
|
||||
|
||||
if(!mirrorToSock)
|
||||
return FhgfsOpsErr_SUCCESS; // either no mirroring enabled or all retries exhausted
|
||||
|
||||
bool isRetryRound = false;
|
||||
|
||||
// send raw data (retry loop)...
|
||||
// (note: if sending fails, retrying requires sending of a new WriteMsg)
|
||||
|
||||
for( ; ; )
|
||||
{
|
||||
try
|
||||
{
|
||||
if(unlikely(isRetryRound) )
|
||||
{ // retry requires reconnect and resend of write msg with current offset
|
||||
|
||||
auto mirrorToNode = sessionLocalFile->getMirrorNode();
|
||||
|
||||
mirrorToSock = mirrorToNode->getConnPool()->acquireStreamSocket();
|
||||
|
||||
WriteLocalFileMsg mirrorWriteMsg(getClientNumID(), getFileHandleID(),
|
||||
getTargetID(), getPathInfo(), getAccessFlags(), offset, toBeMirrored);
|
||||
|
||||
if(doSessionCheck() )
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
|
||||
|
||||
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_DISABLE_IO) )
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_DISABLE_IO);
|
||||
|
||||
if(isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_USE_QUOTA) )
|
||||
mirrorWriteMsg.setUserdataForQuota(getUserID(), getGroupID() );
|
||||
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR);
|
||||
mirrorWriteMsg.addMsgHeaderFeatureFlag(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR_SECOND);
|
||||
|
||||
const auto mirrorBuf = MessagingTk::createMsgVec(mirrorWriteMsg);
|
||||
|
||||
mirrorToSock->send(&mirrorBuf[0], mirrorBuf.size(), 0);
|
||||
}
|
||||
|
||||
mirrorToSock->send(buf, bufLen, 0);
|
||||
|
||||
return FhgfsOpsErr_SUCCESS;
|
||||
}
|
||||
catch(SocketConnectException& e)
|
||||
{
|
||||
auto mirrorToNode = sessionLocalFile->getMirrorNode();
|
||||
|
||||
LogContext(logContext).log(Log_CRITICAL, "Unable to connect to mirror node: " +
|
||||
mirrorToNode->getNodeIDWithTypeStr() + "; "
|
||||
"Msg: " + e.what() );
|
||||
}
|
||||
catch(SocketException& e)
|
||||
{
|
||||
LogContext(logContext).log(Log_CRITICAL, "Communication with mirror node failed: " +
|
||||
sessionLocalFile->getMirrorNode()->getNodeIDWithTypeStr() + "; "
|
||||
"Msg: " + e.what() );
|
||||
|
||||
if(mirrorToSock)
|
||||
sessionLocalFile->getMirrorNode()->getConnPool()->invalidateStreamSocket(mirrorToSock);
|
||||
|
||||
mirrorToSock = NULL;
|
||||
}
|
||||
|
||||
// error occurred if we got here
|
||||
|
||||
if(!mirrorRetriesLeft)
|
||||
break;
|
||||
|
||||
// only allow retries if we're still at the beginning of the write msg.
|
||||
/* (this is because later we don't have all the client data available; and without the mirror
|
||||
response we don't know for sure whether previously sent data was really written or not.) */
|
||||
if(toBeMirrored != getCount() )
|
||||
break;
|
||||
|
||||
mirrorRetriesLeft--;
|
||||
|
||||
// next round will be a retry
|
||||
LogContext(logContext).log(Log_NOTICE, "Retrying mirror communication: " +
|
||||
sessionLocalFile->getMirrorNode()->getNodeIDWithTypeStr() );
|
||||
|
||||
isRetryRound = true;
|
||||
|
||||
} // end of retry for-loop
|
||||
|
||||
// all retries exhausted if we got here
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
/**
|
||||
* Receive response from mirror node, check result, clean up (release mirror sock).
|
||||
*
|
||||
* Note: Does not do retries on communication errors
|
||||
*/
|
||||
template <class Msg, typename WriteState>
|
||||
FhgfsOpsErr WriteLocalFileMsgExBase<Msg, WriteState>::finishMirroring(SessionLocalFile* sessionLocalFile,
|
||||
StorageTarget& target)
|
||||
{
|
||||
std::string logContext = Msg::logContextPref + " (finish mirroring)";
|
||||
|
||||
// check if mirroring enabled
|
||||
|
||||
if(!mirrorToSock)
|
||||
return FhgfsOpsErr_SUCCESS; // mirroring disabled
|
||||
|
||||
App* app = Program::getApp();
|
||||
auto mirrorToNode = sessionLocalFile->getMirrorNode();
|
||||
|
||||
WriteLocalFileRespMsg* writeRespMsg;
|
||||
int64_t mirrorWriteRes;
|
||||
|
||||
|
||||
// receive write msg response from mirror...
|
||||
/* note: we don't have the file contents that were sent by the client anymore at this point, so
|
||||
we cannot do retries here with a new WriteMsg. */
|
||||
|
||||
try
|
||||
{
|
||||
// receive write msg response...
|
||||
|
||||
auto resp = MessagingTk::recvMsgBuf(*mirrorToSock);
|
||||
if (resp.empty())
|
||||
{ // error
|
||||
LogContext(logContext).log(Log_WARNING,
|
||||
"Failed to receive response from mirror: " + mirrorToSock->getPeername() );
|
||||
|
||||
goto cleanup_commerr;
|
||||
}
|
||||
|
||||
// got response => deserialize it...
|
||||
|
||||
auto respMsg = app->getNetMessageFactory()->createFromBuf(std::move(resp));
|
||||
|
||||
if(unlikely(respMsg->getMsgType() != NETMSGTYPE_WriteLocalFileResp) )
|
||||
{ // response invalid (wrong msgType)
|
||||
LogContext(logContext).logErr(
|
||||
"Received invalid response type: " + StringTk::intToStr(respMsg->getMsgType() ) +"; "
|
||||
"expected type: " + StringTk::intToStr(NETMSGTYPE_WriteLocalFileResp) + ". "
|
||||
"Disconnecting: " + mirrorToSock->getPeername() );
|
||||
|
||||
goto cleanup_commerr;
|
||||
}
|
||||
|
||||
// check mirror result and release mirror socket...
|
||||
|
||||
mirrorToNode->getConnPool()->releaseStreamSocket(mirrorToSock);
|
||||
|
||||
writeRespMsg = (WriteLocalFileRespMsg*)respMsg.get();
|
||||
mirrorWriteRes = writeRespMsg->getValue();
|
||||
|
||||
if(likely(mirrorWriteRes == getCount() ) )
|
||||
return FhgfsOpsErr_SUCCESS; // mirror successfully wrote all of the data
|
||||
|
||||
if(mirrorWriteRes >= 0)
|
||||
{ // mirror only wrote a part of the data
|
||||
LogContext(logContext).log(Log_WARNING,
|
||||
"Mirror did not write all of the data (no space left); "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) + "; "
|
||||
"fileHandle: " + sessionLocalFile->getFileHandleID() );
|
||||
|
||||
return FhgfsOpsErr_NOSPACE;
|
||||
}
|
||||
|
||||
if(mirrorWriteRes == -FhgfsOpsErr_UNKNOWNTARGET)
|
||||
{
|
||||
/* local msg processing shall be done and buddy needs resync
|
||||
(this is normal when a storage is restarted without a broken secondary target, so we
|
||||
report success to a client in this case) */
|
||||
|
||||
LogContext(logContext).log(Log_DEBUG,
|
||||
"Secondary reports unknown target error and will need resync. "
|
||||
"mirror buddy group ID: " + StringTk::uintToStr(getTargetID() ) );
|
||||
|
||||
target.setBuddyNeedsResync(true);
|
||||
|
||||
return FhgfsOpsErr_SUCCESS;
|
||||
}
|
||||
|
||||
if(mirrorWriteRes == -FhgfsOpsErr_STORAGE_SRV_CRASHED)
|
||||
LogContext(logContext).log(Log_NOTICE, "Potential cache loss for open file handle. "
|
||||
"(Mirror server crash detected.) "
|
||||
"FileHandleID: " + sessionLocalFile->getFileHandleID() + "; "
|
||||
"Mirror: " + mirrorToNode->getNodeIDWithTypeStr() );
|
||||
|
||||
// mirror encountered an error
|
||||
return (FhgfsOpsErr)-mirrorWriteRes; // write response contains negative fhgfs error code
|
||||
|
||||
}
|
||||
catch(SocketException& e)
|
||||
{
|
||||
LogContext(logContext).logErr(std::string("SocketException: ") + e.what() );
|
||||
LogContext(logContext).log(Log_WARNING, "Additional info: "
|
||||
"mirror node: " + mirrorToNode->getNodeIDWithTypeStr() + "; "
|
||||
"fileHandle: " + sessionLocalFile->getFileHandleID() );
|
||||
}
|
||||
|
||||
|
||||
// cleanup after communication error...
|
||||
|
||||
cleanup_commerr:
|
||||
mirrorToNode->getConnPool()->invalidateStreamSocket(mirrorToSock);
|
||||
|
||||
return FhgfsOpsErr_COMMUNICATION;
|
||||
}
|
||||
|
||||
template <class Msg, typename WriteState>
|
||||
bool WriteLocalFileMsgExBase<Msg, WriteState>::doSessionCheck()
|
||||
{ // do session check only when it is not a mirror session
|
||||
return isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_BUDDYMIRROR) ? false :
|
||||
isMsgHeaderFeatureFlagSet(WRITELOCALFILEMSG_FLAG_SESSION_CHECK);
|
||||
}
|
||||
213
storage/source/net/message/session/rw/WriteLocalFileMsgEx.h
Normal file
213
storage/source/net/message/session/rw/WriteLocalFileMsgEx.h
Normal file
@@ -0,0 +1,213 @@
|
||||
#pragma once
|
||||
|
||||
#include <common/net/message/session/rw/WriteLocalFileMsg.h>
|
||||
#include <common/net/message/session/rw/WriteLocalFileRespMsg.h>
|
||||
#include <session/SessionLocalFile.h>
|
||||
#include <common/storage/StorageErrors.h>
|
||||
|
||||
|
||||
#define WRITEMSG_MIRROR_RETRIES_NUM 1
|
||||
|
||||
class StorageTarget;
|
||||
|
||||
/**
|
||||
* Contains common data needed by implementations of the network protocol
|
||||
* that receive data from the client.
|
||||
*/
|
||||
struct WriteStateBase
|
||||
{
|
||||
const char* logContext;
|
||||
ssize_t exactStaticRecvSize;
|
||||
ssize_t recvLength;
|
||||
int64_t toBeReceived;
|
||||
off_t writeOffset;
|
||||
SessionLocalFile* sessionLocalFile;
|
||||
|
||||
WriteStateBase(const char* logContext, ssize_t exactStaticRecvSize,
|
||||
int64_t toBeReceived, off_t writeOffset, SessionLocalFile* sessionLocalFile)
|
||||
{
|
||||
this->logContext = logContext;
|
||||
this->exactStaticRecvSize = exactStaticRecvSize;
|
||||
this->toBeReceived = toBeReceived;
|
||||
this->writeOffset = writeOffset;
|
||||
this->sessionLocalFile = sessionLocalFile;
|
||||
recvLength = BEEGFS_MIN(exactStaticRecvSize, toBeReceived);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <class Msg, typename WriteState>
|
||||
class WriteLocalFileMsgExBase : public Msg
|
||||
{
|
||||
|
||||
private:
|
||||
Socket* mirrorToSock;
|
||||
unsigned mirrorRetriesLeft;
|
||||
|
||||
public:
|
||||
bool processIncoming(NetMessage::ResponseContext& ctx);
|
||||
|
||||
WriteLocalFileMsgExBase() : Msg()
|
||||
{
|
||||
mirrorToSock = NULL;
|
||||
mirrorRetriesLeft = WRITEMSG_MIRROR_RETRIES_NUM;
|
||||
}
|
||||
|
||||
private:
|
||||
std::pair<bool, int64_t> write(NetMessage::ResponseContext& ctx);
|
||||
|
||||
ssize_t doWrite(int fd, char* buf, size_t count, off_t offset, int& outErrno);
|
||||
|
||||
FhgfsOpsErr openFile(const StorageTarget& target, SessionLocalFile* sessionLocalFile);
|
||||
|
||||
FhgfsOpsErr prepareMirroring(char* buf, size_t bufLen,
|
||||
SessionLocalFile* sessionLocalFile, StorageTarget& target);
|
||||
FhgfsOpsErr sendToMirror(const char* buf, size_t bufLen, int64_t offset, int64_t toBeMirrored,
|
||||
SessionLocalFile* sessionLocalFile);
|
||||
FhgfsOpsErr finishMirroring(SessionLocalFile* sessionLocalFile, StorageTarget& target);
|
||||
|
||||
bool doSessionCheck();
|
||||
|
||||
int64_t incrementalRecvAndWriteStateful(NetMessage::ResponseContext& ctx,
|
||||
SessionLocalFile* sessionLocalFile);
|
||||
|
||||
void incrementalRecvPadding(NetMessage::ResponseContext& ctx, int64_t padLen,
|
||||
SessionLocalFile* sessionLocalFile);
|
||||
|
||||
inline ssize_t recvPadding(NetMessage::ResponseContext& ctx, int64_t toBeReceived)
|
||||
{
|
||||
return static_cast<Msg&>(*this).recvPadding(ctx, toBeReceived);
|
||||
}
|
||||
|
||||
inline void sendResponse(NetMessage::ResponseContext& ctx, int err)
|
||||
{
|
||||
return static_cast<Msg&>(*this).sendResponse(ctx, err);
|
||||
}
|
||||
|
||||
inline bool writeStateInit(WriteState& ws)
|
||||
{
|
||||
return static_cast<Msg&>(*this).writeStateInit(ws);
|
||||
}
|
||||
|
||||
inline ssize_t writeStateRecvData(NetMessage::ResponseContext& ctx, WriteState& ws)
|
||||
{
|
||||
return static_cast<Msg&>(*this).writeStateRecvData(ctx, ws);
|
||||
}
|
||||
|
||||
inline size_t writeStateNext(WriteState& ws, ssize_t writeRes)
|
||||
{
|
||||
return static_cast<Msg&>(*this).writeStateNext(ws, writeRes);
|
||||
}
|
||||
|
||||
public:
|
||||
inline bool isMsgValid() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).isMsgValid();
|
||||
}
|
||||
|
||||
inline bool isMsgHeaderFeatureFlagSet(unsigned flag) const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).isMsgHeaderFeatureFlagSet(flag);
|
||||
}
|
||||
|
||||
inline unsigned getMsgHeaderUserID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getMsgHeaderUserID();
|
||||
}
|
||||
|
||||
inline uint16_t getTargetID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getTargetID();
|
||||
}
|
||||
|
||||
inline int64_t getOffset() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getOffset();
|
||||
}
|
||||
|
||||
inline unsigned getUserID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getUserID();
|
||||
}
|
||||
|
||||
inline unsigned getGroupID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getGroupID();
|
||||
}
|
||||
|
||||
inline int64_t getCount() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getCount();
|
||||
}
|
||||
|
||||
inline const char* getFileHandleID()
|
||||
{
|
||||
return static_cast<Msg&>(*this).getFileHandleID();
|
||||
}
|
||||
|
||||
inline NumNodeID getClientNumID() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getClientNumID();
|
||||
}
|
||||
|
||||
inline unsigned getAccessFlags() const
|
||||
{
|
||||
return static_cast<const Msg&>(*this).getAccessFlags();
|
||||
}
|
||||
|
||||
inline PathInfo* getPathInfo ()
|
||||
{
|
||||
return static_cast<Msg&>(*this).getPathInfo();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Implements the recv protocol.
|
||||
*/
|
||||
class WriteLocalFileMsgSender : public WriteLocalFileMsg
|
||||
{
|
||||
public:
|
||||
struct WriteState : public WriteStateBase
|
||||
{
|
||||
WriteState(const char* logContext, ssize_t exactStaticRecvSize,
|
||||
int64_t toBeReceived, off_t writeOffset, SessionLocalFile* sessionLocalFile) :
|
||||
WriteStateBase(logContext, exactStaticRecvSize, toBeReceived, writeOffset,
|
||||
sessionLocalFile) {}
|
||||
};
|
||||
|
||||
private:
|
||||
friend class WriteLocalFileMsgExBase<WriteLocalFileMsgSender, WriteState>;
|
||||
|
||||
static const std::string logContextPref;
|
||||
|
||||
ssize_t recvPadding(ResponseContext& ctx, int64_t toBeReceived);
|
||||
|
||||
inline void sendResponse(ResponseContext& ctx, int err)
|
||||
{
|
||||
ctx.sendResponse(WriteLocalFileRespMsg(err));
|
||||
}
|
||||
|
||||
inline bool writeStateInit(WriteState& ws)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
inline ssize_t writeStateRecvData(ResponseContext& ctx, WriteState& ws)
|
||||
{
|
||||
AbstractApp* app = PThread::getCurrentThreadApp();
|
||||
int connMsgMediumTimeout = app->getCommonConfig()->getConnMsgMediumTimeout();
|
||||
ws.recvLength = BEEGFS_MIN(ws.exactStaticRecvSize, ws.toBeReceived);
|
||||
return ctx.getSocket()->recvExactT(ctx.getBuffer(), ws.recvLength, 0, connMsgMediumTimeout);
|
||||
}
|
||||
|
||||
inline size_t writeStateNext(WriteState& ws, ssize_t writeRes)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// the write msg implementation that receives its payload via the socket-based protocol
using WriteLocalFileMsgEx =
   WriteLocalFileMsgExBase<WriteLocalFileMsgSender, WriteLocalFileMsgSender::WriteState>;
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef BEEGFS_NVFS
|
||||
#include <common/net/message/session/rw/WriteLocalFileRDMAMsg.h>
|
||||
#include <common/net/message/session/rw/WriteLocalFileRDMARespMsg.h>
|
||||
#include <common/components/worker/Worker.h>
|
||||
#include <session/SessionLocalFile.h>
|
||||
#include <common/storage/StorageErrors.h>
|
||||
#include "WriteLocalFileMsgEx.h"
|
||||
|
||||
|
||||
/**
|
||||
* Implements RDMA read protocol.
|
||||
*/
|
||||
class WriteLocalFileRDMAMsgSender : public WriteLocalFileRDMAMsg
|
||||
{
|
||||
public:
|
||||
struct WriteState : public WriteStateBase
|
||||
{
|
||||
RdmaInfo* rdma;
|
||||
uint64_t rBuf;
|
||||
size_t rLen;
|
||||
uint64_t rOff;
|
||||
int64_t recvSize;
|
||||
|
||||
WriteState(const char* logContext, ssize_t exactStaticRecvSize,
|
||||
int64_t toBeReceived, off_t writeOffset, SessionLocalFile* sessionLocalFile) :
|
||||
WriteStateBase(logContext, exactStaticRecvSize, toBeReceived, writeOffset,
|
||||
sessionLocalFile)
|
||||
{
|
||||
recvSize = toBeReceived;
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
friend class WriteLocalFileMsgExBase<WriteLocalFileRDMAMsgSender, WriteState>;
|
||||
|
||||
static const std::string logContextPref;
|
||||
|
||||
ssize_t recvPadding(ResponseContext& ctx, int64_t toBeReceived);
|
||||
|
||||
inline void sendResponse(ResponseContext& ctx, int err)
|
||||
{
|
||||
ctx.sendResponse(WriteLocalFileRDMARespMsg(err));
|
||||
}
|
||||
|
||||
inline bool writeStateInit(WriteState& ws)
|
||||
{
|
||||
ws.rdma = getRdmaInfo();
|
||||
if (unlikely(!ws.rdma->next(ws.rBuf, ws.rLen, ws.rOff)))
|
||||
{
|
||||
LogContext(ws.logContext).logErr("No entities in RDMA buffers.");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline ssize_t writeStateRecvData(ResponseContext& ctx, WriteState& ws)
|
||||
{
|
||||
// Cannot RDMA anything larger than WORKER_BUFIN_SIZE in a single operation
|
||||
// because that is the size of the buffer passed in by the Worker.
|
||||
// TODO: pass around a Buffer with a length instead of unqualified char*.
|
||||
ws.recvLength = BEEGFS_MIN(
|
||||
BEEGFS_MIN(
|
||||
BEEGFS_MIN(ws.exactStaticRecvSize, ws.toBeReceived),
|
||||
(ssize_t)(ws.rLen - ws.rOff)),
|
||||
WORKER_BUFIN_SIZE);
|
||||
return ctx.getSocket()->read(ctx.getBuffer(), ws.recvLength, 0, ws.rBuf + ws.rOff, ws.rdma->key);
|
||||
}
|
||||
|
||||
inline size_t writeStateNext(WriteState& ws, ssize_t writeRes)
|
||||
{
|
||||
ws.rOff += writeRes;
|
||||
if (ws.toBeReceived > 0 && ws.rOff == ws.rLen)
|
||||
{
|
||||
if (unlikely(!ws.rdma->next(ws.rBuf, ws.rLen, ws.rOff)))
|
||||
{
|
||||
LogContext(ws.logContext).logErr("RDMA buffers expended but not all data received. toBeReceived=" +
|
||||
StringTk::uint64ToStr(ws.toBeReceived) + "; "
|
||||
"target: " + StringTk::uintToStr(ws.sessionLocalFile->getTargetID() ) + "; "
|
||||
"file: " + ws.sessionLocalFile->getFileID() + "; ");
|
||||
return ws.recvSize - ws.toBeReceived;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// the write msg implementation that receives its payload via the RDMA-based protocol
using WriteLocalFileRDMAMsgEx =
   WriteLocalFileMsgExBase<WriteLocalFileRDMAMsgSender, WriteLocalFileRDMAMsgSender::WriteState>;
|
||||
|
||||
#endif /* BEEGFS_NVFS */
|
||||
|
||||
Reference in New Issue
Block a user