#include <program/Program.h>
#include <common/components/worker/IncSyncedCounterWork.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesMsg.h>
#include <common/net/message/nodes/SetTargetConsistencyStatesRespMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedMsg.h>
#include <common/net/message/storage/mirroring/StorageResyncStartedRespMsg.h>
#include <common/threading/Barrier.h>
#include <common/toolkit/DebugVariable.h>
#include <common/toolkit/SynchronizedCounter.h>
#include <app/App.h>
#include <components/buddyresyncer/BuddyResyncerBulkSyncSlave.h>
#include <components/buddyresyncer/BuddyResyncerModSyncSlave.h>
#include <components/worker/BarrierWork.h>
#include <toolkit/BuddyCommTk.h>
#include "BuddyResyncJob.h"
BuddyResyncJob::BuddyResyncJob() :
PThread("BuddyResyncJob"),
state(BuddyResyncJobState_NOTSTARTED),
startTime(0), endTime(0),
gatherSlave(boost::make_unique<BuddyResyncerGatherSlave>(&syncCandidates))
{
App* app = Program::getApp();
Config* cfg = app->getConfig();
buddyNodeID =
NumNodeID(app->getMetaBuddyGroupMapper()->getBuddyTargetID(app->getLocalNodeNumID().val()));
const unsigned numSyncSlaves = std::max<unsigned>(cfg->getTuneNumResyncSlaves(), 1);
for (size_t i = 0; i < numSyncSlaves; i++)
bulkSyncSlaves.emplace_back(
boost::make_unique<BuddyResyncerBulkSyncSlave>(*this, &syncCandidates, i, buddyNodeID));
sessionStoreResyncer = boost::make_unique<SessionStoreResyncer>(buddyNodeID);
modSyncSlave = boost::make_unique<BuddyResyncerModSyncSlave>(*this, &syncCandidates, 1, buddyNodeID);
}
BuddyResyncJob::~BuddyResyncJob() = default;
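/**
* Main resync routine: suspends the worker threads, notifies the buddy that a resync has started,
* starts the gather and sync slaves, releases the workers, joins the slaves, suspends the workers
* again to resync the session store in a quiescent state, and finally reports the resulting
* consistency state to the buddy and the management node.
*/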
void BuddyResyncJob::run()
{
const char* logContext = "Run resync job";
InternodeSyncer* internodeSyncer = Program::getApp()->getInternodeSyncer();
App* app = Program::getApp();
WorkerList* workers = app->getWorkers();
NodeStore* metaNodes = app->getMetaNodes();
const std::string metaPath = app->getMetaPath();
const std::string metaBuddyMirPath = app->getMetaPath() + "/" + CONFIG_BUDDYMIRROR_SUBDIR_NAME;
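// barrier shared by all worker threads plus this resync thread; used below to suspend and resume
// the workers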
Barrier workerBarrier(workers->size() + 1);
bool workersStopped = false;
startTime = time(NULL);
syncCandidates.clear();
auto buddyNode = metaNodes->referenceNode(buddyNodeID);
if (!buddyNode)
{
LOG(MIRRORING, ERR, "Unable to resolve buddy node. Resync will not start.");
setState(BuddyResyncJobState_FAILURE);
goto cleanup;
}
DEBUG_ENV_VAR(unsigned, DIE_AT_RESYNC_N, 0, "BEEGFS_RESYNC_DIE_AT_N");
if (DIE_AT_RESYNC_N) {
static unsigned resyncs = 0;
// for #479: terminating a server at this point caused the workers to terminate before the
// resyncer had communicated with them, causing a deadlock on shutdown
if (++resyncs == DIE_AT_RESYNC_N) {
::kill(0, SIGTERM);
sleep(4);
}
}
stopAllWorkersOn(workerBarrier);
{
// Notify buddy that resync started and wait for confirmation
StorageResyncStartedMsg msg(buddyNodeID.val());
const auto respMsg = MessagingTk::requestResponse(*buddyNode, msg,
NETMSGTYPE_StorageResyncStartedResp);
if (!respMsg)
{
LogContext(logContext).logErr("Unable to notify buddy about resync attempt. "
"Resync will not start.");
setState(BuddyResyncJobState_FAILURE);
workerBarrier.wait();
goto cleanup;
}
// resync could have been aborted before we got here. if so, exit as soon as possible without
// setting the resync job state to something else.
{
std::unique_lock<Mutex> lock(stateMutex);
if (state == BuddyResyncJobState_INTERRUPTED)
{
lock.unlock();
workerBarrier.wait();
goto cleanup;
}
state = BuddyResyncJobState_RUNNING;
}
internodeSyncer->setResyncInProgress(true);
const bool startGatherSlaveRes = startGatherSlaves();
if (!startGatherSlaveRes)
{
setState(BuddyResyncJobState_FAILURE);
workerBarrier.wait();
goto cleanup;
}
const bool startResyncSlaveRes = startSyncSlaves();
if (!startResyncSlaveRes)
{
setState(BuddyResyncJobState_FAILURE);
workerBarrier.wait();
goto cleanup;
}
}
workerBarrier.wait();
LOG_DEBUG(__func__, Log_DEBUG, "Going to join gather slaves.");
joinGatherSlaves();
LOG_DEBUG(__func__, Log_DEBUG, "Joined gather slaves.");
LOG_DEBUG(__func__, Log_DEBUG, "Going to join sync slaves.");
// The gather slaves have finished. Tell the sync slaves to stop once their work packages are
// empty, then wait for them.
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
{
(*it)->setOnlyTerminateIfIdle(true);
(*it)->selfTerminate();
}
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
(*it)->join();
// here we can be in one of two situations:
// 1. bulk resync has succeeded. we then totally stop the workers: the session store must be in
// a quiescent state for resync, so for simplicity, we suspend all client operations here.
// we do not want to do this any earlier than this point, because bulk syncers may take a
// very long time to complete.
// 2. bulk resync has failed. in this case, the bulk syncers have aborted the currently running
// job, and the mod syncer is either dead or in the process of dying. here we MUST NOT stop
// the workers, because they are very likely blocked on the mod sync queue already and will
// not unblock before the queue is cleared.
if (getState() == BuddyResyncJobState_RUNNING)
{
stopAllWorkersOn(workerBarrier);
workersStopped = true;
}
modSyncSlave->setOnlyTerminateIfIdle(true);
modSyncSlave->selfTerminate();
modSyncSlave->join();
// The gatherers are done and the workers have been stopped, so we can safely resync the session
// store now.
LOG_DEBUG(__func__, Log_DEBUG, "Joined sync slaves.");
// Perform session store resync
// the job may have been aborted or terminated by errors. in this case, do not resync the session
// store. end the sync as quickly as possible.
if (getState() == BuddyResyncJobState_RUNNING)
sessionStoreResyncer->doSync();
// session store is now synced, and future actions can be forwarded safely. we do not restart
// the workers here because the resync may still enter FAILED state, and we don't want to forward
// to the secondary in this case.
cleanup:
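// cleanup phase: wait for all slaves to finish, aggregate their error counters, set and report
// the final job state, then release the suspended workers again.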
bool syncErrors = false;
{
std::lock_guard<Mutex> lock(gatherSlave->stateMutex);
while (gatherSlave->isRunning)
gatherSlave->isRunningChangeCond.wait(&gatherSlave->stateMutex);
syncErrors |= gatherSlave->getStats().errors != 0;
}
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
{
BuddyResyncerBulkSyncSlave* slave = it->get();
std::lock_guard<Mutex> lock(slave->stateMutex);
while (slave->isRunning)
slave->isRunningChangeCond.wait(&slave->stateMutex);
syncErrors |= slave->getStats().dirErrors != 0;
syncErrors |= slave->getStats().fileErrors != 0;
}
syncErrors |= sessionStoreResyncer->getStats().errors;
{
// as with the other slaves above, hold the state mutex while waiting on the running condition
std::lock_guard<Mutex> lock(modSyncSlave->stateMutex);
while (modSyncSlave->isRunning)
modSyncSlave->isRunningChangeCond.wait(&modSyncSlave->stateMutex);
syncErrors |= modSyncSlave->getStats().errors != 0;
}
if (getState() == BuddyResyncJobState_RUNNING || getState() == BuddyResyncJobState_INTERRUPTED)
{
if (syncErrors)
setState(BuddyResyncJobState_ERRORS);
else if (getState() == BuddyResyncJobState_RUNNING)
setState(BuddyResyncJobState_SUCCESS);
// delete timestamp override file if it exists.
BuddyCommTk::setBuddyNeedsResync(metaPath, false);
const TargetConsistencyState buddyState = newBuddyState();
informBuddy(buddyState);
informMgmtd(buddyState);
const bool interrupted = getState() != BuddyResyncJobState_SUCCESS;
LOG(MIRRORING, WARNING, "Resync finished.", interrupted, syncErrors);
}
internodeSyncer->setResyncInProgress(false);
endTime = time(NULL);
// restart all the worker threads
if (workersStopped)
workerBarrier.wait();
// if the resync was aborted, the mod sync queue may still contain items. additionally, workers
// may be waiting for a changeset slot, or they may have started executing after the resync was
// aborted by the sync slaves, but before the resync was officially set to "not running".
// we cannot set the resync to "not running" in abort() because we have no upper bound for the
// number of worker threads. even if we did set the resync to "not running" in abort() and
// cleared the sync queues at the same time, there may still be an arbitrary number of threads
// waiting for a changeset slot.
// instead, we have to wait for each thread to "see" that the resync is over, and periodically
// clear the sync queue to unblock those workers that are still waiting for slots.
if (syncErrors)
{
SynchronizedCounter counter;
for (auto it = workers->begin(); it != workers->end(); ++it)
{
auto& worker = **it;
worker.getWorkQueue()->addPersonalWork(
new IncSyncedCounterWork(&counter),
worker.getPersonalWorkQueue());
}
while (!counter.timedWaitForCount(workers->size(), 100))
{
while (!syncCandidates.isFilesEmpty())
{
MetaSyncCandidateFile candidate;
syncCandidates.fetch(candidate, this);
candidate.signal();
}
}
}
}
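/**
* Suspends all worker threads by adding a BarrierWork to each worker's personal work queue and
* then waiting on the given barrier until every worker has reached it.
*/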
void BuddyResyncJob::stopAllWorkersOn(Barrier& barrier)
{
WorkerList* workers = Program::getApp()->getWorkers();
for (WorkerListIter workerIt = workers->begin(); workerIt != workers->end(); ++workerIt)
{
Worker* worker = *workerIt;
PersonalWorkQueue* personalQ = worker->getPersonalWorkQueue();
MultiWorkQueue* workQueue = worker->getWorkQueue();
workQueue->addPersonalWork(new BarrierWork(&barrier), personalQ);
}
barrier.wait(); // Wait until all workers are blocked.
}
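/**
* Aborts a running resync: marks the job as INTERRUPTED, tells the gather and mod sync slaves to
* terminate and clears onlyTerminateIfIdle on the bulk sync slaves so the main loop stops them.
* If wait_for_completion is set, the sync candidate queue is drained until the pending worker
* threads have finished (bounded by the retry counter).
*/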
void BuddyResyncJob::abort(bool wait_for_completion)
{
setState(BuddyResyncJobState_INTERRUPTED);
gatherSlave->selfTerminate();
// set onlyTerminateIfIdle to false on the bulk sync slaves - the main loop will then stop them.
for (auto it = bulkSyncSlaves.begin(); it != bulkSyncSlaves.end(); ++it)
{
BuddyResyncerBulkSyncSlave* slave = it->get();
slave->setOnlyTerminateIfIdle(false);
}
modSyncSlave->selfTerminate();
int retry = 600;
/* Wait until all ongoing thread events are fetched, or for at most 30 minutes.
* (fetch() waits for 3 seconds if there are no files to be fetched.)
*/
if (wait_for_completion)
{
modSyncSlave->join();
while (threadCount > 0 && retry)
{
LOG(MIRRORING, WARNING, "Wait for pending worker threads to finish");
if (!syncCandidates.isFilesEmpty())
{
MetaSyncCandidateFile candidate;
syncCandidates.fetch(candidate, this);
candidate.signal();
}
retry--;
}
if (threadCount)
LOG(MIRRORING, ERR, "Cleanup of aborted resync failed: I/O worker threads"
" did not finish properly: ",
("threadCount", threadCount.load()));
}
}
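/**
* Starts the gather slave thread.
* @returns false if the thread could not be started.
*/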
bool BuddyResyncJob::startGatherSlaves()
{
try
{
gatherSlave->resetSelfTerminate();
gatherSlave->start();
gatherSlave->setIsRunning(true);
}
catch (PThreadCreateException& e)
{
LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what());
return false;
}
return true;
}
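/**
* Starts the bulk sync slave threads and the mod sync slave thread. If one of them fails to
* start, the slaves that were already started are told to terminate.
* @returns false if any thread could not be started.
*/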
bool BuddyResyncJob::startSyncSlaves()
{
App* app = Program::getApp();
const NumNodeID localNodeID = app->getLocalNodeNumID();
const NumNodeID buddyNodeID(
app->getMetaBuddyGroupMapper()->getBuddyTargetID(localNodeID.val(), NULL) );
for (size_t i = 0; i < bulkSyncSlaves.size(); i++)
{
try
{
bulkSyncSlaves[i]->resetSelfTerminate();
bulkSyncSlaves[i]->start();
bulkSyncSlaves[i]->setIsRunning(true);
}
catch (PThreadCreateException& e)
{
LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what() );
for (size_t j = 0; j < i; j++)
bulkSyncSlaves[j]->selfTerminate();
return false;
}
}
try
{
modSyncSlave->resetSelfTerminate();
modSyncSlave->start();
modSyncSlave->setIsRunning(true);
}
catch (PThreadCreateException& e)
{
LogContext(__func__).logErr(std::string("Unable to start thread: ") + e.what() );
for (size_t j = 0; j < bulkSyncSlaves.size(); j++)
bulkSyncSlaves[j]->selfTerminate();
return false;
}
return true;
}
void BuddyResyncJob::joinGatherSlaves()
{
gatherSlave->join();
}
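/**
* Aggregates the current statistics of the gather slave, the bulk sync slaves, the session store
* resyncer and the mod sync slave into a single statistics object.
*/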
MetaBuddyResyncJobStatistics BuddyResyncJob::getJobStats()
{
std::lock_guard<Mutex> lock(stateMutex);
BuddyResyncerGatherSlave::Stats gatherStats = gatherSlave->getStats();
const uint64_t dirsDiscovered = gatherStats.dirsDiscovered;
const uint64_t gatherErrors = gatherStats.errors;
uint64_t dirsSynced = 0;
uint64_t filesSynced = 0;
uint64_t dirErrors = 0;
uint64_t fileErrors = 0;
for (auto syncerIt = bulkSyncSlaves.begin(); syncerIt != bulkSyncSlaves.end(); ++syncerIt)
{
BuddyResyncerBulkSyncSlave::Stats bulkSyncStats = (*syncerIt)->getStats();
dirsSynced += bulkSyncStats.dirsSynced;
filesSynced += bulkSyncStats.filesSynced;
dirErrors += bulkSyncStats.dirErrors;
fileErrors += bulkSyncStats.fileErrors;
}
SessionStoreResyncer::Stats sessionSyncStats = sessionStoreResyncer->getStats();
const uint64_t sessionsToSync = sessionSyncStats.sessionsToSync;
const uint64_t sessionsSynced = sessionSyncStats.sessionsSynced;
const bool sessionSyncErrors = sessionSyncStats.errors;
BuddyResyncerModSyncSlave::Stats modSyncStats = modSyncSlave->getStats();
uint64_t modObjectsSynced = modSyncStats.objectsSynced;
uint64_t modSyncErrors = modSyncStats.errors;
return MetaBuddyResyncJobStatistics(
state, startTime, endTime,
dirsDiscovered, gatherErrors,
dirsSynced, filesSynced, dirErrors, fileErrors,
sessionsToSync, sessionsSynced, sessionSyncErrors,
modObjectsSynced, modSyncErrors);
}
/**
* Determine the state for the buddy after the end of a resync job.
* @returns the new state to be set on the buddy according to this job's JobState.
*/
TargetConsistencyState BuddyResyncJob::newBuddyState()
{
switch (getState())
{
case BuddyResyncJobState_ERRORS:
case BuddyResyncJobState_INTERRUPTED:
case BuddyResyncJobState_FAILURE:
return TargetConsistencyState_BAD;
case BuddyResyncJobState_SUCCESS:
return TargetConsistencyState_GOOD;
default:
LOG(MIRRORING, ERR, "Undefined resync state.", state);
return TargetConsistencyState_BAD;
}
}
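/**
* Informs the buddy metadata node about its new consistency state after the resync has finished.
*/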
void BuddyResyncJob::informBuddy(const TargetConsistencyState newTargetState)
{
App* app = Program::getApp();
NodeStore* metaNodes = app->getMetaNodes();
MirrorBuddyGroupMapper* buddyGroups = app->getMetaBuddyGroupMapper();
NumNodeID buddyNodeID =
NumNodeID(buddyGroups->getBuddyTargetID(app->getLocalNodeNumID().val()));
auto metaNode = metaNodes->referenceNode(buddyNodeID);
if (!metaNode)
{
LOG(MIRRORING, ERR, "Unable to inform buddy about finished resync", buddyNodeID.str());
return;
}
UInt16List nodeIDs(1, buddyNodeID.val());
UInt8List states(1, newTargetState);
SetTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDs, &states, false);
const auto respMsg = MessagingTk::requestResponse(*metaNode, msg,
NETMSGTYPE_SetTargetConsistencyStatesResp);
if (!respMsg)
{
LogContext(__func__).logErr(
"Unable to inform buddy about finished resync. "
"BuddyNodeID: " + buddyNodeID.str() + "; "
"error: Communication Error");
return;
}
{
auto* respMsgCast = static_cast<SetTargetConsistencyStatesRespMsg*>(respMsg.get());
FhgfsOpsErr result = respMsgCast->getResult();
if (result != FhgfsOpsErr_SUCCESS)
{
LogContext(__func__).logErr(
"Error while informing buddy about finished resync. "
"BuddyNodeID: " + buddyNodeID.str() + "; "
"error: " + boost::lexical_cast<std::string>(result) );
}
}
}
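/**
* Reports the buddy's new consistency state to the management node.
*/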
void BuddyResyncJob::informMgmtd(const TargetConsistencyState newTargetState)
{
App* app = Program::getApp();
NodeStore* mgmtNodes = app->getMgmtNodes();
auto mgmtNode = mgmtNodes->referenceFirstNode();
if (!mgmtNode)
{
LOG(MIRRORING, ERR, "Unable to communicate with management node.");
return;
}
UInt16List nodeIDs(1, buddyNodeID.val());
UInt8List states(1, newTargetState);
SetTargetConsistencyStatesMsg msg(NODETYPE_Meta, &nodeIDs, &states, false);
const auto respMsg = MessagingTk::requestResponse(*mgmtNode, msg,
NETMSGTYPE_SetTargetConsistencyStatesResp);
if (!respMsg)
{
LOG(MIRRORING, ERR,
"Unable to inform management node about finished resync: Communication error.");
return;
}
{
auto* respMsgCast = static_cast<SetTargetConsistencyStatesRespMsg*>(respMsg.get());
FhgfsOpsErr result = respMsgCast->getResult();
if (result != FhgfsOpsErr_SUCCESS)
LOG(MIRRORING, ERR, "Error informing management node about finished resync.", result);
}
}