NAMD
SynchronousCollectives.C
Go to the documentation of this file.
1 #include "CudaRecord.h"
2 #include "CudaUtils.h"
3 #include "NamdTypes.h"
4 
5 #include "SynchronousCollectives.decl.h"
7 #include "Node.h"
8 #include "SimParameters.h"
9 #include "NamdEventsProfiling.h"
10 #include "Priorities.h"
11 
12 #include <cstring> // std::memcpy
13 
14 #include "charm++.h"
15 
16 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
17 
18 /*
19  * PUP (Pack-UnPack) various types that will communicated via Charm++
20  */
21 #if !(defined(__NVCC__) || defined(__HIPCC__))
22 #include <pup.h>
24 #endif // !((__NVCC__) || (__HIPCC__))
25 
// Constructor body. NOTE(review): the signature line (original line 26) was
// dropped by the documentation extractor — presumably the default
// SynchronousCollectives() constructor; confirm against the repository.
// Registers this object as the per-PE singleton instance and aborts via
// NAMD_bug if a second instance is created on the same processor.
27 {
28  if (CkpvAccess(SynchronousCollectives_instance) == NULL) {
29  CkpvAccess(SynchronousCollectives_instance) = this;
30  } else {
31  NAMD_bug("SynchronousCollectives instanced twice on same processor!");
32  }
33 }
34 
36 
// NOTE(review): the signature line (original line 37) was dropped by the
// documentation extractor; this body initializes the group. It caches the
// group proxy from the BOC class-group registry and sizes the barrier
// counter vectors: one slot per PE for the all-PEs barrier, and a single
// slot for the single-scope barrier.
38  allPes_ = CkpvAccess(BOCclass_group).synchronousCollectives;
39  currentBarrierAll_ = std::vector<int>(CkNumPes(), 0);
40  currentBarrierSingle_ = std::vector<int>(1, 0);
41 }
42 
// Configures the master-PE scope: records this PE's master/device identity,
// sizes the master-scope barrier (one slot per device), and builds two
// section proxies over the master PEs — a plain section (masterPes_) and a
// CkMulticast-delegated section (masterPesMulticast_) used only where the
// multicast library's explicit messages are required (section reductions).
43 void SynchronousCollectives::initMasterScope(const int isMasterPe, const int isMasterDevice,
44  const int numDevices, const int deviceIndex, const std::vector<int>& masterPeList) {
45 
46  isMasterPe_ = isMasterPe;
47  numDevices_ = numDevices;
48  deviceIndex_ = deviceIndex;
49  masterPeList_ = masterPeList;
50 
51  currentBarrierMasterPe_ = std::vector<int>(numDevices_, 0);
52  forceBarrierAll(); // Make sure all PEs have set expectedBarrierMasterPe
53 
54  masterPes_ = CProxySection_SynchronousCollectives(allPes_.ckGetGroupID(),
55  masterPeList_.data(), masterPeList_.size());
56  masterPesMulticast_ = CProxySection_SynchronousCollectives(allPes_.ckGetGroupID(),
57  masterPeList_.data(), masterPeList_.size());
58 
59  //
60  // For section broadcasts, we must use the multi-cast library; however, it requires explicitly
61  // defined messages, so the multi-cast section will not be used for the non-reduction sections
62  //
63  CProxy_CkMulticastMgr mcastProxy = CkpvAccess(BOCclass_group).multicastMgr;
64  CkMulticastMgr *mcastPtr = CProxy_CkMulticastMgr(mcastProxy).ckLocalBranch();
65  masterPesMulticast_.ckSectionDelegate(mcastPtr);
66 
67  if (isMasterDevice && isMasterPe_) {
// NOTE(review): original line 68, which constructs `msg` (presumably a new
// SynchronousCollectivesMulticastMsg), was dropped by the documentation
// extractor — confirm against the repository.
69  setThread(CthSelf());
70  masterPesMulticast_.setupMulticastSection(msg);
71  suspendAndCheck(SynchronousCollectiveScope::single);
72  } else if (isMasterPe_) {
73  suspendAndCheck(SynchronousCollectiveScope::single);
74  }
75 
76  forceBarrierAll(); // Make sure all PEs have set expectedBarrierMasterPe
77 }
78 
79 void SynchronousCollectives::incrementCount(const SynchronousCollectiveScope scope, const int index) {
80  auto& currentBarrier = getBarrier(scope);
81  if (currentBarrier.size() <= index) {
82  NAMD_bug("SynchronousCollectives currentBarrier not large enough");
83  }
84  currentBarrier[index]++;
85 }
86 
87 void SynchronousCollectives::suspendAndCheck(const SynchronousCollectiveScope scope) {
88  auto& currentBarrier = getBarrier(scope);
89  bool done = true;
90  do {
91  CthYield();
92  done = true;
93  for (size_t i = 0; i < currentBarrier.size(); i++) {
94  done = (done && currentBarrier[i]);
95  }
96  } while (!done);
97 
98  for (size_t i = 0; i < currentBarrier.size(); i++) {
99  currentBarrier[i]--;
100  }
101 }
102 
// NOTE(review): the signature line (original line 103) was dropped by the
// documentation extractor — from the CkIndex_ reference below this is
// SynchronousCollectives::wait(). Two-phase quiescence-detection wait:
// phase 0 arms QD (PE 0 only) with this method as the callback; the phase-1
// re-entry marks completion, resets the phase counter, and awakens the
// thread registered via setThread().
104  int finished = false;
105  switch (waitPhase_) {
106  case 0:
107  break;
108  case 1:
109  finished = true;
110  break;
111  }
112 
113  waitPhase_++;
// Only PE 0 starts quiescence detection; the QD callback broadcasts wait()
// back to the whole group.
114  if (!CkMyPe()) {
115  if (!finished) {
116  CkStartQD(CkCallback(CkIndex_SynchronousCollectives::wait(), thisgroup));
117  }
118  }
119 
120  if (finished) {
121  waitPhase_ = 0;
122  CthAwaken(self_awaken_thread_);
123  }
124 }
125 
// NOTE(review): the signature line (original line 126) was dropped by the
// documentation extractor. Registers the current thread, kicks off the
// quiescence-detection wait(), and suspends this thread until wait()'s
// completion phase calls CthAwaken on it.
127  NAMD_EVENT_START(1, NamdProfileEvent::CHARM_WAITANDAWAKEN);
128  setThread(CthSelf());
129  wait();
130  CthSuspend();
131  NAMD_EVENT_STOP(1, NamdProfileEvent::CHARM_WAITANDAWAKEN);
132 }
133 
// NOTE(review): the signature line (original line 134) was dropped by the
// documentation extractor — per the declaration index this is
// void SynchronousCollectives::recvBarrierAll(const int PE).
// Entry method: records PE's arrival in the all-PEs barrier.
135  incrementCount(SynchronousCollectiveScope::all, PE);
136 }
137 
138 void SynchronousCollectives::recvBarrierMasterPe(const int deviceIndex) {
139  incrementCount(SynchronousCollectiveScope::master, deviceIndex);
140 }
141 
// NOTE(review): the signature line (original line 142) was dropped by the
// documentation extractor; this is the forceBarrierAll() body. Broadcasts
// this PE's arrival to the whole group, then yields until every PE has
// arrived — a true all-PEs barrier regardless of master-scope setup.
143  allPes_.recvBarrierAll(CkMyPe());
144  suspendAndCheck(SynchronousCollectiveScope::all);
145 }
146 
// NOTE(review): the signature line (original line 147) was dropped by the
// documentation extractor — per the declaration index this is
// void SynchronousCollectives::barrier(const SynchronousCollectiveScope scope).
// Scoped barrier: `all` uses a node barrier (single node), a forced
// all-PEs barrier (master scope not yet initialized), or a combined
// master-PE + node barrier; `master` synchronizes only the master PEs.
148  if (scope == SynchronousCollectiveScope::single) {
149  NAMD_bug("SynchronousCollectives::barrier does not support single scope");
150  }
151 
152  NAMD_EVENT_START(1, NamdProfileEvent::CHARM_BARRIER);
153  if (scope == SynchronousCollectiveScope::all) {
154  if (CkNumNodes() == 1) {
155  CmiNodeBarrier();
156  } else if (currentBarrierMasterPe_.size() == 0) {
157  // If expectedBarrierMasterPe is not set, then we need to
158  // default back to a true all synchronization
159  forceBarrierAll();
160  } else {
161  if (isMasterPe_) {
// NOTE(review): original line 162, the body of this isMasterPe_ branch
// (presumably the master-PE barrier step), was dropped by the documentation
// extractor — confirm against the repository.
163  }
164  CmiNodeBarrier();
165  }
166  } else if (isMasterPe_) {
167  masterPes_.recvBarrierMasterPe(deviceIndex_);
168  suspendAndCheck(SynchronousCollectiveScope::master);
169  }
170  NAMD_EVENT_STOP(1, NamdProfileEvent::CHARM_BARRIER);
171 }
172 
// NOTE(review): the signature line (original line 173) was dropped by the
// documentation extractor — per the declaration index this is
// void SynchronousCollectives::handleReductionAll(CkReductionMsg *msg).
// Reduction-client callback for all-scope allReduce: rebroadcasts the
// reduced bytes to every PE in the group.
174  allPes_.broadcastReductionResult(msg->getSize(), (char*) msg->getData());
175  // TODO: Should this message be deleted here
176 }
177 
// NOTE(review): the signature line (original line 178) was dropped by the
// documentation extractor — per the declaration index this is
// void SynchronousCollectives::handleReductionMaster(CkReductionMsg *msg).
// Reduction-client callback for master-scope allReduce: rebroadcasts the
// reduced bytes to the master-PE section.
179  masterPes_.broadcastReductionResult(msg->getSize(), (char*) msg->getData());
180  // TODO: Should this message be deleted here
181 }
182 
// NOTE(review): the signature line (original line 183) was dropped by the
// documentation extractor — per the declaration index this is
// void SynchronousCollectives::broadcastReductionResult(int n, char *data).
// Entry method: copies the n reduced bytes into the staging buffer that
// allReduce() pointed reductionPtr_ at, then releases the waiting thread
// via the single-scope barrier slot.
184  std::memcpy(reductionPtr_, (void*) data, n);
185  incrementCount(SynchronousCollectiveScope::single, 0);
186 }
187 
188 template<typename T>
189 std::vector<T> SynchronousCollectives::allReduce(std::vector<T>& data, CkReduction::reducerType type,
191  if (scope == SynchronousCollectiveScope::single) {
192  NAMD_bug("SynchronousCollectives::allreduce does not support single scope");
193  }
194  NAMD_EVENT_START(1, NamdProfileEvent::CHARM_ALLREDUCE);
195 
196  std::vector<T> out;
197  reductionTemp_ = std::vector<T>(data.size());
198  reductionPtr_ = (void*) std::any_cast<std::vector<T>&>(reductionTemp_).data();
199 
200  setThread(CthSelf());
201  if (scope == SynchronousCollectiveScope::all) {
202  CkCallback cb(CkReductionTarget(SynchronousCollectives, handleReductionAll),
203  thisProxy[thisIndex]);
204 
205  contribute(data.size() * sizeof(T), data.data(), type, cb);
206  suspendAndCheck(SynchronousCollectiveScope::single);
207  } else if (isMasterPe_) {
208  CkCallback cb(CkReductionTarget(SynchronousCollectives, handleReductionMaster),
209  thisProxy[thisIndex]);
210 
211  CProxy_CkMulticastMgr mcastProxy = CkpvAccess(BOCclass_group).multicastMgr;
212  CkMulticastMgr *mcastPtr = CProxy_CkMulticastMgr(mcastProxy).ckLocalBranch();
213  mcastPtr->contribute(data.size() * sizeof(T), data.data(), type, reductionCookie_, cb);
214  suspendAndCheck(SynchronousCollectiveScope::single);
215  }
216  out = std::move(std::any_cast<std::vector<T>&>(reductionTemp_));
217  NAMD_EVENT_STOP(1, NamdProfileEvent::CHARM_ALLREDUCE);
218  return out;
219 }
220 
// NOTE(review): the signature line (original line 221) was dropped by the
// documentation extractor — per the declaration index this is
// void SynchronousCollectives::setupMulticastSection(SynchronousCollectivesMulticastMsg *msg).
// Entry method: captures the multicast section cookie needed for later
// section reductions, frees the setup message, and releases the waiting
// thread via the single-scope barrier slot.
222  CkGetSectionInfo(reductionCookie_, msg);
223  delete msg;
224  incrementCount(SynchronousCollectiveScope::single, 0);
225 }
226 
227 template<typename T>
228 void SynchronousCollectives::sendAllGather(const T& data, const SynchronousCollectiveScope scope, const unsigned int key) {
229  if (scope == SynchronousCollectiveScope::all) {
230  allPes_.recvIndexData(CkMyPe(), data, scope, key);
231  } else if (isMasterPe_) {
232  masterPes_.recvIndexData(deviceIndex_, data, scope, key);
233  }
234 }
235 
236 template<typename T>
237 void SynchronousCollectives::recvIndexData(const int index, const T& data, const SynchronousCollectiveScope scope, const unsigned int key) {
238  const int tempSize = (scope == SynchronousCollectiveScope::all) ? CkNumPes() : numDevices_;
239  auto res = tempData_.try_emplace(key, std::in_place_type<std::vector<T>>, tempSize);
240 
241  std::vector<T>& tempVec = std::any_cast<std::vector<T>&>(res.first->second);
242  if (index >= tempVec.size()) {
243  NAMD_die("SynchronousCollectives::recvIndexData: temp array not large enough");
244  }
245 
246  tempVec[index] = std::move(data);
247  incrementCount(scope, index);
248 }
249 
250 template<typename T>
251 T SynchronousCollectives::retrieveTemp(const unsigned int key) {
252  auto outIter = tempData_.find(key);
253  if (outIter == tempData_.end()) {
254  NAMD_die("SynchronousCollectives::retrieveTemp: could not find data");
255  }
256  auto out = std::move(std::any_cast<T&>(outIter->second));
257  tempData_.erase(key);
258  return out;
259 }
260 
261 template<typename T>
262 std::vector<T> SynchronousCollectives::allGather(const T& data, const SynchronousCollectiveScope scope) {
263  if (scope == SynchronousCollectiveScope::single) {
264  NAMD_bug("SynchronousCollectives::allgather does not support single scope");
265  }
266  NAMD_EVENT_START(1, NamdProfileEvent::CHARM_ALLGATHER);
267 
268  std::vector<T> out;
269  if (scope == SynchronousCollectiveScope::all || isMasterPe_) {
270  const unsigned int key = getKey(scope);
271  sendAllGather<T>(data, scope, key);
272  suspendAndCheck(scope);
273 
274  out = retrieveTemp<std::vector<T>>(key);
275  }
276  NAMD_EVENT_STOP(1, NamdProfileEvent::CHARM_ALLGATHER);
277  return out;
278 }
279 
280 template<typename T>
281 void SynchronousCollectives::sendAlltoallv(const std::vector<T>& data,
282  const SynchronousCollectiveScope scope, const unsigned int key) {
283 
284  CProxy_SynchronousCollectives cp(thisgroup);
285  if (scope == SynchronousCollectiveScope::all) {
286  for (size_t i = 0; i < CkNumPes(); i++) {
287  cp[i].recvIndexData<T>(CkMyPe(), data[i], scope, key);
288  }
289  } else if (isMasterPe_) {
290  for (size_t i = 0; i < numDevices_; i++) {
291  const int PE = masterPeList_[i];
292  cp[PE].recvIndexData<T>(deviceIndex_, data[i], scope, key);
293  }
294  }
295 }
296 
297 template<typename T>
298 std::vector<T> SynchronousCollectives::alltoallv(const std::vector<T>& data, const SynchronousCollectiveScope scope) {
299  if (scope == SynchronousCollectiveScope::single) {
300  NAMD_bug("SynchronousCollectives::alltoallv does not support single scope");
301  }
302  NAMD_EVENT_START(1, NamdProfileEvent::CHARM_ALLTOALL);
303 
304  std::vector<T> out;
305 
306  if (scope == SynchronousCollectiveScope::all || isMasterPe_) {
307  const unsigned int key = getKey(scope);
308  sendAlltoallv<T>(data, scope, key);
309  suspendAndCheck(scope);
310  out = retrieveTemp<std::vector<T>>(key);
311  }
312 
313  NAMD_EVENT_STOP(1, NamdProfileEvent::CHARM_ALLTOALL);
314  return out;
315 }
316 
317 template<typename T>
318 void SynchronousCollectives::sendBroadcast(const T& data, const SynchronousCollectiveScope scope, const unsigned int key) {
319  if (scope == SynchronousCollectiveScope::all) {
320  allPes_.recvBroadcast<T>(data, key);
321  } else if (isMasterPe_) {
322  masterPes_.recvBroadcast<T>(data, key);
323  }
324 }
325 
326 template<typename T>
327 void SynchronousCollectives::recvBroadcast(const T& data, const unsigned int key) {
328  // Since we are only expecting one message, the key should not exist in the map
329  tempData_.insert({key, std::move(data)});
330  incrementCount(SynchronousCollectiveScope::single, 0);
331 }
332 
333 template<typename T>
334 T SynchronousCollectives::broadcast(const T& data, const bool isRoot, const SynchronousCollectiveScope scope) {
335  if (scope == SynchronousCollectiveScope::single) {
336  NAMD_bug("SynchronousCollectives::broadcast does not support single scope");
337  }
338  NAMD_EVENT_START(1, NamdProfileEvent::CHARM_BROADCAST);
339 
340  T out = data; // If we are not participating in the broadcast, just return the input
341  if (scope == SynchronousCollectiveScope::all || isMasterPe_) {
342  const unsigned int key = getKey(scope);
343  if (isRoot) {
344  sendBroadcast(data, scope, key);
345  }
346  suspendAndCheck(SynchronousCollectiveScope::single);
347 
348  out = retrieveTemp<T>(key);
349  }
350  NAMD_EVENT_STOP(1, NamdProfileEvent::CHARM_BROADCAST);
351  return out;
352 }
353 
// Explicit instantiations of the collective templates for the element
// types NAMD uses, so the definitions in this translation unit link from
// other files.
// NOTE(review): the extracted listing dropped original lines 358 and 373
// (one INSTANTIATE_ALLGATHER and one INSTANTIATE_ALLTOALLV line) — confirm
// the full instantiation set against the repository.
354 #define INSTANTIATE_ALLGATHER(type) \
355 template std::vector<type> \
356 SynchronousCollectives::allGather<type>(const type&, SynchronousCollectiveScope);
357 
359 INSTANTIATE_ALLGATHER(unsigned long long);
360 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
361 INSTANTIATE_ALLGATHER(cudaIpcMemHandle_t);
362 #endif // NAMD_CUDA || NAMD_HIP
363 INSTANTIATE_ALLGATHER(std::vector<CudaLocalRecord>);
364 INSTANTIATE_ALLGATHER(std::vector<int>);
365 
366 #undef INSTANTIATE_ALLGATHER
367 
368 #define INSTANTIATE_ALLTOALLV(type) \
369 template std::vector<type> \
370 SynchronousCollectives::alltoallv<type>(const std::vector<type>&, SynchronousCollectiveScope);
371 
372 INSTANTIATE_ALLTOALLV(std::vector<int>);
374 
375 #undef INSTANTIATE_ALLTOALLV
376 
377 #define INSTANTIATE_ALLREDUCE(type) \
378 template std::vector<type> \
379 SynchronousCollectives::allReduce<type>(std::vector<type>&, \
380  CkReduction::reducerType, SynchronousCollectiveScope)
381 
382 INSTANTIATE_ALLREDUCE(unsigned int);
383 INSTANTIATE_ALLREDUCE(size_t);
384 INSTANTIATE_ALLREDUCE(double);
385 
386 #undef INSTANTIATE_ALLREDUCE
387 
388 #endif /* NAMD_CUDA || NAMD_HIP */
389 
390 #include "SynchronousCollectives.def.h"
391 
void barrier(const SynchronousCollectiveScope scope)
#define NAMD_EVENT_STOP(eon, id)
std::vector< T > allGather(const T &data, const SynchronousCollectiveScope scope)
void handleReductionMaster(CkReductionMsg *msg)
std::vector< T > allReduce(std::vector< T > &data, CkReduction::reducerType type, const SynchronousCollectiveScope scope)
#define INSTANTIATE_ALLGATHER(type)
void initMasterScope(const int isMasterPe, const int isMasterDevice, const int numDevices, const int deviceIndex, const std::vector< int > &masterPeList)
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:95
T broadcast(const T &data, const bool isRoot, const SynchronousCollectiveScope scope)
#define NAMD_EVENT_START(eon, id)
void recvBarrierAll(const int PE)
void NAMD_bug(const char *err_msg)
Definition: common.C:196
void recvBarrierMasterPe(const int deviceIndex)
void broadcastReductionResult(int n, char *data)
void NAMD_die(const char *err_msg)
Definition: common.C:148
SynchronousCollectiveScope
void recvIndexData(const int index, const T &data, const SynchronousCollectiveScope scope, const unsigned int key)
#define INSTANTIATE_ALLREDUCE(type)
void setupMulticastSection(SynchronousCollectivesMulticastMsg *msg)
void handleReductionAll(CkReductionMsg *msg)
PUPbytes(CudaLocalRecord)
#define INSTANTIATE_ALLTOALLV(type)
std::vector< T > alltoallv(const std::vector< T > &data, const SynchronousCollectiveScope scope)
void recvBroadcast(const T &data, const unsigned int key)