8 #if !defined(WIN32) || defined(__CYGWIN__)    34 #include "NamdHybridLB.def.h"    40 #ifdef MEM_OPT_VERSION    50   int seqno = LdbInfra::Object()->getLoadbalancerTicket();
    51   CProxy_NamdHybridLB::ckNew(CkLBOptions(seqno));
    55   if (CkMyPe() == 0 && 
cpuloads == NULL) {
    58     for (
int i=0; i<CkNumPes(); i++) 
cpuloads[i] = 0.0;
    68   lbname = (
char *)
"NamdHybridLB";
    72   if (CkNumPes() <= 
simParams->hybridGroupSize)  {
    73     tree = 
new TwoLevelTree;   
    76     tree = 
new ThreeLevelTree(
simParams->hybridGroupSize);
    79     statsStrategy = SHRINK_NULL;
    83   thisProxy = CProxy_NamdHybridLB(thisgroup);
    95   processorArray = NULL;
    98   splitComputesMsgs = 0;
   109 bool NamdHybridLB::QueryBalanceNow(
int _step){ 
   117 bool NamdHybridLB::QueryDumpData() {
   131 LBVectorMigrateMsg* NamdHybridLB::VectorStrategy(LDStats* stats){
   132   CkPrintf(
"[%d] Using Vector Strategy to balance the load\n",CkMyPe());
   133   LBVectorMigrateMsg* msg = 
new(0,0) LBVectorMigrateMsg;
   135   msg->level = currentLevel;
   143 CLBMigrateMsg* NamdHybridLB::Strategy(LDStats* stats)
   149         if(currentLevel == 1){
   150                 LevelData *lData = levelData[currentLevel];
   153                 msg = GrpLevelStrategy(stats);
   157                 newMsg->
n_moves = msg->n_moves;
   159                 newMsg->
endPE = endPE;
   160                 for(i=0; i<msg->n_moves; i++){
   161                         newMsg->
moves[i] = msg->moves[i];
   163                 for(i=0; i<endPE-startPE+1; i++){
   167                 thisProxy[0].UpdateLocalLBInfo(newMsg);
   170                 dummyLB->
work(stats);
   171                 return createMigrateMsg(stats);
   183         children = tree->numNodes(currentLevel);
   191             if (msg->
moves[i].to_pe != -1)
   197         for(i=msg->
startPE; i<=msg->endPE; i++){
   203         if(updateCount == children){
   210         if(updateFlag && collectFlag){
   213                 thisProxy[parent_backup].CollectInfo(loc_backup, n_backup, fromlevel_backup);
   220   const int children = tree->numNodes(1);
   222   if ( ! splitComputesMsgs ) {
   226   splitComputesMsgs[splitCount] = msg;
   228   if ( ++splitCount == children ) {
   234     double maxUnsplit = 0.;
   236     double avgCompute = 0.;
   237     double maxCompute = 0.;
   238     int maxComputeId = -1;
   239     int nMoveableComputes = 0;
   240     int numPesAvailable = 0;
   243     for ( 
int j=0; j < children; ++j ) {
   255     if ( nMoveableComputes ) avgCompute /= nMoveableComputes; 
else avgCompute = 0.;
   257     CkPrintf(
"LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
   259     CkPrintf(
"LDB: Average compute %f is %.1f%% of average load %f\n",
   263       for ( 
int j=0; j < children; ++j ) {
   264         delete splitComputesMsgs[j];
   270 #if defined(NAMD_CUDA) || defined(NAMD_HIP)   276       int totalAddedParts = 0;
   278       if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
   279       if ( 
simParams->ldbRelativeGrainsize > 0. ) {
   282       CkPrintf(
"LDB: Partitioning computes with target load %f\n", maxCompute);
   284       for ( 
int j=0; j < children; ++j ) {
   286         for (
int i=0; i < msg->
n; ++i) {
   287           int nparts = (int) ceil(msg->
load[i] / maxCompute);
   288           if ( nparts > maxParts ) nparts = maxParts;
   289           if ( nparts < 1 ) nparts = 1;
   291           totalAddedParts += nparts - 1;
   296       CkPrintf(
"LDB: Increased migratable compute count from %d to %d\n",
   297               nMoveableComputes,nMoveableComputes+totalAddedParts);
   298       CkPrintf(
"LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
   308 CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
   309   int numProcessors = stats->nprocs();  
   313   const int numGroupComputes = stats->n_migrateobjs;
   316   if ( ! processorArray ) processorArray = 
new processorInfo[numProcessors];
   318   if ( ! patchArray ) patchArray = 
new patchInfo[numPatches];
   319   if ( ! computeArray ) computeArray = 
new computeInfo[numGroupComputes];
   320   if ( ! from_procs ) from_procs = 
new int[numGroupComputes];
   322   int nMoveableComputes = buildData(stats);
   323   CmiAssert(nMoveableComputes <= numGroupComputes);
   327 #define DUMP_LDBDATA 1   328 #define LOAD_LDBDATA 1   332   dumpDataASCII(
"ldbd_before", numProcessors, numPatches, nMoveableComputes);
   334   loadDataASCII(
"ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
   348    for (i=0; i<nMoveableComputes; i++) {
   349       double load = computeArray[i].
load;
   351       if ( load > maxCompute ) { maxCompute = load;  maxi = i; }
   353    avgCompute = total / nMoveableComputes;
   354    maxComputeId = maxi < 0 ? -1 : 
LdbIdField(computeArray[maxi].handle.id, 0);
   356     int P = stats->nprocs();
   358    for (i=0; i<P; i++) {
   359       if (processorArray[i].available) {
   364    if (numPesAvailable == 0)
   365      NAMD_die(
"No processors available for load balancing!\n");
   371   double maxUnsplit = 0.;
   374     for (
int i=0; i<nMoveableComputes; i++) {
   375       const int cid = 
LdbIdField(computeArray[i].handle.id, 0);
   377         const double load = computeArray[i].
load;
   378         if ( load > maxUnsplit ) maxUnsplit = load;
   398       for (
int i=0; i<nMoveableComputes; i++) {
   400         const int cid = 
LdbIdField(computeArray[i].handle.id, 0);
   404         msg->
cid[i_split] = cid;
   405         msg->
load[i_split] = computeArray[i].
load;
   410     thisProxy[0].splitComputes(msg);
   417       TorusLB(computeArray, patchArray, processorArray,
   418                   nMoveableComputes, numPatches, numProcessors);
   421                   nMoveableComputes, numPatches, numProcessors, 1);
   423     TorusLB(computeArray, patchArray, processorArray,
   424                   nMoveableComputes, numPatches, numProcessors);
   427                   nMoveableComputes, numPatches, numProcessors, 1);
   429     NAMD_die(
"Old load balancer strategy is not compatible with hybrid balancer.");
   431       Alg7(computeArray, patchArray, processorArray,
   432                   nMoveableComputes, numPatches, numProcessors);
   434       RefineOnly(computeArray, patchArray, processorArray,
   435                   nMoveableComputes, numPatches, numProcessors);
   438 #if LDB_DEBUG && USE_TOPOMAP   440   int pe1, pe2, pe3, hops=0;
   451   for (
int i=0; i<numPatches; i++)  {
   458       hops += tmgr.getHopsBetweenRanks(pe1, pe2);
   462   CkPrintf(
"Load Balancing: Number of Hops: %d\n", hops);
   466   dumpDataASCII(
"ldbd_after", numProcessors, numPatches, nMoveableComputes);
   468   dumpDataASCII(
"ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
   477   int* computeCount = 
new int[numProcessors];
   478   for(i=0; i<numProcessors; i++)
   480   for(i=0; i<nMoveableComputes; i++)
   481     computeCount[computeArray[i].processor]++;
   482   for(i=0; i<numProcessors; i++) {
   483     if (computeCount[i]==0)
   484       iout << 
iINFO <<
"Warning: Processor " << i 
   485            << 
" has NO moveable computes.\n" << 
endi;
   487   delete [] computeCount;
   490   CkVec<MigrateInfo *> migrateInfo;
   491   for(i=0;i<nMoveableComputes;i++) {
   492     if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
   496       MigrateInfo *migrateMe = 
new MigrateInfo;
   497       migrateMe->obj = computeArray[i].
handle;
   499       int frompe = from_procs[i];
   500       if (frompe == numProcessors)
   503         frompe = frompe + stats->procs[0].pe;
   504       migrateMe->from_pe = frompe;
   505       migrateMe->to_pe = computeArray[i].
processor;
   510         obj.handle = computeArray[i].
handle;
   511         thisProxy[computeArray[i].
processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
   513       migrateInfo.push_back(migrateMe);
   518                                 computeArray[i].processor);
   524   msg = createMigrateMsg(migrateInfo, numProcessors);
   526   peLoads = 
new double [numProcessors]; 
   527   startPE = processorArray[0].
Id;
   528   endPE = processorArray[numProcessors-1].
Id;
   530   for (i=0; i<numProcessors; i++) {
   531         peLoads[i] = processorArray[i].
load;
   535   delete [] from_procs;
   536   delete [] processorArray;
   537   delete [] patchArray;
   538   delete [] computeArray;
   541   processorArray = NULL;
   549 void NamdHybridLB::dumpDataASCII(
char *file, 
int numProcessors,
   550                                int numPatches, 
int numComputes)
   553   sprintf(filename, 
"%s_%d.%d", file, CkMyPe(), step());
   554   FILE* fp = fopen(filename,
"w");
   556      perror(
"dumpLDStatsASCII");
   560   fprintf(fp,
"%d %d %d\n",numProcessors,numPatches,numComputes);
   563   for(i=0;i<numProcessors;i++) {
   568   for(i=0;i < numPatches; i++) {
   573   for(i=0; i < numComputes; i++) {
   581   for (i=0; i< numProcessors; i++) {
   583       fprintf(fp, 
"%d %d: ", i, num);
   588           fprintf(fp, 
"%d ", p->
Id);
   589           p = (
patchInfo *)processorArray[i].proxies.
   595   for (i=0; i<numPatches; i++)  {
   597     fprintf(fp, 
"%d %d: ", i, num);
   602         fprintf(fp, 
"%d ", p->
Id);
   617 int NamdHybridLB::buildData(LDStats* stats) {
   618   int n_pes = stats->nprocs();
   630   int unLoadZero = 
simParams->ldbUnloadZero;
   632   int unLoadIO= 
simParams->ldbUnloadOutputPEs;
   635   for (i=0; i<n_pes; ++i) {
   636     pe_no = stats->procs[i].pe;
   639     processorArray[i].
Id = pe_no;               
   643       processorArray[i].
backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
   646       processorArray[i].
backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
   648       processorArray[i].
backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
   650     processorArray[i].
idleTime = stats->procs[i].idletime;
   655   if(stats->procs[0].pe == 0) {
   656     if(unLoadZero) processorArray[0].
available = 
false;
   657     if(unLoadOne) processorArray[1].
available = 
false;
   661   if (pmeOn && unLoadPme) {
   662     for (i=0; i<n_pes; i++) {
   671   if (pmeOn && unLoadPme) {
   672     for (i=0; i<n_pes; i++) {
   674         processorArray[i].available = 
false;
   679 #ifdef MEM_OPT_VERSION   681       if (
simParams->numoutputprocs == n_pes) {
   687       for (i=0; i<n_pes; i++) {
   689               processorArray[i].available = 
false;
   696   int totalLocalProxies = 0;
   697   int totalProxies = 0;
   698   for ( 
int pid=0; pid<numPatches; ++pid ) {
   701         patchArray[pid].
Id = pid;
   705         const int numProxies = 
   706 #if 0 // USE_TOPOMAP - this function needs to be there for the hybrid case   707         requiredProxiesOnProcGrid(pid,neighborNodes);
   709         requiredProxies(pid, neighborNodes);
   712         int numLocalProxies = 0;
   713         for (
int k=0; k<numProxies; k++) {
   714                 if( (neighborNodes[k] >= stats->procs[0].pe) && (neighborNodes[k] <= stats->procs[n_pes-1].pe) ){
   716                         int index = neighborNodes[k] - stats->procs[0].pe;
   722         if ( numLocalProxies ) {
   723             CkPrintf(
"LDB Pe %d patch %d has %d local of %d total proxies\n",
   724                 CkMyPe(), pid, numLocalProxies, numProxies);
   727         totalLocalProxies += numLocalProxies;
   728         totalProxies += numProxies;
   731   CkPrintf(
"LDB Pe %d has %d local of %d total proxies\n",
   732                 CkMyPe(), totalLocalProxies, totalProxies);
   735   int nMoveableComputes=0;
   741   const auto nObjs = stats->objData.size();
   742   for(j=0; j < nObjs; j++) {
   743         const LDObjData &this_obj = stats->objData[j];
   744         int frompe = stats->from_proc[j];
   747         if (this_obj.omID().id.idx != 1) {
   760         } 
else if (this_obj.migratable && this_obj.wallTime != 0.) { 
   763                 const int p0 = computeMap->
pid(cid,0);
   767                 if (computeMap->
numPids(cid) > 1)
   768                         p1 = computeMap->
pid(cid,1);
   770                         computeArray[nMoveableComputes].
Id = cid;
   772                         if (frompe >= n_pes) {  
   773 CkPrintf(
"assigning random old processor...this looks broken\n");
   774                           computeArray[nMoveableComputes].
oldProcessor = CrnRand()%n_pes + stats->procs[0].pe;     
   777                           computeArray[nMoveableComputes].
oldProcessor = frompe + stats->procs[0].pe;
   779                         from_procs[nMoveableComputes] = frompe;
   783                         int index = computeArray[nMoveableComputes].
oldProcessor - stats->procs[0].pe; 
   784                         processorArray[index].
computeLoad += this_obj.wallTime;
   785                         computeArray[nMoveableComputes].
processor = -1;
   786                         computeArray[nMoveableComputes].
patch1 = p0;
   787                         computeArray[nMoveableComputes].
patch2 = p1;
   788                         computeArray[nMoveableComputes].
handle = this_obj.handle;
   789                         computeArray[nMoveableComputes].
load = this_obj.wallTime;
   794         for (i=0; i<n_pes; i++) {
   798         return nMoveableComputes;
   802 int NamdHybridLB::requiredProxies(
PatchID id, 
int neighborNodes[])
   805   int myNode = patchMap->
node(
id);
   808 #define IF_NEW_NODE \   810     for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \   811     if ( j == nProxyNodes )   816   for ( 
int i = 0; i < numNeighbors; ++i ) {
   817     const int proxyNode = patchMap->
basenode(neighbors[i]);
   818     if ( proxyNode != myNode ) {
   820         neighborNodes[nProxyNodes] = proxyNode;
   834   int numNodes = CkNumPes();
   836   int emptyNodes = numNodes - numPatches;
   837   if ( emptyNodes > numPatches ) {
   838     int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
   840     if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
   841     int proxyNode = (myNode + 1) % numNodes;
   842     while ( nProxyNodes < nodesPerPatch &&
   844       if ( proxyNode != myNode ) {
   846           neighborNodes[nProxyNodes] = proxyNode;
   850       proxyNode = (proxyNode + 1) % numNodes;
   852     proxyNode = (myNode - 1 + numNodes) % numNodes;
   853     while ( nProxyNodes < nodesPerPatch &&
   855       if ( proxyNode != myNode ) {
   857           neighborNodes[nProxyNodes] = proxyNode;
   861       proxyNode = (proxyNode - 1 + numNodes) % numNodes;
   863     proxyNode = (myNode + 1) % numNodes;
   865     while ( nProxyNodes < nodesPerPatch ) {
   868           neighborNodes[nProxyNodes] = proxyNode;
   872       proxyNode = (proxyNode + 1) % numNodes;
   873       count ++; 
if (count == numNodes) 
break;   
   876     int proxyNode = myNode - 1;
   878       if ( proxyNode != myNode ) {
   880           neighborNodes[nProxyNodes] = proxyNode;
   885     proxyNode = myNode + 1;
   886     if ( proxyNode < numNodes && ! patchMap->numPatchesOnNode(proxyNode) ) {
   887       if ( proxyNode != myNode ) {
   889           neighborNodes[nProxyNodes] = proxyNode;
 
void UpdateLocalLBInfo(LocalLBInfoMsg *msg)
 
NamdDummyLB * AllocateNamdDummyLB()
 
std::ostream & iINFO(std::ostream &s)
 
NamdCentLB * AllocateNamdCentLB()
 
void setNewNumPartitions(ComputeID cid, char numPartitions)
 
static PatchMap * Object()
 
SimParameters * simParameters
 
#define LDBSTRAT_REFINEONLY
 
std::ostream & endi(std::ostream &s)
 
static double averageLoad
 
NamdHybridLB(const CkLBOptions &opt)
Default constructor. 
 
static Units next(Units u)
 
void splitComputes(SplitComputesMsg *)
 
int numPatches(void) const
 
void CreateNamdHybridLB()
 
int numPartitions(ComputeID cid)
 
void setNewNode(ComputeID cid, NodeID node)
 
const int & LdbIdField(const LdbId &id, const int index)
 
void NAMD_die(const char *err_msg)
 
static LdbCoordinator * Object()
 
int basenode(int pid) const
 
#define LDBSTRAT_COMPREHENSIVE
 
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
 
void work(LDStats *stats)
 
static ComputeMap * Object()
 
int numPids(ComputeID cid)
 
int numPatchesOnNode(int node)
 
void unchecked_insert(InfoRecord *)
 
represents bonded compute 
 
int pid(ComputeID cid, int i)
 
int isOutputProcessor(int pe)