// Setup fragment: a load-balancer ticket (seqno) is taken from LdbInfra::Object()
// and passed through CkLBOptions to create the NamdCentLB proxy kept in loadbalancer.
8 #if !defined(WIN32) || defined(__CYGWIN__)    15 #include "NamdCentLB.def.h"    28   int seqno = LdbInfra::Object()->getLoadbalancerTicket();
    29   loadbalancer = CProxy_NamdCentLB::ckNew(CkLBOptions(seqno));
    // Only when CkMyRank() is 0 and cpuloads is still NULL: zero one entry per PE
    // (CkNumPes() entries). NOTE(review): the allocation of cpuloads is not visible here.
    31   if (CkMyRank() == 0 && 
cpuloads == NULL) {    
    34     for (
int i=0; i<CkNumPes(); i++) 
cpuloads[i] = 0.0;
    // Returns a heap-allocated NamdCentLB built through its CkMigrateMessage*
    // constructor with a null message.
    39   return new NamdCentLB((CkMigrateMessage*)NULL);
    // QueryBalanceNow: takes a step value (_step) and returns a bool.
    // NOTE(review): the body is not visible here, so the decision rule is left undocumented.
    69 bool NamdCentLB::QueryBalanceNow(
int _step)
    // QueryDumpData: takes no arguments and returns a bool.
    // NOTE(review): the body is not visible here.
    79 bool NamdCentLB::QueryDumpData()
    // Body of the balancing pass (signature not visible here; presumably
    // NamdCentLB::Strategy(LDStats*) -- confirm). Visible steps: build the
    // processor/patch/compute arrays from stats, call a placement algorithm,
    // then pack every changed placement into a CLBMigrateMsg.
    93   int numProcessors = stats->nprocs();
   // Allocate each work array only if it does not exist yet; all three are
   // released with delete [] near the end of this block.
   100   if ( ! processorArray ) processorArray = 
new processorInfo[numProcessors];
   101   if ( ! patchArray ) patchArray = 
new patchInfo[numPatches];
   102   if ( ! computeArray ) computeArray = 
new computeInfo[numComputes];
   // buildData fills the arrays and returns how many computes may be moved.
   104   int nMoveableComputes = buildData(stats);
   // NOTE(review): the preprocessor conditions around these macro definitions and
   // around the dump/load calls are not visible here.
   107 #define DUMP_LDBDATA 1   108 #define LOAD_LDBDATA 1   112   dumpDataASCII(
"ldbd_before", numProcessors, numPatches, nMoveableComputes);
   114   loadDataASCII(
"ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
   // Statistics over the movable computes: maxCompute/maxi track the largest load
   // seen, avgCompute is total / nMoveableComputes (skipped when there are none).
   119   double avgCompute = 0.;
   120   if ( nMoveableComputes ) {
   123    double maxCompute = 0.;
   125    for (i=0; i<nMoveableComputes; i++) {
   126       double load = computeArray[i].
load;
   128       if ( load > maxCompute ) { maxCompute = load;  maxi = i; }
   130    avgCompute = total / nMoveableComputes;
   // Scan the processors' available flag; the run is aborted through NAMD_die
   // when numPesAvailable is still 0 afterwards.
   132     int P = stats->nprocs();
   133    int numPesAvailable = 0;
   134    for (i=0; i<P; i++) {
   135       if (processorArray[i].available) {
   140    if (numPesAvailable == 0)
   141      NAMD_die(
"No processors available for load balancing!\n");
   144    CkPrintf(
"LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
   147    CkPrintf(
"LDB: Average compute %f is %.1f%% of average load %f\n",
   // Partition planning. The target load per part (maxCompute) is raised to at
   // least 2 * avgCompute; each compute then gets ceil(load / maxCompute) parts,
   // clamped to [1, maxParts]; totalAddedParts sums the extra parts.
   // NOTE(review): the extent of the #if below is not visible here.
   156 #if defined(NAMD_CUDA) || defined(NAMD_HIP)   162     int totalAddedParts = 0;
   164     if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
   165     if ( 
simParams->ldbRelativeGrainsize > 0. ) {
   168     CkPrintf(
"LDB: Partitioning computes with target load %f\n", maxCompute);
   169     double maxUnsplit = 0.;
   170     for (
int i=0; i<nMoveableComputes; i++) {
   172       const int cid = 
LdbIdField(computeArray[i].handle.id, 0);
   173       const double load = computeArray[i].
load;
   175         if ( load > maxUnsplit ) maxUnsplit = load;
   178       int nparts = (int) ceil(load / maxCompute);
   179       if ( nparts > maxParts ) nparts = maxParts;
   180       if ( nparts < 1 ) nparts = 1;
   // The constant 0 in this condition disables the per-compute printout.
   181       if ( 0 && nparts > 1 ) {
   182         CkPrintf(
"LDB: Partitioning compute %d with load %f by %d\n",
   186       totalAddedParts += nparts - 1;
   188     CkPrintf(
"LDB: Increased migratable compute count from %d to %d\n",
   189               nMoveableComputes,nMoveableComputes+totalAddedParts);
   190     CkPrintf(
"LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
   // Placement: TorusLB, Alg7 or RefineOnly is invoked on the three arrays.
   // NOTE(review): the conditions selecting among them are not visible here.
   193       TorusLB(computeArray, patchArray, processorArray,
   194                   nMoveableComputes, numPatches, numProcessors);
   197                   nMoveableComputes, numPatches, numProcessors, 1);
   199     TorusLB(computeArray, patchArray, processorArray,
   200                   nMoveableComputes, numPatches, numProcessors);
   203                   nMoveableComputes, numPatches, numProcessors, 1);
   206       Alg7(computeArray, patchArray, processorArray,
   207                   nMoveableComputes, numPatches, numProcessors);
   209       RefineOnly(computeArray, patchArray, processorArray, 
   210                   nMoveableComputes, numPatches, numProcessors);
   // Hop accounting: hops accumulates tmgr.getHopsBetweenRanks(pe1, pe2) over the
   // patches and the total is printed. It starts under LDB_DEBUG && USE_TOPOMAP;
   // the matching #endif is not visible here.
   213 #if LDB_DEBUG && USE_TOPOMAP   215   int pe1, pe2, pe3, hops=0;
   226   for (
int i=0; i<numPatches; i++)  {
   233       hops += tmgr.getHopsBetweenRanks(pe1, pe2);
   237   CkPrintf(
"Load Balancing: Number of Hops: %d\n", hops);
   241   dumpDataASCII(
"ldbd_after", numProcessors, numPatches, nMoveableComputes);
   243   dumpDataASCII(
"ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
   // Sanity report: count the computes assigned to each processor and warn about
   // every processor that ended up with none.
   252   int* computeCount = 
new int[numProcessors];
   253   for(i=0; i<numProcessors; i++)
   255   for(i=0; i<nMoveableComputes; i++)
   256     computeCount[computeArray[i].processor]++;
   257   for(i=0; i<numProcessors; i++) {
   258     if (computeCount[i]==0)
   259       iout << 
iINFO <<
"Warning: Processor " << i 
   260            << 
" has NO moveable computes.\n" << 
endi;
   262   delete [] computeCount;
   // One MigrateInfo is created for each compute whose new processor differs
   // from its old one.
   265   std::vector<MigrateInfo *> migrateInfo;
   266   for(i=0;i<nMoveableComputes;i++) {
   267     if (computeArray[i].processor != computeArray[i].oldProcessor) {
   271       MigrateInfo *migrateMe = 
new MigrateInfo;
   272       migrateMe->obj = computeArray[i].
handle;
   274       migrateMe->to_pe = computeArray[i].
processor;
   275       migrateInfo.push_back(migrateMe);
   // Copy the collected records by value into the message and null each vector slot.
   // NOTE(review): the delete of item is not visible here -- confirm it exists,
   // otherwise every MigrateInfo leaks.
   283   const int migrate_count=migrateInfo.size();
   285   CLBMigrateMsg* msg = 
new(migrate_count,CkNumPes(),CkNumPes(),0) CLBMigrateMsg;
   287   msg->n_moves = migrate_count;
   288   for(i=0; i < migrate_count; i++) {
   289     MigrateInfo* item = migrateInfo[i];
   290     msg->moves[i] = *item;
   292     migrateInfo[i] = 
nullptr;
   295   for (i=0; i<numProcessors; i++) {
   // Release the work arrays; processorArray is reset to NULL so the lazy
   // allocation at the top of this block runs again on the next pass.
   299   delete [] processorArray;
   300   delete [] patchArray;
   301   delete [] computeArray;
   303   processorArray = NULL;
   // dumpDataASCII: writes the load-balancing tables as text to the file
   // "<file>.<step()>". The first line written holds the three counts
   // numProcessors, numPatches and numComputes.
   312 void NamdCentLB::dumpDataASCII(
char *file, 
int numProcessors,
   313                                int numPatches, 
int numComputes)
   // NOTE(review): sprintf is unbounded and the declaration of filename is not
   // visible here -- confirm the buffer can hold file plus the ".<step>" suffix.
   316   sprintf(filename, 
"%s.%d", file, step());
   317   FILE* fp = fopen(filename,
"w");
   // Presumably reached only when fopen failed (the guarding test is not visible).
   319      perror(
"dumpLDStatsASCII");
   322   CkPrintf(
"***** DUMP data to file: %s ***** \n", filename);
   323   fprintf(fp,
"%d %d %d\n",numProcessors,numPatches,numComputes);
   // One loop each over processors, patches and computes; the loop bodies are
   // not visible here.
   326   for(i=0;i<numProcessors;i++) {
   331   for(i=0;i < numPatches; i++) {
   336   for(i=0; i < numComputes; i++) {
   // Per processor: "<index> <num>: " followed by the Id of each patchInfo
   // obtained through processorArray[i].proxies.
   344   for (i=0; i< numProcessors; i++) {
   346       fprintf(fp, 
"%d %d: ", i, num);
   351           fprintf(fp, 
"%d ", p->
Id);
   352           p = (
patchInfo *)processorArray[i].proxies.
   // Per patch: "<index> <num>: " followed by a list of Ids.
   358   for (i=0; i<numPatches; i++)  {
   360     fprintf(fp, 
"%d %d: ", i, num);
   365         fprintf(fp, 
"%d ", p->
Id);
   // loadDataASCII: reads the load-balancing tables back from the text file named
   // by file. The three counts are reference parameters and are overwritten with
   // the values read from the file's first line.
   376 void NamdCentLB::loadDataASCII(
char *file, 
int &numProcessors,
   377                                int &numPatches, 
int &numComputes)
   // NOTE(review): unbounded copy into filename, whose declaration is not visible here.
   381   sprintf(filename, 
"%s", file);
   383   CkPrintf(
"***** Load ascii data from file: %s ***** \n", filename);
   385   FILE* fp = fopen(filename, 
"r");
   // Presumably reached only when fopen failed (the guarding test is not visible).
   387      perror(
"loadDataASCII");
   // NOTE(review): the fscanf result is discarded here, so a short or malformed
   // header leaves the counts unchanged without any error.
   391   fscanf(fp,
"%d %d %d",&numProcessors,&numPatches,&numComputes);
   393   printf(
"numProcs: %d numPatches: %d numComputes: %d\n", numProcessors,numPatches, numComputes);
   // The existing arrays are discarded; their re-allocation is not visible here.
   395   delete [] processorArray;
   396   delete [] patchArray;
   397   delete [] computeArray;
   // Each section below aborts through CmiAbort on a failed check; for
   // processors the visible check is that the stored Id equals the loop index.
   403   for(i=0;i<numProcessors;i++) {
   407     if (p->
Id != i) CmiAbort(
"Reading processorArray error!");
   411   for(i=0;i < numPatches; i++) {
   415       CmiAbort(
"Reading patchArray error!");
   418   for(i=0; i < numComputes; i++) {
   424       CmiAbort(
"Reading computeArray error!");
   // Per-processor list: header "<curp> <num>: " then num entries.
   429   for (i=0; i< numProcessors; i++) {
   431       fscanf(fp,
"%d %d: ",&curp, &num);
   433         CmiAbort(
"Reading patchsSet error!");
   434       for (
int j=0; j<num; j++) {
   // Per-patch list: header "<curp> <num>: " then num entries.
   441   for (i=0; i<numPatches; i++)  {
   443       fscanf(fp,
"%d %d: ",&curp, &num);
   445         CmiAbort(
"Reading proxiesOn error!");
   446       for (
int j=0; j<num; j++) {
   // External declaration of isMICProcessor(int), which buildData calls per PE.
   // NOTE(review): the extent of the surrounding #ifdef/#if is not visible here.
   458 #ifdef MEM_OPT_VERSION   461 #if defined(NAMD_MIC)   462 extern int isMICProcessor(
int);
   // buildData: fills processorArray, patchArray and computeArray from the
   // collected statistics in stats.
   // Returns nMoveableComputes, the number of entries written to computeArray.
   465 int NamdCentLB::buildData(LDStats* stats)
   467   int n_pes = stats->nprocs();
   479   int unLoadZero = 
simParams->ldbUnloadZero;
   481   int unLoadIO= 
simParams->ldbUnloadOutputPEs;
   // Per-PE setup: Id, background load (bg_walltime scaled by pmebgfactor,
   // homebgfactor or bgfactor; the selecting conditions are not visible here)
   // and idle time.
   483   for (i=0; i<n_pes; ++i) {
   484     processorArray[i].
Id = i;
   487       processorArray[i].
backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
   489       processorArray[i].
backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
   491       processorArray[i].
backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
   493     processorArray[i].
idleTime = stats->procs[i].idletime;
   // Optionally mark PE 0 and PE 1 unavailable.
   // NOTE(review): no guard for n_pes > 1 is visible before processorArray[1]
   // is written -- confirm.
   538   if (unLoadZero) processorArray[0].
available = 
false;
   539   if (unLoadOne) processorArray[1].
available = 
false;
   // With PME on and unLoadPme set, selected PEs are marked unavailable; the
   // per-PE test is not visible here.
   542   if (pmeOn && unLoadPme) {
   543     for (i=0; i<n_pes; i++) {
   552   if (pmeOn && unLoadPme) {
   553     for (i=0; i<n_pes; i++) {
   555         processorArray[i].available = 
false;
   559 #ifdef MEM_OPT_VERSION   562       if (
simParams->numoutputprocs == n_pes) {
   569       for (i=0; i<n_pes; i++) {
   // NAMD_MIC builds: every PE for which isMICProcessor(i) is non-zero is
   // marked unavailable.
   584   #if defined(NAMD_MIC)   586       for (i = 0; i < n_pes; i++) {
   587         if (isMICProcessor(i) != 0) { processorArray[i].
available = 
false; }
   592   int nMoveableComputes=0;
   594   int nIdleComputes = 0;
   // Walk every object record in stats; from_proc[j] is the PE the object is on.
   597   const auto nObjs = stats->objData.size();
   598   for (j=0; j < nObjs; j++) {
   599       const LDObjData &this_obj = stats->objData[j];
   600       int frompe = stats->from_proc[j];
   // Objects whose omID().id.idx is not 1 appear to be charged to the background
   // load of their PE (intermediate lines are not visible here).
   603       if (this_obj.omID().id.idx != 1) {
   606         processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
   // Patch record: Id and owning processor.
   614         patchArray[pid].
Id = pid;
   616         patchArray[pid].
processor = stats->from_proc[j];
   // numProxies comes from requiredProxiesOnProcGrid or requiredProxies, both of
   // which fill neighborNodes; the selecting condition is not visible here.
   617         const int numProxies = 
   619         requiredProxiesOnProcGrid(pid,neighborNodes);
   621         requiredProxies(pid, neighborNodes);
   624         nProxies += numProxies;
   626         for (
int k=0; k<numProxies; k++) {
   630         processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
   632         processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
   633       } 
else if (this_obj.migratable) { 
   634        if ( this_obj.wallTime == 0. ) { 
   // p0 is the compute's first patch; p1 is its second, taken only when
   // numPids(cid) > 1.
   638         const int p0 = computeMap->
pid(cid,0);
   642         if (computeMap->
numPids(cid) > 1)
   643           p1 = computeMap->
pid(cid,1);
   // Record the movable compute: its current PE, its two patches, its handle
   // and its measured load. processor is set to -1 here and the load is also
   // added to the PE's computeLoad.
   645         computeArray[nMoveableComputes].
Id = cid;
   646         computeArray[nMoveableComputes].
oldProcessor = stats->from_proc[j];
   647         processorArray[stats->from_proc[j]].
computeLoad += this_obj.wallTime;
   648         computeArray[nMoveableComputes].
processor = -1;
   649         computeArray[nMoveableComputes].
patch1 = p0;
   650         computeArray[nMoveableComputes].
patch2 = p1;
   651         computeArray[nMoveableComputes].
handle = this_obj.handle;
   652         computeArray[nMoveableComputes].
load = this_obj.wallTime;
   // Presumably the remaining (non-migratable) case: the load is counted as
   // background -- TODO confirm, since the branch header is not visible.
   656         processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
   661      CkPrintf(
"LDB: %d computes have load of zero\n", nIdleComputes);
   679   for (i=0; i<n_pes; i++) {
   683   return nMoveableComputes;
   // requiredProxies: collects into neighborNodes[] the nodes chosen as proxy
   // nodes for patch id; nProxyNodes is the running count of entries written.
   // NOTE(review): the return statement is not visible here (presumably
   // nProxyNodes -- confirm).
   690 int NamdCentLB::requiredProxies(
PatchID id, 
int neighborNodes[])
   693   int myNode = patchMap->
node(
id);
   // IF_NEW_NODE: linear scan of neighborNodes[0..nProxyNodes); the statement
   // that follows the macro runs only when proxyNode is not already listed.
   696 #define IF_NEW_NODE \   698     for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \   699     if ( j == nProxyNodes )
   // Base nodes of the patch's neighbours, excluding the patch's own node.
   704   for ( 
int i = 0; i < numNeighbors; ++i ) {
   705     const int proxyNode = patchMap->
basenode(neighbors[i]);
   706     if ( proxyNode != myNode ) {
   708         neighborNodes[nProxyNodes] = proxyNode;
   // When empty nodes outnumber patches, more nodes are added up to
   // nodesPerPatch (capped at maxNodesPerPatch). PE numbers are walked upward
   // from myNode, then downward, then upward again, wrapping modulo numPes.
   // NOTE(review): the second half of each while condition is not visible here.
   722   int numPes = CkNumPes();
   724   int emptyNodes = numPes - numPatches;
   725   if ( emptyNodes > numPatches ) {
   726     int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
   728     if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
   729     int proxyNode = (myNode + 1) % numPes;
   730     while ( nProxyNodes < nodesPerPatch &&
   732       if ( proxyNode != myNode ) {
   734           neighborNodes[nProxyNodes] = proxyNode;
   738       proxyNode = (proxyNode + 1) % numPes;
   740     proxyNode = (myNode - 1 + numPes) % numPes;
   741     while ( nProxyNodes < nodesPerPatch &&
   743       if ( proxyNode != myNode ) {
   745           neighborNodes[nProxyNodes] = proxyNode;
   749       proxyNode = (proxyNode - 1 + numPes) % numPes;
   751     proxyNode = (myNode + 1) % numPes;
   753     while ( nProxyNodes < nodesPerPatch ) {
   756           neighborNodes[nProxyNodes] = proxyNode;
   760       proxyNode = (proxyNode + 1) % numPes;
   // count stops the third walk after numPes steps.
   761       count ++; 
if (count == numPes) 
break;   
   // Presumably the alternative branch (its header is not visible): only the PE
   // just below and the PE just above myNode are considered. The upper one is
   // taken only if it is in range and has no patches.
   764     int proxyNode = myNode - 1;
   766       if ( proxyNode != myNode ) {
   768           neighborNodes[nProxyNodes] = proxyNode;
   773     proxyNode = myNode + 1;
   774     if ( proxyNode < numPes && ! patchMap->numPatchesOnNode(proxyNode) ) {
   775       if ( proxyNode != myNode ) {
   777           neighborNodes[nProxyNodes] = proxyNode;
   // requiredProxiesOnProcGrid: variant of requiredProxies that picks proxy nodes
   // by their coordinates in the processor grid reported by tmgr. Chosen nodes
   // are written to neighborNodes[] and counted in nProxyNodes.
   // Falls back to requiredProxies(id, neighborNodes) when the product of the
   // grid dimensions does not equal CkNumPes().
   // NOTE(review): the final return statement is not visible here.
   795 int NamdCentLB::requiredProxiesOnProcGrid(
PatchID id, 
int neighborNodes[])
   // proxyNodes holds one No/Yes flag per PE recording which nodes were already
   // chosen; it is freed with delete [] on every exit path visible here.
   797   enum proxyHere { No, Yes };
   798   int numPes = CkNumPes();
   799   proxyHere *proxyNodes = 
new proxyHere[numPes];
   803   int xsize = 0, ysize = 0, zsize = 0, tsize = 0;
   804   int my_x = 0, my_y = 0, my_z = 0, my_t = 0;
   807   int myNode = patchMap->
node(
id);
   // Grid dimensions and this node's coordinates, both taken from tmgr.
   810   xsize = tmgr.getDimNX();
   811   ysize = tmgr.getDimNY();
   812   zsize = tmgr.getDimNZ();
   813   tsize = tmgr.getDimNT();
   815   tmgr.rankToCoordinates(myNode, my_x, my_y, my_z, my_t);
   // If the grid does not cover exactly CkNumPes() ranks, use the non-grid version.
   817   if(xsize * ysize * zsize * tsize != CkNumPes()) {
   818     delete [] proxyNodes;
   819     return requiredProxies(
id, neighborNodes);
   823   for ( i = 0; i < numPes; ++i )
   // smallFlag is true when there are more patches than PEs.
   842   bool smallFlag = 
false;
   843   double pnodes = CkNumPes();
   845   smallFlag = (patchMap->
numPatches() > pnodes )?1:0;
   // Base nodes of the neighbours. The loop starts at index 1, so neighbors[0]
   // is skipped (how neighbors is filled is not visible here).
   850   for ( i = 1; i < numNeighbors; ++i )
   852       int proxyNode = patchMap->
basenode(neighbors[i]);
   854       if (proxyNode != myNode)
   855         if (proxyNodes[proxyNode] == No)
   857             proxyNodes[proxyNode] = Yes;
   858             neighborNodes[nProxyNodes] = proxyNode;
   865     delete [] proxyNodes;
   // Target number of proxy nodes for this patch; it grows with the number of
   // empty nodes per patch.
   872   int emptyNodes = numPes - numPatches;
   875   int nodesPerPatch = nProxyNodes + 4 * (emptyNodes-1) / numPatches + 1;
   877   int proxy_x=0, proxy_y=0, proxy_z=0;
   // First sweep: offsets -1..1 in x, y and z with wraparound, and every value
   // of the fourth coordinate l. The node's own position (all offsets 0, l == 0)
   // is tested explicitly. Each loop level re-tests nProxyNodes >= nodesPerPatch
   // (the rest of those conditions is not visible here).
   883   for(k=-1; k<= 1; k++) {
   884     proxy_z = (my_z + k + zsize) % zsize;
   885     for(j=-1; j <= 1; j++) {
   886       proxy_y = (my_y + j + ysize) % ysize;
   887       for(i = -1; i <= 1; i++) {
   888         proxy_x = (my_x + i + xsize) % xsize;
   889         for(l = 0; l < tsize; l++) {
   890           if(i == 0 && j == 0 && k == 0 && l == 0)
   893           proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
   896              proxyNodes[proxyNode] == No) {
   897             proxyNodes[proxyNode] = Yes;
   898             neighborNodes[nProxyNodes] = proxyNode;
   902           if(nProxyNodes >= nodesPerPatch || 
   907         if(nProxyNodes >= nodesPerPatch || 
   912       if(nProxyNodes >= nodesPerPatch || 
   917     if(nProxyNodes >= nodesPerPatch || 
   // Second sweep: same structure with offsets -2, 0 and +2 in each of x, y and z.
   924     for(k=-2; k<= 2; k+=2) {
   925       proxy_z = (my_z + k + zsize) % zsize;
   926       for(j=-2; j <= 2; j+=2) {
   927         proxy_y = (my_y + j + ysize) % ysize;
   928         for(i = -2; i <= 2; i+=2) {
   929           proxy_x = (my_x + i + xsize) % xsize;
   930           for(l = 0; l < tsize; l++) {
   931             if(i == 0 && j == 0 && k == 0 && l == 0)
   934             proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
   937                proxyNodes[proxyNode] == No) {
   938               proxyNodes[proxyNode] = Yes;
   939               neighborNodes[nProxyNodes] = proxyNode;
   943             if(nProxyNodes >= nodesPerPatch || 
   948           if(nProxyNodes >= nodesPerPatch || 
   953         if(nProxyNodes >= nodesPerPatch || 
   958       if(nProxyNodes >= nodesPerPatch || 
   // Remaining candidates: the nodes two steps away along a single axis, in the
   // order +y, -y, +z, -z, +x, -x. coordinatesToRank is called with three
   // coordinates here. NOTE(review): the conditions guarding each candidate are
   // not visible here.
   971       proxy_y = (my_y + 2) % ysize;
   972       proxy_x = my_x  % xsize;
   973       proxy_z = my_z  % zsize;
   975       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
   976       if(proxyNodes[proxyNode] == No) {
   977         proxyNodes[proxyNode] = Yes;
   978         neighborNodes[nProxyNodes] = proxyNode;
   982       proxy_y = (my_y - 2 + ysize) % ysize;
   983       proxy_x = my_x  % xsize;
   984       proxy_z = my_z % zsize;
   986       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
   987       if(proxyNodes[proxyNode] == No) {
   988         proxyNodes[proxyNode] = Yes;
   989         neighborNodes[nProxyNodes] = proxyNode;
   996       proxy_y = my_y  % ysize;
   997       proxy_x = my_x  % xsize;
   998       proxy_z = (my_z + 2) % zsize;
  1000       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
  1001       if(proxyNodes[proxyNode] == No) {
  1002         proxyNodes[proxyNode] = Yes;
  1003         neighborNodes[nProxyNodes] = proxyNode;
  1007       proxy_y = my_y  % ysize;
  1008       proxy_x = my_x  % xsize;
  1009       proxy_z = (my_z - 2 + zsize) % zsize;
  1011       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
  1012       if(proxyNodes[proxyNode] == No) {
  1013         proxyNodes[proxyNode] = Yes;
  1014         neighborNodes[nProxyNodes] = proxyNode;
  1021       proxy_y = my_y  % ysize;
  1022       proxy_x = (my_x + 2) % xsize;
  1023       proxy_z = my_z  % zsize;
  1025       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
  1026       if(proxyNodes[proxyNode] == No) {
  1027         proxyNodes[proxyNode] = Yes;
  1028         neighborNodes[nProxyNodes] = proxyNode;
  1032       proxy_y = my_y  % ysize;
  1033       proxy_x = (my_x  - 2 + xsize) % xsize;
  1034       proxy_z = my_z % zsize;
  1036       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
  1037       if(proxyNodes[proxyNode] == No) {
  1038         proxyNodes[proxyNode] = Yes;
  1039         neighborNodes[nProxyNodes] = proxyNode;
  1049   delete [] proxyNodes;
 
std::ostream & iINFO(std::ostream &s)
 
NamdCentLB * AllocateNamdCentLB()
 
void setNewNumPartitions(ComputeID cid, char numPartitions)
 
static PatchMap * Object()
 
SimParameters * simParameters
 
#define LDBSTRAT_REFINEONLY
 
std::ostream & endi(std::ostream &s)
 
CLBMigrateMsg * Strategy(LDStats *stats)
 
static double averageLoad
 
void insert(InfoRecord *)
 
static Units next(Units u)
 
int numPatches(void) const
 
int numaway_c(void) const
 
int numPartitions(ComputeID cid)
 
int numaway_a(void) const
 
void setNewNode(ComputeID cid, NodeID node)
 
const int & LdbIdField(const LdbId &id, const int index)
 
void NAMD_die(const char *err_msg)
 
static LdbCoordinator * Object()
 
int basenode(int pid) const
 
#define LDBSTRAT_COMPREHENSIVE
 
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
 
static ComputeMap * Object()
 
int numaway_b(void) const
 
int numPids(ComputeID cid)
 
int numPatchesOnNode(int node)
 
void unchecked_insert(InfoRecord *)
 
represents bonded compute 
 
NamdCentLB(const CkLBOptions &opt)
 
int pid(ComputeID cid, int i)
 
int isOutputProcessor(int pe)