NAMD
Classes | Public Member Functions | Public Attributes | Static Public Attributes | Friends | List of all members
ComputePmeMgr Class Reference
Inheritance diagram for ComputePmeMgr:
ComputePmeUtil

Classes

struct  cuda_submit_charges_args
 

Public Member Functions

 ComputePmeMgr ()
 
 ~ComputePmeMgr ()
 
void initialize (CkQdMsg *)
 
void initialize_pencils (CkQdMsg *)
 
void activate_pencils (CkQdMsg *)
 
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
 
void initialize_computes ()
 
void sendData (Lattice &, int sequence)
 
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
 
void sendPencils (Lattice &, int sequence)
 
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
 
void recvGrid (PmeGridMsg *)
 
void gridCalc1 (void)
 
void sendTransBarrier (void)
 
void sendTransSubset (int first, int last)
 
void sendTrans (void)
 
void fwdSharedTrans (PmeTransMsg *)
 
void recvSharedTrans (PmeSharedTransMsg *)
 
void sendDataHelper (int)
 
void sendPencilsHelper (int)
 
void recvTrans (PmeTransMsg *)
 
void procTrans (PmeTransMsg *)
 
void gridCalc2 (void)
 
void gridCalc2R (void)
 
void fwdSharedUntrans (PmeUntransMsg *)
 
void recvSharedUntrans (PmeSharedUntransMsg *)
 
void sendUntrans (void)
 
void sendUntransSubset (int first, int last)
 
void recvUntrans (PmeUntransMsg *)
 
void procUntrans (PmeUntransMsg *)
 
void gridCalc3 (void)
 
void sendUngrid (void)
 
void sendUngridSubset (int first, int last)
 
void recvUngrid (PmeGridMsg *)
 
void recvAck (PmeAckMsg *)
 
void copyResults (PmeGridMsg *)
 
void copyPencils (PmeGridMsg *)
 
void ungridCalc (void)
 
void recvRecipEvir (PmeEvirMsg *)
 
void addRecipEvirClient (void)
 
void submitReductions ()
 
void chargeGridSubmitted (Lattice &lattice, int sequence)
 
void cuda_submit_charges (Lattice &lattice, int sequence)
 
void sendChargeGridReady ()
 
void pollChargeGridReady ()
 
void pollForcesReady ()
 
void recvChargeGridReady ()
 
void chargeGridReady (Lattice &lattice, int sequence)
 
- Public Member Functions inherited from ComputePmeUtil
 ComputePmeUtil ()
 
 ~ComputePmeUtil ()
 

Public Attributes

Lattice * sendDataHelper_lattice
 
int sendDataHelper_sequence
 
int sendDataHelper_sourcepe
 
int sendDataHelper_errors
 
CmiNodeLock pmemgr_lock
 
float * a_data_host
 
float * a_data_dev
 
float * f_data_host
 
float * f_data_dev
 
int cuda_atoms_count
 
int cuda_atoms_alloc
 
cudaEvent_t end_charges
 
cudaEvent_t * end_forces
 
int forces_count
 
int forces_done_count
 
double charges_time
 
double forces_time
 
int check_charges_count
 
int check_forces_count
 
int master_pe
 
int this_pe
 
int chargeGridSubmittedCount
 
Lattice * saved_lattice
 
int saved_sequence
 
ResizeArray< ComputePme * > pmeComputes
 

Static Public Attributes

static CmiNodeLock fftw_plan_lock
 
static CmiNodeLock cuda_lock
 
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
 
static bool cuda_busy
 
- Static Public Attributes inherited from ComputePmeUtil
static int numGrids
 
static Bool alchOn
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchDecouple
 
static BigReal alchElecLambdaStart
 
static Bool lesOn
 
static int lesFactor
 
static Bool pairOn
 
static Bool selfOn
 
static Bool LJPMEOn
 

Friends

class ComputePme
 
class NodePmeMgr
 

Additional Inherited Members

- Static Public Member Functions inherited from ComputePmeUtil
static void select (void)
 

Detailed Description

Definition at line 383 of file ComputePme.C.

Constructor & Destructor Documentation

◆ ComputePmeMgr()

ComputePmeMgr::ComputePmeMgr ( )

Definition at line 738 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, and this_pe.

738  : pmeProxy(thisgroup),
739  pmeProxyDir(thisgroup) {
740 
741  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
742  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
743  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
744 
745  pmeNodeProxy.ckLocalBranch()->initialize();
746 
747  if ( CmiMyRank() == 0 ) {
748  fftw_plan_lock = CmiCreateLock();
749  }
750  pmemgr_lock = CmiCreateLock();
751 
752  myKSpace = 0;
753  kgrid = 0;
754  work = 0;
755  grid_count = 0;
756  trans_count = 0;
757  untrans_count = 0;
758  ungrid_count = 0;
759  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
760  useBarrier = 0;
761  sendTransBarrier_received = 0;
762  usePencils = 0;
763 
764 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
765  // offload has not been set so this happens on every run
766  if ( CmiMyRank() == 0 ) {
767  cuda_lock = CmiCreateLock();
768  }
769 
770 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
771  int leastPriority, greatestPriority;
772  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
773  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
774  //if ( CkMyNode() == 0 ) {
775  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
776  //}
777 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
778 #else
779 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
780 #endif
781 
782  stream = 0;
783  for ( int i=0; i<NUM_STREAMS; ++i ) {
784 #if 1
785  CUDA_STREAM_CREATE(&streams[i]);
786  cuda_errcheck("cudaStreamCreate");
787 #else
788  streams[i] = 0; // XXXX Testing!!!
789 #endif
790  }
791 
792  this_pe = CkMyPe();
793 
794  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
795  end_forces = 0;
797  check_forces_count = 0;
799 
800  cuda_atoms_count = 0;
801  cuda_atoms_alloc = 0;
802 
803  f_data_mgr_alloc = 0;
804  f_data_mgr_host = 0;
805  f_data_mgr_dev = 0;
806  afn_host = 0;
807  afn_dev = 0;
808 
809 #define CUDA_EVENT_ID_PME_CHARGES 80
810 #define CUDA_EVENT_ID_PME_FORCES 81
811 #define CUDA_EVENT_ID_PME_TICK 82
812 #define CUDA_EVENT_ID_PME_COPY 83
813 #define CUDA_EVENT_ID_PME_KERNEL 84
814  if ( 0 == CkMyPe() ) {
815  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
816  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
817  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
818  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
819  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
820  }
821 #endif
822  recipEvirCount = 0;
823  recipEvirClients = 0;
824  recipEvirPe = -999;
825 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:442
cudaEvent_t end_charges
Definition: ComputePme.C:454
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
#define CUDA_STREAM_CREATE(X)
#define CUDA_EVENT_ID_PME_COPY
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:443
int check_charges_count
Definition: ComputePme.C:460
int cuda_atoms_alloc
Definition: ComputePme.C:451
#define CUDA_EVENT_ID_PME_FORCES
#define CUDA_EVENT_ID_PME_TICK
int chargeGridSubmittedCount
Definition: ComputePme.C:472
int cuda_atoms_count
Definition: ComputePme.C:450
cudaEvent_t * end_forces
Definition: ComputePme.C:455
#define CUDA_EVENT_ID_PME_KERNEL
#define CUDA_EVENT_ID_PME_CHARGES
#define NUM_STREAMS
Definition: ComputePme.C:542
int check_forces_count
Definition: ComputePme.C:461
static CmiNodeLock cuda_lock
Definition: ComputePme.C:452

◆ ~ComputePmeMgr()

ComputePmeMgr::~ComputePmeMgr ( )

Definition at line 1822 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

1822  {
1823 
1824  if ( CmiMyRank() == 0 ) {
1825  CmiDestroyLock(fftw_plan_lock);
1826  }
1827  CmiDestroyLock(pmemgr_lock);
1828 
1829  delete myKSpace;
1830  delete [] localInfo;
1831  delete [] gridNodeInfo;
1832  delete [] transNodeInfo;
1833  delete [] gridPeMap;
1834  delete [] transPeMap;
1835  delete [] recipPeDest;
1836  delete [] gridPeOrder;
1837  delete [] gridNodeOrder;
1838  delete [] transNodeOrder;
1839  delete [] qgrid;
1840  if ( kgrid != qgrid ) delete [] kgrid;
1841  delete [] work;
1842  delete [] gridmsg_reuse;
1843 
1844  if ( ! offload ) {
1845  for (int i=0; i<q_count; ++i) {
1846  delete [] q_list[i];
1847  }
1848  delete [] q_list;
1849  delete [] fz_arr;
1850  }
1851  delete [] f_arr;
1852  delete [] q_arr;
1853 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:442
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:443

Member Function Documentation

◆ activate_pencils()

void ComputePmeMgr::activate_pencils ( CkQdMsg *  msg)

Definition at line 1816 of file ComputePme.C.

1816  {
1817  if ( ! usePencils ) return;
1818  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1819 }

◆ addRecipEvirClient()

void ComputePmeMgr::addRecipEvirClient ( void  )

Definition at line 3064 of file ComputePme.C.

3064  {
3065  ++recipEvirClients;
3066 }

◆ chargeGridReady()

void ComputePmeMgr::chargeGridReady ( Lattice &  lattice,
int  sequence 
)

Definition at line 3626 of file ComputePme.C.

References PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by ComputePme::doWork(), and recvChargeGridReady().

3626  {
3627 
3628 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3629  if ( offload ) {
3630  int errcount = 0;
3631  int q_stride = myGrid.K3+myGrid.order-1;
3632  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3633  f_arr[j] = ffz_host[j];
3634  if ( ffz_host[j] & ~1 ) ++errcount;
3635  }
3636  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3637  }
3638 #endif
3639  recipEvirCount = recipEvirClients;
3640  ungridForcesCount = pmeComputes.size();
3641 
3642  for (int j=0; j<myGrid.order-1; ++j) {
3643  fz_arr[j] |= fz_arr[myGrid.K3+j];
3644  }
3645 
3646  if ( usePencils ) {
3647  sendPencils(lattice,sequence);
3648  } else {
3649  sendData(lattice,sequence);
3650  }
3651 }
int size(void) const
Definition: ResizeArray.h:131
void sendPencils(Lattice &, int sequence)
Definition: ComputePme.C:3809
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void sendData(Lattice &, int sequence)
Definition: ComputePme.C:4036
int K3
Definition: PmeBase.h:21
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482

◆ chargeGridSubmitted()

void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice,
int  sequence 
)

Definition at line 3567 of file ComputePme.C.

References chargeGridSubmittedCount, CUDA_EVENT_ID_PME_COPY, end_charges, master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

3567  {
3568  saved_lattice = &lattice;
3569  saved_sequence = sequence;
3570 
3571  // cudaDeviceSynchronize(); // XXXX TESTING
3572  //int q_stride = myGrid.K3+myGrid.order-1;
3573  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3574  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3575  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3576  // }
3577  //}
3578  //CmiLock(cuda_lock);
3579 
3580  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3581  double before = CmiWallTimer();
3582  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3583  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3584  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3585  cudaMemcpyDeviceToHost, streams[stream]);
3586  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3587  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3588  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3589  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3590  //CmiUnlock(cuda_lock);
3591  // cudaDeviceSynchronize(); // XXXX TESTING
3592  // cuda_errcheck("after memcpy grid to host");
3593 
3595  pmeProxy[master_pe].pollChargeGridReady();
3596  }
3597 }
static Node * Object()
Definition: Node.h:86
Lattice * saved_lattice
Definition: ComputePme.C:475
cudaEvent_t end_charges
Definition: ComputePme.C:454
SimParameters * simParameters
Definition: Node.h:181
#define CUDA_EVENT_ID_PME_COPY
int chargeGridSubmittedCount
Definition: ComputePme.C:472
#define simParams
Definition: Output.C:131
int saved_sequence
Definition: ComputePme.C:476

◆ copyPencils()

void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg)

Definition at line 3872 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3872  {
3873 
3874  int K1 = myGrid.K1;
3875  int K2 = myGrid.K2;
3876  int dim2 = myGrid.dim2;
3877  int dim3 = myGrid.dim3;
3878  int block1 = myGrid.block1;
3879  int block2 = myGrid.block2;
3880 
3881  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3882  int ib = msg->sourceNode / yBlocks;
3883  int jb = msg->sourceNode % yBlocks;
3884 
3885  int ibegin = ib*block1;
3886  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3887  int jbegin = jb*block2;
3888  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3889 
3890  int zlistlen = msg->zlistlen;
3891  int *zlist = msg->zlist;
3892  float *qmsg = msg->qgrid;
3893  int g;
3894  for ( g=0; g<numGrids; ++g ) {
3895  char *f = f_arr + g*fsize;
3896  float **q = q_arr + g*fsize;
3897  for ( int i=ibegin; i<iend; ++i ) {
3898  for ( int j=jbegin; j<jend; ++j ) {
3899  if( f[i*dim2+j] ) {
3900  f[i*dim2+j] = 0;
3901  for ( int k=0; k<zlistlen; ++k ) {
3902  q[i*dim2+j][zlist[k]] = *(qmsg++);
3903  }
3904  for (int h=0; h<myGrid.order-1; ++h) {
3905  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3906  }
3907  }
3908  }
3909  }
3910  }
3911 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
int sourceNode
Definition: ComputePme.C:143
int order
Definition: PmeBase.h:23
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
int zlistlen
Definition: ComputePme.C:149

◆ copyResults()

void ComputePmeMgr::copyResults ( PmeGridMsg *  msg)

Definition at line 4064 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

4064  {
4065 
4066  int zdim = myGrid.dim3;
4067  int flen = msg->len;
4068  int fstart = msg->start;
4069  int zlistlen = msg->zlistlen;
4070  int *zlist = msg->zlist;
4071  float *qmsg = msg->qgrid;
4072  int g;
4073  for ( g=0; g<numGrids; ++g ) {
4074  char *f = msg->fgrid + g*flen;
4075  float **q = q_arr + fstart + g*fsize;
4076  for ( int i=0; i<flen; ++i ) {
4077  if ( f[i] ) {
4078  f[i] = 0;
4079  for ( int k=0; k<zlistlen; ++k ) {
4080  q[i][zlist[k]] = *(qmsg++);
4081  }
4082  for (int h=0; h<myGrid.order-1; ++h) {
4083  q[i][myGrid.K3+h] = q[i][h];
4084  }
4085  }
4086  }
4087  }
4088 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
int order
Definition: PmeBase.h:23
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
int zlistlen
Definition: ComputePme.C:149
char * fgrid
Definition: ComputePme.C:151

◆ cuda_submit_charges()

void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice,
int  sequence 
)

Definition at line 3512 of file ComputePme.C.

References a_data_dev, a_data_host, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

Referenced by ComputePme::doWork().

3512  {
3513 
3514  int n = cuda_atoms_count;
3515  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3516  cuda_atoms_count = 0;
3517 
3518  const double before = CmiWallTimer();
3519  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3520  cudaMemcpyHostToDevice, streams[stream]);
3521  const double after = CmiWallTimer();
3522 
3523  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3524 
3525  cuda_pme_charges(
3526  bspline_coeffs_dev,
3527  q_arr_dev, ffz_dev, ffz_dev + fsize,
3528  a_data_dev, n,
3529  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3530  streams[stream]);
3531  const double after2 = CmiWallTimer();
3532 
3533  chargeGridSubmitted(lattice,sequence); // must be inside lock
3534 
3535  masterPmeMgr->charges_time = before;
3536  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3537  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3538 }
float * a_data_dev
Definition: ComputePme.C:447
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
#define CUDA_EVENT_ID_PME_COPY
int order
Definition: PmeBase.h:23
int K3
Definition: PmeBase.h:21
int cuda_atoms_count
Definition: ComputePme.C:450
#define CUDA_EVENT_ID_PME_KERNEL
double charges_time
Definition: ComputePme.C:458
void chargeGridSubmitted(Lattice &lattice, int sequence)
Definition: ComputePme.C:3567
float * a_data_host
Definition: ComputePme.C:446

◆ fwdSharedTrans()

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg)

Definition at line 2042 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by sendTransSubset().

2042  {
2043  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2044  int pe = transNodeInfo[myTransNode].pe_start;
2045  int npe = transNodeInfo[myTransNode].npe;
2046  CmiNodeLock lock = CmiCreateLock();
2047  int *count = new int; *count = npe;
2048  for (int i=0; i<npe; ++i, ++pe) {
2051  shmsg->msg = msg;
2052  shmsg->count = count;
2053  shmsg->lock = lock;
2054  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2055  }
2056 }
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
PmeTransMsg * msg
Definition: ComputePme.C:171
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
CmiNodeLock lock
Definition: ComputePme.C:173

◆ fwdSharedUntrans()

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg)

Definition at line 2305 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by sendUntransSubset().

2305  {
2306  int pe = gridNodeInfo[myGridNode].pe_start;
2307  int npe = gridNodeInfo[myGridNode].npe;
2308  CmiNodeLock lock = CmiCreateLock();
2309  int *count = new int; *count = npe;
2310  for (int i=0; i<npe; ++i, ++pe) {
2312  shmsg->msg = msg;
2313  shmsg->count = count;
2314  shmsg->lock = lock;
2315  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2316  }
2317 }
CmiNodeLock lock
Definition: ComputePme.C:190
PmeUntransMsg * msg
Definition: ComputePme.C:188

◆ gridCalc1()

void ComputePmeMgr::gridCalc1 ( void  )

Definition at line 1934 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

1934  {
1935  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1936 
1937 #ifdef NAMD_FFTW
1938  for ( int g=0; g<numGrids; ++g ) {
1939 #ifdef NAMD_FFTW_3
1940  fftwf_execute(forward_plan_yz[g]);
1941 #else
1942  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1943  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1944 #endif
1945 
1946  }
1947 #endif
1948 
1949  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1950 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32

◆ gridCalc2()

void ComputePmeMgr::gridCalc2 ( void  )

Definition at line 2110 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

2110  {
2111  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2112 
2113 #if CMK_BLUEGENEL
2114  CmiNetworkProgressAfter (0);
2115 #endif
2116 
2117  int zdim = myGrid.dim3;
2118  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2119  int ny = localInfo[myTransPe].ny_after_transpose;
2120 
2121  for ( int g=0; g<numGrids; ++g ) {
2122  // finish forward FFT (x dimension)
2123 #ifdef NAMD_FFTW
2124 #ifdef NAMD_FFTW_3
2125  fftwf_execute(forward_plan_x[g]);
2126 #else
2127  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2128  ny * zdim / 2, 1, work, 1, 0);
2129 #endif
2130 #endif
2131  }
2132 
2133 #ifdef OPENATOM_VERSION
2134  if ( ! simParams -> openatomOn ) {
2135 #endif // OPENATOM_VERSION
2136  gridCalc2R();
2137 #ifdef OPENATOM_VERSION
2138  } else {
2139  gridCalc2Moa();
2140  }
2141 #endif // OPENATOM_VERSION
2142 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
int ny_after_transpose
Definition: ComputePme.C:261
#define simParams
Definition: Output.C:131
void gridCalc2R(void)
Definition: ComputePme.C:2170

◆ gridCalc2R()

void ComputePmeMgr::gridCalc2R ( void  )

Definition at line 2170 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeKSpace::compute_energy_LJPME(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, ComputeNonbondedUtil::LJewaldcof, ComputePmeUtil::LJPMEOn, ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

2170  {
2171 
2172  int useCkLoop = 0;
2173 #if CMK_SMP && USE_CKLOOP
2174  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2175  && CkNumPes() >= 2 * numTransPes ) {
2176  useCkLoop = 1;
2177  }
2178 #endif
2179 
2180  int zdim = myGrid.dim3;
2181  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2182  int ny = localInfo[myTransPe].ny_after_transpose;
2183 
2184  for ( int g=0; g<numGrids; ++g ) {
2185  // reciprocal space portion of PME
2186  if ( LJPMEOn && 1==g ) {
2188  recip_evir2[g][0] = myKSpace->compute_energy_LJPME(kgrid+qgrid_size*g,
2189  lattice, LJewaldcof, &(recip_evir2[g][1]), useCkLoop);
2190  // CkPrintf("LJ Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2191  } else {
2193  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2194  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2195  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2196  }
2197 
2198  // start backward FFT (x dimension)
2199 
2200 #ifdef NAMD_FFTW
2201 #ifdef NAMD_FFTW_3
2202  fftwf_execute(backward_plan_x[g]);
2203 #else
2204  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2205  ny * zdim / 2, 1, work, 1, 0);
2206 #endif
2207 #endif
2208  }
2209 
2210  pmeProxyDir[CkMyPe()].sendUntrans();
2211 }
static Node * Object()
Definition: Node.h:86
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
double compute_energy(float q_arr[], const Lattice &lattice, double ewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:321
double compute_energy_LJPME(float q_arr[], const Lattice &lattice, double LJewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:545
static Bool LJPMEOn
Definition: ComputePme.h:43
#define CKLOOP_CTRL_PME_KSPACE
Definition: SimParameters.h:99
int ny_after_transpose
Definition: ComputePme.C:261
double BigReal
Definition: common.h:123

◆ gridCalc3()

void ComputePmeMgr::gridCalc3 ( void  )

Definition at line 2379 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

2379  {
2380  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2381 
2382  // finish backward FFT
2383 #ifdef NAMD_FFTW
2384  for ( int g=0; g<numGrids; ++g ) {
2385 #ifdef NAMD_FFTW_3
2386  fftwf_execute(backward_plan_yz[g]);
2387 #else
2388  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2389  (fftw_complex *) (qgrid + qgrid_size * g),
2390  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2391 #endif
2392  }
2393 
2394 #endif
2395 
2396  pmeProxyDir[CkMyPe()].sendUngrid();
2397 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32

◆ initialize()

void ComputePmeMgr::initialize ( CkQdMsg *  msg)

Definition at line 890 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

{
  // ComputePmeMgr::initialize -- startup-stage entry point (driven by
  // quiescence detection; the CkQdMsg carries no payload).  Allocates the
  // per-PE/per-node bookkeeping tables, decides between pencil and slab PME
  // decompositions, assigns the PEs that do grid/transpose work, sizes the
  // charge grids, and builds the FFTW plans.  On the pencil path it creates
  // the pencil chare arrays and returns early; setup then continues in
  // initialize_pencils() at the next startup stage.
  delete msg;

  // Bookkeeping tables sized for the whole machine.
  localInfo = new LocalPmeInfo[CkNumPes()];
  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
  transNodeInfo = new NodePmeInfo[CkNumNodes()];
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];

  // One flag byte per PE, allocated once per process (rank 0 only).
  if (CkMyRank() == 0) {
    pencilPMEProcessors = new char [CkNumPes()];
    memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
  }

  // NOTE(review): original source line 908 (presumably the local simParams
  // binding from Node::Object()->simParameters) is absent from this listing
  // -- confirm against ComputePme.C.
  PatchMap *patchMap = PatchMap::Object();

  offload = simParams->PMEOffload;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // PME offload sanity checks: one device per process and SM >= 2.0.
  if ( offload && ! deviceCUDA->one_device_per_node() ) {
    NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
  }
  if ( offload ) {
    int dev;
    cudaGetDevice(&dev);
    cuda_errcheck("in cudaGetDevice");
    if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cuda_errcheck("in cudaGetDeviceProperties");
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
  }
#endif

  alchLambda = -1.;  // illegal value to catch if not updated
  alchLambda2 = -1.;
  useBarrier = simParams->PMEBarrier;

  // Decide pencil vs. slab decomposition: PMEPencils==0 (or multiple grids)
  // forces slabs, PMEPencils>0 forces pencils, otherwise (PMEPencils<0)
  // choose pencils when they offer clearly more parallelism than slabs.
  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
  else {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int dimx = simParams->PMEGridSizeX;
    int dimy = simParams->PMEGridSizeY;
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
        * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
    else usePencils = 0;
  }

  if ( usePencils ) {
    // Choose the pencil grid dimensions (xBlocks x yBlocks x zBlocks).
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( simParams->PMEPencils > 1 &&
         simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
      xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    } else {
      int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
          * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
      yBlocks = nb2 / nb;
    }

    // Per-axis user overrides.
    if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
    if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
    if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;

    // Round block counts down so no pencil is empty.
    int dimx = simParams->PMEGridSizeX;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;

    int dimy = simParams->PMEGridSizeY;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;

    int dimz = simParams->PMEGridSizeZ / 2 + 1; // complex
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;

    // Each pencil plane must fit on the available PEs.
    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    }
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    }
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << xBlocks << " x " <<
        yBlocks << " x " << zBlocks <<
        " pencil grid for FFT and reciprocal sum.\n" << endi;
    }
  } else { // usePencils

    { // decide how many pes to use for reciprocal sum

      // rules based on work available
      int minslices = simParams->PMEMinSlices;
      int dimx = simParams->PMEGridSizeX;
      int nrpx = ( dimx + minslices - 1 ) / minslices;
      int dimy = simParams->PMEGridSizeY;
      int nrpy = ( dimy + minslices - 1 ) / minslices;

      // rules based on processors available
      int nrpp = CkNumPes();
      // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
      if ( nrpp < nrpx ) nrpx = nrpp;
      if ( nrpp < nrpy ) nrpy = nrpp;

      // user override
      int nrps = simParams->PMEProcessors;
      if ( nrps > CkNumPes() ) nrps = CkNumPes();
      if ( nrps > 0 ) nrpx = nrps;
      if ( nrps > 0 ) nrpy = nrps;

      // make sure there aren't any totally empty processors
      int bx = ( dimx + nrpx - 1 ) / nrpx;
      nrpx = ( dimx + bx - 1 ) / bx;
      int by = ( dimy + nrpy - 1 ) / nrpy;
      nrpy = ( dimy + by - 1 ) / by;
      if ( bx != ( dimx + nrpx - 1 ) / nrpx )
        NAMD_bug("Error in selecting number of PME processors.");
      if ( by != ( dimy + nrpy - 1 ) / nrpy )
        NAMD_bug("Error in selecting number of PME processors.");

      numGridPes = nrpx;
      numTransPes = nrpy;
    }
    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
        " processors for FFT and reciprocal sum.\n" << endi;
    }

    int sum_npes = numTransPes + numGridPes;
    int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;

#if 0 // USE_TOPOMAP
    /* This code is being disabled permanently for slab PME on Blue Gene machines */
    PatchMap * pmap = PatchMap::Object();

    int patch_pes = pmap->numNodesWithPatches();
    TopoManager tmgr;
    if(tmgr.hasMultipleProcsPerNode())
      patch_pes *= 2;

    bool done = false;
    if(CkNumPes() > 2*sum_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, numTransPes);
      done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
    }
    else
      if(CkNumPes() > 2 *max_npes + patch_pes) {
        done = generateBGLORBPmePeList(transPeMap, max_npes);
        gridPeMap = transPeMap;
      }

    if (!done)
#endif
    {
      //generatePmePeList(transPeMap, max_npes);
      //gridPeMap = transPeMap;
      generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
    }

    // Report the first few grid/trans PE assignments.
    if ( ! CkMyPe() ) {
      iout << iINFO << "PME GRID LOCATIONS:";
      int i;
      for ( i=0; i<numGridPes && i<10; ++i ) {
        iout << " " << gridPeMap[i];
      }
      if ( i < numGridPes ) iout << " ...";
      iout << "\n" << endi;
      iout << iINFO << "PME TRANS LOCATIONS:";
      for ( i=0; i<numTransPes && i<10; ++i ) {
        iout << " " << transPeMap[i];
      }
      if ( i < numTransPes ) iout << " ...";
      iout << "\n" << endi;
    }

    // sort based on nodes and physical nodes
    std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());

    // Walk the (node-sorted) PE maps, grouping consecutive PEs on the same
    // node into NodePmeInfo records and locating this PE's own indices.
    myGridPe = -1;
    myGridNode = -1;
    int i = 0;
    int node = -1;
    int real_node = -1;
    for ( i=0; i<numGridPes; ++i ) {
      if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
      int real_node_i = CkNodeOf(gridPeMap[i]);
      if ( real_node_i == real_node ) {
        gridNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        gridNodeInfo[node].real_node = real_node;
        gridNodeInfo[node].pe_start = i;
        gridNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myGridNode = node;
    }
    numGridNodes = node + 1;
    myTransPe = -1;
    myTransNode = -1;
    node = -1;
    real_node = -1;
    for ( i=0; i<numTransPes; ++i ) {
      if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
      int real_node_i = CkNodeOf(transPeMap[i]);
      if ( real_node_i == real_node ) {
        transNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        transNodeInfo[node].real_node = real_node;
        transNodeInfo[node].pe_start = i;
        transNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myTransNode = node;
    }
    numTransNodes = node + 1;

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
        << numTransNodes << " TRANS NODES\n" << endi;
    }

    { // generate random orderings for grid and trans messages
      int i;
      for ( i = 0; i < numGridPes; ++i ) {
        gridPeOrder[i] = i;
      }
      Random rand(CkMyPe());
      if ( myGridPe < 0 ) {
        rand.reorder(gridPeOrder,numGridPes);
      } else { // self last
        gridPeOrder[myGridPe] = numGridPes-1;
        gridPeOrder[numGridPes-1] = myGridPe;
        rand.reorder(gridPeOrder,numGridPes-1);
      }
      for ( i = 0; i < numGridNodes; ++i ) {
        gridNodeOrder[i] = i;
      }
      if ( myGridNode < 0 ) {
        rand.reorder(gridNodeOrder,numGridNodes);
      } else { // self last
        gridNodeOrder[myGridNode] = numGridNodes-1;
        gridNodeOrder[numGridNodes-1] = myGridNode;
        rand.reorder(gridNodeOrder,numGridNodes-1);
      }
      for ( i = 0; i < numTransNodes; ++i ) {
        transNodeOrder[i] = i;
      }
      if ( myTransNode < 0 ) {
        rand.reorder(transNodeOrder,numTransNodes);
      } else { // self last
        transNodeOrder[myTransNode] = numTransNodes-1;
        transNodeOrder[numTransNodes-1] = myTransNode;
        rand.reorder(transNodeOrder,numTransNodes-1);
      }
    }

  } // ! usePencils

  // Common grid geometry; dim3 is padded for the in-place r2c transform.
  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block3 = myGrid.dim3 / 2;  // complex
  }

  if ( usePencils ) {
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex

    int pe = 0;
    int x,y,z;

    SortableResizeArray<int> zprocs(xBlocks*yBlocks);
    SortableResizeArray<int> yprocs(xBlocks*zBlocks);
    SortableResizeArray<int> xprocs(yBlocks*zBlocks);

    // decide which pes to use by bit reversal and patch use
    int i;
    int ncpus = CkNumPes();
    SortableResizeArray<int> patches, nopatches, pmeprocs;
    PatchMap *pmap = PatchMap::Object();
    for ( int icpu=0; icpu<ncpus; ++icpu ) {
      int ri = WorkDistrib::peDiffuseOrdering[icpu];
      if ( ri ) { // keep 0 for special case
        // pretend pe 1 has patches to avoid placing extra PME load on node
        if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
        else nopatches.add(ri);
      }
    }

#if USE_RANDOM_TOPO
    // Randomize both candidate lists.
    Random rand(CkMyPe());
    int *tmp = new int[patches.size()];
    int nn = patches.size();
    for (i=0;i<nn;i++) tmp[i] = patches[i];
    rand.reorder(tmp, nn);
    patches.resize(0);
    for (i=0;i<nn;i++) patches.add(tmp[i]);
    delete [] tmp;
    tmp = new int[nopatches.size()];
    nn = nopatches.size();
    for (i=0;i<nn;i++) tmp[i] = nopatches[i];
    rand.reorder(tmp, nn);
    nopatches.resize(0);
    for (i=0;i<nn;i++) nopatches.add(tmp[i]);
    delete [] tmp;
#endif

    // only use zero if it eliminates overloading or has patches
    int useZero = 0;
    int npens = xBlocks*yBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += xBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += yBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;

    // add nopatches then patches in reversed order
    for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
    if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
    for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
    if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);

    // Deal PEs round-robin to the z, y, and x pencil sets.
    int npes = pmeprocs.size();
    for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
    zprocs.sort();
#endif
    for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
    yprocs.sort();
#endif
    for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
    xprocs.sort();
#endif

#if USE_TOPO_SFC
    // Reorder pencil PEs along a space-filling curve over the torus.
    CmiLock(tmgr_lock);
    //{
    TopoManager tmgr;
    int xdim = tmgr.getDimNX();
    int ydim = tmgr.getDimNY();
    int zdim = tmgr.getDimNZ();
    int xdim1 = find_level_grid(xdim);
    int ydim1 = find_level_grid(ydim);
    int zdim1 = find_level_grid(zdim);
    if(CkMyPe() == 0)
      printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);

    vector<Coord> result;
    SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
    sort_sfc(xprocs, tmgr, result);
    sort_sfc(yprocs, tmgr, result);
    sort_sfc(zprocs, tmgr, result);
    //}
    CmiUnlock(tmgr_lock);
#endif

    if(CkMyPe() == 0){
      iout << iINFO << "PME Z PENCIL LOCATIONS:";
      for ( i=0; i<zprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
        iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << zprocs[i];
#endif
      }
      if ( i < zprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          pencilPMEProcessors[zprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME Y PENCIL LOCATIONS:";
      for ( i=0; i<yprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
        iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << yprocs[i];
#endif
      }
      if ( i < yprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, z = 0; z < zBlocks; ++z )
        for (x = 0; x < xBlocks; ++x, ++pe ) {
          pencilPMEProcessors[yprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME X PENCIL LOCATIONS:";
      for ( i=0; i<xprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
        iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << xprocs[i];
#endif
      }
      if ( i < xprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          pencilPMEProcessors[xprocs[pe]] = 1;
        }
    }

    // creating the pencil arrays
    if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
      // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
      WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
      std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
      std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
#endif
#if 1
      // Static, non-migratable pencil arrays placed via explicit map objects.
      CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
      CProxy_PmePencilMap ym;
      if ( simParams->PMEPencilsYLayout )
        ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
      else
        ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
      CProxy_PmePencilMap xm;
      if ( simParams->PMEPencilsXLayout )
        xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
      else
        xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
      pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
      CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
      CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
      CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
      zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
      yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
      xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
      zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
#else
      zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          zPencil(x,y,0).insert(zprocs[pe]);
        }
      zPencil.doneInserting();

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          yPencil(x,0,z).insert(yprocs[pe]);
        }
      yPencil.doneInserting();

      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          xPencil(0,y,z).insert(xprocs[pe]);
        }
      xPencil.doneInserting();
#endif

      // Broadcast the configuration to every pencil element.
      pmeProxy.recvArrays(xPencil,yPencil,zPencil);
      PmePencilInitMsgData msgdata;
      msgdata.grid = myGrid;
      msgdata.xBlocks = xBlocks;
      msgdata.yBlocks = yBlocks;
      msgdata.zBlocks = zBlocks;
      msgdata.xPencil = xPencil;
      msgdata.yPencil = yPencil;
      msgdata.zPencil = zPencil;
      msgdata.pmeProxy = pmeProxyDir;
      msgdata.pmeNodeProxy = pmeNodeProxy;
      msgdata.xm = xm;
      msgdata.ym = ym;
      msgdata.zm = zm;
      xPencil.init(new PmePencilInitMsg(msgdata));
      yPencil.init(new PmePencilInitMsg(msgdata));
      zPencil.init(new PmePencilInitMsg(msgdata));
    }

    return;  // continue in initialize_pencils() at next startup stage
  }

  // ---- everything below runs only for the slab decomposition ----

  // Partition the x axis over grid PEs and the (transposed) y axis over
  // trans PEs; last PE in each direction absorbs the remainder.
  int pe;
  int nx = 0;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    localInfo[pe].x_start = nx;
    nx += myGrid.block1;
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  }
  int ny = 0;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    localInfo[pe].y_start_after_transpose = ny;
    ny += myGrid.block2;
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
    localInfo[pe].ny_after_transpose =
      ny - localInfo[pe].y_start_after_transpose;
  }

  { // decide how many pes this node exchanges charges with

    PatchMap *patchMap = PatchMap::Object();
    Lattice lattice = simParams->lattice;
    BigReal sysdima = lattice.a_r().unit() * lattice.a();
    BigReal cutoff = simParams->cutoff;
    BigReal patchdim = simParams->patchDimension;
    int numPatches = patchMap->numPatches();
    int numNodes = CkNumPes();
    int *source_flags = new int[numNodes];
    int node;
    for ( node=0; node<numNodes; ++node ) {
      source_flags[node] = 0;
      recipPeDest[node] = 0;
    }

    // // make sure that we don't get ahead of ourselves on this node
    // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
    //   source_flags[CkMyPe()] = 1;
    //   recipPeDest[myRecipPe] = 1;
    // }

    // For every patch, find the range of x grid lines it can spread charge
    // onto (with periodic wrap) and record sender/receiver relationships.
    for ( int pid=0; pid < numPatches; ++pid ) {
      int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
#endif
      int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
      BigReal minx = patchMap->min_a(pid);
      BigReal maxx = patchMap->max_a(pid);
      BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
      // min1 (max1) is smallest (largest) grid line for this patch
      int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
      int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
      for ( int i=min1; i<=max1; ++i ) {
        int ix = i;
        while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
        while ( ix < 0 ) ix += myGrid.K1;
        // set source_flags[pnode] if this patch sends to our node
        if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
             ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
          source_flags[pnode] = 1;
        }
        // set dest_flags[] for node that our patch sends to
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( offload ) {
          if ( pnode == CkNodeFirst(CkMyNode()) ) {
            recipPeDest[ix / myGrid.block1] = 1;
          }
        } else
#endif
        if ( pnode == CkMyPe() ) {
          recipPeDest[ix / myGrid.block1] = 1;
        }
      }
    }

    int numSourcesSamePhysicalNode = 0;
    numSources = 0;
    numDestRecipPes = 0;
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) ++numSources;
      if ( recipPeDest[node] ) ++numDestRecipPes;
      if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    }

#if 0
    if ( numSources ) {
      CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
        CkMyPe(), numSourcesSamePhysicalNode, numSources);
      iout << iINFO << "PME " << CkMyPe() << " sources:";
      for ( node=0; node<numNodes; ++node ) {
        if ( source_flags[node] ) iout << " " << node;
      }
      iout << "\n" << endi;
    }
#endif

    delete [] source_flags;

    // CkPrintf("PME on node %d has %d sources and %d destinations\n",
    //   CkMyPe(), numSources, numDestRecipPes);

  } // decide how many pes this node exchanges charges with (end)

  ungrid_count = numDestRecipPes;

  sendTransBarrier_received = 0;

  if ( myGridPe < 0 && myTransPe < 0 ) return;
  // the following only for nodes doing reciprocal sum

  if ( myTransPe >= 0 ) {
    recipEvirPe = findRecipEvirPe();
    pmeProxy[recipEvirPe].addRecipEvirClient();
  }

  if ( myTransPe >= 0 ) {
    // This PE owns a y slab after the transpose; build its k-space helper.
    int k2_start = localInfo[myTransPe].y_start_after_transpose;
    int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
#ifdef OPENATOM_VERSION
    if ( simParams->openatomOn ) {
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
    } else {
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    }
#else // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#endif // OPENATOM_VERSION
  }

  // Allocate the charge grid(s): one buffer sized for the larger of the
  // grid-phase and trans-phase layouts; kgrid aliases qgrid when this PE
  // is the only one in both roles.
  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  } else {
    kgrid = qgrid;
  }
  qgrid_size = local_size;

  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  }

  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
#ifdef NAMD_FFTW
  // FFTW planning is not thread-safe; serialize across ranks.
  CmiLock(fftw_plan_lock);
#ifdef NAMD_FFTW_3
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz=new fftwf_plan[numGrids];
    backward_plan_yz=new fftwf_plan[numGrids];
  }
  if ( myTransPe >= 0 ) {
    forward_plan_x=new fftwf_plan[numGrids];
    backward_plan_x=new fftwf_plan[numGrids];
  }
  /* need one plan per grid */
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      // In-place 2D r2c over y/z for each of this PE's nx x-planes.
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
          localInfo[myGridPe].nx,
          qgrid + qgrid_size * g,
          NULL,
          1,
          myGrid.dim2 * myGrid.dim3,
          (fftwf_complex *)
          (qgrid + qgrid_size * g),
          NULL,
          1,
          myGrid.dim2 * (myGrid.dim3/2),
          fftwFlags);
    }
  }
  int zdim = myGrid.dim3;
  // NOTE(review): this is evaluated even when myTransPe < 0 (grid-only PE),
  // which reads localInfo[-1]; the value is only used under myTransPe >= 0,
  // but the out-of-bounds read itself is technically UB -- verify upstream.
  int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {

      // In-place 1D c2c over x, strided across this PE's y-slab columns.
      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          FFTW_FORWARD,fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          FFTW_BACKWARD, fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
          localInfo[myGridPe].nx,
          (fftwf_complex *)
          (qgrid + qgrid_size * g),
          NULL,
          1,
          myGrid.dim2*(myGrid.dim3/2),
          qgrid + qgrid_size * g,
          NULL,
          1,
          myGrid.dim2 * myGrid.dim3,
          fftwFlags);
    }
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;

#else
  // Legacy FFTW 2.x path: single plan shared by all grids.
  work = new fftw_complex[n[0]];

  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
#endif
  CmiUnlock(fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  // Initialize message countdowns and zero the charge grid.
  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
}
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:442
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
static void sortPmePes(int *pmepes, int xdim, int ydim)
Definition: WorkDistrib.C:307
int numNodesWithPatches(void)
Definition: PatchMap.h:61
int size(void) const
Definition: ResizeArray.h:131
int dim3
Definition: PmeBase.h:22
CProxy_ComputePmeMgr pmeProxy
Definition: ComputePme.C:245
BigReal max_a(int pid) const
Definition: PatchMap.h:92
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
CProxy_PmeZPencil zPencil
Definition: ComputePme.C:244
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
CProxy_PmeYPencil yPencil
Definition: ComputePme.C:243
CProxy_PmePencilMap zm
Definition: ComputePme.C:249
CProxy_PmePencilMap xm
Definition: ComputePme.C:247
CProxy_NodePmeMgr pmeNodeProxy
Definition: ComputePme.C:246
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int add(const Elem &elem)
Definition: ResizeArray.h:101
void resize(int i)
Definition: ResizeArray.h:84
Definition: Random.h:37
int numPatches(void) const
Definition: PatchMap.h:59
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int block3
Definition: PmeBase.h:24
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes)
Definition: ComputePme.C:320
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
void NAMD_die(const char *err_msg)
Definition: common.C:147
BigReal min_a(int pid) const
Definition: PatchMap.h:91
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
static int findRecipEvirPe()
Definition: ComputePme.C:269
static int * peDiffuseOrdering
Definition: WorkDistrib.h:116
int ny_after_transpose
Definition: ComputePme.C:261
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:131
int K3
Definition: PmeBase.h:21
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
CProxy_PmePencilMap ym
Definition: ComputePme.C:248
int node(int pid) const
Definition: PatchMap.h:114
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
bool one_device_per_node()
Definition: DeviceCUDA.C:553
char * pencilPMEProcessors
Definition: ComputePme.C:135
int64_t int64
Definition: common.h:39
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
double BigReal
Definition: common.h:123
CProxy_PmeXPencil xPencil
Definition: ComputePme.C:242
int y_start_after_transpose
Definition: ComputePme.C:261

◆ initialize_computes()

void ComputePmeMgr::initialize_computes ( )

Definition at line 2765 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, master_pe, NAMD_bug(), ComputePmeUtil::numGrids, PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

2765  {
2766 
2767  noWorkCount = 0;
2768  doWorkCount = 0;
2769  ungridForcesCount = 0;
2770 
 2771  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
 2772 
 2773  SimParameters *simParams = Node::Object()->simParameters;
 2774 
2775  strayChargeErrors = 0;
2776 
2777 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2778  PatchMap *patchMap = PatchMap::Object();
2779  int pe = master_pe = CkNodeFirst(CkMyNode());
2780  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2781  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2782  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2783  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2784  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2786  && pe != deviceCUDA->getMasterPe() ) {
2787  master_pe = pe;
2788  }
2789  }
2790  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2791  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2792  }
2793 
2794  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2795  bool cudaFirst = 1;
2796  if ( offload ) {
2797  CmiLock(cuda_lock);
2798  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2799  }
2800 
2801  if ( cudaFirst ) {
2802  nodePmeMgr->master_pe = master_pe;
2803  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2804  }
2805 #endif
2806 
2807  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2808  fsize = myGrid.K1 * myGrid.dim2;
2809  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2810 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2811  if ( ! offload )
2812 #endif
2813  {
2814  q_arr = new float*[fsize*numGrids];
2815  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2816  q_list = new float*[fsize*numGrids];
2817  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2818  q_count = 0;
2819  }
2820 
2821 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2822  if ( cudaFirst || ! offload ) {
2823 #endif
2824  f_arr = new char[fsize*numGrids];
2825  // memset to non-zero value has race condition on BlueGene/Q
2826  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2827  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2828 
2829  for ( int g=0; g<numGrids; ++g ) {
2830  char *f = f_arr + g*fsize;
2831  if ( usePencils ) {
2832  int K1 = myGrid.K1;
2833  int K2 = myGrid.K2;
2834  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2835  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2836  int dim2 = myGrid.dim2;
2837  for (int ap=0; ap<numPencilsActive; ++ap) {
2838  int ib = activePencils[ap].i;
2839  int jb = activePencils[ap].j;
2840  int ibegin = ib*block1;
2841  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2842  int jbegin = jb*block2;
2843  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2844  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2845  for ( int i=ibegin; i<iend; ++i ) {
2846  for ( int j=jbegin; j<jend; ++j ) {
2847  f[i*dim2+j] = 0;
2848  }
2849  }
2850  }
2851  } else {
2852  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2853  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2854  for (int pe=0; pe<numGridPes; pe++) {
2855  if ( ! recipPeDest[pe] ) continue;
2856  int start = pe * bsize;
2857  int len = bsize;
2858  if ( start >= qsize ) { start = 0; len = 0; }
2859  if ( start + len > qsize ) { len = qsize - start; }
2860  int zdim = myGrid.dim3;
2861  int fstart = start / zdim;
2862  int flen = len / zdim;
2863  memset(f + fstart, 0, flen*sizeof(char));
2864  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2865  }
2866  }
2867  }
2868 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2869  }
2870  if ( offload ) {
2871  cudaSetDevice(deviceCUDA->getDeviceID());
2872  if ( cudaFirst ) {
2873 
2874  int f_alloc_count = 0;
2875  for ( int n=fsize, i=0; i<n; ++i ) {
2876  if ( f_arr[i] == 0 ) {
2877  ++f_alloc_count;
2878  }
2879  }
2880  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2881 
2882  q_arr = new float*[fsize*numGrids];
2883  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2884 
2885  float **q_arr_dev_host = new float*[fsize];
2886  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2887 
2888  float **v_arr_dev_host = new float*[fsize];
2889  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2890 
2891  int q_stride = myGrid.K3+myGrid.order-1;
2892  q_data_size = f_alloc_count * q_stride * sizeof(float);
2893  ffz_size = (fsize + q_stride) * sizeof(int);
2894 
2895  // tack ffz onto end of q_data to allow merged transfer
2896  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2897  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2898  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2899  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2900  cudaMalloc((void**) &v_data_dev, q_data_size);
2901  cuda_errcheck("malloc grid data for pme");
2902  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2903  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2904  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2905  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2906  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2907 
2908  f_alloc_count = 0;
2909  for ( int n=fsize, i=0; i<n; ++i ) {
2910  if ( f_arr[i] == 0 ) {
2911  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2912  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2913  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2914  ++f_alloc_count;
2915  } else {
2916  q_arr[i] = 0;
2917  q_arr_dev_host[i] = 0;
2918  v_arr_dev_host[i] = 0;
2919  }
2920  }
2921 
2922  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2923  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2924  delete [] q_arr_dev_host;
2925  delete [] v_arr_dev_host;
2926  delete [] f_arr;
2927  f_arr = new char[fsize + q_stride];
2928  fz_arr = f_arr + fsize;
2929  memset(f_arr, 0, fsize + q_stride);
2930  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2931 
2932  cuda_errcheck("initialize grid data for pme");
2933 
2934  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2935  cuda_errcheck("initialize bspline coefficients for pme");
2936 
2937 #define XCOPY(X) masterPmeMgr->X = X;
2938  XCOPY(bspline_coeffs_dev)
2939  XCOPY(bspline_dcoeffs_dev)
2940  XCOPY(q_arr)
2941  XCOPY(q_arr_dev)
2942  XCOPY(v_arr_dev)
2943  XCOPY(q_data_size)
2944  XCOPY(q_data_host)
2945  XCOPY(q_data_dev)
2946  XCOPY(v_data_dev)
2947  XCOPY(ffz_size)
2948  XCOPY(ffz_host)
2949  XCOPY(ffz_dev)
2950  XCOPY(f_arr)
2951  XCOPY(fz_arr)
2952 #undef XCOPY
2953  //CkPrintf("pe %d init first\n", CkMyPe());
2954  } else { // cudaFirst
2955  //CkPrintf("pe %d init later\n", CkMyPe());
2956 #define XCOPY(X) X = masterPmeMgr->X;
2957  XCOPY(bspline_coeffs_dev)
2958  XCOPY(bspline_dcoeffs_dev)
2959  XCOPY(q_arr)
2960  XCOPY(q_arr_dev)
2961  XCOPY(v_arr_dev)
2962  XCOPY(q_data_size)
2963  XCOPY(q_data_host)
2964  XCOPY(q_data_dev)
2965  XCOPY(v_data_dev)
2966  XCOPY(ffz_size)
2967  XCOPY(ffz_host)
2968  XCOPY(ffz_dev)
2969  XCOPY(f_arr)
2970  XCOPY(fz_arr)
2971 #undef XCOPY
2972  } // cudaFirst
2973  CmiUnlock(cuda_lock);
2974  } else // offload
2975 #endif // NAMD_CUDA
2976  {
2977  fz_arr = new char[myGrid.K3+myGrid.order-1];
2978  }
2979 
2980 #if 0 && USE_PERSISTENT
2981  recvGrid_handle = NULL;
2982 #endif
2983 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
void cuda_init_bspline_coeffs(float **c, float **dc, int order)
int dim3
Definition: PmeBase.h:22
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:368
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:290
int order
Definition: PmeBase.h:23
int getMasterPe()
Definition: DeviceCUDA.h:137
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int chargeGridSubmittedCount
Definition: ComputePme.C:472
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
#define XCOPY(X)
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:131
int K3
Definition: PmeBase.h:21
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
int i
Definition: ComputePme.C:371
int j
Definition: ComputePme.C:371
static CmiNodeLock cuda_lock
Definition: ComputePme.C:452

◆ initialize_pencils()

void ComputePmeMgr::initialize_pencils ( CkQdMsg *  msg)

Definition at line 1721 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), PatchMap::Object(), Node::Object(), PmeGrid::order, Random::reorder(), Node::simParameters, simParams, and Vector::unit().

1721  {
1722  delete msg;
1723  if ( ! usePencils ) return;
1724 
 1725  SimParameters *simParams = Node::Object()->simParameters;
 1726 
1727  PatchMap *patchMap = PatchMap::Object();
1728  Lattice lattice = simParams->lattice;
1729  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1730  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1731  BigReal cutoff = simParams->cutoff;
1732  BigReal patchdim = simParams->patchDimension;
1733  int numPatches = patchMap->numPatches();
1734 
1735  pencilActive = new char[xBlocks*yBlocks];
1736  for ( int i=0; i<xBlocks; ++i ) {
1737  for ( int j=0; j<yBlocks; ++j ) {
1738  pencilActive[i*yBlocks+j] = 0;
1739  }
1740  }
1741 
1742  for ( int pid=0; pid < numPatches; ++pid ) {
1743  int pnode = patchMap->node(pid);
1744 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1745  if ( offload ) {
1746  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1747  } else
1748 #endif
1749  if ( pnode != CkMyPe() ) continue;
1750 
1751  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1752  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1753 
1754  BigReal minx = patchMap->min_a(pid);
1755  BigReal maxx = patchMap->max_a(pid);
1756  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1757  // min1 (max1) is smallest (largest) grid line for this patch
1758  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1759  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1760 
1761  BigReal miny = patchMap->min_b(pid);
1762  BigReal maxy = patchMap->max_b(pid);
1763  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1764  // min2 (max2) is smallest (largest) grid line for this patch
1765  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1766  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1767 
1768  for ( int i=min1; i<=max1; ++i ) {
1769  int ix = i;
1770  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1771  while ( ix < 0 ) ix += myGrid.K1;
1772  for ( int j=min2; j<=max2; ++j ) {
1773  int jy = j;
1774  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1775  while ( jy < 0 ) jy += myGrid.K2;
1776  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1777  }
1778  }
1779  }
1780 
1781  numPencilsActive = 0;
1782  for ( int i=0; i<xBlocks; ++i ) {
1783  for ( int j=0; j<yBlocks; ++j ) {
1784  if ( pencilActive[i*yBlocks+j] ) {
1785  ++numPencilsActive;
1786 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1787  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1788 #endif
1789  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1790  }
1791  }
1792  }
1793  activePencils = new ijpair[numPencilsActive];
1794  numPencilsActive = 0;
1795  for ( int i=0; i<xBlocks; ++i ) {
1796  for ( int j=0; j<yBlocks; ++j ) {
1797  if ( pencilActive[i*yBlocks+j] ) {
1798  activePencils[numPencilsActive++] = ijpair(i,j);
1799  }
1800  }
1801  }
1802  if ( simParams->PMESendOrder ) {
1803  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1804  } else {
1805  Random rand(CkMyPe());
1806  rand.reorder(activePencils,numPencilsActive);
1807  }
1808  //if ( numPencilsActive ) {
1809  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1810  //}
1811 
1812  ungrid_count = numPencilsActive;
1813 }
static Node * Object()
Definition: Node.h:86
BigReal max_a(int pid) const
Definition: PatchMap.h:92
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
Definition: Random.h:37
int numPatches(void) const
Definition: PatchMap.h:59
int order
Definition: PmeBase.h:23
int getMasterPe()
Definition: DeviceCUDA.h:137
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
NAMD_HOST_DEVICE Vector b_r() const
Definition: Lattice.h:285
BigReal min_a(int pid) const
Definition: PatchMap.h:91
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
NAMD_HOST_DEVICE Vector b() const
Definition: Lattice.h:269
#define simParams
Definition: Output.C:131
BigReal max_b(int pid) const
Definition: PatchMap.h:94
int node(int pid) const
Definition: PatchMap.h:114
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
BigReal min_b(int pid) const
Definition: PatchMap.h:93
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
double BigReal
Definition: common.h:123

◆ pollChargeGridReady()

void ComputePmeMgr::pollChargeGridReady ( )

Definition at line 3613 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

3613  {
3614 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3615  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
 3616  CUDA_POLL(cuda_check_pme_charges,this);
 3617 #else
3618  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3619 #endif
3620 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2505
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void cuda_check_pme_charges(void *arg, double walltime)
Definition: ComputePme.C:3540

◆ pollForcesReady()

void ComputePmeMgr::pollForcesReady ( )

Definition at line 2701 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

2701  {
2702 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2703  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
 2704  CUDA_POLL(cuda_check_pme_forces,this);
 2705 #else
2706  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2707 #endif
2708 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2505
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void cuda_check_pme_forces(void *arg, double walltime)
Definition: ComputePme.C:2512

◆ procTrans()

void ComputePmeMgr::procTrans ( PmeTransMsg msg)

Definition at line 2076 of file ComputePme.C.

References PmeGrid::dim3, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

2076  {
2077  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2078  if ( trans_count == numGridPes ) {
2079  lattice = msg->lattice;
2080  grid_sequence = msg->sequence;
2081  }
2082 
2083  if ( msg->nx ) {
2084  int zdim = myGrid.dim3;
2085  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2086  int first_pe = nodeInfo.pe_start;
2087  int last_pe = first_pe+nodeInfo.npe-1;
2088  int y_skip = localInfo[myTransPe].y_start_after_transpose
2089  - localInfo[first_pe].y_start_after_transpose;
2090  int ny_msg = localInfo[last_pe].y_start_after_transpose
2091  + localInfo[last_pe].ny_after_transpose
2092  - localInfo[first_pe].y_start_after_transpose;
2093  int ny = localInfo[myTransPe].ny_after_transpose;
2094  int x_start = msg->x_start;
2095  int nx = msg->nx;
2096  for ( int g=0; g<numGrids; ++g ) {
2097  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2098  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2099  nx*ny*zdim*sizeof(float));
2100  }
2101  }
2102 
2103  --trans_count;
2104 
2105  if ( trans_count == 0 ) {
2106  pmeProxyDir[CkMyPe()].gridCalc2();
2107  }
2108 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:165
int ny_after_transpose
Definition: ComputePme.C:261
Lattice lattice
Definition: ComputePme.C:162
int y_start_after_transpose
Definition: ComputePme.C:261

◆ procUntrans()

void ComputePmeMgr::procUntrans ( PmeUntransMsg msg)

Definition at line 2337 of file ComputePme.C.

References PmeGrid::dim3, PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

2337  {
2338  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2339 
2340 #if CMK_BLUEGENEL
2341  CmiNetworkProgressAfter (0);
2342 #endif
2343 
2344  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2345  int first_pe = nodeInfo.pe_start;
2346  int g;
2347 
2348  if ( msg->ny ) {
2349  int zdim = myGrid.dim3;
2350  int last_pe = first_pe+nodeInfo.npe-1;
2351  int x_skip = localInfo[myGridPe].x_start
2352  - localInfo[first_pe].x_start;
2353  int nx_msg = localInfo[last_pe].x_start
2354  + localInfo[last_pe].nx
2355  - localInfo[first_pe].x_start;
2356  int nx = localInfo[myGridPe].nx;
2357  int y_start = msg->y_start;
2358  int ny = msg->ny;
2359  int slicelen = myGrid.K2 * zdim;
2360  int cpylen = ny * zdim;
2361  for ( g=0; g<numGrids; ++g ) {
2362  float *q = qgrid + qgrid_size * g + y_start * zdim;
2363  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2364  for ( int x = 0; x < nx; ++x ) {
2365  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2366  q += slicelen;
2367  qmsg += cpylen;
2368  }
2369  }
2370  }
2371 
2372  --untrans_count;
2373 
2374  if ( untrans_count == 0 ) {
2375  pmeProxyDir[CkMyPe()].gridCalc3();
2376  }
2377 }
float * qgrid
Definition: ComputePme.C:182
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32

◆ recvAck()

void ComputePmeMgr::recvAck ( PmeAckMsg msg)

Definition at line 2479 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

2479  {
2480  if ( msg ) delete msg;
2481 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2482  if ( offload ) {
2483  CmiLock(cuda_lock);
2484  if ( ungrid_count == 0 ) {
2485  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2486  }
2487  int uc = --ungrid_count;
2488  CmiUnlock(cuda_lock);
2489 
2490  if ( uc == 0 ) {
2491  pmeProxyDir[master_pe].ungridCalc();
2492  }
2493  return;
2494  }
2495 #endif
2496  --ungrid_count;
2497 
2498  if ( ungrid_count == 0 ) {
2499  pmeProxyDir[CkMyPe()].ungridCalc();
2500  }
2501 }
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static CmiNodeLock cuda_lock
Definition: ComputePme.C:452

◆ recvArrays()

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  x,
CProxy_PmeYPencil  y,
CProxy_PmeZPencil  z 
)

Definition at line 828 of file ComputePme.C.

829  {
830  xPencil = x; yPencil = y; zPencil = z;
831 
832  if(CmiMyRank()==0)
833  {
834  pmeNodeProxy.ckLocalBranch()->xPencil=x;
835  pmeNodeProxy.ckLocalBranch()->yPencil=y;
836  pmeNodeProxy.ckLocalBranch()->zPencil=z;
837  }
838 }

◆ recvChargeGridReady()

void ComputePmeMgr::recvChargeGridReady ( )

Definition at line 3622 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

3622  {
 3623  chargeGridReady(*saved_lattice,saved_sequence);
 3624 }
Lattice * saved_lattice
Definition: ComputePme.C:475
void chargeGridReady(Lattice &lattice, int sequence)
Definition: ComputePme.C:3626
int saved_sequence
Definition: ComputePme.C:476

◆ recvGrid()

void ComputePmeMgr::recvGrid ( PmeGridMsg msg)

Definition at line 1855 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

1855  {
1856  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1857  if ( grid_count == 0 ) {
1858  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1859  }
1860  if ( grid_count == numSources ) {
1861  lattice = msg->lattice;
1862  grid_sequence = msg->sequence;
1863  }
1864 
1865  int zdim = myGrid.dim3;
1866  int zlistlen = msg->zlistlen;
1867  int *zlist = msg->zlist;
1868  float *qmsg = msg->qgrid;
1869  for ( int g=0; g<numGrids; ++g ) {
1870  char *f = msg->fgrid + fgrid_len * g;
1871  float *q = qgrid + qgrid_size * g;
1872  for ( int i=0; i<fgrid_len; ++i ) {
1873  if ( f[i] ) {
1874  for ( int k=0; k<zlistlen; ++k ) {
1875  q[zlist[k]] += *(qmsg++);
1876  }
1877  }
1878  q += zdim;
1879  }
1880  }
1881 
1882  gridmsg_reuse[numSources-grid_count] = msg;
1883  --grid_count;
1884 
1885  if ( grid_count == 0 ) {
1886  pmeProxyDir[CkMyPe()].gridCalc1();
1887  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1888  }
1889 }
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:144
static int numGrids
Definition: ComputePme.h:32
Lattice lattice
Definition: ComputePme.C:146
void NAMD_bug(const char *err_msg)
Definition: common.C:195
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int zlistlen
Definition: ComputePme.C:149
char * fgrid
Definition: ComputePme.C:151

◆ recvRecipEvir()

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg msg)

Definition at line 3068 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), ComputePmeUtil::numGrids, pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

3068  {
3069  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3070  for ( int g=0; g<numGrids; ++g ) {
3071  evir[g] += msg->evir[g];
3072  }
3073  delete msg;
3074  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3075  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3076 }
int size(void) const
Definition: ResizeArray.h:131
static int numGrids
Definition: ComputePme.h:32
PmeReduction * evir
Definition: ComputePme.C:195
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void submitReductions()
Definition: ComputePme.C:4297
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482

◆ recvSharedTrans()

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg msg)

Definition at line 2058 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

2058  {
2059  procTrans(msg->msg);
2060  CmiLock(msg->lock);
2061  int count = --(*msg->count);
2062  CmiUnlock(msg->lock);
2063  if ( count == 0 ) {
2064  CmiDestroyLock(msg->lock);
2065  delete msg->count;
2066  delete msg->msg;
2067  }
2068  delete msg;
2069 }
PmeTransMsg * msg
Definition: ComputePme.C:171
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2076
CmiNodeLock lock
Definition: ComputePme.C:173

◆ recvSharedUntrans()

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg msg)

Definition at line 2319 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

2319  {
2320  procUntrans(msg->msg);
2321  CmiLock(msg->lock);
2322  int count = --(*msg->count);
2323  CmiUnlock(msg->lock);
2324  if ( count == 0 ) {
2325  CmiDestroyLock(msg->lock);
2326  delete msg->count;
2327  delete msg->msg;
2328  }
2329  delete msg;
2330 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2337
CmiNodeLock lock
Definition: ComputePme.C:190
PmeUntransMsg * msg
Definition: ComputePme.C:188

◆ recvTrans()

void ComputePmeMgr::recvTrans ( PmeTransMsg msg)

Definition at line 2071 of file ComputePme.C.

References procTrans().

2071  {
2072  procTrans(msg);
2073  delete msg;
2074 }
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2076

◆ recvUngrid()

void ComputePmeMgr::recvUngrid ( PmeGridMsg msg)

Definition at line 2464 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

2464  {
2465  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2466 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2467  if ( ! offload ) // would need lock
2468 #endif
2469  if ( ungrid_count == 0 ) {
2470  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2471  }
2472 
2473  if ( usePencils ) copyPencils(msg);
2474  else copyResults(msg);
2475  delete msg;
2476  recvAck(0);
2477 }
void recvAck(PmeAckMsg *)
Definition: ComputePme.C:2479
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void copyPencils(PmeGridMsg *)
Definition: ComputePme.C:3872
void copyResults(PmeGridMsg *)
Definition: ComputePme.C:4064

◆ recvUntrans()

void ComputePmeMgr::recvUntrans ( PmeUntransMsg msg)

Definition at line 2332 of file ComputePme.C.

References procUntrans().

2332  {
2333  procUntrans(msg);
2334  delete msg;
2335 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2337

◆ sendChargeGridReady()

void ComputePmeMgr::sendChargeGridReady ( )

Definition at line 3599 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

3599  {
3600  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3601  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3602  int cs = mgr->pmeComputes.size();
3603  if ( cs ) {
3604  mgr->ungridForcesCount = cs;
3605  mgr->recipEvirCount = mgr->recipEvirClients;
3606  masterPmeMgr->chargeGridSubmittedCount++;
3607  }
3608  }
3609  pmeProxy[master_pe].recvChargeGridReady();
3610 }
int size(void) const
Definition: ResizeArray.h:131
int chargeGridSubmittedCount
Definition: ComputePme.C:472
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482

◆ sendData()

void ComputePmeMgr::sendData ( Lattice lattice,
int  sequence 
)

Definition at line 4036 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

4036  {
4037 
4038  sendDataHelper_lattice = &lattice;
4039  sendDataHelper_sequence = sequence;
4040  sendDataHelper_sourcepe = CkMyPe();
4041  sendDataHelper_errors = strayChargeErrors;
4042  strayChargeErrors = 0;
4043 
4044 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4045  if ( offload ) {
4046  for ( int i=0; i < numGridPes; ++i ) {
4047  int pe = gridPeOrder[i]; // different order
4048  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
4049 #if CMK_MULTICORE
4050  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
4051  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
4052 #else
4053  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
4054 #endif
4055  }
4056  } else
4057 #endif
4058  {
4059  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
4060  }
4061 
4062 }
int sendDataHelper_sequence
Definition: ComputePme.C:399
int sendDataHelper_sourcepe
Definition: ComputePme.C:400
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:398
int sendDataHelper_errors
Definition: ComputePme.C:401
void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors)
Definition: ComputePme.C:3914

◆ sendDataHelper()

void ComputePmeMgr::sendDataHelper ( int  iter)

Definition at line 4023 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

4023  {
4024  nodePmeMgr->sendDataHelper(iter);
4025 }
void sendDataHelper(int)
Definition: ComputePme.C:4027

◆ sendDataPart()

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice &  lattice,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3914 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), iout, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

3914  {
3915 
3916  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3917 
3918  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3919 
3920  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3921  for (int j=first; j<=last; j++) {
3922  int pe = gridPeOrder[j]; // different order
3923  if ( ! recipPeDest[pe] && ! errors ) continue;
3924  int start = pe * bsize;
3925  int len = bsize;
3926  if ( start >= qsize ) { start = 0; len = 0; }
3927  if ( start + len > qsize ) { len = qsize - start; }
3928  int zdim = myGrid.dim3;
3929  int fstart = start / zdim;
3930  int flen = len / zdim;
3931  int fcount = 0;
3932  int i;
3933 
3934  int g;
3935  for ( g=0; g<numGrids; ++g ) {
3936  char *f = f_arr + fstart + g*fsize;
3937 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3938  if ( offload ) {
3939  int errcount = 0;
3940  for ( i=0; i<flen; ++i ) {
3941  f[i] = ffz_host[fstart+i];
3942  fcount += f[i];
3943  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3944  }
3945  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3946  } else
3947 #endif
3948  for ( i=0; i<flen; ++i ) {
3949  fcount += f[i];
3950  }
3951  if ( ! recipPeDest[pe] ) {
3952  int errfound = 0;
3953  for ( i=0; i<flen; ++i ) {
3954  if ( f[i] == 3 ) {
3955  errfound = 1;
3956  break;
3957  }
3958  }
3959  if ( errfound ) {
3960  iout << iERROR << "Stray PME grid charges detected: "
3961  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3962  int iz = -1;
3963  for ( i=0; i<flen; ++i ) {
3964  if ( f[i] == 3 ) {
3965  f[i] = 2;
3966  int jz = (i+fstart)/myGrid.K2;
3967  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3968  }
3969  }
3970  iout << "\n" << endi;
3971  }
3972  }
3973  }
3974 
3975 #ifdef NETWORK_PROGRESS
3976  CmiNetworkProgress();
3977 #endif
3978 
3979  if ( ! recipPeDest[pe] ) continue;
3980 
3981  int zlistlen = 0;
3982  for ( i=0; i<myGrid.K3; ++i ) {
3983  if ( fz_arr[i] ) ++zlistlen;
3984  }
3985 
3986  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3987  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3988 
3989  msg->sourceNode = sourcepe;
3990  msg->lattice = lattice;
3991  msg->start = fstart;
3992  msg->len = flen;
3993  msg->zlistlen = zlistlen;
3994  int *zlist = msg->zlist;
3995  zlistlen = 0;
3996  for ( i=0; i<myGrid.K3; ++i ) {
3997  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3998  }
3999  float *qmsg = msg->qgrid;
4000  for ( g=0; g<numGrids; ++g ) {
4001  char *f = f_arr + fstart + g*fsize;
4002  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
4003  float **q = q_arr + fstart + g*fsize;
4004  for ( i=0; i<flen; ++i ) {
4005  if ( f[i] ) {
4006  for (int h=0; h<myGrid.order-1; ++h) {
4007  q[i][h] += q[i][myGrid.K3+h];
4008  }
4009  for ( int k=0; k<zlistlen; ++k ) {
4010  *(qmsg++) = q[i][zlist[k]];
4011  }
4012  }
4013  }
4014  }
4015 
4016  msg->sequence = compute_sequence;
4017  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
4018  pmeProxy[gridPeMap[pe]].recvGrid(msg);
4019  }
4020 
4021 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:144
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
Lattice lattice
Definition: ComputePme.C:146
#define iout
Definition: InfoStream.h:51
int sourceNode
Definition: ComputePme.C:143
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int zlistlen
Definition: ComputePme.C:149
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:151

◆ sendPencils()

void ComputePmeMgr::sendPencils ( Lattice &  lattice,
int  sequence 
)

Definition at line 3809 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), ijpair::i, iERROR(), iout, ijpair::j, PmeGrid::K1, PmeGrid::K2, ComputePmeUtil::numGrids, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

3809  {
3810 
3811  sendDataHelper_lattice = &lattice;
3812  sendDataHelper_sequence = sequence;
3813  sendDataHelper_sourcepe = CkMyPe();
3814 
3815 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3816  if ( offload ) {
3817  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3818 #if CMK_MULTICORE
3819  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3820  int ib = activePencils[ap].i;
3821  int jb = activePencils[ap].j;
3822  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3823  pmeProxy[destproc].sendPencilsHelper(ap);
3824 #else
3825  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3826 #endif
3827  }
3828  } else
3829 #endif
3830  {
3831  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3832  }
3833 
3834  if ( strayChargeErrors ) {
3835  strayChargeErrors = 0;
3836  iout << iERROR << "Stray PME grid charges detected: "
3837  << CkMyPe() << " sending to (x,y)";
3838  int K1 = myGrid.K1;
3839  int K2 = myGrid.K2;
3840  int dim2 = myGrid.dim2;
3841  int block1 = myGrid.block1;
3842  int block2 = myGrid.block2;
3843  for (int ib=0; ib<xBlocks; ++ib) {
3844  for (int jb=0; jb<yBlocks; ++jb) {
3845  int ibegin = ib*block1;
3846  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3847  int jbegin = jb*block2;
3848  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3849  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3850 
3851  for ( int g=0; g<numGrids; ++g ) {
3852  char *f = f_arr + g*fsize;
3853  if ( ! pencilActive[ib*yBlocks+jb] ) {
3854  for ( int i=ibegin; i<iend; ++i ) {
3855  for ( int j=jbegin; j<jend; ++j ) {
3856  if ( f[i*dim2+j] == 3 ) {
3857  f[i*dim2+j] = 2;
3858  iout << " (" << i << "," << j << ")";
3859  }
3860  }
3861  }
3862  }
3863  }
3864  }
3865  }
3866  iout << "\n" << endi;
3867  }
3868 
3869 }
int dim2
Definition: PmeBase.h:22
CProxy_PmePencilMap zm
Definition: ComputePme.C:662
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int sendDataHelper_sequence
Definition: ComputePme.C:399
int sendDataHelper_sourcepe
Definition: ComputePme.C:400
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:398
int i
Definition: ComputePme.C:371
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int j
Definition: ComputePme.C:371
void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe)
Definition: ComputePme.C:3654

◆ sendPencilsHelper()

void ComputePmeMgr::sendPencilsHelper ( int  iter)

Definition at line 3796 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

3796  {
3797  nodePmeMgr->sendPencilsHelper(iter);
3798 }
void sendPencilsHelper(int)
Definition: ComputePme.C:3800

◆ sendPencilsPart()

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice &  lattice,
int  sequence,
int  sourcepe 
)

Definition at line 3654 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

3654  {
3655 
3656  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3657 
3658 #if 0 && USE_PERSISTENT
3659  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3660 #endif
3661  int K1 = myGrid.K1;
3662  int K2 = myGrid.K2;
3663  int dim2 = myGrid.dim2;
3664  int dim3 = myGrid.dim3;
3665  int block1 = myGrid.block1;
3666  int block2 = myGrid.block2;
3667 
3668  // int savedMessages = 0;
3669  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3670 
3671  for (int ap=first; ap<=last; ++ap) {
3672  int ib = activePencils[ap].i;
3673  int jb = activePencils[ap].j;
3674  int ibegin = ib*block1;
3675  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3676  int jbegin = jb*block2;
3677  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3678  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3679 
3680  int fcount = 0;
3681  for ( int g=0; g<numGrids; ++g ) {
3682  char *f = f_arr + g*fsize;
3683 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3684  if ( offload ) {
3685  int errcount = 0;
3686  for ( int i=ibegin; i<iend; ++i ) {
3687  for ( int j=jbegin; j<jend; ++j ) {
3688  int k = i*dim2+j;
3689  f[k] = ffz_host[k];
3690  fcount += f[k];
3691  if ( ffz_host[k] & ~1 ) ++errcount;
3692  }
3693  }
3694  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3695  } else
3696 #endif
3697  for ( int i=ibegin; i<iend; ++i ) {
3698  for ( int j=jbegin; j<jend; ++j ) {
3699  fcount += f[i*dim2+j];
3700  }
3701  }
3702  }
3703 
3704 #ifdef NETWORK_PROGRESS
3705  CmiNetworkProgress();
3706 #endif
3707 
3708  if ( ! pencilActive[ib*yBlocks+jb] )
3709  NAMD_bug("PME activePencils list inconsistent");
3710 
3711  int zlistlen = 0;
3712  for ( int i=0; i<myGrid.K3; ++i ) {
3713  if ( fz_arr[i] ) ++zlistlen;
3714  }
3715 
3716  int hd = ( fcount? 1 : 0 ); // has data?
3717  // if ( ! hd ) ++savedMessages;
3718 
3719 
3720  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3721  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3722  msg->sourceNode = sourcepe;
3723  msg->hasData = hd;
3724  msg->lattice = lattice;
3725  if ( hd ) {
3726 #if 0
3727  msg->start = fstart;
3728  msg->len = flen;
3729 #else
3730  msg->start = -1; // obsolete?
3731  msg->len = -1; // obsolete?
3732 #endif
3733  msg->zlistlen = zlistlen;
3734  int *zlist = msg->zlist;
3735  zlistlen = 0;
3736  for ( int i=0; i<myGrid.K3; ++i ) {
3737  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3738  }
3739  char *fmsg = msg->fgrid;
3740  float *qmsg = msg->qgrid;
3741  for ( int g=0; g<numGrids; ++g ) {
3742  char *f = f_arr + g*fsize;
3743  float **q = q_arr + g*fsize;
3744  for ( int i=ibegin; i<iend; ++i ) {
3745  for ( int j=jbegin; j<jend; ++j ) {
3746  *(fmsg++) = f[i*dim2+j];
3747  if( f[i*dim2+j] ) {
3748  for (int h=0; h<myGrid.order-1; ++h) {
3749  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3750  }
3751  for ( int k=0; k<zlistlen; ++k ) {
3752  *(qmsg++) = q[i*dim2+j][zlist[k]];
3753  }
3754  }
3755  }
3756  }
3757  }
3758  }
3759 
3760  msg->sequence = compute_sequence;
3761  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3762  CmiEnableUrgentSend(1);
3763 #if USE_NODE_PAR_RECEIVE
3764  msg->destElem=CkArrayIndex3D(ib,jb,0);
3765  CProxy_PmePencilMap lzm = npMgr->zm;
3766  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3767  int destnode = CmiNodeOf(destproc);
3768 
3769 #if 0
3770  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3771 #endif
3772  pmeNodeProxy[destnode].recvZGrid(msg);
3773 #if 0
3774  CmiUsePersistentHandle(NULL, 0);
3775 #endif
3776 #else
3777 #if 0
3778  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3779 #endif
3780  zPencil(ib,jb,0).recvGrid(msg);
3781 #if 0
3782  CmiUsePersistentHandle(NULL, 0);
3783 #endif
3784 #endif
3785  CmiEnableUrgentSend(0);
3786  }
3787 
3788 
3789  // if ( savedMessages ) {
3790  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3791  // }
3792 
3793 }
int dim2
Definition: PmeBase.h:22
CProxy_PmePencilMap zm
Definition: ComputePme.C:662
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:144
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:24
Lattice lattice
Definition: ComputePme.C:146
int block2
Definition: PmeBase.h:24
int sourceNode
Definition: ComputePme.C:143
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
CkArrayIndex3D destElem
Definition: ComputePme.C:153
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
int i
Definition: ComputePme.C:371
int zlistlen
Definition: ComputePme.C:149
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:151
int j
Definition: ComputePme.C:371

◆ sendTrans()

void ComputePmeMgr::sendTrans ( void  )

Definition at line 1967 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

1967  {
1968 
1969  untrans_count = numTransPes;
1970 
1971 #if CMK_SMP && USE_CKLOOP
1972  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1973  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1974  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1975  } else
1976 #endif
1977  {
1978  sendTransSubset(0, numTransNodes-1);
1979  }
1980 
1981 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
static void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:1962
#define CKLOOP_CTRL_PME_SENDTRANS
Definition: SimParameters.h:98
void sendTransSubset(int first, int last)
Definition: ComputePme.C:1983

◆ sendTransBarrier()

void ComputePmeMgr::sendTransBarrier ( void  )

Definition at line 1952 of file ComputePme.C.

1952  {
1953  sendTransBarrier_received += 1;
1954  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1955  if ( sendTransBarrier_received < numGridPes ) return;
1956  sendTransBarrier_received = 0;
1957  for ( int i=0; i<numGridPes; ++i ) {
1958  pmeProxyDir[gridPeMap[i]].sendTrans();
1959  }
1960 }

◆ sendTransSubset()

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1983 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), PmeGrid::K2, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, LocalPmeInfo::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendTrans(), and sendTrans().

1983  {
1984  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1985 
1986  // send data for transpose
1987  int zdim = myGrid.dim3;
1988  int nx = localInfo[myGridPe].nx;
1989  int x_start = localInfo[myGridPe].x_start;
1990  int slicelen = myGrid.K2 * zdim;
1991 
1992  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1993 
1994 #if CMK_BLUEGENEL
1995  CmiNetworkProgressAfter (0);
1996 #endif
1997 
1998  for (int j=first; j<=last; j++) {
1999  int node = transNodeOrder[j]; // different order on each node
2000  int pe = transNodeInfo[node].pe_start;
2001  int npe = transNodeInfo[node].npe;
2002  int totlen = 0;
2003  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
2004  LocalPmeInfo &li = localInfo[pe];
2005  int cpylen = li.ny_after_transpose * zdim;
2006  totlen += cpylen;
2007  }
2008  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
2009  PRIORITY_SIZE) PmeTransMsg;
2010  newmsg->sourceNode = myGridPe;
2011  newmsg->lattice = lattice;
2012  newmsg->x_start = x_start;
2013  newmsg->nx = nx;
2014  for ( int g=0; g<numGrids; ++g ) {
2015  float *qmsg = newmsg->qgrid + nx * totlen * g;
2016  pe = transNodeInfo[node].pe_start;
2017  for (int i=0; i<npe; ++i, ++pe) {
2018  LocalPmeInfo &li = localInfo[pe];
2019  int cpylen = li.ny_after_transpose * zdim;
2020  if ( node == myTransNode ) {
2021  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2022  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2023  }
2024  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2025  for ( int x = 0; x < nx; ++x ) {
2026  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2027  q += slicelen;
2028  qmsg += cpylen;
2029  }
2030  }
2031  }
2032  newmsg->sequence = grid_sequence;
2033  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2034  if ( node == myTransNode ) newmsg->nx = 0;
2035  if ( npe > 1 ) {
2036  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2037  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2038  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2039  }
2040 }
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:165
void fwdSharedTrans(PmeTransMsg *)
Definition: ComputePme.C:2042
#define PRIORITY_SIZE
Definition: Priorities.h:13
int sourceNode
Definition: ComputePme.C:159
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
int ny_after_transpose
Definition: ComputePme.C:261
Lattice lattice
Definition: ComputePme.C:162
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
int y_start_after_transpose
Definition: ComputePme.C:261

◆ sendUngrid()

void ComputePmeMgr::sendUngrid ( void  )

Definition at line 2404 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, ComputePmeUtil::numGrids, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

2404  {
2405 
2406 #if CMK_SMP && USE_CKLOOP
2407  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2408  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2409  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2410  } else
2411 #endif
2412  {
2413  sendUngridSubset(0, numSources-1);
2414  }
2415 
2416  grid_count = numSources;
2417  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2418 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
static int numGrids
Definition: ComputePme.h:32
static void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2399
#define CKLOOP_CTRL_PME_SENDUNTRANS
void sendUngridSubset(int first, int last)
Definition: ComputePme.C:2420

◆ sendUngridSubset()

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2420 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::len, ComputePmeUtil::numGrids, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, PmeGridMsg::qgrid, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

2420  {
2421 
2422 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2423  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2424 #else
2425  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2426 #endif
2427 
2428  for ( int j=first; j<=last; ++j ) {
2429  // int msglen = qgrid_len;
2430  PmeGridMsg *newmsg = gridmsg_reuse[j];
2431  int pe = newmsg->sourceNode;
2432  int zdim = myGrid.dim3;
2433  int flen = newmsg->len;
2434  int fstart = newmsg->start;
2435  int zlistlen = newmsg->zlistlen;
2436  int *zlist = newmsg->zlist;
2437  float *qmsg = newmsg->qgrid;
2438  for ( int g=0; g<numGrids; ++g ) {
2439  char *f = newmsg->fgrid + fgrid_len * g;
2440  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2441  for ( int i=0; i<flen; ++i ) {
2442  if ( f[i] ) {
2443  for ( int k=0; k<zlistlen; ++k ) {
2444  *(qmsg++) = q[zlist[k]];
2445  }
2446  }
2447  q += zdim;
2448  }
2449  }
2450  newmsg->sourceNode = myGridPe;
2451 
2452  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2453  CmiEnableUrgentSend(1);
2454 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2455  if ( offload ) {
2456  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2457  } else
2458 #endif
2459  pmeProxyDir[pe].recvUngrid(newmsg);
2460  CmiEnableUrgentSend(0);
2461  }
2462 }
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
#define PME_OFFLOAD_UNGRID_PRIORITY
Definition: Priorities.h:42
int sourceNode
Definition: ComputePme.C:143
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int zlistlen
Definition: ComputePme.C:149
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:151

◆ sendUntrans()

void ComputePmeMgr::sendUntrans ( void  )

Definition at line 2218 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, ComputePmeUtil::numGrids, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

2218  {
2219 
2220  trans_count = numGridPes;
2221 
2222  { // send energy and virial
2223  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2224  for ( int g=0; g<numGrids; ++g ) {
2225  newmsg->evir[g] = recip_evir2[g];
2226  }
2227  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2228  CmiEnableUrgentSend(1);
2229  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2230  CmiEnableUrgentSend(0);
2231  }
2232 
2233 #if CMK_SMP && USE_CKLOOP
2234  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2235  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2236  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2237  } else
2238 #endif
2239  {
2240  sendUntransSubset(0, numGridNodes-1);
2241  }
2242 
2243 }
static Node * Object()
Definition: Node.h:86
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
SimParameters * simParameters
Definition: Node.h:181
static int numGrids
Definition: ComputePme.h:32
void sendUntransSubset(int first, int last)
Definition: ComputePme.C:2245
PmeReduction * evir
Definition: ComputePme.C:195
#define CKLOOP_CTRL_PME_SENDUNTRANS
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
static void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2213

◆ sendUntransSubset()

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2245 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, LocalPmeInfo::x_start, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

2245  {
2246 
2247  int zdim = myGrid.dim3;
2248  int y_start = localInfo[myTransPe].y_start_after_transpose;
2249  int ny = localInfo[myTransPe].ny_after_transpose;
2250  int slicelen = myGrid.K2 * zdim;
2251 
2252  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2253 
2254 #if CMK_BLUEGENEL
2255  CmiNetworkProgressAfter (0);
2256 #endif
2257 
2258  // send data for reverse transpose
2259  for (int j=first; j<=last; j++) {
2260  int node = gridNodeOrder[j]; // different order on each node
2261  int pe = gridNodeInfo[node].pe_start;
2262  int npe = gridNodeInfo[node].npe;
2263  int totlen = 0;
2264  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2265  LocalPmeInfo &li = localInfo[pe];
2266  int cpylen = li.nx * zdim;
2267  totlen += cpylen;
2268  }
2269  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2270  newmsg->sourceNode = myTransPe;
2271  newmsg->y_start = y_start;
2272  newmsg->ny = ny;
2273  for ( int g=0; g<numGrids; ++g ) {
2274  float *qmsg = newmsg->qgrid + ny * totlen * g;
2275  pe = gridNodeInfo[node].pe_start;
2276  for (int i=0; i<npe; ++i, ++pe) {
2277  LocalPmeInfo &li = localInfo[pe];
2278  if ( node == myGridNode ) {
2279  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2280  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2281  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2282  int cpylen = ny * zdim;
2283  for ( int x = 0; x < li.nx; ++x ) {
2284  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2285  q += cpylen;
2286  qmsg += slicelen;
2287  }
2288  } else {
2289  CmiMemcpy((void*)qmsg,
2290  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2291  li.nx*ny*zdim*sizeof(float));
2292  qmsg += li.nx*ny*zdim;
2293  }
2294  }
2295  }
2296  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2297  if ( node == myGridNode ) newmsg->ny = 0;
2298  if ( npe > 1 ) {
2299  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2300  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2301  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2302  }
2303 }
float * qgrid
Definition: ComputePme.C:182
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
#define PRIORITY_SIZE
Definition: Priorities.h:13
void fwdSharedUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2305
int ny_after_transpose
Definition: ComputePme.C:261
#define PME_UNTRANS_PRIORITY
Definition: Priorities.h:33
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
int y_start_after_transpose
Definition: ComputePme.C:261

◆ submitReductions()

void ComputePmeMgr::submitReductions ( )

Definition at line 4297 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchFepOn, ComputePmeUtil::alchOn, ComputePmeUtil::alchThermIntOn, SubmitReduction::item(), ComputePmeUtil::lesFactor, ComputePmeUtil::lesOn, ComputePmeUtil::LJPMEOn, WorkDistrib::messageEnqueueWork(), NAMD_bug(), ComputePmeUtil::numGrids, Node::Object(), ComputePmeUtil::pairOn, REDUCTION_ELECT_ENERGY_PME_TI_1, REDUCTION_ELECT_ENERGY_PME_TI_2, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_LJ_ENERGY_SLOW, REDUCTION_STRAY_CHARGE_ERRORS, ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), and SubmitReduction::submit().

Referenced by ComputePme::doWork(), and recvRecipEvir().

4297  {
4298 
4299  SimParameters *simParams = Node::Object()->simParameters;
4300 
4301  for ( int g=0; g<numGrids; ++g ) {
4302  double scale = 1.;
4303  if (alchOn) {
4304  BigReal elecLambdaUp, elecLambdaDown;
4305  // alchLambda set on each step in ComputePme::ungridForces()
4306  if ( alchLambda < 0 || alchLambda > 1 ) {
4307  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4308  }
4309  elecLambdaUp = simParams->getElecLambda(alchLambda);
4310  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4311  if ( g == 0 ) scale = elecLambdaUp;
4312  else if ( g == 1 ) scale = elecLambdaDown;
4313  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4314  if (alchDecouple) {
4315  if ( g == 2 ) scale = 1-elecLambdaUp;
4316  else if ( g == 3 ) scale = 1-elecLambdaDown;
4317  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4318  }
4319  } else if ( lesOn ) {
4320  scale = 1.0 / lesFactor;
4321  } else if ( pairOn ) {
4322  scale = ( g == 0 ? 1. : -1. );
4323  }
4324  if ( LJPMEOn && 1==g ) {
4325  reduction->item(REDUCTION_LJ_ENERGY_SLOW) += evir[g][0] * scale;
4326  } else {
4327  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4328  }
4329  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4330  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4331  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4332  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4333  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4334  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4335  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4336  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4337  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4338 
4339  if (alchFepOn) {
4340  double scale2 = 0.;
4341  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4342  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4343  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4344  if ( g == 0 ) scale2 = elecLambda2Up;
4345  else if ( g == 1 ) scale2 = elecLambda2Down;
4346  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4347  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4348  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4349  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4350  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4351  }
4352 
4353  if (alchThermIntOn) {
4354 
4355  // no decoupling:
4356  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4357  // (interactions between all atoms [partition 0 OR partition 1],
4358  // minus all [within partition 0])
4359  // U = elecLambdaUp * (U[0] - U[2])
4360  // dU/dl = U[0] - U[2];
4361 
4362  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4363  // (interactions between all atoms [partition 0 OR partition 2],
4364  // minus all [within partition 0])
4365  // U = elecLambdaDown * (U[1] - U[2])
4366  // dU/dl = U[1] - U[2];
4367 
4368  // alchDecouple:
4369  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4370  // (interactions between all atoms [partition 0 OR partition 1]
4371  // minus all [within partition 1] minus all [within partition 0]
4372  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4373  // dU/dl = U[0] - U[2] - U[4];
4374 
4375  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4376  // (interactions between all atoms [partition 0 OR partition 2]
4377  // minus all [within partition 2] minus all [within partition 0]
4378  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4379  // dU/dl = U[1] - U[3] - U[4];
4380 
4381 
4382  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4383  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4384  if (!alchDecouple) {
4385  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4386  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4387  }
4388  else { // alchDecouple
4389  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4390  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4391  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4392  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4393  }
4394  }
4395  }
4396 
4397  alchLambda = -1.; // illegal value to catch if not updated
4398 
4399  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4400  reduction->submit();
4401 
4402  for ( int i=0; i<heldComputes.size(); ++i ) {
4403  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4404  }
4405  heldComputes.resize(0);
4406 }
static Node * Object()
Definition: Node.h:86
int size(void) const
Definition: ResizeArray.h:131
virtual void submit(void)=0
SimParameters * simParameters
Definition: Node.h:181
BigReal & item(int i)
Definition: ReductionMgr.h:336
static int numGrids
Definition: ComputePme.h:32
static Bool alchOn
Definition: ComputePme.h:33
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2866
void resize(int i)
Definition: ResizeArray.h:84
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static Bool LJPMEOn
Definition: ComputePme.h:43
static Bool alchDecouple
Definition: ComputePme.h:36
static int lesFactor
Definition: ComputePme.h:39
#define simParams
Definition: Output.C:131
static Bool pairOn
Definition: ComputePme.h:40
static Bool lesOn
Definition: ComputePme.h:38
static Bool alchFepOn
Definition: ComputePme.h:34
double BigReal
Definition: common.h:123
static Bool alchThermIntOn
Definition: ComputePme.h:35

◆ ungridCalc()

void ComputePmeMgr::ungridCalc ( void  )

Definition at line 2554 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

2554  {
2555  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2556 
2557  ungridForcesCount = pmeComputes.size();
2558 
2559 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2560  if ( offload ) {
2561  //CmiLock(cuda_lock);
2562  cudaSetDevice(deviceCUDA->getDeviceID());
2563 
2564  if ( this == masterPmeMgr ) {
2565  double before = CmiWallTimer();
2566  // XXX prevents something from breaking???
2567  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2568  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2569  // try to make the unspecified launch failures go away
2570  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2571  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2572  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2573 
2574  const int myrank = CkMyRank();
2575  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2576  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2577  nodePmeMgr->mgrObjects[i]->ungridCalc();
2578  }
2579  }
2580  if ( ! pmeComputes.size() ) return;
2581  }
2582 
2583  if ( ! end_forces ) {
2584  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2585  end_forces = new cudaEvent_t[n];
2586  for ( int i=0; i<n; ++i ) {
2587  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2588  }
2589  }
2590 
2591  const int pcsz = pmeComputes.size();
2592  if ( ! afn_host ) {
2593  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2594  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2595  cuda_errcheck("malloc params for pme");
2596  }
2597  int totn = 0;
2598  for ( int i=0; i<pcsz; ++i ) {
2599  int n = pmeComputes[i]->numGridAtoms[0];
2600  totn += n;
2601  }
2602  if ( totn > f_data_mgr_alloc ) {
2603  if ( f_data_mgr_alloc ) {
2604  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2605  cudaFree(f_data_mgr_dev);
2606  cudaFreeHost(f_data_mgr_host);
2607  }
2608  f_data_mgr_alloc = 1.2 * (totn + 100);
2609  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2610  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2611  cuda_errcheck("malloc forces for pme");
2612  }
2613  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2614  float *f_dev = f_data_mgr_dev;
2615  float *f_host = f_data_mgr_host;
2616  for ( int i=0; i<pcsz; ++i ) {
2617  int n = pmeComputes[i]->numGridAtoms[0];
2618  pmeComputes[i]->f_data_dev = f_dev;
2619  pmeComputes[i]->f_data_host = f_host;
2620  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2621  afn_host[3*i+1] = f_dev;
2622  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2623  f_dev += 3*n;
2624  f_host += 3*n;
2625  }
2626  //CmiLock(cuda_lock);
2627  double before = CmiWallTimer();
2628  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2629  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2630  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2631  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2632  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2633  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2634 
2635  for ( int i=0; i<pcsz; ++i ) {
2636  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2637  if ( i%EVENT_STRIDE == 0 ) {
2638  int dimy = pcsz - i;
2639  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2640  int maxn = 0;
2641  int subtotn = 0;
2642  for ( int j=0; j<dimy; ++j ) {
2643  int n = pmeComputes[i+j]->numGridAtoms[0];
2644  subtotn += n;
2645  if ( n > maxn ) maxn = n;
2646  }
2647  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2648  before = CmiWallTimer();
2649  cuda_pme_forces(
2650  bspline_coeffs_dev,
2651  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2652  pmeComputes[i]->a_data_dev,
2653  pmeComputes[i]->f_data_dev,
2654  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2655  streams[stream]);
2656  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2657  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2658  before = CmiWallTimer();
2659  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2660  cudaMemcpyDeviceToHost, streams[stream]);
2661 #if 0
2662  cudaDeviceSynchronize();
2663  fprintf(stderr, "i = %d\n", i);
2664  for(int k=0; k < subtotn*3; k++)
2665  {
2666  fprintf(stderr, "f_data_host[%d][%d] = %f\n", i, k,
2667  pmeComputes[i]->f_data_host[k]);
2668  }
2669 #endif
2670  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2671  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2672  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2673  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2674  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2675  }
2676  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2677  }
2678  //CmiUnlock(cuda_lock);
2679  } else
2680 #endif // NAMD_CUDA
2681  {
2682  for ( int i=0; i<pmeComputes.size(); ++i ) {
2684  // pmeComputes[i]->ungridForces();
2685  }
2686  }
2687  // submitReductions(); // must follow all ungridForces()
2688 
2689 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2690  if ( offload ) {
2691  forces_time = CmiWallTimer();
2692  forces_count = ungridForcesCount;
2693  forces_done_count = 0;
2694  pmeProxy[this_pe].pollForcesReady();
2695  }
2696 #endif
2697 
2698  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2699 }
double forces_time
Definition: ComputePme.C:459
int size(void) const
Definition: ResizeArray.h:131
float * a_data_dev
Definition: ComputePme.C:447
#define EVENT_STRIDE
Definition: ComputePme.C:2506
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
#define CUDA_EVENT_ID_PME_COPY
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2866
float * f_data_host
Definition: ComputePme.C:448
int order
Definition: PmeBase.h:23
#define CUDA_EVENT_ID_PME_TICK
float * f_data_dev
Definition: ComputePme.C:449
void ungridCalc(void)
Definition: ComputePme.C:2554
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int getDeviceID()
Definition: DeviceCUDA.h:144
int K3
Definition: PmeBase.h:21
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482
cudaEvent_t * end_forces
Definition: ComputePme.C:455
#define CUDA_EVENT_ID_PME_KERNEL
int forces_done_count
Definition: ComputePme.C:457

Friends And Related Function Documentation

◆ ComputePme

friend class ComputePme
friend

Definition at line 385 of file ComputePme.C.

◆ NodePmeMgr

friend class NodePmeMgr
friend

Definition at line 386 of file ComputePme.C.

Member Data Documentation

◆ a_data_dev

float* ComputePmeMgr::a_data_dev

Definition at line 447 of file ComputePme.C.

Referenced by cuda_submit_charges(), ComputePme::doWork(), and ungridCalc().

◆ a_data_host

float* ComputePmeMgr::a_data_host

Definition at line 446 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ComputePme::doWork().

◆ chargeGridSubmittedCount

int ComputePmeMgr::chargeGridSubmittedCount

◆ charges_time

double ComputePmeMgr::charges_time

Definition at line 458 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

◆ check_charges_count

int ComputePmeMgr::check_charges_count

Definition at line 460 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

◆ check_forces_count

int ComputePmeMgr::check_forces_count

Definition at line 461 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

◆ cuda_atoms_alloc

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 451 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ComputePme::doWork().

◆ cuda_atoms_count

int ComputePmeMgr::cuda_atoms_count

◆ cuda_busy

bool ComputePmeMgr::cuda_busy
static

Definition at line 470 of file ComputePme.C.

Referenced by ComputePme::doWork().

◆ cuda_lock

CmiNodeLock ComputePmeMgr::cuda_lock
static

Definition at line 452 of file ComputePme.C.

Referenced by ComputePmeMgr(), ComputePme::doWork(), initialize_computes(), and recvAck().

◆ cuda_submit_charges_deque

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque
static

Definition at line 469 of file ComputePme.C.

Referenced by ComputePme::doWork().

◆ end_charges

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 454 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

◆ end_forces

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 455 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

◆ f_data_dev

float* ComputePmeMgr::f_data_dev

Definition at line 449 of file ComputePme.C.

Referenced by ungridCalc().

◆ f_data_host

float* ComputePmeMgr::f_data_host

Definition at line 448 of file ComputePme.C.

Referenced by ungridCalc().

◆ fftw_plan_lock

CmiNodeLock ComputePmeMgr::fftw_plan_lock
static

◆ forces_count

int ComputePmeMgr::forces_count

Definition at line 456 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ forces_done_count

int ComputePmeMgr::forces_done_count

Definition at line 457 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ forces_time

double ComputePmeMgr::forces_time

Definition at line 459 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ master_pe

int ComputePmeMgr::master_pe

◆ pmeComputes

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

◆ pmemgr_lock

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 443 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

◆ saved_lattice

Lattice* ComputePmeMgr::saved_lattice

Definition at line 475 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

◆ saved_sequence

int ComputePmeMgr::saved_sequence

◆ sendDataHelper_errors

int ComputePmeMgr::sendDataHelper_errors

Definition at line 401 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

◆ sendDataHelper_lattice

Lattice* ComputePmeMgr::sendDataHelper_lattice

◆ sendDataHelper_sequence

int ComputePmeMgr::sendDataHelper_sequence

◆ sendDataHelper_sourcepe

int ComputePmeMgr::sendDataHelper_sourcepe

◆ this_pe

int ComputePmeMgr::this_pe

Definition at line 463 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file: