NAMD
Classes | Public Member Functions | Public Attributes | Static Public Attributes | Friends | List of all members
ComputePmeMgr Class Reference
Inheritance diagram for ComputePmeMgr:
ComputePmeUtil

Classes

struct  cuda_submit_charges_args
 

Public Member Functions

 ComputePmeMgr ()
 
 ~ComputePmeMgr ()
 
void initialize (CkQdMsg *)
 
void initialize_pencils (CkQdMsg *)
 
void activate_pencils (CkQdMsg *)
 
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
 
void initialize_computes ()
 
void sendData (Lattice &, int sequence)
 
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
 
void sendPencils (Lattice &, int sequence)
 
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
 
void recvGrid (PmeGridMsg *)
 
void gridCalc1 (void)
 
void sendTransBarrier (void)
 
void sendTransSubset (int first, int last)
 
void sendTrans (void)
 
void fwdSharedTrans (PmeTransMsg *)
 
void recvSharedTrans (PmeSharedTransMsg *)
 
void sendDataHelper (int)
 
void sendPencilsHelper (int)
 
void recvTrans (PmeTransMsg *)
 
void procTrans (PmeTransMsg *)
 
void gridCalc2 (void)
 
void gridCalc2R (void)
 
void fwdSharedUntrans (PmeUntransMsg *)
 
void recvSharedUntrans (PmeSharedUntransMsg *)
 
void sendUntrans (void)
 
void sendUntransSubset (int first, int last)
 
void recvUntrans (PmeUntransMsg *)
 
void procUntrans (PmeUntransMsg *)
 
void gridCalc3 (void)
 
void sendUngrid (void)
 
void sendUngridSubset (int first, int last)
 
void recvUngrid (PmeGridMsg *)
 
void recvAck (PmeAckMsg *)
 
void copyResults (PmeGridMsg *)
 
void copyPencils (PmeGridMsg *)
 
void ungridCalc (void)
 
void recvRecipEvir (PmeEvirMsg *)
 
void addRecipEvirClient (void)
 
void submitReductions ()
 
void chargeGridSubmitted (Lattice &lattice, int sequence)
 
void cuda_submit_charges (Lattice &lattice, int sequence)
 
void sendChargeGridReady ()
 
void pollChargeGridReady ()
 
void pollForcesReady ()
 
void recvChargeGridReady ()
 
void chargeGridReady (Lattice &lattice, int sequence)
 
- Public Member Functions inherited from ComputePmeUtil
 ComputePmeUtil ()
 
 ~ComputePmeUtil ()
 

Public Attributes

Lattice * sendDataHelper_lattice
 
int sendDataHelper_sequence
 
int sendDataHelper_sourcepe
 
int sendDataHelper_errors
 
CmiNodeLock pmemgr_lock
 
float * a_data_host
 
float * a_data_dev
 
float * f_data_host
 
float * f_data_dev
 
int cuda_atoms_count
 
int cuda_atoms_alloc
 
cudaEvent_t end_charges
 
cudaEvent_t * end_forces
 
int forces_count
 
int forces_done_count
 
double charges_time
 
double forces_time
 
int check_charges_count
 
int check_forces_count
 
int master_pe
 
int this_pe
 
int chargeGridSubmittedCount
 
Lattice * saved_lattice
 
int saved_sequence
 
ResizeArray< ComputePme * > pmeComputes
 

Static Public Attributes

static CmiNodeLock fftw_plan_lock
 
static CmiNodeLock cuda_lock
 
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
 
static bool cuda_busy
 
- Static Public Attributes inherited from ComputePmeUtil
static int numGrids
 
static Bool alchOn
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchDecouple
 
static BigReal alchElecLambdaStart
 
static Bool lesOn
 
static int lesFactor
 
static Bool pairOn
 
static Bool selfOn
 
static Bool LJPMEOn
 

Friends

class ComputePme
 
class NodePmeMgr
 

Additional Inherited Members

- Static Public Member Functions inherited from ComputePmeUtil
static void select (void)
 

Detailed Description

Definition at line 383 of file ComputePme.C.

Constructor & Destructor Documentation

◆ ComputePmeMgr()

ComputePmeMgr::ComputePmeMgr ( )

Definition at line 738 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, and this_pe.

738  : pmeProxy(thisgroup),
739  pmeProxyDir(thisgroup) {
740 
741  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
742  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
743  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
744 
745  pmeNodeProxy.ckLocalBranch()->initialize();
746 
747  if ( CmiMyRank() == 0 ) {
748  fftw_plan_lock = CmiCreateLock();
749  }
750  pmemgr_lock = CmiCreateLock();
751 
752  myKSpace = 0;
753  kgrid = 0;
754  work = 0;
755  grid_count = 0;
756  trans_count = 0;
757  untrans_count = 0;
758  ungrid_count = 0;
759  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
760  useBarrier = 0;
761  sendTransBarrier_received = 0;
762  usePencils = 0;
763 
764 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
765  // offload has not been set so this happens on every run
766  if ( CmiMyRank() == 0 ) {
767  cuda_lock = CmiCreateLock();
768  }
769 
770 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
771  int leastPriority, greatestPriority;
772  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
773  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
774  //if ( CkMyNode() == 0 ) {
775  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
776  //}
777 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
778 #else
779 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
780 #endif
781 
782  stream = 0;
783  for ( int i=0; i<NUM_STREAMS; ++i ) {
784 #if 1
785  CUDA_STREAM_CREATE(&streams[i]);
786  cuda_errcheck("cudaStreamCreate");
787 #else
788  streams[i] = 0; // XXXX Testing!!!
789 #endif
790  }
791 
792  this_pe = CkMyPe();
793 
794  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
795  end_forces = 0;
797  check_forces_count = 0;
799 
800  cuda_atoms_count = 0;
801  cuda_atoms_alloc = 0;
802 
803  f_data_mgr_alloc = 0;
804  f_data_mgr_host = 0;
805  f_data_mgr_dev = 0;
806  afn_host = 0;
807  afn_dev = 0;
808 
809 #define CUDA_EVENT_ID_PME_CHARGES 80
810 #define CUDA_EVENT_ID_PME_FORCES 81
811 #define CUDA_EVENT_ID_PME_TICK 82
812 #define CUDA_EVENT_ID_PME_COPY 83
813 #define CUDA_EVENT_ID_PME_KERNEL 84
814  if ( 0 == CkMyPe() ) {
815  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
816  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
817  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
818  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
819  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
820  }
821 #endif
822  recipEvirCount = 0;
823  recipEvirClients = 0;
824  recipEvirPe = -999;
825 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:442
cudaEvent_t end_charges
Definition: ComputePme.C:454
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
#define CUDA_STREAM_CREATE(X)
#define CUDA_EVENT_ID_PME_COPY
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:443
int check_charges_count
Definition: ComputePme.C:460
int cuda_atoms_alloc
Definition: ComputePme.C:451
#define CUDA_EVENT_ID_PME_FORCES
#define CUDA_EVENT_ID_PME_TICK
int chargeGridSubmittedCount
Definition: ComputePme.C:472
int cuda_atoms_count
Definition: ComputePme.C:450
cudaEvent_t * end_forces
Definition: ComputePme.C:455
#define CUDA_EVENT_ID_PME_KERNEL
#define CUDA_EVENT_ID_PME_CHARGES
#define NUM_STREAMS
Definition: ComputePme.C:542
int check_forces_count
Definition: ComputePme.C:461
static CmiNodeLock cuda_lock
Definition: ComputePme.C:452

◆ ~ComputePmeMgr()

ComputePmeMgr::~ComputePmeMgr ( )

Definition at line 1822 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

1822  {
1823 
1824  if ( CmiMyRank() == 0 ) {
1825  CmiDestroyLock(fftw_plan_lock);
1826  }
1827  CmiDestroyLock(pmemgr_lock);
1828 
1829  delete myKSpace;
1830  delete [] localInfo;
1831  delete [] gridNodeInfo;
1832  delete [] transNodeInfo;
1833  delete [] gridPeMap;
1834  delete [] transPeMap;
1835  delete [] recipPeDest;
1836  delete [] gridPeOrder;
1837  delete [] gridNodeOrder;
1838  delete [] transNodeOrder;
1839  delete [] qgrid;
1840  if ( kgrid != qgrid ) delete [] kgrid;
1841  delete [] work;
1842  delete [] gridmsg_reuse;
1843 
1844  if ( ! offload ) {
1845  for (int i=0; i<q_count; ++i) {
1846  delete [] q_list[i];
1847  }
1848  delete [] q_list;
1849  delete [] fz_arr;
1850  }
1851  delete [] f_arr;
1852  delete [] q_arr;
1853 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:442
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:443

Member Function Documentation

◆ activate_pencils()

void ComputePmeMgr::activate_pencils ( CkQdMsg *  msg)

Definition at line 1816 of file ComputePme.C.

1816  {
1817  if ( ! usePencils ) return;
1818  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1819 }

◆ addRecipEvirClient()

void ComputePmeMgr::addRecipEvirClient ( void  )

Definition at line 3064 of file ComputePme.C.

3064  {
3065  ++recipEvirClients;
3066 }

◆ chargeGridReady()

void ComputePmeMgr::chargeGridReady ( Lattice &  lattice,
int  sequence 
)

Definition at line 3626 of file ComputePme.C.

References PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by ComputePme::doWork(), and recvChargeGridReady().

3626  {
3627 
3628 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3629  if ( offload ) {
3630  int errcount = 0;
3631  int q_stride = myGrid.K3+myGrid.order-1;
3632  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3633  f_arr[j] = ffz_host[j];
3634  if ( ffz_host[j] & ~1 ) ++errcount;
3635  }
3636  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3637  }
3638 #endif
3639  recipEvirCount = recipEvirClients;
3640  ungridForcesCount = pmeComputes.size();
3641 
3642  for (int j=0; j<myGrid.order-1; ++j) {
3643  fz_arr[j] |= fz_arr[myGrid.K3+j];
3644  }
3645 
3646  if ( usePencils ) {
3647  sendPencils(lattice,sequence);
3648  } else {
3649  sendData(lattice,sequence);
3650  }
3651 }
int size(void) const
Definition: ResizeArray.h:131
void sendPencils(Lattice &, int sequence)
Definition: ComputePme.C:3809
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void sendData(Lattice &, int sequence)
Definition: ComputePme.C:4036
int K3
Definition: PmeBase.h:21
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482

◆ chargeGridSubmitted()

void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice,
int  sequence 
)

Definition at line 3567 of file ComputePme.C.

References chargeGridSubmittedCount, CUDA_EVENT_ID_PME_COPY, end_charges, master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

3567  {
3568  saved_lattice = &lattice;
3569  saved_sequence = sequence;
3570 
3571  // cudaDeviceSynchronize(); // XXXX TESTING
3572  //int q_stride = myGrid.K3+myGrid.order-1;
3573  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3574  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3575  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3576  // }
3577  //}
3578  //CmiLock(cuda_lock);
3579 
3580  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3581  double before = CmiWallTimer();
3582  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3583  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3584  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3585  cudaMemcpyDeviceToHost, streams[stream]);
3586  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3587  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3588  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3589  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3590  //CmiUnlock(cuda_lock);
3591  // cudaDeviceSynchronize(); // XXXX TESTING
3592  // cuda_errcheck("after memcpy grid to host");
3593 
3595  pmeProxy[master_pe].pollChargeGridReady();
3596  }
3597 }
static Node * Object()
Definition: Node.h:86
Lattice * saved_lattice
Definition: ComputePme.C:475
cudaEvent_t end_charges
Definition: ComputePme.C:454
SimParameters * simParameters
Definition: Node.h:181
#define CUDA_EVENT_ID_PME_COPY
int chargeGridSubmittedCount
Definition: ComputePme.C:472
#define simParams
Definition: Output.C:131
int saved_sequence
Definition: ComputePme.C:476

◆ copyPencils()

void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg)

Definition at line 3872 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3872  {
3873 
3874  int K1 = myGrid.K1;
3875  int K2 = myGrid.K2;
3876  int dim2 = myGrid.dim2;
3877  int dim3 = myGrid.dim3;
3878  int block1 = myGrid.block1;
3879  int block2 = myGrid.block2;
3880 
3881  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3882  int ib = msg->sourceNode / yBlocks;
3883  int jb = msg->sourceNode % yBlocks;
3884 
3885  int ibegin = ib*block1;
3886  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3887  int jbegin = jb*block2;
3888  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3889 
3890  int zlistlen = msg->zlistlen;
3891  int *zlist = msg->zlist;
3892  float *qmsg = msg->qgrid;
3893  int g;
3894  for ( g=0; g<numGrids; ++g ) {
3895  char *f = f_arr + g*fsize;
3896  float **q = q_arr + g*fsize;
3897  for ( int i=ibegin; i<iend; ++i ) {
3898  for ( int j=jbegin; j<jend; ++j ) {
3899  if( f[i*dim2+j] ) {
3900  f[i*dim2+j] = 0;
3901  for ( int k=0; k<zlistlen; ++k ) {
3902  q[i*dim2+j][zlist[k]] = *(qmsg++);
3903  }
3904  for (int h=0; h<myGrid.order-1; ++h) {
3905  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3906  }
3907  }
3908  }
3909  }
3910  }
3911 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
int sourceNode
Definition: ComputePme.C:143
int order
Definition: PmeBase.h:23
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
int zlistlen
Definition: ComputePme.C:149

◆ copyResults()

void ComputePmeMgr::copyResults ( PmeGridMsg *  msg)

Definition at line 4064 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

4064  {
4065 
4066  int zdim = myGrid.dim3;
4067  int flen = msg->len;
4068  int fstart = msg->start;
4069  int zlistlen = msg->zlistlen;
4070  int *zlist = msg->zlist;
4071  float *qmsg = msg->qgrid;
4072  int g;
4073  for ( g=0; g<numGrids; ++g ) {
4074  char *f = msg->fgrid + g*flen;
4075  float **q = q_arr + fstart + g*fsize;
4076  for ( int i=0; i<flen; ++i ) {
4077  if ( f[i] ) {
4078  f[i] = 0;
4079  for ( int k=0; k<zlistlen; ++k ) {
4080  q[i][zlist[k]] = *(qmsg++);
4081  }
4082  for (int h=0; h<myGrid.order-1; ++h) {
4083  q[i][myGrid.K3+h] = q[i][h];
4084  }
4085  }
4086  }
4087  }
4088 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
int order
Definition: PmeBase.h:23
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
int zlistlen
Definition: ComputePme.C:149
char * fgrid
Definition: ComputePme.C:151

◆ cuda_submit_charges()

void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice,
int  sequence 
)

Definition at line 3512 of file ComputePme.C.

References a_data_dev, a_data_host, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

Referenced by ComputePme::doWork().

3512  {
3513 
3514  int n = cuda_atoms_count;
3515  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3516  cuda_atoms_count = 0;
3517 
3518  const double before = CmiWallTimer();
3519  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3520  cudaMemcpyHostToDevice, streams[stream]);
3521  const double after = CmiWallTimer();
3522 
3523  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3524 
3525  cuda_pme_charges(
3526  bspline_coeffs_dev,
3527  q_arr_dev, ffz_dev, ffz_dev + fsize,
3528  a_data_dev, n,
3529  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3530  streams[stream]);
3531  const double after2 = CmiWallTimer();
3532 
3533  chargeGridSubmitted(lattice,sequence); // must be inside lock
3534 
3535  masterPmeMgr->charges_time = before;
3536  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3537  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3538 }
float * a_data_dev
Definition: ComputePme.C:447
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
#define CUDA_EVENT_ID_PME_COPY
int order
Definition: PmeBase.h:23
int K3
Definition: PmeBase.h:21
int cuda_atoms_count
Definition: ComputePme.C:450
#define CUDA_EVENT_ID_PME_KERNEL
double charges_time
Definition: ComputePme.C:458
void chargeGridSubmitted(Lattice &lattice, int sequence)
Definition: ComputePme.C:3567
float * a_data_host
Definition: ComputePme.C:446

◆ fwdSharedTrans()

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg)

Definition at line 2042 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by sendTransSubset().

2042  {
2043  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2044  int pe = transNodeInfo[myTransNode].pe_start;
2045  int npe = transNodeInfo[myTransNode].npe;
2046  CmiNodeLock lock = CmiCreateLock();
2047  int *count = new int; *count = npe;
2048  for (int i=0; i<npe; ++i, ++pe) {
2051  shmsg->msg = msg;
2052  shmsg->count = count;
2053  shmsg->lock = lock;
2054  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2055  }
2056 }
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
PmeTransMsg * msg
Definition: ComputePme.C:171
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
CmiNodeLock lock
Definition: ComputePme.C:173

◆ fwdSharedUntrans()

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg)

Definition at line 2305 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by sendUntransSubset().

2305  {
2306  int pe = gridNodeInfo[myGridNode].pe_start;
2307  int npe = gridNodeInfo[myGridNode].npe;
2308  CmiNodeLock lock = CmiCreateLock();
2309  int *count = new int; *count = npe;
2310  for (int i=0; i<npe; ++i, ++pe) {
2312  shmsg->msg = msg;
2313  shmsg->count = count;
2314  shmsg->lock = lock;
2315  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2316  }
2317 }
CmiNodeLock lock
Definition: ComputePme.C:190
PmeUntransMsg * msg
Definition: ComputePme.C:188

◆ gridCalc1()

void ComputePmeMgr::gridCalc1 ( void  )

Definition at line 1934 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

1934  {
1935  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1936 
1937 #ifdef NAMD_FFTW
1938  for ( int g=0; g<numGrids; ++g ) {
1939 #ifdef NAMD_FFTW_3
1940  fftwf_execute(forward_plan_yz[g]);
1941 #else
1942  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1943  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1944 #endif
1945 
1946  }
1947 #endif
1948 
1949  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1950 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32

◆ gridCalc2()

void ComputePmeMgr::gridCalc2 ( void  )

Definition at line 2110 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

2110  {
2111  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2112 
2113 #if CMK_BLUEGENEL
2114  CmiNetworkProgressAfter (0);
2115 #endif
2116 
2117  int zdim = myGrid.dim3;
2118  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2119  int ny = localInfo[myTransPe].ny_after_transpose;
2120 
2121  for ( int g=0; g<numGrids; ++g ) {
2122  // finish forward FFT (x dimension)
2123 #ifdef NAMD_FFTW
2124 #ifdef NAMD_FFTW_3
2125  fftwf_execute(forward_plan_x[g]);
2126 #else
2127  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2128  ny * zdim / 2, 1, work, 1, 0);
2129 #endif
2130 #endif
2131  }
2132 
2133 #ifdef OPENATOM_VERSION
2134  if ( ! simParams -> openatomOn ) {
2135 #endif // OPENATOM_VERSION
2136  gridCalc2R();
2137 #ifdef OPENATOM_VERSION
2138  } else {
2139  gridCalc2Moa();
2140  }
2141 #endif // OPENATOM_VERSION
2142 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
int ny_after_transpose
Definition: ComputePme.C:261
#define simParams
Definition: Output.C:131
void gridCalc2R(void)
Definition: ComputePme.C:2170

◆ gridCalc2R()

void ComputePmeMgr::gridCalc2R ( void  )

Definition at line 2170 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeKSpace::compute_energy_LJPME(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, ComputeNonbondedUtil::LJewaldcof, ComputePmeUtil::LJPMEOn, ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

2170  {
2171 
2172  int useCkLoop = 0;
2173 #if CMK_SMP && USE_CKLOOP
2174  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2175  && CkNumPes() >= 2 * numTransPes ) {
2176  useCkLoop = 1;
2177  }
2178 #endif
2179 
2180  int zdim = myGrid.dim3;
2181  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2182  int ny = localInfo[myTransPe].ny_after_transpose;
2183 
2184  for ( int g=0; g<numGrids; ++g ) {
2185  // reciprocal space portion of PME
2186  if ( LJPMEOn && 1==g ) {
2188  recip_evir2[g][0] = myKSpace->compute_energy_LJPME(kgrid+qgrid_size*g,
2189  lattice, LJewaldcof, &(recip_evir2[g][1]), useCkLoop);
2190  // CkPrintf("LJ Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2191  } else {
2193  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2194  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2195  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2196  }
2197 
2198  // start backward FFT (x dimension)
2199 
2200 #ifdef NAMD_FFTW
2201 #ifdef NAMD_FFTW_3
2202  fftwf_execute(backward_plan_x[g]);
2203 #else
2204  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2205  ny * zdim / 2, 1, work, 1, 0);
2206 #endif
2207 #endif
2208  }
2209 
2210  pmeProxyDir[CkMyPe()].sendUntrans();
2211 }
static Node * Object()
Definition: Node.h:86
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
double compute_energy(float q_arr[], const Lattice &lattice, double ewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:321
double compute_energy_LJPME(float q_arr[], const Lattice &lattice, double LJewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:545
static Bool LJPMEOn
Definition: ComputePme.h:43
#define CKLOOP_CTRL_PME_KSPACE
Definition: SimParameters.h:99
int ny_after_transpose
Definition: ComputePme.C:261
double BigReal
Definition: common.h:123

◆ gridCalc3()

void ComputePmeMgr::gridCalc3 ( void  )

Definition at line 2379 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

2379  {
2380  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2381 
2382  // finish backward FFT
2383 #ifdef NAMD_FFTW
2384  for ( int g=0; g<numGrids; ++g ) {
2385 #ifdef NAMD_FFTW_3
2386  fftwf_execute(backward_plan_yz[g]);
2387 #else
2388  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2389  (fftw_complex *) (qgrid + qgrid_size * g),
2390  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2391 #endif
2392  }
2393 
2394 #endif
2395 
2396  pmeProxyDir[CkMyPe()].sendUngrid();
2397 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32

◆ initialize()

void ComputePmeMgr::initialize ( CkQdMsg *  msg)

Definition at line 890 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

{
  // ComputePmeMgr::initialize -- startup-stage entry point (driven by
  // quiescence detection; the CkQdMsg carries no payload).  Allocates the
  // per-PE/per-node bookkeeping tables, decides between pencil and slab PME
  // decompositions, assigns the PEs that do grid/transpose work, sizes the
  // charge grids, and builds the FFTW plans.  On the pencil path it creates
  // the pencil chare arrays and returns early; setup then continues in
  // initialize_pencils() at the next startup stage.
  delete msg;

  // Bookkeeping tables sized for the whole machine.
  localInfo = new LocalPmeInfo[CkNumPes()];
  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
  transNodeInfo = new NodePmeInfo[CkNumNodes()];
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];

  // One flag byte per PE, allocated once per process (rank 0 only).
  if (CkMyRank() == 0) {
    pencilPMEProcessors = new char [CkNumPes()];
    memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
  }

  // NOTE(review): original source line 908 (presumably the local simParams
  // binding from Node::Object()->simParameters) is absent from this listing
  // -- confirm against ComputePme.C.
  PatchMap *patchMap = PatchMap::Object();

  offload = simParams->PMEOffload;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // PME offload sanity checks: one device per process and SM >= 2.0.
  if ( offload && ! deviceCUDA->one_device_per_node() ) {
    NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
  }
  if ( offload ) {
    int dev;
    cudaGetDevice(&dev);
    cuda_errcheck("in cudaGetDevice");
    if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cuda_errcheck("in cudaGetDeviceProperties");
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
  }
#endif

  alchLambda = -1.;  // illegal value to catch if not updated
  alchLambda2 = -1.;
  useBarrier = simParams->PMEBarrier;

  // Decide pencil vs. slab decomposition: PMEPencils==0 (or multiple grids)
  // forces slabs, PMEPencils>0 forces pencils, otherwise (PMEPencils<0)
  // choose pencils when they offer clearly more parallelism than slabs.
  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
  else {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int dimx = simParams->PMEGridSizeX;
    int dimy = simParams->PMEGridSizeY;
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
        * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
    else usePencils = 0;
  }

  if ( usePencils ) {
    // Choose the pencil grid dimensions (xBlocks x yBlocks x zBlocks).
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( simParams->PMEPencils > 1 &&
         simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
      xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    } else {
      int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
          * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
      yBlocks = nb2 / nb;
    }

    // Per-axis user overrides.
    if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
    if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
    if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;

    // Round block counts down so no pencil is empty.
    int dimx = simParams->PMEGridSizeX;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;

    int dimy = simParams->PMEGridSizeY;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;

    int dimz = simParams->PMEGridSizeZ / 2 + 1; // complex
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;

    // Each pencil plane must fit on the available PEs.
    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    }
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    }
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << xBlocks << " x " <<
        yBlocks << " x " << zBlocks <<
        " pencil grid for FFT and reciprocal sum.\n" << endi;
    }
  } else { // usePencils

    { // decide how many pes to use for reciprocal sum

      // rules based on work available
      int minslices = simParams->PMEMinSlices;
      int dimx = simParams->PMEGridSizeX;
      int nrpx = ( dimx + minslices - 1 ) / minslices;
      int dimy = simParams->PMEGridSizeY;
      int nrpy = ( dimy + minslices - 1 ) / minslices;

      // rules based on processors available
      int nrpp = CkNumPes();
      // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
      if ( nrpp < nrpx ) nrpx = nrpp;
      if ( nrpp < nrpy ) nrpy = nrpp;

      // user override
      int nrps = simParams->PMEProcessors;
      if ( nrps > CkNumPes() ) nrps = CkNumPes();
      if ( nrps > 0 ) nrpx = nrps;
      if ( nrps > 0 ) nrpy = nrps;

      // make sure there aren't any totally empty processors
      int bx = ( dimx + nrpx - 1 ) / nrpx;
      nrpx = ( dimx + bx - 1 ) / bx;
      int by = ( dimy + nrpy - 1 ) / nrpy;
      nrpy = ( dimy + by - 1 ) / by;
      if ( bx != ( dimx + nrpx - 1 ) / nrpx )
        NAMD_bug("Error in selecting number of PME processors.");
      if ( by != ( dimy + nrpy - 1 ) / nrpy )
        NAMD_bug("Error in selecting number of PME processors.");

      numGridPes = nrpx;
      numTransPes = nrpy;
    }
    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
        " processors for FFT and reciprocal sum.\n" << endi;
    }

    int sum_npes = numTransPes + numGridPes;
    int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;

#if 0 // USE_TOPOMAP
    /* This code is being disabled permanently for slab PME on Blue Gene machines */
    PatchMap * pmap = PatchMap::Object();

    int patch_pes = pmap->numNodesWithPatches();
    TopoManager tmgr;
    if(tmgr.hasMultipleProcsPerNode())
      patch_pes *= 2;

    bool done = false;
    if(CkNumPes() > 2*sum_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, numTransPes);
      done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
    }
    else
      if(CkNumPes() > 2 *max_npes + patch_pes) {
        done = generateBGLORBPmePeList(transPeMap, max_npes);
        gridPeMap = transPeMap;
      }

    if (!done)
#endif
    {
      //generatePmePeList(transPeMap, max_npes);
      //gridPeMap = transPeMap;
      generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
    }

    // Report the first few grid/trans PE assignments.
    if ( ! CkMyPe() ) {
      iout << iINFO << "PME GRID LOCATIONS:";
      int i;
      for ( i=0; i<numGridPes && i<10; ++i ) {
        iout << " " << gridPeMap[i];
      }
      if ( i < numGridPes ) iout << " ...";
      iout << "\n" << endi;
      iout << iINFO << "PME TRANS LOCATIONS:";
      for ( i=0; i<numTransPes && i<10; ++i ) {
        iout << " " << transPeMap[i];
      }
      if ( i < numTransPes ) iout << " ...";
      iout << "\n" << endi;
    }

    // sort based on nodes and physical nodes
    std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());

    // Walk the (node-sorted) PE maps, grouping consecutive PEs on the same
    // node into NodePmeInfo records and locating this PE's own indices.
    myGridPe = -1;
    myGridNode = -1;
    int i = 0;
    int node = -1;
    int real_node = -1;
    for ( i=0; i<numGridPes; ++i ) {
      if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
      int real_node_i = CkNodeOf(gridPeMap[i]);
      if ( real_node_i == real_node ) {
        gridNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        gridNodeInfo[node].real_node = real_node;
        gridNodeInfo[node].pe_start = i;
        gridNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myGridNode = node;
    }
    numGridNodes = node + 1;
    myTransPe = -1;
    myTransNode = -1;
    node = -1;
    real_node = -1;
    for ( i=0; i<numTransPes; ++i ) {
      if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
      int real_node_i = CkNodeOf(transPeMap[i]);
      if ( real_node_i == real_node ) {
        transNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        transNodeInfo[node].real_node = real_node;
        transNodeInfo[node].pe_start = i;
        transNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myTransNode = node;
    }
    numTransNodes = node + 1;

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
        << numTransNodes << " TRANS NODES\n" << endi;
    }

    { // generate random orderings for grid and trans messages
      int i;
      for ( i = 0; i < numGridPes; ++i ) {
        gridPeOrder[i] = i;
      }
      Random rand(CkMyPe());
      if ( myGridPe < 0 ) {
        rand.reorder(gridPeOrder,numGridPes);
      } else { // self last
        gridPeOrder[myGridPe] = numGridPes-1;
        gridPeOrder[numGridPes-1] = myGridPe;
        rand.reorder(gridPeOrder,numGridPes-1);
      }
      for ( i = 0; i < numGridNodes; ++i ) {
        gridNodeOrder[i] = i;
      }
      if ( myGridNode < 0 ) {
        rand.reorder(gridNodeOrder,numGridNodes);
      } else { // self last
        gridNodeOrder[myGridNode] = numGridNodes-1;
        gridNodeOrder[numGridNodes-1] = myGridNode;
        rand.reorder(gridNodeOrder,numGridNodes-1);
      }
      for ( i = 0; i < numTransNodes; ++i ) {
        transNodeOrder[i] = i;
      }
      if ( myTransNode < 0 ) {
        rand.reorder(transNodeOrder,numTransNodes);
      } else { // self last
        transNodeOrder[myTransNode] = numTransNodes-1;
        transNodeOrder[numTransNodes-1] = myTransNode;
        rand.reorder(transNodeOrder,numTransNodes-1);
      }
    }

  } // ! usePencils

  // Common grid geometry; dim3 is padded for the in-place r2c transform.
  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block3 = myGrid.dim3 / 2;  // complex
  }

  if ( usePencils ) {
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex

    int pe = 0;
    int x,y,z;

    SortableResizeArray<int> zprocs(xBlocks*yBlocks);
    SortableResizeArray<int> yprocs(xBlocks*zBlocks);
    SortableResizeArray<int> xprocs(yBlocks*zBlocks);

    // decide which pes to use by bit reversal and patch use
    int i;
    int ncpus = CkNumPes();
    SortableResizeArray<int> patches, nopatches, pmeprocs;
    PatchMap *pmap = PatchMap::Object();
    for ( int icpu=0; icpu<ncpus; ++icpu ) {
      int ri = WorkDistrib::peDiffuseOrdering[icpu];
      if ( ri ) { // keep 0 for special case
        // pretend pe 1 has patches to avoid placing extra PME load on node
        if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
        else nopatches.add(ri);
      }
    }

#if USE_RANDOM_TOPO
    // Randomize both candidate lists.
    Random rand(CkMyPe());
    int *tmp = new int[patches.size()];
    int nn = patches.size();
    for (i=0;i<nn;i++) tmp[i] = patches[i];
    rand.reorder(tmp, nn);
    patches.resize(0);
    for (i=0;i<nn;i++) patches.add(tmp[i]);
    delete [] tmp;
    tmp = new int[nopatches.size()];
    nn = nopatches.size();
    for (i=0;i<nn;i++) tmp[i] = nopatches[i];
    rand.reorder(tmp, nn);
    nopatches.resize(0);
    for (i=0;i<nn;i++) nopatches.add(tmp[i]);
    delete [] tmp;
#endif

    // only use zero if it eliminates overloading or has patches
    int useZero = 0;
    int npens = xBlocks*yBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += xBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += yBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;

    // add nopatches then patches in reversed order
    for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
    if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
    for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
    if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);

    // Deal PEs round-robin to the z, y, and x pencil sets.
    int npes = pmeprocs.size();
    for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
    zprocs.sort();
#endif
    for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
    yprocs.sort();
#endif
    for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
    xprocs.sort();
#endif

#if USE_TOPO_SFC
    // Reorder pencil PEs along a space-filling curve over the torus.
    CmiLock(tmgr_lock);
    //{
    TopoManager tmgr;
    int xdim = tmgr.getDimNX();
    int ydim = tmgr.getDimNY();
    int zdim = tmgr.getDimNZ();
    int xdim1 = find_level_grid(xdim);
    int ydim1 = find_level_grid(ydim);
    int zdim1 = find_level_grid(zdim);
    if(CkMyPe() == 0)
      printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);

    vector<Coord> result;
    SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
    sort_sfc(xprocs, tmgr, result);
    sort_sfc(yprocs, tmgr, result);
    sort_sfc(zprocs, tmgr, result);
    //}
    CmiUnlock(tmgr_lock);
#endif

    if(CkMyPe() == 0){
      iout << iINFO << "PME Z PENCIL LOCATIONS:";
      for ( i=0; i<zprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
        iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << zprocs[i];
#endif
      }
      if ( i < zprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          pencilPMEProcessors[zprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME Y PENCIL LOCATIONS:";
      for ( i=0; i<yprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
        iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << yprocs[i];
#endif
      }
      if ( i < yprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, z = 0; z < zBlocks; ++z )
        for (x = 0; x < xBlocks; ++x, ++pe ) {
          pencilPMEProcessors[yprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME X PENCIL LOCATIONS:";
      for ( i=0; i<xprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
        iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << xprocs[i];
#endif
      }
      if ( i < xprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          pencilPMEProcessors[xprocs[pe]] = 1;
        }
    }

    // creating the pencil arrays
    if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
      // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
      WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
      std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
      std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
#endif
#if 1
      // Static, non-migratable pencil arrays placed via explicit map objects.
      CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
      CProxy_PmePencilMap ym;
      if ( simParams->PMEPencilsYLayout )
        ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
      else
        ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
      CProxy_PmePencilMap xm;
      if ( simParams->PMEPencilsXLayout )
        xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
      else
        xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
      pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
      CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
      CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
      CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
      zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
      yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
      xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
      zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
#else
      zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          zPencil(x,y,0).insert(zprocs[pe]);
        }
      zPencil.doneInserting();

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          yPencil(x,0,z).insert(yprocs[pe]);
        }
      yPencil.doneInserting();

      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          xPencil(0,y,z).insert(xprocs[pe]);
        }
      xPencil.doneInserting();
#endif

      // Broadcast the configuration to every pencil element.
      pmeProxy.recvArrays(xPencil,yPencil,zPencil);
      PmePencilInitMsgData msgdata;
      msgdata.grid = myGrid;
      msgdata.xBlocks = xBlocks;
      msgdata.yBlocks = yBlocks;
      msgdata.zBlocks = zBlocks;
      msgdata.xPencil = xPencil;
      msgdata.yPencil = yPencil;
      msgdata.zPencil = zPencil;
      msgdata.pmeProxy = pmeProxyDir;
      msgdata.pmeNodeProxy = pmeNodeProxy;
      msgdata.xm = xm;
      msgdata.ym = ym;
      msgdata.zm = zm;
      xPencil.init(new PmePencilInitMsg(msgdata));
      yPencil.init(new PmePencilInitMsg(msgdata));
      zPencil.init(new PmePencilInitMsg(msgdata));
    }

    return;  // continue in initialize_pencils() at next startup stage
  }

  // ---- everything below runs only for the slab decomposition ----

  // Partition the x axis over grid PEs and the (transposed) y axis over
  // trans PEs; last PE in each direction absorbs the remainder.
  int pe;
  int nx = 0;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    localInfo[pe].x_start = nx;
    nx += myGrid.block1;
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  }
  int ny = 0;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    localInfo[pe].y_start_after_transpose = ny;
    ny += myGrid.block2;
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
    localInfo[pe].ny_after_transpose =
      ny - localInfo[pe].y_start_after_transpose;
  }

  { // decide how many pes this node exchanges charges with

    PatchMap *patchMap = PatchMap::Object();
    Lattice lattice = simParams->lattice;
    BigReal sysdima = lattice.a_r().unit() * lattice.a();
    BigReal cutoff = simParams->cutoff;
    BigReal patchdim = simParams->patchDimension;
    int numPatches = patchMap->numPatches();
    int numNodes = CkNumPes();
    int *source_flags = new int[numNodes];
    int node;
    for ( node=0; node<numNodes; ++node ) {
      source_flags[node] = 0;
      recipPeDest[node] = 0;
    }

    // // make sure that we don't get ahead of ourselves on this node
    // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
    //   source_flags[CkMyPe()] = 1;
    //   recipPeDest[myRecipPe] = 1;
    // }

    // For every patch, find the range of x grid lines it can spread charge
    // onto (with periodic wrap) and record sender/receiver relationships.
    for ( int pid=0; pid < numPatches; ++pid ) {
      int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
#endif
      int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
      BigReal minx = patchMap->min_a(pid);
      BigReal maxx = patchMap->max_a(pid);
      BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
      // min1 (max1) is smallest (largest) grid line for this patch
      int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
      int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
      for ( int i=min1; i<=max1; ++i ) {
        int ix = i;
        while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
        while ( ix < 0 ) ix += myGrid.K1;
        // set source_flags[pnode] if this patch sends to our node
        if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
             ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
          source_flags[pnode] = 1;
        }
        // set dest_flags[] for node that our patch sends to
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( offload ) {
          if ( pnode == CkNodeFirst(CkMyNode()) ) {
            recipPeDest[ix / myGrid.block1] = 1;
          }
        } else
#endif
        if ( pnode == CkMyPe() ) {
          recipPeDest[ix / myGrid.block1] = 1;
        }
      }
    }

    int numSourcesSamePhysicalNode = 0;
    numSources = 0;
    numDestRecipPes = 0;
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) ++numSources;
      if ( recipPeDest[node] ) ++numDestRecipPes;
      if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    }

#if 0
    if ( numSources ) {
      CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
        CkMyPe(), numSourcesSamePhysicalNode, numSources);
      iout << iINFO << "PME " << CkMyPe() << " sources:";
      for ( node=0; node<numNodes; ++node ) {
        if ( source_flags[node] ) iout << " " << node;
      }
      iout << "\n" << endi;
    }
#endif

    delete [] source_flags;

    // CkPrintf("PME on node %d has %d sources and %d destinations\n",
    //   CkMyPe(), numSources, numDestRecipPes);

  } // decide how many pes this node exchanges charges with (end)

  ungrid_count = numDestRecipPes;

  sendTransBarrier_received = 0;

  if ( myGridPe < 0 && myTransPe < 0 ) return;
  // the following only for nodes doing reciprocal sum

  if ( myTransPe >= 0 ) {
    recipEvirPe = findRecipEvirPe();
    pmeProxy[recipEvirPe].addRecipEvirClient();
  }

  if ( myTransPe >= 0 ) {
    // This PE owns a y slab after the transpose; build its k-space helper.
    int k2_start = localInfo[myTransPe].y_start_after_transpose;
    int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
#ifdef OPENATOM_VERSION
    if ( simParams->openatomOn ) {
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
    } else {
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    }
#else // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#endif // OPENATOM_VERSION
  }

  // Allocate the charge grid(s): one buffer sized for the larger of the
  // grid-phase and trans-phase layouts; kgrid aliases qgrid when this PE
  // is the only one in both roles.
  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  } else {
    kgrid = qgrid;
  }
  qgrid_size = local_size;

  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  }

  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
#ifdef NAMD_FFTW
  // FFTW planning is not thread-safe; serialize across ranks.
  CmiLock(fftw_plan_lock);
#ifdef NAMD_FFTW_3
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz=new fftwf_plan[numGrids];
    backward_plan_yz=new fftwf_plan[numGrids];
  }
  if ( myTransPe >= 0 ) {
    forward_plan_x=new fftwf_plan[numGrids];
    backward_plan_x=new fftwf_plan[numGrids];
  }
  /* need one plan per grid */
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      // In-place 2D r2c over y/z for each of this PE's nx x-planes.
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
          localInfo[myGridPe].nx,
          qgrid + qgrid_size * g,
          NULL,
          1,
          myGrid.dim2 * myGrid.dim3,
          (fftwf_complex *)
          (qgrid + qgrid_size * g),
          NULL,
          1,
          myGrid.dim2 * (myGrid.dim3/2),
          fftwFlags);
    }
  }
  int zdim = myGrid.dim3;
  // NOTE(review): this is evaluated even when myTransPe < 0 (grid-only PE),
  // which reads localInfo[-1]; the value is only used under myTransPe >= 0,
  // but the out-of-bounds read itself is technically UB -- verify upstream.
  int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {

      // In-place 1D c2c over x, strided across this PE's y-slab columns.
      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          FFTW_FORWARD,fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          FFTW_BACKWARD, fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
          localInfo[myGridPe].nx,
          (fftwf_complex *)
          (qgrid + qgrid_size * g),
          NULL,
          1,
          myGrid.dim2*(myGrid.dim3/2),
          qgrid + qgrid_size * g,
          NULL,
          1,
          myGrid.dim2 * myGrid.dim3,
          fftwFlags);
    }
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;

#else
  // Legacy FFTW 2.x path: single plan shared by all grids.
  work = new fftw_complex[n[0]];

  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
#endif
  CmiUnlock(fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  // Initialize message countdowns and zero the charge grid.
  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
}
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:442
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
static void sortPmePes(int *pmepes, int xdim, int ydim)
Definition: WorkDistrib.C:307
int numNodesWithPatches(void)
Definition: PatchMap.h:61
int size(void) const
Definition: ResizeArray.h:131
int dim3
Definition: PmeBase.h:22
CProxy_ComputePmeMgr pmeProxy
Definition: ComputePme.C:245
BigReal max_a(int pid) const
Definition: PatchMap.h:92
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
CProxy_PmeZPencil zPencil
Definition: ComputePme.C:244
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
CProxy_PmeYPencil yPencil
Definition: ComputePme.C:243
CProxy_PmePencilMap zm
Definition: ComputePme.C:249
CProxy_PmePencilMap xm
Definition: ComputePme.C:247
CProxy_NodePmeMgr pmeNodeProxy
Definition: ComputePme.C:246
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int add(const Elem &elem)
Definition: ResizeArray.h:101
void resize(int i)
Definition: ResizeArray.h:84
Definition: Random.h:37
int numPatches(void) const
Definition: PatchMap.h:59
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int block3
Definition: PmeBase.h:24
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes)
Definition: ComputePme.C:320
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
void NAMD_die(const char *err_msg)
Definition: common.C:147
BigReal min_a(int pid) const
Definition: PatchMap.h:91
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
static int findRecipEvirPe()
Definition: ComputePme.C:269
static int * peDiffuseOrdering
Definition: WorkDistrib.h:116
int ny_after_transpose
Definition: ComputePme.C:261
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:131
int K3
Definition: PmeBase.h:21
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
CProxy_PmePencilMap ym
Definition: ComputePme.C:248
int node(int pid) const
Definition: PatchMap.h:114
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
bool one_device_per_node()
Definition: DeviceCUDA.C:553
char * pencilPMEProcessors
Definition: ComputePme.C:135
int64_t int64
Definition: common.h:39
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
double BigReal
Definition: common.h:123
CProxy_PmeXPencil xPencil
Definition: ComputePme.C:242
int y_start_after_transpose
Definition: ComputePme.C:261

◆ initialize_computes()

void ComputePmeMgr::initialize_computes ( )

Definition at line 2765 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, master_pe, NAMD_bug(), ComputePmeUtil::numGrids, PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

2765  {
2766 
2767  noWorkCount = 0;
2768  doWorkCount = 0;
2769  ungridForcesCount = 0;
2770 
 2771  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
 2772 
 2773  SimParameters *simParams = Node::Object()->simParameters;
 2774 
2775  strayChargeErrors = 0;
2776 
2777 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2778  PatchMap *patchMap = PatchMap::Object();
2779  int pe = master_pe = CkNodeFirst(CkMyNode());
2780  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2781  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2782  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2783  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2784  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2786  && pe != deviceCUDA->getMasterPe() ) {
2787  master_pe = pe;
2788  }
2789  }
2790  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2791  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2792  }
2793 
2794  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2795  bool cudaFirst = 1;
2796  if ( offload ) {
2797  CmiLock(cuda_lock);
2798  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2799  }
2800 
2801  if ( cudaFirst ) {
2802  nodePmeMgr->master_pe = master_pe;
2803  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2804  }
2805 #endif
2806 
2807  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2808  fsize = myGrid.K1 * myGrid.dim2;
2809  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2810 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2811  if ( ! offload )
2812 #endif
2813  {
2814  q_arr = new float*[fsize*numGrids];
2815  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2816  q_list = new float*[fsize*numGrids];
2817  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2818  q_count = 0;
2819  }
2820 
2821 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2822  if ( cudaFirst || ! offload ) {
2823 #endif
2824  f_arr = new char[fsize*numGrids];
2825  // memset to non-zero value has race condition on BlueGene/Q
2826  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2827  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2828 
2829  for ( int g=0; g<numGrids; ++g ) {
2830  char *f = f_arr + g*fsize;
2831  if ( usePencils ) {
2832  int K1 = myGrid.K1;
2833  int K2 = myGrid.K2;
2834  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2835  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2836  int dim2 = myGrid.dim2;
2837  for (int ap=0; ap<numPencilsActive; ++ap) {
2838  int ib = activePencils[ap].i;
2839  int jb = activePencils[ap].j;
2840  int ibegin = ib*block1;
2841  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2842  int jbegin = jb*block2;
2843  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2844  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2845  for ( int i=ibegin; i<iend; ++i ) {
2846  for ( int j=jbegin; j<jend; ++j ) {
2847  f[i*dim2+j] = 0;
2848  }
2849  }
2850  }
2851  } else {
2852  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2853  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2854  for (int pe=0; pe<numGridPes; pe++) {
2855  if ( ! recipPeDest[pe] ) continue;
2856  int start = pe * bsize;
2857  int len = bsize;
2858  if ( start >= qsize ) { start = 0; len = 0; }
2859  if ( start + len > qsize ) { len = qsize - start; }
2860  int zdim = myGrid.dim3;
2861  int fstart = start / zdim;
2862  int flen = len / zdim;
2863  memset(f + fstart, 0, flen*sizeof(char));
2864  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2865  }
2866  }
2867  }
2868 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2869  }
2870  if ( offload ) {
2871  cudaSetDevice(deviceCUDA->getDeviceID());
2872  if ( cudaFirst ) {
2873 
2874  int f_alloc_count = 0;
2875  for ( int n=fsize, i=0; i<n; ++i ) {
2876  if ( f_arr[i] == 0 ) {
2877  ++f_alloc_count;
2878  }
2879  }
2880  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2881 
2882  q_arr = new float*[fsize*numGrids];
2883  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2884 
2885  float **q_arr_dev_host = new float*[fsize];
2886  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2887 
2888  float **v_arr_dev_host = new float*[fsize];
2889  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2890 
2891  int q_stride = myGrid.K3+myGrid.order-1;
2892  q_data_size = f_alloc_count * q_stride * sizeof(float);
2893  ffz_size = (fsize + q_stride) * sizeof(int);
2894 
2895  // tack ffz onto end of q_data to allow merged transfer
2896  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2897  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2898  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2899  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2900  cudaMalloc((void**) &v_data_dev, q_data_size);
2901  cuda_errcheck("malloc grid data for pme");
2902  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2903  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2904  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2905  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2906  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2907 
2908  f_alloc_count = 0;
2909  for ( int n=fsize, i=0; i<n; ++i ) {
2910  if ( f_arr[i] == 0 ) {
2911  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2912  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2913  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2914  ++f_alloc_count;
2915  } else {
2916  q_arr[i] = 0;
2917  q_arr_dev_host[i] = 0;
2918  v_arr_dev_host[i] = 0;
2919  }
2920  }
2921 
2922  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2923  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2924  delete [] q_arr_dev_host;
2925  delete [] v_arr_dev_host;
2926  delete [] f_arr;
2927  f_arr = new char[fsize + q_stride];
2928  fz_arr = f_arr + fsize;
2929  memset(f_arr, 0, fsize + q_stride);
2930  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2931 
2932  cuda_errcheck("initialize grid data for pme");
2933 
2934  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2935  cuda_errcheck("initialize bspline coefficients for pme");
2936 
2937 #define XCOPY(X) masterPmeMgr->X = X;
2938  XCOPY(bspline_coeffs_dev)
2939  XCOPY(bspline_dcoeffs_dev)
2940  XCOPY(q_arr)
2941  XCOPY(q_arr_dev)
2942  XCOPY(v_arr_dev)
2943  XCOPY(q_data_size)
2944  XCOPY(q_data_host)
2945  XCOPY(q_data_dev)
2946  XCOPY(v_data_dev)
2947  XCOPY(ffz_size)
2948  XCOPY(ffz_host)
2949  XCOPY(ffz_dev)
2950  XCOPY(f_arr)
2951  XCOPY(fz_arr)
2952 #undef XCOPY
2953  //CkPrintf("pe %d init first\n", CkMyPe());
2954  } else { // cudaFirst
2955  //CkPrintf("pe %d init later\n", CkMyPe());
2956 #define XCOPY(X) X = masterPmeMgr->X;
2957  XCOPY(bspline_coeffs_dev)
2958  XCOPY(bspline_dcoeffs_dev)
2959  XCOPY(q_arr)
2960  XCOPY(q_arr_dev)
2961  XCOPY(v_arr_dev)
2962  XCOPY(q_data_size)
2963  XCOPY(q_data_host)
2964  XCOPY(q_data_dev)
2965  XCOPY(v_data_dev)
2966  XCOPY(ffz_size)
2967  XCOPY(ffz_host)
2968  XCOPY(ffz_dev)
2969  XCOPY(f_arr)
2970  XCOPY(fz_arr)
2971 #undef XCOPY
2972  } // cudaFirst
2973  CmiUnlock(cuda_lock);
2974  } else // offload
2975 #endif // NAMD_CUDA
2976  {
2977  fz_arr = new char[myGrid.K3+myGrid.order-1];
2978  }
2979 
2980 #if 0 && USE_PERSISTENT
2981  recvGrid_handle = NULL;
2982 #endif
2983 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
void cuda_init_bspline_coeffs(float **c, float **dc, int order)
int dim3
Definition: PmeBase.h:22
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:368
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:290
int order
Definition: PmeBase.h:23
int getMasterPe()
Definition: DeviceCUDA.h:137
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int chargeGridSubmittedCount
Definition: ComputePme.C:472
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
#define XCOPY(X)
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:131
int K3
Definition: PmeBase.h:21
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
int i
Definition: ComputePme.C:371
int j
Definition: ComputePme.C:371
static CmiNodeLock cuda_lock
Definition: ComputePme.C:452

◆ initialize_pencils()

void ComputePmeMgr::initialize_pencils ( CkQdMsg *  msg)

Definition at line 1721 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), PatchMap::Object(), Node::Object(), PmeGrid::order, Random::reorder(), Node::simParameters, simParams, and Vector::unit().

1721  {
1722  delete msg;
1723  if ( ! usePencils ) return;
1724 
 1725  SimParameters *simParams = Node::Object()->simParameters;
 1726 
1727  PatchMap *patchMap = PatchMap::Object();
1728  Lattice lattice = simParams->lattice;
1729  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1730  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1731  BigReal cutoff = simParams->cutoff;
1732  BigReal patchdim = simParams->patchDimension;
1733  int numPatches = patchMap->numPatches();
1734 
1735  pencilActive = new char[xBlocks*yBlocks];
1736  for ( int i=0; i<xBlocks; ++i ) {
1737  for ( int j=0; j<yBlocks; ++j ) {
1738  pencilActive[i*yBlocks+j] = 0;
1739  }
1740  }
1741 
1742  for ( int pid=0; pid < numPatches; ++pid ) {
1743  int pnode = patchMap->node(pid);
1744 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1745  if ( offload ) {
1746  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1747  } else
1748 #endif
1749  if ( pnode != CkMyPe() ) continue;
1750 
1751  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1752  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1753 
1754  BigReal minx = patchMap->min_a(pid);
1755  BigReal maxx = patchMap->max_a(pid);
1756  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1757  // min1 (max1) is smallest (largest) grid line for this patch
1758  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1759  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1760 
1761  BigReal miny = patchMap->min_b(pid);
1762  BigReal maxy = patchMap->max_b(pid);
1763  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1764  // min2 (max2) is smallest (largest) grid line for this patch
1765  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1766  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1767 
1768  for ( int i=min1; i<=max1; ++i ) {
1769  int ix = i;
1770  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1771  while ( ix < 0 ) ix += myGrid.K1;
1772  for ( int j=min2; j<=max2; ++j ) {
1773  int jy = j;
1774  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1775  while ( jy < 0 ) jy += myGrid.K2;
1776  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1777  }
1778  }
1779  }
1780 
1781  numPencilsActive = 0;
1782  for ( int i=0; i<xBlocks; ++i ) {
1783  for ( int j=0; j<yBlocks; ++j ) {
1784  if ( pencilActive[i*yBlocks+j] ) {
1785  ++numPencilsActive;
1786 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1787  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1788 #endif
1789  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1790  }
1791  }
1792  }
1793  activePencils = new ijpair[numPencilsActive];
1794  numPencilsActive = 0;
1795  for ( int i=0; i<xBlocks; ++i ) {
1796  for ( int j=0; j<yBlocks; ++j ) {
1797  if ( pencilActive[i*yBlocks+j] ) {
1798  activePencils[numPencilsActive++] = ijpair(i,j);
1799  }
1800  }
1801  }
1802  if ( simParams->PMESendOrder ) {
1803  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1804  } else {
1805  Random rand(CkMyPe());
1806  rand.reorder(activePencils,numPencilsActive);
1807  }
1808  //if ( numPencilsActive ) {
1809  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1810  //}
1811 
1812  ungrid_count = numPencilsActive;
1813 }
static Node * Object()
Definition: Node.h:86
BigReal max_a(int pid) const
Definition: PatchMap.h:92
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
Definition: Random.h:37
int numPatches(void) const
Definition: PatchMap.h:59
int order
Definition: PmeBase.h:23
int getMasterPe()
Definition: DeviceCUDA.h:137
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
NAMD_HOST_DEVICE Vector b_r() const
Definition: Lattice.h:285
BigReal min_a(int pid) const
Definition: PatchMap.h:91
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
NAMD_HOST_DEVICE Vector b() const
Definition: Lattice.h:269
#define simParams
Definition: Output.C:131
BigReal max_b(int pid) const
Definition: PatchMap.h:94
int node(int pid) const
Definition: PatchMap.h:114
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
BigReal min_b(int pid) const
Definition: PatchMap.h:93
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
double BigReal
Definition: common.h:123

◆ pollChargeGridReady()

void ComputePmeMgr::pollChargeGridReady ( )

Definition at line 3613 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

3613  {
3614 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3615  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
 3616  CUDA_POLL(cuda_check_pme_charges,this);
 3617 #else
3618  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3619 #endif
3620 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2505
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void cuda_check_pme_charges(void *arg, double walltime)
Definition: ComputePme.C:3540

◆ pollForcesReady()

void ComputePmeMgr::pollForcesReady ( )

Definition at line 2701 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

2701  {
2702 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2703  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
 2704  CUDA_POLL(cuda_check_pme_forces,this);
 2705 #else
2706  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2707 #endif
2708 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2505
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void cuda_check_pme_forces(void *arg, double walltime)
Definition: ComputePme.C:2512

◆ procTrans()

void ComputePmeMgr::procTrans ( PmeTransMsg msg)

Definition at line 2076 of file ComputePme.C.

References PmeGrid::dim3, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

2076  {
2077  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2078  if ( trans_count == numGridPes ) {
2079  lattice = msg->lattice;
2080  grid_sequence = msg->sequence;
2081  }
2082 
2083  if ( msg->nx ) {
2084  int zdim = myGrid.dim3;
2085  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2086  int first_pe = nodeInfo.pe_start;
2087  int last_pe = first_pe+nodeInfo.npe-1;
2088  int y_skip = localInfo[myTransPe].y_start_after_transpose
2089  - localInfo[first_pe].y_start_after_transpose;
2090  int ny_msg = localInfo[last_pe].y_start_after_transpose
2091  + localInfo[last_pe].ny_after_transpose
2092  - localInfo[first_pe].y_start_after_transpose;
2093  int ny = localInfo[myTransPe].ny_after_transpose;
2094  int x_start = msg->x_start;
2095  int nx = msg->nx;
2096  for ( int g=0; g<numGrids; ++g ) {
2097  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2098  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2099  nx*ny*zdim*sizeof(float));
2100  }
2101  }
2102 
2103  --trans_count;
2104 
2105  if ( trans_count == 0 ) {
2106  pmeProxyDir[CkMyPe()].gridCalc2();
2107  }
2108 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:165
int ny_after_transpose
Definition: ComputePme.C:261
Lattice lattice
Definition: ComputePme.C:162
int y_start_after_transpose
Definition: ComputePme.C:261

◆ procUntrans()

void ComputePmeMgr::procUntrans ( PmeUntransMsg msg)

Definition at line 2337 of file ComputePme.C.

References PmeGrid::dim3, PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

2337  {
2338  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2339 
2340 #if CMK_BLUEGENEL
2341  CmiNetworkProgressAfter (0);
2342 #endif
2343 
2344  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2345  int first_pe = nodeInfo.pe_start;
2346  int g;
2347 
2348  if ( msg->ny ) {
2349  int zdim = myGrid.dim3;
2350  int last_pe = first_pe+nodeInfo.npe-1;
2351  int x_skip = localInfo[myGridPe].x_start
2352  - localInfo[first_pe].x_start;
2353  int nx_msg = localInfo[last_pe].x_start
2354  + localInfo[last_pe].nx
2355  - localInfo[first_pe].x_start;
2356  int nx = localInfo[myGridPe].nx;
2357  int y_start = msg->y_start;
2358  int ny = msg->ny;
2359  int slicelen = myGrid.K2 * zdim;
2360  int cpylen = ny * zdim;
2361  for ( g=0; g<numGrids; ++g ) {
2362  float *q = qgrid + qgrid_size * g + y_start * zdim;
2363  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2364  for ( int x = 0; x < nx; ++x ) {
2365  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2366  q += slicelen;
2367  qmsg += cpylen;
2368  }
2369  }
2370  }
2371 
2372  --untrans_count;
2373 
2374  if ( untrans_count == 0 ) {
2375  pmeProxyDir[CkMyPe()].gridCalc3();
2376  }
2377 }
float * qgrid
Definition: ComputePme.C:182
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32

◆ recvAck()

void ComputePmeMgr::recvAck ( PmeAckMsg msg)

Definition at line 2479 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

2479  {
2480  if ( msg ) delete msg;
2481 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2482  if ( offload ) {
2483  CmiLock(cuda_lock);
2484  if ( ungrid_count == 0 ) {
2485  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2486  }
2487  int uc = --ungrid_count;
2488  CmiUnlock(cuda_lock);
2489 
2490  if ( uc == 0 ) {
2491  pmeProxyDir[master_pe].ungridCalc();
2492  }
2493  return;
2494  }
2495 #endif
2496  --ungrid_count;
2497 
2498  if ( ungrid_count == 0 ) {
2499  pmeProxyDir[CkMyPe()].ungridCalc();
2500  }
2501 }
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static CmiNodeLock cuda_lock
Definition: ComputePme.C:452

◆ recvArrays()

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  x,
CProxy_PmeYPencil  y,
CProxy_PmeZPencil  z 
)

Definition at line 828 of file ComputePme.C.

829  {
830  xPencil = x; yPencil = y; zPencil = z;
831 
832  if(CmiMyRank()==0)
833  {
834  pmeNodeProxy.ckLocalBranch()->xPencil=x;
835  pmeNodeProxy.ckLocalBranch()->yPencil=y;
836  pmeNodeProxy.ckLocalBranch()->zPencil=z;
837  }
838 }

◆ recvChargeGridReady()

void ComputePmeMgr::recvChargeGridReady ( )

Definition at line 3622 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

3622  {
 3623  chargeGridReady(*saved_lattice,saved_sequence);
 3624 }
Lattice * saved_lattice
Definition: ComputePme.C:475
void chargeGridReady(Lattice &lattice, int sequence)
Definition: ComputePme.C:3626
int saved_sequence
Definition: ComputePme.C:476

◆ recvGrid()

void ComputePmeMgr::recvGrid ( PmeGridMsg msg)

Definition at line 1855 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

1855  {
1856  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1857  if ( grid_count == 0 ) {
1858  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1859  }
1860  if ( grid_count == numSources ) {
1861  lattice = msg->lattice;
1862  grid_sequence = msg->sequence;
1863  }
1864 
1865  int zdim = myGrid.dim3;
1866  int zlistlen = msg->zlistlen;
1867  int *zlist = msg->zlist;
1868  float *qmsg = msg->qgrid;
1869  for ( int g=0; g<numGrids; ++g ) {
1870  char *f = msg->fgrid + fgrid_len * g;
1871  float *q = qgrid + qgrid_size * g;
1872  for ( int i=0; i<fgrid_len; ++i ) {
1873  if ( f[i] ) {
1874  for ( int k=0; k<zlistlen; ++k ) {
1875  q[zlist[k]] += *(qmsg++);
1876  }
1877  }
1878  q += zdim;
1879  }
1880  }
1881 
1882  gridmsg_reuse[numSources-grid_count] = msg;
1883  --grid_count;
1884 
1885  if ( grid_count == 0 ) {
1886  pmeProxyDir[CkMyPe()].gridCalc1();
1887  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1888  }
1889 }
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:144
static int numGrids
Definition: ComputePme.h:32
Lattice lattice
Definition: ComputePme.C:146
void NAMD_bug(const char *err_msg)
Definition: common.C:195
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int zlistlen
Definition: ComputePme.C:149
char * fgrid
Definition: ComputePme.C:151

◆ recvRecipEvir()

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg msg)

Definition at line 3068 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), ComputePmeUtil::numGrids, pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

3068  {
3069  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3070  for ( int g=0; g<numGrids; ++g ) {
3071  evir[g] += msg->evir[g];
3072  }
3073  delete msg;
3074  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3075  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3076 }
int size(void) const
Definition: ResizeArray.h:131
static int numGrids
Definition: ComputePme.h:32
PmeReduction * evir
Definition: ComputePme.C:195
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void submitReductions()
Definition: ComputePme.C:4297
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482

◆ recvSharedTrans()

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg msg)

Definition at line 2058 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

2058  {
2059  procTrans(msg->msg);
2060  CmiLock(msg->lock);
2061  int count = --(*msg->count);
2062  CmiUnlock(msg->lock);
2063  if ( count == 0 ) {
2064  CmiDestroyLock(msg->lock);
2065  delete msg->count;
2066  delete msg->msg;
2067  }
2068  delete msg;
2069 }
PmeTransMsg * msg
Definition: ComputePme.C:171
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2076
CmiNodeLock lock
Definition: ComputePme.C:173

◆ recvSharedUntrans()

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg msg)

Definition at line 2319 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

2319  {
2320  procUntrans(msg->msg);
2321  CmiLock(msg->lock);
2322  int count = --(*msg->count);
2323  CmiUnlock(msg->lock);
2324  if ( count == 0 ) {
2325  CmiDestroyLock(msg->lock);
2326  delete msg->count;
2327  delete msg->msg;
2328  }
2329  delete msg;
2330 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2337
CmiNodeLock lock
Definition: ComputePme.C:190
PmeUntransMsg * msg
Definition: ComputePme.C:188

◆ recvTrans()

void ComputePmeMgr::recvTrans ( PmeTransMsg msg)

Definition at line 2071 of file ComputePme.C.

References procTrans().

2071  {
2072  procTrans(msg);
2073  delete msg;
2074 }
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2076

◆ recvUngrid()

void ComputePmeMgr::recvUngrid ( PmeGridMsg msg)

Definition at line 2464 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

2464  {
2465  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2466 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2467  if ( ! offload ) // would need lock
2468 #endif
2469  if ( ungrid_count == 0 ) {
2470  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2471  }
2472 
2473  if ( usePencils ) copyPencils(msg);
2474  else copyResults(msg);
2475  delete msg;
2476  recvAck(0);
2477 }
void recvAck(PmeAckMsg *)
Definition: ComputePme.C:2479
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void copyPencils(PmeGridMsg *)
Definition: ComputePme.C:3872
void copyResults(PmeGridMsg *)
Definition: ComputePme.C:4064

◆ recvUntrans()

void ComputePmeMgr::recvUntrans ( PmeUntransMsg msg)

Definition at line 2332 of file ComputePme.C.

References procUntrans().

2332  {
2333  procUntrans(msg);
2334  delete msg;
2335 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2337

◆ sendChargeGridReady()

void ComputePmeMgr::sendChargeGridReady ( )

Definition at line 3599 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

3599  {
3600  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3601  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3602  int cs = mgr->pmeComputes.size();
3603  if ( cs ) {
3604  mgr->ungridForcesCount = cs;
3605  mgr->recipEvirCount = mgr->recipEvirClients;
3606  masterPmeMgr->chargeGridSubmittedCount++;
3607  }
3608  }
3609  pmeProxy[master_pe].recvChargeGridReady();
3610 }
int size(void) const
Definition: ResizeArray.h:131
int chargeGridSubmittedCount
Definition: ComputePme.C:472
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482

◆ sendData()

void ComputePmeMgr::sendData ( Lattice lattice,
int  sequence 
)

Definition at line 4036 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

4036  {
4037 
4038  sendDataHelper_lattice = &lattice;
4039  sendDataHelper_sequence = sequence;
4040  sendDataHelper_sourcepe = CkMyPe();
4041  sendDataHelper_errors = strayChargeErrors;
4042  strayChargeErrors = 0;
4043 
4044 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4045  if ( offload ) {
4046  for ( int i=0; i < numGridPes; ++i ) {
4047  int pe = gridPeOrder[i]; // different order
4048  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
4049 #if CMK_MULTICORE
4050  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
4051  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
4052 #else
4053  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
4054 #endif
4055  }
4056  } else
4057 #endif
4058  {
4059  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
4060  }
4061 
4062 }
int sendDataHelper_sequence
Definition: ComputePme.C:399
int sendDataHelper_sourcepe
Definition: ComputePme.C:400
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:398
int sendDataHelper_errors
Definition: ComputePme.C:401
void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors)
Definition: ComputePme.C:3914

◆ sendDataHelper()

void ComputePmeMgr::sendDataHelper ( int  iter)

Definition at line 4023 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

4023  {
4024  nodePmeMgr->sendDataHelper(iter);
4025 }
void sendDataHelper(int)
Definition: ComputePme.C:4027

◆ sendDataPart()

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice &  lattice,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3914 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), iout, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

3914  {
3915 
3916  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3917 
3918  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3919 
3920  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3921  for (int j=first; j<=last; j++) {
3922  int pe = gridPeOrder[j]; // different order
3923  if ( ! recipPeDest[pe] && ! errors ) continue;
3924  int start = pe * bsize;
3925  int len = bsize;
3926  if ( start >= qsize ) { start = 0; len = 0; }
3927  if ( start + len > qsize ) { len = qsize - start; }
3928  int zdim = myGrid.dim3;
3929  int fstart = start / zdim;
3930  int flen = len / zdim;
3931  int fcount = 0;
3932  int i;
3933 
3934  int g;
3935  for ( g=0; g<numGrids; ++g ) {
3936  char *f = f_arr + fstart + g*fsize;
3937 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3938  if ( offload ) {
3939  int errcount = 0;
3940  for ( i=0; i<flen; ++i ) {
3941  f[i] = ffz_host[fstart+i];
3942  fcount += f[i];
3943  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3944  }
3945  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3946  } else
3947 #endif
3948  for ( i=0; i<flen; ++i ) {
3949  fcount += f[i];
3950  }
3951  if ( ! recipPeDest[pe] ) {
3952  int errfound = 0;
3953  for ( i=0; i<flen; ++i ) {
3954  if ( f[i] == 3 ) {
3955  errfound = 1;
3956  break;
3957  }
3958  }
3959  if ( errfound ) {
3960  iout << iERROR << "Stray PME grid charges detected: "
3961  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3962  int iz = -1;
3963  for ( i=0; i<flen; ++i ) {
3964  if ( f[i] == 3 ) {
3965  f[i] = 2;
3966  int jz = (i+fstart)/myGrid.K2;
3967  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3968  }
3969  }
3970  iout << "\n" << endi;
3971  }
3972  }
3973  }
3974 
3975 #ifdef NETWORK_PROGRESS
3976  CmiNetworkProgress();
3977 #endif
3978 
3979  if ( ! recipPeDest[pe] ) continue;
3980 
3981  int zlistlen = 0;
3982  for ( i=0; i<myGrid.K3; ++i ) {
3983  if ( fz_arr[i] ) ++zlistlen;
3984  }
3985 
3986  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3987  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3988 
3989  msg->sourceNode = sourcepe;
3990  msg->lattice = lattice;
3991  msg->start = fstart;
3992  msg->len = flen;
3993  msg->zlistlen = zlistlen;
3994  int *zlist = msg->zlist;
3995  zlistlen = 0;
3996  for ( i=0; i<myGrid.K3; ++i ) {
3997  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3998  }
3999  float *qmsg = msg->qgrid;
4000  for ( g=0; g<numGrids; ++g ) {
4001  char *f = f_arr + fstart + g*fsize;
4002  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
4003  float **q = q_arr + fstart + g*fsize;
4004  for ( i=0; i<flen; ++i ) {
4005  if ( f[i] ) {
4006  for (int h=0; h<myGrid.order-1; ++h) {
4007  q[i][h] += q[i][myGrid.K3+h];
4008  }
4009  for ( int k=0; k<zlistlen; ++k ) {
4010  *(qmsg++) = q[i][zlist[k]];
4011  }
4012  }
4013  }
4014  }
4015 
4016  msg->sequence = compute_sequence;
4017  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
4018  pmeProxy[gridPeMap[pe]].recvGrid(msg);
4019  }
4020 
4021 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:144
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
Lattice lattice
Definition: ComputePme.C:146
#define iout
Definition: InfoStream.h:51
int sourceNode
Definition: ComputePme.C:143
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int zlistlen
Definition: ComputePme.C:149
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:151

◆ sendPencils()

void ComputePmeMgr::sendPencils ( Lattice &  lattice,
int  sequence 
)

Definition at line 3809 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), ijpair::i, iERROR(), iout, ijpair::j, PmeGrid::K1, PmeGrid::K2, ComputePmeUtil::numGrids, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

3809  {
3810 
3811  sendDataHelper_lattice = &lattice;
3812  sendDataHelper_sequence = sequence;
3813  sendDataHelper_sourcepe = CkMyPe();
3814 
3815 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3816  if ( offload ) {
3817  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3818 #if CMK_MULTICORE
3819  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3820  int ib = activePencils[ap].i;
3821  int jb = activePencils[ap].j;
3822  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3823  pmeProxy[destproc].sendPencilsHelper(ap);
3824 #else
3825  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3826 #endif
3827  }
3828  } else
3829 #endif
3830  {
3831  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3832  }
3833 
3834  if ( strayChargeErrors ) {
3835  strayChargeErrors = 0;
3836  iout << iERROR << "Stray PME grid charges detected: "
3837  << CkMyPe() << " sending to (x,y)";
3838  int K1 = myGrid.K1;
3839  int K2 = myGrid.K2;
3840  int dim2 = myGrid.dim2;
3841  int block1 = myGrid.block1;
3842  int block2 = myGrid.block2;
3843  for (int ib=0; ib<xBlocks; ++ib) {
3844  for (int jb=0; jb<yBlocks; ++jb) {
3845  int ibegin = ib*block1;
3846  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3847  int jbegin = jb*block2;
3848  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3849  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3850 
3851  for ( int g=0; g<numGrids; ++g ) {
3852  char *f = f_arr + g*fsize;
3853  if ( ! pencilActive[ib*yBlocks+jb] ) {
3854  for ( int i=ibegin; i<iend; ++i ) {
3855  for ( int j=jbegin; j<jend; ++j ) {
3856  if ( f[i*dim2+j] == 3 ) {
3857  f[i*dim2+j] = 2;
3858  iout << " (" << i << "," << j << ")";
3859  }
3860  }
3861  }
3862  }
3863  }
3864  }
3865  }
3866  iout << "\n" << endi;
3867  }
3868 
3869 }
int dim2
Definition: PmeBase.h:22
CProxy_PmePencilMap zm
Definition: ComputePme.C:662
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int sendDataHelper_sequence
Definition: ComputePme.C:399
int sendDataHelper_sourcepe
Definition: ComputePme.C:400
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:398
int i
Definition: ComputePme.C:371
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int j
Definition: ComputePme.C:371
void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe)
Definition: ComputePme.C:3654

◆ sendPencilsHelper()

void ComputePmeMgr::sendPencilsHelper ( int  iter)

Definition at line 3796 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

3796  {
3797  nodePmeMgr->sendPencilsHelper(iter);
3798 }
void sendPencilsHelper(int)
Definition: ComputePme.C:3800

◆ sendPencilsPart()

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice &  lattice,
int  sequence,
int  sourcepe 
)

Definition at line 3654 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

3654  {
3655 
3656  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3657 
3658 #if 0 && USE_PERSISTENT
3659  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3660 #endif
3661  int K1 = myGrid.K1;
3662  int K2 = myGrid.K2;
3663  int dim2 = myGrid.dim2;
3664  int dim3 = myGrid.dim3;
3665  int block1 = myGrid.block1;
3666  int block2 = myGrid.block2;
3667 
3668  // int savedMessages = 0;
3669  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3670 
3671  for (int ap=first; ap<=last; ++ap) {
3672  int ib = activePencils[ap].i;
3673  int jb = activePencils[ap].j;
3674  int ibegin = ib*block1;
3675  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3676  int jbegin = jb*block2;
3677  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3678  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3679 
3680  int fcount = 0;
3681  for ( int g=0; g<numGrids; ++g ) {
3682  char *f = f_arr + g*fsize;
3683 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3684  if ( offload ) {
3685  int errcount = 0;
3686  for ( int i=ibegin; i<iend; ++i ) {
3687  for ( int j=jbegin; j<jend; ++j ) {
3688  int k = i*dim2+j;
3689  f[k] = ffz_host[k];
3690  fcount += f[k];
3691  if ( ffz_host[k] & ~1 ) ++errcount;
3692  }
3693  }
3694  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3695  } else
3696 #endif
3697  for ( int i=ibegin; i<iend; ++i ) {
3698  for ( int j=jbegin; j<jend; ++j ) {
3699  fcount += f[i*dim2+j];
3700  }
3701  }
3702  }
3703 
3704 #ifdef NETWORK_PROGRESS
3705  CmiNetworkProgress();
3706 #endif
3707 
3708  if ( ! pencilActive[ib*yBlocks+jb] )
3709  NAMD_bug("PME activePencils list inconsistent");
3710 
3711  int zlistlen = 0;
3712  for ( int i=0; i<myGrid.K3; ++i ) {
3713  if ( fz_arr[i] ) ++zlistlen;
3714  }
3715 
3716  int hd = ( fcount? 1 : 0 ); // has data?
3717  // if ( ! hd ) ++savedMessages;
3718 
3719 
3720  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3721  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3722  msg->sourceNode = sourcepe;
3723  msg->hasData = hd;
3724  msg->lattice = lattice;
3725  if ( hd ) {
3726 #if 0
3727  msg->start = fstart;
3728  msg->len = flen;
3729 #else
3730  msg->start = -1; // obsolete?
3731  msg->len = -1; // obsolete?
3732 #endif
3733  msg->zlistlen = zlistlen;
3734  int *zlist = msg->zlist;
3735  zlistlen = 0;
3736  for ( int i=0; i<myGrid.K3; ++i ) {
3737  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3738  }
3739  char *fmsg = msg->fgrid;
3740  float *qmsg = msg->qgrid;
3741  for ( int g=0; g<numGrids; ++g ) {
3742  char *f = f_arr + g*fsize;
3743  float **q = q_arr + g*fsize;
3744  for ( int i=ibegin; i<iend; ++i ) {
3745  for ( int j=jbegin; j<jend; ++j ) {
3746  *(fmsg++) = f[i*dim2+j];
3747  if( f[i*dim2+j] ) {
3748  for (int h=0; h<myGrid.order-1; ++h) {
3749  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3750  }
3751  for ( int k=0; k<zlistlen; ++k ) {
3752  *(qmsg++) = q[i*dim2+j][zlist[k]];
3753  }
3754  }
3755  }
3756  }
3757  }
3758  }
3759 
3760  msg->sequence = compute_sequence;
3761  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3762  CmiEnableUrgentSend(1);
3763 #if USE_NODE_PAR_RECEIVE
3764  msg->destElem=CkArrayIndex3D(ib,jb,0);
3765  CProxy_PmePencilMap lzm = npMgr->zm;
3766  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3767  int destnode = CmiNodeOf(destproc);
3768 
3769 #if 0
3770  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3771 #endif
3772  pmeNodeProxy[destnode].recvZGrid(msg);
3773 #if 0
3774  CmiUsePersistentHandle(NULL, 0);
3775 #endif
3776 #else
3777 #if 0
3778  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3779 #endif
3780  zPencil(ib,jb,0).recvGrid(msg);
3781 #if 0
3782  CmiUsePersistentHandle(NULL, 0);
3783 #endif
3784 #endif
3785  CmiEnableUrgentSend(0);
3786  }
3787 
3788 
3789  // if ( savedMessages ) {
3790  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3791  // }
3792 
3793 }
int dim2
Definition: PmeBase.h:22
CProxy_PmePencilMap zm
Definition: ComputePme.C:662
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:144
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:24
Lattice lattice
Definition: ComputePme.C:146
int block2
Definition: PmeBase.h:24
int sourceNode
Definition: ComputePme.C:143
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
CkArrayIndex3D destElem
Definition: ComputePme.C:153
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int K3
Definition: PmeBase.h:21
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
int i
Definition: ComputePme.C:371
int zlistlen
Definition: ComputePme.C:149
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:151
int j
Definition: ComputePme.C:371

◆ sendTrans()

void ComputePmeMgr::sendTrans ( void  )

Definition at line 1967 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

1967  {
1968 
1969  untrans_count = numTransPes;
1970 
1971 #if CMK_SMP && USE_CKLOOP
1972  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1973  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1974  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1975  } else
1976 #endif
1977  {
1978  sendTransSubset(0, numTransNodes-1);
1979  }
1980 
1981 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
static void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:1962
#define CKLOOP_CTRL_PME_SENDTRANS
Definition: SimParameters.h:98
void sendTransSubset(int first, int last)
Definition: ComputePme.C:1983

◆ sendTransBarrier()

void ComputePmeMgr::sendTransBarrier ( void  )

Definition at line 1952 of file ComputePme.C.

1952  {
1953  sendTransBarrier_received += 1;
1954  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1955  if ( sendTransBarrier_received < numGridPes ) return;
1956  sendTransBarrier_received = 0;
1957  for ( int i=0; i<numGridPes; ++i ) {
1958  pmeProxyDir[gridPeMap[i]].sendTrans();
1959  }
1960 }

◆ sendTransSubset()

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1983 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), PmeGrid::K2, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, LocalPmeInfo::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendTrans(), and sendTrans().

1983  {
1984  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1985 
1986  // send data for transpose
1987  int zdim = myGrid.dim3;
1988  int nx = localInfo[myGridPe].nx;
1989  int x_start = localInfo[myGridPe].x_start;
1990  int slicelen = myGrid.K2 * zdim;
1991 
1992  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1993 
1994 #if CMK_BLUEGENEL
1995  CmiNetworkProgressAfter (0);
1996 #endif
1997 
1998  for (int j=first; j<=last; j++) {
1999  int node = transNodeOrder[j]; // different order on each node
2000  int pe = transNodeInfo[node].pe_start;
2001  int npe = transNodeInfo[node].npe;
2002  int totlen = 0;
2003  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
2004  LocalPmeInfo &li = localInfo[pe];
2005  int cpylen = li.ny_after_transpose * zdim;
2006  totlen += cpylen;
2007  }
2008  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
2009  PRIORITY_SIZE) PmeTransMsg;
2010  newmsg->sourceNode = myGridPe;
2011  newmsg->lattice = lattice;
2012  newmsg->x_start = x_start;
2013  newmsg->nx = nx;
2014  for ( int g=0; g<numGrids; ++g ) {
2015  float *qmsg = newmsg->qgrid + nx * totlen * g;
2016  pe = transNodeInfo[node].pe_start;
2017  for (int i=0; i<npe; ++i, ++pe) {
2018  LocalPmeInfo &li = localInfo[pe];
2019  int cpylen = li.ny_after_transpose * zdim;
2020  if ( node == myTransNode ) {
2021  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2022  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2023  }
2024  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2025  for ( int x = 0; x < nx; ++x ) {
2026  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2027  q += slicelen;
2028  qmsg += cpylen;
2029  }
2030  }
2031  }
2032  newmsg->sequence = grid_sequence;
2033  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2034  if ( node == myTransNode ) newmsg->nx = 0;
2035  if ( npe > 1 ) {
2036  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2037  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2038  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2039  }
2040 }
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:165
void fwdSharedTrans(PmeTransMsg *)
Definition: ComputePme.C:2042
#define PRIORITY_SIZE
Definition: Priorities.h:13
int sourceNode
Definition: ComputePme.C:159
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
int ny_after_transpose
Definition: ComputePme.C:261
Lattice lattice
Definition: ComputePme.C:162
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
int y_start_after_transpose
Definition: ComputePme.C:261

◆ sendUngrid()

void ComputePmeMgr::sendUngrid ( void  )

Definition at line 2404 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, ComputePmeUtil::numGrids, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

2404  {
2405 
2406 #if CMK_SMP && USE_CKLOOP
2407  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2408  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2409  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2410  } else
2411 #endif
2412  {
2413  sendUngridSubset(0, numSources-1);
2414  }
2415 
2416  grid_count = numSources;
2417  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2418 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
static int numGrids
Definition: ComputePme.h:32
static void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2399
#define CKLOOP_CTRL_PME_SENDUNTRANS
void sendUngridSubset(int first, int last)
Definition: ComputePme.C:2420

◆ sendUngridSubset()

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2420 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::len, ComputePmeUtil::numGrids, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, PmeGridMsg::qgrid, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

2420  {
2421 
2422 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2423  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2424 #else
2425  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2426 #endif
2427 
2428  for ( int j=first; j<=last; ++j ) {
2429  // int msglen = qgrid_len;
2430  PmeGridMsg *newmsg = gridmsg_reuse[j];
2431  int pe = newmsg->sourceNode;
2432  int zdim = myGrid.dim3;
2433  int flen = newmsg->len;
2434  int fstart = newmsg->start;
2435  int zlistlen = newmsg->zlistlen;
2436  int *zlist = newmsg->zlist;
2437  float *qmsg = newmsg->qgrid;
2438  for ( int g=0; g<numGrids; ++g ) {
2439  char *f = newmsg->fgrid + fgrid_len * g;
2440  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2441  for ( int i=0; i<flen; ++i ) {
2442  if ( f[i] ) {
2443  for ( int k=0; k<zlistlen; ++k ) {
2444  *(qmsg++) = q[zlist[k]];
2445  }
2446  }
2447  q += zdim;
2448  }
2449  }
2450  newmsg->sourceNode = myGridPe;
2451 
2452  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2453  CmiEnableUrgentSend(1);
2454 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2455  if ( offload ) {
2456  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2457  } else
2458 #endif
2459  pmeProxyDir[pe].recvUngrid(newmsg);
2460  CmiEnableUrgentSend(0);
2461  }
2462 }
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
#define PME_OFFLOAD_UNGRID_PRIORITY
Definition: Priorities.h:42
int sourceNode
Definition: ComputePme.C:143
float * qgrid
Definition: ComputePme.C:152
int * zlist
Definition: ComputePme.C:150
int zlistlen
Definition: ComputePme.C:149
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:151

◆ sendUntrans()

void ComputePmeMgr::sendUntrans ( void  )

Definition at line 2218 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, ComputePmeUtil::numGrids, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

2218  {
2219 
2220  trans_count = numGridPes;
2221 
2222  { // send energy and virial
2223  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2224  for ( int g=0; g<numGrids; ++g ) {
2225  newmsg->evir[g] = recip_evir2[g];
2226  }
2227  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2228  CmiEnableUrgentSend(1);
2229  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2230  CmiEnableUrgentSend(0);
2231  }
2232 
2233 #if CMK_SMP && USE_CKLOOP
2234  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2235  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2236  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2237  } else
2238 #endif
2239  {
2240  sendUntransSubset(0, numGridNodes-1);
2241  }
2242 
2243 }
static Node * Object()
Definition: Node.h:86
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
SimParameters * simParameters
Definition: Node.h:181
static int numGrids
Definition: ComputePme.h:32
void sendUntransSubset(int first, int last)
Definition: ComputePme.C:2245
PmeReduction * evir
Definition: ComputePme.C:195
#define CKLOOP_CTRL_PME_SENDUNTRANS
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
static void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2213

◆ sendUntransSubset()

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2245 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, LocalPmeInfo::x_start, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

2245  {
2246 
2247  int zdim = myGrid.dim3;
2248  int y_start = localInfo[myTransPe].y_start_after_transpose;
2249  int ny = localInfo[myTransPe].ny_after_transpose;
2250  int slicelen = myGrid.K2 * zdim;
2251 
2252  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2253 
2254 #if CMK_BLUEGENEL
2255  CmiNetworkProgressAfter (0);
2256 #endif
2257 
2258  // send data for reverse transpose
2259  for (int j=first; j<=last; j++) {
2260  int node = gridNodeOrder[j]; // different order on each node
2261  int pe = gridNodeInfo[node].pe_start;
2262  int npe = gridNodeInfo[node].npe;
2263  int totlen = 0;
2264  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2265  LocalPmeInfo &li = localInfo[pe];
2266  int cpylen = li.nx * zdim;
2267  totlen += cpylen;
2268  }
2269  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2270  newmsg->sourceNode = myTransPe;
2271  newmsg->y_start = y_start;
2272  newmsg->ny = ny;
2273  for ( int g=0; g<numGrids; ++g ) {
2274  float *qmsg = newmsg->qgrid + ny * totlen * g;
2275  pe = gridNodeInfo[node].pe_start;
2276  for (int i=0; i<npe; ++i, ++pe) {
2277  LocalPmeInfo &li = localInfo[pe];
2278  if ( node == myGridNode ) {
2279  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2280  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2281  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2282  int cpylen = ny * zdim;
2283  for ( int x = 0; x < li.nx; ++x ) {
2284  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2285  q += cpylen;
2286  qmsg += slicelen;
2287  }
2288  } else {
2289  CmiMemcpy((void*)qmsg,
2290  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2291  li.nx*ny*zdim*sizeof(float));
2292  qmsg += li.nx*ny*zdim;
2293  }
2294  }
2295  }
2296  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2297  if ( node == myGridNode ) newmsg->ny = 0;
2298  if ( npe > 1 ) {
2299  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2300  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2301  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2302  }
2303 }
float * qgrid
Definition: ComputePme.C:182
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
#define PRIORITY_SIZE
Definition: Priorities.h:13
void fwdSharedUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2305
int ny_after_transpose
Definition: ComputePme.C:261
#define PME_UNTRANS_PRIORITY
Definition: Priorities.h:33
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
int y_start_after_transpose
Definition: ComputePme.C:261

◆ submitReductions()

void ComputePmeMgr::submitReductions ( )

Definition at line 4297 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchFepOn, ComputePmeUtil::alchOn, ComputePmeUtil::alchThermIntOn, SubmitReduction::item(), ComputePmeUtil::lesFactor, ComputePmeUtil::lesOn, ComputePmeUtil::LJPMEOn, WorkDistrib::messageEnqueueWork(), NAMD_bug(), ComputePmeUtil::numGrids, Node::Object(), ComputePmeUtil::pairOn, REDUCTION_ELECT_ENERGY_PME_TI_1, REDUCTION_ELECT_ENERGY_PME_TI_2, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_LJ_ENERGY_SLOW, REDUCTION_STRAY_CHARGE_ERRORS, ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), and SubmitReduction::submit().

Referenced by ComputePme::doWork(), and recvRecipEvir().

4297  {
4298 
4299  SimParameters *simParams = Node::Object()->simParameters;
4300 
4301  for ( int g=0; g<numGrids; ++g ) {
4302  double scale = 1.;
4303  if (alchOn) {
4304  BigReal elecLambdaUp, elecLambdaDown;
4305  // alchLambda set on each step in ComputePme::ungridForces()
4306  if ( alchLambda < 0 || alchLambda > 1 ) {
4307  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4308  }
4309  elecLambdaUp = simParams->getElecLambda(alchLambda);
4310  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4311  if ( g == 0 ) scale = elecLambdaUp;
4312  else if ( g == 1 ) scale = elecLambdaDown;
4313  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4314  if (alchDecouple) {
4315  if ( g == 2 ) scale = 1-elecLambdaUp;
4316  else if ( g == 3 ) scale = 1-elecLambdaDown;
4317  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4318  }
4319  } else if ( lesOn ) {
4320  scale = 1.0 / lesFactor;
4321  } else if ( pairOn ) {
4322  scale = ( g == 0 ? 1. : -1. );
4323  }
4324  if ( LJPMEOn && 1==g ) {
4325  reduction->item(REDUCTION_LJ_ENERGY_SLOW) += evir[g][0] * scale;
4326  } else {
4327  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4328  }
4329  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4330  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4331  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4332  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4333  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4334  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4335  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4336  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4337  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4338 
4339  if (alchFepOn) {
4340  double scale2 = 0.;
4341  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4342  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4343  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4344  if ( g == 0 ) scale2 = elecLambda2Up;
4345  else if ( g == 1 ) scale2 = elecLambda2Down;
4346  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4347  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4348  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4349  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4350  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4351  }
4352 
4353  if (alchThermIntOn) {
4354 
4355  // no decoupling:
4356  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4357  // (interactions between all atoms [partition 0 OR partition 1],
4358  // minus all [within partition 0])
4359  // U = elecLambdaUp * (U[0] - U[2])
4360  // dU/dl = U[0] - U[2];
4361 
4362  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4363  // (interactions between all atoms [partition 0 OR partition 2],
4364  // minus all [within partition 0])
4365  // U = elecLambdaDown * (U[1] - U[2])
4366  // dU/dl = U[1] - U[2];
4367 
4368  // alchDecouple:
4369  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4370  // (interactions between all atoms [partition 0 OR partition 1]
4371  // minus all [within partition 1] minus all [within partition 0]
4372  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4373  // dU/dl = U[0] - U[2] - U[4];
4374 
4375  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4376  // (interactions between all atoms [partition 0 OR partition 2]
4377  // minus all [within partition 2] minus all [within partition 0]
4378  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4379  // dU/dl = U[1] - U[3] - U[4];
4380 
4381 
4382  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4383  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4384  if (!alchDecouple) {
4385  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4386  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4387  }
4388  else { // alchDecouple
4389  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4390  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4391  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4392  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4393  }
4394  }
4395  }
4396 
4397  alchLambda = -1.; // illegal value to catch if not updated
4398 
4399  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4400  reduction->submit();
4401 
4402  for ( int i=0; i<heldComputes.size(); ++i ) {
4403  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4404  }
4405  heldComputes.resize(0);
4406 }
static Node * Object()
Definition: Node.h:86
int size(void) const
Definition: ResizeArray.h:131
virtual void submit(void)=0
SimParameters * simParameters
Definition: Node.h:181
BigReal & item(int i)
Definition: ReductionMgr.h:336
static int numGrids
Definition: ComputePme.h:32
static Bool alchOn
Definition: ComputePme.h:33
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2866
void resize(int i)
Definition: ResizeArray.h:84
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static Bool LJPMEOn
Definition: ComputePme.h:43
static Bool alchDecouple
Definition: ComputePme.h:36
static int lesFactor
Definition: ComputePme.h:39
#define simParams
Definition: Output.C:131
static Bool pairOn
Definition: ComputePme.h:40
static Bool lesOn
Definition: ComputePme.h:38
static Bool alchFepOn
Definition: ComputePme.h:34
double BigReal
Definition: common.h:123
static Bool alchThermIntOn
Definition: ComputePme.h:35

◆ ungridCalc()

void ComputePmeMgr::ungridCalc ( void  )

Definition at line 2554 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

2554  {
2555  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2556 
2557  ungridForcesCount = pmeComputes.size();
2558 
2559 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2560  if ( offload ) {
2561  //CmiLock(cuda_lock);
2562  cudaSetDevice(deviceCUDA->getDeviceID());
2563 
2564  if ( this == masterPmeMgr ) {
2565  double before = CmiWallTimer();
2566  // XXX prevents something from breaking???
2567  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2568  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2569  // try to make the unspecified launch failures go away
2570  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2571  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2572  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2573 
2574  const int myrank = CkMyRank();
2575  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2576  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2577  nodePmeMgr->mgrObjects[i]->ungridCalc();
2578  }
2579  }
2580  if ( ! pmeComputes.size() ) return;
2581  }
2582 
2583  if ( ! end_forces ) {
2584  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2585  end_forces = new cudaEvent_t[n];
2586  for ( int i=0; i<n; ++i ) {
2587  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2588  }
2589  }
2590 
2591  const int pcsz = pmeComputes.size();
2592  if ( ! afn_host ) {
2593  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2594  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2595  cuda_errcheck("malloc params for pme");
2596  }
2597  int totn = 0;
2598  for ( int i=0; i<pcsz; ++i ) {
2599  int n = pmeComputes[i]->numGridAtoms[0];
2600  totn += n;
2601  }
2602  if ( totn > f_data_mgr_alloc ) {
2603  if ( f_data_mgr_alloc ) {
2604  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2605  cudaFree(f_data_mgr_dev);
2606  cudaFreeHost(f_data_mgr_host);
2607  }
2608  f_data_mgr_alloc = 1.2 * (totn + 100);
2609  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2610  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2611  cuda_errcheck("malloc forces for pme");
2612  }
2613  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2614  float *f_dev = f_data_mgr_dev;
2615  float *f_host = f_data_mgr_host;
2616  for ( int i=0; i<pcsz; ++i ) {
2617  int n = pmeComputes[i]->numGridAtoms[0];
2618  pmeComputes[i]->f_data_dev = f_dev;
2619  pmeComputes[i]->f_data_host = f_host;
2620  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2621  afn_host[3*i+1] = f_dev;
2622  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2623  f_dev += 3*n;
2624  f_host += 3*n;
2625  }
2626  //CmiLock(cuda_lock);
2627  double before = CmiWallTimer();
2628  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2629  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2630  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2631  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2632  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2633  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2634 
2635  for ( int i=0; i<pcsz; ++i ) {
2636  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2637  if ( i%EVENT_STRIDE == 0 ) {
2638  int dimy = pcsz - i;
2639  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2640  int maxn = 0;
2641  int subtotn = 0;
2642  for ( int j=0; j<dimy; ++j ) {
2643  int n = pmeComputes[i+j]->numGridAtoms[0];
2644  subtotn += n;
2645  if ( n > maxn ) maxn = n;
2646  }
2647  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2648  before = CmiWallTimer();
2649  cuda_pme_forces(
2650  bspline_coeffs_dev,
2651  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2652  pmeComputes[i]->a_data_dev,
2653  pmeComputes[i]->f_data_dev,
2654  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2655  streams[stream]);
2656  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2657  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2658  before = CmiWallTimer();
2659  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2660  cudaMemcpyDeviceToHost, streams[stream]);
2661 #if 0
2662  cudaDeviceSynchronize();
2663  fprintf(stderr, "i = %d\n", i);
2664  for(int k=0; k < subtotn*3; k++)
2665  {
2666  fprintf(stderr, "f_data_host[%d][%d] = %f\n", i, k,
2667  pmeComputes[i]->f_data_host[k]);
2668  }
2669 #endif
2670  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2671  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2672  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2673  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2674  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2675  }
2676  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2677  }
2678  //CmiUnlock(cuda_lock);
2679  } else
2680 #endif // NAMD_CUDA
2681  {
2682  for ( int i=0; i<pmeComputes.size(); ++i ) {
2684  // pmeComputes[i]->ungridForces();
2685  }
2686  }
2687  // submitReductions(); // must follow all ungridForces()
2688 
2689 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2690  if ( offload ) {
2691  forces_time = CmiWallTimer();
2692  forces_count = ungridForcesCount;
2693  forces_done_count = 0;
2694  pmeProxy[this_pe].pollForcesReady();
2695  }
2696 #endif
2697 
2698  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2699 }
double forces_time
Definition: ComputePme.C:459
int size(void) const
Definition: ResizeArray.h:131
float * a_data_dev
Definition: ComputePme.C:447
#define EVENT_STRIDE
Definition: ComputePme.C:2506
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
#define CUDA_EVENT_ID_PME_COPY
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2866
float * f_data_host
Definition: ComputePme.C:448
int order
Definition: PmeBase.h:23
#define CUDA_EVENT_ID_PME_TICK
float * f_data_dev
Definition: ComputePme.C:449
void ungridCalc(void)
Definition: ComputePme.C:2554
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int getDeviceID()
Definition: DeviceCUDA.h:144
int K3
Definition: PmeBase.h:21
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:482
cudaEvent_t * end_forces
Definition: ComputePme.C:455
#define CUDA_EVENT_ID_PME_KERNEL
int forces_done_count
Definition: ComputePme.C:457

Friends And Related Function Documentation

◆ ComputePme

friend class ComputePme
friend

Definition at line 385 of file ComputePme.C.

◆ NodePmeMgr

friend class NodePmeMgr
friend

Definition at line 386 of file ComputePme.C.

Member Data Documentation

◆ a_data_dev

float* ComputePmeMgr::a_data_dev

Definition at line 447 of file ComputePme.C.

Referenced by cuda_submit_charges(), ComputePme::doWork(), and ungridCalc().

◆ a_data_host

float* ComputePmeMgr::a_data_host

Definition at line 446 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ComputePme::doWork().

◆ chargeGridSubmittedCount

int ComputePmeMgr::chargeGridSubmittedCount

◆ charges_time

double ComputePmeMgr::charges_time

Definition at line 458 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

◆ check_charges_count

int ComputePmeMgr::check_charges_count

Definition at line 460 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

◆ check_forces_count

int ComputePmeMgr::check_forces_count

Definition at line 461 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

◆ cuda_atoms_alloc

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 451 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ComputePme::doWork().

◆ cuda_atoms_count

int ComputePmeMgr::cuda_atoms_count

◆ cuda_busy

bool ComputePmeMgr::cuda_busy
static

Definition at line 470 of file ComputePme.C.

Referenced by ComputePme::doWork().

◆ cuda_lock

CmiNodeLock ComputePmeMgr::cuda_lock
static

Definition at line 452 of file ComputePme.C.

Referenced by ComputePmeMgr(), ComputePme::doWork(), initialize_computes(), and recvAck().

◆ cuda_submit_charges_deque

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque
static

Definition at line 469 of file ComputePme.C.

Referenced by ComputePme::doWork().

◆ end_charges

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 454 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

◆ end_forces

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 455 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

◆ f_data_dev

float* ComputePmeMgr::f_data_dev

Definition at line 449 of file ComputePme.C.

Referenced by ungridCalc().

◆ f_data_host

float* ComputePmeMgr::f_data_host

Definition at line 448 of file ComputePme.C.

Referenced by ungridCalc().

◆ fftw_plan_lock

CmiNodeLock ComputePmeMgr::fftw_plan_lock
static

◆ forces_count

int ComputePmeMgr::forces_count

Definition at line 456 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ forces_done_count

int ComputePmeMgr::forces_done_count

Definition at line 457 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ forces_time

double ComputePmeMgr::forces_time

Definition at line 459 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ master_pe

int ComputePmeMgr::master_pe

◆ pmeComputes

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

◆ pmemgr_lock

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 443 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

◆ saved_lattice

Lattice* ComputePmeMgr::saved_lattice

Definition at line 475 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

◆ saved_sequence

int ComputePmeMgr::saved_sequence

◆ sendDataHelper_errors

int ComputePmeMgr::sendDataHelper_errors

Definition at line 401 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

◆ sendDataHelper_lattice

Lattice* ComputePmeMgr::sendDataHelper_lattice

◆ sendDataHelper_sequence

int ComputePmeMgr::sendDataHelper_sequence

◆ sendDataHelper_sourcepe

int ComputePmeMgr::sendDataHelper_sourcepe

◆ this_pe

int ComputePmeMgr::this_pe

Definition at line 463 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file: