NAMD
CudaPmeOneDevice Class Reference

#include <CudaPmeSolverUtil.h>

Classes

struct  EnergyVirial
 

Public Member Functions

 CudaPmeOneDevice (PmeGrid pmeGrid_, int deviceID_, int deviceIndex_)
 
 ~CudaPmeOneDevice ()
 
void compute (const Lattice &lattice, int doEnergyVirial, int step)
 
void finishReduction (bool doEnergyVirial)
 
int getShiftedGrid (const double x, const int grid)
 
int computeSharedMemoryPatchLevelSpreadCharge (const int numThreads, const int3 patchGridDim, const int order)
 
int computeSharedMemoryPatchLevelGatherForce (const int numThreads, const int3 patchGridDim, const int order)
 
void checkPatchLevelSimParamCompatibility (const int order, const bool periodicY, const bool periodicZ)
 
void checkPatchLevelDeviceCompatibility ()
 
void checkPatchLevelLatticeCompatibilityAndComputeOffsets (const Lattice &lattice, const int numPatches, const CudaLocalRecord *localRecords, double3 *patchMin, double3 *patchMax, double3 *awayDists)
 

Public Attributes

PmeGrid pmeGrid
 
int deviceID
 
int deviceIndex
 
cudaStream_t stream
 
int natoms
 
size_t num_used_grids
 
float4 * d_atoms
 
int * d_partition
 
float3 * d_forces
 
float * d_scaling_factors
 
cudaTextureObject_t * gridTexObjArrays
 
float * d_grids
 
float2 * d_trans
 
size_t gridsize
 
size_t transize
 
cufftHandle * forwardPlans
 
cufftHandle * backwardPlans
 
float * d_bm1
 
float * d_bm2
 
float * d_bm3
 
double kappa
 
EnergyVirial * d_energyVirials
 
EnergyVirial * h_energyVirials
 
bool self_energy_alch_first_time
 
bool force_scaling_alch_first_time
 
double * d_selfEnergy
 
double * d_selfEnergy_FEP
 
double * d_selfEnergy_TI_1
 
double * d_selfEnergy_TI_2
 
double selfEnergy
 
double selfEnergy_FEP
 
double selfEnergy_TI_1
 
double selfEnergy_TI_2
 
int m_step
 
PatchLevelPmeData patchLevelPmeData
 
Lattice currentLattice
 

Detailed Description

PME for the single-GPU case, where data persists on the GPU. Calls the real-space, FFT, and k-space parts. Receives atom and charge data as float4 * allocated on the device. Returns force data as float3 * allocated on the device. Returns energy and virial allocated on the device.

Definition at line 209 of file CudaPmeSolverUtil.h.
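
A minimal usage sketch of the interface (hypothetical driver code; in NAMD the object is constructed and driven by the GPU-resident integration path, and atom data are staged into d_atoms through PatchData rather than by the caller). Names such as grid, lattice, firstStep, lastStep, and energyFrequency are placeholders:

 // Sketch only: per-step driving of the one-device PME object.
 PmeGrid grid;                                   // K1, K2, K3, order set up elsewhere
 CudaPmeOneDevice pme(grid, /*deviceID=*/0, /*deviceIndex=*/0);

 for (int step = firstStep; step <= lastStep; ++step) {
   const int doEnergyVirial = (step % energyFrequency == 0);
   pme.compute(lattice, doEnergyVirial, step);   // spread, FFTs, k-space, gather
   pme.finishReduction(doEnergyVirial);          // submit energy/virial reductions
 }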

Constructor & Destructor Documentation

◆ CudaPmeOneDevice()

CudaPmeOneDevice::CudaPmeOneDevice ( PmeGrid  pmeGrid_,
int  deviceID_,
int  deviceIndex_ 
)

Definition at line 1303 of file CudaPmeSolverUtil.C.

References SimParameters::alchFepOn, SimParameters::alchGetNumOfPMEGrids(), SimParameters::alchOn, SimParameters::alchThermIntOn, backwardPlans, checkPatchLevelDeviceCompatibility(), checkPatchLevelSimParamCompatibility(), compute_b_moduli(), cudaCheck, SimParameters::CUDASOAintegrateMode, cufftCheck, d_atoms, d_bm1, d_bm2, d_bm3, d_energyVirials, d_forces, d_grids, d_partition, d_scaling_factors, d_selfEnergy, d_selfEnergy_FEP, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_trans, deviceID, deviceIndex, forwardPlans, gridsize, gridTexObjArrays, h_energyVirials, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, Node::molecule, NAMD_bug(), natoms, num_used_grids, Molecule::numAtoms, Node::Object(), ReductionMgr::Object(), PmeGrid::order, order, pmeGrid, REDUCTIONS_GPURESIDENT, Node::simParameters, stream, transize, and ReductionMgr::willSubmit().

1307  :
1308  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
1309  natoms(0), d_atoms(0), d_forces(0),
1310  d_grids(0), gridsize(0),
1311  d_trans(0), transize(0),
1312  d_bm1(0), d_bm2(0), d_bm3(0),
1317 {
1318 // fprintf(stderr, "CudaPmeOneDevice constructor START ******************************************\n");
1319  const SimParameters& sim_params = *(Node::Object()->simParameters);
1321  // Determine how many grids we need for the alchemical route
1322  if (sim_params.alchOn) {
1323  num_used_grids = sim_params.alchGetNumOfPMEGrids();
1324  } else {
1325  num_used_grids = 1;
1326  }
1327  cudaCheck(cudaSetDevice(deviceID));
1328 
1329  // Check to see if the simulation and device are compatible with patch-level kernels. The results
1330  // will be stored in the PatchLevelPmeData field
1331  checkPatchLevelSimParamCompatibility(pmeGrid.order, true /* periodic Y */, true /* periodic Z */);
1333 
1334  if (!sim_params.CUDASOAintegrateMode) {
1335  NAMD_bug("CudaPmeOneDevice requires GPU-resident mode");
1336  }
1337  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
1338 
1339  // create our own CUDA stream
1340 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
1341  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1342  int leastPriority, greatestPriority;
1343  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
1344  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
1345 #else
1346  cudaCheck(cudaStreamCreate(&stream));
1347 #endif
1348 
1349  allocate_host<EnergyVirial>(&h_energyVirials, num_used_grids);
1350  allocate_device<EnergyVirial>(&d_energyVirials, num_used_grids);
1351  allocate_device<float>(&d_scaling_factors, num_used_grids);
1352  allocate_device<double>(&d_selfEnergy, 1);
1353  if (sim_params.alchFepOn) {
1354  allocate_device<double>(&d_selfEnergy_FEP, 1);
1355  } else {
1356  d_selfEnergy_FEP = NULL;
1357  }
1358  if (sim_params.alchThermIntOn) {
1359  allocate_device<double>(&d_selfEnergy_TI_1, 1);
1360  allocate_device<double>(&d_selfEnergy_TI_2, 1);
1361  } else {
1362  d_selfEnergy_TI_1 = NULL;
1363  d_selfEnergy_TI_2 = NULL;
1364  }
1365 
1366  // create device buffer space for atom positions and forces
1367  // to be accessed externally through PatchData
1368  allocate_device<float4>(&d_atoms, num_used_grids * natoms);
1369  allocate_device<float3>(&d_forces, num_used_grids * natoms);
1370  if (sim_params.alchOn) {
1371  allocate_device<int>(&d_partition, natoms);
1372  } else {
1373  d_partition = NULL;
1374  }
1375 #ifdef NODEGROUP_FORCE_REGISTER
1376  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
1377  devData.s_datoms = (CudaAtom *) (d_atoms);
1378  devData.f_slow = (CudaForce *) (d_forces);
1379  devData.f_slow_size = natoms;
1380  devData.s_datoms_partition = d_partition;
1381 #endif
1382  int k1 = pmeGrid.K1;
1383  int k2 = pmeGrid.K2;
1384  int k3 = pmeGrid.K3;
1385  int order = pmeGrid.order;
1386  gridsize = k1 * k2 * k3;
1387  transize = (k1/2 + 1) * k2 * k3;
1388 
1389 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1390 
1391  // set up cufft
1392  forwardPlans = new cufftHandle[num_used_grids];
1393  backwardPlans = new cufftHandle[num_used_grids];
1394  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1395  cufftCheck(cufftPlan3d(&(forwardPlans[iGrid]), k3, k2, k1, CUFFT_R2C));
1396  cufftCheck(cufftPlan3d(&(backwardPlans[iGrid]), k3, k2, k1, CUFFT_C2R));
1397  cufftCheck(cufftSetStream(forwardPlans[iGrid], stream));
1398  cufftCheck(cufftSetStream(backwardPlans[iGrid], stream));
1399  }
1400 #endif
1401 
1402 #ifdef NAMD_CUDA
1403  cudaDeviceProp deviceProp;
1404  cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceID));
1405  const int texture_alignment = int(deviceProp.textureAlignment);
1406  // d_grids and d_grids + N * gridsize will be used as device pointers for ::cudaResourceDesc::res::linear::devPtr
1407  // check if (d_grids + N * gridsize) is an address aligned to ::cudaDeviceProp::textureAlignment
1408  // which is required by cudaCreateTextureObject()
1409  // or maybe I should use cudaMallocPitch()?
1410  if ((gridsize % texture_alignment) != 0) {
1411  // if it is not aligned, padding is required
1412  gridsize = (int(gridsize / texture_alignment) + 1) * texture_alignment;
1413  }
1414  // Is it necessary to align transize too?
1415 // if ((transize % texture_alignment) != 0) {
1416 // // if it is not aligned, padding is required
1417 // transize = (int(transize / texture_alignment) + 1) * texture_alignment;
1418 // }
1419  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1420  allocate_device<float2>(&d_trans, num_used_grids * transize);
1421  gridTexObjArrays = new cudaTextureObject_t[num_used_grids];
1422  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1423  // set up texture object
1424  cudaResourceDesc resDesc;
1425  memset(&resDesc, 0, sizeof(resDesc));
1426  resDesc.resType = cudaResourceTypeLinear;
1427  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
1428  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
1429  resDesc.res.linear.desc.x = sizeof(float)*8;
1430  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
1431  cudaTextureDesc texDesc;
1432  memset(&texDesc, 0, sizeof(texDesc));
1433  texDesc.readMode = cudaReadModeElementType;
1434  cudaCheck(cudaCreateTextureObject(&(gridTexObjArrays[iGrid]), &resDesc, &texDesc, NULL));
1435  }
1436 #else
1437  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1438  allocate_device<float2>(&d_trans, num_used_grids * transize);
1439 #endif
1440  // calculate prefactors
1441  double *bm1 = new double[k1];
1442  double *bm2 = new double[k2];
1443  double *bm3 = new double[k3];
1444  // Use compute_b_moduli from PmeKSpace.C
1445  extern void compute_b_moduli(double *bm, int k, int order);
1446  compute_b_moduli(bm1, k1, order);
1447  compute_b_moduli(bm2, k2, order);
1448  compute_b_moduli(bm3, k3, order);
1449 
1450  // allocate space for and copy prefactors onto GPU
1451  float *bm1f = new float[k1];
1452  float *bm2f = new float[k2];
1453  float *bm3f = new float[k3];
1454  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
1455  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
1456  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
1457  allocate_device<float>(&d_bm1, k1);
1458  allocate_device<float>(&d_bm2, k2);
1459  allocate_device<float>(&d_bm3, k3);
1460  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
1461  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
1462  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
1463  delete [] bm1f;
1464  delete [] bm2f;
1465  delete [] bm3f;
1466  delete [] bm1;
1467  delete [] bm2;
1468  delete [] bm3;
1469 
1470  cudaCheck(cudaStreamSynchronize(stream));
1471 
1472 // fprintf(stderr, "CudaPmeOneDevice constructor END ********************************************\n");
1473 }
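The gridsize padding in the texture setup above rounds the per-grid length up to a multiple of cudaDeviceProp::textureAlignment, so that each d_grids + iGrid * gridsize offset is an acceptable base address for cudaCreateTextureObject() with a linear resource descriptor. The same round-up written as a standalone helper (a sketch, not part of the class):

 // Round a buffer length up to the next multiple of the device texture
 // alignment (mirrors the padding done in the constructor above).
 size_t padToTextureAlignment(size_t n, size_t textureAlignment) {
   if (n % textureAlignment != 0) {
     n = (n / textureAlignment + 1) * textureAlignment;
   }
   return n;
 }
 // e.g. a length of 1000 with alignment 512 pads to 1024.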

◆ ~CudaPmeOneDevice()

CudaPmeOneDevice::~CudaPmeOneDevice ( )

Definition at line 1475 of file CudaPmeSolverUtil.C.

References backwardPlans, cudaCheck, cufftCheck, d_atoms, d_bm1, d_bm2, d_bm3, d_energyVirials, d_forces, d_grids, d_partition, PatchLevelPmeData::d_patchGridOffsets, d_scaling_factors, d_selfEnergy, d_selfEnergy_FEP, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_trans, forwardPlans, gridTexObjArrays, h_energyVirials, PatchLevelPmeData::h_patchGridOffsets, num_used_grids, patchLevelPmeData, and stream.

1475  {
1476  deallocate_device<float4>(&d_atoms);
1477  deallocate_device<float3>(&d_forces);
1478  deallocate_device<float2>(&d_trans);
1479  deallocate_device<float>(&d_grids);
1480  deallocate_host<EnergyVirial>(&h_energyVirials);
1481  deallocate_device<EnergyVirial>(&d_energyVirials);
1482  deallocate_device<float>(&d_scaling_factors);
1483 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1484  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1485  cufftCheck(cufftDestroy(forwardPlans[iGrid]));
1486  cufftCheck(cufftDestroy(backwardPlans[iGrid]));
1487 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1488  cudaCheck(cudaDestroyTextureObject(gridTexObjArrays[iGrid]));
1489 #endif
1490  }
1491 
1492  if (patchLevelPmeData.h_patchGridOffsets != nullptr) {
1493  deallocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets);
1494  }
1495  if (patchLevelPmeData.d_patchGridOffsets != nullptr) {
1496  deallocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets);
1497  }
1498 
1499  delete[] forwardPlans;
1500  delete[] backwardPlans;
1501 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1502  delete[] gridTexObjArrays;
1503 #endif
1504 
1505 
1506 #endif
1507  deallocate_device<double>(&d_selfEnergy);
1508  if (d_partition != NULL) deallocate_device<int>(&d_partition);
1509  if (d_selfEnergy_FEP != NULL) deallocate_device<double>(&d_selfEnergy_FEP);
1510  if (d_selfEnergy_TI_1 != NULL) deallocate_device<double>(&d_selfEnergy_TI_1);
1511  if (d_selfEnergy_TI_2 != NULL) deallocate_device<double>(&d_selfEnergy_TI_2);
1512  deallocate_device<float>(&d_bm1);
1513  deallocate_device<float>(&d_bm2);
1514  deallocate_device<float>(&d_bm3);
1515  cudaCheck(cudaStreamDestroy(stream));
1516 
1517  if (reductionGpuResident) {
1518  delete reductionGpuResident;
1519  }
1520 }

Member Function Documentation

◆ checkPatchLevelDeviceCompatibility()

void CudaPmeOneDevice::checkPatchLevelDeviceCompatibility ( )

Definition at line 2045 of file CudaPmeSolverUtil.C.

References computeSharedMemoryPatchLevelGatherForce(), computeSharedMemoryPatchLevelSpreadCharge(), PatchLevelPmeData::deviceCompatible, deviceID, PatchLevelPmeData::deviceMaxSharedBytes, PatchLevelPmeData::gatherForceSharedBytes, PatchLevelPmeData::kNumThreads, PatchLevelPmeData::kPatchGridDim, PatchLevelPmeData::kPatchGridDimPad, patchLevelPmeData, and PatchLevelPmeData::spreadChargeSharedBytes.

Referenced by CudaPmeOneDevice().

2045  {
2046  cudaDeviceGetAttribute(&patchLevelPmeData.deviceMaxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID);
2047 
2048  const int3 constexprPatchGridDim = make_int3(
2052 
2055  constexprPatchGridDim, 8 /* order */);
2058  constexprPatchGridDim, 8 /* order */);
2059 
2063 }
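Several statements in this listing collapse into cross-reference links in the extracted text. Based on the members referenced above, a hedged sketch of what the elided body computes follows; the argument order of the grid-dimension constants and the final compatibility test are assumptions, not the verbatim source:

 // Sketch only: the elided body presumably resembles the following.
 cudaDeviceGetAttribute(&patchLevelPmeData.deviceMaxSharedBytes,
                        cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID);

 const int3 patchGridDim = make_int3(PatchLevelPmeData::kPatchGridDimPad,   // order assumed
                                     PatchLevelPmeData::kPatchGridDim,
                                     PatchLevelPmeData::kPatchGridDim);

 patchLevelPmeData.spreadChargeSharedBytes = computeSharedMemoryPatchLevelSpreadCharge(
     PatchLevelPmeData::kNumThreads, patchGridDim, 8 /* order */);
 patchLevelPmeData.gatherForceSharedBytes = computeSharedMemoryPatchLevelGatherForce(
     PatchLevelPmeData::kNumThreads, patchGridDim, 8 /* order */);

 // Assumed final check: both kernels must fit in the opt-in shared-memory limit.
 patchLevelPmeData.deviceCompatible =
     (patchLevelPmeData.spreadChargeSharedBytes <= patchLevelPmeData.deviceMaxSharedBytes) &&
     (patchLevelPmeData.gatherForceSharedBytes  <= patchLevelPmeData.deviceMaxSharedBytes);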

◆ checkPatchLevelLatticeCompatibilityAndComputeOffsets()

void CudaPmeOneDevice::checkPatchLevelLatticeCompatibilityAndComputeOffsets ( const Lattice lattice,
const int  numPatches,
const CudaLocalRecord localRecords,
double3 *  patchMin,
double3 *  patchMax,
double3 *  awayDists 
)

Definition at line 2065 of file CudaPmeSolverUtil.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), Lattice::c(), Lattice::c_r(), currentLattice, PatchLevelPmeData::d_patchGridOffsets, PatchLevelPmeData::deviceCompatible, getShiftedGrid(), PatchLevelPmeData::h_patchGridOffsets, Lattice::isEqual(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchLevelPmeData::kPatchGridDim, PatchLevelPmeData::latticeCompatible, PatchLevelPmeData::localRecords, PatchLevelPmeData::numPatches, Node::Object(), PmeGrid::order, order, PatchLevelPmeData::patchGridDim, patchLevelPmeData, pmeGrid, Lattice::scale(), Node::simParameters, simParams, PatchLevelPmeData::simulationCompatible, Vector::unit(), Lattice::unscale(), and Vector::x.

2067  {
2068 
2069  patchLevelPmeData.localRecords = localRecords;
2070 
2071  // If the simulation isn't compatible or the device isn't compatible then no point in checking
2072  // patch sizes
2074 
2075  patchLevelPmeData.numPatches = numPatches;
2076 
2077  if (patchLevelPmeData.h_patchGridOffsets == nullptr) {
2078  allocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets, numPatches);
2079  }
2080  if (patchLevelPmeData.d_patchGridOffsets == nullptr) {
2081  allocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets, numPatches);
2082  }
2083 
2085  const int order = pmeGrid.order;
2086 
2087  // We only need to recompute the grid offsets if the lattice has changed
2088  if (!lattice.isEqual(currentLattice)) {
2089  currentLattice = lattice;
2090 
2091  double sysdima = currentLattice.a_r().unit() * currentLattice.a();
2092  double sysdimb = currentLattice.b_r().unit() * currentLattice.b();
2093  double sysdimc = currentLattice.c_r().unit() * currentLattice.c();
2094 
2095  patchLevelPmeData.patchGridDim = make_int3(0,0,0);
2096 
2097  for (int i = 0; i < numPatches; i++) {
2098  double3 pmin = currentLattice.unscale(patchMin[i]);
2099  double3 pmax = currentLattice.unscale(patchMax[i]);
2100  double3 width = pmax - pmin;
2101 
2102  // Logic copied from margin violation check
2103  double3 marginVal;
2104  marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
2105  marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
2106  marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);
2107  marginVal = currentLattice.unscale(marginVal);
2108 
2109  double3 minAtom = pmin - marginVal;
2110  double3 maxAtom = pmax + marginVal;
2111 
2112  double3 minScaled = currentLattice.scale(minAtom);
2113  double3 maxScaled = currentLattice.scale(maxAtom);
2114 
2115  int3 gridMin;
2116  gridMin.x = getShiftedGrid(minScaled.x, pmeGrid.K1);
2117  gridMin.y = getShiftedGrid(minScaled.y, pmeGrid.K2);
2118  gridMin.z = getShiftedGrid(minScaled.z, pmeGrid.K3);
2119 
2120  int3 gridMax;
2121  gridMax.x = getShiftedGrid(maxScaled.x, pmeGrid.K1);
2122  gridMax.y = getShiftedGrid(maxScaled.y, pmeGrid.K2);
2123  gridMax.z = getShiftedGrid(maxScaled.z, pmeGrid.K3);
2124 
2125  int3 gridWidth;
2126  gridWidth.x = gridMax.x - gridMin.x + order;
2127  gridWidth.y = gridMax.y - gridMin.y + order;
2128  gridWidth.z = gridMax.z - gridMin.z + order;
2129 
2131  patchLevelPmeData.patchGridDim.x = std::max(patchLevelPmeData.patchGridDim.x, gridWidth.x);
2132  patchLevelPmeData.patchGridDim.y = std::max(patchLevelPmeData.patchGridDim.y, gridWidth.y);
2133  patchLevelPmeData.patchGridDim.z = std::max(patchLevelPmeData.patchGridDim.z, gridWidth.z);
2134  }
2136  numPatches, nullptr);
2137  cudaStreamSynchronize(nullptr);
2138  const int maxGridPoints = patchLevelPmeData.patchGridDim.x *
2140 
2145  }
2146 }

◆ checkPatchLevelSimParamCompatibility()

void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility ( const int  order,
const bool  periodicY,
const bool  periodicZ 
)

Definition at line 2034 of file CudaPmeSolverUtil.C.

References deviceCUDA, DeviceCUDA::getNumDevice(), order, patchLevelPmeData, and PatchLevelPmeData::simulationCompatible.

Referenced by CudaPmeOneDevice().

2034  {
2035  bool use = true;
2036  use = use && (order == 8);
2037  use = use && (periodicY);
2038  use = use && (periodicZ);
2039 
2040  use = use && (deviceCUDA->getNumDevice() == 1); // This is only supported for single GPU currently
2041 
2043 }

◆ compute()

void CudaPmeOneDevice::compute ( const Lattice lattice,
int  doEnergyVirial,
int  step 
)

Definition at line 1522 of file CudaPmeSolverUtil.C.

References Lattice::a_r(), SimParameters::alchOn, Lattice::b_r(), backwardPlans, Lattice::c_r(), compute_selfEnergy(), cudaCheck, cufftCheck, d_atoms, d_bm1, d_bm2, d_bm3, d_energyVirials, d_forces, d_grids, d_selfEnergy, d_trans, deviceID, SimParameters::firstTimestep, forwardPlans, gather_force(), gridsize, gridTexObjArrays, h_energyVirials, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, kappa, m_step, natoms, num_used_grids, Node::Object(), PmeGrid::order, order, patchLevelPmeData, pmeGrid, scalar_sum(), selfEnergy, Node::simParameters, spread_charge(), spread_charge_v2(), stream, transize, Lattice::volume(), WARPSIZE, Vector::x, Vector::y, and Vector::z.

1533  {
1534 // fprintf(stderr, "CudaPmeOneDevice compute ****************************************************\n");
1535  int k1 = pmeGrid.K1;
1536  int k2 = pmeGrid.K2;
1537  int k3 = pmeGrid.K3;
1538  int order = pmeGrid.order;
1539  double volume = lattice.volume();
1540  Vector a_r = lattice.a_r();
1541  Vector b_r = lattice.b_r();
1542  Vector c_r = lattice.c_r();
1543  float arx = a_r.x;
1544  float ary = a_r.y;
1545  float arz = a_r.z;
1546  float brx = b_r.x;
1547  float bry = b_r.y;
1548  float brz = b_r.z;
1549  float crx = c_r.x;
1550  float cry = c_r.y;
1551  float crz = c_r.z;
1552  m_step = step;
1553 
1554  //JM: actually necessary if you reserve a PME device!
1555  cudaCheck(cudaSetDevice(deviceID));
1556  const SimParameters& sim_params = *(Node::Object()->simParameters);
1557 
1558  // clear force array
1559  //fprintf(stderr, "Calling clear_device_array on d_force\n");
1560  clear_device_array<float3>(d_forces, num_used_grids * natoms, stream);
1561  // clear grid
1562  //fprintf(stderr, "Calling clear_device_array on d_grid\n");
1563  clear_device_array<float>(d_grids, num_used_grids * gridsize, stream);
1564  clear_device_array<float2>(d_trans, num_used_grids * transize, stream);
1565 
1566  // Clear energy and virial array if needed
1567  if (doEnergyVirial) {
1568  // clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
1569  clear_device_array<EnergyVirial>(d_energyVirials, num_used_grids * 1, stream);
1570  const bool updateSelfEnergy = (step == sim_params.firstTimestep) || (selfEnergy == 0);
1571  if (updateSelfEnergy && (sim_params.alchOn == false)) {
1572  clear_device_array<double>(d_selfEnergy, 1, stream);
1573  // calculate self energy term if not yet done
1575  kappa, stream);
1576  //fprintf(stderr, "selfEnergy = %12.8f\n", selfEnergy);
1577  }
1578  /* the self energy depends on the scaling factor, or lambda
1579  * the cases when self energy will be changed:
1580  * 1. If alchLambdaFreq > 0, we will have a linear scaling of lambda. Lambda is changed EVERY STEP!
1581  * 2. In most cases, users will not use alchLambdaFreq > 0, but simulations may enter another lambda-window by using TCL scripts.
1582  * in summary, the self energy will be not changed unless lambda is changed.
1583  * so calcSelfEnergyAlch() would compare lambda of current step with the one from last step.
1584  * only if lambda is changed, the calcSelfEnergyFEPKernel or calcSelfEnergyTIKernel will be executed again.
1585  */
1586  if (sim_params.alchOn) calcSelfEnergyAlch(m_step);
1587  }
1588 
1589 #if 0
1590 
1591  spread_charge(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1592  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1593  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1594  d_grid, order, stream);
1595 #else
1596  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
1597  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1599  d_atoms + iGrid * natoms, natoms, k1, k2, k3,
1600  float(k1), (float)k2, (float)k3, order3,
1601  k1, k2, k3,
1602  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1603  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1604  d_grids + iGrid * gridsize, order, stream);
1605  }
1606 
1607 #endif
1608  //cudaCheck(cudaStreamSynchronize(stream));
1609 
1610  // forward FFT
1611  //fprintf(stderr, "Calling cufftExecR2C\n");
1612  //cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)d_grid,
1613  // (cufftComplex *)d_tran));
1614 
1615  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1616  cufftCheck(cufftExecR2C(forwardPlans[iGrid],
1617  (cufftReal *)(d_grids + iGrid * gridsize),
1618  (cufftComplex *)(d_trans + iGrid * transize)));
1619  }
1620 
1621  //cudaCheck(cudaStreamSynchronize(stream));
1622 
1623  // reciprocal space calculation
1624  //fprintf(stderr, "Calling scalar_sum\n");
1625  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1626  scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1627  kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1628  d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1629  (bool) doEnergyVirial, &(d_energyVirials[iGrid].energy),
1630  d_energyVirials[iGrid].virial, d_trans + iGrid * transize, stream);
1631  }
1632  //scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1633  // kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1634  // d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1635  // (bool) doEnergyVirial, &(d_energyVirial->energy),
1636  // d_energyVirial->virial, d_tran, stream);
1637  //cudaCheck(cudaStreamSynchronize(stream));
1638 
1639  // backward FFT
1640  //fprintf(stderr, "Calling cufftExecC2R\n");
1641  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1642  cufftCheck(cufftExecC2R(backwardPlans[iGrid],
1643  (cufftComplex *)(d_trans + iGrid * transize),
1644  (cufftReal *)(d_grids + iGrid * gridsize)));
1645  }
1646 
1647  //cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)d_tran,
1648  // (cufftReal *)d_grid));
1649  //cudaCheck(cudaStreamSynchronize(stream));
1650 
1651  // gather force from grid to atoms
1652  // missing cudaTextureObject_t below works for __CUDA_ARCH__ >= 350
1653  //fprintf(stderr, "Calling gather_force\n");
1654  for (unsigned int iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1656  &(d_atoms[iGrid * natoms]), natoms, k1, k2, k3, k1, k2, k3,
1657  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1658  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1659  d_grids + iGrid * gridsize, order, d_forces + iGrid * natoms,
1660 #ifdef NAMD_CUDA
1661  gridTexObjArrays[iGrid] /* cudaTextureObject_t */,
1662 #endif
1663  stream);
1664  }
1665 
1666  //gather_force(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1667  // k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1668  // true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1669  // d_grid, order, d_force, gridTexObj /* cudaTextureObject_t */,
1670  // stream);
1671  //cudaCheck(cudaStreamSynchronize(stream));
1672 
1673  // Copy energy and virial to host if needed
1674  if (doEnergyVirial) {
1675  //fprintf(stderr, "Calling copy_DtoH on d_energyVirial\n");
1676  copy_DtoH<EnergyVirial>(d_energyVirials, h_energyVirials,
1678  //cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
1679  //cudaCheck(cudaStreamSynchronize(stream));
1680  }
1681 
1682  // XXX debugging, quick test for borked forces
1683  //clear_device_array<float3>(d_force, natoms, stream);
1684  if (sim_params.alchOn) {
1685  scaleAndMergeForce(m_step);
1686  }
1687 }

◆ computeSharedMemoryPatchLevelGatherForce()

int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce ( const int  numThreads,
const int3  patchGridDim,
const int  order 
)

Definition at line 2024 of file CudaPmeSolverUtil.C.

References PatchLevelPmeData::kThetaPad, and order.

Referenced by checkPatchLevelDeviceCompatibility().

2025  {
2026 
2027  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2028  const int thetaBytes = (numThreads + PatchLevelPmeData::kThetaPad) * order *
2029  2 /* theta and dtheta */ * sizeof(float);
2030 
2031  return gridBytes + thetaBytes;
2032 }

◆ computeSharedMemoryPatchLevelSpreadCharge()

int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge ( const int  numThreads,
const int3  patchGridDim,
const int  order 
)

Definition at line 2013 of file CudaPmeSolverUtil.C.

References PatchLevelPmeData::kDim, PatchLevelPmeData::kThetaPad, and order.

Referenced by checkPatchLevelDeviceCompatibility().

2014  {
2015 
2016  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2017  const int thetaBytes = PatchLevelPmeData::kDim * (numThreads + PatchLevelPmeData::kThetaPad) *
2018  order * sizeof(float);
2019  const int indexBytes = numThreads * sizeof(char4);
2020 
2021  return gridBytes + thetaBytes + indexBytes;
2022 }
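Both shared-memory estimates above combine the patch-local grid footprint with per-thread B-spline (theta) storage. For reference, the same arithmetic written as standalone functions; kDim and kThetaPad are taken as parameters here because their constexpr values live in PatchLevelPmeData and are not shown in these listings:

 // Sketch of the shared-memory sizing used by the two member functions above.
 int spreadChargeSharedBytes(int numThreads, int3 patchGridDim, int order,
                             int kDim, int kThetaPad) {
   const int gridBytes  = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
   const int thetaBytes = kDim * (numThreads + kThetaPad) * order * sizeof(float);
   const int indexBytes = numThreads * sizeof(char4);
   return gridBytes + thetaBytes + indexBytes;
 }

 int gatherForceSharedBytes(int numThreads, int3 patchGridDim, int order,
                            int kThetaPad) {
   const int gridBytes  = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
   const int thetaBytes = (numThreads + kThetaPad) * order *
                          2 /* theta and dtheta */ * sizeof(float);
   return gridBytes + thetaBytes;
 }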

◆ finishReduction()

void CudaPmeOneDevice::finishReduction ( bool  doEnergyVirial)

Definition at line 1691 of file CudaPmeSolverUtil.C.

References SimParameters::alchFepOn, SimParameters::alchOn, SimParameters::alchThermIntOn, cudaCheck, deviceID, CudaPmeOneDevice::EnergyVirial::energy, h_energyVirials, SubmitReduction::item(), m_step, Node::Object(), REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_ELECT_ENERGY_SLOW_TI_1, REDUCTION_ELECT_ENERGY_SLOW_TI_2, selfEnergy, selfEnergy_FEP, selfEnergy_TI_1, selfEnergy_TI_2, Node::simParameters, stream, SubmitReduction::submit(), and CudaPmeOneDevice::EnergyVirial::virial.

1693  {
1694  cudaCheck(cudaStreamSynchronize(stream));
1695  SubmitReduction *reduction = getCurrentReduction();
1696  if(doEnergyVirial){
1697  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1698  PatchData* patchData = cpdata.ckLocalBranch();
1699  cudaCheck(cudaSetDevice(deviceID));
1700  double virial[9];
1701  double energy, energy_F, energy_TI_1, energy_TI_2;
1702  const SimParameters& sim_params = *(Node::Object()->simParameters);
1703  if (sim_params.alchOn) {
1704  if (sim_params.alchFepOn) {
1705  scaleAndComputeFEPEnergyVirials(h_energyVirials, m_step, energy, energy_F, virial);
1706  energy += selfEnergy;
1707  energy_F += selfEnergy_FEP;
1708  }
1709  if (sim_params.alchThermIntOn) {
1710  scaleAndComputeTIEnergyVirials(h_energyVirials, m_step, energy, energy_TI_1, energy_TI_2, virial);
1711  energy += selfEnergy;
1712  energy_TI_1 += selfEnergy_TI_1;
1713  energy_TI_2 += selfEnergy_TI_2;
1714  }
1715  } else {
1716  virial[0] = h_energyVirials[0].virial[0];
1717  virial[1] = h_energyVirials[0].virial[1];
1718  virial[2] = h_energyVirials[0].virial[2];
1719  virial[3] = h_energyVirials[0].virial[1];
1720  virial[4] = h_energyVirials[0].virial[3];
1721  virial[5] = h_energyVirials[0].virial[4];
1722  virial[6] = h_energyVirials[0].virial[2];
1723  virial[7] = h_energyVirials[0].virial[4];
1724  virial[8] = h_energyVirials[0].virial[5];
1725  energy = h_energyVirials[0].energy + selfEnergy;
1726  }
1727  #if 0
1728  fprintf(stderr, "PME ENERGY = %g %g\n", h_energyVirials[0].energy, selfEnergy );
1729  fprintf(stderr, "PME VIRIAL =\n"
1730  " %g %g %g\n %g %g %g\n %g %g %g\n",
1731  virial[0], virial[1], virial[2], virial[3], virial[4],
1732  virial[5], virial[6], virial[7], virial[8]);
1733  #endif
1734  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial[0];
1735  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += virial[1];
1736  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += virial[2];
1737  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += virial[3];
1738  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial[4];
1739  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += virial[5];
1740  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += virial[6];
1741  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += virial[7];
1742  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial[8];
1743  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energy;
1744  if (sim_params.alchFepOn) {
1745  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += energy_F;
1746  }
1747  if (sim_params.alchThermIntOn) {
1748  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += energy_TI_1;
1749  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += energy_TI_2;
1750  }
1751  }
1752  reduction->submit();
1753 }
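In the non-alchemical branch above, the six packed components of EnergyVirial::virial are expanded into the full symmetric 3x3 tensor submitted to the reduction; the indexing implies the packed order [xx, xy, xz, yy, yz, zz]. A small standalone sketch of that expansion:

 // Expand a packed upper-triangular virial [xx, xy, xz, yy, yz, zz]
 // into a full symmetric 3x3 tensor stored row-major in virial[9].
 void expandVirial(const double packed[6], double virial[9]) {
   virial[0] = packed[0];  virial[1] = packed[1];  virial[2] = packed[2];
   virial[3] = packed[1];  virial[4] = packed[3];  virial[5] = packed[4];
   virial[6] = packed[2];  virial[7] = packed[4];  virial[8] = packed[5];
 }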

◆ getShiftedGrid()

int CudaPmeOneDevice::getShiftedGrid ( const double  x,
const int  grid 
)

Definition at line 2007 of file CudaPmeSolverUtil.C.

Referenced by checkPatchLevelLatticeCompatibilityAndComputeOffsets().

2007  {
2008  double w = x + 0.5;
2009  double gw = w * grid;
2010  return floor(gw);
2011 }
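
The +0.5 shift moves a scaled fractional coordinate from roughly [-0.5, 0.5) into [0, 1) before scaling by the grid dimension. For example, x = 0.25 on a grid of 64 points maps to floor((0.25 + 0.5) * 64) = 48, while x = -0.5 maps to 0.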

Member Data Documentation

◆ backwardPlans

cufftHandle* CudaPmeOneDevice::backwardPlans

Definition at line 238 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ currentLattice

Lattice CudaPmeOneDevice::currentLattice

◆ d_atoms

float4* CudaPmeOneDevice::d_atoms

Definition at line 219 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_bm1

float* CudaPmeOneDevice::d_bm1

Definition at line 241 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_bm2

float* CudaPmeOneDevice::d_bm2

Definition at line 242 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_bm3

float* CudaPmeOneDevice::d_bm3

Definition at line 243 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_energyVirials

EnergyVirial* CudaPmeOneDevice::d_energyVirials

Definition at line 251 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_forces

float3* CudaPmeOneDevice::d_forces

Definition at line 221 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_grids

float* CudaPmeOneDevice::d_grids

on device grid of charge before forward FFT R->C, then grid of potential after backward FFT C->R

Definition at line 227 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_partition

int* CudaPmeOneDevice::d_partition

Definition at line 220 of file CudaPmeSolverUtil.h.

Referenced by CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_scaling_factors

float* CudaPmeOneDevice::d_scaling_factors

Definition at line 222 of file CudaPmeSolverUtil.h.

Referenced by CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_selfEnergy

double* CudaPmeOneDevice::d_selfEnergy

Definition at line 256 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_selfEnergy_FEP

double* CudaPmeOneDevice::d_selfEnergy_FEP

Definition at line 257 of file CudaPmeSolverUtil.h.

Referenced by CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_selfEnergy_TI_1

double* CudaPmeOneDevice::d_selfEnergy_TI_1

Definition at line 258 of file CudaPmeSolverUtil.h.

Referenced by CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_selfEnergy_TI_2

double* CudaPmeOneDevice::d_selfEnergy_TI_2

Definition at line 259 of file CudaPmeSolverUtil.h.

Referenced by CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ d_trans

float2* CudaPmeOneDevice::d_trans

on device FFT transformation to complex

Definition at line 231 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ deviceID

int CudaPmeOneDevice::deviceID

◆ deviceIndex

int CudaPmeOneDevice::deviceIndex

Definition at line 213 of file CudaPmeSolverUtil.h.

Referenced by CudaPmeOneDevice().

◆ force_scaling_alch_first_time

bool CudaPmeOneDevice::force_scaling_alch_first_time

Definition at line 255 of file CudaPmeSolverUtil.h.

◆ forwardPlans

cufftHandle* CudaPmeOneDevice::forwardPlans

Definition at line 237 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ gridsize

size_t CudaPmeOneDevice::gridsize

Definition at line 233 of file CudaPmeSolverUtil.h.

Referenced by compute(), and CudaPmeOneDevice().

◆ gridTexObjArrays

cudaTextureObject_t* CudaPmeOneDevice::gridTexObjArrays

Definition at line 224 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ h_energyVirials

EnergyVirial* CudaPmeOneDevice::h_energyVirials

Definition at line 252 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), finishReduction(), and ~CudaPmeOneDevice().

◆ kappa

double CudaPmeOneDevice::kappa

Definition at line 245 of file CudaPmeSolverUtil.h.

Referenced by compute().

◆ m_step

int CudaPmeOneDevice::m_step

Definition at line 264 of file CudaPmeSolverUtil.h.

Referenced by compute(), and finishReduction().

◆ natoms

int CudaPmeOneDevice::natoms

Definition at line 216 of file CudaPmeSolverUtil.h.

Referenced by compute(), and CudaPmeOneDevice().

◆ num_used_grids

size_t CudaPmeOneDevice::num_used_grids

Definition at line 217 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), and ~CudaPmeOneDevice().

◆ patchLevelPmeData

PatchLevelPmeData CudaPmeOneDevice::patchLevelPmeData

◆ pmeGrid

PmeGrid CudaPmeOneDevice::pmeGrid

◆ self_energy_alch_first_time

bool CudaPmeOneDevice::self_energy_alch_first_time

Definition at line 254 of file CudaPmeSolverUtil.h.

◆ selfEnergy

double CudaPmeOneDevice::selfEnergy

Definition at line 260 of file CudaPmeSolverUtil.h.

Referenced by compute(), and finishReduction().

◆ selfEnergy_FEP

double CudaPmeOneDevice::selfEnergy_FEP

Definition at line 261 of file CudaPmeSolverUtil.h.

Referenced by finishReduction().

◆ selfEnergy_TI_1

double CudaPmeOneDevice::selfEnergy_TI_1

Definition at line 262 of file CudaPmeSolverUtil.h.

Referenced by finishReduction().

◆ selfEnergy_TI_2

double CudaPmeOneDevice::selfEnergy_TI_2

Definition at line 263 of file CudaPmeSolverUtil.h.

Referenced by finishReduction().

◆ stream

cudaStream_t CudaPmeOneDevice::stream

Definition at line 214 of file CudaPmeSolverUtil.h.

Referenced by compute(), CudaPmeOneDevice(), finishReduction(), and ~CudaPmeOneDevice().

◆ transize

size_t CudaPmeOneDevice::transize

Definition at line 234 of file CudaPmeSolverUtil.h.

Referenced by compute(), and CudaPmeOneDevice().


The documentation for this class was generated from the following files:

CudaPmeSolverUtil.h
CudaPmeSolverUtil.C