2 #define _USE_MATH_DEFINES
     9 #if __CUDACC_VER_MAJOR__ >= 11
    10 #include <cub/cub.cuh>
    12 #include <namd_cub/cub.cuh>
    17 #include <hip/hip_runtime.h>
    18 #include <hip/hip_runtime_api.h>
    19 #include <hipcub/hipcub.hpp>
    23 #include "HipDefines.h"
    24 #include "CudaUtils.h"
    25 #include "CudaRecord.h"
    26 #include "CudaPmeSolverUtilKernel.h"
    28 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
    29 // CCELEC is 1/(4 pi eps) in AKMA units; conversion from SI
    30 // units: CCELEC = e*e*Na / (4*pi*eps*1kcal*1A)
    32 //      parameter :: CCELEC=332.0636D0 ! old value of dubious origin
    33 //      parameter :: CCELEC=331.843D0  ! value from 1986-1987 CRC Handbook
    34 //                                   ! of Chemistry and Physics
    35 //  real(chm_real), parameter ::  &
    36 //       CCELEC_amber    = 332.0522173D0, &
    37 //       CCELEC_charmm   = 332.0716D0   , &
    38 //       CCELEC_discover = 332.054D0    , &
    39 //       CCELEC_namd     = 332.0636D0   
    40 //const double ccelec = 332.0636;
    41 //const double half_ccelec = 0.5*ccelec;
    42 //const float ccelec_float = 332.0636f;
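        // Sanity check of the constant from SI values (a hedged sketch; the helper below
        // is illustrative only, is not used by any kernel in this file, and is kept
        // commented out like the constants above):
        //
        //   constexpr double ccelec_from_si() {
        //     constexpr double e    = 1.602176634e-19;   // elementary charge [C]
        //     constexpr double NA   = 6.02214076e23;     // Avogadro constant [1/mol]
        //     constexpr double eps0 = 8.8541878128e-12;  // vacuum permittivity [F/m]
        //     constexpr double kcal = 4184.0;            // thermochemical kcal [J]
        //     constexpr double ang  = 1.0e-10;           // Angstrom [m]
        //     return e*e*NA / (4.0*M_PI*eps0*kcal*ang);  // ~332.064 kcal*A/(mol*e^2)
        //   }
        //
        // which reproduces CCELEC_namd above to within ~1e-4 (the small difference comes
        // from using current CODATA constants).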
    45 // Structure into which virials are stored
    46 // NOTE: This structure is only used for computing addresses
    48   double sforce_dp[27][3];
    49   long long int sforce_fp[27][3];
    51   // Energies start here ...
    55 // Local structure for the scalar_sum function, used for energy and virial reductions
    56 struct RecipVirial_t {
    62 __forceinline__ __device__ double expfunc(const double x) {
    66 __forceinline__ __device__ float expfunc(const float x) {
    71 // Performs scalar sum on data(nfft1, nfft2, nfft3)
    72 // T = float or double
    73 // T2 = float2 or double2
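        // A sketch of the math being accumulated (hedged; it assumes pi_ewald carries the
        // pi^2/beta^2 factor and piv_inv the 1/(2*pi*V) volume factor, which is how the
        // arguments appear to be used below): for each reciprocal vector m != 0,
        //
        //   E(m) = exp(-pi^2 |m|^2 / beta^2) / (2 pi V |m|^2) * B(m) * |S(m)|^2
        //
        // where B(m) = prefac1(k1)*prefac2(k2)*prefac3(k3) is the B-spline correction and
        // S(m) is the FFT of the spread charge grid (q.x, q.y are its real and imaginary
        // parts). The factor 2 in q2, halved again at k1 == 0 and k1 == nfft1_half,
        // accounts for the half-spectrum storage of the real-to-complex FFT; the elided
        // body presumably also rescales data(m) in place for the back transform.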
    75 template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
    76 __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
    77           const int size1, const int size2, const int size3,
    78           const T recip1x, const T recip1y, const T recip1z,
    79           const T recip2x, const T recip2y, const T recip2z,
    80           const T recip3x, const T recip3y, const T recip3z,
    81           const T* prefac1, const T* prefac2, const T* prefac3,
    82           const T pi_ewald, const T piv_inv,
    83           const int k2_00, const int k3_00,
    85           double* __restrict__ energy_recip,
    86           double* __restrict__ virial) {
    87   // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
    89   extern __shared__ T sh_prefac[];
    91   HIP_DYNAMIC_SHARED( T, sh_prefac)
    94   // Create pointers to shared memory
    95   T* sh_prefac1 = (T *)&sh_prefac[0];
    96   T* sh_prefac2 = (T *)&sh_prefac[nfft1];
    97   T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
    99   // Calculate start position (k1, k2, k3) for each thread
   100   unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
   101   int k3 = tid/(size1*size2);
   102   tid -= k3*size1*size2;
   104   int k1 = tid - k2*size1;
   106   // Starting position in data
   107   int pos = k1 + (k2 + k3*size2)*size1;
   109   // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
   113   // Calculate limits w.r.t. global coordinates
   114   const int lim2 = size2 + k2_00;
   115   const int lim3 = size3 + k3_00;
   117   // Calculate increments (k1_inc, k2_inc, k3_inc)
   118   int tot_inc = blockDim.x*gridDim.x;
   119   const int k3_inc = tot_inc/(size1*size2);
   120   tot_inc -= k3_inc*size1*size2;
   121   const int k2_inc = tot_inc/size1;
   122   const int k1_inc = tot_inc - k2_inc*size1;
   124   // Set data[0] = 0 for the global (0,0,0)
   125   if (k1 == 0 && k2 == 0 && k3 == 0) {
   130     // Increment position
   144     pos += k3_inc*size1*size2;
   147   // Load prefac data into shared memory
   151       sh_prefac1[t] = prefac1[t];
   156       sh_prefac2[t] = prefac2[t];
   161       sh_prefac3[t] = prefac3[t];
   168   double virial0 = 0.0;
   169   double virial1 = 0.0;
   170   double virial2 = 0.0;
   171   double virial3 = 0.0;
   172   double virial4 = 0.0;
   173   double virial5 = 0.0;
    175   // If nfft1 is odd, set nfft1_half to an impossible value so that
    176   // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
    177   // is never satisfied
   178   const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
   179   const int nfft2_half = nfft2/2;
   180   const int nfft3_half = nfft3/2;
   186     int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
   187     int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
   195       m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
   196       m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
   197       m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
   200     T msq = m1*m1 + m2*m2 + m3*m3;
   201     T msq_inv = ((T)1)/msq;
   203     T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
   204     T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
   205     if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
   206     T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
   210     if (calc_energy_virial) {
   212       T vir = ((T)2)*(pi_ewald + msq_inv);
   214       virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
   215       virial1 += (double)(fac*vir*m1*m2);
   216       virial2 += (double)(fac*vir*m1*m3);
   217       virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
   218       virial4 += (double)(fac*vir*m2*m3);
   219       virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
   226     // Increment position
   240     pos += k3_inc*size2*size1;
   243   // Reduce energy and virial
   244   if (calc_energy_virial) {
   245     const int tid = threadIdx.x & (warpSize-1);
   246     const int base = (threadIdx.x/warpSize);
   247     volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
   248     // Reduce within warps
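            // (each double is reduced by shuffling its two 32-bit halves via
            //  __double2hiint/__double2loint, presumably because the WARP_SHUFFLE
            //  portability macro is defined for 32-bit values)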
   249     for (int d=warpSize/2;d >= 1;d /= 2) {
   250       energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
   251          WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
   252       virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
   253           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
   254       virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
   255           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
   256       virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
   257           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
   258       virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
   259           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
   260       virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
   261           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
   262       virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
   263           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
   265     // Reduce between warps
   266     // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
   269       sh_ev[base].energy = energy;
   270       sh_ev[base].virial[0] = virial0;
   271       sh_ev[base].virial[1] = virial1;
   272       sh_ev[base].virial[2] = virial2;
   273       sh_ev[base].virial[3] = virial3;
   274       sh_ev[base].virial[4] = virial4;
   275       sh_ev[base].virial[5] = virial5;
   279       energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
   280       virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
   281       virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
   282       virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
   283       virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
   284       virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
   285       virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
   286       for (int d=warpSize/2;d >= 1;d /= 2) {
   287   energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
   288            WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
   289   virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
   290             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
   291   virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
   292             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
   293   virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
   294             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
   295   virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
   296             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
   297   virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
   298             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
   299   virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
   300             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
   304     if (threadIdx.x == 0) {
   305       atomicAdd(energy_recip, energy*0.5);
   312       atomicAdd(&virial[0], virial0);
   313       atomicAdd(&virial[1], virial1);
   314       atomicAdd(&virial[2], virial2);
   315       atomicAdd(&virial[3], virial3);
   316       atomicAdd(&virial[4], virial4);
   317       atomicAdd(&virial[5], virial5);
   324 template <typename T>
   325 __forceinline__ __device__ void write_grid(const float val, const int ind,
   327   atomicAdd(&data[ind], (T)val);
   331 // General version for any order
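        // The recursion below is the standard construction of the order-point cardinal
        // B-spline weights for a fractional offset w:
        //
        //   M_k(x) = ( x*M_{k-1}(x) + (k - x)*M_{k-1}(x-1) ) / (k - 1)
        //
        // applied in place once per order, starting from the linear hat function, so that
        // on exit theta[0..order-1] hold the order interpolation weights (they sum to 1).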
   333 template <typename T, int order>
   334 __forceinline__ __device__ void calc_one_theta(const T w, T *theta) {
   336   theta[order-1] = ((T)0);
   338   theta[0] = ((T)1) - w;
   341   for (int k=3;k <= order-1;k++) {
   342     T div = ((T)1) / (T)(k-1);
   343     theta[k-1] = div*w*theta[k-2];
   345     for (int j=1;j <= k-2;j++) {
   346       theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
   348     theta[0] = div*(((T)1) - w)*theta[0];
   351   //--- one more recursion
   352   T div = ((T)1) / (T)(order-1);
   353   theta[order-1] = div*w*theta[order-2];
   355   for (int j=1;j <= order-2;j++) {
   356     theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
   359   theta[0] = div*(((T)1) - w)*theta[0];
   363 // Calculate theta and dtheta for general order bspline
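        // Same recursion as calc_one_theta, carried out for x, y and z at once. The dtheta
        // values come from the B-spline derivative identity
        //
        //   d/dx M_n(x) = M_{n-1}(x) - M_{n-1}(x-1),
        //
        // which is why the differences theta[j-2] - theta[j-1] are taken while theta still
        // holds the (order-1)-splines, before the final recursion step promotes theta to
        // full order.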
   365 template <typename T, typename T3, int order>
   366 __forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {
   368   theta[order-1].x = ((T)0);
   369   theta[order-1].y = ((T)0);
   370   theta[order-1].z = ((T)0);
   374   theta[0].x = ((T)1) - wx;
   375   theta[0].y = ((T)1) - wy;
   376   theta[0].z = ((T)1) - wz;
   379   for (int k=3;k <= order-1;k++) {
   380     T div = ((T)1) / (T)(k-1);
   381     theta[k-1].x = div*wx*theta[k-2].x;
   382     theta[k-1].y = div*wy*theta[k-2].y;
   383     theta[k-1].z = div*wz*theta[k-2].z;
   385     for (int j=1;j <= k-2;j++) {
   386       theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
   387       theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
   388       theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
   390     theta[0].x = div*(((T)1) - wx)*theta[0].x;
   391     theta[0].y = div*(((T)1) - wy)*theta[0].y;
   392     theta[0].z = div*(((T)1) - wz)*theta[0].z;
   395   //--- perform standard b-spline differentiation
   396   dtheta[0].x = -theta[0].x;
   397   dtheta[0].y = -theta[0].y;
   398   dtheta[0].z = -theta[0].z;
   400   for (int j=2;j <= order;j++) {
   401     dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
   402     dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
   403     dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
   406   //--- one more recursion
   407   T div = ((T)1) / (T)(order-1);
   408   theta[order-1].x = div*wx*theta[order-2].x;
   409   theta[order-1].y = div*wy*theta[order-2].y;
   410   theta[order-1].z = div*wz*theta[order-2].z;
   412   for (int j=1;j <= order-2;j++) {
   413     theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
   414     theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
   415     theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
   418   theta[0].x = div*(((T)1) - wx)*theta[0].x;
   419   theta[0].y = div*(((T)1) - wy)*theta[0].y;
   420   theta[0].z = div*(((T)1) - wz)*theta[0].z;
   424 // Spreads the charge on the grid. Calculates theta and dtheta on the fly
   425 // blockDim.x                   = Number of atoms each block loads
   426 // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once
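        // Worked example (a sketch; the actual launch dimensions are chosen by the host
        // dispatcher at the bottom of this file): for order = 4, order^3 = 64, so order3
        // below rounds to 64 threads per atom; a block of blockDim = (32, 4) = 128 threads
        // would then load 32 atoms at a time and spread 128/64 = 2 of them concurrently.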
   428 template <typename AT, int order, bool periodicY, bool periodicZ>
   430 spread_charge_kernel(const float4 *xyzq, const int ncoord,
   431           const int nfftx, const int nffty, const int nfftz,
   432           const int xsize, const int ysize, const int zsize,
   433           const int xdim, const int y00, const int z00,
   436   // Shared memory use:
   437   // order = 4: 1920 bytes
   438   // order = 6: 2688 bytes
   439   // order = 8: 3456 bytes
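          // (= 3*32*sizeof(int) for the index arrays plus 3*order*32*sizeof(float)
          //  for the theta arrays)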
   440   __shared__ int sh_ix[32];
   441   __shared__ int sh_iy[32];
   442   __shared__ int sh_iz[32];
   443   __shared__ float sh_thetax[order*32];
   444   __shared__ float sh_thetay[order*32];
   445   __shared__ float sh_thetaz[order*32];
   447   // Process atoms pos to pos_end-1 (blockDim.x)
   448   const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
   449   const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
   451   if (pos < pos_end && threadIdx.y == 0) {
   453     float4 xyzqi = xyzq[pos];
   459     float frx = ((float)nfftx)*x;
   460     float fry = ((float)nffty)*y;
   461     float frz = ((float)nfftz)*z;
   467     float wx = frx - (float)frxi;
   468     float wy = fry - (float)fryi;
   469     float wz = frz - (float)frzi;
   471     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
   472     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
   474     sh_ix[threadIdx.x] = frxi;
   475     sh_iy[threadIdx.x] = fryi - y00;
   476     sh_iz[threadIdx.x] = frzi - z00;
   480     calc_one_theta<float, order>(wx, theta);
   482     for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
   484     calc_one_theta<float, order>(wy, theta);
   486     for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
   488     calc_one_theta<float, order>(wz, theta);
   490     for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
    496   // Grid point location: (x0, y0, z0) values are in the range 0..order-1
    497   // NOTE: Only threads with tid = 0...order*order*order-1 do any computation
   498   const int order3 = ((order*order*order-1)/32 + 1)*32;
   499   const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3;   // 0...order3-1
   500   const int x0 = tid % order;
   501   const int y0 = (tid / order) % order;
   502   const int z0 = tid / (order*order);
   504   // Loop over atoms pos..pos_end-1
   505   int iadd = blockDim.x*blockDim.y/order3;
   506   int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
   507   int iend = pos_end - blockIdx.x*blockDim.x;
   508   for (;i < iend;i += iadd) {
   509     int x = sh_ix[i] + x0;
   510     int y = sh_iy[i] + y0;
   511     int z = sh_iz[i] + z0;
   513     if (x >= nfftx) x -= nfftx;
   515     if (periodicY  && (y >= nffty)) y -= nffty;
   516     if (!periodicY && (y < 0 || y >= ysize)) continue;
   518     if (periodicZ  && (z >= nfftz)) z -= nfftz;
   519     if (!periodicZ && (z < 0 || z >= zsize)) continue;
   521     // Get position on the grid
   522     int ind = x + xdim*(y + ysize*(z));
    524     // For order = 6 this unrolls the 6x6x6 loop with 216 threads.
    525     // NOTE: order3 rounds this up to 7*32 = 224 threads
   526     // Calculate interpolated charge value and store it to global memory
   528     if (tid < order*order*order)
   529       write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
   535 template <typename AT, int order, bool periodicY, bool periodicZ>
   537 spread_charge_kernel_v2(const float4 *xyzq, const int ncoord,
   538           const int nfftx, const int nffty, const int nfftz,
   539           const float nfftx_f, const float nffty_f, 
   541           const int xsize, const int ysize, const int zsize,
   542           const int xdim, const int y00, const int z00,
   545   // Shared memory use:
   546   // order = 4: 1920 bytes
   547   // order = 6: 2688 bytes
   548   // order = 8: 3456 bytes
   549   __shared__ int sh_ix[WARPSIZE];
   550   __shared__ int sh_iy[WARPSIZE];
   551   __shared__ int sh_iz[WARPSIZE];
   552   __shared__ float sh_thetax[order*WARPSIZE];
   553   __shared__ float sh_thetay[order*WARPSIZE];
   554   __shared__ float sh_thetaz[order*WARPSIZE];
   556   // Process atoms pos to pos_end-1 (blockDim.x)
   557   const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
   558   const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
   559   // const float invorder = 1.f/order;
   561   if (pos < pos_end && threadIdx.y == 0) {
   563     float4 xyzqi = xyzq[pos];
   569     float frx = (nfftx_f)*x;
   570     float fry = (nffty_f)*y;
   571     float frz = (nfftz_f)*z;
   577     float wx = frx - (float)frxi;
   578     float wy = fry - (float)fryi;
   579     float wz = frz - (float)frzi;
   581     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
   582     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
   584     sh_ix[threadIdx.x] = frxi;
   585     sh_iy[threadIdx.x] = fryi - y00;
   586     sh_iz[threadIdx.x] = frzi - z00;
   590     calc_one_theta<float, order>(wx, theta);
   592     for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
   594     calc_one_theta<float, order>(wy, theta);
   596     for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
   598     calc_one_theta<float, order>(wz, theta);
   600     for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
    606   // Grid point location: (x0, y0, z0) values are in the range 0..order-1
    607   // NOTE: Only threads with tid = 0...order*order*order-1 do any computation
   608   const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
   609   const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3;   // 0...order3-1
   610   const int x0 = tid % order;
   611   const int y0 = (tid / order) % order;
   612   //const int y0 = (int)(tid * invorder) % order;
   613   const int z0 = tid / (order*order);
   614   //const int z0 = tid * invorder * invorder;
   616   // Loop over atoms pos..pos_end-1
   617   int iadd = blockDim.x*blockDim.y/order3;
   618   int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
   619   int iend = pos_end - blockIdx.x*blockDim.x;
   620   for (;i < iend;i += iadd) {
   621     int x = sh_ix[i] + x0;
   622     int y = sh_iy[i] + y0;
   623     int z = sh_iz[i] + z0;
   625     if (x >= nfftx) x -= nfftx;
   627     if (periodicY  && (y >= nffty)) y -= nffty;
   628     if (!periodicY && (y < 0 || y >= ysize)) continue;
   630     if (periodicZ  && (z >= nfftz)) z -= nfftz;
   631     if (!periodicZ && (z < 0 || z >= zsize)) continue;
   633     // Get position on the grid
   634     int ind = x + xdim*(y + ysize*(z));
    636     // For order = 6 this unrolls the 6x6x6 loop with 216 threads.
    637     // NOTE: order3 rounds this up to 7*32 = 224 threads
   638     // Calculate interpolated charge value and store it to global memory
   640     if (tid < order*order*order)
   641       write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
   647 template<typename AT, int kPatchGridDim>
   649 __launch_bounds__(PatchLevelPmeData::kNumThreads, 4)
   650 spread_charge_patch_8th(
   651   const int numPatches,
   652   const int3 patchGridDim,
   653   const CudaLocalRecord*        localRecords,
   654   const int3* patchGridOffsets,
   656   const int nfftx, const int nffty, const int nfftz,
   657   const float nfftx_f, const float nffty_f, 
   659   const int xsize, const int ysize, const int zsize,
   660   const int xdim, const int y00, const int z00,
   663   // Define local copies of constexpr variables
   664   constexpr int kNumThreads = PatchLevelPmeData::kNumThreads;
   665   constexpr int kThetaStride = kNumThreads + PatchLevelPmeData::kThetaPad;
   666   constexpr int kDims = PatchLevelPmeData::kDim;
   667   constexpr int kPatchGridDimPad = PatchLevelPmeData::kPatchGridDimPad;
   668   constexpr int kOrder = 8;
   670   // Setup shared memory buffers
   671   extern __shared__ float sharedBuffer[];
   672   char4* s_indices = (char4*) sharedBuffer; 
   673   float* s_theta = (float*) &sharedBuffer[kNumThreads]; 
   674   float* s_grid = (float*) &sharedBuffer[kNumThreads + kOrder * kDims * kThetaStride];
   676   // Zero out grid shared memory buffer since we will be accumulating into it
   677   const int numGridPoints = kPatchGridDimPad * kPatchGridDim * kPatchGridDim;
   678   for (int i = threadIdx.x; i < numGridPoints; i += blockDim.x) {
   683   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
   684     const int numAtoms = localRecords[patchIndex].numAtoms;
   685     const int offset = localRecords[patchIndex].bufferOffset;
   686     const int3 patchGridOffset = patchGridOffsets[patchIndex];
   688     const int numIters = (numAtoms - 1 + kNumThreads) / kNumThreads;
    689     // Iterate over all the atoms in the patch in chunks, since we need to stash theta values in shared memory
   691     for (int iter = blockIdx.y; iter < numIters; iter += gridDim.y) {
   693       // Compute thetas with each thread processing a different atom
   695       NAMD_WARP_SYNC(WARP_FULL_MASK);
   696       const int atomIndex = iter * blockDim.x + threadIdx.x;
   697       if (atomIndex < numAtoms) {
   698         float4 xyzqi = xyzq[atomIndex + offset];
   700         // This will just shift the unscaled positions to be between 0 and 1 instead of 
   701         // -0.5 and 0.5 without handling the boundary conditions. The boundary conditions
   702         // will be handled when the grid is written from shared to global memory
   703         float x = xyzqi.x + 0.5f; 
   704         float y = xyzqi.y + 0.5f;
   705         float z = xyzqi.z + 0.5f;
   708         float frx = (nfftx_f)*x;
   709         float fry = (nffty_f)*y;
   710         float frz = (nfftz_f)*z;
   712         int frxi = (int)floorf(frx);
   713         int fryi = (int)floorf(fry);
   714         int frzi = (int)floorf(frz);
   716         float wx = frx - (float)frxi;
   717         float wy = fry - (float)fryi;
   718         float wz = frz - (float)frzi;
    720         // Because these are indexes into the shared memory buffer, it should be safe to store them as chars
   722         s_indices[threadIdx.x].x = (char) (frxi - patchGridOffset.x);
   723         s_indices[threadIdx.x].y = (char) (fryi - patchGridOffset.y);
   724         s_indices[threadIdx.x].z = (char) (frzi - patchGridOffset.z);
   728         calc_one_theta<float, kOrder>(wx, theta);
   730         for (int i=0;i < kOrder;i++) {
   731           // The x theta matrix has the atom as the fastest moving index and the point as the slowest moving index:
   733           // atom 0 point 0, atom 1 point 0, atom 2 point 0, ...
   734           // atom 0 point 1, atom 1 point 1, atom 2 point 1, ...
   737           const int idx = 0 * (kOrder * kThetaStride) + i * kThetaStride + threadIdx.x;
   738           s_theta[idx] = q*theta[i];
   741         calc_one_theta<float, kOrder>(wy, theta);
   743         for (int i=0;i < kOrder;i++) {
   744           // Each thread will process 2 grid points in the Y dimension, and we want that data to be consecutive in
   745           // shared memory to allow for vector loads:
    747           // atom 0 point 0, atom 0 point 4, atom 1 point 0, atom 1 point 4, ...
   748           // atom 0 point 1, atom 0 point 5, atom 1 point 1, atom 1 point 5, ...
   751           const int indexInFloat2 = i / 4;
   752           const int float2Index = i % 4;
   753           const int idx = 1 * (kOrder * kThetaStride) + 2 * float2Index * kThetaStride + 2 * threadIdx.x + indexInFloat2;
   754           s_theta[idx] = theta[i];
   757         calc_one_theta<float, kOrder>(wz, theta);
   759         for (int i=0;i < kOrder;i++) {
   760           // The z theta values use the same layout as the y theta values since each thread will 
   761           // process 2 grid points in the Z dimension
   762           const int indexInFloat2 = i / 4;
   763           const int float2Index = i % 4;
   764           const int idx = 2 * (kOrder * kThetaStride) + 2 * float2Index * kThetaStride + 2 * threadIdx.x + indexInFloat2;
   765           s_theta[idx] = theta[i];
   771       // Compute the grid contributions and store in shared memory with all threads processing a single atom concurrently
    774       // Each thread block will compute an 8x8x8 grid of charges with 128 threads. Each thread will
   775       // compute a 1x2x2 non-contiguous sub-grid based on the layout of the theta described above.
   776       // For example, thread 0 will compute (0,0,0), (0,4,0), (0,0,4), (0,4,4)
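              // (similarly, thread 9 has xOffset = 1, yPartialOffset = 1, zPartialOffset = 0
              //  and computes (1,1,0), (1,5,0), (1,1,4), (1,5,4))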
   777       const int xOffset = threadIdx.x % kOrder;
   778       const int yPartialOffset = (threadIdx.x / kOrder) % (kOrder / 2);
   779       const int zPartialOffset = threadIdx.x / (kOrder * (kOrder / 2));
   781       constexpr int kYItersPerThread = 2;
   782       constexpr int kZItersPerThread = 2;
   783       constexpr int kYValPerIter = 4;
   784       constexpr int kZValPerIter = 4;
   786       const int atomsToProcess = min(kNumThreads, numAtoms - iter * kNumThreads);
   787       const int outerIters = ((atomsToProcess) / 4) * 4;
   788       int thetaIndexOuter = 0; 
    790       // The loop over atoms is broken into chunks of 4 to enable loading in multiple atoms' x theta data.
   791       // Each warp will only need to load in 8 distinct x theta values, so some of the bytes are wasted. By
   792       // loading in the x theta data for 4 atoms at once, we can avoid this.
   793       for ( ; thetaIndexOuter < outerIters; thetaIndexOuter += 4) {
   794         char4 r_global_indices[4];
   798         for (int thetaIndexInner = 0; thetaIndexInner < 4; thetaIndexInner++) {
   799           const int thetaIndex = thetaIndexOuter + thetaIndexInner;
   800           // The compiler should auto vectorize these loads
   801           r_global_indices[thetaIndexInner] = s_indices[thetaIndex];
   802           r_x_val[thetaIndexInner] = s_theta[0 * (kOrder * kThetaStride) + xOffset * kThetaStride + thetaIndex];
   806         for (int thetaIndexInner = 0; thetaIndexInner < 4; thetaIndexInner++) {
   807           const int thetaIndex = thetaIndexOuter + thetaIndexInner;
   809           const int global_idx_x = r_global_indices[thetaIndexInner].x;
   810           const int global_idx_y = r_global_indices[thetaIndexInner].y;
   811           const int global_idx_z = r_global_indices[thetaIndexInner].z;
   813           const int shared_idx_x = global_idx_x + xOffset;
   814           const float x_val = r_x_val[thetaIndexInner];
   817           int r_shared_idx_y[kYItersPerThread];
   818           float r_grid_val_xy[kYItersPerThread];
   820           for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
   821             const int yOffset = 2 * yPartialOffset;
   822             r_shared_idx_y[yIter] = global_idx_y + yPartialOffset + yIter * kYValPerIter;
   823             // The compiler should auto vectorize these loads
   824             const float y_val = s_theta[1 * (kOrder * kThetaStride) + yOffset * kThetaStride + 2 * thetaIndex + yIter];
   825             r_grid_val_xy[yIter] = x_val * y_val;
   828           int r_shared_idx_z[kZItersPerThread];
   829           float r_z_val[kZItersPerThread];
   831           for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
   832             const int zOffset = 2 * zPartialOffset;
   833             r_z_val[zIter] = s_theta[2 * (kOrder * kThetaStride) + zOffset * kThetaStride + 2 * thetaIndex + zIter];
   834             r_shared_idx_z[zIter] = global_idx_z + zPartialOffset + zIter * kZValPerIter;
   838           for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
   840             for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
   841               const float grid_val = r_grid_val_xy[yIter] * r_z_val[zIter];
   842               const int shared_idx = shared_idx_x + kPatchGridDimPad * (r_shared_idx_y[yIter] + r_shared_idx_z[zIter] * kPatchGridDim);
   843               s_grid[shared_idx] += grid_val;
   850       // Compute for the remaining atoms
   851       for (int thetaIndex = thetaIndexOuter ; thetaIndex < atomsToProcess; thetaIndex++) {
   852         const int global_idx_x = s_indices[thetaIndex].x;
   853         const int global_idx_y = s_indices[thetaIndex].y;
   854         const int global_idx_z = s_indices[thetaIndex].z;
   856         const int shared_idx_x = global_idx_x + xOffset;
   857         const float x_val = s_theta[0 * (kOrder * kThetaStride) + xOffset * kThetaStride + thetaIndex];
   859         int r_shared_idx_y[kYItersPerThread];
   860         float r_grid_val_xy[kYItersPerThread];
   862         for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
   863           const int yOffset = 2 * yPartialOffset;
   864           r_shared_idx_y[yIter] = global_idx_y + yPartialOffset + yIter * kYValPerIter;
   865           const float y_val = s_theta[1 * (kOrder * kThetaStride) + yOffset * kThetaStride + 2 * thetaIndex + yIter];
   866           r_grid_val_xy[yIter] = x_val * y_val;
   869         int r_shared_idx_z[kZItersPerThread];
   870         float r_z_val[kZItersPerThread];
   872         for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
   873           const int zOffset = 2 * zPartialOffset;
   874           r_z_val[zIter] = s_theta[2 * (kOrder * kThetaStride) + zOffset * kThetaStride + 2 * thetaIndex + zIter];
   875           r_shared_idx_z[zIter] = global_idx_z + zPartialOffset + zIter * kZValPerIter;
   879         for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
   881           for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
   882             const float grid_val = r_grid_val_xy[yIter] * r_z_val[zIter];
   883             const int shared_idx = shared_idx_x + kPatchGridDimPad * (r_shared_idx_y[yIter] + r_shared_idx_z[zIter] * kPatchGridDim);
   884             s_grid[shared_idx] += grid_val;
   891     // Write Shared grid to global memory
   893     const int indexInWarp = threadIdx.x % 32;
   894     const int warpIndex = threadIdx.x / 32;
   895     const int numWarps = blockDim.x / 32;
   896     const bool isActive = (indexInWarp < patchGridDim.x);
   897     if (!isActive) continue;
   899     const int shared_idx_x = indexInWarp;
   900     int global_idx_x = shared_idx_x + patchGridOffset.x;
   901     if (global_idx_x >= nfftx) global_idx_x -= nfftx;
   902     else if (global_idx_x < 0) global_idx_x += nfftx;
   905     for (int yIter = warpIndex; yIter < patchGridDim.y; yIter += numWarps) {
   906       const int shared_idx_y = yIter;
   907       int global_idx_y = shared_idx_y + patchGridOffset.y;
   908       if (global_idx_y >= nffty) global_idx_y -= nffty;
   909       else if (global_idx_y < 0) global_idx_y += nffty;
   911       for (int zIter = 0; zIter < patchGridDim.z; zIter += 1) {
   912         const int shared_idx_z = zIter;
   913         int global_idx_z = shared_idx_z + patchGridOffset.z;
   914         if (global_idx_z >= nfftz) global_idx_z -= nfftz;
   915         else if (global_idx_z < 0) global_idx_z += nfftz;
   917         const int shared_idx = shared_idx_x + kPatchGridDimPad * (shared_idx_y + shared_idx_z * kPatchGridDim);
   918         const int global_idx = global_idx_x + xdim * (global_idx_y + ysize * global_idx_z);
   920         atomicAdd(data + global_idx, s_grid[shared_idx]);
   921         s_grid[shared_idx] = 0.0f;
   928 //-----------------------------------------------------------------------------------------
   929 // Generic version cannot be used
   930 template <typename T> __forceinline__ __device__
   931 void gather_force_store(const float fx, const float fy, const float fz,
   932       const int stride, const int pos,
   936 // Template specialization for "float"
   937 template <> __forceinline__ __device__
   938 void gather_force_store<float>(const float fx, const float fy, const float fz, 
   939              const int stride, const int pos, 
   941   // Store into strided float array: the x, y, z components are separated by 'stride'
   943   force[pos+stride]   = fy;
   944   force[pos+stride*2] = fz;
   947 // Template specialization for "float3"
   948 template <> __forceinline__ __device__
   949 void gather_force_store<float3>(const float fx, const float fy, const float fz, 
   950         const int stride, const int pos, 
   952   // Store into non-strided "float3" array
   954   // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
   955   // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
   956   reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
   957   reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
   958   reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
   965 //-----------------------------------------------------------------------------------------
   967 // Per-atom data structure for the gather_force kernels
   968 template <typename T, int order>
   986 // Gathers forces from the grid
   987 // blockDim.x            = Number of atoms each block loads
   988 // blockDim.x*blockDim.y = Total number of threads per block
   990 template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
   991 __global__ void gather_force(const float4 *xyzq, const int ncoord,
   992               const int nfftx, const int nffty, const int nfftz,
   993               const int xsize, const int ysize, const int zsize,
   994               const int xdim, const int y00, const int z00,
   995               // const float recip1, const float recip2, const float recip3,
   996               const float* data,      // NOTE: data is used for loads when __CUDA_ARCH__ >= 350
   998               const cudaTextureObject_t gridTexObj,
  1003   const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
  1006   __shared__ gather_t<CT, order> shmem[32];
  1008   const int pos = blockIdx.x*blockDim.x + threadIdx.x;
  1009   const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
  1011   // Load atom data into shared memory
  1012   if (pos < pos_end && threadIdx.y == 0) {
  1014     float4 xyzqi = xyzq[pos];
  1020     float frx = ((float)nfftx)*x;
  1021     float fry = ((float)nffty)*y;
  1022     float frz = ((float)nfftz)*z;
  1024     int frxi = (int)frx;
  1025     int fryi = (int)fry;
  1026     int frzi = (int)frz;
  1028     float wx = frx - (float)frxi;
  1029     float wy = fry - (float)fryi;
  1030     float wz = frz - (float)frzi;
  1032     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
  1033     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
  1035     shmem[threadIdx.x].ix = frxi;
  1036     shmem[threadIdx.x].iy = fryi - y00;
  1037     shmem[threadIdx.x].iz = frzi - z00;
  1038     shmem[threadIdx.x].charge = q;
  1040     float3 theta_tmp[order];
  1041     float3 dtheta_tmp[order];
  1042     calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
  1045     for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
  1048     for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
  1051     for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
  1054     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
  1057     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
  1060     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
  1065   // We divide the order x order x order cube into 8 sub-cubes.
   1066   // Each sub-cube is handled by a single thread.
   1067   // The size of each sub-cube is:
  1071   const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
  1072   // Calculate the starting index on the sub-cube for this thread
  1074   const int t = (tid % 8);         // sub-cube index (0...7)
  1075   // t = (tx0 + ty0*2 + tz0*4)/nsc
   1076   // (tx0, ty0, tz0) gives the starting index of the nsc x nsc x nsc sub-cube
  1077   const int tz0 = (t / 4)*nsc;
  1078   const int ty0 = ((t / 2) % 2)*nsc;
  1079   const int tx0 = (t % 2)*nsc;
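          // e.g. for order = 4 (nsc = 2), the thread with t = 5 starts at
          // (tx0, ty0, tz0) = (2, 0, 2) and covers the 2x2x2 block [2..3]x[0..1]x[2..3]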
  1082   // Calculate forces for 32 atoms. We have 32*2 = 64 threads
  1083   // Loop is iterated 4 times:
  1085   // Threads 0...7   = atoms 0, 8,  16, 24
  1086   // Threads 8...15  = atoms 1, 9,  17, 25
   1087   // Threads 16...23 = atoms 2, 10, 18, 26
  1089   // Threads 56...63 = atoms 7, 15, 23, 31
  1093   const int base_end = pos_end - blockIdx.x*blockDim.x;
  1095   // Make sure to mask out any threads that are not running the loop.
  1096   // This will happen if the number of atoms is not a multiple of 32.
  1097 #if defined(NAMD_HIP)
  1098   WarpMask warp_mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
  1100   WarpMask warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
  1103   while (base < base_end) {
  1108     int ix0 = shmem[base].ix;
  1109     int iy0 = shmem[base].iy;
  1110     int iz0 = shmem[base].iz;
  1112     // Each thread calculates a nsc x nsc x nsc sub-cube
  1114     for (int i=0;i < nsc*nsc*nsc;i++) {
  1115       int tz = tz0 + (i/(nsc*nsc));
  1116       int ty = ty0 + ((i/nsc) % nsc);
  1117       int tx = tx0 + (i % nsc);
  1122       if (ix >= nfftx) ix -= nfftx;
  1124       if (periodicY  && (iy >= nffty)) iy -= nffty;
  1125       if (!periodicY && (iy < 0 || iy >= ysize)) continue;
  1127       if (periodicZ  && (iz >= nfftz)) iz -= nfftz;
  1128       if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
  1130       int ind = ix + (iy + iz*ysize)*xdim;
  1132 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
  1133       float q0 = __ldg(&data[ind]);
  1135       float q0 = tex1Dfetch<float>(gridTexObj, ind);
  1137       float thx0 = shmem[base].thetax[tx];
  1138       float thy0 = shmem[base].thetay[ty];
  1139       float thz0 = shmem[base].thetaz[tz];
  1140       float dthx0 = shmem[base].dthetax[tx];
  1141       float dthy0 = shmem[base].dthetay[ty];
  1142       float dthz0 = shmem[base].dthetaz[tz];
  1143       f1 += dthx0 * thy0 * thz0 * q0;
  1144       f2 += thx0 * dthy0 * thz0 * q0;
  1145       f3 += thx0 * thy0 * dthz0 * q0;
  1148     //-------------------------
  1151     const int i = threadIdx.x & 7;
  1153     f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
  1154     f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
  1155     f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
  1157     f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
  1158     f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
  1159     f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
  1161     f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
  1162     f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
  1163     f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
  1166       shmem[base].f1 = f1;
  1167       shmem[base].f2 = f2;
  1168       shmem[base].f3 = f3;
  1172 #if defined(NAMD_HIP)
  1173     warp_mask = NAMD_WARP_BALLOT(warp_mask, (base < base_end) );
  1175     warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
  1181   if (pos < pos_end && threadIdx.y == 0) {
  1182     float f1 = shmem[threadIdx.x].f1;
  1183     float f2 = shmem[threadIdx.x].f2;
  1184     float f3 = shmem[threadIdx.x].f3;
  1185     float q = -shmem[threadIdx.x].charge; //*ccelec_float;
  1186     // float fx = q*recip1*f1*nfftx;
  1187     // float fy = q*recip2*f2*nffty;
  1188     // float fz = q*recip3*f3*nfftz;
  1189     float fx = q*f1*nfftx;
  1190     float fy = q*f2*nffty;
  1191     float fz = q*f3*nfftz;
  1192     gather_force_store<FT>(fx, fy, fz, stride, pos, force);
  1197 template<typename CT, typename FT, int kPatchGridDim>
  1199 __launch_bounds__(PatchLevelPmeData::kNumThreads, 1)
  1200 gather_force_patch_8th( const int numPatches,
  1201   const int3 patchGridDim,
  1202   const CudaLocalRecord*        localRecords,
  1203   const int3* patchGridOffsets,
  1205   const int nfftx, const int nffty, const int nfftz,
  1206   const int xsize, const int ysize, const int zsize,
  1207   const int xdim, const int y00, const int z00,
  1212   // Define local copies of constexpr variables
  1213   constexpr int kNumThreads = PatchLevelPmeData::kNumThreads;
  1214   constexpr int kThetaStride = kNumThreads + PatchLevelPmeData::kThetaPad;
  1215   constexpr int kDims = PatchLevelPmeData::kDim;
  1216   constexpr int kPatchGridDimPad = PatchLevelPmeData::kPatchGridDim;
  1217   constexpr int kOrder = 8;
  1219   // Setup shared memory buffers
  1220   extern __shared__ float sharedBuffer[];
  1221   float2* s_x_theta = (float2*) sharedBuffer;
  1222   float* s_grid = (float*) &sharedBuffer[2 * kOrder * kThetaStride];
  1224   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
  1225     const int numAtoms = localRecords[patchIndex].numAtoms;
  1226     const int offset = localRecords[patchIndex].bufferOffset;
  1227     const int3 patchGridOffset = patchGridOffsets[patchIndex];
  1230     // Load grid into shared memory
  1232     const int indexInWarp = threadIdx.x % 32;
  1233     const int warpIndex = threadIdx.x / 32;
  1234     const int numWarps = blockDim.x / 32;
  1235     const bool isActive = (indexInWarp < patchGridDim.x);
  1237     const int shared_idx_x = indexInWarp;
  1238     int global_idx_x = shared_idx_x + patchGridOffset.x;
  1239     if (global_idx_x >= nfftx) global_idx_x -= nfftx;
  1240     else if (global_idx_x < 0) global_idx_x += nfftx;
  1243     for (int yIter = warpIndex; yIter < patchGridDim.y; yIter += numWarps) {
  1244       const int shared_idx_y = yIter;
  1245       int global_idx_y = shared_idx_y + patchGridOffset.y;
  1246       if (global_idx_y >= nffty) global_idx_y -= nffty;
  1247       else if (global_idx_y < 0) global_idx_y += nffty;
  1249       for (int zIter = 0; zIter < patchGridDim.z; zIter += 1) {
  1250         const int shared_idx_z = zIter;
  1251         int global_idx_z = shared_idx_z + patchGridOffset.z;
  1252         if (global_idx_z >= nfftz) global_idx_z -= nfftz;
  1253         else if (global_idx_z < 0) global_idx_z += nfftz;
  1255         const int shared_idx = shared_idx_x + kPatchGridDimPad * (shared_idx_y + shared_idx_z * kPatchGridDim);
  1256         const int global_idx = global_idx_x + xdim * (global_idx_y + ysize * global_idx_z);
  1259           s_grid[shared_idx] = data[global_idx];
   1265     // Iterate over the atoms in chunks of size equal to the number of threads. For small systems, we can
  1266     // assign multiple thread blocks per patch in the Y dimension to increase the amount of parallelism
  1268     // Each thread will compute the forces on one atom; this simplifies the kernel since it doesn't need
  1269     // to reduce any partial sums of forces.
  1270     const int numIters = (numAtoms - 1 + kNumThreads) / kNumThreads;
  1271     for (int iter = blockIdx.y; iter < numIters; iter += gridDim.y) {
  1272       const int atomIndex = iter * blockDim.x + threadIdx.x;
  1274       if (atomIndex < numAtoms) {
  1275         gather_t<CT, kOrder> atomData;
  1277         float4 xyzqi = xyzq[atomIndex + offset];
  1278         // The boundary conditions are handled when the grid is loaded into shared memory, so we 
   1279         // just need to shift the positions here
  1280         float x = xyzqi.x + 0.5f;  
  1281         float y = xyzqi.y + 0.5f;
  1282         float z = xyzqi.z + 0.5f;
  1285         float frx = ((float)nfftx)*x;
  1286         float fry = ((float)nffty)*y;
  1287         float frz = ((float)nfftz)*z;
  1289         int frxi = (int)floorf(frx);
  1290         int fryi = (int)floorf(fry);
  1291         int frzi = (int)floorf(frz);
  1293         float wx = frx - (float)frxi;
  1294         float wy = fry - (float)fryi;
  1295         float wz = frz - (float)frzi;
  1297         atomData.ix = frxi - patchGridOffset.x;
  1298         atomData.iy = fryi - patchGridOffset.y;
  1299         atomData.iz = frzi - patchGridOffset.z;
  1300         atomData.charge = q;
  1302         float3 theta_tmp[kOrder];
  1303         float3 dtheta_tmp[kOrder];
  1304         calc_theta_dtheta<float, float3, kOrder>(wx, wy, wz, theta_tmp, dtheta_tmp);
  1307         for (int i=0;i < kOrder;i++) {
  1308           // x theta values are stored in shared memory to reduce register pressure
  1309           s_x_theta[i * kThetaStride + threadIdx.x].x = theta_tmp[i].x;
  1310           s_x_theta[i * kThetaStride + threadIdx.x].y = dtheta_tmp[i].x;
  1311           atomData.thetay[i] = theta_tmp[i].y;
  1312           atomData.dthetay[i] = dtheta_tmp[i].y;
  1313           atomData.thetaz[i] = theta_tmp[i].z;
  1314           atomData.dthetaz[i] = dtheta_tmp[i].z;
  1323         for (int xIter = 0; xIter < kOrder; xIter++) {
  1324           const int xOffset = xIter;
  1325           // To reduce register pressure, the x theta values are stored in shared memory
  1326           float thx0 = s_x_theta[xOffset * kThetaStride + threadIdx.x].x;
  1327           float dthx0 = s_x_theta[xOffset * kThetaStride + threadIdx.x].y;
  1330           for (int yIter = 0; yIter < kOrder; yIter++) {
  1332             for (int zIter = 0; zIter < kOrder; zIter++) {
  1333               const int yOffset = yIter;
  1334               const int zOffset = zIter;
  1336               const int shared_idx_x = atomData.ix + xOffset;
  1337               const int shared_idx_y = atomData.iy + yIter;
  1338               const int shared_idx_z = atomData.iz + zIter;
  1339               const int shared_idx = shared_idx_x + kPatchGridDimPad * 
  1340                   (shared_idx_y + shared_idx_z * kPatchGridDim);
  1341               // This load is essentially doing random accesses to shared memory, so
   1342               // it results in a significant number of bank conflicts
  1343               const float grid_val = s_grid[shared_idx];
  1345               float thy0 = atomData.thetay[yOffset];
  1346               float thz0 = atomData.thetaz[zOffset];
  1347               float dthy0 = atomData.dthetay[yOffset];
  1348               float dthz0 = atomData.dthetaz[zOffset];
  1350               f1 += dthx0 * thy0 * thz0 * grid_val;
  1351               f2 += thx0 * dthy0 * thz0 * grid_val;
  1352               f3 += thx0 * thy0 * dthz0 * grid_val;
  1357         const float fx = -1.0f * atomData.charge * f1 * nfftx;
  1358         const float fy = -1.0f * atomData.charge * f2 * nffty;
  1359         const float fz = -1.0f * atomData.charge * f3 * nfftz;
  1360         gather_force_store<FT>(fx, fy, fz, stride, atomIndex + offset, force);
  1367 const int TILEDIM = 32;
  1368 const int TILEROWS = 8;
  1370 template <typename T>
  1371 __device__ __forceinline__
  1372 void transpose_xyz_yzx_device(
  1373   const int x_in, const int y_in, const int z_in,
  1374   const int x_out, const int y_out,
  1375   const int nx, const int ny, const int nz,
  1376   const int xsize_in, const int ysize_in,
  1377   const int ysize_out, const int zsize_out,
  1378   const T* data_in, T* data_out) {
  1381   __shared__ T tile[TILEDIM][TILEDIM+1];
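          // The +1 padding staggers tile rows across shared memory banks so that the
          // transposed (column-wise) reads below are free of bank conflicts.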
  1383   // Read (x,y) data_in into tile (shared memory)
  1384   for (int j=0;j < TILEDIM;j += TILEROWS)
  1385     if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
  1386       tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
  1390   // Write (y,x) tile into data_out
  1391   const int z_out = z_in;
  1392   for (int j=0;j < TILEDIM;j += TILEROWS)
  1393     if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
  1394       data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
  1398 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
  1400 template <typename T>
  1401 __global__ void transpose_xyz_yzx_kernel(
  1402   const int nx, const int ny, const int nz,
  1403   const int xsize_in, const int ysize_in,
  1404   const int ysize_out, const int zsize_out,
  1405   const T* data_in, T* data_out) {
  1407   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  1408   int y_in = blockIdx.y * TILEDIM + threadIdx.y;
  1409   int z_in = blockIdx.z           + threadIdx.z;
  1411   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  1412   int y_out = blockIdx.y * TILEDIM + threadIdx.x;
  1414   transpose_xyz_yzx_device<T>(
  1419     ysize_out, zsize_out,
  1424   __shared__ T tile[TILEDIM][TILEDIM+1];
  1426   int x = blockIdx.x * TILEDIM + threadIdx.x;
  1427   int y = blockIdx.y * TILEDIM + threadIdx.y;
  1428   int z = blockIdx.z           + threadIdx.z;
  1430   // Read (x,y) data_in into tile (shared memory)
  1431   for (int j=0;j < TILEDIM;j += TILEROWS)
  1432     if ((x < nx) && (y + j < ny) && (z < nz))
  1433       tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
  1437   // Write (y,x) tile into data_out
  1438   x = blockIdx.x * TILEDIM + threadIdx.y;
  1439   y = blockIdx.y * TILEDIM + threadIdx.x;
  1440   for (int j=0;j < TILEDIM;j += TILEROWS)
  1441     if ((x + j < nx) && (y < ny) && (z < nz))
  1442       data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
  1447 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(y, z, x)
  1448 // Batch index bi is encoded in blockIdx.z, where 
  1449 // blockIdx.z = 0...nz-1 are for batch 1
  1450 // blockIdx.z = nz...2*nz-1 are for batch 2
  1452 // gridDim.z = nz*numBatches
  1454 template <typename T>
  1455 __global__ void batchTranspose_xyz_yzx_kernel(
  1456   const TransposeBatch<T>* batches,
  1457   const int ny, const int nz, 
  1458   const int xsize_in, const int ysize_in) {
  1460   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  1461   int y_in = blockIdx.y * TILEDIM + threadIdx.y;
  1462   int z_in = (blockIdx.z % nz)    + threadIdx.z;
  1464   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  1465   int y_out = blockIdx.y * TILEDIM + threadIdx.x;
  1467   int bi = blockIdx.z/nz;
  1469   TransposeBatch<T> batch = batches[bi];
  1471   int ysize_out = batch.ysize_out;
  1472   int zsize_out = batch.zsize_out;
  1473   T* data_in    = batch.data_in;
  1474   T* data_out   = batch.data_out;
  1476   transpose_xyz_yzx_device<T>(
  1481     ysize_out, zsize_out,
  1488 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
  1490 template <typename T>
  1491 __forceinline__ __device__
  1492 void transpose_xyz_yzx_dev(
  1494   const int nx, const int ny, const int nz,
  1495   const int xsize_in, const int ysize_in,
  1496   const int xsize_out, const int ysize_out,
  1497   const T* data_in, T* data_out) {
  1500   __shared__ T tile[TILEDIM][TILEDIM+1];
  1502   int x = blockIdx.x * TILEDIM + threadIdx.x;
  1503   int y = blockIdx.y * TILEDIM + threadIdx.y;
  1504   // int z = blockIdx.z           + threadIdx.z;
  1505   int z = blockz               + threadIdx.z;
  1507   // Read (x,y) data_in into tile (shared memory)
  1508   for (int j=0;j < TILEDIM;j += TILEROWS)
  1509     if ((x < nx) && (y + j < ny) && (z < nz))
  1510       tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
  1514   // Write (y,x) tile into data_out
  1515   x = blockIdx.x * TILEDIM + threadIdx.y;
  1516   y = blockIdx.y * TILEDIM + threadIdx.x;
  1517   for (int j=0;j < TILEDIM;j += TILEROWS)
  1518     if ((x + j < nx) && (y < ny) && (z < nz))
  1519       data_out[y + (z + (x+j)*ysize_out)*xsize_out] = tile[threadIdx.x][threadIdx.y + j];
  1524 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
  1525 // (nx, ny, nz)                     = size of the transposed volume
  1526 // (xsize_in, ysize_in, zsize_in)   = size of the input data
   1527 // The input is split into nblock memory blocks; blockIdx.z encodes both the block
   1528 // index (blockIdx.z / nz) and the z-slice within the block (blockIdx.z % nz)
  1529 template <typename T>
  1530 __global__ void transpose_xyz_yzx_kernel(
  1532   const int nx, const int ny, const int nz,
  1533   const int xsize_in, const int ysize_in,
  1534   const int xsize_out, const int ysize_out,
  1535   const T* data_in, T* data_out) {
  1537   const int iblock = blockIdx.z/nz;
  1539   if (iblock < nblock) {
  1540     transpose_xyz_yzx_dev(blockIdx.z % nz, nx, ny, nz,
  1541       xsize_in, ysize_in, xsize_out, ysize_out,
  1548 template <typename T>
  1549 __device__ __forceinline__
  1550 void transpose_xyz_zxy_device(
  1551   const int x_in, const int y_in, const int z_in,
  1552   const int x_out, const int z_out,
  1553   const int nx, const int ny, const int nz,
  1554   const int xsize_in, const int ysize_in,
  1555   const int zsize_out, const int xsize_out,
  1556   const T* data_in, T* data_out) {
  1559   __shared__ T tile[TILEDIM][TILEDIM+1];
  1561   // Read (x,z) data_in into tile (shared memory)
  1562   for (int k=0;k < TILEDIM;k += TILEROWS)
  1563     if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
  1564       tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
  1568   // Write (z,x) tile into data_out
  1569   const int y_out = y_in;
  1570   for (int k=0;k < TILEDIM;k += TILEROWS)
  1571     if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
  1572       data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
  1576 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(z, x, y)
  1578 template <typename T>
  1579 __global__ void transpose_xyz_zxy_kernel(
  1580   const int nx, const int ny, const int nz,
  1581   const int xsize_in, const int ysize_in,
  1582   const int zsize_out, const int xsize_out,
  1583   const T* data_in, T* data_out) {
  1585   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  1586   int y_in = blockIdx.z           + threadIdx.z;
  1587   int z_in = blockIdx.y * TILEDIM + threadIdx.y;
  1589   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  1590   int z_out = blockIdx.y * TILEDIM + threadIdx.x;
  1592   transpose_xyz_zxy_device<T>(
  1593     x_in, y_in, z_in, x_out, z_out,
  1596     zsize_out, xsize_out,
  1602 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(z, x, y)
  1603 // Batch index bi is encoded in blockIdx.z, where 
  1604 // blockIdx.z = 0...ny-1 are for batch 1
  1605 // blockIdx.z = ny...2*ny-1 are for batch 2
  1607 // gridDim.z = ny*numBatches
  1609 template <typename T>
  1610 __global__ void batchTranspose_xyz_zxy_kernel(
  1611   const TransposeBatch<T>* batches,
  1612   const int ny, const int nz, 
  1613   const int xsize_in, const int ysize_in) {
  1615   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  1616   int y_in = (blockIdx.z % ny)    + threadIdx.z;
  1617   int z_in = blockIdx.y * TILEDIM + threadIdx.y;
  1619   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  1620   int z_out = blockIdx.y * TILEDIM + threadIdx.x;
  1622   int bi = blockIdx.z/ny;
  1624   TransposeBatch<T> batch = batches[bi];
  1626   int zsize_out = batch.zsize_out;
  1627   int xsize_out = batch.xsize_out;
  1628   T* data_in    = batch.data_in;
  1629   T* data_out   = batch.data_out;
  1631   transpose_xyz_zxy_device<T>(
  1632     x_in, y_in, z_in, x_out, z_out,
  1635     zsize_out, xsize_out,
  1640 // XXX should be defined in a centralized place
  1642 #define SQRT_PI 1.7724538509055160273 /* sqrt(pi), Mathematica, 20 significant digits */
  1644 template<int BLOCK_SIZE>
  1645 __global__ void selfEnergyKernel(
  1647   const float4 *atoms,
  1652   double qq=0; // per-thread partial sum of q^2, reduced across the block below
  1653   int i = threadIdx.x + blockIdx.x * blockDim.x;
  1655     double q = atoms[i].w;
  1659   typedef cub::BlockReduce<double, BLOCK_SIZE> BlockReduce;
  1660   __shared__ typename BlockReduce::TempStorage temp_storage;
  1661   qq = BlockReduce(temp_storage).Sum(qq);
  1665   if (threadIdx.x == 0) {
  1666     // Each charge is already scaled by sqrt(COULOMB * scaling * dielectric_1);
  1667     // apply the remaining -ewaldcof/sqrt(pi) factor to finish the self-energy term.
  1668     double c = -ewaldcof * (1.0 / SQRT_PI);
  1669     atomicAdd(selfEnergy, c*qq);
  1673 double compute_selfEnergy(
  1674     double *d_selfEnergy,
  1675     const float4 *d_atoms,
  1678     cudaStream_t stream)
  1680   double selfEnergy = 0;
  1681   const int block = 128;
  1682   const int grid = (natoms + block - 1) / block;
  1683   selfEnergyKernel<block><<<grid, block, 0, stream>>>(d_selfEnergy,
  1684       d_atoms, natoms, ewaldcof);
  1685   cudaCheck(cudaStreamSynchronize(stream));
  1686   copy_DtoH<double>(d_selfEnergy, &selfEnergy, 1, stream);
  1687   cudaCheck(cudaStreamSynchronize(stream));
  1688   return selfEnergy;
  1689 }
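// For reference, the quantity accumulated above is the usual Ewald self-energy
// correction (a sketch of the math; the charges in atoms[].w are assumed to be
// premultiplied by sqrt(COULOMB * scaling * dielectric_1) as noted in the kernel):
//
//   E_self = -(ewaldcof / sqrt(pi)) * sum_i q_i^2
//
// which is why the kernel only needs a block-wide reduction of q^2 followed by a
// single atomicAdd of c * qq with c = -ewaldcof / sqrt(pi).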
  1691 //#######################################################################################
  1692 //#######################################################################################
  1693 //#######################################################################################
  1695 void spread_charge(const float4 *atoms, const int numAtoms,
  1696   const int nfftx, const int nffty, const int nfftz,
  1697   const int xsize, const int ysize, const int zsize,
  1698   const int xdim, const int y00, const int z00, 
  1699   const bool periodicY, const bool periodicZ,
  1700   float* data, const int order, cudaStream_t stream) {
  1702   dim3 nthread, nblock;
  1709     nblock.x = (numAtoms - 1)/nthread.x + 1;
  1712     if (periodicY && periodicZ)
  1713       spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
  1715          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1717       spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
  1719          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1721       spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
  1723          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1725       spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
  1727          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1734     nblock.x = (numAtoms - 1)/nthread.x + 1;
  1737     if (periodicY && periodicZ)
  1738       spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
  1740          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1742       spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
  1744          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1746       spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
  1748          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1750       spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
  1752          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1759     nblock.x = (numAtoms - 1)/nthread.x + 1;
  1762     if (periodicY && periodicZ)
  1763       spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
  1765          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1767       spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
  1769          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1771       spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
  1773          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1775       spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
  1777          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
  1782     sprintf(str, "spread_charge, order %d not implemented",order);
  1785   cudaCheck(cudaGetLastError());
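// Illustrative host-side call (a sketch, not part of the build): spreading the
// charges of numAtoms atoms (charge in the .w component of each float4) onto a
// single full 64^3 PME grid with 4th-order B-splines. The grid sizes, the pencil
// offsets y00/z00, and the device pointers d_atoms/d_grid are hypothetical.
//
//   spread_charge(d_atoms, numAtoms,
//                 64, 64, 64,        // nfftx, nffty, nfftz
//                 64, 64, 64,        // xsize, ysize, zsize (full grid, no pencils)
//                 64, 0, 0,          // xdim, y00, z00
//                 true, true,        // periodicY, periodicZ
//                 d_grid, 4, stream);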
  1790 // JM: new version of the spread_charge routine
  1791 void spread_charge_v2(
  1792   const PatchLevelPmeData patchLevelPmeData,
  1793   const float4 *atoms, const int numAtoms,
  1794   const int nfftx, const int nffty, const int nfftz,
  1795   const float nfftx_f, const float nffty_f, const float nfftz_f,
  1797   const int xsize, const int ysize, const int zsize,
  1798   const int xdim, const int y00, const int z00, 
  1799   const bool periodicY, const bool periodicZ,
  1800   float* data, const int order, cudaStream_t stream) {
  1802   dim3 nthread, nblock;
  1804   if (patchLevelPmeData.compatible()) {
  1805     const dim3 numBlocks = dim3(patchLevelPmeData.numPatches, 1, 1);
  1806     const int sharedMemory = patchLevelPmeData.spreadChargeSharedBytes;
  1807     cudaFuncSetAttribute(
  1808       (void*)spread_charge_patch_8th<float, patchLevelPmeData.kPatchGridDim>,
  1809       cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemory);  
  1810     spread_charge_patch_8th<float, patchLevelPmeData.kPatchGridDim>
  1811       <<<numBlocks, PatchLevelPmeData::kNumThreads, sharedMemory, stream>>>(
  1812       patchLevelPmeData.numPatches, patchLevelPmeData.patchGridDim, 
  1813       patchLevelPmeData.localRecords, patchLevelPmeData.d_patchGridOffsets, 
  1814       atoms, nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, 
  1815       xsize, ysize, zsize, xdim, y00, z00, data
  1817     cudaCheck(cudaGetLastError());
  1826       nblock.x = (numAtoms - 1)/nthread.x + 1;
  1829       if (periodicY && periodicZ)
  1830         spread_charge_kernel_v2<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
  1832            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1834         spread_charge_kernel_v2<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
  1836            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1838         spread_charge_kernel_v2<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
  1840            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1842         spread_charge_kernel_v2<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
  1844            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1848       nthread.x = WARPSIZE;
  1849       nthread.y = order3 / WARPSIZE;
  1851       nblock.x = (numAtoms - 1)/nthread.x + 1;
  1854       if (periodicY && periodicZ)
  1855         spread_charge_kernel_v2<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
  1857            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1859         spread_charge_kernel_v2<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
  1861            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1863         spread_charge_kernel_v2<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
  1865            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1867         spread_charge_kernel_v2<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
  1869            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1873       nthread.x = WARPSIZE;
  1874       nthread.y = order3 / WARPSIZE;
  1876       nblock.x = (numAtoms - 1)/nthread.x + 1;
  1879       if (periodicY && periodicZ)
  1880         spread_charge_kernel_v2<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
  1882            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1884         spread_charge_kernel_v2<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
  1886            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1888         spread_charge_kernel_v2<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
  1890            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1892         spread_charge_kernel_v2<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
  1894            nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
  1899       sprintf(str, "spread_charge, order %d not implemented",order);
  1902     // cudaCheck(cudaGetLastError());
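// The patch-level path above calls cudaFuncSetAttribute with
// cudaFuncAttributeMaxDynamicSharedMemorySize before the launch so the kernel may
// request more dynamic shared memory than the default 48 KB per-block limit.
// A minimal standalone sketch of that pattern (hypothetical kernel name and size):
//
//   cudaFuncSetAttribute((const void*)myKernel<float>,
//                        cudaFuncAttributeMaxDynamicSharedMemorySize,
//                        96 * 1024);
//   myKernel<float><<<grid, block, 96 * 1024, stream>>>(...);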
  1906 void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
  1907   const int size1, const int size2, const int size3, const double kappa,
  1908   const float recip1x, const float recip1y, const float recip1z,
  1909   const float recip2x, const float recip2y, const float recip2z,
  1910   const float recip3x, const float recip3y, const float recip3z,
  1911   const double volume,
  1912   const float* prefac1, const float* prefac2, const float* prefac3,
  1913   const int k2_00, const int k3_00,
  1914   const bool doEnergyVirial, double* energy, double* virial, float2* data,
  1915   cudaStream_t stream) {
  1924   int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
  1925   if (doEnergyVirial) {    
  1926     shmem_size = max(shmem_size, (int)((nthread/WARPSIZE)*sizeof(RecipVirial_t)));
  1929   float piv_inv = (float)(1.0/(M_PI*volume));
  1930   float fac = (float)(M_PI*M_PI/(kappa*kappa));
  1932   if (doEnergyVirial) {
  1934       scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
  1935       (nfft1, nfft2, nfft3, size1, size2, size3,
  1936         recip1x, recip1y, recip1z,
  1937         recip2x, recip2y, recip2z,
  1938         recip3x, recip3y, recip3z,
  1939         prefac1, prefac2, prefac3,
  1940         fac, piv_inv, k2_00, k3_00, data, energy, virial);
  1942       scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
  1943       (nfft1, nfft2, nfft3, size1, size2, size3,
  1944         recip1x, recip1y, recip1z,
  1945         recip2x, recip2y, recip2z,
  1946         recip3x, recip3y, recip3z,
  1947         prefac1, prefac2, prefac3,
  1948         fac, piv_inv, k2_00, k3_00, data, energy, virial);
  1952       scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
  1953       (nfft1, nfft2, nfft3, size1, size2, size3,
  1954         recip1x, recip1y, recip1z,
  1955         recip2x, recip2y, recip2z,
  1956         recip3x, recip3y, recip3z,
  1957         prefac1, prefac2, prefac3,
  1958         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
  1960       scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
  1961       (nfft1, nfft2, nfft3, size1, size2, size3,
  1962         recip1x, recip1y, recip1z,
  1963         recip2x, recip2y, recip2z,
  1964         recip3x, recip3y, recip3z,
  1965         prefac1, prefac2, prefac3,
  1966         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
  1969   cudaCheck(cudaGetLastError());
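// For reference, fac and piv_inv above correspond to the smooth-PME
// reciprocal-space sum (a sketch of the math; normalization details and the
// m = 0 exclusion are handled inside the kernel):
//
//   E_recip = 1/(2*pi*V) * sum_{m != 0} exp(-pi^2 |m|^2 / kappa^2) / |m|^2
//             * B(m) * |S(m)|^2
//
// with fac = pi^2 / kappa^2, piv_inv = 1/(pi*V), and prefac1/2/3 holding the
// per-dimension B-spline modulus factors that make up B(m).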
  1974   const PatchLevelPmeData patchLevelPmeData,
  1975   const float4 *atoms, const int numAtoms,
  1976   // const float recip11, const float recip22, const float recip33,
  1977   const int nfftx, const int nffty, const int nfftz,
  1978   const int xsize, const int ysize, const int zsize,
  1979   const int xdim, const int y00, const int z00, 
  1980   const bool periodicY, const bool periodicZ,
  1981   const float* data, const int order, float3* force, 
  1983   const cudaTextureObject_t gridTexObj,
  1985   cudaStream_t stream) {
  1987   if (patchLevelPmeData.compatible()) {
  1988     const dim3 numBlocks = dim3(patchLevelPmeData.numPatches, 1, 1);
  1989     const int sharedMemory = patchLevelPmeData.spreadChargeSharedBytes;
  1991     cudaFuncSetAttribute(
  1992       (void*)gather_force_patch_8th<float, float3, PatchLevelPmeData::kPatchGridDim>,
  1993       cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemory);  
  1994     gather_force_patch_8th<float, float3, PatchLevelPmeData::kPatchGridDim>
  1995       <<<numBlocks, PatchLevelPmeData::kNumThreads, sharedMemory, stream>>>(
  1996       patchLevelPmeData.numPatches, patchLevelPmeData.patchGridDim, 
  1997       patchLevelPmeData.localRecords, patchLevelPmeData.d_patchGridOffsets, 
  1998       atoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data,
  2002     cudaCheck(cudaGetLastError());
  2004     dim3 nthread(32, 2, 1);
  2005     dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
  2006     // dim3 nblock(npatch, 1, 1);
  2010       if (periodicY && periodicZ)
  2011         gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
  2012         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2013           // recip11, recip22, recip33,
  2020         gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
  2021         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2022           // recip11, recip22, recip33,
  2029         gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
  2030         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2031           // recip11, recip22, recip33,
  2038         gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
  2039         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2040           // recip11, recip22, recip33,
  2049       if (periodicY && periodicZ)
  2050         gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
  2051         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2052           // recip11, recip22, recip33,
  2059         gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
  2060         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2061           // recip11, recip22, recip33,
  2068         gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
  2069         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2070           // recip11, recip22, recip33,
  2077         gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
  2078         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2079           // recip11, recip22, recip33,
  2088       if (periodicY && periodicZ)
  2089         gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
  2090         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2091           // recip11, recip22, recip33,
  2098         gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
  2099         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2100           // recip11, recip22, recip33,
  2107         gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
  2108         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2109           // recip11, recip22, recip33,
  2116         gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
  2117         (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
  2118           // recip11, recip22, recip33,
  2128       sprintf(str, "gather_force, order %d not implemented",order);
  2131     cudaCheck(cudaGetLastError());
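// Note: force gathering is the adjoint of charge spreading -- each atom
// interpolates the (already convolved) potential grid with the same order-N
// B-spline weights used for spreading, and the gradient of that interpolation
// times the atom charge gives the reciprocal-space force. The gridTexObj argument
// is presumably used to read the charge grid through the texture path on builds
// where that is enabled.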
  2138 void transpose_xyz_yzx(
  2139   const int nx, const int ny, const int nz,
  2140   const int xsize_in, const int ysize_in,
  2141   const int ysize_out, const int zsize_out,
  2142   const float2* data_in, float2* data_out, cudaStream_t stream) {
  2144   dim3 numthread(TILEDIM, TILEROWS, 1);
  2145   dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
  2147   transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
  2148   (nx, ny, nz, xsize_in, ysize_in,
  2149     ysize_out, zsize_out,
  2152   cudaCheck(cudaGetLastError());
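// Launch-geometry example (illustrative): each thread block handles one
// TILEDIM x TILEDIM tile of an (x, y) slab using TILEDIM x TILEROWS threads, and
// gridDim.z covers the nz slabs. For nx = 65, ny = 48 and TILEDIM = 32 this gives
// numblock = (3, 2, nz), with the partially filled edge tiles guarded by the
// bounds checks in the device-side transpose.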
  2156 // Batched transpose
  2158 void batchTranspose_xyz_yzx(
  2159   const int numBatches, TransposeBatch<float2>* batches, 
  2160   const int max_nx, const int ny, const int nz,
  2161   const int xsize_in, const int ysize_in, cudaStream_t stream) {
  2163   dim3 numthread(TILEDIM, TILEROWS, 1);
  2164   dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
  2166   batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
  2167   (batches, ny, nz, xsize_in, ysize_in);
  2169   cudaCheck(cudaGetLastError());
  2175 void transpose_xyz_zxy(
  2176   const int nx, const int ny, const int nz,
  2177   const int xsize_in, const int ysize_in,
  2178   const int zsize_out, const int xsize_out,
  2179   const float2* data_in, float2* data_out, cudaStream_t stream) {
  2181   dim3 numthread(TILEDIM, TILEROWS, 1);
  2182   dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
  2184   transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
  2185   (nx, ny, nz, xsize_in, ysize_in,
  2186     zsize_out, xsize_out,
  2189   cudaCheck(cudaGetLastError());
  2193 // Batched transpose
  2195 void batchTranspose_xyz_zxy(
  2196   const int numBatches, TransposeBatch<float2>* batches, 
  2197   const int max_nx, const int ny, const int nz,
  2198   const int xsize_in, const int ysize_in,
  2199   cudaStream_t stream) {
  2201   dim3 numthread(TILEDIM, TILEROWS, 1);
  2202   dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
  2204   batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
  2205   (batches, ny, nz, xsize_in, ysize_in);
  2207   cudaCheck(cudaGetLastError());
  2210 template <int block_size, bool alchDecouple>
  2211 __global__ void calcSelfEnergyFEPKernel(double* d_selfEnergy, double* d_selfEnergy_FEP, const float4* d_atoms, const int* d_partition, const int len, const double ewaldcof, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down) {
  2212   const int i = blockIdx.x * blockDim.x + threadIdx.x;
  2216   double scaleLambda1 = 0;
  2217   double scaleLambda2 = 0;
  2218   // NOTE: len is NOT the full length of the d_atoms array; it is only the length of the first grid.
  2219   //       d_atoms always holds at least 2 grids by the time this code runs, since we are doing alchemical transformations.
  2221     switch (d_partition[i]) {
  2223         scaleLambda1 = 1.0f;
  2224         scaleLambda2 = 1.0f;
  2225         q = d_atoms[i].w; // or d_atoms_grid2[i].w
  2229         scaleLambda1 = lambda1Up; // lambda1 up
  2230         scaleLambda2 = lambda2Up; // lambda2 up
  2235         scaleLambda1 = lambda1Down; // lambda1 down
  2236         scaleLambda2 = lambda2Down; // lambda2 down
  2237         // The charges in the first grid are already scaled to 0,
  2238         // so fetch the charge from the second grid instead.
  2239         q = d_atoms[i + len].w;
  2247       qq1 = q * q * scaleLambda1;
  2248       qq2 = q * q * scaleLambda2;
  2251   typedef cub::BlockReduce<double, block_size> BlockReduce;
  2252   __shared__ typename BlockReduce::TempStorage temp_storage;
  2253   qq1 = BlockReduce(temp_storage).Sum(qq1);
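  // NOTE: temp_storage is reused for the second reduction below; CUB requires a
  // __syncthreads() before the reuse (elided in this listing).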
  2256   qq2 = BlockReduce(temp_storage).Sum(qq2);
  2258   if (threadIdx.x == 0) {
  2259     const double c = -ewaldcof * (1.0 / SQRT_PI);
  2260     atomicAdd(d_selfEnergy, c*qq1);
  2261     atomicAdd(d_selfEnergy_FEP, c*qq2);
  2266 void calcSelfEnergyFEPWrapper(double* d_selfEnergy, double* d_selfEnergy_FEP, double& h_selfEnergy, double& h_selfEnergyFEP, const float4* d_atoms, const int* d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down, cudaStream_t stream) {
  2267   const int block_size = 128;
  2268   const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
  2270     calcSelfEnergyFEPKernel<block_size, true><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_FEP, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda2Up, lambda1Down, lambda2Down);
  2272     calcSelfEnergyFEPKernel<block_size, false><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_FEP, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda2Up, lambda1Down, lambda2Down);
  2274   copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
  2275   copy_DtoH<double>(d_selfEnergy_FEP, &h_selfEnergyFEP, 1, stream);
  2276   cudaCheck(cudaStreamSynchronize(stream));
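// A sketch of what the FEP kernel accumulates (the mapping of partition codes to
// the "up" and "down" alchemical groups follows the switch above):
//
//   d_selfEnergy     += -(ewaldcof / sqrt(pi)) * sum_i s_i(lambda1) * q_i^2
//   d_selfEnergy_FEP += -(ewaldcof / sqrt(pi)) * sum_i s_i(lambda2) * q_i^2
//
// where s_i is 1 for non-alchemical atoms, lambda{1,2}Up for the "up" group and
// lambda{1,2}Down for the "down" group, so the host can form the FEP energy
// difference from the two sums.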
  2279 template <int block_size, bool alchDecouple>
  2280 __global__ void calcSelfEnergyTIKernel(double* d_selfEnergy, double* d_selfEnergy_TI_1, double* d_selfEnergy_TI_2, const float4* d_atoms, const int* d_partition, const int len, const double ewaldcof, const double lambda1Up, const double lambda1Down) {
  2281   const int i = blockIdx.x * blockDim.x + threadIdx.x;
  2286   double factor_ti_1 = 0.0;
  2287   double factor_ti_2 = 0.0;
  2288   double elecLambda1 = 0.0;
  2290     switch (d_partition[i]) {
  2295         q = d_atoms[i].w; // or d_atoms_grid2[i].w
  2299         elecLambda1 = lambda1Up;
  2306         elecLambda1 = lambda1Down;
  2309         q = d_atoms[i + len].w;
  2316       qq = q * q * elecLambda1;
  2317       qq1 = q * q * factor_ti_1;
  2318       qq2 = q * q * factor_ti_2;
  2321   typedef cub::BlockReduce<double, block_size> BlockReduce;
  2322   __shared__ typename BlockReduce::TempStorage temp_storage;
  2323   qq = BlockReduce(temp_storage).Sum(qq);
  2325   qq1 = BlockReduce(temp_storage).Sum(qq1);
  2327   qq2 = BlockReduce(temp_storage).Sum(qq2);
  2329   if (threadIdx.x == 0) {
  2330     const double c = -ewaldcof * (1.0 / SQRT_PI);
  2331     atomicAdd(d_selfEnergy, c*qq);
  2332     atomicAdd(d_selfEnergy_TI_1, c*qq1);
  2333     atomicAdd(d_selfEnergy_TI_2, c*qq2);
  2338 void calcSelfEnergyTIWrapper(double* d_selfEnergy, double* d_selfEnergy_TI_1, double* d_selfEnergy_TI_2, double& h_selfEnergy, double& h_selfEnergy_TI_1, double& h_selfEnergy_TI_2, const float4* d_atoms, const int* d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda1Down, cudaStream_t stream) {
  2339   const int block_size = 128;
  2340   const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
  2342     calcSelfEnergyTIKernel<block_size, true><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda1Down);
  2344     calcSelfEnergyTIKernel<block_size, false><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda1Down);
  2346   copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
  2347   copy_DtoH<double>(d_selfEnergy_TI_1, &h_selfEnergy_TI_1, 1, stream);
  2348   copy_DtoH<double>(d_selfEnergy_TI_2, &h_selfEnergy_TI_2, 1, stream);
  2349   cudaCheck(cudaStreamSynchronize(stream));
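// Similarly, the TI kernel accumulates q^2 weighted by elecLambda1 into
// d_selfEnergy and by factor_ti_1 / factor_ti_2 (set in the elided switch cases,
// presumably 1 for the corresponding alchemical group and 0 otherwise) into
// d_selfEnergy_TI_1 / d_selfEnergy_TI_2, giving the host the per-group
// sum_i q_i^2 terms it needs for the dE/dlambda self-energy contribution.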
  2352 template <int NUM_ARRAYS>
  2353 __global__ void scaleAndMergeForceKernel(float3* forces, const float* factors, const int num_atoms) {
  2354   const int i = blockIdx.x * blockDim.x + threadIdx.x;
  2355   if (i >= num_atoms) return;
  2356   switch (NUM_ARRAYS) {
  2358       forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1];
  2359       forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1];
  2360       forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1];
  2364       forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2];
  2365       forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2];
  2366       forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2];
  2370       forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2] + forces[i + 3 * num_atoms].x * factors[3];
  2371       forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2] + forces[i + 3 * num_atoms].y * factors[3];
  2372       forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2] + forces[i + 3 * num_atoms].z * factors[3];
  2376       forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2] + forces[i + 3 * num_atoms].x * factors[3] + forces[i + 4 * num_atoms].x * factors[4];
  2377       forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2] + forces[i + 3 * num_atoms].y * factors[3] + forces[i + 4 * num_atoms].y * factors[4];
  2378       forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2] + forces[i + 3 * num_atoms].z * factors[3] + forces[i + 4 * num_atoms].z * factors[4];
  2384 void scaleAndMergeForceWrapper(float3* forces, const float* factors, const size_t num_arrays, const int num_atoms, cudaStream_t stream) {
  2385   const int block_size = 128;
  2386   const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
  2387   switch (num_arrays) {
  2388     case 2: scaleAndMergeForceKernel<2><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
  2389     case 3: scaleAndMergeForceKernel<3><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
  2390     case 4: scaleAndMergeForceKernel<4><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
  2391     case 5: scaleAndMergeForceKernel<5><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
  2393   cudaCheck(cudaGetLastError());
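// Illustrative use of the wrapper above (a sketch; d_forces, d_factors, N and the
// weights w0..w2 are hypothetical): merging three per-grid force arrays stored
// back to back in d_forces, i.e. [0, N), [N, 2N), [2N, 3N), into the first array.
// The kernel dereferences factors on the device, so the weights must live in
// device (or managed) memory.
//
//   const float h_factors[3] = {w0, w1, w2};
//   cudaCheck(cudaMemcpyAsync(d_factors, h_factors, 3 * sizeof(float),
//                             cudaMemcpyHostToDevice, stream));
//   scaleAndMergeForceWrapper(d_forces, d_factors, 3, N, stream);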