3 #if __CUDACC_VER_MAJOR__ >= 11
     6 #include <namd_cub/cub.cuh>
     7 #endif //CUDACC version
     9 #ifdef NAMD_HIP //NAMD_HIP
    10 #include <hip/hip_runtime.h>
    11 #include <hipcub/hipcub.hpp>
    15 #include "HipDefines.h"
    16 #include "CudaUtils.h"
    17 #include "CudaTileListKernel.h"
    18 #include "CudaComputeGBISKernel.h"
    19 #include "DeviceCUDA.h"
    22 #include "ComputeGBIS.inl"
    24 #define __thread __declspec(thread)
    26 extern __thread DeviceCUDA *deviceCUDA;
    28 #define LARGE_FLOAT (float)(1.0e10)
    30 #define GBISKERNEL_NUM_WARP 4
    32 __device__ __forceinline__
    33 void shuffleNext(float& w) {
    34   w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    38 template <int phase> struct GBISParam {};
    39 template <int phase> struct GBISInput {};
    40 template <int phase> struct GBISResults {};
    42 // ------------------------------------------------------------------------------------------------
    43 // Phase 1 specialization
    45 template <> struct GBISParam<1> {
    49 template <> struct GBISInput<1> {
    51   float intRad0j, intRadSi;
    54   __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    58   __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    62   __device__ __forceinline__ void initQi(const GBISParam<1> param, const float q) {}
    63   __device__ __forceinline__ void initQj(const float q) {}
    64   __device__ __forceinline__ void shuffleNext() {
    65     qj       = WARP_SHUFFLE(WARP_FULL_MASK, qj,       (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    66     intRad0j = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    70 template <> struct GBISResults<1> {
    72   __device__ __forceinline__ void init() {psiSum = 0.0f;}
    73   __device__ __forceinline__ void shuffleNext() {
    74     psiSum = WARP_SHUFFLE(WARP_FULL_MASK, psiSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    78 // calculate H(i,j) [and not H(j,i)]
    79 template <bool doEnergy, bool doSlow>
    80 __device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
    81   const float cutoff2, const GBISParam<1> param, const GBISInput<1> inp,
    82   GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {
    84   if (r2 < cutoff2 && r2 > 0.01f) {
    85     float r_i = rsqrtf(r2);
    89     CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
    93     CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
    99 __device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res, float* out, float4* forces) {
   100   atomicAdd(&out[i], res.psiSum);
   103 // ------------------------------------------------------------------------------------------------
   106 template <> struct GBISParam<2> {
   108   float epsilon_p_i, epsilon_s_i;
   110   float r_cut2, r_cut_2, r_cut_4;
   114 template <> struct GBISInput<2> {
   116   float bornRadI, bornRadJ;
   118   __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
   121   __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
   124   __device__ __forceinline__ void initQi(const GBISParam<2> param, const float q) {
   125     qi = -q*param.scaling;    
   127   __device__ __forceinline__ void initQj(const float q) {
   130   __device__ __forceinline__ void shuffleNext() {
   131     qj       = WARP_SHUFFLE(WARP_FULL_MASK, qj,       (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   132     bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   136 template <> struct GBISResults<2> {
   139   __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
   140   __device__ __forceinline__ void shuffleNext() {
   141     force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   142     force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   143     force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   144     dEdaSum = WARP_SHUFFLE(WARP_FULL_MASK, dEdaSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   148 template <bool doEnergy, bool doSlow>
   149 __device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
   150   const float cutoff2, const GBISParam<2> param, const GBISInput<2> inp,
   151   GBISResults<2>& resI, GBISResults<2>& resJ, float& energyT) {
   153   if (r2 < cutoff2 && r2 > 0.01f) {
   154     float r_i = rsqrtf(r2);
   156     //float bornRadJ = sh_jBornRad[j];
   158     //calculate GB energy
   159     float qiqj = inp.qi*inp.qj;
   160     float aiaj = inp.bornRadI*inp.bornRadJ;
   161     float aiaj4 = 4*aiaj;
   162     float expr2aiaj4 = exp(-r2/aiaj4);
   163     float fij = sqrt(r2 + aiaj*expr2aiaj4);
   165     float expkappa = exp(-param.kappa*fij);
   166     float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
   167     float gbEij = qiqj*Dij*f_i;
   169     //calculate energy derivatives
   170     float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
   171     float ddrf_i = -ddrfij*f_i*f_i;
   172     float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
   173     float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
   175     //NAMD smoothing function
   177     float ddrScale = 0.f;
   179     if (param.smoothDist > 0.f) {
   180       scale = r2 * param.r_cut_2 - 1.f;
   182       ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
   183       if (doEnergy) energyT += gbEij * scale;
   184       forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
   186       if (doEnergy) energyT += gbEij;
   187       forcedEdr = ddrGbEij;
   192       float dEdai = 0.5f*qiqj*f_i*f_i
   193                 *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
   194                 *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
   195       resI.dEdaSum += dEdai;
   196       float dEdaj = 0.5f*qiqj*f_i*f_i
   197                 *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
   198                 *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
   199       resJ.dEdaSum += dEdaj;
   203     float tmpx = dx*forcedEdr;
   204     float tmpy = dy*forcedEdr;
   205     float tmpz = dz*forcedEdr;
   206     resI.force.x += tmpx;
   207     resI.force.y += tmpy;
   208     resI.force.z += tmpz;
   209     resJ.force.x -= tmpx;
   210     resJ.force.y -= tmpy;
   211     resJ.force.z -= tmpz;
   217       float fij = inp.bornRadI;//inf
   218       float expkappa = exp(-param.kappa*fij);//0
   219       float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
   220       float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
   221       energyT += 0.5f*gbEij;
   222     } //same atom or within cutoff
   226 __device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res, float* out, float4* forces) {
   227   atomicAdd(&out[i], res.dEdaSum);
   228   atomicAdd(&forces[i].x, res.force.x);
   229   atomicAdd(&forces[i].y, res.force.y);
   230   atomicAdd(&forces[i].z, res.force.z);
   233 // ------------------------------------------------------------------------------------------------
   235 template <> struct GBISParam<3> {
   239 template <> struct GBISInput<3> {
   241   float intRadSJ, intRadJ0, intRadIS;
   242   float dHdrPrefixI, dHdrPrefixJ;
   246   __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
   249     dHdrPrefixI = inp3[i];
   251   __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
   254     dHdrPrefixJ = inp3[i];
   256   __device__ __forceinline__ void initQi(const GBISParam<3> param, const float q) {}
   257   __device__ __forceinline__ void initQj(const float q) {}
   258   __device__ __forceinline__ void shuffleNext() {
   259     intRadSJ    = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ,    (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   260     dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   261     intRadJ0    = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0,    (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   265 template <> struct GBISResults<3> {
   267   __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
   268   __device__ __forceinline__ void shuffleNext() {
   269     force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   270     force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   271     force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
   275 template <bool doEnergy, bool doSlow>
   276 __device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
   277   const float cutoff2, const GBISParam<3> param, const GBISInput<3> inp,
   278   GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {
   280   if (r2 < cutoff2 && r2 > 0.01f) {
   281     float r_i = rsqrtf(r2);
   285     CalcDH(r, r2, r_i, param.a_cut, inp.qi,       inp.intRadSJ, dhij, dij);
   286     CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);
   288     float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
   289     float tmpx = dx * forceAlpha;
   290     float tmpy = dy * forceAlpha;
   291     float tmpz = dz * forceAlpha;
   292     resI.force.x += tmpx;
   293     resI.force.y += tmpy;
   294     resI.force.z += tmpz;
   295     resJ.force.x -= tmpx;
   296     resJ.force.y -= tmpy;
   297     resJ.force.z -= tmpz;
   302 __device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res, float* out, float4* forces) {
   303   atomicAdd(&forces[i].x, res.force.x);
   304   atomicAdd(&forces[i].y, res.force.y);
   305   atomicAdd(&forces[i].z, res.force.z);
   308 // ------------------------------------------------------------------------------------------------
   313 template <bool doEnergy, bool doSlow, int phase>
   315 __launch_bounds__(WARPSIZE*GBISKERNEL_NUM_WARP, 6)
   316 GBIS_Kernel(const int numTileLists,
   317   const TileList* __restrict__ tileLists,
   318   const int* __restrict__ tileJatomStart,
   319   const PatchPairRecord* __restrict__ patchPairs,
   320   const float3 lata, const float3 latb, const float3 latc,
   321   const float4* __restrict__ xyzq,
   323   const GBISParam<phase> param,
   324   const float* __restrict__ inp1,
   325   const float* __restrict__ inp2,
   326   const float* __restrict__ inp3,
   327   float* __restrict__ out,
   328   float4* __restrict__ forces,
   329   TileListVirialEnergy* __restrict__ virialEnergy) {
   331   // Single warp takes care of one list of tiles
   332   for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;itileList < numTileLists;itileList += blockDim.x/WARPSIZE*gridDim.x)
   334     TileList tmp = tileLists[itileList];
   335     int iatomStart = tmp.iatomStart;
   336     int jtileStart = tmp.jtileStart;
   337     int jtileEnd   = tmp.jtileEnd;
   338     PatchPairRecord PPStmp = patchPairs[itileList];
   339     int iatomSize     = PPStmp.iatomSize;
   340     int jatomSize     = PPStmp.jatomSize;
   342     float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
   343     float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
   344     float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
   346     // Warp index (0...warpsize-1)
   347     const int wid = threadIdx.x % WARPSIZE;
   349     // Load i-atom data (and shift coordinates)
   350     float4 xyzq_i = xyzq[iatomStart + wid];
   355     GBISInput<phase> inp;
   356     inp.loadI(iatomStart + wid, inp1, inp2, inp3);
   357     if (phase == 2) inp.initQi(param, xyzq_i.w);
   360     int nloopi = min(iatomSize - iatomStart, WARPSIZE);
   362     GBISResults<phase> resI;
   365     if (doEnergy) energyT = 0.0f;
   367     for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
   369       // Load j-atom starting index and exclusion mask
   370       int jatomStart = tileJatomStart[jtile];
   372       // Load coordinates and charge
   373       float4 xyzq_j  = xyzq[jatomStart + wid];
   375       inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
   376       if (phase == 2) inp.initQj(xyzq_j.w);
   379       int nloopj = min(jatomSize - jatomStart, WARPSIZE);
   381       const bool diag_tile = (iatomStart == jatomStart);
   382       const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
   383       int t = (phase != 2 && diag_tile) ? 1 : 0;
   384       if (phase != 2 && diag_tile) {
   388       GBISResults<phase> resJ;
   391       for (;t < WARPSIZE;t++) {
   392         int j = (t + wid) & modval;
   394         float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
   395         float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
   396         float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
   398         float r2 = dx*dx + dy*dy + dz*dz;
   400         if (j < nloopj && wid < nloopi) {
   401           calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
   409       writeResult(jatomStart + wid, resJ, out, forces);
   414     writeResult(iatomStart + wid, resI, out, forces);
   418       typedef cub::WarpReduce<double> WarpReduceDouble;
   419       __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
   420       int warpId = threadIdx.x / WARPSIZE;
   421       volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
   422       if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
   430 // ##############################################################################################
   431 // ##############################################################################################
   432 // ##############################################################################################
   437 CudaComputeGBISKernel::CudaComputeGBISKernel(int deviceID) : deviceID(deviceID) {
   438   cudaCheck(cudaSetDevice(deviceID));
   463 CudaComputeGBISKernel::~CudaComputeGBISKernel() {
   464   cudaCheck(cudaSetDevice(deviceID));
   465   if (intRad0 != NULL) deallocate_device<float>(&intRad0);
   466   if (intRadS != NULL) deallocate_device<float>(&intRadS);
   467   if (psiSum != NULL) deallocate_device<float>(&psiSum);
   468   if (bornRad != NULL) deallocate_device<float>(&bornRad);
   469   if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
   470   if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
   474 // Update (copy Host->Device) intRad0, intRadS
   476 void CudaComputeGBISKernel::updateIntRad(const int atomStorageSize, float* intRad0H, float* intRadSH,
   477   cudaStream_t stream) {
   479   reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
   480   reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);
   482   copy_HtoD<float>(intRad0H, intRad0, atomStorageSize, stream);
   483   copy_HtoD<float>(intRadSH, intRadS, atomStorageSize, stream);
   487 // Update (copy Host->Device) bornRad
   489 void CudaComputeGBISKernel::updateBornRad(const int atomStorageSize, float* bornRadH, cudaStream_t stream) {
   490   reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
   491   copy_HtoD<float>(bornRadH, bornRad, atomStorageSize, stream);
   495 // Update (copy Host->Device) dHdrPrefix
   497 void CudaComputeGBISKernel::update_dHdrPrefix(const int atomStorageSize, float* dHdrPrefixH, cudaStream_t stream) {
   498   reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
   499   copy_HtoD<float>(dHdrPrefixH, dHdrPrefix, atomStorageSize, stream);
   505 void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize,
   506   const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum,
   507   cudaStream_t stream) {
   509   reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
   510   clear_device_array<float>(psiSum, atomStorageSize, stream);
   512   int nwarp = GBISKERNEL_NUM_WARP;
   513   int nthread = WARPSIZE*nwarp;
   514   int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);
   519   float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);
   521   GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>>
   522   (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
   523     tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
   524     param, intRad0, intRadS, NULL, psiSum, NULL, NULL);
   526   cudaCheck(cudaGetLastError());
   528   copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize, stream);
   534 void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize,
   535   const bool doEnergy, const bool doSlow,
   536   const float3 lata, const float3 latb, const float3 latc,
   537   const float r_cut, const float scaling, const float kappa, const float smoothDist,
   538   const float epsilon_p, const float epsilon_s,
   539   float4* d_forces, float* h_dEdaSum, cudaStream_t stream) {
   541   reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
   542   clear_device_array<float>(dEdaSum, atomStorageSize, stream);
   545     tlKernel.setTileListVirialEnergyGBISLength(tlKernel.getNumTileListsGBIS());
   548   int nwarp = GBISKERNEL_NUM_WARP;
   549   int nthread = WARPSIZE*nwarp;
   551   int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);
   554   param.r_cut2      = r_cut*r_cut;
   555   param.r_cut_2     = 1.f / param.r_cut2;
   556   param.r_cut_4     = 4.f*param.r_cut_2*param.r_cut_2;
   557   param.epsilon_s_i = 1.f / epsilon_s;
   558   param.epsilon_p_i = 1.f / epsilon_p;
   559   param.scaling     = scaling;
   561   param.smoothDist  = smoothDist;
   563 #define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \
   564      <<< nblock, nthread, 0, stream >>> \
   565     (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
   566       tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
   567       param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
   569   if (!doEnergy && !doSlow) CALL(false, false);
   570   if (!doEnergy &&  doSlow) CALL(false, true);
   571   if ( doEnergy && !doSlow) CALL(true,  false);
   572   if ( doEnergy &&  doSlow) CALL(true,  true);
   574   cudaCheck(cudaGetLastError());
   576   copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize, stream);
   582 void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize,
   583   const float3 lata, const float3 latb, const float3 latc, const float a_cut,
   584   float4* d_forces, cudaStream_t stream) {
   586   int nwarp = GBISKERNEL_NUM_WARP;
   587   int nthread = WARPSIZE*nwarp;
   588   int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);
   593   float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);
   595   GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>>
   596   (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
   597     tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
   598     param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);
   600   cudaCheck(cudaGetLastError());