2 #if __CUDACC_VER_MAJOR__ >= 11
     5 #include <namd_cub/cub.cuh>
     8 #include <hip/hip_runtime.h>
     9 #include <hipcub/hipcub.hpp>
    11 #endif // end NAMD_CUDA vs. NAMD_HIP
    13 #include "HipDefines.h"
    15 #include "ComputeGlobalMasterVirialCUDAKernel.h"
    17 #ifdef NODEGROUP_FORCE_REGISTER
    19 template <int BLOCK_SIZE>
// Kernel: accumulate the "global master" external net force and virial.
// For each atom i the virial contribution is the outer product
// f_global[i] (x) pos_i, where pos_i is the atom position mapped through
// lat.reverse_transform with its per-atom transform code (presumably undoing
// periodic-image wrapping; the lattice argument's declaration is on a line
// not visible in this excerpt -- TODO confirm against the full file).
// Each of the 12 scalar components (3 net-force + 9 virial) is summed across
// the thread block with cub::BlockReduce; thread 0 of every block then
// atomicAdd's the block partials into d_virial / d_extForce (double-precision
// atomicAdd: requires SM60+ on CUDA).  Blocks count themselves off via
// atomicInc on tbcatomic[0]; the last block to finish copies the device
// totals into the host-visible h_extVirial / h_extForce and (per the visible
// reset comments) clears the device-side accumulators and counter.
// NOTE(review): this excerpt is missing many interior lines (the bounds check
// on i, the __syncthreads() between BlockReduce invocations, the
// isLastBlockDone branch header, the reset statements, closing braces) --
// do not edit this excerpt as if it were complete.
    20 __global__ void computeGlobalMasterVirialKernel(
    22   CudaLocalRecord*                   localRecords,
  // Atom positions, structure-of-arrays layout (device memory).
    23   const double* __restrict d_pos_x,
    24   const double* __restrict d_pos_y,
    25   const double* __restrict d_pos_z,
  // Per-atom periodic-image transform codes consumed by reverse_transform.
    26   const char3* __restrict d_transform,
  // Per-atom external forces supplied by the global master.
    27   double*       __restrict f_global_x,
    28   double*       __restrict f_global_y,
    29   double*       __restrict f_global_z,
  // Accumulated net external force: device buffer and host-visible copy.
    30   double3*      __restrict d_extForce,
    31   double3*      __restrict h_extForce,
  // Accumulated external virial tensor: device buffer and host-visible copy.
    32   cudaTensor*   __restrict d_virial,
    33   cudaTensor*   __restrict h_extVirial,
  // Thread-block completion counter used to elect the last finishing block.
    35   unsigned int* __restrict tbcatomic
  // Per-thread partial sums, zeroed before the (not shown) bounds-checked body.
    38   double3 r_netForce = {0, 0, 0};
    40   r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
    41   r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
    42   r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;
    43   int totaltb = gridDim.x;
  // Flat global atom index; the guard (presumably `i < numAtoms`) is on a
  // line not shown in this excerpt -- confirm before editing.
    44   int i = threadIdx.x + blockIdx.x*blockDim.x;
    45   __shared__ bool isLastBlockDone;
  // Undo the atom's wrapping transform so the virial uses consistent
  // (un-wrapped) coordinates; `pos` / `lat` are defined in unseen lines.
    58     const char3 t = d_transform[i];
    59     pos_i = lat.reverse_transform(pos, t);
  // Virial contribution: outer product force (x) position, all 9 components.
    60     r_virial.xx = f_global_x[i] * pos_i.x;
    61     r_virial.xy = f_global_x[i] * pos_i.y;
    62     r_virial.xz = f_global_x[i] * pos_i.z;
    63     r_virial.yx = f_global_y[i] * pos_i.x;
    64     r_virial.yy = f_global_y[i] * pos_i.y;
    65     r_virial.yz = f_global_y[i] * pos_i.z;
    66     r_virial.zx = f_global_z[i] * pos_i.x;
    67     r_virial.zy = f_global_z[i] * pos_i.y;
    68     r_virial.zz = f_global_z[i] * pos_i.z;
    69     r_netForce.x = f_global_x[i];
    70     r_netForce.y = f_global_y[i];
    71     r_netForce.z = f_global_z[i];
  // Block-wide reductions.  temp_storage is shared between all 12 Sum()
  // calls; CUB requires a __syncthreads() between successive reuses of the
  // same TempStorage.  Those barrier lines fall in the numbering gaps of
  // this excerpt (77, 79, 81, ...) -- verify they exist in the full file.
    75   typedef cub::BlockReduce<double, BLOCK_SIZE> BlockReduce;
    76   __shared__ typename BlockReduce::TempStorage temp_storage;
    78   r_netForce.x = BlockReduce(temp_storage).Sum(r_netForce.x);
    80   r_netForce.y = BlockReduce(temp_storage).Sum(r_netForce.y);
    82   r_netForce.z = BlockReduce(temp_storage).Sum(r_netForce.z);
    85   r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
    87   r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
    89   r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
    92   r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
    94   r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
    96   r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
    99   r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
   101   r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
   103   r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
  // Thread 0 publishes this block's totals: one atomicAdd per component,
  // i.e. contention is per-block, not per-thread.
   106   if(threadIdx.x == 0){
   107     atomicAdd(&(d_virial->xx), r_virial.xx);
   108     atomicAdd(&(d_virial->xy), r_virial.xy);
   109     atomicAdd(&(d_virial->xz), r_virial.xz);
   111     atomicAdd(&(d_virial->yx), r_virial.yx);
   112     atomicAdd(&(d_virial->yy), r_virial.yy);
   113     atomicAdd(&(d_virial->yz), r_virial.yz);
   115     atomicAdd(&(d_virial->zx), r_virial.zx);
   116     atomicAdd(&(d_virial->zy), r_virial.zy);
   117     atomicAdd(&(d_virial->zz), r_virial.zz);
   119     atomicAdd(&(d_extForce->x), r_netForce.x);
   120     atomicAdd(&(d_extForce->y), r_netForce.y);
   121     atomicAdd(&(d_extForce->z), r_netForce.z);
  // Last-block election: atomicInc wraps at totaltb, so exactly one block
  // observes totaltb-1 and sets the shared flag (a __threadfence/__syncthreads
  // pairing is expected in the unseen surrounding lines -- confirm).
   124     unsigned int value = atomicInc(&tbcatomic[0], totaltb);
   125     isLastBlockDone = (value == (totaltb -1));
  // Epilogue run by the last finishing block (the isLastBlockDone test is in
  // unseen lines 126-129): copy device totals to the host-visible buffers.
   130     if(threadIdx.x == 0){
   131       h_extVirial->xx = d_virial->xx;
   132       h_extVirial->xy = d_virial->xy;
   133       h_extVirial->xz = d_virial->xz;
   134       h_extVirial->yx = d_virial->yx;
   135       h_extVirial->yy = d_virial->yy;
   136       h_extVirial->yz = d_virial->yz;
   137       h_extVirial->zx = d_virial->zx;
   138       h_extVirial->zy = d_virial->zy;
   139       h_extVirial->zz = d_virial->zz;
   141       //reset the device virial value
   152       h_extForce->x  = d_extForce->x;
   153       h_extForce->y  = d_extForce->y;
   154       h_extForce->z  = d_extForce->z;
   158       //resets atomic counter
   165 void computeGlobalMasterVirial(
// Host-side launcher for computeGlobalMasterVirialKernel: one thread per
// atom, fixed 128-thread blocks, ceiling-divided grid, enqueued on the
// caller's stream.  NOTE(review): this excerpt omits several parameter lines
// (e.g. the Lattice, numAtoms and cudaStream_t arguments referenced below),
// the kernel's argument list after the <<<...>>> launch, and the function's
// closing brace -- confirm against the full file before editing.
   166   const int                numPatches,
   168   CudaLocalRecord*         localRecords,
  // Device atom positions (SoA) and per-atom transform codes.
   169   const double* __restrict d_pos_x,
   170   const double* __restrict d_pos_y,
   171   const double* __restrict d_pos_z,
   172   const char3* __restrict d_transform,
  // Per-atom external forces from the global master.
   173   double*       __restrict f_global_x,
   174   double*       __restrict f_global_y,
   175   double*       __restrict f_global_z,
  // Device accumulators and their host-visible counterparts.
   176   double3*      __restrict d_extForce,
   177   double3*      __restrict h_extForce,
   178   cudaTensor*   __restrict d_virial,
   179   cudaTensor*   __restrict h_virial,
  // Thread-block completion counter consumed by the kernel's last-block logic.
   181   unsigned int* __restrict d_tbcatomic,
  // Launch configuration: the BLOCK_SIZE template argument must match the
  // block dimension, so atom_blocks is passed as both.
   186   const int atom_blocks = 128;
   187   const int grid = (numAtoms + atom_blocks - 1) / atom_blocks;
   188   computeGlobalMasterVirialKernel<atom_blocks><<<grid, atom_blocks, 0, stream>>>(
   207 #endif // NODEGROUP_FORCE_REGISTER