#include "CudaGlobalMasterClientKernel.h"

#ifdef NAMD_HIP
#include <hipcub/hipcub.hpp>
#define cub hipcub
#else
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif

#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)

// Threads per block for the per-atom reduction kernel below.
#define ATOM_BLOCKS 128

// Computes per-atom virial contributions (outer product of applied force and
// position) and the net applied force, reduces them per block, and atomically
// accumulates the results into *d_virial and *d_extForce.
__global__ void clientVirialAndExtForceKernel(double *d_pos,
                                              double *d_applied_force,
                                              size_t numAtoms, size_t stride,
                                              cudaTensor *d_virial,
                                              Vector *d_extForce) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor virial = {0};
  double3 f = {0, 0, 0};
  double3 pos = {0, 0, 0};
  if (i < numAtoms) {
    // x, y and z components live in three contiguous blocks of `stride` elements
    f.x = d_applied_force[i];
    f.y = d_applied_force[i + stride];
    f.z = d_applied_force[i + 2 * stride];
    pos.x = d_pos[i];
    pos.y = d_pos[i + stride];
    pos.z = d_pos[i + 2 * stride];
    // Per-atom virial contribution: the outer product f * pos^T
    virial.xx = f.x * pos.x;
    virial.xy = f.x * pos.y;
    virial.xz = f.x * pos.z;
    virial.yx = f.y * pos.x;
    virial.yy = f.y * pos.y;
    virial.yz = f.y * pos.z;
    virial.zx = f.z * pos.x;
    virial.zy = f.z * pos.y;
    virial.zz = f.z * pos.z;
  }
  // Block-wide reductions; temp_storage is reused, so the block must be
  // synchronized between successive reductions.
  typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  virial.xx = BlockReduce(temp_storage).Sum(virial.xx);
  __syncthreads();
  virial.xy = BlockReduce(temp_storage).Sum(virial.xy);
  __syncthreads();
  virial.xz = BlockReduce(temp_storage).Sum(virial.xz);
  __syncthreads();
  virial.yx = BlockReduce(temp_storage).Sum(virial.yx);
  __syncthreads();
  virial.yy = BlockReduce(temp_storage).Sum(virial.yy);
  __syncthreads();
  virial.yz = BlockReduce(temp_storage).Sum(virial.yz);
  __syncthreads();
  virial.zx = BlockReduce(temp_storage).Sum(virial.zx);
  __syncthreads();
  virial.zy = BlockReduce(temp_storage).Sum(virial.zy);
  __syncthreads();
  virial.zz = BlockReduce(temp_storage).Sum(virial.zz);
  __syncthreads();
  f.x = BlockReduce(temp_storage).Sum(f.x);
  __syncthreads();
  f.y = BlockReduce(temp_storage).Sum(f.y);
  __syncthreads();
  f.z = BlockReduce(temp_storage).Sum(f.z);
  // cub::BlockReduce returns the block-wide sum only on thread 0, which
  // accumulates it into the global buffers shared by all blocks.
  if (threadIdx.x == 0) {
    atomicAdd(&(d_virial->xx), virial.xx);
    atomicAdd(&(d_virial->xy), virial.xy);
    atomicAdd(&(d_virial->xz), virial.xz);
    atomicAdd(&(d_virial->yx), virial.yx);
    atomicAdd(&(d_virial->yy), virial.yy);
    atomicAdd(&(d_virial->yz), virial.yz);
    atomicAdd(&(d_virial->zx), virial.zx);
    atomicAdd(&(d_virial->zy), virial.zy);
    atomicAdd(&(d_virial->zz), virial.zz);
    atomicAdd(&(d_extForce->x), f.x);
    atomicAdd(&(d_extForce->y), f.y);
    atomicAdd(&(d_extForce->z), f.z);
  }
}

void clientVirialAndExtForce(double *d_pos, double *d_applied_force,
                             size_t numAtoms, cudaTensor *d_virial,
                             Vector *d_extForce, cudaStream_t stream) {
  if (numAtoms == 0) return;
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  // d_pos and d_applied_force store x/y/z components in contiguous blocks of
  // numAtoms elements, so the stride passed to the kernel is numAtoms.
  clientVirialAndExtForceKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
      d_pos, d_applied_force, numAtoms, numAtoms, d_virial, d_extForce);
}

#endif // (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)
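
// Illustrative usage sketch (not part of the NAMD build, hence guarded out):
// shows the buffer layout and zero-initialization the wrapper above assumes.
// The helper name and the way the buffers are obtained here are hypothetical;
// in NAMD the positions, applied forces and accumulators come from the
// client's device-side bookkeeping.
#if 0
static void exampleClientVirialLaunch(double *d_pos, double *d_applied_force,
                                      size_t numAtoms, cudaStream_t stream) {
  // Single-element accumulators; the kernel only adds into them, so they
  // must start from zero.
  cudaTensor *d_virial = nullptr;
  Vector *d_extForce = nullptr;
  cudaMalloc(&d_virial, sizeof(cudaTensor));
  cudaMalloc(&d_extForce, sizeof(Vector));
  cudaMemsetAsync(d_virial, 0, sizeof(cudaTensor), stream);
  cudaMemsetAsync(d_extForce, 0, sizeof(Vector), stream);
  // d_pos / d_applied_force layout: [x0..xN-1 | y0..yN-1 | z0..zN-1]
  clientVirialAndExtForce(d_pos, d_applied_force, numAtoms, d_virial,
                          d_extForce, stream);
  // Retrieve the reduced virial and net external force on the host.
  cudaStreamSynchronize(stream);
  cudaTensor h_virial;
  Vector h_extForce;
  cudaMemcpy(&h_virial, d_virial, sizeof(cudaTensor), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_extForce, d_extForce, sizeof(Vector), cudaMemcpyDeviceToHost);
  cudaFree(d_virial);
  cudaFree(d_extForce);
}
#endif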