#include "ComputeLonepairsCUDA.h"
#include "ComputeLonepairsCUDAKernel.h"

#ifdef NAMD_HIP
#include <hipcub/hipcub.hpp>
// Map the cub namespace to hipcub for HIP builds
#define cub hipcub
#else
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
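
// Lone pairs are massless virtual sites: their coordinates are rebuilt from
// their host atoms every step, and any force accumulated on them must be
// pushed back onto the hosts. The kernels below implement both halves for
// the three lone pair types handled here (relative, bisector, colinear).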
// See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
__global__ void repositionRelativeKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairRelative* __restrict__ d_lprelative_list,
  size_t lprelative_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lprelative_list_size) {
    const int i = d_lprelative_list[tid].i_soaid;
    const int j = d_lprelative_list[tid].j_soaid;
    const int k = d_lprelative_list[tid].k_soaid;
    const int l = d_lprelative_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision.  Cost is not excessive.
    const double dcosa = d_lprelative_list[tid].dcosa;
    const double dsinacost = d_lprelative_list[tid].dsinacost;
    const double dsinasint = d_lprelative_list[tid].dsinasint;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    // Build a right-handed orthonormal frame (a, b, c) from the hosts:
    // a points from k to j, c is normal to the j-k-l plane, b completes it
    Vector a = rj - rk;
    Vector b = rl - rk;
    BigReal invlen = rnorm3d(a.x, a.y, a.z);
    a *= invlen;
    Vector c = cross(b, a);
    invlen = rnorm3d(c.x, c.y, c.z);
    c *= invlen;
    b = cross(c, a);
    // dcosa, dsinacost and dsinasint encode the distance, angle and dihedral
    // of the lone pair definition, precomputed on the host side
    const Vector ri = rj + dcosa * a + dsinacost * b + dsinasint * c;
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionRelative(
  double* d_pos_x,
  double* d_pos_y,
  double* d_pos_z,
  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
  size_t lprelative_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lprelative_list_size + block_size - 1) / block_size;
  repositionRelativeKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lprelative_list, lprelative_list_size);
}
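
// A bisector lone pair is placed like a relative one, except that the frame
// is built from j and the midpoint m = (rk + rl)/2 of the other two hosts,
// i.e. with host order (j, m, k) instead of (j, k, l).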
__global__ void repositionBisectorKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairBisector* __restrict__ d_lpbisector_list,
  size_t lpbisector_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lpbisector_list_size) {
    const int i = d_lpbisector_list[tid].i_soaid;
    const int j = d_lpbisector_list[tid].j_soaid;
    const int k = d_lpbisector_list[tid].k_soaid;
    const int l = d_lpbisector_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision.  Cost is not excessive.
    const double dcosa = d_lpbisector_list[tid].dcosa;
    const double dsinacost = d_lpbisector_list[tid].dsinacost;
    const double dsinasint = d_lpbisector_list[tid].dsinasint;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    // The midpoint of l and k
    const Vector v = 0.5 * (rl + rk);
    // Same frame construction as the relative case, with v in place of rk
    Vector a = rj - v;
    Vector b = rk - v;
    BigReal invlen = rnorm3d(a.x, a.y, a.z);
    a *= invlen;
    Vector c = cross(b, a);
    invlen = rnorm3d(c.x, c.y, c.z);
    c *= invlen;
    b = cross(c, a);
    const Vector ri = rj + dcosa * a + dsinacost * b + dsinasint * c;
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionBisector(
  double* d_pos_x,
  double* d_pos_y,
  double* d_pos_z,
  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
  size_t lpbisector_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
  repositionBisectorKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lpbisector_list, lpbisector_list_size);
}
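
// A colinear lone pair i sits on the line through its two hosts k and j:
//   ri = rj + scale_factor*(rj - rk) + distance*(rj - rk)/|rj - rk|
// i.e. a fixed fractional offset plus a fixed absolute offset along k->j.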
__global__ void repositionColinearKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
  size_t lpcolinear_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lpcolinear_list_size) {
    const int i = d_lpcolinear_list[tid].i_soaid;
    const int j = d_lpcolinear_list[tid].j_soaid;
    const int k = d_lpcolinear_list[tid].k_soaid;
    const double distance = d_lpcolinear_list[tid].distance;
    const double scale_factor = d_lpcolinear_list[tid].scale_factor;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rkj = rj - rk;
    const double inv_r = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
    const Vector ri = rj + (scale_factor + distance * inv_r) * rkj;
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionColinear(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
  size_t lpcolinear_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
  repositionColinearKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lpcolinear_list, lpcolinear_list_size);
}
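
// Redistributing lone pair forces: a lone pair carries no mass, so any force
// accumulated on it must be transferred to its hosts in a way that preserves
// the net force and the net torque. For the three-host types this splits fi
// into a radial part along rji (given to j), a torsional part normal to the
// ijk plane (spread over j, k and l via the torque about the jk axis), and
// the remaining angular part (split between j and k).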
// See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
// Multiple forces (one per MTS level) are projected at once.
template <int numForces>
__inline__ __device__ void projectRelativeForces(
  const Vector& ri,
  const Vector& rj,
  const Vector& rk,
  const Vector& rl,
  const Vector (&fi)[numForces],
  Vector (&fj)[numForces],
  Vector (&fk)[numForces],
  Vector (&fl)[numForces]) {
  const Vector rji = ri - rj;
  const Vector rjk = rk - rj;
  const Vector rkl = rl - rk;
  const Vector rki = ri - rk;
  const double inv_rjk_norm2 = 1.0 / rjk.length2();
  const Vector plane_jkl = cross(rjk, rkl);
  // Project the force onto rji to keep the i-j distance unchanged
  const double rji_norm2 = rji.length2();
  const double inv_rij_norm2 = 1.0 / rji_norm2;
  // Skip the degenerate case of a lone pair coinciding with its host j
  if (rji_norm2 > 0) {
#pragma unroll
    for (int m = 0; m < numForces; ++m) {
      fj[m] += inv_rij_norm2 * fi[m].dot(rji) * rji;
    }
  }
  // Project the force on the dihedral angle to keep it unchanged
  Vector fp[numForces];
#pragma unroll
  for (int m = 0; m < numForces; ++m) {
    fp[m] = Vector{0.0, 0.0, 0.0};
  }
  // Get the normal vector of plane ikj
  Vector normal_ikj = cross(rji, rjk);
  const double normal_ikj_norm = normal_ikj.length();
  // The arm of the force on plane ikj (v2 + v3)
  const Vector hijk_ri = rji - (rji.dot(rjk) * inv_rjk_norm2 * rjk);
  // Only project the force onto plane ijk if i, j, k are not colinear
  // (if rji and rjk are colinear then rji x rjk is zero)
  if (normal_ikj_norm > 1e-6) {
    normal_ikj /= normal_ikj_norm;
    // Force on plane ikj
#pragma unroll
    for (int m = 0; m < numForces; ++m) {
      fp[m] = fi[m].dot(normal_ikj) * normal_ikj;
    }
  }
  // Torque of the force on l after projecting the force on plane ijk to l
  const Vector dir_l_to_jk = cross(plane_jkl, rjk).unit();
  const Vector h_l = rkl.dot(dir_l_to_jk) * dir_l_to_jk;
  const double inv_h_l_norm2 = 1.0 / h_l.length2();
#pragma unroll
  for (int m = 0; m < numForces; ++m) {
    // Torque of the torsional force about the jk axis
    const Vector torque_p = cross(hijk_ri, fp[m]);
    // Equivalent force on l producing the same torque
    const Vector fpl = inv_h_l_norm2 * cross(torque_p, h_l);
    // (a) The remaining force on j and k after subtracting the force on l from the torsional force
    const Vector fj_prime = inv_rjk_norm2 * cross(cross(rki, fp[m]) - cross(rkl, fpl), -rjk);
    const Vector fk_prime = fp[m] - fpl - fj_prime;
    // (b) The remaining angular force
    const Vector fai = fi[m] - fp[m] - fj[m];
    // Sum the torsional force (a) on j and k
    fj[m] += fj_prime;
    fk[m] += fk_prime;
    // Torque of the angular force (b) on k
    const Vector torque_k = cross(rji, fai);
    const Vector fak = cross(torque_k, rjk) * inv_rjk_norm2;
    // Split the angular force between k and j, and give l its torsional share
    fk[m] += fak;
    fj[m] += fai - fak;
    fl[m] += fpl;
  }
}

enum class LonepairThreeHostsType {Relative, Bisector};
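
// The three force arrays below correspond to the MTS force levels
// (normal, nbond, slow). maxForceNumer selects the highest active level,
// and every switch on it intentionally falls through so that level N also
// processes all levels below it.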
template <typename LplistT, int maxForceNumer, bool doVirial, int blockSize, LonepairThreeHostsType lptype>
__global__ void redistributeForceThreeHostsKernel(
  double* __restrict__ d_f_normal_x,
  double* __restrict__ d_f_normal_y,
  double* __restrict__ d_f_normal_z,
  double* __restrict__ d_f_nbond_x,
  double* __restrict__ d_f_nbond_y,
  double* __restrict__ d_f_nbond_z,
  double* __restrict__ d_f_slow_x,
  double* __restrict__ d_f_slow_y,
  double* __restrict__ d_f_slow_z,
  cudaTensor* __restrict__ d_virial_normal,
  cudaTensor* __restrict__ d_virial_nbond,
  cudaTensor* __restrict__ d_virial_slow,
  const double* __restrict__ d_pos_x,
  const double* __restrict__ d_pos_y,
  const double* __restrict__ d_pos_z,
  const LplistT* __restrict__ d_lp_list,
  size_t lp_list_size) {
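  // One thread handles one lone pair; out-of-range threads still take part
  // in the collective virial reduction at the end of the kernel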
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_slow, vir_nbond, vir_normal;
  if (doVirial) {
    // Zero the accumulators for every active force level
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: vir_slow.xx = vir_slow.xy = vir_slow.xz =
              vir_slow.yx = vir_slow.yy = vir_slow.yz =
              vir_slow.zx = vir_slow.zy = vir_slow.zz = 0.0;
      case 1: vir_nbond.xx = vir_nbond.xy = vir_nbond.xz =
              vir_nbond.yx = vir_nbond.yy = vir_nbond.yz =
              vir_nbond.zx = vir_nbond.zy = vir_nbond.zz = 0.0;
      case 0: vir_normal.xx = vir_normal.xy = vir_normal.xz =
              vir_normal.yx = vir_normal.yy = vir_normal.yz =
              vir_normal.zx = vir_normal.zy = vir_normal.zz = 0.0;
    }
  }
  if (tid < lp_list_size) {
    const int i = d_lp_list[tid].i_soaid;
    const int j = d_lp_list[tid].j_soaid;
    const int k = d_lp_list[tid].k_soaid;
    const int l = d_lp_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision.  Cost is not excessive.
    const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    Vector fi[maxForceNumer+1];
    Vector fj[maxForceNumer+1];
    Vector fk[maxForceNumer+1];
    Vector fl[maxForceNumer+1];
#pragma unroll
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      fj[m] = Vector{0.0, 0.0, 0.0};
      fk[m] = Vector{0.0, 0.0, 0.0};
      fl[m] = Vector{0.0, 0.0, 0.0};
    }
    // Load the accumulated forces on the lone pair for each active level
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
      case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
      case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
    }
    // Dispatch on the lone pair type at compile time
    switch (lptype) {
      case LonepairThreeHostsType::Relative: {
        projectRelativeForces<maxForceNumer+1>(ri, rj, rk, rl, fi, fj, fk, fl);
        break;
      }
      case LonepairThreeHostsType::Bisector: {
        // The difference between the bisector and the relative cases is
        // that the order of the vertices of the former is changed to ijmk
        // from ijkl, so we can just reuse the function.
        const Vector rm = 0.5 * (rk + rl);
        // Store the force on m in fl by rotating the fl and fk buffers
        projectRelativeForces<maxForceNumer+1>(ri, rj, rm, rk, fi, fj, fl, fk);
        // fk and fl actually store the forces on m and k, respectively.
        // The real force on k is 0.5 * fl + fk,
        // and the real force on l is 0.5 * fl.
#pragma unroll
        for (int m = 0; m < maxForceNumer + 1; ++m) {
          fk[m] += 0.5 * fl[m];
          fl[m] = 0.5 * fl[m];
        }
        break;
      }
    }
    // Add the forces back to j, k and l
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        atomicAdd(&(d_f_slow_x[j]), fj[2].x);
        atomicAdd(&(d_f_slow_y[j]), fj[2].y);
        atomicAdd(&(d_f_slow_z[j]), fj[2].z);
        atomicAdd(&(d_f_slow_x[k]), fk[2].x);
        atomicAdd(&(d_f_slow_y[k]), fk[2].y);
        atomicAdd(&(d_f_slow_z[k]), fk[2].z);
        atomicAdd(&(d_f_slow_x[l]), fl[2].x);
        atomicAdd(&(d_f_slow_y[l]), fl[2].y);
        atomicAdd(&(d_f_slow_z[l]), fl[2].z);
      case 1:
        atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
        atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
        atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
        atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
        atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
        atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
        atomicAdd(&(d_f_nbond_x[l]), fl[1].x);
        atomicAdd(&(d_f_nbond_y[l]), fl[1].y);
        atomicAdd(&(d_f_nbond_z[l]), fl[1].z);
      case 0:
        atomicAdd(&(d_f_normal_x[j]), fj[0].x);
        atomicAdd(&(d_f_normal_y[j]), fj[0].y);
        atomicAdd(&(d_f_normal_z[j]), fj[0].z);
        atomicAdd(&(d_f_normal_x[k]), fk[0].x);
        atomicAdd(&(d_f_normal_y[k]), fk[0].y);
        atomicAdd(&(d_f_normal_z[k]), fk[0].z);
        atomicAdd(&(d_f_normal_x[l]), fl[0].x);
        atomicAdd(&(d_f_normal_y[l]), fl[0].y);
        atomicAdd(&(d_f_normal_z[l]), fl[0].z);
    }
    // Compute the virial contribution after redistributing the forces
    if (doVirial) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x + fl[2].x * rl.x - fi[2].x * ri.x;
          vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y + fl[2].x * rl.y - fi[2].x * ri.y;
          vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z + fl[2].x * rl.z - fi[2].x * ri.z;
          vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x + fl[2].y * rl.x - fi[2].y * ri.x;
          vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y + fl[2].y * rl.y - fi[2].y * ri.y;
          vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z + fl[2].y * rl.z - fi[2].y * ri.z;
          vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x + fl[2].z * rl.x - fi[2].z * ri.x;
          vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y + fl[2].z * rl.y - fi[2].z * ri.y;
          vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z + fl[2].z * rl.z - fi[2].z * ri.z;
        case 1:
          vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x + fl[1].x * rl.x - fi[1].x * ri.x;
          vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y + fl[1].x * rl.y - fi[1].x * ri.y;
          vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z + fl[1].x * rl.z - fi[1].x * ri.z;
          vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x + fl[1].y * rl.x - fi[1].y * ri.x;
          vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y + fl[1].y * rl.y - fi[1].y * ri.y;
          vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z + fl[1].y * rl.z - fi[1].y * ri.z;
          vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x + fl[1].z * rl.x - fi[1].z * ri.x;
          vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y + fl[1].z * rl.y - fi[1].z * ri.y;
          vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z + fl[1].z * rl.z - fi[1].z * ri.z;
        case 0:
          vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x + fl[0].x * rl.x - fi[0].x * ri.x;
          vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y + fl[0].x * rl.y - fi[0].x * ri.y;
          vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z + fl[0].x * rl.z - fi[0].x * ri.z;
          vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x + fl[0].y * rl.x - fi[0].y * ri.x;
          vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y + fl[0].y * rl.y - fi[0].y * ri.y;
          vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z + fl[0].y * rl.z - fi[0].y * ri.z;
          vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x + fl[0].z * rl.x - fi[0].z * ri.x;
          vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y + fl[0].z * rl.y - fi[0].z * ri.y;
          vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z + fl[0].z * rl.z - fi[0].z * ri.z;
      }
    }
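    // (Each term above is an outer-product component f⊗r: the redistributed
    // forces now act at the host positions rj, rk, rl while fi no longer
    // acts at ri, so only the net correction to the virial is accumulated.)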
    // Clean the forces on the lone pair
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        d_f_slow_x[i] = 0.0;
        d_f_slow_y[i] = 0.0;
        d_f_slow_z[i] = 0.0;
      case 1:
        d_f_nbond_x[i] = 0.0;
        d_f_nbond_y[i] = 0.0;
        d_f_nbond_z[i] = 0.0;
      case 0:
        d_f_normal_x[i] = 0.0;
        d_f_normal_y[i] = 0.0;
        d_f_normal_z[i] = 0.0;
    }
  }
  // Block-wide reduction of the virial: BlockReduce is a collective, so it
  // must run outside the per-thread bounds check above
  typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
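  // temp_storage is reused for successive reductions; cub requires a
  // barrier between reuses of the same storage, hence the __syncthreads()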
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      case 1:
        vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      case 0:
        vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    }
    if (threadIdx.x == 0) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
          atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
          atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
          atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
          atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
          atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
          atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
          atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
          atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
        case 1:
          atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
          atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
          atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
          atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
          atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
          atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
          atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
          atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
          atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
        case 0:
          atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
          atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
          atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
          atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
          atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
          atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
          atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
          atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
          atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
      }
    }
  }
}

void redistributeForceRelative(
  double*        d_f_normal_x,
  double*        d_f_normal_y,
  double*        d_f_normal_z,
  double*        d_f_nbond_x,
  double*        d_f_nbond_y,
  double*        d_f_nbond_z,
  double*        d_f_slow_x,
  double*        d_f_slow_y,
  double*        d_f_slow_z,
  cudaTensor*    d_virial_normal,
  cudaTensor*    d_virial_nbond,
  cudaTensor*    d_virial_slow,
  const double*  d_pos_x,
  const double*  d_pos_y,
  const double*  d_pos_z,
  const int      maxForceNumber,
  const int      doVirial,
  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
  size_t lprelative_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lprelative_list_size + block_size - 1) / block_size;
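  // Instantiate the right template specialization: pack maxForceNumber into
  // bits 0-1 and doVirial into bit 2 so a single switch selects the kernel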
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairRelative, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Relative> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lprelative_list, lprelative_list_size \
  );
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceRelative: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
    }
  }
#undef CALL
}

void redistributeForceBisector(
  double*        d_f_normal_x,
  double*        d_f_normal_y,
  double*        d_f_normal_z,
  double*        d_f_nbond_x,
  double*        d_f_nbond_y,
  double*        d_f_nbond_z,
  double*        d_f_slow_x,
  double*        d_f_slow_y,
  double*        d_f_slow_z,
  cudaTensor*    d_virial_normal,
  cudaTensor*    d_virial_nbond,
  cudaTensor*    d_virial_slow,
  const double*  d_pos_x,
  const double*  d_pos_y,
  const double*  d_pos_z,
  const int      maxForceNumber,
  const int      doVirial,
  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
  size_t lpbisector_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairBisector, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Bisector> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lpbisector_list, lpbisector_list_size \
  );
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceBisector: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
    }
  }
#undef CALL
}

template <int maxForceNumer, bool doVirial, int blockSize>
__global__ void redistributeForceColinearKernel(
  double* __restrict__ d_f_normal_x,
  double* __restrict__ d_f_normal_y,
  double* __restrict__ d_f_normal_z,
  double* __restrict__ d_f_nbond_x,
  double* __restrict__ d_f_nbond_y,
  double* __restrict__ d_f_nbond_z,
  double* __restrict__ d_f_slow_x,
  double* __restrict__ d_f_slow_y,
  double* __restrict__ d_f_slow_z,
  cudaTensor* __restrict__ d_virial_normal,
  cudaTensor* __restrict__ d_virial_nbond,
  cudaTensor* __restrict__ d_virial_slow,
  const double* __restrict__ d_pos_x,
  const double* __restrict__ d_pos_y,
  const double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lp_list,
  size_t lp_list_size) {
   619   cudaTensor vir_slow, vir_nbond, vir_normal;
   621     switch (maxForceNumer) {
   622       // Intentionally fall-through
   658   if (tid < lp_list_size) {
    const int i = d_lp_list[tid].i_soaid;
    const int j = d_lp_list[tid].j_soaid;
    const int k = d_lp_list[tid].k_soaid;
    double distance = d_lp_list[tid].distance;
    const double scale_factor = d_lp_list[tid].scale_factor;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rkj = rj - rk;
    const double inv_rkj_norm = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
    // Convert the absolute offset into a fraction of |rkj|
    distance *= inv_rkj_norm;
    // Prepare the force buffers
    Vector fi[maxForceNumer+1];
    Vector fj[maxForceNumer+1];
    Vector fk[maxForceNumer+1];
#pragma unroll
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      fj[m] = Vector{0.0, 0.0, 0.0};
      fk[m] = Vector{0.0, 0.0, 0.0};
    }
    // Load the accumulated forces on the lone pair for each active level
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
      case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
      case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
    }
    // Project the forces
#pragma unroll
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      const double fdot = distance * (fi[m].dot(rkj)) * inv_rkj_norm * inv_rkj_norm;
      fj[m] = (1.0 + scale_factor + distance) * fi[m] - fdot * rkj;
      fk[m] = fi[m] - fj[m];
    }
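    // (Chain rule for ri = rj + (scale_factor + u)*(rj - rk), with
    // u = distance/|rkj| as computed above:
    //   fj = (1 + scale_factor + u)*fi - u*(fi.rkj)/|rkj|^2 * rkj
    // and fk = fi - fj, which preserves the net force on the system.)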
    // Add the forces back
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        atomicAdd(&(d_f_slow_x[j]), fj[2].x);
        atomicAdd(&(d_f_slow_y[j]), fj[2].y);
        atomicAdd(&(d_f_slow_z[j]), fj[2].z);
        atomicAdd(&(d_f_slow_x[k]), fk[2].x);
        atomicAdd(&(d_f_slow_y[k]), fk[2].y);
        atomicAdd(&(d_f_slow_z[k]), fk[2].z);
      case 1:
        atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
        atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
        atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
        atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
        atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
        atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
      case 0:
        atomicAdd(&(d_f_normal_x[j]), fj[0].x);
        atomicAdd(&(d_f_normal_y[j]), fj[0].y);
        atomicAdd(&(d_f_normal_z[j]), fj[0].z);
        atomicAdd(&(d_f_normal_x[k]), fk[0].x);
        atomicAdd(&(d_f_normal_y[k]), fk[0].y);
        atomicAdd(&(d_f_normal_z[k]), fk[0].z);
    }
    // Compute the virial contribution after redistributing the forces
    if (doVirial) {
      const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x - fi[2].x * ri.x;
          vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y - fi[2].x * ri.y;
          vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z - fi[2].x * ri.z;
          vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x - fi[2].y * ri.x;
          vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y - fi[2].y * ri.y;
          vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z - fi[2].y * ri.z;
          vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x - fi[2].z * ri.x;
          vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y - fi[2].z * ri.y;
          vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z - fi[2].z * ri.z;
        case 1:
          vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x - fi[1].x * ri.x;
          vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y - fi[1].x * ri.y;
          vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z - fi[1].x * ri.z;
          vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x - fi[1].y * ri.x;
          vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y - fi[1].y * ri.y;
          vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z - fi[1].y * ri.z;
          vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x - fi[1].z * ri.x;
          vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y - fi[1].z * ri.y;
          vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z - fi[1].z * ri.z;
        case 0:
          vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x - fi[0].x * ri.x;
          vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y - fi[0].x * ri.y;
          vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z - fi[0].x * ri.z;
          vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x - fi[0].y * ri.x;
          vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y - fi[0].y * ri.y;
          vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z - fi[0].y * ri.z;
          vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x - fi[0].z * ri.x;
          vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y - fi[0].z * ri.y;
          vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z - fi[0].z * ri.z;
      }
    }
    // Clean the forces on the lone pair
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        d_f_slow_x[i] = 0.0;
        d_f_slow_y[i] = 0.0;
        d_f_slow_z[i] = 0.0;
      case 1:
        d_f_nbond_x[i] = 0.0;
        d_f_nbond_y[i] = 0.0;
        d_f_nbond_z[i] = 0.0;
      case 0:
        d_f_normal_x[i] = 0.0;
        d_f_normal_y[i] = 0.0;
        d_f_normal_z[i] = 0.0;
    }
  }
  // Collective reduction: must run outside the per-thread bounds check
  typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads(); // temp_storage is reused below
      case 1:
        vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads(); // temp_storage is reused below
      case 0:
        vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    }
    if (threadIdx.x == 0) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
          atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
          atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
          atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
          atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
          atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
          atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
          atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
          atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
        case 1:
          atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
          atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
          atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
          atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
          atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
          atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
          atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
          atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
          atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
        case 0:
          atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
          atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
          atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
          atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
          atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
          atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
          atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
          atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
          atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
      }
    }
  }
}

void redistributeForceColinear(
  double*        d_f_normal_x,
  double*        d_f_normal_y,
  double*        d_f_normal_z,
  double*        d_f_nbond_x,
  double*        d_f_nbond_y,
  double*        d_f_nbond_z,
  double*        d_f_slow_x,
  double*        d_f_slow_y,
  double*        d_f_slow_z,
  cudaTensor*    d_virial_normal,
  cudaTensor*    d_virial_nbond,
  cudaTensor*    d_virial_slow,
  const double*  d_pos_x,
  const double*  d_pos_y,
  const double*  d_pos_z,
  const int      maxForceNumber,
  const int      doVirial,
  const ComputeLonepairsCUDA::LonepairColinear* d_lpcolinear_list,
  size_t lpcolinear_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceColinearKernel<MAXFORCENUMBER, DOVIRIAL, block_size> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lpcolinear_list, lpcolinear_list_size);
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceColinear: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
    }
  }
#undef CALL
}

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)