NAMD
ComputeLonepairsCUDAKernel.cu
#include "ComputeLonepairsCUDA.h"
#include "ComputeLonepairsCUDAKernel.h"
#include "Vector.h"
#include <string>

#ifdef NAMD_HIP
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // NAMD_HIP

#ifdef NAMD_CUDA
#include <cuda.h>
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif // NAMD_CUDA

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
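// Place each lone pair i from its three hosts (j, k, l): build a local frame
// from rj, rk, rl and offset rj by the precomputed distance/angle/dihedral
// factors dcosa, dsinacost and dsinasint. One thread handles one lone pair.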
__global__ void repositionRelativeKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairRelative* __restrict__ d_lprelative_list,
  size_t lprelative_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lprelative_list_size) {
    const int i = d_lprelative_list[tid].i_soaid;
    const int j = d_lprelative_list[tid].j_soaid;
    const int k = d_lprelative_list[tid].k_soaid;
    const int l = d_lprelative_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
    const double dcosa = d_lprelative_list[tid].dcosa;
    const double dsinacost = d_lprelative_list[tid].dsinacost;
    const double dsinasint = d_lprelative_list[tid].dsinasint;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    const Vector v = rk;
    Vector a = v - rj;
    Vector b = rl - v;
    BigReal invlen = rnorm3d(a.x, a.y, a.z);
    a *= invlen;
    Vector c = cross(b, a);
    invlen = rnorm3d(c.x, c.y, c.z);
    c *= invlen;
    b = cross(a, c);
    const Vector ri = rj + dcosa * a + dsinacost * b + dsinasint * c;
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionRelative(
  double* d_pos_x,
  double* d_pos_y,
  double* d_pos_z,
  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
  size_t lprelative_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lprelative_list_size + block_size - 1) / block_size;
  repositionRelativeKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lprelative_list, lprelative_list_size);
}

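// Same construction as repositionRelativeKernel, except that the frame is
// anchored at the midpoint of hosts k and l instead of at rk (bisector case).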
__global__ void repositionBisectorKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairBisector* __restrict__ d_lpbisector_list,
  size_t lpbisector_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lpbisector_list_size) {
    const int i = d_lpbisector_list[tid].i_soaid;
    const int j = d_lpbisector_list[tid].j_soaid;
    const int k = d_lpbisector_list[tid].k_soaid;
    const int l = d_lpbisector_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
    const double dcosa = d_lpbisector_list[tid].dcosa;
    const double dsinacost = d_lpbisector_list[tid].dsinacost;
    const double dsinasint = d_lpbisector_list[tid].dsinasint;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    // The middle point of l and k
    const Vector v = 0.5 * (rl + rk);
    Vector a = v - rj;
    Vector b = rl - v;
    BigReal invlen = rnorm3d(a.x, a.y, a.z);
    a *= invlen;
    Vector c = cross(b, a);
    invlen = rnorm3d(c.x, c.y, c.z);
    c *= invlen;
    b = cross(a, c);
    const Vector ri = rj + dcosa * a + dsinacost * b + dsinasint * c;
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionBisector(
  double* d_pos_x,
  double* d_pos_y,
  double* d_pos_z,
  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
  size_t lpbisector_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
  repositionBisectorKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lpbisector_list, lpbisector_list_size);
}

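// Colinear lone pair: place i on the k->j axis at
// rj + (scale_factor + distance / |rj - rk|) * (rj - rk).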
__global__ void repositionColinearKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
  size_t lpcolinear_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lpcolinear_list_size) {
    const int i = d_lpcolinear_list[tid].i_soaid;
    const int j = d_lpcolinear_list[tid].j_soaid;
    const int k = d_lpcolinear_list[tid].k_soaid;
    const double distance = d_lpcolinear_list[tid].distance;
    const double scale_factor = d_lpcolinear_list[tid].scale_factor;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rkj = rj - rk;
    const double inv_r = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
    const Vector ri = rj + (scale_factor + distance * inv_r) * rkj;
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionColinear(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
  size_t lpcolinear_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
  repositionColinearKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lpcolinear_list, lpcolinear_list_size);
}

// See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
// Multiple force classes are projected at once to support MTS (multiple time stepping).
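// Redistribute the forces acting on the (massless) lone pair i onto its hosts
// j, k and l. The force is split into a component along rji (transferred to j),
// a torsional component out of plane ijk (distributed over j, k and l) and the
// remaining angular component (distributed over j and k). numForces is the
// number of force buffers (e.g. normal/nbond/slow) processed at once.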
template <int numForces>
__inline__ __device__ void projectRelativeForces(
  const Vector& ri,
  const Vector& rj,
  const Vector& rk,
  const Vector& rl,
  const Vector (&fi)[numForces],
  Vector (&fj)[numForces],
  Vector (&fk)[numForces],
  Vector (&fl)[numForces]) {
  const Vector rji = ri - rj;
  const Vector rjk = rk - rj;
  const Vector rkl = rl - rk;
  const Vector rki = ri - rk;
  const double inv_rjk_norm2 = 1.0 / rjk.length2();
  const Vector plane_jkl = cross(rjk, rkl);
  // Project the force on rij to keep the distance unchanged
  const double rji_norm2 = rji.length2();
  const double inv_rij_norm2 = 1.0 / rji_norm2;
  if (rji_norm2 > 0) {
    #pragma unroll
    for (int m = 0; m < numForces; ++m) {
      fj[m] += inv_rij_norm2 * fi[m].dot(rji) * rji;
    }
  }
  // Project the force on the dihedral angle to keep it unchanged
  Vector fp[numForces];
  #pragma unroll
  for (int m = 0; m < numForces; ++m) {
    fp[m] = 0.0;
  }
  // Get the normal vector of plane ikj
  Vector normal_ikj = cross(rji, rjk);
  const double normal_ikj_norm = normal_ikj.length();
  // The arm of the force on plane ikj (v2 + v3)
  const Vector hijk_ri = rji - (rji.dot(rjk) * inv_rjk_norm2 * rjk);
  // Only project the force to plane ijk if the ijk are not colinear
  // If rji and rjk are colinear then rji × rjk is zero
  if (normal_ikj_norm > 1e-6) {
    normal_ikj /= normal_ikj_norm;
    // Force on plane ikj
    #pragma unroll
    for (int m = 0; m < numForces; ++m) {
      fp[m] = fi[m].dot(normal_ikj) * normal_ikj;
    }
  }
  // Torque of the force on l after projecting the force on plane ijk to l
  const Vector dir_l_to_jk = cross(plane_jkl, rjk).unit();
  const Vector h_l = rkl.dot(dir_l_to_jk) * dir_l_to_jk;
  const double inv_h_l_norm2 = 1.0 / h_l.length2();
  #pragma unroll
  for (int m = 0; m < numForces; ++m) {
    // Torque of fp
    const Vector torque_p = cross(hijk_ri, fp[m]);
    // The force on l
    const Vector fpl = inv_h_l_norm2 * cross(torque_p, h_l);
    fl[m] += fpl;
    // (a) The remaining force on j and k after subtracting the force on l from the torsional force
    const Vector fj_prime = inv_rjk_norm2 * cross(cross(rki, fp[m]) - cross(rkl, fpl), -rjk);
    const Vector fk_prime = fp[m] - fpl - fj_prime;
    // (b) The remaining angular force
    const Vector fai = fi[m] - fp[m] - fj[m];
    // Sum the torsional force (a) on j and k
    fj[m] += fj_prime;
    fk[m] += fk_prime;
    // Torque of the angular force (b) on k
    const Vector torque_k = cross(rji, fai);
    const Vector fak = cross(torque_k, rjk) * inv_rjk_norm2;
    fk[m] += fak;
    fj[m] += fai - fak;
  }
}

enum class LonepairThreeHostsType {Relative, Bisector};

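// Transfer the forces accumulated on each lone pair to its three hosts and,
// optionally, accumulate the corresponding virial correction.
// Template parameters:
//   LplistT       - lone pair list entry type (LonepairRelative or LonepairBisector)
//   maxForceNumer - highest force class present (0 = normal, 1 = +nbond, 2 = +slow)
//   doVirial      - whether to accumulate the virial tensors
//   blockSize     - thread block size, needed for the cub::BlockReduce of the virial
//   lptype        - Relative or Bisector geometry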
template <typename LplistT, int maxForceNumer, bool doVirial, int blockSize, LonepairThreeHostsType lptype>
__global__ void redistributeForceThreeHostsKernel(
  double* __restrict__ d_f_normal_x,
  double* __restrict__ d_f_normal_y,
  double* __restrict__ d_f_normal_z,
  double* __restrict__ d_f_nbond_x,
  double* __restrict__ d_f_nbond_y,
  double* __restrict__ d_f_nbond_z,
  double* __restrict__ d_f_slow_x,
  double* __restrict__ d_f_slow_y,
  double* __restrict__ d_f_slow_z,
  cudaTensor* __restrict__ d_virial_normal,
  cudaTensor* __restrict__ d_virial_nbond,
  cudaTensor* __restrict__ d_virial_slow,
  const double* __restrict__ d_pos_x,
  const double* __restrict__ d_pos_y,
  const double* __restrict__ d_pos_z,
  const LplistT* __restrict__ d_lp_list,
  size_t lp_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_slow, vir_nbond, vir_normal;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        vir_slow.xx = 0;
        vir_slow.xy = 0;
        vir_slow.xz = 0;
        vir_slow.yx = 0;
        vir_slow.yy = 0;
        vir_slow.yz = 0;
        vir_slow.zx = 0;
        vir_slow.zy = 0;
        vir_slow.zz = 0;
      }
      case 1: {
        vir_nbond.xx = 0;
        vir_nbond.xy = 0;
        vir_nbond.xz = 0;
        vir_nbond.yx = 0;
        vir_nbond.yy = 0;
        vir_nbond.yz = 0;
        vir_nbond.zx = 0;
        vir_nbond.zy = 0;
        vir_nbond.zz = 0;
      }
      case 0: {
        vir_normal.xx = 0;
        vir_normal.xy = 0;
        vir_normal.xz = 0;
        vir_normal.yx = 0;
        vir_normal.yy = 0;
        vir_normal.yz = 0;
        vir_normal.zx = 0;
        vir_normal.zy = 0;
        vir_normal.zz = 0;
      }
    }
  }
  if (tid < lp_list_size) {
    const int i = d_lp_list[tid].i_soaid;
    const int j = d_lp_list[tid].j_soaid;
    const int k = d_lp_list[tid].k_soaid;
    const int l = d_lp_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
    const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    Vector fi[maxForceNumer+1];
    Vector fj[maxForceNumer+1];
    Vector fk[maxForceNumer+1];
    Vector fl[maxForceNumer+1];
    #pragma unroll
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      fj[m] = 0.0;
      fk[m] = 0.0;
      fl[m] = 0.0;
    }
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
      case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
      case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
    }
    switch (lptype) {
      case LonepairThreeHostsType::Relative: {
        projectRelativeForces<maxForceNumer+1>(ri, rj, rk, rl, fi, fj, fk, fl);
        break;
      }
      case LonepairThreeHostsType::Bisector: {
        // The difference between the bisector and the relative cases is
        // that the order of the vertices of the former is changed to ijmk
        // from ijkl, so we can just reuse the function.
        const Vector rm = 0.5 * (rl + rk);
        // Store the force of m to fl, so I rotate the buffers fl and fk
        projectRelativeForces<maxForceNumer+1>(ri, rj, rm, rk, fi, fj, fl, fk);
        // fl and fk actually store the forces on m and k, respectively.
        // The real force on k is 0.5 * fl + fk
        // The real force on l is 0.5 * fl
        #pragma unroll
        for (int m = 0; m < maxForceNumer + 1; ++m) {
          fl[m] *= 0.5;
          fk[m] += fl[m];
        }
        break;
      }
    }
    // Add the forces back to jkl
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        atomicAdd(&(d_f_slow_x[j]), fj[2].x);
        atomicAdd(&(d_f_slow_y[j]), fj[2].y);
        atomicAdd(&(d_f_slow_z[j]), fj[2].z);
        atomicAdd(&(d_f_slow_x[k]), fk[2].x);
        atomicAdd(&(d_f_slow_y[k]), fk[2].y);
        atomicAdd(&(d_f_slow_z[k]), fk[2].z);
        atomicAdd(&(d_f_slow_x[l]), fl[2].x);
        atomicAdd(&(d_f_slow_y[l]), fl[2].y);
        atomicAdd(&(d_f_slow_z[l]), fl[2].z);
      }
      case 1: {
        atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
        atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
        atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
        atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
        atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
        atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
        atomicAdd(&(d_f_nbond_x[l]), fl[1].x);
        atomicAdd(&(d_f_nbond_y[l]), fl[1].y);
        atomicAdd(&(d_f_nbond_z[l]), fl[1].z);
      }
      case 0: {
        atomicAdd(&(d_f_normal_x[j]), fj[0].x);
        atomicAdd(&(d_f_normal_y[j]), fj[0].y);
        atomicAdd(&(d_f_normal_z[j]), fj[0].z);
        atomicAdd(&(d_f_normal_x[k]), fk[0].x);
        atomicAdd(&(d_f_normal_y[k]), fk[0].y);
        atomicAdd(&(d_f_normal_z[k]), fk[0].z);
        atomicAdd(&(d_f_normal_x[l]), fl[0].x);
        atomicAdd(&(d_f_normal_y[l]), fl[0].y);
        atomicAdd(&(d_f_normal_z[l]), fl[0].z);
      }
    }
    // Compute the virial contribution after redistributing the forces
    if (doVirial) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2: {
          vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x + fl[2].x * rl.x - fi[2].x * ri.x;
          vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y + fl[2].x * rl.y - fi[2].x * ri.y;
          vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z + fl[2].x * rl.z - fi[2].x * ri.z;
          vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x + fl[2].y * rl.x - fi[2].y * ri.x;
          vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y + fl[2].y * rl.y - fi[2].y * ri.y;
          vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z + fl[2].y * rl.z - fi[2].y * ri.z;
          vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x + fl[2].z * rl.x - fi[2].z * ri.x;
          vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y + fl[2].z * rl.y - fi[2].z * ri.y;
          vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z + fl[2].z * rl.z - fi[2].z * ri.z;
        }
        case 1: {
          vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x + fl[1].x * rl.x - fi[1].x * ri.x;
          vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y + fl[1].x * rl.y - fi[1].x * ri.y;
          vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z + fl[1].x * rl.z - fi[1].x * ri.z;
          vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x + fl[1].y * rl.x - fi[1].y * ri.x;
          vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y + fl[1].y * rl.y - fi[1].y * ri.y;
          vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z + fl[1].y * rl.z - fi[1].y * ri.z;
          vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x + fl[1].z * rl.x - fi[1].z * ri.x;
          vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y + fl[1].z * rl.y - fi[1].z * ri.y;
          vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z + fl[1].z * rl.z - fi[1].z * ri.z;
        }
        case 0: {
          vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x + fl[0].x * rl.x - fi[0].x * ri.x;
          vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y + fl[0].x * rl.y - fi[0].x * ri.y;
          vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z + fl[0].x * rl.z - fi[0].x * ri.z;
          vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x + fl[0].y * rl.x - fi[0].y * ri.x;
          vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y + fl[0].y * rl.y - fi[0].y * ri.y;
          vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z + fl[0].y * rl.z - fi[0].y * ri.z;
          vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x + fl[0].z * rl.x - fi[0].z * ri.x;
          vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y + fl[0].z * rl.y - fi[0].z * ri.y;
          vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z + fl[0].z * rl.z - fi[0].z * ri.z;
        }
      }
    }
    // Clean the forces on the lone pair
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        d_f_slow_x[i] = 0;
        d_f_slow_y[i] = 0;
        d_f_slow_z[i] = 0;
      }
      case 1: {
        d_f_nbond_x[i] = 0;
        d_f_nbond_y[i] = 0;
        d_f_nbond_z[i] = 0;
      }
      case 0: {
        d_f_normal_x[i] = 0;
        d_f_normal_y[i] = 0;
        d_f_normal_z[i] = 0;
      }
    }
  }
  typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      }
      case 1: {
        vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      }
      case 0: {
        vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      }
    }
    if (threadIdx.x == 0) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2: {
          atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
          atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
          atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
          atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
          atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
          atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
          atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
          atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
          atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
        }
        case 1: {
          atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
          atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
          atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
          atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
          atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
          atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
          atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
          atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
          atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
        }
        case 0: {
          atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
          atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
          atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
          atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
          atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
          atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
          atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
          atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
          atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
        }
      }
    }
  }
}

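// Host-side wrapper: picks the (maxForceNumber, doVirial) template instantiation
// and launches redistributeForceThreeHostsKernel for relative lone pairs.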
void redistributeForceRelative(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const int maxForceNumber,
  const int doVirial,
  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
  size_t lprelative_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lprelative_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairRelative, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Relative> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lprelative_list, lprelative_list_size \
  );
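  // Encode the runtime flags into a single switch value: bits 0-1 hold
  // maxForceNumber (0..2) and bit 2 holds doVirial.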
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceRelative: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
    }
  }
#undef CALL
}

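// Same dispatch as redistributeForceRelative, but for bisector-type lone pairs.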
void redistributeForceBisector(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const int maxForceNumber,
  const int doVirial,
  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
  size_t lpbisector_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairBisector, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Bisector> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lpbisector_list, lpbisector_list_size \
  );
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceBisector: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
    }
  }
#undef CALL
}

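// Transfer the forces on each colinear lone pair to its two hosts j and k, and
// optionally accumulate the virial correction. The template parameters follow
// redistributeForceThreeHostsKernel (maxForceNumer, doVirial, blockSize).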
template <int maxForceNumer, bool doVirial, int blockSize>
__global__ void redistributeForceColinearKernel(
  double* __restrict__ d_f_normal_x,
  double* __restrict__ d_f_normal_y,
  double* __restrict__ d_f_normal_z,
  double* __restrict__ d_f_nbond_x,
  double* __restrict__ d_f_nbond_y,
  double* __restrict__ d_f_nbond_z,
  double* __restrict__ d_f_slow_x,
  double* __restrict__ d_f_slow_y,
  double* __restrict__ d_f_slow_z,
  cudaTensor* __restrict__ d_virial_normal,
  cudaTensor* __restrict__ d_virial_nbond,
  cudaTensor* __restrict__ d_virial_slow,
  const double* __restrict__ d_pos_x,
  const double* __restrict__ d_pos_y,
  const double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lp_list,
  size_t lp_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_slow, vir_nbond, vir_normal;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        vir_slow.xx = 0;
        vir_slow.xy = 0;
        vir_slow.xz = 0;
        vir_slow.yx = 0;
        vir_slow.yy = 0;
        vir_slow.yz = 0;
        vir_slow.zx = 0;
        vir_slow.zy = 0;
        vir_slow.zz = 0;
      }
      case 1: {
        vir_nbond.xx = 0;
        vir_nbond.xy = 0;
        vir_nbond.xz = 0;
        vir_nbond.yx = 0;
        vir_nbond.yy = 0;
        vir_nbond.yz = 0;
        vir_nbond.zx = 0;
        vir_nbond.zy = 0;
        vir_nbond.zz = 0;
      }
      case 0: {
        vir_normal.xx = 0;
        vir_normal.xy = 0;
        vir_normal.xz = 0;
        vir_normal.yx = 0;
        vir_normal.yy = 0;
        vir_normal.yz = 0;
        vir_normal.zx = 0;
        vir_normal.zy = 0;
        vir_normal.zz = 0;
      }
    }
  }
  if (tid < lp_list_size) {
    const int i = d_lp_list[tid].i_soaid;
    const int j = d_lp_list[tid].j_soaid;
    const int k = d_lp_list[tid].k_soaid;
    double distance = d_lp_list[tid].distance;
    const double scale_factor = d_lp_list[tid].scale_factor;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rkj = rj - rk;
    const double inv_rkj_norm = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
    distance *= inv_rkj_norm;
    // Prepare the force buffers
    Vector fi[maxForceNumer+1];
    Vector fj[maxForceNumer+1];
    Vector fk[maxForceNumer+1];
    #pragma unroll
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      fj[m] = 0.0;
      fk[m] = 0.0;
    }
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
      case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
      case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
    }
    // Project the forces
    #pragma unroll
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      const double fdot = distance * (fi[m].dot(rkj)) * inv_rkj_norm * inv_rkj_norm;
      fj[m] = (1.0 + scale_factor + distance) * fi[m] - fdot * rkj;
      fk[m] = fi[m] - fj[m];
    }
    // Add the forces back
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        atomicAdd(&(d_f_slow_x[j]), fj[2].x);
        atomicAdd(&(d_f_slow_y[j]), fj[2].y);
        atomicAdd(&(d_f_slow_z[j]), fj[2].z);
        atomicAdd(&(d_f_slow_x[k]), fk[2].x);
        atomicAdd(&(d_f_slow_y[k]), fk[2].y);
        atomicAdd(&(d_f_slow_z[k]), fk[2].z);
      }
      case 1: {
        atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
        atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
        atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
        atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
        atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
        atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
      }
      case 0: {
        atomicAdd(&(d_f_normal_x[j]), fj[0].x);
        atomicAdd(&(d_f_normal_y[j]), fj[0].y);
        atomicAdd(&(d_f_normal_z[j]), fj[0].z);
        atomicAdd(&(d_f_normal_x[k]), fk[0].x);
        atomicAdd(&(d_f_normal_y[k]), fk[0].y);
        atomicAdd(&(d_f_normal_z[k]), fk[0].z);
      }
    }
    if (doVirial) {
      const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2: {
          vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x - fi[2].x * ri.x;
          vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y - fi[2].x * ri.y;
          vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z - fi[2].x * ri.z;
          vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x - fi[2].y * ri.x;
          vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y - fi[2].y * ri.y;
          vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z - fi[2].y * ri.z;
          vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x - fi[2].z * ri.x;
          vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y - fi[2].z * ri.y;
          vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z - fi[2].z * ri.z;
        }
        case 1: {
          vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x - fi[1].x * ri.x;
          vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y - fi[1].x * ri.y;
          vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z - fi[1].x * ri.z;
          vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x - fi[1].y * ri.x;
          vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y - fi[1].y * ri.y;
          vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z - fi[1].y * ri.z;
          vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x - fi[1].z * ri.x;
          vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y - fi[1].z * ri.y;
          vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z - fi[1].z * ri.z;
        }
        case 0: {
          vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x - fi[0].x * ri.x;
          vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y - fi[0].x * ri.y;
          vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z - fi[0].x * ri.z;
          vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x - fi[0].y * ri.x;
          vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y - fi[0].y * ri.y;
          vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z - fi[0].y * ri.z;
          vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x - fi[0].z * ri.x;
          vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y - fi[0].z * ri.y;
          vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z - fi[0].z * ri.z;
        }
      }
    }
    // Clean the forces on the lone pair
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        d_f_slow_x[i] = 0;
        d_f_slow_y[i] = 0;
        d_f_slow_z[i] = 0;
      }
      case 1: {
        d_f_nbond_x[i] = 0;
        d_f_nbond_y[i] = 0;
        d_f_nbond_z[i] = 0;
      }
      case 0: {
        d_f_normal_x[i] = 0;
        d_f_normal_y[i] = 0;
        d_f_normal_z[i] = 0;
      }
    }
  }
  typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: {
        vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      }
      case 1: {
        vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      }
      case 0: {
        vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      }
    }
    if (threadIdx.x == 0) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2: {
          atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
          atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
          atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
          atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
          atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
          atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
          atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
          atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
          atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
        }
        case 1: {
          atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
          atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
          atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
          atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
          atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
          atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
          atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
          atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
          atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
        }
        case 0: {
          atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
          atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
          atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
          atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
          atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
          atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
          atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
          atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
          atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
        }
      }
    }
  }
}

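// Host-side wrapper: picks the (maxForceNumber, doVirial) template instantiation
// and launches redistributeForceColinearKernel.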
void redistributeForceColinear(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const int maxForceNumber,
  const int doVirial,
  const ComputeLonepairsCUDA::LonepairColinear* d_lpcolinear_list,
  size_t lpcolinear_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceColinearKernel<MAXFORCENUMBER, DOVIRIAL, block_size> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lpcolinear_list, lpcolinear_list_size);
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceColinear: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
    }
  }
#undef CALL
}

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)