1 #include "ComputeLonepairsCUDA.h"
2 #include "ComputeLonepairsCUDAKernel.h"
7 #include <hipcub/hipcub.hpp>
13 #if __CUDACC_VER_MAJOR__ >= 11
14 #include <cub/cub.cuh>
15 #else
16 #include <namd_cub/cub.cuh>
20 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
22 // See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
23 __global__ void repositionRelativeKernel(
24 double* __restrict__ d_pos_x,
25 double* __restrict__ d_pos_y,
26 double* __restrict__ d_pos_z,
27 const ComputeLonepairsCUDA::LonepairRelative* __restrict__ d_lprelative_list,
28 size_t lprelative_list_size) {
29 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
30 if (tid < lprelative_list_size) {
31 const int i = d_lprelative_list[tid].i_soaid;
32 const int j = d_lprelative_list[tid].j_soaid;
33 const int k = d_lprelative_list[tid].k_soaid;
34 const int l = d_lprelative_list[tid].l_soaid;
35 // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
36 const double dcosa = d_lprelative_list[tid].dcosa;
37 const double dsinacost = d_lprelative_list[tid].dsinacost;
38 const double dsinasint = d_lprelative_list[tid].dsinasint;
39 const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
40 const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
41 const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
45 BigReal invlen = rnorm3d(a.x, a.y, a.z);
47 Vector c = cross(b, a);
48 invlen = rnorm3d(c.x, c.y, c.z);
51 const Vector ri = rj + dcosa * a + dsinacost * b + dsinasint * c;
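// In the expression above (a, b, c) is the orthonormal local frame built from
// the three hosts j, k, l, and the precomputed coefficients appear, from their
// names, to be d*cos(alpha), d*sin(alpha)*cos(theta) and d*sin(alpha)*sin(theta)
// for bond length d, angle alpha and dihedral theta; this reading is inferred
// from the member names and the linked notebook, not stated in the code.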
58 void repositionRelative(
62 const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
63 size_t lprelative_list_size,
64 cudaStream_t stream) {
65 const int block_size = 128;
66 const int grid = (lprelative_list_size + block_size - 1) / block_size;
67 repositionRelativeKernel<<<grid, block_size, 0, stream>>>(
68 d_pos_x, d_pos_y, d_pos_z, d_lprelative_list, lprelative_list_size);
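// The grid size above is the usual ceil division; e.g. a hypothetical list of
// 1000 lone pairs with 128-thread blocks needs (1000 + 127) / 128 = 8 blocks,
// and the tid bounds check inside the kernel masks the surplus threads.
static_assert((1000 + 128 - 1) / 128 == 8, "ceil-division launch example");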
71 __global__ void repositionBisectorKernel(
72 double* __restrict__ d_pos_x,
73 double* __restrict__ d_pos_y,
74 double* __restrict__ d_pos_z,
75 const ComputeLonepairsCUDA::LonepairBisector* __restrict__ d_lpbisector_list,
76 size_t lpbisector_list_size) {
77 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
78 if (tid < lpbisector_list_size) {
79 const int i = d_lpbisector_list[tid].i_soaid;
80 const int j = d_lpbisector_list[tid].j_soaid;
81 const int k = d_lpbisector_list[tid].k_soaid;
82 const int l = d_lpbisector_list[tid].l_soaid;
83 // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
84 const double dcosa = d_lpbisector_list[tid].dcosa;
85 const double dsinacost = d_lpbisector_list[tid].dsinacost;
86 const double dsinasint = d_lpbisector_list[tid].dsinasint;
87 const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
88 const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
89 const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
90 // The midpoint of l and k
91 const Vector v = 0.5 * (rl + rk);
94 BigReal invlen = rnorm3d(a.x, a.y, a.z);
96 Vector c = cross(b, a);
97 invlen = rnorm3d(c.x, c.y, c.z);
100 const Vector ri = rj + dcosa * a + dsinacost * b + dsinasint * c;
107 void repositionBisector(
111 const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
112 size_t lpbisector_list_size,
113 cudaStream_t stream) {
114 const int block_size = 128;
115 const int grid = (lpbisector_list_size + block_size - 1) / block_size;
116 repositionBisectorKernel<<<grid, block_size, 0, stream>>>(
117 d_pos_x, d_pos_y, d_pos_z, d_lpbisector_list, lpbisector_list_size);
120 __global__ void repositionColinearKernel(
121 double* __restrict__ d_pos_x,
122 double* __restrict__ d_pos_y,
123 double* __restrict__ d_pos_z,
124 const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
125 size_t lpcolinear_list_size) {
126 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
127 if (tid < lpcolinear_list_size) {
128 const int i = d_lpcolinear_list[tid].i_soaid;
129 const int j = d_lpcolinear_list[tid].j_soaid;
130 const int k = d_lpcolinear_list[tid].k_soaid;
131 const double distance = d_lpcolinear_list[tid].distance;
132 const double scale_factor = d_lpcolinear_list[tid].scale_factor;
133 const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
134 const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
135 const Vector rkj = rj - rk;
136 const double inv_r = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
137 const Vector ri = rj + (scale_factor + distance * inv_r) * rkj;
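// Equivalently ri = rj + (scale_factor + distance / |rj - rk|) * (rj - rk):
// the lone pair sits on the k->j axis, displaced from j by the fixed length
// `distance` plus the fraction `scale_factor` of the k-j separation.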
144 void repositionColinear(
145 double* __restrict__ d_pos_x,
146 double* __restrict__ d_pos_y,
147 double* __restrict__ d_pos_z,
148 const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
149 size_t lpcolinear_list_size,
150 cudaStream_t stream) {
151 const int block_size = 128;
152 const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
153 repositionColinearKernel<<<grid, block_size, 0, stream>>>(
154 d_pos_x, d_pos_y, d_pos_z, d_lpcolinear_list, lpcolinear_list_size);
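// A minimal host-side reference of the same colinear rule, e.g. for checking
// the kernel in a CPU unit test. Illustrative sketch only: it is excluded from
// the build, the helper name is made up, and it assumes nothing beyond the
// Vector type already used in this file.
#if 0
static Vector repositionColinearHostReference(
    const Vector& rj, const Vector& rk,
    double distance, double scale_factor) {
  const Vector rkj = rj - rk;
  const double inv_r = 1.0 / rkj.length();  // host-side stand-in for rnorm3d
  return rj + (scale_factor + distance * inv_r) * rkj;
}
#endif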
157 // See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
158 // Project multiple force arrays (one per MTS force level) in a single pass.
159 template <int numForces>
160 __inline__ __device__ void projectRelativeForces(
165 const Vector (&fi)[numForces],
166 Vector (&fj)[numForces],
167 Vector (&fk)[numForces],
168 Vector (&fl)[numForces]) {
169 const Vector rji = ri - rj;
170 const Vector rjk = rk - rj;
171 const Vector rkl = rl - rk;
172 const Vector rki = ri - rk;
173 const double inv_rjk_norm2 = 1.0 / rjk.length2();
174 const Vector plane_jkl = cross(rjk, rkl);
175 // Project the force onto rji to keep the i-j distance unchanged
176 const double rji_norm2 = rji.length2();
177 const double inv_rij_norm2 = 1.0 / rji_norm2;
180 for (int m = 0; m < numForces; ++m) {
181 fj[m] += inv_rij_norm2 * fi[m].dot(rji) * rji;
184 // Project the force on the dihedral angle to keep it unchanged
185 Vector fp[numForces];
187 for (int m = 0; m < numForces; ++m) {
190 // Get the normal vector of plane ikj
191 Vector normal_ikj = cross(rji, rjk);
192 const double normal_ikj_norm = normal_ikj.length();
193 // The arm of the force on plane ikj (v2 + v3)
194 const Vector hijk_ri = rji - (rji.dot(rjk) * inv_rjk_norm2 * rjk);
195 // Only project the force onto plane ijk if i, j and k are not colinear;
196 // if rji and rjk are colinear then rji x rjk is zero
197 if (normal_ikj_norm > 1e-6) {
198 normal_ikj /= normal_ikj_norm;
199 // Force on plane ikj
201 for (int m = 0; m < numForces; ++m) {
202 fp[m] = fi[m].dot(normal_ikj) * normal_ikj;
205 // Reproduce the torque of the in-plane force with a force on l, using l's lever arm h_l about the j-k axis
206 const Vector dir_l_to_jk = cross(plane_jkl, rjk).unit();
207 const Vector h_l = rkl.dot(dir_l_to_jk) * dir_l_to_jk;
208 const double inv_h_l_norm2 = 1.0 / h_l.length2();
210 for (int m = 0; m < numForces; ++m) {
212 const Vector torque_p = cross(hijk_ri, fp[m]);
214 const Vector fpl = inv_h_l_norm2 * cross(torque_p, h_l);
216 // (a) The remaining force on j and k after subtracting the force on l from the torsional force
217 const Vector fj_prime = inv_rjk_norm2 * cross(cross(rki, fp[m]) - cross(rkl, fpl), -rjk);
218 const Vector fk_prime = fp[m] - fpl - fj_prime;
219 // (b) The remaining angular force
220 const Vector fai = fi[m] - fp[m] - fj[m];
221 // Sum the torsional force (a) on j and k
224 // Torque of the angular force (b) about j, reproduced by a force on k
225 const Vector torque_k = cross(rji, fai);
226 const Vector fak = cross(torque_k, rjk) * inv_rjk_norm2;
232 enum class LonepairThreeHostsType {Relative, Bisector};
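// The torsional transfer in projectRelativeForces above relies on the identity
// f = cross(tau, h) / |h|^2, which reproduces a torque tau with a force applied
// at lever arm h whenever tau is perpendicular to h. A standalone sketch of the
// identity (illustrative only, excluded from the build, helper name made up):
#if 0
static Vector forceFromTorque(const Vector& torque, const Vector& arm) {
  // (arm x f) x arm == f * |arm|^2 - arm * (f.dot(arm)), so for f perpendicular
  // to arm the force is recovered as cross(torque, arm) / |arm|^2.
  return (1.0 / arm.length2()) * cross(torque, arm);
}
#endif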
234 template <typename LplistT, int maxForceNumer, bool doVirial, int blockSize, LonepairThreeHostsType lptype>
235 __global__ void redistributeForceThreeHostsKernel(
236 double* __restrict__ d_f_normal_x,
237 double* __restrict__ d_f_normal_y,
238 double* __restrict__ d_f_normal_z,
239 double* __restrict__ d_f_nbond_x,
240 double* __restrict__ d_f_nbond_y,
241 double* __restrict__ d_f_nbond_z,
242 double* __restrict__ d_f_slow_x,
243 double* __restrict__ d_f_slow_y,
244 double* __restrict__ d_f_slow_z,
245 cudaTensor* __restrict__ d_virial_normal,
246 cudaTensor* __restrict__ d_virial_nbond,
247 cudaTensor* __restrict__ d_virial_slow,
248 const double* __restrict__ d_pos_x,
249 const double* __restrict__ d_pos_y,
250 const double* __restrict__ d_pos_z,
251 const LplistT* __restrict__ d_lp_list,
252 size_t lp_list_size) {
253 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
254 cudaTensor vir_slow, vir_nbond, vir_normal;
256 switch (maxForceNumer) {
257 // Intentionally fall-through
293 if (tid < lp_list_size) {
294 const int i = d_lp_list[tid].i_soaid;
295 const int j = d_lp_list[tid].j_soaid;
296 const int k = d_lp_list[tid].k_soaid;
297 const int l = d_lp_list[tid].l_soaid;
298 // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
299 const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
300 const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
301 const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
302 const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
303 Vector fi[maxForceNumer+1];
304 Vector fj[maxForceNumer+1];
305 Vector fk[maxForceNumer+1];
306 Vector fl[maxForceNumer+1];
308 for (int m = 0; m < maxForceNumer + 1; ++m) {
313 switch (maxForceNumer) {
314 // Intentionally fall-through
315 case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
316 case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
317 case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
320 case LonepairThreeHostsType::Relative: {
321 projectRelativeForces<maxForceNumer+1>(ri, rj, rk, rl, fi, fj, fk, fl);
324 case LonepairThreeHostsType::Bisector: {
325 // The bisector case differs from the relative case only in that the
326 // vertex order becomes i, j, m, k instead of i, j, k, l, so the same
327 // projection function can be reused.
328 const Vector rm = 0.5 * (rk + rl);
329 // The force on m is written to fl, so the fk and fl buffers are swapped in the call
330 projectRelativeForces<maxForceNumer+1>(ri, rj, rm, rk, fi, fj, fl, fk);
331 // After the call, fl and fk hold the forces on m and k, respectively.
332 // The real force on k is 0.5 * fl + fk
333 // The real force on l is 0.5 * fl
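// This split follows from the chain rule for the midpoint m = 0.5 * (rk + rl):
// d(rm)/d(rk) = d(rm)/d(rl) = 0.5 * I, so a force acting on the virtual point m
// is handed on as one half to k and one half to l.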
335 for (int m = 0; m < maxForceNumer + 1; ++m) {
342 // Add the forces back to jkl
343 switch (maxForceNumer) {
344 // Intentionally fall-through
346 atomicAdd(&(d_f_slow_x[j]), fj[2].x);
347 atomicAdd(&(d_f_slow_y[j]), fj[2].y);
348 atomicAdd(&(d_f_slow_z[j]), fj[2].z);
349 atomicAdd(&(d_f_slow_x[k]), fk[2].x);
350 atomicAdd(&(d_f_slow_y[k]), fk[2].y);
351 atomicAdd(&(d_f_slow_z[k]), fk[2].z);
352 atomicAdd(&(d_f_slow_x[l]), fl[2].x);
353 atomicAdd(&(d_f_slow_y[l]), fl[2].y);
354 atomicAdd(&(d_f_slow_z[l]), fl[2].z);
357 atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
358 atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
359 atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
360 atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
361 atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
362 atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
363 atomicAdd(&(d_f_nbond_x[l]), fl[1].x);
364 atomicAdd(&(d_f_nbond_y[l]), fl[1].y);
365 atomicAdd(&(d_f_nbond_z[l]), fl[1].z);
368 atomicAdd(&(d_f_normal_x[j]), fj[0].x);
369 atomicAdd(&(d_f_normal_y[j]), fj[0].y);
370 atomicAdd(&(d_f_normal_z[j]), fj[0].z);
371 atomicAdd(&(d_f_normal_x[k]), fk[0].x);
372 atomicAdd(&(d_f_normal_y[k]), fk[0].y);
373 atomicAdd(&(d_f_normal_z[k]), fk[0].z);
374 atomicAdd(&(d_f_normal_x[l]), fl[0].x);
375 atomicAdd(&(d_f_normal_y[l]), fl[0].y);
376 atomicAdd(&(d_f_normal_z[l]), fl[0].z);
379 // Compute the virial contribution after redistributing the forces
381 switch (maxForceNumer) {
382 // Intentionally fall-through
384 vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x + fl[2].x * rl.x - fi[2].x * ri.x;
385 vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y + fl[2].x * rl.y - fi[2].x * ri.y;
386 vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z + fl[2].x * rl.z - fi[2].x * ri.z;
387 vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x + fl[2].y * rl.x - fi[2].y * ri.x;
388 vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y + fl[2].y * rl.y - fi[2].y * ri.y;
389 vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z + fl[2].y * rl.z - fi[2].y * ri.z;
390 vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x + fl[2].z * rl.x - fi[2].z * ri.x;
391 vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y + fl[2].z * rl.y - fi[2].z * ri.y;
392 vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z + fl[2].z * rl.z - fi[2].z * ri.z;
395 vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x + fl[1].x * rl.x - fi[1].x * ri.x;
396 vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y + fl[1].x * rl.y - fi[1].x * ri.y;
397 vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z + fl[1].x * rl.z - fi[1].x * ri.z;
398 vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x + fl[1].y * rl.x - fi[1].y * ri.x;
399 vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y + fl[1].y * rl.y - fi[1].y * ri.y;
400 vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z + fl[1].y * rl.z - fi[1].y * ri.z;
401 vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x + fl[1].z * rl.x - fi[1].z * ri.x;
402 vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y + fl[1].z * rl.y - fi[1].z * ri.y;
403 vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z + fl[1].z * rl.z - fi[1].z * ri.z;
406 vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x + fl[0].x * rl.x - fi[0].x * ri.x;
407 vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y + fl[0].x * rl.y - fi[0].x * ri.y;
408 vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z + fl[0].x * rl.z - fi[0].x * ri.z;
409 vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x + fl[0].y * rl.x - fi[0].y * ri.x;
410 vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y + fl[0].y * rl.y - fi[0].y * ri.y;
411 vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z + fl[0].y * rl.z - fi[0].y * ri.z;
412 vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x + fl[0].z * rl.x - fi[0].z * ri.x;
413 vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y + fl[0].z * rl.y - fi[0].z * ri.y;
414 vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z + fl[0].z * rl.z - fi[0].z * ri.z;
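// Each tensor above is the sum over hosts of f_host (x) r_host minus
// f_i (x) r_i, i.e. the correction to the f (x) r virial from moving the force
// that acted on the lone pair at r_i onto its hosts; the lone pair term is
// subtracted, presumably because it is accounted for elsewhere before the
// force on the lone pair is zeroed below.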
418 // Zero out the forces on the lone pair
419 switch (maxForceNumer) {
420 // Intentionally fall-through
438 typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
439 __shared__ typename BlockReduce::TempStorage temp_storage;
441 switch (maxForceNumer) {
442 // Intentionally fall-through
444 vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
448 vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
452 vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
456 if (threadIdx.x == 0) {
457 switch (maxForceNumer) {
458 // Intentionally fall-through
460 atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
461 atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
462 atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
463 atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
464 atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
465 atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
466 atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
467 atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
468 atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
471 atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
472 atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
473 atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
474 atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
475 atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
476 atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
477 atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
478 atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
479 atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
482 atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
483 atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
484 atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
485 atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
486 atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
487 atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
488 atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
489 atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
490 atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
497 void redistributeForceRelative(
498 double* d_f_normal_x,
499 double* d_f_normal_y,
500 double* d_f_normal_z,
507 cudaTensor* d_virial_normal,
508 cudaTensor* d_virial_nbond,
509 cudaTensor* d_virial_slow,
510 const double* d_pos_x,
511 const double* d_pos_y,
512 const double* d_pos_z,
513 const int maxForceNumber,
515 const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
516 size_t lprelative_list_size,
517 cudaStream_t stream) {
518 const int block_size = 128;
519 const int grid = (lprelative_list_size + block_size - 1) / block_size;
520 #define CALL(MAXFORCENUMBER, DOVIRIAL) \
521 redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairRelative, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Relative> \
522 <<<grid, block_size, 0, stream>>>( \
523 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
524 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
525 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
526 d_virial_normal, d_virial_nbond, d_virial_slow, \
527 d_pos_x, d_pos_y, d_pos_z, \
528 d_lprelative_list, lprelative_list_size \
530 const int option = (maxForceNumber << 0) + (doVirial << 2);
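// maxForceNumber occupies the two low bits and doVirial bit 2, so each
// (maxForceNumber, doVirial) pair maps to a distinct case label below,
// e.g. (2, 1) -> (2 << 0) + (1 << 2) == 6.
static_assert(((2 << 0) + (1 << 2)) == 6, "option packing worked example");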
532 case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
533 case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
534 case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
535 case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
536 case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
537 case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
539 const std::string error =
540 "redistributeForceRelative: no kernel called (maxForceNumber = " +
541 std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
542 NAMD_bug(error.c_str());
548 void redistributeForceBisector(
549 double* d_f_normal_x,
550 double* d_f_normal_y,
551 double* d_f_normal_z,
558 cudaTensor* d_virial_normal,
559 cudaTensor* d_virial_nbond,
560 cudaTensor* d_virial_slow,
561 const double* d_pos_x,
562 const double* d_pos_y,
563 const double* d_pos_z,
564 const int maxForceNumber,
566 const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
567 size_t lpbisector_list_size,
568 cudaStream_t stream) {
569 const int block_size = 128;
570 const int grid = (lpbisector_list_size + block_size - 1) / block_size;
571 #define CALL(MAXFORCENUMBER, DOVIRIAL) \
572 redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairBisector, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Bisector> \
573 <<<grid, block_size, 0, stream>>>( \
574 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
575 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
576 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
577 d_virial_normal, d_virial_nbond, d_virial_slow, \
578 d_pos_x, d_pos_y, d_pos_z, \
579 d_lpbisector_list, lpbisector_list_size \
581 const int option = (maxForceNumber << 0) + (doVirial << 2);
583 case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
584 case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
585 case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
586 case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
587 case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
588 case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
590 const std::string error =
591 "redistributeForceBisector: no kernel called (maxForceNumber = " +
592 std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
593 NAMD_bug(error.c_str());
599 template <int maxForceNumer, bool doVirial, int blockSize>
600 __global__ void redistributeForceColinearKernel(
601 double* __restrict__ d_f_normal_x,
602 double* __restrict__ d_f_normal_y,
603 double* __restrict__ d_f_normal_z,
604 double* __restrict__ d_f_nbond_x,
605 double* __restrict__ d_f_nbond_y,
606 double* __restrict__ d_f_nbond_z,
607 double* __restrict__ d_f_slow_x,
608 double* __restrict__ d_f_slow_y,
609 double* __restrict__ d_f_slow_z,
610 cudaTensor* __restrict__ d_virial_normal,
611 cudaTensor* __restrict__ d_virial_nbond,
612 cudaTensor* __restrict__ d_virial_slow,
613 const double* __restrict__ d_pos_x,
614 const double* __restrict__ d_pos_y,
615 const double* __restrict__ d_pos_z,
616 const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lp_list,
617 size_t lp_list_size) {
618 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
619 cudaTensor vir_slow, vir_nbond, vir_normal;
621 switch (maxForceNumer) {
622 // Intentionally fall-through
658 if (tid < lp_list_size) {
659 const int i = d_lp_list[tid].i_soaid;
660 const int j = d_lp_list[tid].j_soaid;
661 const int k = d_lp_list[tid].k_soaid;
662 double distance = d_lp_list[tid].distance;
663 const double scale_factor = d_lp_list[tid].scale_factor;
664 const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
665 const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
666 const Vector rkj = rj - rk;
667 const double inv_rkj_norm = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
668 distance *= inv_rkj_norm;
669 // Prepare the force buffers
670 Vector fi[maxForceNumer+1];
671 Vector fj[maxForceNumer+1];
672 Vector fk[maxForceNumer+1];
674 for (int m = 0; m < maxForceNumer + 1; ++m) {
678 switch (maxForceNumer) {
679 // Intentionally fall-through
680 case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
681 case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
682 case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
684 // Project the forces
686 for (int m = 0; m < maxForceNumer + 1; ++m) {
687 const double fdot = distance * (fi[m].dot(rkj)) * inv_rkj_norm * inv_rkj_norm;
688 fj[m] = (1.0 + scale_factor + distance) * fi[m] - fdot * rkj;
689 fk[m] = fi[m] - fj[m];
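// The two lines above apply the transpose of d(ri)/d(rj) to fi: with
// ri = rj + (s + d/|rkj|) * rkj one gets
//   d(ri)/d(rj) = (1 + s + d/|rkj|) * I - (d/|rkj|^3) * (rkj rkj^T),
// where `distance` already holds d/|rkj| after the rescaling above, and
// fk takes the remainder so that fj + fk == fi and no net force is created.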
691 // Add the forces back
692 switch (maxForceNumer) {
693 // Intentionally fall-through
695 atomicAdd(&(d_f_slow_x[j]), fj[2].x);
696 atomicAdd(&(d_f_slow_y[j]), fj[2].y);
697 atomicAdd(&(d_f_slow_z[j]), fj[2].z);
698 atomicAdd(&(d_f_slow_x[k]), fk[2].x);
699 atomicAdd(&(d_f_slow_y[k]), fk[2].y);
700 atomicAdd(&(d_f_slow_z[k]), fk[2].z);
703 atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
704 atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
705 atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
706 atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
707 atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
708 atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
711 atomicAdd(&(d_f_normal_x[j]), fj[0].x);
712 atomicAdd(&(d_f_normal_y[j]), fj[0].y);
713 atomicAdd(&(d_f_normal_z[j]), fj[0].z);
714 atomicAdd(&(d_f_normal_x[k]), fk[0].x);
715 atomicAdd(&(d_f_normal_y[k]), fk[0].y);
716 atomicAdd(&(d_f_normal_z[k]), fk[0].z);
720 const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
721 switch (maxForceNumer) {
722 // Intentionally fall-through
724 vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x - fi[2].x * ri.x;
725 vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y - fi[2].x * ri.y;
726 vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z - fi[2].x * ri.z;
727 vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x - fi[2].y * ri.x;
728 vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y - fi[2].y * ri.y;
729 vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z - fi[2].y * ri.z;
730 vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x - fi[2].z * ri.x;
731 vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y - fi[2].z * ri.y;
732 vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z - fi[2].z * ri.z;
735 vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x - fi[1].x * ri.x;
736 vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y - fi[1].x * ri.y;
737 vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z - fi[1].x * ri.z;
738 vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x - fi[1].y * ri.x;
739 vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y - fi[1].y * ri.y;
740 vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z - fi[1].y * ri.z;
741 vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x - fi[1].z * ri.x;
742 vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y - fi[1].z * ri.y;
743 vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z - fi[1].z * ri.z;
746 vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x - fi[0].x * ri.x;
747 vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y - fi[0].x * ri.y;
748 vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z - fi[0].x * ri.z;
749 vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x - fi[0].y * ri.x;
750 vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y - fi[0].y * ri.y;
751 vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z - fi[0].y * ri.z;
752 vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x - fi[0].z * ri.x;
753 vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y - fi[0].z * ri.y;
754 vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z - fi[0].z * ri.z;
758 // Zero out the forces on the lone pair
759 switch (maxForceNumer) {
760 // Intentionally fall-through
778 typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
779 __shared__ typename BlockReduce::TempStorage temp_storage;
781 switch (maxForceNumer) {
782 // Intentionally fall-through
784 vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
788 vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
792 vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
796 if (threadIdx.x == 0) {
797 switch (maxForceNumer) {
798 // Intentionally fall-through
800 atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
801 atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
802 atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
803 atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
804 atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
805 atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
806 atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
807 atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
808 atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
811 atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
812 atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
813 atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
814 atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
815 atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
816 atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
817 atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
818 atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
819 atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
822 atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
823 atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
824 atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
825 atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
826 atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
827 atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
828 atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
829 atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
830 atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
837 void redistributeForceColinear(
838 double* d_f_normal_x,
839 double* d_f_normal_y,
840 double* d_f_normal_z,
847 cudaTensor* d_virial_normal,
848 cudaTensor* d_virial_nbond,
849 cudaTensor* d_virial_slow,
850 const double* d_pos_x,
851 const double* d_pos_y,
852 const double* d_pos_z,
853 const int maxForceNumber,
855 const ComputeLonepairsCUDA::LonepairColinear* d_lpcolinear_list,
856 size_t lpcolinear_list_size,
857 cudaStream_t stream) {
858 const int block_size = 128;
859 const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
860 #define CALL(MAXFORCENUMBER, DOVIRIAL) \
861 redistributeForceColinearKernel<MAXFORCENUMBER, DOVIRIAL, block_size> \
862 <<<grid, block_size, 0, stream>>>( \
863 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
864 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
865 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
866 d_virial_normal, d_virial_nbond, d_virial_slow, \
867 d_pos_x, d_pos_y, d_pos_z, \
868 d_lpcolinear_list, lpcolinear_list_size);
869 const int option = (maxForceNumber << 0) + (doVirial << 2);
871 case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
872 case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
873 case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
874 case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
875 case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
876 case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
878 const std::string error =
879 "redistributeForceColinear: no kernel called (maxForceNumber = " +
880 std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
881 NAMD_bug(error.c_str());