NAMD
ComputeBondedCUDAKernel.cu
#include "TupleTypesCUDA.h"
#ifdef WIN32
#define _USE_MATH_DEFINES
#define __thread __declspec(thread)
#endif // WIN32

#ifdef NAMD_HIP
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // NAMD_HIP

#ifdef NAMD_CUDA
#include <cuda.h>

#if (NAMD_CCCL_MAJOR_VERSION >= 3)
#include <cuda/ptx>
#endif

#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif // NAMD_CUDA

#include <math.h>

#include "ComputeBondedCUDAKernel.h"
#include "CudaComputeNonbondedInteractions.h"

#ifdef FUTURE_CUDADEVICE
#include "CudaDevice.h"
#else
#include "DeviceCUDA.h"
extern __thread DeviceCUDA *deviceCUDA;
#endif

#ifndef USE_TABLE_ARRAYS
__device__ __forceinline__
float4 sampleTableTex(cudaTextureObject_t tex, float k) {
#if defined(NAMD_CUDA)
  return tex1D<float4>(tex, k);
#else
  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
  const float x = k * (float)tableSize - 0.5f;
  const float f = floorf(x);
  const float a = x - f;
  const unsigned int i = (unsigned int)f;
  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
  const int i1 = i0 + 1;
  const float4 t0 = tex1Dfetch<float4>(tex, i0);
  const float4 t1 = tex1Dfetch<float4>(tex, i1);
  return make_float4(
    a * (t1.x - t0.x) + t0.x,
    a * (t1.y - t0.y) + t0.y,
    a * (t1.z - t0.z) + t0.z,
    a * (t1.w - t0.w) + t0.w);
#endif
}
#endif

__device__ __forceinline__
float4 tableLookup(const float4* table, const float k)
{
  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
  const float x = k * static_cast<float>(tableSize) - 0.5f;
  const float f = floorf(x);
  const float a = x - f;
  const int i = static_cast<int>(f);
  const int i0 = max(0, min(tableSize - 1, i));
  const int i1 = max(0, min(tableSize - 1, i + 1));
  const float4 t0 = __ldg(&table[i0]);
  const float4 t1 = __ldg(&table[i1]);
  return make_float4(
    a * (t1.x - t0.x) + t0.x,
    a * (t1.y - t0.y) + t0.y,
    a * (t1.z - t0.z) + t0.z,
    a * (t1.w - t0.w) + t0.w);
}
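
// Both sampleTableTex and tableLookup linearly interpolate a
// FORCE_ENERGY_TABLE_SIZE-entry table addressed by a normalized coordinate
// k in [0,1]. Worked example (a sketch, assuming tableSize = 4096 and
// k = 0.5): x = 0.5*4096 - 0.5 = 2047.5, so i0 = 2047, i1 = 2048, a = 0.5,
// and the result is the midpoint of entries 2047 and 2048. On the CUDA
// texture path the hardware performs the equivalent linear filtering.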

// Global constant-memory parameters for alchemical transformations
namespace AlchBondedCUDA {
  // Alchemical parameters and lambdas
  __constant__ CudaAlchParameters alchParams;
  __constant__ CudaAlchLambdas alchLambdas;
}
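
// These __constant__ symbols live in device constant memory and must be
// filled by the host before the kernels run. A minimal sketch of that
// upload (an assumption -- the actual call site is in this file's host-side
// code, not shown here):
//   CudaAlchLambdas h_lambdas = /* current lambda values */;
//   cudaMemcpyToSymbol(AlchBondedCUDA::alchLambdas, &h_lambdas,
//                      sizeof(CudaAlchLambdas));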


template <typename T>
__forceinline__ __device__
void convertForces(const float x, const float y, const float z,
  T &Tx, T &Ty, T &Tz) {

  Tx = (T)(x);
  Ty = (T)(y);
  Tz = (T)(z);
}

template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij,
  const float dx, const float dy, const float dz,
  T &fxij, T &fyij, T &fzij) {

  fxij = (T)(fij*dx);
  fyij = (T)(fij*dy);
  fzij = (T)(fij*dz);

}

template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij1,
  const float dx1, const float dy1, const float dz1,
  float fij2,
  const float dx2, const float dy2, const float dz2,
  T &fxij, T &fyij, T &fzij) {

  fxij = (T)(fij1*dx1 + fij2*dx2);
  fyij = (T)(fij1*dy1 + fij2*dy2);
  fzij = (T)(fij1*dz1 + fij2*dz2);
}

__forceinline__ __device__
int warpAggregatingAtomicInc(int* counter) {
#if BOUNDINGBOXSIZE == 64
  WarpMask mask = __ballot(1);
  int total = __popcll(mask);
  int prefix = __popcll(mask & cub::LaneMaskLt());
  int firstLane = __ffsll(mask) - 1;
#elif defined(NAMD_CUDA) && (NAMD_CCCL_MAJOR_VERSION >= 3)
  // cub::LaneMaskLt was removed with CCCL 3.0
  WarpMask mask = __ballot_sync(WARP_FULL_MASK, 1);
  int total = __popc(mask);
  int prefix = __popc(mask & cuda::ptx::get_sreg_lanemask_lt());
  int firstLane = __ffs(mask) - 1;
#else
  WarpMask mask = __ballot(1);
  int total = __popc(mask);
  int prefix = __popc(mask & cub::LaneMaskLt());
  int firstLane = __ffs(mask) - 1;
#endif
  int start = 0;
  if (prefix == 0) {
    start = atomicAdd(counter, total);
  }
  start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
  return start + prefix;
}
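
// Warp-aggregated atomic increment: instead of each active lane issuing its
// own atomicAdd on the shared counter, the warp ballots its active lanes,
// the first active lane reserves `total` slots with a single atomicAdd, and
// every lane derives its private slot as start + (number of active lanes
// with a lower lane id). Example: with all 32 lanes active, one
// atomicAdd(counter, 32) replaces 32 serialized atomics, and lane n
// receives start + n.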

template <typename T>
__forceinline__ __device__
void storeForces(const T fx, const T fy, const T fz,
  const int ind, const int stride,
  T* force,
  T* forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts) {

#if defined(USE_BONDED_FORCE_ATOMIC_STORE)
#if defined(NAMD_HIP)
  // Try to decrease conflicts between lanes if there are repeating ind
  WarpMask mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, 1);
  if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
    const int laneId = threadIdx.x % WARPSIZE;
    const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
    const bool isHead = laneId == 0 || ind != prevLaneInd;
    if (!NAMD_WARP_ALL(WARP_FULL_MASK, isHead)) {
      // There are segments of repeating ind
      typedef cub::WarpReduce<T> WarpReduce;
      __shared__ typename WarpReduce::TempStorage temp_storage;
      const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
      const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
      const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
      if (isHead) {
        atomicAdd(&force[ind           ], sumfx);
        atomicAdd(&force[ind + stride  ], sumfy);
        atomicAdd(&force[ind + stride*2], sumfz);
      }
      return;
    }
  }
  // Not all lanes are active (the last block) or there is no repeating ind
  atomicAdd(&force[ind           ], fx);
  atomicAdd(&force[ind + stride  ], fy);
  atomicAdd(&force[ind + stride*2], fz);
#else
  atomicAdd(&force[ind           ], fx);
  atomicAdd(&force[ind + stride  ], fy);
  atomicAdd(&force[ind + stride*2], fz);
#endif
#else
  const int newPos = warpAggregatingAtomicInc(forceListCounter);
  forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
  forceList[newPos * 3 + 0] = fx;
  forceList[newPos * 3 + 1] = fy;
  forceList[newPos * 3 + 2] = fz;
#endif
}
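
// storeForces supports two accumulation strategies. With
// USE_BONDED_FORCE_ATOMIC_STORE the force components are added directly
// with atomics (the HIP path first warp-reduces segments that target the
// same atom to cut contention). Otherwise each contribution is appended to
// a per-atom linked list: warpAggregatingAtomicInc reserves a slot,
// atomicExch pushes it onto forceListStarts[ind], and a later gather pass
// (assumed to live elsewhere in this file) walks the lists and sums them.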

//
// Calculates bonds
//
template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
__device__ void bondForce(
  const int index,
  const CudaBond* __restrict__ bondList,
  const CudaBondValue* __restrict__ bondValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
#endif
  double &energy_F, double &energy_TI_1, double &energy_TI_2
  ) {

  CudaBond bl = bondList[index];
  CudaBondValue bondValue = bondValues[bl.itype];
//  if (bondValue.x0 == 0.0f) return;

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r = sqrtf(xij*xij + yij*yij + zij*zij);

  float db = r - bondValue.x0;
  if (bondValue.x1) {
    // in this case, the bond represents a harmonic wall potential
    // where x0 is the lower wall and x1 is the upper
    db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
  }
  float fij = db * bondValue.k * bl.scale;

  // Define a temporary variable to store the energy,
  // used by both the alchemical and non-alchemical routes
  float energy_tmp = 0.0f;
  if (doEnergy) {
    energy_tmp += fij * db;
  }

  // Alchemical route
  // JM: TODO: move this out of here
  float alch_scale = 1.0f;
  if (doFEP || doTI) {
    switch (bl.fepBondedType) {
      case 1: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
        if (doTI) {
          energy_TI_1 += (double)(energy_tmp);
        }
        if (doFEP) {
          energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
        }
        break;
      }
      case 2: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
        if (doTI) {
          energy_TI_2 += (double)(energy_tmp);
        }
        if (doFEP) {
          energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
        }
        break;
      }
    }
  }

  if (doEnergy) {
    if (doFEP || doTI) {
      energy += (double)(energy_tmp * alch_scale);
    } else {
      energy += (double)(energy_tmp);
    }
  }
  fij *= -2.0f/r;
  // XXX TODO: Get this off here
  // XXX TODO: Decide on a templated parameter nomenclature
  if (doFEP || doTI) {
    fij *= alch_scale;
  }

  T T_fxij, T_fyij, T_fzij;
  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);

  // Store forces
  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxij = fij*xij;
    float fyij = fij*yij;
    float fzij = fij*zij;
    virial.xx = (fxij*xij);
    virial.xy = (fxij*yij);
    virial.xz = (fxij*zij);
    virial.yx = (fyij*xij);
    virial.yy = (fyij*yij);
    virial.yz = (fyij*zij);
    virial.zx = (fzij*xij);
    virial.zy = (fzij*yij);
    virial.zz = (fzij*zij);
#endif
  }
}
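
// For the harmonic bond E = s*k*(r - x0)^2 (with s = bl.scale), the code
// folds the algebra into fij: first fij = s*k*db so that
// energy_tmp = fij*db = s*k*db^2, then fij *= -2/r turns the same variable
// into the scalar -(dE/dr)/r, so multiplying by the components
// (xij, yij, zij) in calcComponentForces yields the force vector on atom i
// directly, and its negation the force on atom j.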

//
// Calculates modified exclusions
//
template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
__device__ void modifiedExclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const bool doSlow,
  const float one_scale14,  // 1 - scale14
  const int vdwCoefTableWidth,
#if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
  const float2* __restrict__ vdwCoefTable,
#else
  cudaTextureObject_t vdwCoefTableTex,
#endif
#ifdef USE_TABLE_ARRAYS
  const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
#else
  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
#endif
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  const CudaNBConstants nbConstants,
  double &energyVdw,
  T* __restrict__ forceNbond, double &energyNbond,
  T* __restrict__ forceSlow, double &energySlow,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
#endif
  double &energyVdw_F, double &energyVdw_TI_1, double &energyVdw_TI_2,
  double &energyNbond_F, double &energyNbond_TI_1, double &energyNbond_TI_2,
  double &energySlow_F, double &energySlow_TI_1, double &energySlow_TI_2
  ) {

  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {

    float rinv = rsqrtf(r2);

    float qq;
    if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;

    int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
#if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
    float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
#else
    float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
#endif

    // Alchemical route
    float myElecLambda = 1.0f;
    float myElecLambda2 = 1.0f;
    float myVdwLambda = 1.0f;
    float myVdwLambda2 = 1.0f;
    float myVdwShift = 0.0f;
    float myVdwShift2 = 0.0f;
    double alch_vdw_energy;
    double alch_vdw_energy_2;
    float alch_vdw_force = 0.0f;
    float alch_vdw_dUdl = 0.0f;
    double* p_energyVdw_TI = NULL;
    double* p_energyNbond_TI = NULL;
    double* p_energySlow_TI = NULL;
    float p0Factor;
    if (doFEP || doTI) {
      p0Factor = 0.0f;
      /* lambda values 'up' are for atoms scaled up with lambda (partition 1) */
      float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
      float vdwLambdaUp = (AlchBondedCUDA::alchLambdas).vdwLambdaUp;
      float vdwShiftUp = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaUp);
      float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
      float vdwLambda2Up = (AlchBondedCUDA::alchLambdas).vdwLambda2Up;
      float vdwShift2Up = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Up);
      /* lambda values 'down' are for atoms scaled down with lambda (partition 2) */
      float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
      float vdwLambdaDown = (AlchBondedCUDA::alchLambdas).vdwLambdaDown;
      float vdwShiftDown = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaDown);
      float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
      float vdwLambda2Down = (AlchBondedCUDA::alchLambdas).vdwLambda2Down;
      float vdwShift2Down = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Down);
      switch (bl.pswitch) {
        case 0: myVdwLambda = 1.0f;
                myVdwLambda2 = 1.0f;
                myElecLambda = 1.0f;
                myElecLambda2 = 1.0f;
                p0Factor = 1.0f;
                break;
        case 1: myVdwLambda = vdwLambdaUp;
                myVdwLambda2 = vdwLambda2Up;
                myElecLambda = elecLambdaUp;
                myElecLambda2 = elecLambda2Up;
                myVdwShift = vdwShiftUp;
                myVdwShift2 = vdwShift2Up;
                p_energyVdw_TI = &(energyVdw_TI_1);
                p_energyNbond_TI = &(energyNbond_TI_1);
                p_energySlow_TI = &(energySlow_TI_1);
                break;
        case 2: myVdwLambda = vdwLambdaDown;
                myVdwLambda2 = vdwLambda2Down;
                myElecLambda = elecLambdaDown;
                myElecLambda2 = elecLambda2Down;
                myVdwShift = vdwShiftDown;
                myVdwShift2 = vdwShift2Down;
                p_energyVdw_TI = &(energyVdw_TI_2);
                p_energyNbond_TI = &(energyNbond_TI_2);
                p_energySlow_TI = &(energySlow_TI_2);
                break;
        default: myVdwLambda = 0.0f;
                 myVdwLambda2 = 0.0f;
                 myElecLambda = 0.0f;
                 myElecLambda2 = 0.0f;
                 break;
      }
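      // The block below implements the separation-shifted (soft-core)
      // van der Waals scaling for alchemical pairs: the LJ interaction is
      // evaluated at a shifted squared distance r2 + myVdwShift, where
      // myVdwShift = alchVdwShiftCoeff * (1 - lambda), so the potential
      // stays finite as r -> 0 while recovering plain LJ at lambda = 1.
      // U1/U2 are the LJ energies at the two lambda windows, and
      // switchmul/switchmul2 apply the standard switching function between
      // switchDist2 and cutoff2.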
      if (bl.pswitch != 99 && bl.pswitch != 0) {
        // Common part of FEP and TI
        const float& switchOn2 = (AlchBondedCUDA::alchParams).switchDist2;
        const float switchfactor = 1.0f / ((cutoff2 - switchOn2) * (cutoff2 - switchOn2) * (cutoff2 - switchOn2));
        const float switchmul = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * (cutoff2 - r2) * (cutoff2 - 3.0f * switchOn2 + 2.0f * r2)) : 1.0f);
        const float switchmul2 = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * 12.0f * (r2 - switchOn2)) : 0.0f);
        // A = ljab.x ; B = ljab.y
        const float r2_1 = 1.0f / (r2 + myVdwShift);
        const float r2_2 = 1.0f / (r2 + myVdwShift2);
        const float r6_1 = r2_1 * r2_1 * r2_1;
        const float r6_2 = r2_2 * r2_2 * r2_2;
        const float U1 = ljab.x * r6_1 * r6_1 - ljab.y * r6_1;
        const float U2 = ljab.x * r6_2 * r6_2 - ljab.y * r6_2;
        // rinv is already calculated above
        if (doEnergy) {
          alch_vdw_energy = double(myVdwLambda * switchmul * U1);
          if (doFEP) {
            alch_vdw_energy_2 = double(myVdwLambda2 * switchmul * U2);
          }
        }
        alch_vdw_force = myVdwLambda * (switchmul * (12.0f * U1 + 6.0f * ljab.y * r6_1) * r2_1 + switchmul2 * U1);
        if (doTI) {
          alch_vdw_dUdl = switchmul * (U1 + myVdwLambda * (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (6.0f * U1 + 3.0f * ljab.y * r6_1) * r2_1);
        }
      } else {
        if (doEnergy) {
          alch_vdw_energy = 0.0;
          alch_vdw_energy_2 = 0.0;
        }
      }
    }

    float4 fi;
    float4 ei;
    if (doTable) {
#ifdef NAMD_HIP
#ifdef USE_TABLE_ARRAYS
      fi = tableLookup(forceTable, rinv);
#else
      fi = sampleTableTex(forceTableTex, rinv);
#endif
#else
      fi = tex1D<float4>(forceTableTex, rinv);
#endif
    }

    if (doEnergy && doTable) {
#ifdef NAMD_HIP
#ifdef USE_TABLE_ARRAYS
      ei = tableLookup(energyTable, rinv);
#else
      ei = sampleTableTex(energyTableTex, rinv);
#endif
#else
      ei = tex1D<float4>(energyTableTex, rinv);
#endif
      if (doFEP || doTI) {
        energyVdw += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda * p0Factor);
        energyVdw += alch_vdw_energy;
        if (doFEP) {
          energyVdw_F += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda2 * p0Factor);
          energyVdw_F += alch_vdw_energy_2;
        }
        if (doTI) {
          if (p_energyVdw_TI != NULL) {
            (*p_energyVdw_TI) += alch_vdw_dUdl;
          }
        }
      } else {
        energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
      }
      if (doElect) {
        energyNbond += qq * ei.x * myElecLambda;
        if (doFEP) {
          energyNbond_F += qq * ei.x * myElecLambda2;
        }
        if (doTI) {
          if (p_energyNbond_TI != NULL) (*p_energyNbond_TI) += qq * ei.x;
        }
        if (doSlow) {
          energySlow += qq * ei.w * myElecLambda;
          if (doFEP) {
            energySlow_F += qq * ei.w * myElecLambda2;
          }
          if (doTI) {
            if (p_energySlow_TI != NULL) (*p_energySlow_TI) += qq * ei.w;
          }
        }
      }
    }

    float fNbond;
    float fSlow;
    if (doTable) {
      if (doElect) {
        fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
      } else {
        fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
      }
    } else {
      float e_vdw, e_elec, e_slow;
      if (doEnergy) {
        e_vdw = 0.0f;
        e_elec = 0.0f;
        e_slow = 0.0f;
      }

      cudaModExclForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy>(
        doSlow, doElect, r2, rinv, qq, ljab,
        nbConstants, fNbond, fSlow, e_vdw, e_elec, e_slow);

      if (doEnergy) {
        energyVdw += (double)(e_vdw);
        energyNbond += (double)(e_elec * myElecLambda);
        energySlow += (double)(e_slow * myElecLambda);
      }
    }
    // fNbond = fFast + fVdw
    if (doFEP || doTI) {
      if (bl.pswitch != 0) {
        fNbond -= -(ljab.x * fi.z + ljab.y * fi.y);
        fNbond = fNbond * myElecLambda + alch_vdw_force * float(bl.pswitch == 1 || bl.pswitch == 2);
      }
      // if pswitch == 0, myElecLambda = 1.0
    }
    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);

    if (doSlow && doElect) {
      if (doTable) {
        fSlow = -qq * fi.w;
      }
      if (doFEP || doTI) {
        fSlow *= myElecLambda;
      }
      T T_fxij, T_fyij, T_fzij;
      calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
      storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
      storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
    }

    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fNbond*xij;
      float fyij = fNbond*yij;
      float fzij = fNbond*zij;
      virialNbond.xx = (fxij*xij);
      virialNbond.xy = (fxij*yij);
      virialNbond.xz = (fxij*zij);
      virialNbond.yx = (fyij*xij);
      virialNbond.yy = (fyij*yij);
      virialNbond.yz = (fyij*zij);
      virialNbond.zx = (fzij*xij);
      virialNbond.zy = (fzij*yij);
      virialNbond.zz = (fzij*zij);
#endif
    }

    // Store virial
    if (doVirial && doSlow && doElect) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }

  }
}

//
// Calculates exclusions. Here doSlow = true
//
// doFull = doFullElectrostatics = doSlow
template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
__device__ void exclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const float r2_delta, const int r2_delta_expc,

#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
#else
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,
#endif
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  T* __restrict__ forceSlow, double &energySlow,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStartsSlow, int* forceListNexts,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
#endif
  double &energy_F, double &energy_TI_1, double &energy_TI_2
  ) {

  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {
    r2 += r2_delta;

    union { float f; int i; } r2i;
    r2i.f = r2;
    int table_i = (r2i.i >> 17) + r2_delta_expc;  // table_i >= 0
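    // Table-index trick: reinterpreting the float r2 as an int and shifting
    // right by 17 keeps the 8 exponent bits plus the top 6 mantissa bits,
    // giving a roughly logarithmically spaced table index; r2_delta_expc
    // rebases it so table_i starts at 0 for the smallest admissible r2.
    // diffa below is the remainder used for polynomial interpolation within
    // the selected table interval.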

#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
#ifdef NAMD_HIP
    float r2_table_val = r2_table[table_i];
#else
    float r2_table_val = __ldg(&r2_table[table_i]);
#endif
#else
    float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
#endif
    float diffa = r2 - r2_table_val;
    float qq = xyzqi.w * xyzqj.w;

#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
#ifdef NAMD_HIP
    float4 slow = exclusionTable[table_i];
#else
    float4 slow = __ldg(&exclusionTable[table_i]);
#endif
#else
    float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
#endif
    // Alchemical route
    float myElecLambda;
    float myElecLambda2;
    double* p_energy_TI;
    if (doFEP || doTI) {
      myElecLambda = 1.0f;
      myElecLambda2 = 1.0f;
      if (doTI) p_energy_TI = NULL;
      /* lambda values 'up' are for atoms scaled up with lambda (partition 1) */
      float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
      float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
      /* lambda values 'down' are for atoms scaled down with lambda (partition 2) */
      float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
      float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
      switch (bl.pswitch) {
        case 0: myElecLambda = 1.0f;
                myElecLambda2 = 1.0f;
                break;
        case 1: myElecLambda = elecLambdaUp;
                myElecLambda2 = elecLambda2Up;
                p_energy_TI = &(energy_TI_1);
                break;
        case 2: myElecLambda = elecLambdaDown;
                myElecLambda2 = elecLambda2Down;
                p_energy_TI = &(energy_TI_2);
                break;
        default: myElecLambda = 0.0f;
                 myElecLambda2 = 0.0f;
                 break;
      }
    }
    if (doEnergy) {
      double energy_slow_tmp = (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y) * diffa + 0.5f*slow.z) * diffa + slow.w));
      if (doFEP || doTI) {
        energySlow += energy_slow_tmp * myElecLambda;
        if (doFEP) {
          energy_F += energy_slow_tmp * myElecLambda2;
        }
        if (doTI) {
          if (p_energy_TI != NULL) {
            (*p_energy_TI) += energy_slow_tmp;
          }
        }
      } else {
        energySlow += energy_slow_tmp;
      }
    }

    float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
    if (doFEP || doTI) {
      fSlow *= myElecLambda;
    }

    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);

    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }
  }
}

template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
__device__ void angleForce(const int index,
  const CudaAngle* __restrict__ angleList,
  const CudaAngleValue* __restrict__ angleValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
#endif
  double &energy_F, double &energy_TI_1, double &energy_TI_2
  ) {

  CudaAngle al = angleList[index];

  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;

  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;

  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;

  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;

  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);

  float xijr = xij*rij_inv;
  float yijr = yij*rij_inv;
  float zijr = zij*rij_inv;
  float xkjr = xkj*rkj_inv;
  float ykjr = ykj*rkj_inv;
  float zkjr = zkj*rkj_inv;
  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;

  CudaAngleValue angleValue = angleValues[al.itype];
  angleValue.k *= al.scale;

  float diff;
  if (angleValue.normal == 1) {
    // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
    cos_theta = min(0.999f, max(-0.999f, cos_theta));
    float theta = acosf(cos_theta);
    diff = theta - angleValue.theta0;
  } else {
    diff = cos_theta - angleValue.theta0;
  }

  float energy_tmp = 0.0f;
  if (doEnergy) {
    energy_tmp += angleValue.k * diff * diff;
  }

  // Alchemical route
  float alch_scale;
  // Be careful: alch_scale_energy_fep is 0 here!
  float alch_scale_energy_fep;
  double* p_energy_TI;
  // NOTE: Because of the Urey-Bradley terms, the energy calculation is not
  // finished yet, so we cannot add energy_tmp to energy_TI_* and energy_F
  // here, but we still need the scaling factor for the forces. The code
  // therefore differs slightly from bondForce: compute a scale factor for
  // the FEP energy first, apply it to the final result once the energy
  // evaluation is complete, and keep a pointer to the correct energy_TI_x
  // term.
  if (doFEP || doTI) {
    alch_scale = 1.0f;
    alch_scale_energy_fep = 0.0f;
    if (doTI) p_energy_TI = NULL;
    switch (al.fepBondedType) {
      case 1: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
        if (doEnergy) {
          if (doTI) {
            p_energy_TI = &(energy_TI_1);
          }
          if (doFEP) {
            alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
          }
        }
        break;
      }
      case 2: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
        if (doEnergy) {
          if (doTI) {
            p_energy_TI = &(energy_TI_2);
          }
          if (doFEP) {
            alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
          }
        }
        break;
      }
    }
  }

  if (angleValue.normal == 1) {
    float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
    if (inv_sin_theta > 1.0e6) {
      diff = (diff < 0.0f) ? 1.0f : -1.0f;
    } else {
      diff *= -inv_sin_theta;
    }
  }
  diff *= -2.0f*angleValue.k;

  // Do alchemical scaling
  if (doFEP || doTI) {
    diff *= alch_scale;
  }

  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);

  T T_dtxi, T_dtyi, T_dtzi;
  T T_dtxj, T_dtyj, T_dtzj;
  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
  T T_dtxk = -T_dtxi - T_dtxj;
  T T_dtyk = -T_dtyi - T_dtyj;
  T T_dtzk = -T_dtzi - T_dtzj;
  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  if (angleValue.k_ub) {
    float xik = xij - xkj;
    float yik = yij - ykj;
    float zik = zij - zkj;
    float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
    float rik = 1.0f/rik_inv;
    float diff_ub = rik - angleValue.r_ub;
    if (doEnergy) {
      energy_tmp += angleValue.k_ub * diff_ub * diff_ub;
    }
    diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
    if (doFEP || doTI) {
      diff_ub *= alch_scale;
    }
    T T_dxik, T_dyik, T_dzik;
    calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
    T_dtxi += T_dxik;
    T_dtyi += T_dyik;
    T_dtzi += T_dzik;
    T_dtxj -= T_dxik;
    T_dtyj -= T_dyik;
    T_dtzj -= T_dzik;
  }

  if (doEnergy) {
    if (doFEP || doTI) {
      energy += double(energy_tmp * alch_scale);
      if (doTI) {
        if (p_energy_TI != NULL) {
          *(p_energy_TI) += double(energy_tmp);
        }
      }
      if (doFEP) {
        energy_F += double(energy_tmp * alch_scale_energy_fep);
      }
    } else {
      energy += double(energy_tmp);
    }
  }

  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxi = ((float)T_dtxi);
    float fyi = ((float)T_dtyi);
    float fzi = ((float)T_dtzi);
    float fxk = ((float)T_dtxj);
    float fyk = ((float)T_dtyj);
    float fzk = ((float)T_dtzj);
    virial.xx = (fxi*xij) + (fxk*xkj);
    virial.xy = (fxi*yij) + (fxk*ykj);
    virial.xz = (fxi*zij) + (fxk*zkj);
    virial.yx = (fyi*xij) + (fyk*xkj);
    virial.yy = (fyi*yij) + (fyk*ykj);
    virial.yz = (fyi*zij) + (fyk*zkj);
    virial.zx = (fzi*xij) + (fzk*xkj);
    virial.zy = (fzi*yij) + (fzk*ykj);
    virial.zz = (fzi*zij) + (fzk*zkj);
#endif
  }
}

//
// Dihedral computation
//
// Out: df, e
//
template <bool doEnergy>
__forceinline__ __device__
void diheComp(const CudaDihedralValue* dihedralValues, int ic,
  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {

  df = 0.0f;
  if (doEnergy) e = 0.0;

  float phi = atan2f(sin_phi, cos_phi);

  bool lrep = true;
  while (lrep) {
    CudaDihedralValue dihedralValue = dihedralValues[ic];
    dihedralValue.k *= scale;

    // Last dihedral has n low bit set to 0
    lrep = (dihedralValue.n & 1);
    dihedralValue.n >>= 1;
    if (dihedralValue.n) {
      float nf = dihedralValue.n;
      float x = nf*phi - dihedralValue.delta;
      if (doEnergy) {
        float sin_x, cos_x;
        sincosf(x, &sin_x, &cos_x);
        e += (double)(dihedralValue.k*(1.0f + cos_x));
        df += (double)(nf*dihedralValue.k*sin_x);
      } else {
        float sin_x = sinf(x);
        df += (double)(nf*dihedralValue.k*sin_x);
      }
    } else {
      float diff = phi - dihedralValue.delta;
      if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
      if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
      if (doEnergy) {
        e += (double)(dihedralValue.k*diff*diff);
      }
      df -= 2.0f*dihedralValue.k*diff;
    }
    ic++;
  }

}
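
// Multi-term dihedrals are stored consecutively in dihedralValues: the low
// bit of n flags "another term follows" and the remaining bits hold the
// true periodicity, hence the (n & 1) test and n >>= 1 above. A term whose
// periodicity is 0 after the shift is treated as a harmonic restraint
// k*(phi - delta)^2 instead of the cosine form k*(1 + cos(n*phi - delta)).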


template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
__device__ void diheForce(const int index,
  const CudaDihedral* __restrict__ diheList,
  const CudaDihedralValue* __restrict__ dihedralValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
#endif
  double &energy_F, double &energy_TI_1, double &energy_TI_2
  ) {

  CudaDihedral dl = diheList[index];

  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;

  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;

  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;

  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;

  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;

  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;

  // A=F^G, B=H^G.
  float ax = yij*zjk - zij*yjk;
  float ay = zij*xjk - xij*zjk;
  float az = xij*yjk - yij*xjk;
  float bx = ylk*zjk - zlk*yjk;
  float by = zlk*xjk - xlk*zjk;
  float bz = xlk*yjk - ylk*xjk;

  float ra2 = ax*ax + ay*ay + az*az;
  float rb2 = bx*bx + by*by + bz*bz;
  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);

  // if ((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
  //   nlinear = nlinear + 1
  // endif

  float rgr = 1.0f / rg;
  float ra2r = 1.0f / ra2;
  float rb2r = 1.0f / rb2;
  float rabr = sqrtf(ra2r*rb2r);

  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
  //
  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
  // which simplifies to sin(phi)=|G|H.A/(|A|.|B|)
  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
  //
  // Energy and derivative contributions.

  float df;
  double e;
  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);

  // Alchemical transformation
  float alch_scale;
  if (doFEP || doTI) {
    alch_scale = 1.0f;
    switch (dl.fepBondedType) {
      case 1: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
        if (doEnergy) {
          if (doTI) {
            energy_TI_1 += (double)(e);
          }
          if (doFEP) {
            energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
          }
        }
        break;
      }
      case 2: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
        if (doEnergy) {
          if (doTI) {
            energy_TI_2 += (double)(e);
          }
          if (doFEP) {
            energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
          }
        }
        break;
      }
    }
  }

  if (doEnergy) {
    if (doFEP || doTI) {
      energy += e * alch_scale;
    } else {
      energy += e;
    }
  }

  //
  // Compute derivatives wrt cartesian coordinates.
  //
  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)

  float fg = xij*xjk + yij*yjk + zij*zjk;
  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
  ra2r *= df;
  rb2r *= df;
  float fga = fg*ra2r*rgr;
  float hgb = hg*rb2r*rgr;
  float gaa = ra2r*rg;
  float gbb = rb2r*rg;
  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.

  // Remember T is long long int
  // Don't try to scale T_fix and similar variables directly by float
  if (doFEP || doTI) {
    fga *= alch_scale;
    hgb *= alch_scale;
    gaa *= alch_scale;
    gbb *= alch_scale;
  }

  // Store forces
  T T_fix, T_fiy, T_fiz;
  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  T dgx, dgy, dgz;
  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
  T T_fjx = dgx - T_fix;
  T T_fjy = dgy - T_fiy;
  T T_fjz = dgz - T_fiz;
  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  T dhx, dhy, dhz;
  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
  T T_fkx = -dhx - dgx;
  T T_fky = -dhy - dgy;
  T T_fkz = -dhz - dgz;
  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fix = ((float)T_fix);
    float fiy = ((float)T_fiy);
    float fiz = ((float)T_fiz);
    float fjx = ((float)dgx);
    float fjy = ((float)dgy);
    float fjz = ((float)dgz);
    float flx = ((float)dhx);
    float fly = ((float)dhy);
    float flz = ((float)dhz);
    virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
    virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
    virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
    virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
    virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
    virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
    virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
    virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
    virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
#endif
  }

}

__device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
  return make_float3(
    v1.y*v2.z-v2.y*v1.z,
    v2.x*v1.z-v1.x*v2.z,
    v1.x*v2.y-v2.x*v1.y
  );
}

__device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
}

//
// Calculates crossterms
//
template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
__device__ void crosstermForce(
  const int index,
  const CudaCrossterm* __restrict__ crosstermList,
  const CudaCrosstermValue* __restrict__ crosstermValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
#endif
  double &energy_F, double &energy_TI_1, double &energy_TI_2
  ) {

  CudaCrossterm cl = crosstermList[index];

  // ----------------------------------------------------------------------------
  // Angle between 1 - 2 - 3 - 4
  //
  // 1 - 2
  float3 sh12 = make_float3(
    cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
    cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
    cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);

  float4 xyzq1 = xyzq[cl.i1];
  float4 xyzq2 = xyzq[cl.i2];

  float3 r12 = make_float3(
    xyzq1.x + sh12.x - xyzq2.x,
    xyzq1.y + sh12.y - xyzq2.y,
    xyzq1.z + sh12.z - xyzq2.z);

  // 2 - 3
  float3 sh23 = make_float3(
    cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
    cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
    cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);

  float4 xyzq3 = xyzq[cl.i3];

  float3 r23 = make_float3(
    xyzq2.x + sh23.x - xyzq3.x,
    xyzq2.y + sh23.y - xyzq3.y,
    xyzq2.z + sh23.z - xyzq3.z);

  // 3 - 4
  float3 sh34 = make_float3(
    cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
    cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
    cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);

  float4 xyzq4 = xyzq[cl.i4];

  float3 r34 = make_float3(
    xyzq3.x + sh34.x - xyzq4.x,
    xyzq3.y + sh34.y - xyzq4.y,
    xyzq3.z + sh34.z - xyzq4.z);

  // Calculate the cross products
  float3 A = cross(r12, r23);
  float3 B = cross(r23, r34);
  float3 C = cross(r23, A);

  // Calculate the inverse distances
  float inv_rA = rsqrtf(dot(A, A));
  float inv_rB = rsqrtf(dot(B, B));
  float inv_rC = rsqrtf(dot(C, C));

  // Calculate the sin and cos
  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
  float sin_phi = dot(C, B)*(inv_rC*inv_rB);

  float phi = -atan2f(sin_phi, cos_phi);

  // ----------------------------------------------------------------------------
  // Angle between 5 - 6 - 7 - 8
  //

  // 5 - 6
  float3 sh56 = make_float3(
    cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
    cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
    cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);

  float4 xyzq5 = xyzq[cl.i5];
  float4 xyzq6 = xyzq[cl.i6];

  float3 r56 = make_float3(
    xyzq5.x + sh56.x - xyzq6.x,
    xyzq5.y + sh56.y - xyzq6.y,
    xyzq5.z + sh56.z - xyzq6.z);

  // 6 - 7
  float3 sh67 = make_float3(
    cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
    cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
    cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);

  float4 xyzq7 = xyzq[cl.i7];

  float3 r67 = make_float3(
    xyzq6.x + sh67.x - xyzq7.x,
    xyzq6.y + sh67.y - xyzq7.y,
    xyzq6.z + sh67.z - xyzq7.z);

  // 7 - 8
  float3 sh78 = make_float3(
    cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
    cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
    cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);

  float4 xyzq8 = xyzq[cl.i8];

  float3 r78 = make_float3(
    xyzq7.x + sh78.x - xyzq8.x,
    xyzq7.y + sh78.y - xyzq8.y,
    xyzq7.z + sh78.z - xyzq8.z);

  // Calculate the cross products
  float3 D = cross(r56, r67);
  float3 E = cross(r67, r78);
  float3 F = cross(r67, D);

  // Calculate the inverse distances
  float inv_rD = rsqrtf(dot(D, D));
  float inv_rE = rsqrtf(dot(E, E));
  float inv_rF = rsqrtf(dot(F, F));

  // Calculate the sin and cos
  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
  float sin_psi = dot(F, E)*(inv_rF*inv_rE);

  float psi = -atan2f(sin_psi, cos_psi);

  // ----------------------------------------------------------------------------
  // Calculate the energy

  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );

  // Shift angles
  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);

  // distance measured in grid points between angle and smallest value
  float phi_h = phi * inv_h;
  float psi_h = psi * inv_h;

  // find smallest numbered grid point in stencil
  int iphi = (int)floor(phi_h);
  int ipsi = (int)floor(psi_h);

  const CudaCrosstermValue& cp = crosstermValues[cl.itype];

  // Load coefficients
  float4 c[4];
  c[0] = cp.c[iphi][ipsi][0];
  c[1] = cp.c[iphi][ipsi][1];
  c[2] = cp.c[iphi][ipsi][2];
  c[3] = cp.c[iphi][ipsi][3];

  float dphi = phi_h - (float)iphi;
  float dpsi = psi_h - (float)ipsi;
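
  // The CMAP energy surface is tabulated on a dim x dim grid over
  // (phi, psi); c[0..3] hold the bicubic-spline coefficients of the cell
  // containing the point, and (dphi, dpsi) are fractional offsets within
  // that cell. The energy below is the bicubic polynomial
  // E = sum_{m,n=0..3} c_mn * dphi^m * dpsi^n (m indexing the float4
  // components) evaluated by Horner's rule, and dEdphi/dEdpsi are its
  // analytic partial derivatives.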

  float alch_scale;
  float alch_scale_energy_fep;
  double* p_energy_TI;
  if (doFEP || doTI) {
    alch_scale = 1.0f;
    alch_scale_energy_fep = 0.0f;
    if (doTI) p_energy_TI = NULL;
    switch (cl.fepBondedType) {
      case 1: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
        if (doEnergy) {
          if (doTI) {
            p_energy_TI = &(energy_TI_1);
          }
          if (doFEP) {
            alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
          }
        }
        break;
      }
      case 2: {
        alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
        if (doEnergy) {
          if (doTI) {
            p_energy_TI = &(energy_TI_2);
          }
          if (doFEP) {
            alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
          }
        }
        break;
      }
    }
  }

  if (doEnergy) {
    float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
    energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
    energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
    energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
    if (doFEP || doTI) {
      energy += energyf * cl.scale * alch_scale;
      if (doFEP) {
        energy_F += energyf * cl.scale * alch_scale_energy_fep;
      }
      if (doTI) {
        if (p_energy_TI != NULL) {
          (*p_energy_TI) += energyf * cl.scale;
        }
      }
    } else {
      energy += energyf * cl.scale;
    }
  }

  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) ));  // 13 muls
  dEdphi *= cl.scale*inv_h;

  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) ));  // 13 muls
  dEdpsi *= cl.scale*inv_h;

  // float normCross1 = dot(A, A);
  float square_r23 = dot(r23, r23);
  float norm_r23 = sqrtf(square_r23);
  float inv_square_r23 = 1.0f/square_r23;
  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
  float ff2 = -dot(r12, r23)*inv_square_r23;
  float ff3 = -dot(r34, r23)*inv_square_r23;
  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;

  // NOTE: Scaling only ff1 and ff4 is enough: f1 and f4 are then already
  // scaled, and in t1.x = ff2*f1.x - ff3*f4.x both terms are scaled, so t1
  // is scaled (likewise t1.y and t1.z). Then f2.x = t1.x - f1.x is a
  // difference of scaled terms, so f2 is scaled, and similarly f3.
  // As a result, scaling ff1 and ff4 suffices. DON'T scale ff2 and ff3!
  if (doFEP || doTI) {
    ff1 *= alch_scale;
    ff4 *= alch_scale;
  }

  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);

  T T_f1x, T_f1y, T_f1z;
  T T_f2x, T_f2y, T_f2z;
  T T_f3x, T_f3y, T_f3z;
  T T_f4x, T_f4y, T_f4z;
  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  float square_r67 = dot(r67, r67);
  float norm_r67 = sqrtf(square_r67);
  float inv_square_r67 = 1.0f/(square_r67);
  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
  ff2 = -dot(r56, r67)*inv_square_r67;
  ff3 = -dot(r78, r67)*inv_square_r67;
  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;

  if (doFEP || doTI) {
    ff1 *= alch_scale;
    ff4 *= alch_scale;
  }

  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );

  T T_f5x, T_f5y, T_f5z;
  T T_f6x, T_f6y, T_f6z;
  T T_f7x, T_f7y, T_f7z;
  T T_f8x, T_f8y, T_f8z;
  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
    float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
    virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
    virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
    virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
    virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
    virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
    virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
    virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
    virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
    virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
#endif
  }

}
1579 
1580 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1581 __device__ void tholeForce(
1582  const int index,
1583  const CudaThole* __restrict__ tholeList,
1584  const float4* __restrict__ xyzq,
1585  const int stride,
1586  const float3 lata, const float3 latb, const float3 latc,
1587  T* __restrict__ force, double &energy,
1588  T* __restrict__ forceList, int* forceListCounter,
1589  int* forceListStarts, int* forceListNexts,
1590 #ifdef WRITE_FULL_VIRIALS
1591  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1592 #else
1593  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1594 #endif
1595  double &energy_F, double &energy_TI_1, double &energy_TI_2
1596 ) {
1597  // For reference, similar calculation can be found in OpenMM's code repo:
1598  // https://github.com/openmm/openmm/blob/8f123ef287693bb4b553163a9d5fd101ea9e4462/plugins/drude/platforms/common/src/kernels/drudePairForce.cc
1599  // CHARMM source code: charmm/source/gener/drude.F90, FUNCTION E_THOLE
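  // Reference sketch of the pair term implemented below (a descriptive note
  // derived from this code, not part of the sources cited above): with
  // screening parameter a = tl.aa and charge product qq (entering with +qq
  // for the atom-atom and Drude-Drude pairs, -qq for the mixed pairs), each
  // pair contributes
  //   U(r) = (qq/r) * (1 - (1 + a*r/2) * exp(-a*r))
  // and the force prefactor computed later is
  //   df = -(1/r) * dU/dr = -(qq/r^3) * ((1 + a*r + (a*r)^2/2) * exp(-a*r) - 1)
  // so that df times the pair displacement vector is the force on the first atom.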
1600  const CudaThole tl = tholeList[index];
1601 
1602  const float shx_aiaj = tl.offset_aiaj.x*lata.x + tl.offset_aiaj.y*latb.x + tl.offset_aiaj.z*latc.x;
1603  const float shy_aiaj = tl.offset_aiaj.x*lata.y + tl.offset_aiaj.y*latb.y + tl.offset_aiaj.z*latc.y;
1604  const float shz_aiaj = tl.offset_aiaj.x*lata.z + tl.offset_aiaj.y*latb.z + tl.offset_aiaj.z*latc.z;
1605 
1606  const float shx_aidj = tl.offset_aidj.x*lata.x + tl.offset_aidj.y*latb.x + tl.offset_aidj.z*latc.x;
1607  const float shy_aidj = tl.offset_aidj.x*lata.y + tl.offset_aidj.y*latb.y + tl.offset_aidj.z*latc.y;
1608  const float shz_aidj = tl.offset_aidj.x*lata.z + tl.offset_aidj.y*latb.z + tl.offset_aidj.z*latc.z;
1609 
1610  const float shx_diaj = tl.offset_diaj.x*lata.x + tl.offset_diaj.y*latb.x + tl.offset_diaj.z*latc.x;
1611  const float shy_diaj = tl.offset_diaj.x*lata.y + tl.offset_diaj.y*latb.y + tl.offset_diaj.z*latc.y;
1612  const float shz_diaj = tl.offset_diaj.x*lata.z + tl.offset_diaj.y*latb.z + tl.offset_diaj.z*latc.z;
1613 
1614  const float shx_didj = tl.offset_didj.x*lata.x + tl.offset_didj.y*latb.x + tl.offset_didj.z*latc.x;
1615  const float shy_didj = tl.offset_didj.x*lata.y + tl.offset_didj.y*latb.y + tl.offset_didj.z*latc.y;
1616  const float shz_didj = tl.offset_didj.x*lata.z + tl.offset_didj.y*latb.z + tl.offset_didj.z*latc.z;
1617 
1618  const float raa_x = xyzq[tl.i].x + shx_aiaj - xyzq[tl.k].x;
1619  const float raa_y = xyzq[tl.i].y + shy_aiaj - xyzq[tl.k].y;
1620  const float raa_z = xyzq[tl.i].z + shz_aiaj - xyzq[tl.k].z;
1621 
1622  const float rad_x = xyzq[tl.i].x + shx_aidj - xyzq[tl.l].x;
1623  const float rad_y = xyzq[tl.i].y + shy_aidj - xyzq[tl.l].y;
1624  const float rad_z = xyzq[tl.i].z + shz_aidj - xyzq[tl.l].z;
1625 
1626  const float rda_x = xyzq[tl.j].x + shx_diaj - xyzq[tl.k].x;
1627  const float rda_y = xyzq[tl.j].y + shy_diaj - xyzq[tl.k].y;
1628  const float rda_z = xyzq[tl.j].z + shz_diaj - xyzq[tl.k].z;
1629 
1630  const float rdd_x = xyzq[tl.j].x + shx_didj - xyzq[tl.l].x;
1631  const float rdd_y = xyzq[tl.j].y + shy_didj - xyzq[tl.l].y;
1632  const float rdd_z = xyzq[tl.j].z + shz_didj - xyzq[tl.l].z;
1633 
1634  const float raa_invlen = rnorm3df(raa_x, raa_y, raa_z);
1635  const float rad_invlen = rnorm3df(rad_x, rad_y, rad_z);
1636  const float rda_invlen = rnorm3df(rda_x, rda_y, rda_z);
1637  const float rdd_invlen = rnorm3df(rdd_x, rdd_y, rdd_z);
1638 
1639  const float auaa = tl.aa / raa_invlen;
1640  const float auad = tl.aa / rad_invlen;
1641  const float auda = tl.aa / rda_invlen;
1642  const float audd = tl.aa / rdd_invlen;
1643 
1644  // exp(-ar)
1645  const float expauaa = expf(-auaa);
1646  const float expauad = expf(-auad);
1647  const float expauda = expf(-auda);
1648  const float expaudd = expf(-audd);
1649 
1650  // (1 + ar/2)
1651  float polyauaa = 1.0f + 0.5f*auaa;
1652  float polyauad = 1.0f + 0.5f*auad;
1653  float polyauda = 1.0f + 0.5f*auda;
1654  float polyaudd = 1.0f + 0.5f*audd;
1655 
1656  if (doEnergy) {
1657  float ethole = 0;
1658  ethole += tl.qq * raa_invlen * (1.0f - polyauaa * expauaa);
1659  ethole += -tl.qq * rad_invlen * (1.0f - polyauad * expauad);
1660  ethole += -tl.qq * rda_invlen * (1.0f - polyauda * expauda);
1661  ethole += tl.qq * rdd_invlen * (1.0f - polyaudd * expaudd);
1662  if (doFEP || doTI) {
1663  switch (tl.fepBondedType) {
1664  case 1: {
1665  // 0 < typeSum && typeSum < order
1666  if (doTI) energy_TI_1 += (double)ethole;
1667  if (doFEP) {
1668  // elecLambda2Up = simParams->getElecLambda(alchLambda2)
1669  energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Up * ethole;
1670  // elecLambdaUp = simParams->getElecLambda(alchLambda)
1671  ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1672  }
1673  break;
1674  }
1675  case 2: {
1676  if (doTI) energy_TI_2 += (double)ethole;
1677  if (doFEP) {
1678  // elecLambda2Down = simParams->getElecLambda(1-alchLambda2)
1679  energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Down * ethole;
1680  // elecLambdaDown = simParams->getElecLambda(1-alchLambda)
1681  ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1682  }
1683  break;
1684  }
1685  }
1686  }
1687  energy += (double)ethole;
1688  }
1689 
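  // promote (1 + a*r/2) to (1 + a*r + (a*r)^2/2), the polynomial appearing in
  // the radial derivative of the screened Coulomb term (descriptive note)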
1690  polyauaa = 1.0f + auaa*polyauaa;
1691  polyauad = 1.0f + auad*polyauad;
1692  polyauda = 1.0f + auda*polyauda;
1693  polyaudd = 1.0f + audd*polyaudd;
1694 
1695  const float raa_invlen3 = raa_invlen * raa_invlen * raa_invlen;
1696  const float rad_invlen3 = rad_invlen * rad_invlen * rad_invlen;
1697  const float rda_invlen3 = rda_invlen * rda_invlen * rda_invlen;
1698  const float rdd_invlen3 = rdd_invlen * rdd_invlen * rdd_invlen;
1699 
1700  // df = -(1/r) (dU/dr), so that df * r_vec is the force on the first atom
1701  float dfaa = -tl.qq * raa_invlen3 * (polyauaa*expauaa - 1);
1702  float dfad = tl.qq * rad_invlen3 * (polyauad*expauad - 1);
1703  float dfda = tl.qq * rda_invlen3 * (polyauda*expauda - 1);
1704  float dfdd = -tl.qq * rdd_invlen3 * (polyaudd*expaudd - 1);
1705 
1706  if (doFEP || doTI) {
1707  switch (tl.fepBondedType) {
1708  case 1: {
1709  dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1710  dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1711  dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1712  dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1713  break;
1714  }
1715  case 2: {
1716  dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1717  dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1718  dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1719  dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1720  break;
1721  }
1722  }
1723  }
1724 
1725  T T_fx0, T_fy0, T_fz0;
1726  calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfad, rad_x, rad_y, rad_z, T_fx0, T_fy0, T_fz0);
1727  storeForces<T>(T_fx0, T_fy0, T_fz0, tl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1728 
1729  T T_fx1, T_fy1, T_fz1;
1730  calcComponentForces<T>(dfda, rda_x, rda_y, rda_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx1, T_fy1, T_fz1);
1731  storeForces<T>(T_fx1, T_fy1, T_fz1, tl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1732 
1733  T T_fx2, T_fy2, T_fz2;
1734  calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfda, rda_x, rda_y, rda_z, T_fx2, T_fy2, T_fz2);
1735  storeForces<T>(-T_fx2, -T_fy2, -T_fz2, tl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1736 
1737  T T_fx3, T_fy3, T_fz3;
1738  calcComponentForces<T>(dfad, rad_x, rad_y, rad_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx3, T_fy3, T_fz3);
1739  storeForces<T>(-T_fx3, -T_fy3, -T_fz3, tl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1740 
1741  if (doVirial) {
1742 #ifdef WRITE_FULL_VIRIALS
1743  const float faa_x = dfaa * raa_x;
1744  const float faa_y = dfaa * raa_y;
1745  const float faa_z = dfaa * raa_z;
1746  const float fad_x = dfad * rad_x;
1747  const float fad_y = dfad * rad_y;
1748  const float fad_z = dfad * rad_z;
1749  const float fda_x = dfda * rda_x;
1750  const float fda_y = dfda * rda_y;
1751  const float fda_z = dfda * rda_z;
1752  const float fdd_x = dfdd * rdd_x;
1753  const float fdd_y = dfdd * rdd_y;
1754  const float fdd_z = dfdd * rdd_z;
1755  virial.xx = faa_x * raa_x + fad_x * rad_x + fda_x * rda_x + fdd_x * rdd_x;
1756  virial.xy = faa_x * raa_y + fad_x * rad_y + fda_x * rda_y + fdd_x * rdd_y;
1757  virial.xz = faa_x * raa_z + fad_x * rad_z + fda_x * rda_z + fdd_x * rdd_z;
1758  virial.yx = faa_y * raa_x + fad_y * rad_x + fda_y * rda_x + fdd_y * rdd_x;
1759  virial.yy = faa_y * raa_y + fad_y * rad_y + fda_y * rda_y + fdd_y * rdd_y;
1760  virial.yz = faa_y * raa_z + fad_y * rad_z + fda_y * rda_z + fdd_y * rdd_z;
1761  virial.zx = faa_z * raa_x + fad_z * rad_x + fda_z * rda_x + fdd_z * rdd_x;
1762  virial.zy = faa_z * raa_y + fad_z * rad_y + fda_z * rda_y + fdd_z * rdd_y;
1763  virial.zz = faa_z * raa_z + fad_z * rad_z + fda_z * rda_z + fdd_z * rdd_z;
1764 #endif
1765  }
1766 }
1767 
1768 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1769 __device__ void anisoForce(
1770  const int index,
1771  const CudaAniso* __restrict__ anisoList,
1772  const float4* __restrict__ xyzq,
1773  const int stride,
1774  const float3 lata, const float3 latb, const float3 latc,
1775  T* __restrict__ force, double &energy,
1776  T* __restrict__ forceList, int* forceListCounter,
1777  int* forceListStarts, int* forceListNexts,
1778 #ifdef WRITE_FULL_VIRIALS
1779  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1780 #else
1781  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1782 #endif
1783  double &energy_F, double &energy_TI_1, double &energy_TI_2
1784 ) {
1785  const CudaAniso al = anisoList[index];
1786 
1787  const float shx_il = al.offset_il.x*lata.x + al.offset_il.y*latb.x + al.offset_il.z*latc.x;
1788  const float shy_il = al.offset_il.x*lata.y + al.offset_il.y*latb.y + al.offset_il.z*latc.y;
1789  const float shz_il = al.offset_il.x*lata.z + al.offset_il.y*latb.z + al.offset_il.z*latc.z;
1790 
1791  const float shx_mn = al.offset_mn.x*lata.x + al.offset_mn.y*latb.x + al.offset_mn.z*latc.x;
1792  const float shy_mn = al.offset_mn.x*lata.y + al.offset_mn.y*latb.y + al.offset_mn.z*latc.y;
1793  const float shz_mn = al.offset_mn.x*lata.z + al.offset_mn.y*latb.z + al.offset_mn.z*latc.z;
1794 
1795  // calculate parallel and perpendicular displacement vectors
1796  // shortest vector image: ri - rl
1797  const float ril_x = xyzq[al.i].x + shx_il - xyzq[al.l].x;
1798  const float ril_y = xyzq[al.i].y + shy_il - xyzq[al.l].y;
1799  const float ril_z = xyzq[al.i].z + shz_il - xyzq[al.l].z;
1800 
1801  // shortest vector image: rm - rn
1802  const float rmn_x = xyzq[al.m].x + shx_mn - xyzq[al.n].x;
1803  const float rmn_y = xyzq[al.m].y + shy_mn - xyzq[al.n].y;
1804  const float rmn_z = xyzq[al.m].z + shz_mn - xyzq[al.n].z;
1805 
1806  // need recip lengths of r_il, r_mn
1807  const float ril_invlen = rnorm3df(ril_x, ril_y, ril_z);
1808  const float rmn_invlen = rnorm3df(rmn_x, rmn_y, rmn_z);
1809 
1810  // normalize r_il, r_mn
1811  const float u1_x = ril_x * ril_invlen;
1812  const float u1_y = ril_y * ril_invlen;
1813  const float u1_z = ril_z * ril_invlen;
1814  const float u2_x = rmn_x * rmn_invlen;
1815  const float u2_y = rmn_y * rmn_invlen;
1816  const float u2_z = rmn_z * rmn_invlen;
1817 
1818  // Drude displacement vector (ri, rj are in same patch)
1819  const float dr_x = xyzq[al.j].x - xyzq[al.i].x;
1820  const float dr_y = xyzq[al.j].y - xyzq[al.i].y;
1821  const float dr_z = xyzq[al.j].z - xyzq[al.i].z;
1822 
1823  // parallel displacement
1824  const float dpar = dr_x * u1_x + dr_y * u1_y + dr_z * u1_z;
1825 
1826  // perpendicular displacement
1827  const float dperp = dr_x * u2_x + dr_y * u2_y + dr_z * u2_z;
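  // Derivation sketch (a descriptive note, not in the original source): with
  // d = r_j - r_i, u1 = r_il/|r_il| and u2 = r_mn/|r_mn|, the energy is
  //   E = 0.5*kiso*|d|^2 + 0.5*kpar*dpar^2 + 0.5*kperp*dperp^2
  // so f_j = -kiso*d - kpar*dpar*u1 - kperp*dperp*u2, and differentiating
  // dpar = d.u1 and dperp = d.u2 through the unit vectors gives
  //   f_l = kpar*dpar*(d - dpar*u1)/|r_il|
  //   f_m = -kperp*dperp*(d - dperp*u2)/|r_mn|
  // with f_i = -(f_j + f_l) and f_n = -f_m, so the five forces sum to zero.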
1828 
1829  // force on atom j
1830  float fj_x = -al.kiso0 * dr_x;
1831  float fj_y = -al.kiso0 * dr_y;
1832  float fj_z = -al.kiso0 * dr_z;
1833  fj_x -= al.kpar0 * dpar * u1_x;
1834  fj_y -= al.kpar0 * dpar * u1_y;
1835  fj_z -= al.kpar0 * dpar * u1_z;
1836  fj_x -= al.kperp0 * dperp * u2_x;
1837  fj_y -= al.kperp0 * dperp * u2_y;
1838  fj_z -= al.kperp0 * dperp * u2_z;
1839 
1840  // force on atom l
1841  float fl_x = al.kpar0 * dpar * ril_invlen * dr_x;
1842  float fl_y = al.kpar0 * dpar * ril_invlen * dr_y;
1843  float fl_z = al.kpar0 * dpar * ril_invlen * dr_z;
1844  fl_x -= al.kpar0 * dpar * dpar * ril_invlen * u1_x;
1845  fl_y -= al.kpar0 * dpar * dpar * ril_invlen * u1_y;
1846  fl_z -= al.kpar0 * dpar * dpar * ril_invlen * u1_z;
1847 
1848  // force on atom m
1849  float fm_x = al.kperp0 * dperp * dperp * rmn_invlen * u2_x;
1850  float fm_y = al.kperp0 * dperp * dperp * rmn_invlen * u2_y;
1851  float fm_z = al.kperp0 * dperp * dperp * rmn_invlen * u2_z;
1852  fm_x -= al.kperp0 * dperp * rmn_invlen * dr_x;
1853  fm_y -= al.kperp0 * dperp * rmn_invlen * dr_y;
1854  fm_z -= al.kperp0 * dperp * rmn_invlen * dr_z;
1855 
1856  if (doEnergy) {
1857  // aniso spring energy:
1858  // kpar reduces the response along the parallel (e.g. carbonyl) vector,
1859  // kperp reduces the response perpendicular to the bond vector
1860  // (regulating the in-plane and out-of-plane response)
1861  const float dr2 = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z;
1862  float eaniso = 0.5f * al.kpar0 * dpar * dpar + 0.5f * al.kperp0 * dperp * dperp + 0.5f * al.kiso0 * dr2;
1863  if (doFEP || doTI) {
1864  switch (al.fepBondedType) {
1865  case 1: {
1866  if (doTI) energy_TI_1 += (double)eaniso;
1867  if (doFEP) {
1868  energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1869  eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1870  fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1871  fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1872  fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1873  fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1874  fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1875  fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1876  fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1877  fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1878  fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1879  }
1880  break;
1881  }
1882  case 2: {
1883  if (doTI) energy_TI_2 += (double)eaniso;
1884  if (doFEP) {
1885  energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1886  eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1887  fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1888  fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1889  fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1890  fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1891  fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1892  fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1893  fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1894  fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1895  fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1896  }
1897  break;
1898  }
1899  }
1900  }
1901  energy += (double)eaniso;
1902  }
1903 
1904  // accumulate forces
1905  T T_fx0, T_fy0, T_fz0;
1906  convertForces<T>(-(fj_x + fl_x), -(fj_y + fl_y), -(fj_z + fl_z), T_fx0, T_fy0, T_fz0);
1907  storeForces<T>(T_fx0, T_fy0, T_fz0, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1908 
1909  T T_fx1, T_fy1, T_fz1;
1910  convertForces<T>(fj_x, fj_y, fj_z, T_fx1, T_fy1, T_fz1);
1911  storeForces<T>(T_fx1, T_fy1, T_fz1, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1912 
1913  T T_fx2, T_fy2, T_fz2;
1914  convertForces<T>(fl_x, fl_y, fl_z, T_fx2, T_fy2, T_fz2);
1915  storeForces<T>(T_fx2, T_fy2, T_fz2, al.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1916 
1917  T T_fx3, T_fy3, T_fz3;
1918  convertForces<T>(fm_x, fm_y, fm_z, T_fx3, T_fy3, T_fz3);
1919  storeForces<T>(T_fx3, T_fy3, T_fz3, al.m, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1920 
1921  T T_fx4, T_fy4, T_fz4;
1922  convertForces<T>(-fm_x, -fm_y, -fm_z, T_fx4, T_fy4, T_fz4);
1923  storeForces<T>(T_fx4, T_fy4, T_fz4, al.n, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1924 
1925  if (doVirial) {
1926 #ifdef WRITE_FULL_VIRIALS
1927  virial.xx = fj_x * dr_x - fl_x * ril_x + fm_x * rmn_x;
1928  virial.xy = fj_x * dr_y - fl_x * ril_y + fm_x * rmn_y;
1929  virial.xz = fj_x * dr_z - fl_x * ril_z + fm_x * rmn_z;
1930  virial.yx = fj_y * dr_x - fl_y * ril_x + fm_y * rmn_x;
1931  virial.yy = fj_y * dr_y - fl_y * ril_y + fm_y * rmn_y;
1932  virial.yz = fj_y * dr_z - fl_y * ril_z + fm_y * rmn_z;
1933  virial.zx = fj_z * dr_x - fl_z * ril_x + fm_z * rmn_x;
1934  virial.zy = fj_z * dr_y - fl_z * ril_y + fm_z * rmn_y;
1935  virial.zz = fj_z * dr_z - fl_z * ril_z + fm_z * rmn_z;
1936 #endif
1937  }
1938 }
1939 
1940 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1941 __device__ void oneFourNbTholeForce(
1942  const int index, const float one_scale14,
1943  const CudaOneFourNbThole* __restrict__ oneFourNbTholeList,
1944  const float4* __restrict__ xyzq,
1945  const int stride,
1946  const float3 lata, const float3 latb, const float3 latc,
1947  T* __restrict__ force, double &energy,
1948  T* __restrict__ forceList, int* forceListCounter,
1949  int* forceListStarts, int* forceListNexts,
1950 #ifdef WRITE_FULL_VIRIALS
1951  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1952 #else
1953  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1954 #endif
1955  double &energy_F, double &energy_TI_1, double &energy_TI_2
1956 ) {
1957  const auto tl = oneFourNbTholeList[index];
1958 
1959  const float shx_aiaj = tl.offset_aiaj.x*lata.x + tl.offset_aiaj.y*latb.x + tl.offset_aiaj.z*latc.x;
1960  const float shy_aiaj = tl.offset_aiaj.x*lata.y + tl.offset_aiaj.y*latb.y + tl.offset_aiaj.z*latc.y;
1961  const float shz_aiaj = tl.offset_aiaj.x*lata.z + tl.offset_aiaj.y*latb.z + tl.offset_aiaj.z*latc.z;
1962 
1963  const float shx_aidj = tl.offset_aidj.x*lata.x + tl.offset_aidj.y*latb.x + tl.offset_aidj.z*latc.x;
1964  const float shy_aidj = tl.offset_aidj.x*lata.y + tl.offset_aidj.y*latb.y + tl.offset_aidj.z*latc.y;
1965  const float shz_aidj = tl.offset_aidj.x*lata.z + tl.offset_aidj.y*latb.z + tl.offset_aidj.z*latc.z;
1966 
1967  const float shx_diaj = tl.offset_diaj.x*lata.x + tl.offset_diaj.y*latb.x + tl.offset_diaj.z*latc.x;
1968  const float shy_diaj = tl.offset_diaj.x*lata.y + tl.offset_diaj.y*latb.y + tl.offset_diaj.z*latc.y;
1969  const float shz_diaj = tl.offset_diaj.x*lata.z + tl.offset_diaj.y*latb.z + tl.offset_diaj.z*latc.z;
1970 
1971  const float shx_didj = tl.offset_didj.x*lata.x + tl.offset_didj.y*latb.x + tl.offset_didj.z*latc.x;
1972  const float shy_didj = tl.offset_didj.x*lata.y + tl.offset_didj.y*latb.y + tl.offset_didj.z*latc.y;
1973  const float shz_didj = tl.offset_didj.x*lata.z + tl.offset_didj.y*latb.z + tl.offset_didj.z*latc.z;
1974 
1975  const float raa_x = xyzq[tl.i].x + shx_aiaj - xyzq[tl.k].x;
1976  const float raa_y = xyzq[tl.i].y + shy_aiaj - xyzq[tl.k].y;
1977  const float raa_z = xyzq[tl.i].z + shz_aiaj - xyzq[tl.k].z;
1978 
1979  const float rad_x = xyzq[tl.i].x + shx_aidj - xyzq[tl.l].x;
1980  const float rad_y = xyzq[tl.i].y + shy_aidj - xyzq[tl.l].y;
1981  const float rad_z = xyzq[tl.i].z + shz_aidj - xyzq[tl.l].z;
1982 
1983  const float rda_x = xyzq[tl.j].x + shx_diaj - xyzq[tl.k].x;
1984  const float rda_y = xyzq[tl.j].y + shy_diaj - xyzq[tl.k].y;
1985  const float rda_z = xyzq[tl.j].z + shz_diaj - xyzq[tl.k].z;
1986 
1987  const float rdd_x = xyzq[tl.j].x + shx_didj - xyzq[tl.l].x;
1988  const float rdd_y = xyzq[tl.j].y + shy_didj - xyzq[tl.l].y;
1989  const float rdd_z = xyzq[tl.j].z + shz_didj - xyzq[tl.l].z;
1990 
1991  const float raa_invlen = rnorm3df(raa_x, raa_y, raa_z);
1992  const float rad_invlen = rnorm3df(rad_x, rad_y, rad_z);
1993  const float rda_invlen = rnorm3df(rda_x, rda_y, rda_z);
1994  const float rdd_invlen = rnorm3df(rdd_x, rdd_y, rdd_z);
1995 
1996  const float auaa = tl.aa / raa_invlen;
1997  const float auad = tl.aa / rad_invlen;
1998  const float auda = tl.aa / rda_invlen;
1999  const float audd = tl.aa / rdd_invlen;
2000 
2001  // exp(-ar)
2002  const float expauaa = expf(-auaa);
2003  const float expauad = expf(-auad);
2004  const float expauda = expf(-auda);
2005  const float expaudd = expf(-audd);
2006 
2007  // (1 + ar/2)
2008  float polyauaa = 1.0f + 0.5f*auaa;
2009  float polyauad = 1.0f + 0.5f*auad;
2010  float polyauda = 1.0f + 0.5f*auda;
2011  float polyaudd = 1.0f + 0.5f*audd;
2012 
2013  /*
2014  * Although the CHARMM Drude force field expects the 1-4 scaling factor to be
2015  * 1.0, I try my best to generalize the case here. The GPU nonbonded kernel
2016  * differs from the CPU kernel in that the former always calculates the full
2017  * 1-4 interactions without scaling (i.e. with a scaling factor of 1.0), and
2018  * the bonded kernel "modifiedExclusionForce" in ComputeBondedCUDAKernel.cu
2019  * then computes a complementary part scaled by -(1.0 - ComputeNonbondedUtil::scale14).
2020  *
2021  * As a result, the following scaling factor uses the same convention. In the
2022  * GPU kernel, the NbThole correction of the 1-4 interactions is computed by
2023  * calcForceEnergyNbThole, so the following code should only compute the
2024  * complementary part.
2025  */
2026 
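  /*
   * Bookkeeping sketch (derived from the code below, for reference): the full
   * Thole-screened 1-4 Coulomb term is (qq/r)*(1 - (1 + a*r/2)*exp(-a*r)).
   * calcForceEnergyNbThole has already applied the full screening correction
   * -(qq/r)*(1 + a*r/2)*exp(-a*r), so with a scaling factor scale14 the
   * remaining complementary piece computed here is
   *   +(1 - scale14)*(qq/r)*(1 + a*r/2)*exp(-a*r),
   * which is obtained by negating the scaled charge products (qqaa etc.)
   * and evaluating -(1 + a*r/2)*exp(-a*r).
   */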
2027  // Charges
2028  const float qqaa = -one_scale14 * xyzq[tl.i].w * xyzq[tl.k].w;
2029  const float qqad = -one_scale14 * xyzq[tl.i].w * xyzq[tl.l].w;
2030  const float qqda = -one_scale14 * xyzq[tl.j].w * xyzq[tl.k].w;
2031  const float qqdd = -one_scale14 * xyzq[tl.j].w * xyzq[tl.l].w;
2032 
2033  if (doEnergy) {
2034  float ethole = 0;
2035  ethole += qqaa * raa_invlen * (-polyauaa * expauaa);
2036  ethole += qqad * rad_invlen * (-polyauad * expauad);
2037  ethole += qqda * rda_invlen * (-polyauda * expauda);
2038  ethole += qqdd * rdd_invlen * (-polyaudd * expaudd);
2039  energy += (double)ethole;
2040  }
2041 
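  // promote (1 + a*r/2) to (1 + a*r + (a*r)^2/2) for the force expression
  // (descriptive note)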
2042  polyauaa = 1.0f + auaa*polyauaa;
2043  polyauad = 1.0f + auad*polyauad;
2044  polyauda = 1.0f + auda*polyauda;
2045  polyaudd = 1.0f + audd*polyaudd;
2046 
2047  const float raa_invlen3 = raa_invlen * raa_invlen * raa_invlen;
2048  const float rad_invlen3 = rad_invlen * rad_invlen * rad_invlen;
2049  const float rda_invlen3 = rda_invlen * rda_invlen * rda_invlen;
2050  const float rdd_invlen3 = rdd_invlen * rdd_invlen * rdd_invlen;
2051 
2052  float dfaa = -qqaa * raa_invlen3 * (polyauaa*expauaa);
2053  float dfad = -qqad * rad_invlen3 * (polyauad*expauad);
2054  float dfda = -qqda * rda_invlen3 * (polyauda*expauda);
2055  float dfdd = -qqdd * rdd_invlen3 * (polyaudd*expaudd);
2056 
2057  T T_fx0, T_fy0, T_fz0;
2058  calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfad, rad_x, rad_y, rad_z, T_fx0, T_fy0, T_fz0);
2059  storeForces<T>(T_fx0, T_fy0, T_fz0, tl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2060 
2061  T T_fx1, T_fy1, T_fz1;
2062  calcComponentForces<T>(dfda, rda_x, rda_y, rda_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx1, T_fy1, T_fz1);
2063  storeForces<T>(T_fx1, T_fy1, T_fz1, tl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2064 
2065  T T_fx2, T_fy2, T_fz2;
2066  calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfda, rda_x, rda_y, rda_z, T_fx2, T_fy2, T_fz2);
2067  storeForces<T>(-T_fx2, -T_fy2, -T_fz2, tl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2068 
2069  T T_fx3, T_fy3, T_fz3;
2070  calcComponentForces<T>(dfad, rad_x, rad_y, rad_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx3, T_fy3, T_fz3);
2071  storeForces<T>(-T_fx3, -T_fy3, -T_fz3, tl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2072 
2073  if (doVirial) {
2074 #ifdef WRITE_FULL_VIRIALS
2075  const float faa_x = dfaa * raa_x;
2076  const float faa_y = dfaa * raa_y;
2077  const float faa_z = dfaa * raa_z;
2078  const float fad_x = dfad * rad_x;
2079  const float fad_y = dfad * rad_y;
2080  const float fad_z = dfad * rad_z;
2081  const float fda_x = dfda * rda_x;
2082  const float fda_y = dfda * rda_y;
2083  const float fda_z = dfda * rda_z;
2084  const float fdd_x = dfdd * rdd_x;
2085  const float fdd_y = dfdd * rdd_y;
2086  const float fdd_z = dfdd * rdd_z;
2087  virial.xx = faa_x * raa_x + fad_x * rad_x + fda_x * rda_x + fdd_x * rdd_x;
2088  virial.xy = faa_x * raa_y + fad_x * rad_y + fda_x * rda_y + fdd_x * rdd_y;
2089  virial.xz = faa_x * raa_z + fad_x * rad_z + fda_x * rda_z + fdd_x * rdd_z;
2090  virial.yx = faa_y * raa_x + fad_y * rad_x + fda_y * rda_x + fdd_y * rdd_x;
2091  virial.yy = faa_y * raa_y + fad_y * rad_y + fda_y * rda_y + fdd_y * rdd_y;
2092  virial.yz = faa_y * raa_z + fad_y * rad_z + fda_y * rda_z + fdd_y * rdd_z;
2093  virial.zx = faa_z * raa_x + fad_z * rad_x + fda_z * rda_x + fdd_z * rdd_x;
2094  virial.zy = faa_z * raa_y + fad_z * rad_y + fda_z * rda_y + fdd_z * rdd_y;
2095  virial.zz = faa_z * raa_z + fad_z * rad_z + fda_z * rda_z + fdd_z * rdd_z;
2096 #endif
2097  }
2098 }
2099 
2100 #ifndef NAMD_CUDA
2101 #define BONDEDFORCESKERNEL_NUM_WARP 2
2102 #else
2103 #define BONDEDFORCESKERNEL_NUM_WARP 4
2104 #endif
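// Illustrative host-side launch pattern for the kernel below (a sketch only,
// not NAMD's actual driver code; maxBlocksPerLaunch and numBlocksTotal are
// hypothetical names):
//
//   int start = 0;
//   while (start < numBlocksTotal) {
//     const int nblock = min(maxBlocksPerLaunch, numBlocksTotal - start);
//     bondedForcesKernel<T, doEnergy, doVirial, doElect, doFEP, doTI>
//       <<<nblock, BONDEDFORCESKERNEL_NUM_WARP*WARPSIZE, 0, stream>>>(
//         start /* , term counts, tables, coordinates, outputs... */);
//     start += nblock;
//   }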
2105 /**
2106  * @brief Calculates all bonded forces in a single kernel call
2107  *
2108  * @remark This kernel may be called multiple times. For example,
2109  * assuming that we have 1000 bonds and 200 angles, there are
2110  * 1200 bonded terms. The size of a thread block is 4*32=128 (CUDA),
2111  * so we need 8 blocks for bonds and 2 blocks for angles. Due to a
2112  * hardware limitation we may be restricted to 4 blocks at a time, but
2113  * we need to compute all 10 (8 bond + 2 angle) blocks, so we
2114  * need to launch bondedForcesKernel three times, with 4, 4 and 2 blocks
2115  * respectively:
2116  * 1st launch: starts with an offset of 0 blocks (start=0); in this
2117  * case we always have indexTB < numBondsTB, where numBondsTB
2118  * is the number of blocks used for bonds, so the code in
2119  * if (indexTB < numBondsTB) {...} computes the bond
2120  * forces and then jumps to the reduce part by goto.
2121  * 2nd launch: starts with an offset of 4 blocks (start=4); just like the
2122  * 1st launch, these blocks are used for the bond forces.
2123  * 3rd launch: starts with an offset of 8 blocks (start=8); now
2124  * indexTB is at least numBondsTB, so the bond forces
2125  * are skipped, and the remaining two blocks continue
2126  * to the angle forces.
2127  * In summary, the goto in the kernel guarantees that a group of
2128  * blocks is only used for a specific bonded term, while reusing the
2129  * reduction code for accumulating the energies.
2130  */
2131 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI>
2132 __global__ void bondedForcesKernel(
2133  const int start,
2134 
2135  const int numBonds,
2136  const CudaBond* __restrict__ bonds,
2137  const CudaBondValue* __restrict__ bondValues,
2138 
2139  const int numAngles,
2140  const CudaAngle* __restrict__ angles,
2141  const CudaAngleValue* __restrict__ angleValues,
2142 
2143  const int numDihedrals,
2144  const CudaDihedral* __restrict__ dihedrals,
2145  const CudaDihedralValue* __restrict__ dihedralValues,
2146 
2147  const int numImpropers,
2148  const CudaDihedral* __restrict__ impropers,
2149  const CudaDihedralValue* __restrict__ improperValues,
2150 
2151  const int numExclusions,
2152  const CudaExclusion* __restrict__ exclusions,
2153 
2154  const int numCrossterms,
2155  const CudaCrossterm* __restrict__ crossterms,
2156  const CudaCrosstermValue* __restrict__ crosstermValues,
2157 
2158  const int numTholes,
2159  const CudaThole* __restrict__ tholes,
2160 
2161  const int numAnisos,
2162  const CudaAniso* __restrict__ anisos,
2163 
2164  const int numOneFourNbTholes,
2165  const CudaOneFourNbThole* __restrict__ oneFourNbTholes,
2166 
2167  const float cutoff2, const float one_scale14,
2168  const float r2_delta, const int r2_delta_expc,
2169 
2170  const float* __restrict__ r2_table,
2171  const float4* __restrict__ exclusionTable,
2172 
2173 #if !defined(USE_TABLE_ARRAYS)
2174  cudaTextureObject_t r2_table_tex,
2175  cudaTextureObject_t exclusionTableTex,
2176 #endif
2177 
2178  const float4* __restrict__ xyzq,
2179  const int stride,
2180  const float3 lata, const float3 latb, const float3 latc,
2181  T* __restrict__ force,
2182  T* __restrict__ forceSlow,
2183  T* __restrict__ forceList,
2184  int* forceListCounter,
2185  int* forceListStarts,
2186  int* forceListStartsSlow,
2187  int* forceListNexts,
2188  double* __restrict__ energies_virials) {
2189 
2190  // Thread-block index
2191  int indexTB = start + blockIdx.x;
2192 
2193  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
2194  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
2195  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
2196  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
2197  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
2198  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
2199  const int numTholesTB = (numTholes + blockDim.x - 1)/blockDim.x;
2200  const int numAnisosTB = (numAnisos + blockDim.x - 1)/blockDim.x;
2201  int numOneFourNbTholesTB;
2202  if (doElect) numOneFourNbTholesTB = (numOneFourNbTholes + blockDim.x - 1)/blockDim.x;
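  // Each bonded term type occupies ceil(numTerms/blockDim.x) consecutive
  // thread blocks; the chain of "if (indexTB < ...TB)" tests below selects
  // this block's term type by walking those ranges in order.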
2203 
2204  // Each thread computes a single bonded interaction.
2205  // Each thread block computes a single bonded term type.
2206  double energy;
2207  double energy_F;
2208  double energy_TI_1;
2209  double energy_TI_2;
2210  int energyIndex;
2211  int energyIndex_F;
2212  int energyIndex_TI_1;
2213  int energyIndex_TI_2;
2214 
2215  if (doEnergy) {
2216  energy = 0.0;
2217  energyIndex = -1;
2218  if (doFEP) {
2219  energy_F = 0.0;
2220  energyIndex_F = -1;
2221  }
2222  if (doTI) {
2223  energy_TI_1 = 0.0;
2224  energy_TI_2 = 0.0;
2225  energyIndex_TI_1 = -1;
2226  energyIndex_TI_2 = -1;
2227  }
2228  }
2229 
2230 #ifdef WRITE_FULL_VIRIALS
2231  ComputeBondedCUDAKernel::BondedVirial<double> virial;
2232  int virialIndex;
2233  if (doVirial) {
2234  virial.xx = 0.0;
2235  virial.xy = 0.0;
2236  virial.xz = 0.0;
2237  virial.yx = 0.0;
2238  virial.yy = 0.0;
2239  virial.yz = 0.0;
2240  virial.zx = 0.0;
2241  virial.zy = 0.0;
2242  virial.zz = 0.0;
2243  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2244  }
2245 #endif
2246 
2247  if (indexTB < numBondsTB) {
2248  int i = threadIdx.x + indexTB*blockDim.x;
2249  if (doEnergy) {
2250  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
2251  if (doFEP) {
2252  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_BOND_F;
2253  }
2254  if (doTI) {
2255  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_1;
2256  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_2;
2257  }
2258  }
2259  if (i < numBonds) {
2260  bondForce<T, doEnergy, doVirial, doFEP, doTI>
2261  (i, bonds, bondValues, xyzq,
2262  stride, lata, latb, latc,
2263  force, energy,
2264  forceList, forceListCounter, forceListStarts, forceListNexts,
2265  virial,
2266  energy_F, energy_TI_1, energy_TI_2);
2267  }
2268  goto reduce;
2269  }
2270  indexTB -= numBondsTB;
2271 
2272  if (indexTB < numAnglesTB) {
2273  int i = threadIdx.x + indexTB*blockDim.x;
2274  if (doEnergy) {
2275  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
2276  if (doFEP) {
2277  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANGLE_F;
2278  }
2279  if (doTI) {
2280  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1;
2281  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2;
2282  }
2283  }
2284  if (i < numAngles) {
2285  angleForce<T, doEnergy, doVirial, doFEP, doTI>
2286  (i, angles, angleValues, xyzq, stride,
2287  lata, latb, latc,
2288  force, energy,
2289  forceList, forceListCounter, forceListStarts, forceListNexts,
2290  virial,
2291  energy_F, energy_TI_1, energy_TI_2);
2292  }
2293  goto reduce;
2294  }
2295  indexTB -= numAnglesTB;
2296 
2297  if (indexTB < numDihedralsTB) {
2298  int i = threadIdx.x + indexTB*blockDim.x;
2299  if (doEnergy) {
2300  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
2301  if (doFEP) {
2302  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F;
2303  }
2304  if (doTI) {
2305  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1;
2306  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2;
2307  }
2308  }
2309  if (doVirial) {
2310  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2311  }
2312  if (i < numDihedrals) {
2313  diheForce<T, doEnergy, doVirial, doFEP, doTI>
2314  (i, dihedrals, dihedralValues, xyzq, stride,
2315  lata, latb, latc,
2316  force, energy,
2317  forceList, forceListCounter, forceListStarts, forceListNexts,
2318  virial,
2319  energy_F, energy_TI_1, energy_TI_2);
2320  }
2321  goto reduce;
2322  }
2323  indexTB -= numDihedralsTB;
2324 
2325  if (indexTB < numImpropersTB) {
2326  int i = threadIdx.x + indexTB*blockDim.x;
2327  if (doEnergy) {
2328  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
2329  if (doFEP) {
2330  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_IMPROPER_F;
2331  }
2332  if (doTI) {
2333  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1;
2334  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2;
2335  }
2336  }
2337  if (i < numImpropers) {
2338  diheForce<T, doEnergy, doVirial, doFEP, doTI>
2339  (i, impropers, improperValues, xyzq, stride,
2340  lata, latb, latc,
2341  force, energy,
2342  forceList, forceListCounter, forceListStarts, forceListNexts,
2343  virial,
2344  energy_F, energy_TI_1, energy_TI_2);
2345  }
2346  goto reduce;
2347  }
2348  indexTB -= numImpropersTB;
2349 
2350  if (indexTB < numExclusionsTB) {
2351  int i = threadIdx.x + indexTB*blockDim.x;
2352  if (doEnergy) {
2353  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
2354  if (doFEP) {
2355  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F;
2356  }
2357  if (doTI) {
2358  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1;
2359  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2;
2360  }
2361  }
2362  if (doVirial) {
2363  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
2364  }
2365  if (i < numExclusions) {
2366  exclusionForce<T, doEnergy, doVirial, doFEP, doTI>
2367  (i, exclusions, r2_delta, r2_delta_expc,
2368 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
2369  r2_table, exclusionTable,
2370 #else
2371  r2_table_tex, exclusionTableTex,
2372 #endif
2373  xyzq, stride, lata, latb, latc, cutoff2,
2374  forceSlow, energy,
2375  forceList, forceListCounter, forceListStartsSlow, forceListNexts,
2376  virial,
2377  energy_F, energy_TI_1, energy_TI_2);
2378  }
2379  goto reduce;
2380  }
2381  indexTB -= numExclusionsTB;
2382 
2383  if (indexTB < numCrosstermsTB) {
2384  int i = threadIdx.x + indexTB*blockDim.x;
2385  if (doEnergy) {
2386  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
2387  if (doFEP) {
2388  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F;
2389  }
2390  if (doTI) {
2391  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_1;
2392  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_2;
2393  }
2394  }
2395  if (doVirial) {
2396  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2397  }
2398  if (i < numCrossterms) {
2399  crosstermForce<T, doEnergy, doVirial, doFEP, doTI>
2400  (i, crossterms, crosstermValues,
2401  xyzq, stride, lata, latb, latc,
2402  force, energy,
2403  forceList, forceListCounter, forceListStarts, forceListNexts,
2404  virial,
2405  energy_F, energy_TI_1, energy_TI_2);
2406  }
2407  goto reduce;
2408  }
2409  indexTB -= numCrosstermsTB;
2410 
2411  if (indexTB < numTholesTB) {
2412  int i = threadIdx.x + indexTB*blockDim.x;
2413  if (doEnergy) {
2414  energyIndex = ComputeBondedCUDAKernel::energyIndex_THOLE;
2415  if (doFEP) {
2416  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_THOLE_F;
2417  }
2418  if (doTI) {
2419  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_1;
2420  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_2;
2421  }
2422  }
2423  if (doVirial) {
2424  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2425  }
2426  if (i < numTholes) {
2427  tholeForce<T, doEnergy, doVirial, doFEP, doTI>(
2428  i, tholes, xyzq, stride, lata, latb, latc,
2429  force, energy, forceList, forceListCounter,
2430  forceListStarts, forceListNexts,
2431  virial, energy_F, energy_TI_1, energy_TI_2);
2432  }
2433  goto reduce;
2434  }
2435  indexTB -= numTholesTB;
2436 
2437  if (indexTB < numAnisosTB) {
2438  int i = threadIdx.x + indexTB*blockDim.x;
2439  if (doEnergy) {
2440  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANISO;
2441  if (doFEP) {
2442  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANISO_F;
2443  }
2444  if (doTI) {
2445  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_1;
2446  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_2;
2447  }
2448  }
2449  if (doVirial) {
2450  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2451  }
2452  if (i < numAnisos) {
2453  anisoForce<T, doEnergy, doVirial, doFEP, doTI>(
2454  i, anisos, xyzq, stride, lata, latb, latc,
2455  force, energy, forceList, forceListCounter,
2456  forceListStarts, forceListNexts,
2457  virial, energy_F, energy_TI_1, energy_TI_2);
2458  }
2459  goto reduce;
2460  }
2461  if (doElect) indexTB -= numAnisosTB;
2462 
2463  if (doElect) {
2464  if (indexTB < numOneFourNbTholesTB) {
2465  int i = threadIdx.x + indexTB*blockDim.x;
2466  if (doEnergy) {
2467  energyIndex = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE;
2468  if (doFEP) {
2469  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE_F;
2470  }
2471  if (doTI) {
2472  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE_TI_1;
2473  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE_TI_2;
2474  }
2475  }
2476  if (doVirial) {
2477  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2478  }
2479  if (i < numOneFourNbTholes) {
2480  oneFourNbTholeForce<T, doEnergy, doVirial, doFEP, doTI>(
2481  i, one_scale14, oneFourNbTholes, xyzq, stride, lata, latb, latc,
2482  force, energy, forceList, forceListCounter,
2483  forceListStarts, forceListNexts,
2484  virial, energy_F, energy_TI_1, energy_TI_2);
2485  }
2486  goto reduce;
2487  }
2488  // indexTB -= numOneFourNbTholesTB;
2489  }
2490 
2491  reduce:
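  // Two-stage block reduction (descriptive note): each warp first reduces its
  // partial sums with a XOR-butterfly shuffle, lane 0 of every warp stashes
  // the warp total in shared memory, and warp 0 then reduces the per-warp
  // totals, with its lane 0 atomically accumulating into one of ATOMIC_BINS
  // slots of energies_virials (binning by blockIdx.x reduces atomic contention).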
2492 
2493  // Write energies to global memory
2494  if (doEnergy) {
2495  // energyIndex is constant within thread-block
2496  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
2497  __shared__ double shEnergy_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2498  __shared__ double shEnergy_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2499  __shared__ double shEnergy_F[BONDEDFORCESKERNEL_NUM_WARP];
2500 #pragma unroll
2501  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2502  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2503 
2504  if (doFEP) {
2505  energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2506  }
2507  if (doTI) {
2508  energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2509  energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2510  }
2511  }
2512  int laneID = (threadIdx.x & (WARPSIZE - 1));
2513  int warpID = threadIdx.x / WARPSIZE;
2514  if (laneID == 0) {
2515  shEnergy[warpID] = energy;
2516  if (doFEP) {
2517  shEnergy_F[warpID] = energy_F;
2518  }
2519  if (doTI) {
2520  shEnergy_TI_1[warpID] = energy_TI_1;
2521  shEnergy_TI_2[warpID] = energy_TI_2;
2522  }
2523  }
2524  BLOCK_SYNC;
2525  if (warpID == 0) {
2526  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
2527  if (doFEP) {
2528  energy_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_F[laneID] : 0.0;
2529  }
2530  if (doTI) {
2531  energy_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_1[laneID] : 0.0;
2532  energy_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_2[laneID] : 0.0;
2533  }
2534 #pragma unroll
2535  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2536  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2537 
2538  if (doFEP) {
2539  energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2540  }
2541  if (doTI) {
2542  energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2543  energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2544  }
2545  }
2546  if (laneID == 0) {
2547  const int bin = blockIdx.x % ATOMIC_BINS;
2548  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
2549  if (doFEP) {
2550  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_F], energy_F);
2551  }
2552  if (doTI) {
2553  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_1], energy_TI_1);
2554  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_2], energy_TI_2);
2555  }
2556  }
2557  }
2558  }
2559 
2560  // Write virials to global memory
2561 #ifdef WRITE_FULL_VIRIALS
2562  if (doVirial) {
2563 #pragma unroll
2564  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2565  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2566  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2567  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2568  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2569  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2570  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2571  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2572  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2573  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2574  }
2575  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
2576  int laneID = (threadIdx.x & (WARPSIZE - 1));
2577  int warpID = threadIdx.x / WARPSIZE;
2578  if (laneID == 0) {
2579  shVirial[warpID] = virial;
2580  }
2581  BLOCK_SYNC;
2582 
2583  if (warpID == 0) {
2584  virial.xx = 0.0;
2585  virial.xy = 0.0;
2586  virial.xz = 0.0;
2587  virial.yx = 0.0;
2588  virial.yy = 0.0;
2589  virial.yz = 0.0;
2590  virial.zx = 0.0;
2591  virial.zy = 0.0;
2592  virial.zz = 0.0;
2593  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
2594 #pragma unroll
2595  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2596  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2597  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2598  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2599  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2600  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2601  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2602  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2603  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2604  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2605  }
2606 
2607  if (laneID == 0) {
2608  const int bin = blockIdx.x % ATOMIC_BINS;
2609  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
2610  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
2611  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
2612  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
2613  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
2614  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
2615  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
2616  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
2617  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
2618  }
2619  }
2620  }
2621 #endif
2622 
2623 }
2624 
2625 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
2626 __global__ void modifiedExclusionForcesKernel(
2627  const int start,
2628  const int numModifiedExclusions,
2629  const CudaExclusion* __restrict__ modifiedExclusions,
2630  const bool doSlow,
2631  const float one_scale14, // 1 - scale14
2632  const float cutoff2,
2633  const CudaNBConstants nbConstants,
2634  const int vdwCoefTableWidth,
2635  const float2* __restrict__ vdwCoefTable,
2636 #if !defined(USE_TABLE_ARRAYS)
2637  cudaTextureObject_t vdwCoefTableTex,
2638 #endif
2639 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2640  const float4* __restrict__ modifiedExclusionForceTable,
2641  const float4* __restrict__ modifiedExclusionEnergyTable,
2642 #else
2643  cudaTextureObject_t modifiedExclusionForceTableTex,
2644  cudaTextureObject_t modifiedExclusionEnergyTableTex,
2645 #endif
2646  const float4* __restrict__ xyzq,
2647  const int stride,
2648  const float3 lata, const float3 latb, const float3 latc,
2649  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
2650  T* __restrict__ forceList,
2651  int* forceListCounter,
2652  int* forceListStartsNbond,
2653  int* forceListStartsSlow,
2654  int* forceListNexts,
2655  double* __restrict__ energies_virials) {
2656 
2657  // index
2658  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
2659 
2660  double energyVdw, energyNbond, energySlow;
2661  // Alchemical energies
2662  double energyVdw_F, energyVdw_TI_1, energyVdw_TI_2;
2663  double energyNbond_F, energyNbond_TI_1, energyNbond_TI_2;
2664  double energySlow_F, energySlow_TI_1, energySlow_TI_2;
2665  if (doEnergy) {
2666  energyVdw = 0.0;
2667  if (doFEP) {
2668  energyVdw_F = 0.0;
2669  }
2670  if (doTI) {
2671  energyVdw_TI_1 = 0.0;
2672  energyVdw_TI_2 = 0.0;
2673  }
2674  if (doElect) {
2675  energyNbond = 0.0;
2676  energySlow = 0.0;
2677  if (doFEP) {
2678  energyNbond_F = 0.0;
2679  energySlow_F = 0.0;
2680  }
2681  if (doTI) {
2682  energyNbond_TI_1 = 0.0;
2683  energyNbond_TI_2 = 0.0;
2684  energySlow_TI_1 = 0.0;
2685  energySlow_TI_2 = 0.0;
2686  }
2687  }
2688  }
2689 
2690 #ifdef WRITE_FULL_VIRIALS
2691  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
2692  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
2693  if (doVirial) {
2694  virialNbond.xx = 0.0;
2695  virialNbond.xy = 0.0;
2696  virialNbond.xz = 0.0;
2697  virialNbond.yx = 0.0;
2698  virialNbond.yy = 0.0;
2699  virialNbond.yz = 0.0;
2700  virialNbond.zx = 0.0;
2701  virialNbond.zy = 0.0;
2702  virialNbond.zz = 0.0;
2703  if (doElect) {
2704  virialSlow.xx = 0.0;
2705  virialSlow.xy = 0.0;
2706  virialSlow.xz = 0.0;
2707  virialSlow.yx = 0.0;
2708  virialSlow.yy = 0.0;
2709  virialSlow.yz = 0.0;
2710  virialSlow.zx = 0.0;
2711  virialSlow.zy = 0.0;
2712  virialSlow.zz = 0.0;
2713  }
2714  }
2715 #endif
2716 
2717  if (i < numModifiedExclusions)
2718  {
2719  modifiedExclusionForce<T, doEnergy, doVirial, doElect, doFEP, doTI, doTable>
2720  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
2721 #if __CUDA_ARCH__ >= 350 || defined(USE_TABLE_ARRAYS)
2722  vdwCoefTable,
2723 #else
2724  vdwCoefTableTex,
2725 #endif
2726  // for modified exclusions, we use plain table arrays if building for HIP with USE_TABLE_ARRAYS; otherwise textures
2727 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2728  // tables
2729  modifiedExclusionForceTable,
2730  modifiedExclusionEnergyTable,
2731 #else
2732  // CUDA builds (and HIP without table arrays) fall back to texture force/energy tables
2733  modifiedExclusionForceTableTex,
2734  modifiedExclusionEnergyTableTex,
2735 #endif
2736  xyzq, stride, lata, latb, latc, cutoff2, nbConstants,
2737  energyVdw, forceNbond, energyNbond,
2738  forceSlow, energySlow,
2739  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
2740  virialNbond, virialSlow,
2741  energyVdw_F, energyVdw_TI_1, energyVdw_TI_2,
2742  energyNbond_F, energyNbond_TI_1, energyNbond_TI_2,
2743  energySlow_F, energySlow_TI_1, energySlow_TI_2);
2744  }
2745 
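  // The reductions below follow the same two-stage shuffle-plus-shared-memory
  // pattern as in bondedForcesKernel, repeated per channel (vdW, nbond, slow)
  // and per alchemical component (descriptive note).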
2746  // Write energies to global memory
2747  if (doEnergy) {
2748  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
2749  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2750  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2751  __shared__ double shEnergyVdw_F[BONDEDFORCESKERNEL_NUM_WARP];
2752  __shared__ double shEnergyVdw_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2753  __shared__ double shEnergyVdw_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2754  __shared__ double shEnergyNbond_F[BONDEDFORCESKERNEL_NUM_WARP];
2755  __shared__ double shEnergyNbond_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2756  __shared__ double shEnergyNbond_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2757  __shared__ double shEnergySlow_F[BONDEDFORCESKERNEL_NUM_WARP];
2758  __shared__ double shEnergySlow_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2759  __shared__ double shEnergySlow_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2760 #pragma unroll
2761  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2762  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2763 
2764  if (doFEP) {
2765  energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2766  }
2767  if (doTI) {
2768  energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2769  energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2770  }
2771  if (doElect) {
2772  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2773  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2774  if (doFEP) {
2775  energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2776  energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2777  }
2778  if (doTI) {
2779  energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2780  energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2781  energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2782  energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2783  }
2784  }
2785  }
2786  int laneID = (threadIdx.x & (WARPSIZE - 1));
2787  int warpID = threadIdx.x / WARPSIZE;
2788  if (laneID == 0) {
2789  shEnergyVdw[warpID] = energyVdw;
2790  if (doFEP) {
2791  shEnergyVdw_F[warpID] = energyVdw_F;
2792  }
2793  if (doTI) {
2794  shEnergyVdw_TI_1[warpID] = energyVdw_TI_1;
2795  shEnergyVdw_TI_2[warpID] = energyVdw_TI_2;
2796  }
2797  if (doElect) {
2798  shEnergyNbond[warpID] = energyNbond;
2799  shEnergySlow[warpID] = energySlow;
2800  if (doFEP) {
2801  shEnergyNbond_F[warpID] = energyNbond_F;
2802  shEnergySlow_F[warpID] = energySlow_F;
2803  }
2804  if (doTI) {
2805  shEnergyNbond_TI_1[warpID] = energyNbond_TI_1;
2806  shEnergyNbond_TI_2[warpID] = energyNbond_TI_2;
2807  shEnergySlow_TI_1[warpID] = energySlow_TI_1;
2808  shEnergySlow_TI_2[warpID] = energySlow_TI_2;
2809  }
2810  }
2811  }
2812  BLOCK_SYNC;
2813  if (warpID == 0) {
2814  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
2815  if (doFEP) {
2816  energyVdw_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_F[laneID] : 0.0;
2817  }
2818  if (doTI) {
2819  energyVdw_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_1[laneID] : 0.0;
2820  energyVdw_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_2[laneID] : 0.0;
2821  }
2822  if (doElect) {
2823  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
2824  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
2825  if (doFEP) {
2826  energyNbond_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_F[laneID] : 0.0;
2827  energySlow_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_F[laneID] : 0.0;
2828  }
2829  if (doTI) {
2830  energyNbond_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_1[laneID] : 0.0;
2831  energyNbond_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_2[laneID] : 0.0;
2832  energySlow_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_1[laneID] : 0.0;
2833  energySlow_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_2[laneID] : 0.0;
2834  }
2835  }
2836 #pragma unroll
2837  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2838  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2839 
2840  if (doFEP) {
2841  energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2842  }
2843  if (doTI) {
2844  energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2845  energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2846  }
2847  if (doElect) {
2848  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2849  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2850  if (doFEP) {
2851  energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2852  energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2853  }
2854  if (doTI) {
2855  energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2856  energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2857  energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2858  energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2859  }
2860  }
2861  }
2862  if (laneID == 0) {
2863  const int bin = blockIdx.x % ATOMIC_BINS;
2864  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
2865  if (doFEP) {
2866  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_F], energyVdw_F);
2867  }
2868  if (doTI) {
2869  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_1], energyVdw_TI_1);
2870  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_2], energyVdw_TI_2);
2871  }
2872  if (doElect) {
2873  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
2874  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
2875  if (doFEP) {
2876  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_F], energyNbond_F);
2877  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F], energySlow_F);
2878  }
2879  if (doTI) {
2880  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1], energyNbond_TI_1);
2881  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2], energyNbond_TI_2);
2882  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1], energySlow_TI_1);
2883  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2], energySlow_TI_2);
2884  }
2885  }
2886  }
2887  }
2888  }
2889 
2890  // Write virials to global memory
2891 #ifdef WRITE_FULL_VIRIALS
2892  if (doVirial) {
2893 #pragma unroll
2894  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2895  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2896  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2897  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2898  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2899  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2900  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2901  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2902  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2903  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2904  if (doElect && doSlow) {
2905  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2906  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2907  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2908  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2909  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2910  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2911  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2912  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2913  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2914  }
2915  }
2916  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
2917  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2918  int laneID = (threadIdx.x & (WARPSIZE - 1));
2919  int warpID = threadIdx.x / WARPSIZE;
2920  if (laneID == 0) {
2921  shVirialNbond[warpID] = virialNbond;
2922  if (doElect && doSlow) {
2923  shVirialSlow[warpID] = virialSlow;
2924  }
2925  }
2926  BLOCK_SYNC;
2927 
2928  virialNbond.xx = 0.0;
2929  virialNbond.xy = 0.0;
2930  virialNbond.xz = 0.0;
2931  virialNbond.yx = 0.0;
2932  virialNbond.yy = 0.0;
2933  virialNbond.yz = 0.0;
2934  virialNbond.zx = 0.0;
2935  virialNbond.zy = 0.0;
2936  virialNbond.zz = 0.0;
2937  if (doElect && doSlow) {
2938  virialSlow.xx = 0.0;
2939  virialSlow.xy = 0.0;
2940  virialSlow.xz = 0.0;
2941  virialSlow.yx = 0.0;
2942  virialSlow.yy = 0.0;
2943  virialSlow.yz = 0.0;
2944  virialSlow.zx = 0.0;
2945  virialSlow.zy = 0.0;
2946  virialSlow.zz = 0.0;
2947  }
2948 
2949  if (warpID == 0) {
2950  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
2951  virialNbond = shVirialNbond[laneID];
2952  if (doElect && doSlow) {
2953  virialSlow = shVirialSlow[laneID];
2954  }
2955  }
2956 #pragma unroll
2957  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2958  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2959  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2960  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2961  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2962  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2963  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2964  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2965  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2966  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2967  if (doElect && doSlow) {
2968  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2969  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2970  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2971  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2972  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2973  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2974  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2975  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2976  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2977  }
2978  }
2979 
2980  if (laneID == 0)
2981  {
2982  const int bin = blockIdx.x % ATOMIC_BINS;
2983  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
2984  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
2985  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
2986  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
2987  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
2988  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
2989  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
2990  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
2991  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
2992  if (doElect && doSlow) {
2993  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
2994  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
2995  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
2996  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
2997  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
2998  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
2999  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
3000  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
3001  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
3002  }
3003  }
3004  }
3005  }
3006 #endif
3007 
3008 }
3009 
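// The virial write-out above uses a standard two-stage block reduction:
// an intra-warp butterfly reduction via XOR shuffles, then a cross-warp
// reduction through shared memory performed by warp 0. A minimal sketch
// of the same pattern for a single double (illustrative only;
// blockReduceSum is a hypothetical helper, not part of this file):
#if 0
__device__ double blockReduceSum(double v) {
  // stage 1: butterfly reduction within each warp
  for (int i = WARPSIZE/2; i >= 1; i /= 2)
    v += WARP_SHUFFLE_XOR(WARP_FULL_MASK, v, i, WARPSIZE);
  __shared__ double sh[BONDEDFORCESKERNEL_NUM_WARP];
  const int lane = threadIdx.x & (WARPSIZE - 1);
  const int warp = threadIdx.x / WARPSIZE;
  if (lane == 0) sh[warp] = v;   // one partial sum per warp
  BLOCK_SYNC;
  // stage 2: warp 0 reduces the per-warp partials
  v = (warp == 0 && lane < BONDEDFORCESKERNEL_NUM_WARP) ? sh[lane] : 0.0;
  if (warp == 0)
    for (int i = WARPSIZE/2; i >= 1; i /= 2)
      v += WARP_SHUFFLE_XOR(WARP_FULL_MASK, v, i, WARPSIZE);
  return v;  // result is valid on lane 0 of warp 0
}
#endif
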
3010 template <typename T>
3011 __global__ void gatherBondedForcesKernel(
3012  const int start,
3013  const int atomStorageSize,
3014  const int stride,
3015  const T* __restrict__ forceList,
3016  const int* __restrict__ forceListStarts,
3017  const int* __restrict__ forceListNexts,
3018  T* __restrict__ force) {
3019 
3020  int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
3021 
3022  if (i < atomStorageSize) {
3023  T fx = 0;
3024  T fy = 0;
3025  T fz = 0;
3026  int pos = forceListStarts[i];
3027  while (pos != -1) {
3028  fx += forceList[pos * 3 + 0];
3029  fy += forceList[pos * 3 + 1];
3030  fz += forceList[pos * 3 + 2];
3031  pos = forceListNexts[pos];
3032  }
3033  force[i ] = fx;
3034  force[i + stride ] = fy;
3035  force[i + stride * 2] = fz;
3036  }
3037 }
3038 
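// gatherBondedForcesKernel above consumes per-atom singly linked lists of
// force contributions. A minimal sketch of the producer side such lists
// imply (storeForceToList is a hypothetical name; the actual store path
// lives in the force kernels when USE_BONDED_FORCE_ATOMIC_STORE is not
// defined):
#if 0
__device__ void storeForceToList(int atomIdx, float fx, float fy, float fz,
    FORCE_TYPE* forceList, int* forceListCounter,
    int* forceListStarts, int* forceListNexts) {
  const int pos = atomicAdd(forceListCounter, 1);  // claim a list node
  forceList[pos * 3 + 0] = fx;
  forceList[pos * 3 + 1] = fy;
  forceList[pos * 3 + 2] = fz;
  // push-front: the new node becomes the head and points at the old head
  // (heads start at -1, the empty-list sentinel set in bondedForce())
  forceListNexts[pos] = atomicExch(&forceListStarts[atomIdx], pos);
}
#endif
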
3039 __global__ void reduceBondedBinsKernel(double* energies_virials) {
3040  const int bin = threadIdx.x;
3041 
3042  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
3043  __shared__ typename WarpReduce::TempStorage tempStorage;
3044 
3045  double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
3046  if (threadIdx.x == 0) {
3047  energies_virials[blockIdx.x] = v;
3048  }
3049 }
3050 
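// Layout note: energies_virials is treated as
// energies_virials[ATOMIC_BINS][energies_virials_SIZE]. Each block of
// reduceBondedBinsKernel owns one quantity (blockIdx.x) and its
// ATOMIC_BINS threads each read that quantity from one bin, so the
// cub::WarpReduce sums across bins and thread 0 writes the total into
// row 0. The "(ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)" template argument
// appears intended to keep the logical warp width valid when
// ATOMIC_BINS == 1.
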
3051 // ##############################################################################################
3052 // ##############################################################################################
3053 // ##############################################################################################
3054 
3055 //
3056 // Class constructor
3057 //
3058 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
3059 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
3060 
3061  cudaCheck(cudaSetDevice(deviceID));
3062 
3063  tupleData = NULL;
3064  tupleDataSize = 0;
3065 
3066  bonds = NULL;
3067  angles = NULL;
3068  dihedrals = NULL;
3069  impropers = NULL;
3070  modifiedExclusions = NULL;
3071  exclusions = NULL;
3072  tholes = NULL;
3073  anisos = NULL;
3074  oneFourNbTholes = NULL;
3075 
3076  numBonds = 0;
3077  numAngles = 0;
3078  numDihedrals = 0;
3079  numImpropers = 0;
3080  numModifiedExclusions = 0;
3081  numExclusions = 0;
3082  numCrossterms = 0;
3083  numTholes = 0;
3084  numAnisos = 0;
3085  numOneFourNbTholes = 0;
3086 
3087  bondValues = NULL;
3088  angleValues = NULL;
3089  dihedralValues = NULL;
3090  improperValues = NULL;
3091  crosstermValues = NULL;
3092 
3093  xyzq = NULL;
3094  xyzqSize = 0;
3095 
3096  forces = NULL;
3097  forcesSize = 0;
3098 
3099  forceList = NULL;
3100  forceListStarts = NULL;
3101  forceListNexts = NULL;
3102  forceListSize = 0;
3103  forceListStartsSize = 0;
3104  forceListNextsSize = 0;
3105  allocate_device<int>(&forceListCounter, 1);
3106 
3107  allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
3108 }
3109 
3110 //
3111 // Class destructor
3112 //
3113 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
3114  cudaCheck(cudaSetDevice(deviceID));
3115 
3116  deallocate_device<double>(&energies_virials);
3117  // deallocate_device<BondedVirial>(&virial);
3118 
3119  if (tupleData != NULL) deallocate_device<char>(&tupleData);
3120  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
3121  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
3122 
3123  if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
3124  if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
3125  if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
3126  if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
3127 
3128  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
3129  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
3130  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
3131  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
3132  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
3133 }
3134 
3135 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
3136  allocate_device<CudaBondValue>(&bondValues, numBondValues);
3137  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
3138 }
3139 
3140 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
3141  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
3142  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
3143 }
3144 
3145 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
3146  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
3147  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
3148 }
3149 
3150 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
3151  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
3152  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
3153 }
3154 
3155 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
3156  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
3157  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
3158 }
3159 
3160 void ComputeBondedCUDAKernel::updateCudaAlchParameters(const CudaAlchParameters* h_cudaAlchParameters, cudaStream_t stream) {
3161  cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchParams, h_cudaAlchParameters, 1*sizeof(AlchBondedCUDA::alchParams), 0, cudaMemcpyHostToDevice, stream));
3162 }
3163 
3164 void ComputeBondedCUDAKernel::updateCudaAlchLambdas(const CudaAlchLambdas* h_cudaAlchLambdas, cudaStream_t stream) {
3165  cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchLambdas, h_cudaAlchLambdas, 1*sizeof(AlchBondedCUDA::alchLambdas), 0, cudaMemcpyHostToDevice, stream));
3166 }
3167 
3168 void ComputeBondedCUDAKernel::updateCudaAlchFlags(const CudaAlchFlags& h_cudaAlchFlags) {
3169  alchFlags = h_cudaAlchFlags;
3170 }
3171 
3172 //
3173 // Update bonded lists
3174 //
3175 void ComputeBondedCUDAKernel::setTupleCounts(
3176  const TupleCounts count
3177 ) {
3178  numBonds = count.bond;
3179  numAngles = count.angle;
3180  numDihedrals = count.dihedral;
3181  numImpropers = count.improper;
3182  numModifiedExclusions = count.modifiedExclusion;
3183  numExclusions = count.exclusion;
3184  numCrossterms = count.crossterm;
3185  numTholes = count.thole;
3186  numAnisos = count.aniso;
3187  numOneFourNbTholes = count.oneFourNbThole;
3188 }
3189 
3190 TupleCounts ComputeBondedCUDAKernel::getTupleCounts() {
3191  TupleCounts count;
3192 
3193  count.bond = numBonds;
3194  count.angle = numAngles;
3195  count.dihedral = numDihedrals;
3196  count.improper = numImpropers;
3197  count.modifiedExclusion = numModifiedExclusions;
3198  count.exclusion = numExclusions;
3199  count.crossterm = numCrossterms;
3200  count.thole = numTholes;
3201  count.aniso = numAnisos;
3202  count.oneFourNbThole = numOneFourNbTholes;
3203 
3204  return count;
3205 }
3206 
3207 TupleData ComputeBondedCUDAKernel::getData() {
3208  TupleData data;
3209  data.bond = bonds;
3210  data.angle = angles;
3211  data.dihedral = dihedrals;
3212  data.improper = impropers;
3213  data.modifiedExclusion = modifiedExclusions;
3214  data.exclusion = exclusions;
3215  data.crossterm = crossterms;
3216  data.thole = tholes;
3217  data.aniso = anisos;
3218  data.oneFourNbThole = oneFourNbTholes;
3219 
3220  return data;
3221 }
3222 
3223 size_t ComputeBondedCUDAKernel::reallocateTupleBuffer(
3224  const TupleCounts countIn,
3225  cudaStream_t
3226 ) {
3227  const int numBondsWA = warpAlign(countIn.bond);
3228  const int numAnglesWA = warpAlign(countIn.angle);
3229  const int numDihedralsWA = warpAlign(countIn.dihedral);
3230  const int numImpropersWA = warpAlign(countIn.improper);
3231  const int numModifiedExclusionsWA = warpAlign(countIn.modifiedExclusion);
3232  const int numExclusionsWA = warpAlign(countIn.exclusion);
3233  const int numCrosstermsWA = warpAlign(countIn.crossterm);
3234  const int numTholesWA = warpAlign(countIn.thole);
3235  const int numAnisosWA = warpAlign(countIn.aniso);
3236  const int numOneFourNbTholesWA = warpAlign(countIn.oneFourNbThole);
3237 
3238  const size_t sizeTot = numBondsWA*sizeof(CudaBond)
3239  + numAnglesWA*sizeof(CudaAngle)
3240  + numDihedralsWA*sizeof(CudaDihedral)
3241  + numImpropersWA*sizeof(CudaDihedral)
3242  + numModifiedExclusionsWA*sizeof(CudaExclusion)
3243  + numExclusionsWA*sizeof(CudaExclusion)
3244  + numCrosstermsWA*sizeof(CudaCrossterm)
3245  + numTholesWA*sizeof(CudaThole)
3246  + numAnisosWA*sizeof(CudaAniso)
3247  + numOneFourNbTholesWA*sizeof(CudaOneFourNbThole);
3248 
3249  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, kTupleOveralloc);
3250 
3251  // Setup pointers
3252  size_t pos = 0;
3253  bonds = (CudaBond *)&tupleData[pos];
3254  pos += numBondsWA*sizeof(CudaBond);
3255 
3256  angles = (CudaAngle* )&tupleData[pos];
3257  pos += numAnglesWA*sizeof(CudaAngle);
3258 
3259  dihedrals = (CudaDihedral* )&tupleData[pos];
3260  pos += numDihedralsWA*sizeof(CudaDihedral);
3261 
3262  impropers = (CudaDihedral* )&tupleData[pos];
3263  pos += numImpropersWA*sizeof(CudaDihedral);
3264 
3265  modifiedExclusions = (CudaExclusion* )&tupleData[pos];
3266  pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
3267 
3268  exclusions = (CudaExclusion* )&tupleData[pos];
3269  pos += numExclusionsWA*sizeof(CudaExclusion);
3270 
3271  crossterms = (CudaCrossterm* )&tupleData[pos];
3272  pos += numCrosstermsWA*sizeof(CudaCrossterm);
3273 
3274  tholes = (CudaThole *)&tupleData[pos];
3275  pos += numTholesWA*sizeof(CudaThole);
3276 
3277  anisos = (CudaAniso *)&tupleData[pos];
3278  pos += numAnisosWA*sizeof(CudaAniso);
3279 
3280  oneFourNbTholes = (CudaOneFourNbThole *)&tupleData[pos];
3281  pos += numOneFourNbTholesWA*sizeof(CudaOneFourNbThole);
3282 
3283  return sizeTot;
3284 }
3285 
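// All tuple arrays above are carved out of a single device allocation,
// with each count rounded up so every sub-array starts warp-aligned.
// warpAlign is defined elsewhere; a plausible implementation, shown only
// as an assumption to make the sizing concrete, would be:
#if 0
static int warpAlign(const int n) {
  // round n up to the next multiple of the warp size
  return ((n + WARPSIZE - 1) / WARPSIZE) * WARPSIZE;
}
#endif
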
3286 void ComputeBondedCUDAKernel::update(
3287  const int numBondsIn,
3288  const int numAnglesIn,
3289  const int numDihedralsIn,
3290  const int numImpropersIn,
3291  const int numModifiedExclusionsIn,
3292  const int numExclusionsIn,
3293  const int numCrosstermsIn,
3294  const int numTholesIn,
3295  const int numAnisosIn,
3296  const int numOneFourNbTholesIn,
3297  const char* h_tupleData,
3298  cudaStream_t stream) {
3299 
3300  numBonds = numBondsIn;
3301  numAngles = numAnglesIn;
3302  numDihedrals = numDihedralsIn;
3303  numImpropers = numImpropersIn;
3304  numModifiedExclusions = numModifiedExclusionsIn;
3305  numExclusions = numExclusionsIn;
3306  numCrossterms = numCrosstermsIn;
3307  numTholes = numTholesIn;
3308  numAnisos = numAnisosIn;
3309  numOneFourNbTholes = numOneFourNbTholesIn;
3310 
3311  const size_t sizeTot = reallocateTupleBuffer(getTupleCounts(), stream);
3312 
3313  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
3314 }
3315 
3316 void ComputeBondedCUDAKernel::updateAtomBuffer(
3317  const int atomStorageSize,
3318  cudaStream_t stream
3319 ) {
3320  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3321 }
3322 
3323 //
3324 // Return stride for force array
3325 //
3326 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
3327 #ifdef USE_STRIDED_FORCE
3328  // Align stride to 256 bytes
3329  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
3330 #else
3331  return 1;
3332 #endif
3333 }
3334 
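// Worked example, assuming FORCE_TYPE is double (sizeof == 8):
// atomStorageSize = 1000 -> 8000 bytes -> rounded up to 8192 bytes
// -> stride = 1024 elements, so the y and z component arrays also start
// on 256-byte boundaries.
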
3335 //
3336 // Return size of single force array
3337 //
3338 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
3339 #ifdef USE_STRIDED_FORCE
3340  return (3*getForceStride(atomStorageSize));
3341 #else
3342  return (3*atomStorageSize);
3343 #endif
3344 }
3345 
3346 //
3347 // Return the size of all force arrays
3348 //
3349 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
3350 
3351  int forceSize = getForceSize(atomStorageSize);
3352 
3353  if (numModifiedExclusions > 0 || numExclusions > 0) {
3354  if (doSlow) {
3355  // All three force arrays [normal, nbond, slow]
3356  forceSize *= 3;
3357  } else {
3358  // Two force arrays [normal, nbond]
3359  forceSize *= 2;
3360  }
3361  }
3362 
3363  return forceSize;
3364 }
3365 
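// Resulting layout when exclusions are present and doSlow is true: one
// allocation holding three force arrays back to back,
// [ normal | nbond | slow ], each getForceSize(atomStorageSize) long.
// bondedForce() below derives startNbond = forceSize and
// startSlow = 2*forceSize from this layout.
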
3366 //
3367 // Compute bonded forces
3368 //
3369 void ComputeBondedCUDAKernel::bondedForce(
3370  const double scale14, const int atomStorageSize,
3371  const bool doEnergy, const bool doVirial, const bool doSlow,
3372  const bool doTable,
3373  const float3 lata, const float3 latb, const float3 latc,
3374  const float cutoff2, const float r2_delta, const int r2_delta_expc,
3375  const CudaNBConstants nbConstants,
3376  const float4* h_xyzq, FORCE_TYPE* h_forces,
3377  double *h_energies_virials, bool atomsChanged, bool CUDASOAintegrate,
3378  bool useDeviceMigration, cudaStream_t stream) {
3379 
3380  int forceStorageSize = getAllForceSize(atomStorageSize, true);
3381  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
3382  int forceStride = getForceStride(atomStorageSize);
3383 
3384  int forceSize = getForceSize(atomStorageSize);
3385  int startNbond = forceSize;
3386  int startSlow = 2*forceSize;
3387  // Re-allocate coordinate and force arrays if necessary
3388  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3389  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
3390 
3391 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3392  // function stores
3393  // numBonds bondForce 2
3394  // numAngles angleForce 3
3395  // numDihedrals diheForce 4
3396  // numImpropers diheForce 4
3397  // numExclusions exclusionForce 2
3398  // numCrossterms crosstermForce 8
3399  // numModifiedExclusions modifiedExclusionForce 4
3400  // numTholes tholeForce 4
3401  // numAnisos anisoForce 5
3402  // numOneFourNbTholes oneFourNbTholeForce 4
3403  int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4 + numTholes * 4 + numAnisos * 5 + numOneFourNbTholes * 4);
3404  reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
3405  reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
3406  reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
3407  int* forceListStartsNbond = forceListStarts + atomStorageSize;
3408  int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
3409 
3410  clear_device_array<int>(forceListCounter, 1, stream);
3411  cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
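  // forceList stores one x/y/z triplet per contribution (the factor of 3
  // in listSize above), while forceListStarts holds three
  // atomStorageSize-long head arrays, one per output force array
  // [normal | nbond | slow]; memsetting the heads to -1 marks every list
  // as empty.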
3412 #else
3413  int* forceListStartsNbond = NULL;
3414  int* forceListStartsSlow = NULL;
3415 #endif
3416 
3417 #ifdef NODEGROUP_FORCE_REGISTER
3418  if(CUDASOAintegrate){
3419  if(atomsChanged && !useDeviceMigration) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3420  }else copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3421 #else
3422  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3423 #endif
3424 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
3425  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
3426 #endif
3427  if (doEnergy || doVirial) {
3428  clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
3429  }
3430 
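  // Each kernel block accumulates energies and virials into bin
  // (blockIdx.x % ATOMIC_BINS) of energies_virials to spread atomicAdd
  // traffic across ATOMIC_BINS copies; reduceBondedBinsKernel folds the
  // bins back into row 0 at the end of bondedForce().
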
3431  // Check whether we are doing an alchemical free energy calculation.
3432  // Checking alchOn as well may be redundant, but it serves as a guard.
3433  const bool doFEP = (alchFlags.alchOn) && (alchFlags.alchFepOn);
3434  const bool doTI = (alchFlags.alchOn) && (alchFlags.alchThermIntOn);
3435 
3436  float one_scale14 = (float)(1.0 - scale14);
3437  bool doElect = (one_scale14 != 0.0f);
3438 
3439  // If doSlow = false, these exclusions are not calculated
3440  int numExclusionsDoSlow = doSlow ? numExclusions : 0;
3441 
3442  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3443 
3444  int numBondsTB = (numBonds + nthread - 1)/nthread;
3445  int numAnglesTB = (numAngles + nthread - 1)/nthread;
3446  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
3447  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
3448  int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
3449  int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
3450  int numTholesTB = (numTholes + nthread - 1)/nthread;
3451  int numAnisosTB = (numAnisos + nthread - 1)/nthread;
3452  int numOneFourNbTholesTB = doElect ? (numOneFourNbTholes + nthread - 1)/nthread : 0;
3453 
3454  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
3455  numExclusionsTB + numCrosstermsTB + numTholesTB + numAnisosTB + numOneFourNbTholesTB;
3456  int shmem_size = 0;
3457 
3458  // printf("%d %d %d %d %d %d nblock %d\n",
3459  // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
3460 
3461  int max_nblock = deviceCUDA->getMaxNumBlocks();
3462 
3463  int start = 0;
3464  while (start < nblock)
3465  {
3466  int nleft = nblock - start;
3467  int nblock_use = min(max_nblock, nleft);
3468 
3469 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3470  #define TABLE_PARAMS cudaNonbondedTables.get_r2_table(), \
3471  cudaNonbondedTables.getExclusionTable()
3472 #else
3473  #define TABLE_PARAMS cudaNonbondedTables.get_r2_table_tex(), \
3474  cudaNonbondedTables.getExclusionTableTex()
3475 #endif
3476 
3477 #if defined(USE_TABLE_ARRAYS)
3478 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI) \
3479  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI> \
3480  <<< nblock_use, nthread, shmem_size, stream >>> \
3481  (start, numBonds, bonds, bondValues, \
3482  numAngles, angles, angleValues, \
3483  numDihedrals, dihedrals, dihedralValues, \
3484  numImpropers, impropers, improperValues, \
3485  numExclusionsDoSlow, exclusions, \
3486  numCrossterms, crossterms, crosstermValues, \
3487  numTholes, tholes, \
3488  numAnisos, anisos, \
3489  numOneFourNbTholes, oneFourNbTholes, \
3490  cutoff2, one_scale14, \
3491  r2_delta, r2_delta_expc, \
3492  TABLE_PARAMS, \
3493  xyzq, forceStride, \
3494  lata, latb, latc, \
3495  forces, &forces[startSlow], \
3496  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3497  energies_virials);
3498 #else
3499 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI) \
3500  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI> \
3501  <<< nblock_use, nthread, shmem_size, stream >>> \
3502  (start, numBonds, bonds, bondValues, \
3503  numAngles, angles, angleValues, \
3504  numDihedrals, dihedrals, dihedralValues, \
3505  numImpropers, impropers, improperValues, \
3506  numExclusionsDoSlow, exclusions, \
3507  numCrossterms, crossterms, crosstermValues, \
3508  numTholes, tholes, \
3509  numAnisos, anisos, \
3510  numOneFourNbTholes, oneFourNbTholes, \
3511  cutoff2, one_scale14, \
3512  r2_delta, r2_delta_expc, \
3513  cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
3514  cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
3515  xyzq, forceStride, \
3516  lata, latb, latc, \
3517  forces, &forces[startSlow], \
3518  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3519  energies_virials);
3520 #endif
3521  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI) CALL(0, 0, 0, 0, 0);
3522  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI) CALL(0, 1, 0, 0, 0);
3523  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI) CALL(1, 0, 0, 0, 0);
3524  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI) CALL(1, 1, 0, 0, 0);
3525 
3526  if (!doEnergy && !doVirial && !doElect && doFEP && !doTI) CALL(0, 0, 0, 1, 0);
3527  if (!doEnergy && doVirial && !doElect && doFEP && !doTI) CALL(0, 1, 0, 1, 0);
3528  if ( doEnergy && !doVirial && !doElect && doFEP && !doTI) CALL(1, 0, 0, 1, 0);
3529  if ( doEnergy && doVirial && !doElect && doFEP && !doTI) CALL(1, 1, 0, 1, 0);
3530 
3531  if (!doEnergy && !doVirial && !doElect && !doFEP && doTI) CALL(0, 0, 0, 0, 1);
3532  if (!doEnergy && doVirial && !doElect && !doFEP && doTI) CALL(0, 1, 0, 0, 1);
3533  if ( doEnergy && !doVirial && !doElect && !doFEP && doTI) CALL(1, 0, 0, 0, 1);
3534  if ( doEnergy && doVirial && !doElect && !doFEP && doTI) CALL(1, 1, 0, 0, 1);
3535 
3536  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI) CALL(0, 0, 1, 0, 0);
3537  if (!doEnergy && doVirial && doElect && !doFEP && !doTI) CALL(0, 1, 1, 0, 0);
3538  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI) CALL(1, 0, 1, 0, 0);
3539  if ( doEnergy && doVirial && doElect && !doFEP && !doTI) CALL(1, 1, 1, 0, 0);
3540 
3541  if (!doEnergy && !doVirial && doElect && doFEP && !doTI) CALL(0, 0, 1, 1, 0);
3542  if (!doEnergy && doVirial && doElect && doFEP && !doTI) CALL(0, 1, 1, 1, 0);
3543  if ( doEnergy && !doVirial && doElect && doFEP && !doTI) CALL(1, 0, 1, 1, 0);
3544  if ( doEnergy && doVirial && doElect && doFEP && !doTI) CALL(1, 1, 1, 1, 0);
3545 
3546  if (!doEnergy && !doVirial && doElect && !doFEP && doTI) CALL(0, 0, 1, 0, 1);
3547  if (!doEnergy && doVirial && doElect && !doFEP && doTI) CALL(0, 1, 1, 0, 1);
3548  if ( doEnergy && !doVirial && doElect && !doFEP && doTI) CALL(1, 0, 1, 0, 1);
3549  if ( doEnergy && doVirial && doElect && !doFEP && doTI) CALL(1, 1, 1, 0, 1);
3550 
3551  // FEP and TI cannot both be enabled; the following combinations are not expanded.
3552 #if 0
3553  if (!doEnergy && !doVirial && doElect && doFEP && doTI) CALL(0, 0, 1, 1, 1);
3554  if (!doEnergy && doVirial && doElect && doFEP && doTI) CALL(0, 1, 1, 1, 1);
3555  if ( doEnergy && !doVirial && doElect && doFEP && doTI) CALL(1, 0, 1, 1, 1);
3556  if ( doEnergy && doVirial && doElect && doFEP && doTI) CALL(1, 1, 1, 1, 1);
3557 #endif
3558 
3559 #undef CALL
3560 #undef TABLE_PARAMS
3561  cudaCheck(cudaGetLastError());
3562 
3563  start += nblock_use;
3564  }
3565 
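  // Note on the CALL(...) ladders: runtime flags are mapped onto
  // compile-time template parameters, one kernel instantiation per flag
  // combination, so the compiler can strip energy/virial/FEP/TI branches
  // from instantiations that do not need them; the cost is the exhaustive
  // ladder of mutually exclusive launches above.
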
3566  nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3567  nblock = (numModifiedExclusions + nthread - 1)/nthread;
3568 
3569  start = 0;
3570  while (start < nblock)
3571  {
3572  int nleft = nblock - start;
3573  int nblock_use = min(max_nblock, nleft);
3574 
3575 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3576 #define TABLE_PARAMS \
3577  cudaNonbondedTables.getExclusionVdwCoefTable(), \
3578  cudaNonbondedTables.getModifiedExclusionForceTable(), \
3579  cudaNonbondedTables.getModifiedExclusionEnergyTable()
3580 #else
3581 #define TABLE_PARAMS \
3582  cudaNonbondedTables.getExclusionVdwCoefTable(), \
3583  cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
3584  cudaNonbondedTables.getModifiedExclusionForceTableTex(), \
3585  cudaNonbondedTables.getModifiedExclusionEnergyTableTex()
3586 #endif
3587 
3588 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE) \
3589  modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE> \
3590  <<< nblock_use, nthread, shmem_size, stream >>> \
3591  (start, numModifiedExclusions, modifiedExclusions, \
3592  doSlow, one_scale14, cutoff2, nbConstants, \
3593  cudaNonbondedTables.getVdwCoefTableWidth(), \
3594  TABLE_PARAMS, \
3595  xyzq, forceStride, lata, latb, latc, \
3596  &forces[startNbond], &forces[startSlow], \
3597  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
3598  energies_virials);
3599 
3600 
3601 
3602  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 0, 0, 0, 0, 1);
3603  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 1, 0, 0, 0, 1);
3604  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 0, 0, 0, 0, 1);
3605  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 1, 0, 0, 0, 1);
3606 
3607  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 0, 1, 0, 0, 1);
3608  if (!doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 1, 1, 0, 0, 1);
3609  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 0, 1, 0, 0, 1);
3610  if ( doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 1, 1, 0, 0, 1);
3611 
3612  if (!doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 0, 0, 1, 0, 1);
3613  if (!doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 1, 0, 1, 0, 1);
3614  if ( doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 0, 0, 1, 0, 1);
3615  if ( doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 1, 0, 1, 0, 1);
3616 
3617  if (!doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 0, 1, 1, 0, 1);
3618  if (!doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 1, 1, 1, 0, 1);
3619  if ( doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 0, 1, 1, 0, 1);
3620  if ( doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 1, 1, 1, 0, 1);
3621 
3622  if (!doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 0, 0, 0, 1, 1);
3623  if (!doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 1, 0, 0, 1, 1);
3624  if ( doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 0, 0, 0, 1, 1);
3625  if ( doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 1, 0, 0, 1, 1);
3626 
3627  if (!doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 0, 1, 0, 1, 1);
3628  if (!doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 1, 1, 0, 1, 1);
3629  if ( doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 0, 1, 0, 1, 1);
3630  if ( doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 1, 1, 0, 1, 1);
3631 
3632  // Disabling doTable is supported only without FEP and TI
3633  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 0, 0, 0, 0);
3634  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 0, 0, 0, 0);
3635  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 0, 0, 0, 0);
3636  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 0, 0, 0, 0);
3637 
3638  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 1, 0, 0, 0);
3639  if (!doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 1, 0, 0, 0);
3640  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 1, 0, 0, 0);
3641  if ( doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 1, 0, 0, 0);
3642 
3643  // FEP and TI cannot both be enabled; the following combinations are not expanded.
3644 #if 0
3645  if (!doEnergy && !doVirial && !doElect && doFEP && doTI && doTable) CALL(0, 0, 0, 1, 1, 1);
3646  if (!doEnergy && doVirial && !doElect && doFEP && doTI && doTable) CALL(0, 1, 0, 1, 1, 1);
3647  if ( doEnergy && !doVirial && !doElect && doFEP && doTI && doTable) CALL(1, 0, 0, 1, 1, 1);
3648  if ( doEnergy && doVirial && !doElect && doFEP && doTI && doTable) CALL(1, 1, 0, 1, 1, 1);
3649 
3650  if (!doEnergy && !doVirial && doElect && doFEP && doTI && doTable) CALL(0, 0, 1, 1, 1, 1);
3651  if (!doEnergy && doVirial && doElect && doFEP && doTI && doTable) CALL(0, 1, 1, 1, 1, 1);
3652  if ( doEnergy && !doVirial && doElect && doFEP && doTI && doTable) CALL(1, 0, 1, 1, 1, 1);
3653  if ( doEnergy && doVirial && doElect && doFEP && doTI && doTable) CALL(1, 1, 1, 1, 1, 1);
3654 #endif
3655 
3656 #undef CALL
3657  cudaCheck(cudaGetLastError());
3658 
3659  start += nblock_use;
3660  }
3661 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3662  nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3663  nblock = (atomStorageSize + nthread - 1)/nthread;
3664 
3665  start = 0;
3666  while (start < nblock)
3667  {
3668  int nleft = nblock - start;
3669  int nblock_use = min(max_nblock, nleft);
3670 
3671  // cudaCheck(hipDeviceSynchronize());
3672  // auto t0 = std::chrono::high_resolution_clock::now();
3673 
3674  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3675  start, atomStorageSize, forceStride,
3676  forceList, forceListStarts, forceListNexts,
3677  forces);
3678  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3679  start, atomStorageSize, forceStride,
3680  forceList, forceListStartsNbond, forceListNexts,
3681  &forces[startNbond]);
3682  if (doSlow) {
3683  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3684  start, atomStorageSize, forceStride,
3685  forceList, forceListStartsSlow, forceListNexts,
3686  &forces[startSlow]);
3687  }
3688  cudaCheck(cudaGetLastError());
3689 
3690  // cudaCheck(hipStreamSynchronize(stream));
3691  // auto t1 = std::chrono::high_resolution_clock::now();
3692  // std::chrono::duration<double> diff1 = t1 - t0;
3693  // std::cout << "gatherBondedForcesKernel";
3694  // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
3695 
3696  start += nblock_use;
3697  }
3698 #endif
3699 
3700 #ifdef NODEGROUP_FORCE_REGISTER
3701  if((atomsChanged && !useDeviceMigration) || !CUDASOAintegrate){
3702  copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3703 #if 0
3704  // XXX TODO: debug-only force dump; erase once verified
3705  // atomStorageSize is not numAtoms, so only a fixed index range is dumped
3706  // prints the forces computed inside this compute for comparison afterwards
3707  FILE* pos_nb_atoms = fopen("compute_b_dforce.txt", "w");
3708  //fprintf(pos_nb_atoms, "Calculating %d positions\n", tlKernel.getCudaPatchesSize());
3709  // positions are wrong here.
3710  //for(int i = 0; i < atomStorageSize; i++){
3711  for(int i = 29415; i < 29895; i++){
3712  fprintf(pos_nb_atoms, "%2.4lf\n", h_forces[i]);
3713  }
3714  fclose(pos_nb_atoms);
3715 #endif
3716  }
3717 
3718 #else
3719  copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3720 #endif
3721  if (doEnergy || doVirial) {
3722  if (ATOMIC_BINS > 1) {
3723  // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
3724  reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
3725  }
3726  // Copy the reduced energies and virials back to the host (TODO: confirm this copy is always needed)
3727  copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
3728  }
3729 }
3730 
3731