1 #include "TupleTypesCUDA.h"
3 #define _USE_MATH_DEFINES
4 #define __thread __declspec(thread)
8 #include <hipcub/hipcub.hpp>
15 #if (NAMD_CCCL_MAJOR_VERSION >= 3)
19 #if __CUDACC_VER_MAJOR__ >= 11
20 #include <cub/cub.cuh>
22 #include <namd_cub/cub.cuh>
28 #include "ComputeBondedCUDAKernel.h"
29 #include "CudaComputeNonbondedInteractions.h"
31 #ifdef FUTURE_CUDADEVICE
32 #include "CudaDevice.h"
34 #include "DeviceCUDA.h"
35 extern __thread DeviceCUDA *deviceCUDA;
38 #ifndef USE_TABLE_ARRAYS
39 __device__ __forceinline__
40 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
41 #if defined(NAMD_CUDA)
42 return tex1D<float4>(tex, k);
44 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
45 const float x = k * (float)tableSize - 0.5f;
46 const float f = floorf(x);
47 const float a = x - f;
48 const unsigned int i = (unsigned int)f;
49 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
50 const int i1 = i0 + 1;
51 const float4 t0 = tex1Dfetch<float4>(tex, i0);
52 const float4 t1 = tex1Dfetch<float4>(tex, i1);
54 a * (t1.x - t0.x) + t0.x,
55 a * (t1.y - t0.y) + t0.y,
56 a * (t1.z - t0.z) + t0.z,
57 a * (t1.w - t0.w) + t0.w);
62 __device__ __forceinline__
63 float4 tableLookup(const float4* table, const float k)
65 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
66 const float x = k * static_cast<float>(tableSize) - 0.5f;
67 const float f = floorf(x);
68 const float a = x - f;
69 const int i = static_cast<int>(f);
70 const int i0 = max(0, min(tableSize - 1, i));
71 const int i1 = max(0, min(tableSize - 1, i + 1));
72 const float4 t0 = __ldg(&table[i0]);
73 const float4 t1 = __ldg(&table[i1]);
75 a * (t1.x - t0.x) + t0.x,
76 a * (t1.y - t0.y) + t0.y,
77 a * (t1.z - t0.z) + t0.z,
78 a * (t1.w - t0.w) + t0.w);
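// For reference: the lookup above mirrors the tex1D linear-filtering convention
// used in sampleTableTex: sample centers sit at (i + 0.5)/tableSize, so
// x = k*N - 0.5, a = frac(x), and the result is t0 + a*(t1 - t0) with both
// indices clamped to [0, N-1]. Illustrative example (values assumed, N = 4):
// k = 0.5 gives x = 1.5, i0 = 1, i1 = 2, a = 0.5, i.e. the midpoint of
// table entries 1 and 2.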
81 // Global variables for alchemical transformations
82 namespace AlchBondedCUDA {
83 // Alchemical parameters and lambdas
84 __constant__ CudaAlchParameters alchParams;
85 __constant__ CudaAlchLambdas alchLambdas;
90 __forceinline__ __device__
91 void convertForces(const float x, const float y, const float z,
92 T &Tx, T &Ty, T &Tz) {
100 __forceinline__ __device__
101 void calcComponentForces(
103 const float dx, const float dy, const float dz,
104 T &fxij, T &fyij, T &fzij) {
112 template <typename T>
113 __forceinline__ __device__
114 void calcComponentForces(
116 const float dx1, const float dy1, const float dz1,
118 const float dx2, const float dy2, const float dz2,
119 T &fxij, T &fyij, T &fzij) {
121 fxij = (T)(fij1*dx1 + fij2*dx2);
122 fyij = (T)(fij1*dy1 + fij2*dy2);
123 fzij = (T)(fij1*dz1 + fij2*dz2);
126 __forceinline__ __device__
127 int warpAggregatingAtomicInc(int* counter) {
128 #if BOUNDINGBOXSIZE == 64
129 WarpMask mask = __ballot(1);
130 int total = __popcll(mask);
131 int prefix = __popcll(mask & cub::LaneMaskLt());
132 int firstLane = __ffsll(mask) - 1;
133 #elif defined(NAMD_CUDA) && (NAMD_CCCL_MAJOR_VERSION >= 3)
134 // cub::LaneMaskLt was removed with CCCL 3.0
135 WarpMask mask = __ballot_sync(WARP_FULL_MASK, 1);
136 int total = __popc(mask);
137 int prefix = __popc(mask & cuda::ptx::get_sreg_lanemask_lt());
138 int firstLane = __ffs(mask) - 1;
140 WarpMask mask = __ballot(1);
141 int total = __popc(mask);
142 int prefix = __popc(mask & cub::LaneMaskLt());
143 int firstLane = __ffs(mask) - 1;
147 start = atomicAdd(counter, total);
149 start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
150 return start + prefix;
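// For reference: this is a warp-aggregated atomic increment. The first active
// lane performs a single atomicAdd for the whole warp, the base offset is
// broadcast with WARP_SHUFFLE, and each active lane adds the number of active
// lanes below it (prefix). Worked example (assumed values): if lanes 0, 2 and 3
// call this with *counter == 40, then total = 3, firstLane = 0, the atomicAdd
// returns 40, and the three lanes receive 40, 41 and 42 respectively.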
153 template <typename T>
154 __forceinline__ __device__
155 void storeForces(const T fx, const T fy, const T fz,
156 const int ind, const int stride,
158 T* forceList, int* forceListCounter,
159 int* forceListStarts, int* forceListNexts) {
161 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
162 #if defined(NAMD_HIP)
163 // Try to reduce conflicts between lanes when there are repeating values of ind
164 WarpMask mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, 1);
165 if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
166 const int laneId = threadIdx.x % WARPSIZE;
167 const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
168 const bool isHead = laneId == 0 || ind != prevLaneInd;
169 if (!NAMD_WARP_ALL(WARP_FULL_MASK, isHead)) {
170 // There are segments of repeating ind
171 typedef cub::WarpReduce<T> WarpReduce;
172 __shared__ typename WarpReduce::TempStorage temp_storage;
173 const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
174 const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
175 const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
177 atomicAdd(&force[ind ], sumfx);
178 atomicAdd(&force[ind + stride ], sumfy);
179 atomicAdd(&force[ind + stride*2], sumfz);
184 // Not all lanes are active (the last block) or there is no repeating ind
185 atomicAdd(&force[ind ], fx);
186 atomicAdd(&force[ind + stride ], fy);
187 atomicAdd(&force[ind + stride*2], fz);
189 atomicAdd(&force[ind ], fx);
190 atomicAdd(&force[ind + stride ], fy);
191 atomicAdd(&force[ind + stride*2], fz);
194 const int newPos = warpAggregatingAtomicInc(forceListCounter);
195 forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
196 forceList[newPos * 3 + 0] = fx;
197 forceList[newPos * 3 + 1] = fy;
198 forceList[newPos * 3 + 2] = fz;
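// For reference: the atomicExch above prepends slot newPos to a per-atom singly
// linked list, so forceListStarts[ind] always points at the newest contribution
// and forceListNexts chains the older ones. A later gather pass can then
// accumulate the entries; a minimal sketch, assuming the lists are terminated
// with -1:
//   for (int p = forceListStarts[i]; p != -1; p = forceListNexts[p]) {
//     fx += forceList[3*p]; fy += forceList[3*p + 1]; fz += forceList[3*p + 2];
//   }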
205 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
206 __device__ void bondForce(
208 const CudaBond* __restrict__ bondList,
209 const CudaBondValue* __restrict__ bondValues,
210 const float4* __restrict__ xyzq,
212 const float3 lata, const float3 latb, const float3 latc,
213 T* __restrict__ force, double &energy,
214 T* __restrict__ forceList, int* forceListCounter,
215 int* forceListStarts, int* forceListNexts,
216 #ifdef WRITE_FULL_VIRIALS
217 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
219 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
221 double &energy_F, double &energy_TI_1, double &energy_TI_2
224 CudaBond bl = bondList[index];
225 CudaBondValue bondValue = bondValues[bl.itype];
226 // if (bondValue.x0 == 0.0f) return;
228 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
229 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
230 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
232 float4 xyzqi = xyzq[bl.i];
233 float4 xyzqj = xyzq[bl.j];
235 float xij = xyzqi.x + shx - xyzqj.x;
236 float yij = xyzqi.y + shy - xyzqj.y;
237 float zij = xyzqi.z + shz - xyzqj.z;
239 float r = sqrtf(xij*xij + yij*yij + zij*zij);
241 float db = r - bondValue.x0;
243 // in this case, the bond represents a harmonic wall potential
244 // where x0 is the lower wall and x1 is the upper
245 db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
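// Piecewise form: db = r - x1 for r > x1, db = 0 for x0 < r <= x1 (flat
// bottom, no force), and db = r - x0 for r <= x0, so the k*db^2 energy and
// the force below only act outside the [x0, x1] window.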
247 float fij = db * bondValue.k * bl.scale;
249 // Define a temporary variable to store the energy,
250 // used by both the alchemical and non-alchemical routes
251 float energy_tmp = 0.0f;
253 energy_tmp += fij * db;
257 // JM: TODO: move this out of here
258 float alch_scale = 1.0f;
260 switch (bl.fepBondedType) {
262 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
264 energy_TI_1 += (double)(energy_tmp);
267 energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
272 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
274 energy_TI_2 += (double)(energy_tmp);
277 energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
286 energy += (double)(energy_tmp * alch_scale);
288 energy += (double)(energy_tmp);
292 // XXX TODO: Get this off here
293 // XXX TODO: Decide on a templated parameter nomenclature
298 T T_fxij, T_fyij, T_fzij;
299 calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
302 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
303 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
307 #ifdef WRITE_FULL_VIRIALS
308 float fxij = fij*xij;
309 float fyij = fij*yij;
310 float fzij = fij*zij;
311 virial.xx = (fxij*xij);
312 virial.xy = (fxij*yij);
313 virial.xz = (fxij*zij);
314 virial.yx = (fyij*xij);
315 virial.yy = (fyij*yij);
316 virial.yz = (fyij*zij);
317 virial.zx = (fzij*xij);
318 virial.zy = (fzij*yij);
319 virial.zz = (fzij*zij);
325 // Calculates modified exclusions
327 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
328 __device__ void modifiedExclusionForce(
330 const CudaExclusion* __restrict__ exclusionList,
332 const float one_scale14, // 1 - scale14
333 const int vdwCoefTableWidth,
334 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
335 const float2* __restrict__ vdwCoefTable,
337 cudaTextureObject_t vdwCoefTableTex,
339 #ifdef USE_TABLE_ARRAYS
340 const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
342 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
344 const float4* __restrict__ xyzq,
346 const float3 lata, const float3 latb, const float3 latc,
348 const CudaNBConstants nbConstants,
350 T* __restrict__ forceNbond, double &energyNbond,
351 T* __restrict__ forceSlow, double &energySlow,
352 T* __restrict__ forceList, int* forceListCounter,
353 int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
354 #ifdef WRITE_FULL_VIRIALS
355 ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
356 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
358 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
359 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
361 double &energyVdw_F, double &energyVdw_TI_1, double &energyVdw_TI_2,
362 double &energyNbond_F, double &energyNbond_TI_1, double &energyNbond_TI_2,
363 double &energySlow_F, double &energySlow_TI_1, double &energySlow_TI_2
366 CudaExclusion bl = exclusionList[index];
368 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
369 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
370 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
372 float4 xyzqi = xyzq[bl.i];
373 float4 xyzqj = xyzq[bl.j];
375 float xij = xyzqi.x + shx - xyzqj.x;
376 float yij = xyzqi.y + shy - xyzqj.y;
377 float zij = xyzqi.z + shz - xyzqj.z;
379 float r2 = xij*xij + yij*yij + zij*zij;
382 float rinv = rsqrtf(r2);
385 if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
387 int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
388 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
389 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
391 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
395 float myElecLambda = 1.0f;
396 float myElecLambda2 = 1.0f;
397 float myVdwLambda = 1.0f;
398 float myVdwLambda2 = 1.0f;
399 float myVdwShift = 0.0f;
400 float myVdwShift2 = 0.0f;
401 double alch_vdw_energy = 0.0;
402 double alch_vdw_energy_2 = 0.0;
403 float alch_vdw_force = 0.0f;
404 float alch_vdw_dUdl = 0.0f;
405 double* p_energyVdw_TI = NULL;
406 double* p_energyNbond_TI = NULL;
407 double* p_energySlow_TI = NULL;
411 /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
412 float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
413 float vdwLambdaUp = (AlchBondedCUDA::alchLambdas).vdwLambdaUp;
414 float vdwShiftUp = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaUp);
415 float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
416 float vdwLambda2Up = (AlchBondedCUDA::alchLambdas).vdwLambda2Up;
417 float vdwShift2Up = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Up);
418 /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
419 float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
420 float vdwLambdaDown = (AlchBondedCUDA::alchLambdas).vdwLambdaDown;
421 float vdwShiftDown = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaDown);
422 float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
423 float vdwLambda2Down = (AlchBondedCUDA::alchLambdas).vdwLambda2Down;
424 float vdwShift2Down = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Down);
425 switch (bl.pswitch) {
426 case 0: myVdwLambda = 1.0f;
429 myElecLambda2 = 1.0f;
432 case 1: myVdwLambda = vdwLambdaUp;
433 myVdwLambda2 = vdwLambda2Up;
434 myElecLambda = elecLambdaUp;
435 myElecLambda2 = elecLambda2Up;
436 myVdwShift = vdwShiftUp;
437 myVdwShift2 = vdwShift2Up;
438 p_energyVdw_TI = &(energyVdw_TI_1);
439 p_energyNbond_TI= &(energyNbond_TI_1);
440 p_energySlow_TI = &(energySlow_TI_1);
442 case 2: myVdwLambda = vdwLambdaDown;
443 myVdwLambda2 = vdwLambda2Down;
444 myElecLambda = elecLambdaDown;
445 myElecLambda2 = elecLambda2Down;
446 myVdwShift = vdwShiftDown;
447 myVdwShift2 = vdwShift2Down;
448 p_energyVdw_TI = &(energyVdw_TI_2);
449 p_energyNbond_TI= &(energyNbond_TI_2);
450 p_energySlow_TI = &(energySlow_TI_2);
452 default: myVdwLambda = 0.0f;
455 myElecLambda2 = 0.0f;
458 if (bl.pswitch != 99 && bl.pswitch != 0) {
459 // Common part of FEP and TI
460 const float& switchOn2 = (AlchBondedCUDA::alchParams).switchDist2;
461 const float switchfactor = 1.0f / ((cutoff2 - switchOn2) * (cutoff2 - switchOn2) * (cutoff2 - switchOn2));
462 const float switchmul = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * (cutoff2 - r2) * (cutoff2 - 3.0f * switchOn2 + 2.0f * r2)) : 1.0f);
463 const float switchmul2 = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * 12.0f * (r2 - switchOn2)) : 0.0f);
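// For reference: switchmul is the CHARMM vdW switching function
//   S(r2) = (cutoff2 - r2)^2 * (cutoff2 - 3*switchOn2 + 2*r2) / (cutoff2 - switchOn2)^3
// for switchOn2 < r2 < cutoff2 (and 1 below switchOn2), and switchmul2 equals
// -2*dS/d(r2). With that convention, alch_vdw_force below works out to
// -(1/r)*dU/dr for U = myVdwLambda * S(r2) * U_LJ(r2 + myVdwShift), so the
// force vector contribution is alch_vdw_force * (xij, yij, zij).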
464 // A = ljab.x ; B = ljab.y
465 const float r2_1 = 1.0f / (r2 + myVdwShift);
466 const float r2_2 = 1.0f / (r2 + myVdwShift2);
467 const float r6_1 = r2_1 * r2_1 * r2_1;
468 const float r6_2 = r2_2 * r2_2 * r2_2;
469 const float U1 = ljab.x * r6_1 * r6_1 - ljab.y * r6_1;
470 const float U2 = ljab.x * r6_2 * r6_2 - ljab.y * r6_2;
471 // rinv is already calculated above
473 alch_vdw_energy = double(myVdwLambda * switchmul * U1);
475 alch_vdw_energy_2 = double(myVdwLambda2 * switchmul * U2);
478 alch_vdw_force = myVdwLambda * (switchmul * (12.0f * U1 + 6.0f * ljab.y * r6_1) * r2_1 + switchmul2 * U1);
480 alch_vdw_dUdl = switchmul * (U1 + myVdwLambda * (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (6.0f * U1 + 3.0f * ljab.y * r6_1) * r2_1);
484 alch_vdw_energy = 0.0;
485 alch_vdw_energy_2 = 0.0;
494 #ifdef USE_TABLE_ARRAYS
495 fi = tableLookup(forceTable, rinv);
497 fi = sampleTableTex(forceTableTex, rinv);
500 fi = tex1D<float4>(forceTableTex, rinv);
504 if (doEnergy && doTable) {
506 #ifdef USE_TABLE_ARRAYS
507 ei = tableLookup(energyTable, rinv);
509 ei = sampleTableTex(energyTableTex, rinv);
512 ei = tex1D<float4>(energyTableTex, rinv);
515 energyVdw += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda * p0Factor);
516 energyVdw += alch_vdw_energy;
518 energyVdw_F += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda2 * p0Factor);
519 energyVdw_F += alch_vdw_energy_2;
522 if (p_energyVdw_TI != NULL) {
523 (*p_energyVdw_TI) += alch_vdw_dUdl;
527 energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
530 energyNbond += qq * ei.x * myElecLambda;
532 energyNbond_F += qq * ei.x * myElecLambda2;
535 if (p_energyNbond_TI != NULL) (*p_energyNbond_TI) += qq * ei.x;
538 energySlow += qq * ei.w * myElecLambda;
540 energySlow_F += qq * ei.w * myElecLambda2;
543 if (p_energySlow_TI != NULL) (*p_energySlow_TI) += qq * ei.w;
553 fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
555 fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
558 float e_vdw, e_elec, e_slow;
565 cudaModExclForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy>(
566 doSlow, doElect, r2, rinv, qq, ljab,
567 nbConstants, fNbond, fSlow, e_vdw, e_elec, e_slow);
570 energyVdw += (double) (e_vdw);
571 energyNbond += (double) (e_elec * myElecLambda);
572 energySlow += (double) (e_slow * myElecLambda);
575 // fNbond = fFast + fVdw
577 if (bl.pswitch != 0) {
578 fNbond -= -(ljab.x * fi.z + ljab.y * fi.y);
579 fNbond = fNbond * myElecLambda + alch_vdw_force * float(bl.pswitch == 1 || bl.pswitch == 2);
581 // if pswitch == 0, myElecLambda = 1.0
583 T T_fxij, T_fyij, T_fzij;
584 calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
585 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
586 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
588 if (doSlow && doElect) {
593 fSlow *= myElecLambda;
595 T T_fxij, T_fyij, T_fzij;
596 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
597 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
598 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
603 #ifdef WRITE_FULL_VIRIALS
604 float fxij = fNbond*xij;
605 float fyij = fNbond*yij;
606 float fzij = fNbond*zij;
607 virialNbond.xx = (fxij*xij);
608 virialNbond.xy = (fxij*yij);
609 virialNbond.xz = (fxij*zij);
610 virialNbond.yx = (fyij*xij);
611 virialNbond.yy = (fyij*yij);
612 virialNbond.yz = (fyij*zij);
613 virialNbond.zx = (fzij*xij);
614 virialNbond.zy = (fzij*yij);
615 virialNbond.zz = (fzij*zij);
620 if (doVirial && doSlow && doElect) {
621 #ifdef WRITE_FULL_VIRIALS
622 float fxij = fSlow*xij;
623 float fyij = fSlow*yij;
624 float fzij = fSlow*zij;
625 virialSlow.xx = (fxij*xij);
626 virialSlow.xy = (fxij*yij);
627 virialSlow.xz = (fxij*zij);
628 virialSlow.yx = (fyij*xij);
629 virialSlow.yy = (fyij*yij);
630 virialSlow.yz = (fyij*zij);
631 virialSlow.zx = (fzij*xij);
632 virialSlow.zy = (fzij*yij);
633 virialSlow.zz = (fzij*zij);
641 // Calculates exclusions. Here doSlow = true
643 // doFull = doFullElectrostatics = doSlow
644 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
645 __device__ void exclusionForce(
647 const CudaExclusion* __restrict__ exclusionList,
648 const float r2_delta, const int r2_delta_expc,
650 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
651 const float* __restrict__ r2_table,
652 const float4* __restrict__ exclusionTable,
654 cudaTextureObject_t r2_table_tex,
655 cudaTextureObject_t exclusionTableTex,
657 const float4* __restrict__ xyzq,
659 const float3 lata, const float3 latb, const float3 latc,
661 T* __restrict__ forceSlow, double &energySlow,
662 T* __restrict__ forceList, int* forceListCounter,
663 int* forceListStartsSlow, int* forceListNexts,
664 #ifdef WRITE_FULL_VIRIALS
665 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
667 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
669 double &energy_F, double &energy_TI_1, double &energy_TI_2
672 CudaExclusion bl = exclusionList[index];
674 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
675 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
676 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
678 float4 xyzqi = xyzq[bl.i];
679 float4 xyzqj = xyzq[bl.j];
681 float xij = xyzqi.x + shx - xyzqj.x;
682 float yij = xyzqi.y + shy - xyzqj.y;
683 float zij = xyzqi.z + shz - xyzqj.z;
685 float r2 = xij*xij + yij*yij + zij*zij;
689 union { float f; int i; } r2i;
691 int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
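// For reference: r2i.f (r2 plus the r2_delta offset) is reinterpreted as its
// IEEE-754 bit pattern; shifting right by 17 keeps the exponent plus the top
// 6 mantissa bits, so table_i indexes an exponentially spaced r2 table with
// 64 bins per power of two, and r2_delta_expc offsets the exponent part so
// that the index starts at 0 (see the comment above).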
693 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
695 float r2_table_val = r2_table[table_i];
697 float r2_table_val = __ldg(&r2_table[table_i]);
700 float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
702 float diffa = r2 - r2_table_val;
703 float qq = xyzqi.w * xyzqj.w;
705 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
707 float4 slow = exclusionTable[table_i];
709 float4 slow = __ldg(&exclusionTable[table_i]);
712 float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
720 myElecLambda2 = 1.0f;
721 if (doTI) p_energy_TI = NULL;
722 /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
723 float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
724 float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
725 /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
726 float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
727 float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
728 switch (bl.pswitch) {
729 case 0: myElecLambda = 1.0f;
730 myElecLambda2 = 1.0f;
732 case 1: myElecLambda = elecLambdaUp;
733 myElecLambda2 = elecLambda2Up;
734 p_energy_TI = &(energy_TI_1);
736 case 2: myElecLambda = elecLambdaDown;
737 myElecLambda2 = elecLambda2Down;
738 p_energy_TI = &(energy_TI_2);
740 default: myElecLambda = 0.0f;
741 myElecLambda2 = 0.0f;
746 double energy_slow_tmp = (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
748 energySlow += energy_slow_tmp * myElecLambda;
750 energy_F += energy_slow_tmp * myElecLambda2;
753 if (p_energy_TI != NULL) {
754 (*p_energy_TI) += energy_slow_tmp;
758 energySlow += energy_slow_tmp;
762 float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
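// For reference: with d = diffa, energy_slow_tmp above equals
//   qq*(slow.w + slow.z*d/2 + slow.y*d^2/4 + slow.x*d^3/6),
// so fSlow = -2*d(energy_slow_tmp)/d(r2) = -(1/r)*dE/dr, and the slow-force
// vector applied below is fSlow * (xij, yij, zij).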
764 fSlow *= myElecLambda;
767 T T_fxij, T_fyij, T_fzij;
768 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
769 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
770 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
774 #ifdef WRITE_FULL_VIRIALS
775 float fxij = fSlow*xij;
776 float fyij = fSlow*yij;
777 float fzij = fSlow*zij;
778 virialSlow.xx = (fxij*xij);
779 virialSlow.xy = (fxij*yij);
780 virialSlow.xz = (fxij*zij);
781 virialSlow.yx = (fyij*xij);
782 virialSlow.yy = (fyij*yij);
783 virialSlow.yz = (fyij*zij);
784 virialSlow.zx = (fzij*xij);
785 virialSlow.zy = (fzij*yij);
786 virialSlow.zz = (fzij*zij);
792 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
793 __device__ void angleForce(const int index,
794 const CudaAngle* __restrict__ angleList,
795 const CudaAngleValue* __restrict__ angleValues,
796 const float4* __restrict__ xyzq,
798 const float3 lata, const float3 latb, const float3 latc,
799 T* __restrict__ force, double &energy,
800 T* __restrict__ forceList, int* forceListCounter,
801 int* forceListStarts, int* forceListNexts,
802 #ifdef WRITE_FULL_VIRIALS
803 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
805 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
807 double &energy_F, double &energy_TI_1, double &energy_TI_2
810 CudaAngle al = angleList[index];
812 float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
813 float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
814 float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
816 float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
817 float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
818 float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
820 float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
821 float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
822 float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
824 float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
825 float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
826 float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
828 float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
829 float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
831 float xijr = xij*rij_inv;
832 float yijr = yij*rij_inv;
833 float zijr = zij*rij_inv;
834 float xkjr = xkj*rkj_inv;
835 float ykjr = ykj*rkj_inv;
836 float zkjr = zkj*rkj_inv;
837 float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
839 CudaAngleValue angleValue = angleValues[al.itype];
840 angleValue.k *= al.scale;
843 if (angleValue.normal == 1) {
844 // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
845 cos_theta = min(0.999f, max(-0.999f, cos_theta));
846 float theta = acosf(cos_theta);
847 diff = theta - angleValue.theta0;
849 diff = cos_theta - angleValue.theta0;
852 float energy_tmp = 0.0f;
854 energy_tmp += angleValue.k * diff * diff;
859 // Be careful: alch_scale_energy_fep is 0 here!
860 float alch_scale_energy_fep;
862 // NOTE: Because of the Urey-Bradley terms, the energy calculation is not
863 // finished yet, so we cannot add energy_tmp to energy_TI_* and energy_F here,
864 // but we still need the scaling factor for the forces,
865 // so the code here differs slightly from bondForce:
866 // compute a scale factor for the FEP energy first, scale it into the
867 // final result once the energy evaluation is finished, and use
868 // a pointer to the correct energy_TI_x term.
871 alch_scale_energy_fep = 0.0f;
872 if (doTI) p_energy_TI = NULL;
873 switch (al.fepBondedType) {
875 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
878 p_energy_TI = &(energy_TI_1);
881 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
887 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
890 p_energy_TI = &(energy_TI_2);
893 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
901 if (angleValue.normal == 1) {
902 float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
903 if (inv_sin_theta > 1.0e6) {
904 diff = (diff < 0.0f) ? 1.0f : -1.0f;
906 diff *= -inv_sin_theta;
909 diff *= -2.0f*angleValue.k;
911 // Do alchemical scaling
916 float dtxi = rij_inv*(xkjr - cos_theta*xijr);
917 float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
918 float dtyi = rij_inv*(ykjr - cos_theta*yijr);
919 float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
920 float dtzi = rij_inv*(zkjr - cos_theta*zijr);
921 float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
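// For reference: (dtxi, dtyi, dtzi) = d(cos theta)/d(r_i) and
// (dtxj, dtyj, dtzj) = d(cos theta)/d(r_k) (note the naming: the "j" set is
// applied to atom k in storeForces below). Multiplying by diff, which holds
// -dE/d(cos theta) at this point, gives the forces on atoms i and k; atom j
// receives minus their sum, so the three forces sum to zero.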
923 T T_dtxi, T_dtyi, T_dtzi;
924 T T_dtxj, T_dtyj, T_dtzj;
925 calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
926 calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
927 T T_dtxk = -T_dtxi - T_dtxj;
928 T T_dtyk = -T_dtyi - T_dtyj;
929 T T_dtzk = -T_dtzi - T_dtzj;
930 storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
932 if (angleValue.k_ub) {
933 float xik = xij - xkj;
934 float yik = yij - ykj;
935 float zik = zij - zkj;
936 float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
937 float rik = 1.0f/rik_inv;
938 float diff_ub = rik - angleValue.r_ub;
940 energy_tmp += angleValue.k_ub * diff_ub * diff_ub;
942 diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
944 diff_ub *= alch_scale;
946 T T_dxik, T_dyik, T_dzik;
947 calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
960 energy += double(energy_tmp * alch_scale);
962 if (p_energy_TI != NULL) {
963 *(p_energy_TI) += double(energy_tmp);
967 energy_F += double(energy_tmp * alch_scale_energy_fep);
970 energy += double(energy_tmp);
974 storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
975 storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
979 #ifdef WRITE_FULL_VIRIALS
980 float fxi = ((float)T_dtxi);
981 float fyi = ((float)T_dtyi);
982 float fzi = ((float)T_dtzi);
983 float fxk = ((float)T_dtxj);
984 float fyk = ((float)T_dtyj);
985 float fzk = ((float)T_dtzj);
986 virial.xx = (fxi*xij) + (fxk*xkj);
987 virial.xy = (fxi*yij) + (fxk*ykj);
988 virial.xz = (fxi*zij) + (fxk*zkj);
989 virial.yx = (fyi*xij) + (fyk*xkj);
990 virial.yy = (fyi*yij) + (fyk*ykj);
991 virial.yz = (fyi*zij) + (fyk*zkj);
992 virial.zx = (fzi*xij) + (fzk*xkj);
993 virial.zy = (fzi*yij) + (fzk*ykj);
994 virial.zz = (fzi*zij) + (fzk*zkj);
1000 // Dihedral computation
1004 template <bool doEnergy>
1005 __forceinline__ __device__
1006 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
1007 const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
1010 if (doEnergy) e = 0.0;
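// For reference: df accumulates -dE/d(phi). For a periodic term
// E = k*(1 + cos(n*phi - delta)) this is n*k*sin(n*phi - delta); for the
// improper (n == 0) term E = k*(phi - delta)^2 it is -2*k*(phi - delta),
// matching the two branches below.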
1012 float phi = atan2f(sin_phi, cos_phi);
1016 CudaDihedralValue dihedralValue = dihedralValues[ic];
1017 dihedralValue.k *= scale;
1019 // Last dihedral has n low bit set to 0
1020 lrep = (dihedralValue.n & 1);
1021 dihedralValue.n >>= 1;
1022 if (dihedralValue.n) {
1023 float nf = dihedralValue.n;
1024 float x = nf*phi - dihedralValue.delta;
1027 sincosf(x, &sin_x, &cos_x);
1028 e += (double)(dihedralValue.k*(1.0f + cos_x));
1029 df += (double)(nf*dihedralValue.k*sin_x);
1031 float sin_x = sinf(x);
1032 df += (double)(nf*dihedralValue.k*sin_x);
1035 float diff = phi - dihedralValue.delta;
1036 if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
1037 if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
1039 e += (double)(dihedralValue.k*diff*diff);
1041 df -= 2.0f*dihedralValue.k*diff;
1049 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1050 __device__ void diheForce(const int index,
1051 const CudaDihedral* __restrict__ diheList,
1052 const CudaDihedralValue* __restrict__ dihedralValues,
1053 const float4* __restrict__ xyzq,
1055 const float3 lata, const float3 latb, const float3 latc,
1056 T* __restrict__ force, double &energy,
1057 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1058 #ifdef WRITE_FULL_VIRIALS
1059 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1061 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1063 double &energy_F, double &energy_TI_1, double &energy_TI_2
1066 CudaDihedral dl = diheList[index];
1068 float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
1069 float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
1070 float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
1072 float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
1073 float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
1074 float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
1076 float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
1077 float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
1078 float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
1080 float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
1081 float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
1082 float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
1084 float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
1085 float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
1086 float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
1088 float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
1089 float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
1090 float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
1093 float ax = yij*zjk - zij*yjk;
1094 float ay = zij*xjk - xij*zjk;
1095 float az = xij*yjk - yij*xjk;
1096 float bx = ylk*zjk - zlk*yjk;
1097 float by = zlk*xjk - xlk*zjk;
1098 float bz = xlk*yjk - ylk*xjk;
1100 float ra2 = ax*ax + ay*ay + az*az;
1101 float rb2 = bx*bx + by*by + bz*bz;
1102 float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
1104 // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
1105 // nlinear = nlinear + 1
1108 float rgr = 1.0f / rg;
1109 float ra2r = 1.0f / ra2;
1110 float rb2r = 1.0f / rb2;
1111 float rabr = sqrtf(ra2r*rb2r);
1113 float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
1115 // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
1116 // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
1117 float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
1119 // Energy and derivative contributions.
1123 diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
1125 // Alchemical transformation
1127 if (doFEP || doTI) {
1129 switch (dl.fepBondedType) {
1131 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1134 energy_TI_1 += (double)(e);
1137 energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1143 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1146 energy_TI_2 += (double)(e);
1149 energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1158 if (doFEP || doTI) {
1159 energy += e * alch_scale;
1166 // Compute derivatives wrt Cartesian coordinates.
1168 // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
1169 // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
1171 float fg = xij*xjk + yij*yjk + zij*zjk;
1172 float hg = xlk*xjk + ylk*yjk + zlk*zjk;
1175 float fga = fg*ra2r*rgr;
1176 float hgb = hg*rb2r*rgr;
1177 float gaa = ra2r*rg;
1178 float gbb = rb2r*rg;
1179 // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
1181 // Remember T is long long int
1182 // Don't try to scale T_fix and similar variables directly by float
1183 if (doFEP || doTI) {
1191 T T_fix, T_fiy, T_fiz;
1192 calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
1193 storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1196 calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
1197 T T_fjx = dgx - T_fix;
1198 T T_fjy = dgy - T_fiy;
1199 T T_fjz = dgz - T_fiz;
1200 storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1203 calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
1204 T T_fkx = -dhx - dgx;
1205 T T_fky = -dhy - dgy;
1206 T T_fkz = -dhz - dgz;
1207 storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1208 storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1212 #ifdef WRITE_FULL_VIRIALS
1213 float fix = ((float)T_fix);
1214 float fiy = ((float)T_fiy);
1215 float fiz = ((float)T_fiz);
1216 float fjx = ((float)dgx);
1217 float fjy = ((float)dgy);
1218 float fjz = ((float)dgz);
1219 float flx = ((float)dhx);
1220 float fly = ((float)dhy);
1221 float flz = ((float)dhz);
1222 virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
1223 virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
1224 virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
1225 virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
1226 virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
1227 virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
1228 virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
1229 virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
1230 virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
1236 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
1238 v1.y*v2.z-v2.y*v1.z,
1239 v2.x*v1.z-v1.x*v2.z,
1244 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
1245 return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
1249 // Calculates crossterms
1251 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1252 __device__ void crosstermForce(
1254 const CudaCrossterm* __restrict__ crosstermList,
1255 const CudaCrosstermValue* __restrict__ crosstermValues,
1256 const float4* __restrict__ xyzq,
1258 const float3 lata, const float3 latb, const float3 latc,
1259 T* __restrict__ force, double &energy,
1260 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1261 #ifdef WRITE_FULL_VIRIALS
1262 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1264 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1266 double &energy_F, double &energy_TI_1, double &energy_TI_2
1269 CudaCrossterm cl = crosstermList[index];
1271 // ----------------------------------------------------------------------------
1272 // Angle between 1 - 2 - 3 - 4
1275 float3 sh12 = make_float3(
1276 cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
1277 cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
1278 cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
1280 float4 xyzq1 = xyzq[cl.i1];
1281 float4 xyzq2 = xyzq[cl.i2];
1283 float3 r12 = make_float3(
1284 xyzq1.x + sh12.x - xyzq2.x,
1285 xyzq1.y + sh12.y - xyzq2.y,
1286 xyzq1.z + sh12.z - xyzq2.z);
1289 float3 sh23 = make_float3(
1290 cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
1291 cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
1292 cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
1294 float4 xyzq3 = xyzq[cl.i3];
1296 float3 r23 = make_float3(
1297 xyzq2.x + sh23.x - xyzq3.x,
1298 xyzq2.y + sh23.y - xyzq3.y,
1299 xyzq2.z + sh23.z - xyzq3.z);
1302 float3 sh34 = make_float3(
1303 cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
1304 cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
1305 cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
1307 float4 xyzq4 = xyzq[cl.i4];
1309 float3 r34 = make_float3(
1310 xyzq3.x + sh34.x - xyzq4.x,
1311 xyzq3.y + sh34.y - xyzq4.y,
1312 xyzq3.z + sh34.z - xyzq4.z);
1314 // Calculate the cross products
1315 float3 A = cross(r12, r23);
1316 float3 B = cross(r23, r34);
1317 float3 C = cross(r23, A);
1319 // Calculate the inverse distances
1320 float inv_rA = rsqrtf(dot(A, A));
1321 float inv_rB = rsqrtf(dot(B, B));
1322 float inv_rC = rsqrtf(dot(C, C));
1324 // Calculate the sin and cos
1325 float cos_phi = dot(A, B)*(inv_rA*inv_rB);
1326 float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1328 float phi = -atan2f(sin_phi,cos_phi);
1330 // ----------------------------------------------------------------------------
1331 // Angle between 5 - 6 - 7 - 8
1335 float3 sh56 = make_float3(
1336 cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1337 cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1338 cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1340 float4 xyzq5 = xyzq[cl.i5];
1341 float4 xyzq6 = xyzq[cl.i6];
1343 float3 r56 = make_float3(
1344 xyzq5.x + sh56.x - xyzq6.x,
1345 xyzq5.y + sh56.y - xyzq6.y,
1346 xyzq5.z + sh56.z - xyzq6.z);
1349 float3 sh67 = make_float3(
1350 cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1351 cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1352 cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1354 float4 xyzq7 = xyzq[cl.i7];
1356 float3 r67 = make_float3(
1357 xyzq6.x + sh67.x - xyzq7.x,
1358 xyzq6.y + sh67.y - xyzq7.y,
1359 xyzq6.z + sh67.z - xyzq7.z);
1362 float3 sh78 = make_float3(
1363 cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1364 cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1365 cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1367 float4 xyzq8 = xyzq[cl.i8];
1369 float3 r78 = make_float3(
1370 xyzq7.x + sh78.x - xyzq8.x,
1371 xyzq7.y + sh78.y - xyzq8.y,
1372 xyzq7.z + sh78.z - xyzq8.z);
1374 // Calculate the cross products
1375 float3 D = cross(r56, r67);
1376 float3 E = cross(r67, r78);
1377 float3 F = cross(r67, D);
1379 // Calculate the inverse distances
1380 float inv_rD = rsqrtf(dot(D, D));
1381 float inv_rE = rsqrtf(dot(E, E));
1382 float inv_rF = rsqrtf(dot(F, F));
1384 // Calculate the sin and cos
1385 float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1386 float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1388 float psi = -atan2f(sin_psi,cos_psi);
1390 // ----------------------------------------------------------------------------
1391 // Calculate the energy
1393 const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1396 phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1397 psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
1399 // distance measured in grid points between angle and smallest value
1400 float phi_h = phi * inv_h;
1401 float psi_h = psi * inv_h;
1403 // find smallest numbered grid point in stencil
1404 int iphi = (int)floor(phi_h);
1405 int ipsi = (int)floor(psi_h);
1407 const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1409 // Load coefficients
1411 c[0] = cp.c[iphi][ipsi][0];
1412 c[1] = cp.c[iphi][ipsi][1];
1413 c[2] = cp.c[iphi][ipsi][2];
1414 c[3] = cp.c[iphi][ipsi][3];
1416 float dphi = phi_h - (float)iphi;
1417 float dpsi = psi_h - (float)ipsi;
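// For reference: the energy below is a bicubic patch
//   E(dphi, dpsi) = sum_{i,j=0..3} C_ij * dphi^i * dpsi^j,
// where c[j] = (C_0j, C_1j, C_2j, C_3j) holds the dphi coefficients of the
// dpsi^j term; E, dE/dphi and dE/dpsi are evaluated by nested Horner schemes,
// and inv_h converts the derivatives from grid units back to radians.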
1420 float alch_scale_energy_fep;
1421 double* p_energy_TI;
1422 if (doFEP || doTI) {
1424 alch_scale_energy_fep = 0.0f;
1425 if (doTI) p_energy_TI = NULL;
1426 switch (cl.fepBondedType) {
1428 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1431 p_energy_TI = &(energy_TI_1);
1434 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
1440 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1443 p_energy_TI = &(energy_TI_2);
1446 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
1455 float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1456 energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1457 energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1458 energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
1459 if (doFEP || doTI) {
1460 energy += energyf * cl.scale * alch_scale;
1462 energy_F += energyf * cl.scale * alch_scale_energy_fep;
1465 if (p_energy_TI != NULL) {
1466 (*p_energy_TI) += energyf * cl.scale;
1470 energy += energyf * cl.scale;
1474 float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1475 dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1476 dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1477 dEdphi *= cl.scale*inv_h;
1479 float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1480 dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1481 dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1482 dEdpsi *= cl.scale*inv_h;
1484 // float normCross1 = dot(A, A);
1485 float square_r23 = dot(r23, r23);
1486 float norm_r23 = sqrtf(square_r23);
1487 float inv_square_r23 = 1.0f/square_r23;
1488 float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1489 float ff2 = -dot(r12, r23)*inv_square_r23;
1490 float ff3 = -dot(r34, r23)*inv_square_r23;
1491 float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
1493 // NOTE: Why scaling ff1 and ff4 is enough:
1494 // f1 and f4 are already scaled, and in the formula for t1 below
1495 // the first term (t1.x) is: ff2*f1.x - ff3*f4.x
1498 // so t1.x is scaled, and likewise t1.y and t1.z => t1 is scaled.
1499 // Then consider f2.x:
1500 // f2.x = t1.x - f1.x
1502 // both terms are scaled => f2.x is scaled,
1503 // and so are f2.y and f2.z => f2 is scaled; similarly f3 is scaled.
1504 // As a result, scaling ff1 and ff4 is enough. DON'T scale ff2 and ff3!
1505 if (doFEP || doTI) {
1510 float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1511 float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1512 float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1513 float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1514 float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1516 T T_f1x, T_f1y, T_f1z;
1517 T T_f2x, T_f2y, T_f2z;
1518 T T_f3x, T_f3y, T_f3z;
1519 T T_f4x, T_f4y, T_f4z;
1520 convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1521 convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1522 convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1523 convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1524 storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1525 storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1526 storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1527 storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1529 float square_r67 = dot(r67, r67);
1530 float norm_r67 = sqrtf(square_r67);
1531 float inv_square_r67 = 1.0f/(square_r67);
1532 ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1533 ff2 = -dot(r56, r67)*inv_square_r67;
1534 ff3 = -dot(r78, r67)*inv_square_r67;
1535 ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1537 if (doFEP || doTI) {
1542 float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1543 float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1544 float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1545 float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1546 float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1548 T T_f5x, T_f5y, T_f5z;
1549 T T_f6x, T_f6y, T_f6z;
1550 T T_f7x, T_f7y, T_f7z;
1551 T T_f8x, T_f8y, T_f8z;
1552 convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1553 convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1554 convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1555 convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1556 storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1557 storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1558 storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1559 storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1563 #ifdef WRITE_FULL_VIRIALS
1564 float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1565 float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1566 virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1567 virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1568 virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1569 virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1570 virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1571 virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1572 virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1573 virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1574 virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1580 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1581 __device__ void tholeForce(
1583 const CudaThole* __restrict__ tholeList,
1584 const float4* __restrict__ xyzq,
1586 const float3 lata, const float3 latb, const float3 latc,
1587 T* __restrict__ force, double &energy,
1588 T* __restrict__ forceList, int* forceListCounter,
1589 int* forceListStarts, int* forceListNexts,
1590 #ifdef WRITE_FULL_VIRIALS
1591 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1593 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1595 double &energy_F, double &energy_TI_1, double &energy_TI_2
1597 // For reference, similar calculation can be found in OpenMM's code repo:
1598 // https://github.com/openmm/openmm/blob/8f123ef287693bb4b553163a9d5fd101ea9e4462/plugins/drude/platforms/common/src/kernels/drudePairForce.cc
1599 // CHARMM source code: charmm/source/gener/drude.F90, FUNCTION E_THOLE
1600 const CudaThole tl = tholeList[index];
1602 const float shx_aiaj = tl.offset_aiaj.x*lata.x + tl.offset_aiaj.y*latb.x + tl.offset_aiaj.z*latc.x;
1603 const float shy_aiaj = tl.offset_aiaj.x*lata.y + tl.offset_aiaj.y*latb.y + tl.offset_aiaj.z*latc.y;
1604 const float shz_aiaj = tl.offset_aiaj.x*lata.z + tl.offset_aiaj.y*latb.z + tl.offset_aiaj.z*latc.z;
1606 const float shx_aidj = tl.offset_aidj.x*lata.x + tl.offset_aidj.y*latb.x + tl.offset_aidj.z*latc.x;
1607 const float shy_aidj = tl.offset_aidj.x*lata.y + tl.offset_aidj.y*latb.y + tl.offset_aidj.z*latc.y;
1608 const float shz_aidj = tl.offset_aidj.x*lata.z + tl.offset_aidj.y*latb.z + tl.offset_aidj.z*latc.z;
1610 const float shx_diaj = tl.offset_diaj.x*lata.x + tl.offset_diaj.y*latb.x + tl.offset_diaj.z*latc.x;
1611 const float shy_diaj = tl.offset_diaj.x*lata.y + tl.offset_diaj.y*latb.y + tl.offset_diaj.z*latc.y;
1612 const float shz_diaj = tl.offset_diaj.x*lata.z + tl.offset_diaj.y*latb.z + tl.offset_diaj.z*latc.z;
1614 const float shx_didj = tl.offset_didj.x*lata.x + tl.offset_didj.y*latb.x + tl.offset_didj.z*latc.x;
1615 const float shy_didj = tl.offset_didj.x*lata.y + tl.offset_didj.y*latb.y + tl.offset_didj.z*latc.y;
1616 const float shz_didj = tl.offset_didj.x*lata.z + tl.offset_didj.y*latb.z + tl.offset_didj.z*latc.z;
1618 const float raa_x = xyzq[tl.i].x + shx_aiaj - xyzq[tl.k].x;
1619 const float raa_y = xyzq[tl.i].y + shy_aiaj - xyzq[tl.k].y;
1620 const float raa_z = xyzq[tl.i].z + shz_aiaj - xyzq[tl.k].z;
1622 const float rad_x = xyzq[tl.i].x + shx_aidj - xyzq[tl.l].x;
1623 const float rad_y = xyzq[tl.i].y + shy_aidj - xyzq[tl.l].y;
1624 const float rad_z = xyzq[tl.i].z + shz_aidj - xyzq[tl.l].z;
1626 const float rda_x = xyzq[tl.j].x + shx_diaj - xyzq[tl.k].x;
1627 const float rda_y = xyzq[tl.j].y + shy_diaj - xyzq[tl.k].y;
1628 const float rda_z = xyzq[tl.j].z + shz_diaj - xyzq[tl.k].z;
1630 const float rdd_x = xyzq[tl.j].x + shx_didj - xyzq[tl.l].x;
1631 const float rdd_y = xyzq[tl.j].y + shy_didj - xyzq[tl.l].y;
1632 const float rdd_z = xyzq[tl.j].z + shz_didj - xyzq[tl.l].z;
1634 const float raa_invlen = rnorm3df(raa_x, raa_y, raa_z);
1635 const float rad_invlen = rnorm3df(rad_x, rad_y, rad_z);
1636 const float rda_invlen = rnorm3df(rda_x, rda_y, rda_z);
1637 const float rdd_invlen = rnorm3df(rdd_x, rdd_y, rdd_z);
1639 const float auaa = tl.aa / raa_invlen;
1640 const float auad = tl.aa / rad_invlen;
1641 const float auda = tl.aa / rda_invlen;
1642 const float audd = tl.aa / rdd_invlen;
1645 const float expauaa = expf(-auaa);
1646 const float expauad = expf(-auad);
1647 const float expauda = expf(-auda);
1648 const float expaudd = expf(-audd);
1651 float polyauaa = 1.0f + 0.5f*auaa;
1652 float polyauad = 1.0f + 0.5f*auad;
1653 float polyauda = 1.0f + 0.5f*auda;
1654 float polyaudd = 1.0f + 0.5f*audd;
1658 ethole += tl.qq * raa_invlen * (1.0f - polyauaa * expauaa);
1659 ethole += -tl.qq * rad_invlen * (1.0f - polyauad * expauad);
1660 ethole += -tl.qq * rda_invlen * (1.0f - polyauda * expauda);
1661 ethole += tl.qq * rdd_invlen * (1.0f - polyaudd * expaudd);
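// For reference: each of the four terms above has the Thole-damped Coulomb
// form +/- qq/r * (1 - (1 + a*r/2) * exp(-a*r)) with a = tl.aa, taken over the
// atom-atom (aa), atom-Drude (ad), Drude-atom (da) and Drude-Drude (dd) pairs;
// the +,-,-,+ sign pattern presumably reflects tl.qq being the product of the
// two Drude charges, with each parent atom carrying the opposite charge.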
1662 if (doFEP || doTI) {
1663 switch (tl.fepBondedType) {
1665 // 0 < typeSum && typeSum < order
1666 if (doTI) energy_TI_1 += (double)ethole;
1668 // elecLambda2Up = simParams->getElecLambda(alchLambda2)
1669 energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Up * ethole;
1670 // elecLambdaUp = simParams->getElecLambda(alchLambda)
1671 ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1676 if (doTI) energy_TI_2 += (double)ethole;
1678 // elecLambda2Down = simParams->getElecLambda(1-alchLambda2)
1679 energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Down * ethole;
1680 // elecLambdaDown = simParams->getElecLambda(1-alchLambda)
1681 ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1687 energy += (double)ethole;
1690 polyauaa = 1.0f + auaa*polyauaa;
1691 polyauad = 1.0f + auad*polyauad;
1692 polyauda = 1.0f + auda*polyauda;
1693 polyaudd = 1.0f + audd*polyaudd;
1695 const float raa_invlen3 = raa_invlen * raa_invlen * raa_invlen;
1696 const float rad_invlen3 = rad_invlen * rad_invlen * rad_invlen;
1697 const float rda_invlen3 = rda_invlen * rda_invlen * rda_invlen;
1698 const float rdd_invlen3 = rdd_invlen * rdd_invlen * rdd_invlen;
1700 // df = (1/r) (dU/dr)
1701 float dfaa = -tl.qq * raa_invlen3 * (polyauaa*expauaa - 1);
1702 float dfad = tl.qq * rad_invlen3 * (polyauad*expauad - 1);
1703 float dfda = tl.qq * rda_invlen3 * (polyauda*expauda - 1);
1704 float dfdd = -tl.qq * rdd_invlen3 * (polyaudd*expaudd - 1);
1706 if (doFEP || doTI) {
1707 switch (tl.fepBondedType) {
1709 dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1710 dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1711 dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1712 dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1716 dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1717 dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1718 dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1719 dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1725 T T_fx0, T_fy0, T_fz0;
1726 calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfad, rad_x, rad_y, rad_z, T_fx0, T_fy0, T_fz0);
1727 storeForces<T>(T_fx0, T_fy0, T_fz0, tl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1729 T T_fx1, T_fy1, T_fz1;
1730 calcComponentForces<T>(dfda, rda_x, rda_y, rda_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx1, T_fy1, T_fz1);
1731 storeForces<T>(T_fx1, T_fy1, T_fz1, tl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1733 T T_fx2, T_fy2, T_fz2;
1734 calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfda, rda_x, rda_y, rda_z, T_fx2, T_fy2, T_fz2);
1735 storeForces<T>(-T_fx2, -T_fy2, -T_fz2, tl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1737 T T_fx3, T_fy3, T_fz3;
1738 calcComponentForces<T>(dfad, rad_x, rad_y, rad_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx3, T_fy3, T_fz3);
1739 storeForces<T>(-T_fx3, -T_fy3, -T_fz3, tl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1742 #ifdef WRITE_FULL_VIRIALS
1743 const float faa_x = dfaa * raa_x;
1744 const float faa_y = dfaa * raa_y;
1745 const float faa_z = dfaa * raa_z;
1746 const float fad_x = dfad * rad_x;
1747 const float fad_y = dfad * rad_y;
1748 const float fad_z = dfad * rad_z;
1749 const float fda_x = dfda * rda_x;
1750 const float fda_y = dfda * rda_y;
1751 const float fda_z = dfda * rda_z;
1752 const float fdd_x = dfdd * rdd_x;
1753 const float fdd_y = dfdd * rdd_y;
1754 const float fdd_z = dfdd * rdd_z;
1755 virial.xx = faa_x * raa_x + fad_x * rad_x + fda_x * rda_x + fdd_x * rdd_x;
1756 virial.xy = faa_x * raa_y + fad_x * rad_y + fda_x * rda_y + fdd_x * rdd_y;
1757 virial.xz = faa_x * raa_z + fad_x * rad_z + fda_x * rda_z + fdd_x * rdd_z;
1758 virial.yx = faa_y * raa_x + fad_y * rad_x + fda_y * rda_x + fdd_y * rdd_x;
1759 virial.yy = faa_y * raa_y + fad_y * rad_y + fda_y * rda_y + fdd_y * rdd_y;
1760 virial.yz = faa_y * raa_z + fad_y * rad_z + fda_y * rda_z + fdd_y * rdd_z;
1761 virial.zx = faa_z * raa_x + fad_z * rad_x + fda_z * rda_x + fdd_z * rdd_x;
1762 virial.zy = faa_z * raa_y + fad_z * rad_y + fda_z * rda_y + fdd_z * rdd_y;
1763 virial.zz = faa_z * raa_z + fad_z * rad_z + fda_z * rda_z + fdd_z * rdd_z;
1768 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1769 __device__ void anisoForce(
1771 const CudaAniso* __restrict__ anisoList,
1772 const float4* __restrict__ xyzq,
1774 const float3 lata, const float3 latb, const float3 latc,
1775 T* __restrict__ force, double &energy,
1776 T* __restrict__ forceList, int* forceListCounter,
1777 int* forceListStarts, int* forceListNexts,
1778 #ifdef WRITE_FULL_VIRIALS
1779 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1781 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1783 double &energy_F, double &energy_TI_1, double &energy_TI_2
1785 const CudaAniso al = anisoList[index];
1787 const float shx_il = al.offset_il.x*lata.x + al.offset_il.y*latb.x + al.offset_il.z*latc.x;
1788 const float shy_il = al.offset_il.x*lata.y + al.offset_il.y*latb.y + al.offset_il.z*latc.y;
1789 const float shz_il = al.offset_il.x*lata.z + al.offset_il.y*latb.z + al.offset_il.z*latc.z;
1791 const float shx_mn = al.offset_mn.x*lata.x + al.offset_mn.y*latb.x + al.offset_mn.z*latc.x;
1792 const float shy_mn = al.offset_mn.x*lata.y + al.offset_mn.y*latb.y + al.offset_mn.z*latc.y;
1793 const float shz_mn = al.offset_mn.x*lata.z + al.offset_mn.y*latb.z + al.offset_mn.z*latc.z;
1795 // calculate parallel and perpendicular displacement vectors
1796 // shortest vector image: ri - rl
1797 const float ril_x = xyzq[al.i].x + shx_il - xyzq[al.l].x;
1798 const float ril_y = xyzq[al.i].y + shy_il - xyzq[al.l].y;
1799 const float ril_z = xyzq[al.i].z + shz_il - xyzq[al.l].z;
1801 // shortest vector image: rm - rn
1802 const float rmn_x = xyzq[al.m].x + shx_mn - xyzq[al.n].x;
1803 const float rmn_y = xyzq[al.m].y + shy_mn - xyzq[al.n].y;
1804 const float rmn_z = xyzq[al.m].z + shz_mn - xyzq[al.n].z;
1806 // need recip lengths of r_il, r_mn
1807 const float ril_invlen = rnorm3df(ril_x, ril_y, ril_z);
1808 const float rmn_invlen = rnorm3df(rmn_x, rmn_y, rmn_z);
1810 // normalize r_il, r_mn
1811 const float u1_x = ril_x * ril_invlen;
1812 const float u1_y = ril_y * ril_invlen;
1813 const float u1_z = ril_z * ril_invlen;
1814 const float u2_x = rmn_x * rmn_invlen;
1815 const float u2_y = rmn_y * rmn_invlen;
1816 const float u2_z = rmn_z * rmn_invlen;
1818 // Drude displacement vector (ri, rj are in same patch)
1819 const float dr_x = xyzq[al.j].x - xyzq[al.i].x;
1820 const float dr_y = xyzq[al.j].y - xyzq[al.i].y;
1821 const float dr_z = xyzq[al.j].z - xyzq[al.i].z;
1823 // parallel displacement
1824 const float dpar = dr_x * u1_x + dr_y * u1_y + dr_z * u1_z;
1826 // perpendicular displacement
1827 const float dperp = dr_x * u2_x + dr_y * u2_y + dr_z * u2_z;
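// The three force blocks below are the negative gradients of the anisotropic spring
// energy accumulated further down: fj acts on the Drude particle j, fl on atom l
// (arising from the dependence of the unit vector u1 on r_il), and fm on atom m
// (from the dependence of u2 on r_mn). When the forces are stored, atom i receives
// -(fj + fl) and atom n receives -fm, so the contributions sum to zero.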
1830 float fj_x = -al.kiso0 * dr_x;
1831 float fj_y = -al.kiso0 * dr_y;
1832 float fj_z = -al.kiso0 * dr_z;
1833 fj_x -= al.kpar0 * dpar * u1_x;
1834 fj_y -= al.kpar0 * dpar * u1_y;
1835 fj_z -= al.kpar0 * dpar * u1_z;
1836 fj_x -= al.kperp0 * dperp * u2_x;
1837 fj_y -= al.kperp0 * dperp * u2_y;
1838 fj_z -= al.kperp0 * dperp * u2_z;
1841 float fl_x = al.kpar0 * dpar * ril_invlen * dr_x;
1842 float fl_y = al.kpar0 * dpar * ril_invlen * dr_y;
1843 float fl_z = al.kpar0 * dpar * ril_invlen * dr_z;
1844 fl_x -= al.kpar0 * dpar * dpar * ril_invlen * u1_x;
1845 fl_y -= al.kpar0 * dpar * dpar * ril_invlen * u1_y;
1846 fl_z -= al.kpar0 * dpar * dpar * ril_invlen * u1_z;
1849 float fm_x = al.kperp0 * dperp * dperp * rmn_invlen * u2_x;
1850 float fm_y = al.kperp0 * dperp * dperp * rmn_invlen * u2_y;
1851 float fm_z = al.kperp0 * dperp * dperp * rmn_invlen * u2_z;
1852 fm_x -= al.kperp0 * dperp * rmn_invlen * dr_x;
1853 fm_y -= al.kperp0 * dperp * rmn_invlen * dr_y;
1854 fm_z -= al.kperp0 * dperp * rmn_invlen * dr_z;
1857 // aniso spring energy
1858 // kpar reduces response along carbonyl vector
1859 // kperp reduces response perp to bond vector
1860 // (regulating the in-plane and out-of-plane response)
1861 const float dr2 = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z;
1862 float eaniso = 0.5f * al.kpar0 * dpar * dpar + 0.5f * al.kperp0 * dperp * dperp + 0.5f * al.kiso0 * dr2;
1863 if (doFEP || doTI) {
1864 switch (al.fepBondedType) {
1866 if (doTI) energy_TI_1 += (double)eaniso;
1868 energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1869 eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1870 fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1871 fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1872 fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1873 fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1874 fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1875 fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1876 fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1877 fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1878 fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1883 if (doTI) energy_TI_2 += (double)eaniso;
1885 energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1886 eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1887 fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1888 fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1889 fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1890 fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1891 fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1892 fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1893 fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1894 fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1895 fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1901 energy += (double)eaniso;
1904 // accumulate forces
1905 T T_fx0, T_fy0, T_fz0;
1906 convertForces<T>(-(fj_x + fl_x), -(fj_y + fl_y), -(fj_z + fl_z), T_fx0, T_fy0, T_fz0);
1907 storeForces<T>(T_fx0, T_fy0, T_fz0, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1909 T T_fx1, T_fy1, T_fz1;
1910 convertForces<T>(fj_x, fj_y, fj_z, T_fx1, T_fy1, T_fz1);
1911 storeForces<T>(T_fx1, T_fy1, T_fz1, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1913 T T_fx2, T_fy2, T_fz2;
1914 convertForces<T>(fl_x, fl_y, fl_z, T_fx2, T_fy2, T_fz2);
1915 storeForces<T>(T_fx2, T_fy2, T_fz2, al.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1917 T T_fx3, T_fy3, T_fz3;
1918 convertForces<T>(fm_x, fm_y, fm_z, T_fx3, T_fy3, T_fz3);
1919 storeForces<T>(T_fx3, T_fy3, T_fz3, al.m, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1921 T T_fx4, T_fy4, T_fz4;
1922 convertForces<T>(-fm_x, -fm_y, -fm_z, T_fx4, T_fy4, T_fz4);
1923 storeForces<T>(T_fx4, T_fy4, T_fz4, al.n, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1926 #ifdef WRITE_FULL_VIRIALS
1927 virial.xx = fj_x * dr_x - fl_x * ril_x + fm_x * rmn_x;
1928 virial.xy = fj_x * dr_y - fl_x * ril_y + fm_x * rmn_y;
1929 virial.xz = fj_x * dr_z - fl_x * ril_z + fm_x * rmn_z;
1930 virial.yx = fj_y * dr_x - fl_y * ril_x + fm_y * rmn_x;
1931 virial.yy = fj_y * dr_y - fl_y * ril_y + fm_y * rmn_y;
1932 virial.yz = fj_y * dr_z - fl_y * ril_z + fm_y * rmn_z;
1933 virial.zx = fj_z * dr_x - fl_z * ril_x + fm_z * rmn_x;
1934 virial.zy = fj_z * dr_y - fl_z * ril_y + fm_z * rmn_y;
1935 virial.zz = fj_z * dr_z - fl_z * ril_z + fm_z * rmn_z;
1940 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1941 __device__ void oneFourNbTholeForce(
1942 const int index, const float one_scale14,
1943 const CudaOneFourNbThole* __restrict__ oneFourNbTholeList,
1944 const float4* __restrict__ xyzq,
1946 const float3 lata, const float3 latb, const float3 latc,
1947 T* __restrict__ force, double &energy,
1948 T* __restrict__ forceList, int* forceListCounter,
1949 int* forceListStarts, int* forceListNexts,
1950 #ifdef WRITE_FULL_VIRIALS
1951 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1953 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1955 double &energy_F, double &energy_TI_1, double &energy_TI_2
1957 const auto tl = oneFourNbTholeList[index];
1959 const float shx_aiaj = tl.offset_aiaj.x*lata.x + tl.offset_aiaj.y*latb.x + tl.offset_aiaj.z*latc.x;
1960 const float shy_aiaj = tl.offset_aiaj.x*lata.y + tl.offset_aiaj.y*latb.y + tl.offset_aiaj.z*latc.y;
1961 const float shz_aiaj = tl.offset_aiaj.x*lata.z + tl.offset_aiaj.y*latb.z + tl.offset_aiaj.z*latc.z;
1963 const float shx_aidj = tl.offset_aidj.x*lata.x + tl.offset_aidj.y*latb.x + tl.offset_aidj.z*latc.x;
1964 const float shy_aidj = tl.offset_aidj.x*lata.y + tl.offset_aidj.y*latb.y + tl.offset_aidj.z*latc.y;
1965 const float shz_aidj = tl.offset_aidj.x*lata.z + tl.offset_aidj.y*latb.z + tl.offset_aidj.z*latc.z;
1967 const float shx_diaj = tl.offset_diaj.x*lata.x + tl.offset_diaj.y*latb.x + tl.offset_diaj.z*latc.x;
1968 const float shy_diaj = tl.offset_diaj.x*lata.y + tl.offset_diaj.y*latb.y + tl.offset_diaj.z*latc.y;
1969 const float shz_diaj = tl.offset_diaj.x*lata.z + tl.offset_diaj.y*latb.z + tl.offset_diaj.z*latc.z;
1971 const float shx_didj = tl.offset_didj.x*lata.x + tl.offset_didj.y*latb.x + tl.offset_didj.z*latc.x;
1972 const float shy_didj = tl.offset_didj.x*lata.y + tl.offset_didj.y*latb.y + tl.offset_didj.z*latc.y;
1973 const float shz_didj = tl.offset_didj.x*lata.z + tl.offset_didj.y*latb.z + tl.offset_didj.z*latc.z;
1975 const float raa_x = xyzq[tl.i].x + shx_aiaj - xyzq[tl.k].x;
1976 const float raa_y = xyzq[tl.i].y + shy_aiaj - xyzq[tl.k].y;
1977 const float raa_z = xyzq[tl.i].z + shz_aiaj - xyzq[tl.k].z;
1979 const float rad_x = xyzq[tl.i].x + shx_aidj - xyzq[tl.l].x;
1980 const float rad_y = xyzq[tl.i].y + shy_aidj - xyzq[tl.l].y;
1981 const float rad_z = xyzq[tl.i].z + shz_aidj - xyzq[tl.l].z;
1983 const float rda_x = xyzq[tl.j].x + shx_diaj - xyzq[tl.k].x;
1984 const float rda_y = xyzq[tl.j].y + shy_diaj - xyzq[tl.k].y;
1985 const float rda_z = xyzq[tl.j].z + shz_diaj - xyzq[tl.k].z;
1987 const float rdd_x = xyzq[tl.j].x + shx_didj - xyzq[tl.l].x;
1988 const float rdd_y = xyzq[tl.j].y + shy_didj - xyzq[tl.l].y;
1989 const float rdd_z = xyzq[tl.j].z + shz_didj - xyzq[tl.l].z;
1991 const float raa_invlen = rnorm3df(raa_x, raa_y, raa_z);
1992 const float rad_invlen = rnorm3df(rad_x, rad_y, rad_z);
1993 const float rda_invlen = rnorm3df(rda_x, rda_y, rda_z);
1994 const float rdd_invlen = rnorm3df(rdd_x, rdd_y, rdd_z);
1996 const float auaa = tl.aa / raa_invlen;
1997 const float auad = tl.aa / rad_invlen;
1998 const float auda = tl.aa / rda_invlen;
1999 const float audd = tl.aa / rdd_invlen;
2002 const float expauaa = expf(-auaa);
2003 const float expauad = expf(-auad);
2004 const float expauda = expf(-auda);
2005 const float expaudd = expf(-audd);
2008 float polyauaa = 1.0f + 0.5f*auaa;
2009 float polyauad = 1.0f + 0.5f*auad;
2010 float polyauda = 1.0f + 0.5f*auda;
2011 float polyaudd = 1.0f + 0.5f*audd;
2014 * Although the CHARMM Drude force field expects the 1-4 scaling factor to be 1.0,
2015 * the code below tries to handle the general case. The GPU nonbonded kernel differs
2016 * from the CPU kernel in that the former always calculates the full 1-4 interactions
2017 * without scaling (i.e. with a scaling factor of 1.0), and the bonded
2018 * kernel "modifiedExclusionForce" in ComputeBondedCUDAKernel.cu then
2019 * computes a complementary part scaled by -(1.0 - ComputeNonbondedUtil::scale14).
2021 * The scaling factor below follows the same convention. In the
2022 * GPU kernel, the NbThole correction of 1-4 interactions is computed by
2023 * calcForceEnergyNbThole, so the following code should compute only the
2024 * complementary part.
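 * Illustration of the convention: if the nonbonded kernel has already added the full
 * screening term E_full for a 1-4 pair, then with s = ComputeNonbondedUtil::scale14 the
 * negated charge products below make this kernel add -(1 - s) * E_full, so the net
 * 1-4 NbThole contribution becomes s * E_full; for the standard Drude setting s = 1
 * the complementary part vanishes and the full screening is kept unchanged.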
2028 const float qqaa = -one_scale14 * xyzq[tl.i].w * xyzq[tl.k].w;
2029 const float qqad = -one_scale14 * xyzq[tl.i].w * xyzq[tl.l].w;
2030 const float qqda = -one_scale14 * xyzq[tl.j].w * xyzq[tl.k].w;
2031 const float qqdd = -one_scale14 * xyzq[tl.j].w * xyzq[tl.l].w;
2035 ethole += qqaa * raa_invlen * (-polyauaa * expauaa);
2036 ethole += qqad * rad_invlen * (-polyauad * expauad);
2037 ethole += qqda * rda_invlen * (-polyauda * expauda);
2038 ethole += qqdd * rdd_invlen * (-polyaudd * expaudd);
2039 energy += (double)ethole;
2042 polyauaa = 1.0f + auaa*polyauaa;
2043 polyauad = 1.0f + auad*polyauad;
2044 polyauda = 1.0f + auda*polyauda;
2045 polyaudd = 1.0f + audd*polyaudd;
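// After this update each poly factor equals 1 + a*u + (a*u)^2/2, i.e. the polynomial
// that multiplies exp(-a*u) in the Thole-screened force, whereas the energy above
// used the smaller factor (1 + a*u/2).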
2047 const float raa_invlen3 = raa_invlen * raa_invlen * raa_invlen;
2048 const float rad_invlen3 = rad_invlen * rad_invlen * rad_invlen;
2049 const float rda_invlen3 = rda_invlen * rda_invlen * rda_invlen;
2050 const float rdd_invlen3 = rdd_invlen * rdd_invlen * rdd_invlen;
2052 float dfaa = -qqaa * raa_invlen3 * (polyauaa*expauaa);
2053 float dfad = -qqad * rad_invlen3 * (polyauad*expauad);
2054 float dfda = -qqda * rda_invlen3 * (polyauda*expauda);
2055 float dfdd = -qqdd * rdd_invlen3 * (polyaudd*expaudd);
2057 T T_fx0, T_fy0, T_fz0;
2058 calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfad, rad_x, rad_y, rad_z, T_fx0, T_fy0, T_fz0);
2059 storeForces<T>(T_fx0, T_fy0, T_fz0, tl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2061 T T_fx1, T_fy1, T_fz1;
2062 calcComponentForces<T>(dfda, rda_x, rda_y, rda_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx1, T_fy1, T_fz1);
2063 storeForces<T>(T_fx1, T_fy1, T_fz1, tl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2065 T T_fx2, T_fy2, T_fz2;
2066 calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfda, rda_x, rda_y, rda_z, T_fx2, T_fy2, T_fz2);
2067 storeForces<T>(-T_fx2, -T_fy2, -T_fz2, tl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2069 T T_fx3, T_fy3, T_fz3;
2070 calcComponentForces<T>(dfad, rad_x, rad_y, rad_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx3, T_fy3, T_fz3);
2071 storeForces<T>(-T_fx3, -T_fy3, -T_fz3, tl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
2074 #ifdef WRITE_FULL_VIRIALS
2075 const float faa_x = dfaa * raa_x;
2076 const float faa_y = dfaa * raa_y;
2077 const float faa_z = dfaa * raa_z;
2078 const float fad_x = dfad * rad_x;
2079 const float fad_y = dfad * rad_y;
2080 const float fad_z = dfad * rad_z;
2081 const float fda_x = dfda * rda_x;
2082 const float fda_y = dfda * rda_y;
2083 const float fda_z = dfda * rda_z;
2084 const float fdd_x = dfdd * rdd_x;
2085 const float fdd_y = dfdd * rdd_y;
2086 const float fdd_z = dfdd * rdd_z;
2087 virial.xx = faa_x * raa_x + fad_x * rad_x + fda_x * rda_x + fdd_x * rdd_x;
2088 virial.xy = faa_x * raa_y + fad_x * rad_y + fda_x * rda_y + fdd_x * rdd_y;
2089 virial.xz = faa_x * raa_z + fad_x * rad_z + fda_x * rda_z + fdd_x * rdd_z;
2090 virial.yx = faa_y * raa_x + fad_y * rad_x + fda_y * rda_x + fdd_y * rdd_x;
2091 virial.yy = faa_y * raa_y + fad_y * rad_y + fda_y * rda_y + fdd_y * rdd_y;
2092 virial.yz = faa_y * raa_z + fad_y * rad_z + fda_y * rda_z + fdd_y * rdd_z;
2093 virial.zx = faa_z * raa_x + fad_z * rad_x + fda_z * rda_x + fdd_z * rdd_x;
2094 virial.zy = faa_z * raa_y + fad_z * rad_y + fda_z * rda_y + fdd_z * rdd_y;
2095 virial.zz = faa_z * raa_z + fad_z * rad_z + fda_z * rda_z + fdd_z * rdd_z;
2101 #define BONDEDFORCESKERNEL_NUM_WARP 2
2103 #define BONDEDFORCESKERNEL_NUM_WARP 4
2106 * @brief Calculates all bonded forces in a single kernel call
2108 * @remark This kernel may be called multiple times. For example,
2109 * assuming that we have 123 bonds and 456 angles, there are
2110 * 579 bonded terms in total. The size of a thread block is 2*32=64 (CUDA),
2111 * so we need 2 blocks for bonds and 8 blocks for angles. Due to
2112 * hardware limitations we may be restricted to 4 blocks at a time, but
2113 * we need to compute all 10 (2 bond + 8 angle) blocks, so
2114 * bondedForcesKernel has to be launched three times, with 4, 4 and 2 blocks:
2116 * 1st launch: starts with an offset of 0 blocks (start=0). The first two
2117 * blocks satisfy indexTB < numBondsTB, where numBondsTB
2118 * is the number of blocks used for bonds, so the code in
2119 * if (indexTB < numBondsTB) {...} computes the bond
2120 * forces and then jumps to the reduction part by goto;
2121 * the remaining two blocks fall through to the angle branch.
2122 * 2nd launch: starts with an offset of 4 blocks (start=4); now every block has
2123 * indexTB >= numBondsTB, so the bond branch is skipped and all
2124 * four blocks continue to the angle forces.
2125 * 3rd launch: starts with an offset of 8 blocks (start=8); likewise, the
2126 * remaining two blocks compute the angle forces.
2127 * In summary, the goto in the kernel guarantees that a group of
2128 * blocks is only used for a specific bonded term, while reusing the
2129 * reduction code for accumulating the energies.
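 * A minimal sketch of the corresponding host-side launch loop (illustrative only;
 * variable names are placeholders and the real call passes the full argument list):
 *
 *   int start = 0;
 *   while (start < numBlocksTotal) {
 *     int nblock = min(maxBlocksPerLaunch, numBlocksTotal - start);
 *     bondedForcesKernel<...><<<nblock, nthread, shmem, stream>>>(start, ...);
 *     start += nblock;
 *   }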
2131 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI>
2132 __global__ void bondedForcesKernel(
2136 const CudaBond* __restrict__ bonds,
2137 const CudaBondValue* __restrict__ bondValues,
2139 const int numAngles,
2140 const CudaAngle* __restrict__ angles,
2141 const CudaAngleValue* __restrict__ angleValues,
2143 const int numDihedrals,
2144 const CudaDihedral* __restrict__ dihedrals,
2145 const CudaDihedralValue* __restrict__ dihedralValues,
2147 const int numImpropers,
2148 const CudaDihedral* __restrict__ impropers,
2149 const CudaDihedralValue* __restrict__ improperValues,
2151 const int numExclusions,
2152 const CudaExclusion* __restrict__ exclusions,
2154 const int numCrossterms,
2155 const CudaCrossterm* __restrict__ crossterms,
2156 const CudaCrosstermValue* __restrict__ crosstermValues,
2158 const int numTholes,
2159 const CudaThole* __restrict__ tholes,
2161 const int numAnisos,
2162 const CudaAniso* __restrict__ anisos,
2164 const int numOneFourNbTholes,
2165 const CudaOneFourNbThole* __restrict__ oneFourNbTholes,
2167 const float cutoff2, const float one_scale14,
2168 const float r2_delta, const int r2_delta_expc,
2170 const float* __restrict__ r2_table,
2171 const float4* __restrict__ exclusionTable,
2173 #if !defined(USE_TABLE_ARRAYS)
2174 cudaTextureObject_t r2_table_tex,
2175 cudaTextureObject_t exclusionTableTex,
2178 const float4* __restrict__ xyzq,
2180 const float3 lata, const float3 latb, const float3 latc,
2181 T* __restrict__ force,
2182 T* __restrict__ forceSlow,
2183 T* __restrict__ forceList,
2184 int* forceListCounter,
2185 int* forceListStarts,
2186 int* forceListStartsSlow,
2187 int* forceListNexts,
2188 double* __restrict__ energies_virials) {
2190 // Thread-block index
2191 int indexTB = start + blockIdx.x;
2193 const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
2194 const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
2195 const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
2196 const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
2197 const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
2198 const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
2199 const int numTholesTB = (numTholes + blockDim.x - 1)/blockDim.x;
2200 const int numAnisosTB = (numAnisos + blockDim.x - 1)/blockDim.x;
2201 int numOneFourNbTholesTB;
2202 if (doElect) numOneFourNbTholesTB = (numOneFourNbTholes + blockDim.x - 1)/blockDim.x;
2204 // Each thread computes a single bonded interaction.
2205 // Each thread block computes a single bonded type.
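// For example, with numBondsTB = 2 and numAnglesTB = 8 (as in the doc comment above),
// thread blocks 0-1 handle bonds and blocks 2-9 handle angles; each branch below
// subtracts its block count from indexTB before testing the next tuple type.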
2212 int energyIndex_TI_1;
2213 int energyIndex_TI_2;
2225 energyIndex_TI_1 = -1;
2226 energyIndex_TI_2 = -1;
2230 #ifdef WRITE_FULL_VIRIALS
2231 ComputeBondedCUDAKernel::BondedVirial<double> virial;
2243 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2247 if (indexTB < numBondsTB) {
2248 int i = threadIdx.x + indexTB*blockDim.x;
2250 energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
2252 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_BOND_F;
2255 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_1;
2256 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_2;
2260 bondForce<T, doEnergy, doVirial, doFEP, doTI>
2261 (i, bonds, bondValues, xyzq,
2262 stride, lata, latb, latc,
2264 forceList, forceListCounter, forceListStarts, forceListNexts,
2266 energy_F, energy_TI_1, energy_TI_2);
2270 indexTB -= numBondsTB;
2272 if (indexTB < numAnglesTB) {
2273 int i = threadIdx.x + indexTB*blockDim.x;
2275 energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
2277 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANGLE_F;
2280 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1;
2281 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2;
2284 if (i < numAngles) {
2285 angleForce<T, doEnergy, doVirial, doFEP, doTI>
2286 (i, angles, angleValues, xyzq, stride,
2289 forceList, forceListCounter, forceListStarts, forceListNexts,
2291 energy_F, energy_TI_1, energy_TI_2);
2295 indexTB -= numAnglesTB;
2297 if (indexTB < numDihedralsTB) {
2298 int i = threadIdx.x + indexTB*blockDim.x;
2300 energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
2302 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F;
2305 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1;
2306 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2;
2310 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2312 if (i < numDihedrals) {
2313 diheForce<T, doEnergy, doVirial, doFEP, doTI>
2314 (i, dihedrals, dihedralValues, xyzq, stride,
2317 forceList, forceListCounter, forceListStarts, forceListNexts,
2319 energy_F, energy_TI_1, energy_TI_2);
2323 indexTB -= numDihedralsTB;
2325 if (indexTB < numImpropersTB) {
2326 int i = threadIdx.x + indexTB*blockDim.x;
2328 energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
2330 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_IMPROPER_F;
2333 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1;
2334 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2;
2337 if (i < numImpropers) {
2338 diheForce<T, doEnergy, doVirial, doFEP, doTI>
2339 (i, impropers, improperValues, xyzq, stride,
2342 forceList, forceListCounter, forceListStarts, forceListNexts,
2344 energy_F, energy_TI_1, energy_TI_2);
2348 indexTB -= numImpropersTB;
2350 if (indexTB < numExclusionsTB) {
2351 int i = threadIdx.x + indexTB*blockDim.x;
2353 energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
2355 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F;
2358 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1;
2359 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2;
2363 virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
2365 if (i < numExclusions) {
2366 exclusionForce<T, doEnergy, doVirial, doFEP, doTI>
2367 (i, exclusions, r2_delta, r2_delta_expc,
2368 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
2369 r2_table, exclusionTable,
2371 r2_table_tex, exclusionTableTex,
2373 xyzq, stride, lata, latb, latc, cutoff2,
2375 forceList, forceListCounter, forceListStartsSlow, forceListNexts,
2377 energy_F, energy_TI_1, energy_TI_2);
2381 indexTB -= numExclusionsTB;
2383 if (indexTB < numCrosstermsTB) {
2384 int i = threadIdx.x + indexTB*blockDim.x;
2386 energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
2388 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F;
2391 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_1;
2392 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_2;
2396 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2398 if (i < numCrossterms) {
2399 crosstermForce<T, doEnergy, doVirial, doFEP, doTI>
2400 (i, crossterms, crosstermValues,
2401 xyzq, stride, lata, latb, latc,
2403 forceList, forceListCounter, forceListStarts, forceListNexts,
2405 energy_F, energy_TI_1, energy_TI_2);
2409 indexTB -= numCrosstermsTB;
2411 if (indexTB < numTholesTB) {
2412 int i = threadIdx.x + indexTB*blockDim.x;
2414 energyIndex = ComputeBondedCUDAKernel::energyIndex_THOLE;
2416 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_THOLE_F;
2419 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_1;
2420 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_2;
2424 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2426 if (i < numTholes) {
2427 tholeForce<T, doEnergy, doVirial, doFEP, doTI>(
2428 i, tholes, xyzq, stride, lata, latb, latc,
2429 force, energy, forceList, forceListCounter,
2430 forceListStarts, forceListNexts,
2431 virial, energy_F, energy_TI_1, energy_TI_2);
2435 indexTB -= numTholesTB;
2437 if (indexTB < numAnisosTB) {
2438 int i = threadIdx.x + indexTB*blockDim.x;
2440 energyIndex = ComputeBondedCUDAKernel::energyIndex_ANISO;
2442 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANISO_F;
2445 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_1;
2446 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_2;
2450 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2453 anisoForce<T, doEnergy, doVirial, doFEP, doTI>(
2454 i, anisos, xyzq, stride, lata, latb, latc,
2455 force, energy, forceList, forceListCounter,
2456 forceListStarts, forceListNexts,
2457 virial, energy_F, energy_TI_1, energy_TI_2);
2461 if (doElect) indexTB -= numAnisosTB;
2464 if (indexTB < numOneFourNbTholesTB) {
2465 int i = threadIdx.x + indexTB*blockDim.x;
2467 energyIndex = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE;
2469 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE_F;
2472 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE_TI_1;
2473 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ONEFOURNBTHOLE_TI_2;
2477 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2479 if (i < numOneFourNbTholes) {
2480 oneFourNbTholeForce<T, doEnergy, doVirial, doFEP, doTI>(
2481 i, one_scale14, oneFourNbTholes, xyzq, stride, lata, latb, latc,
2482 force, energy, forceList, forceListCounter,
2483 forceListStarts, forceListNexts,
2484 virial, energy_F, energy_TI_1, energy_TI_2);
2488 // indexTB -= numOneFourNbTholesTB;
2493 // Write energies to global memory
2495 // energyIndex is constant within thread-block
2496 __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
2497 __shared__ double shEnergy_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2498 __shared__ double shEnergy_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2499 __shared__ double shEnergy_F[BONDEDFORCESKERNEL_NUM_WARP];
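// Two-stage reduction: an XOR-shuffle butterfly first sums each energy within a warp,
// the per-warp partials are staged in the shared arrays above, a second warp-level
// butterfly combines the BONDEDFORCESKERNEL_NUM_WARP partials, and the result is
// added with a single atomicAdd per energy term into bin (blockIdx.x % ATOMIC_BINS)
// of the energies_virials array.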
2501 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2502 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2505 energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2508 energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2509 energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2512 int laneID = (threadIdx.x & (WARPSIZE - 1));
2513 int warpID = threadIdx.x / WARPSIZE;
2515 shEnergy[warpID] = energy;
2517 shEnergy_F[warpID] = energy_F;
2520 shEnergy_TI_1[warpID] = energy_TI_1;
2521 shEnergy_TI_2[warpID] = energy_TI_2;
2526 energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
2528 energy_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_F[laneID] : 0.0;
2531 energy_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_1[laneID] : 0.0;
2532 energy_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_2[laneID] : 0.0;
2535 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2536 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2539 energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2542 energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2543 energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2547 const int bin = blockIdx.x % ATOMIC_BINS;
2548 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
2550 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_F], energy_F);
2553 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_1], energy_TI_1);
2554 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_2], energy_TI_2);
2560 // Write virials to global memory
2561 #ifdef WRITE_FULL_VIRIALS
2564 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2565 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2566 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2567 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2568 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2569 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2570 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2571 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2572 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2573 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2575 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
2576 int laneID = (threadIdx.x & (WARPSIZE - 1));
2577 int warpID = threadIdx.x / WARPSIZE;
2579 shVirial[warpID] = virial;
2593 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
2595 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2596 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2597 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2598 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2599 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2600 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2601 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2602 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2603 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2604 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2608 const int bin = blockIdx.x % ATOMIC_BINS;
2609 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
2610 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
2611 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
2612 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
2613 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
2614 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
2615 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
2616 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
2617 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
2625 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
2626 __global__ void modifiedExclusionForcesKernel(
2628 const int numModifiedExclusions,
2629 const CudaExclusion* __restrict__ modifiedExclusions,
2631 const float one_scale14, // 1 - scale14
2632 const float cutoff2,
2633 const CudaNBConstants nbConstants,
2634 const int vdwCoefTableWidth,
2635 const float2* __restrict__ vdwCoefTable,
2636 #if !defined(USE_TABLE_ARRAYS)
2637 cudaTextureObject_t vdwCoefTableTex,
2639 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2640 const float4* __restrict__ modifiedExclusionForceTable,
2641 const float4* __restrict__ modifiedExclusionEnergyTable,
2643 cudaTextureObject_t modifiedExclusionForceTableTex,
2644 cudaTextureObject_t modifiedExclusionEnergyTableTex,
2646 const float4* __restrict__ xyzq,
2648 const float3 lata, const float3 latb, const float3 latc,
2649 T* __restrict__ forceNbond, T* __restrict__ forceSlow,
2650 T* __restrict__ forceList,
2651 int* forceListCounter,
2652 int* forceListStartsNbond,
2653 int* forceListStartsSlow,
2654 int* forceListNexts,
2655 double* __restrict__ energies_virials) {
2658 int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
2660 double energyVdw, energyNbond, energySlow;
2661 // Alchemical energies
2662 double energyVdw_F, energyVdw_TI_1, energyVdw_TI_2;
2663 double energyNbond_F, energyNbond_TI_1, energyNbond_TI_2;
2664 double energySlow_F, energySlow_TI_1, energySlow_TI_2;
2671 energyVdw_TI_1 = 0.0;
2672 energyVdw_TI_2 = 0.0;
2678 energyNbond_F = 0.0;
2682 energyNbond_TI_1 = 0.0;
2683 energyNbond_TI_2 = 0.0;
2684 energySlow_TI_1 = 0.0;
2685 energySlow_TI_2 = 0.0;
2690 #ifdef WRITE_FULL_VIRIALS
2691 ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
2692 ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
2694 virialNbond.xx = 0.0;
2695 virialNbond.xy = 0.0;
2696 virialNbond.xz = 0.0;
2697 virialNbond.yx = 0.0;
2698 virialNbond.yy = 0.0;
2699 virialNbond.yz = 0.0;
2700 virialNbond.zx = 0.0;
2701 virialNbond.zy = 0.0;
2702 virialNbond.zz = 0.0;
2704 virialSlow.xx = 0.0;
2705 virialSlow.xy = 0.0;
2706 virialSlow.xz = 0.0;
2707 virialSlow.yx = 0.0;
2708 virialSlow.yy = 0.0;
2709 virialSlow.yz = 0.0;
2710 virialSlow.zx = 0.0;
2711 virialSlow.zy = 0.0;
2712 virialSlow.zz = 0.0;
2717 if (i < numModifiedExclusions)
2719 modifiedExclusionForce<T, doEnergy, doVirial, doElect, doFEP, doTI, doTable>
2720 (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
2721 #if __CUDA_ARCH__ >= 350 || defined(USE_TABLE_ARRAYS)
2726 // for modified exclusions, we use regular (array) force/energy tables on HIP builds with table arrays enabled; otherwise textures are used
2727 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2729 modifiedExclusionForceTable,
2730 modifiedExclusionEnergyTable,
2732 // otherwise (e.g. CUDA builds without table arrays), fall back to the texture force/energy tables
2733 modifiedExclusionForceTableTex,
2734 modifiedExclusionEnergyTableTex,
2736 xyzq, stride, lata, latb, latc, cutoff2, nbConstants,
2737 energyVdw, forceNbond, energyNbond,
2738 forceSlow, energySlow,
2739 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
2740 virialNbond, virialSlow,
2741 energyVdw_F, energyVdw_TI_1, energyVdw_TI_2,
2742 energyNbond_F, energyNbond_TI_1, energyNbond_TI_2,
2743 energySlow_F, energySlow_TI_1, energySlow_TI_2);
2746 // Write energies to global memory
2748 __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
2749 __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2750 __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2751 __shared__ double shEnergyVdw_F[BONDEDFORCESKERNEL_NUM_WARP];
2752 __shared__ double shEnergyVdw_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2753 __shared__ double shEnergyVdw_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2754 __shared__ double shEnergyNbond_F[BONDEDFORCESKERNEL_NUM_WARP];
2755 __shared__ double shEnergyNbond_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2756 __shared__ double shEnergyNbond_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2757 __shared__ double shEnergySlow_F[BONDEDFORCESKERNEL_NUM_WARP];
2758 __shared__ double shEnergySlow_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2759 __shared__ double shEnergySlow_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2761 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2762 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2765 energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2768 energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2769 energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2772 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2773 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2775 energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2776 energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2779 energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2780 energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2781 energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2782 energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2786 int laneID = (threadIdx.x & (WARPSIZE - 1));
2787 int warpID = threadIdx.x / WARPSIZE;
2789 shEnergyVdw[warpID] = energyVdw;
2791 shEnergyVdw_F[warpID] = energyVdw_F;
2794 shEnergyVdw_TI_1[warpID] = energyVdw_TI_1;
2795 shEnergyVdw_TI_2[warpID] = energyVdw_TI_2;
2798 shEnergyNbond[warpID] = energyNbond;
2799 shEnergySlow[warpID] = energySlow;
2801 shEnergyNbond_F[warpID] = energyNbond_F;
2802 shEnergySlow_F[warpID] = energySlow_F;
2805 shEnergyNbond_TI_1[warpID] = energyNbond_TI_1;
2806 shEnergyNbond_TI_2[warpID] = energyNbond_TI_2;
2807 shEnergySlow_TI_1[warpID] = energySlow_TI_1;
2808 shEnergySlow_TI_2[warpID] = energySlow_TI_2;
2814 energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
2816 energyVdw_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_F[laneID] : 0.0;
2819 energyVdw_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_1[laneID] : 0.0;
2820 energyVdw_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_2[laneID] : 0.0;
2823 energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
2824 energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
2826 energyNbond_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_F[laneID] : 0.0;
2827 energySlow_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_F[laneID] : 0.0;
2830 energyNbond_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_1[laneID] : 0.0;
2831 energyNbond_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_2[laneID] : 0.0;
2832 energySlow_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_1[laneID] : 0.0;
2833 energySlow_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_2[laneID] : 0.0;
2837 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2838 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2841 energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2844 energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2845 energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2848 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2849 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2851 energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2852 energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2855 energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2856 energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2857 energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2858 energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2863 const int bin = blockIdx.x % ATOMIC_BINS;
2864 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
2866 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_F], energyVdw_F);
2869 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_1], energyVdw_TI_1);
2870 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_2], energyVdw_TI_2);
2873 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
2874 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
2876 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_F], energyNbond_F);
2877 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F], energySlow_F);
2880 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1], energyNbond_TI_1);
2881 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2], energyNbond_TI_2);
2882 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1], energySlow_TI_1);
2883 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2], energySlow_TI_2);
2890 // Write virials to global memory
2891 #ifdef WRITE_FULL_VIRIALS
2894 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2895 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2896 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2897 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2898 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2899 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2900 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2901 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2902 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2903 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2904 if (doElect && doSlow) {
2905 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2906 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2907 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2908 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2909 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2910 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2911 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2912 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2913 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2916 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
2917 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2918 int laneID = (threadIdx.x & (WARPSIZE - 1));
2919 int warpID = threadIdx.x / WARPSIZE;
2921 shVirialNbond[warpID] = virialNbond;
2922 if (doElect && doSlow) {
2923 shVirialSlow[warpID] = virialSlow;
2928 virialNbond.xx = 0.0;
2929 virialNbond.xy = 0.0;
2930 virialNbond.xz = 0.0;
2931 virialNbond.yx = 0.0;
2932 virialNbond.yy = 0.0;
2933 virialNbond.yz = 0.0;
2934 virialNbond.zx = 0.0;
2935 virialNbond.zy = 0.0;
2936 virialNbond.zz = 0.0;
2937 if (doElect && doSlow) {
2938 virialSlow.xx = 0.0;
2939 virialSlow.xy = 0.0;
2940 virialSlow.xz = 0.0;
2941 virialSlow.yx = 0.0;
2942 virialSlow.yy = 0.0;
2943 virialSlow.yz = 0.0;
2944 virialSlow.zx = 0.0;
2945 virialSlow.zy = 0.0;
2946 virialSlow.zz = 0.0;
2950 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
2951 virialNbond = shVirialNbond[laneID];
2952 if (doElect && doSlow) {
2953 virialSlow = shVirialSlow[laneID];
2957 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2958 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2959 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2960 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2961 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2962 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2963 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2964 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2965 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2966 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2967 if (doElect && doSlow) {
2968 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2969 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2970 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2971 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2972 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2973 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2974 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2975 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2976 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2982 const int bin = blockIdx.x % ATOMIC_BINS;
2983 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
2984 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
2985 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
2986 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
2987 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
2988 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
2989 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
2990 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
2991 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
2992 if (doElect && doSlow) {
2993 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
2994 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
2995 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
2996 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
2997 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
2998 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
2999 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
3000 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
3001 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
3010 template <typename T>
3011 __global__ void gatherBondedForcesKernel(
3013 const int atomStorageSize,
3015 const T* __restrict__ forceList,
3016 const int* __restrict__ forceListStarts,
3017 const int* __restrict__ forceListNexts,
3018 T* __restrict__ force) {
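// Each atom i owns a singly linked list of force contributions: forceListStarts[i] is
// the head index, forceListNexts chains the remaining entries, and each entry stores
// three consecutive components (x, y, z) in forceList. The loop below walks the list,
// accumulates the components, and writes the totals into the strided force array.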
3020 int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
3022 if (i < atomStorageSize) {
3026 int pos = forceListStarts[i];
3028 fx += forceList[pos * 3 + 0];
3029 fy += forceList[pos * 3 + 1];
3030 fz += forceList[pos * 3 + 2];
3031 pos = forceListNexts[pos];
3034 force[i + stride ] = fy;
3035 force[i + stride * 2] = fz;
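// Reduces the per-bin partial sums accumulated by the force kernels: each thread block
// handles one energy/virial slot (blockIdx.x), each thread reads one of the ATOMIC_BINS
// partial bins, a cub::WarpReduce sums them, and thread 0 writes the total back into
// slot blockIdx.x of energies_virials.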
3039 __global__ void reduceBondedBinsKernel(double* energies_virials) {
3040 const int bin = threadIdx.x;
3042 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
3043 __shared__ typename WarpReduce::TempStorage tempStorage;
3045 double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
3046 if (threadIdx.x == 0) {
3047 energies_virials[blockIdx.x] = v;
3051 // ##############################################################################################
3052 // ##############################################################################################
3053 // ##############################################################################################
3056 // Class constructor
3058 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
3059 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
3061 cudaCheck(cudaSetDevice(deviceID));
3070 modifiedExclusions = NULL;
3074 oneFourNbTholes = NULL;
3080 numModifiedExclusions = 0;
3085 numOneFourNbTholes = 0;
3089 dihedralValues = NULL;
3090 improperValues = NULL;
3091 crosstermValues = NULL;
3100 forceListStarts = NULL;
3101 forceListNexts = NULL;
3103 forceListStartsSize = 0;
3104 forceListNextsSize = 0;
3105 allocate_device<int>(&forceListCounter, 1);
3107 allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
3113 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
3114 cudaCheck(cudaSetDevice(deviceID));
3116 deallocate_device<double>(&energies_virials);
3117 // deallocate_device<BondedVirial>(&virial);
3119 if (tupleData != NULL) deallocate_device<char>(&tupleData);
3120 if (xyzq != NULL) deallocate_device<float4>(&xyzq);
3121 if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
3123 if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
3124 if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
3125 if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
3126 if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
3128 if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
3129 if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
3130 if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
3131 if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
3132 if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
3135 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
3136 allocate_device<CudaBondValue>(&bondValues, numBondValues);
3137 copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
3140 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
3141 allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
3142 copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
3145 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
3146 allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
3147 copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
3150 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
3151 allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
3152 copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
3155 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
3156 allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
3157 copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
3160 void ComputeBondedCUDAKernel::updateCudaAlchParameters(const CudaAlchParameters* h_cudaAlchParameters, cudaStream_t stream) {
3161 cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchParams, h_cudaAlchParameters, 1*sizeof(AlchBondedCUDA::alchParams), 0, cudaMemcpyHostToDevice, stream));
3164 void ComputeBondedCUDAKernel::updateCudaAlchLambdas(const CudaAlchLambdas* h_cudaAlchLambdas, cudaStream_t stream) {
3165 cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchLambdas, h_cudaAlchLambdas, 1*sizeof(AlchBondedCUDA::alchLambdas), 0, cudaMemcpyHostToDevice, stream));
3168 void ComputeBondedCUDAKernel::updateCudaAlchFlags(const CudaAlchFlags& h_cudaAlchFlags) {
3169 alchFlags = h_cudaAlchFlags;
3173 // Update bonded lists
3175 void ComputeBondedCUDAKernel::setTupleCounts(
3176 const TupleCounts count
3178 numBonds = count.bond;
3179 numAngles = count.angle;
3180 numDihedrals = count.dihedral;
3181 numImpropers = count.improper;
3182 numModifiedExclusions = count.modifiedExclusion;
3183 numExclusions = count.exclusion;
3184 numCrossterms = count.crossterm;
3185 numTholes = count.thole;
3186 numAnisos = count.aniso;
3187 numOneFourNbTholes = count.oneFourNbThole;
3190 TupleCounts ComputeBondedCUDAKernel::getTupleCounts() {
3193 count.bond = numBonds;
3194 count.angle = numAngles;
3195 count.dihedral = numDihedrals;
3196 count.improper = numImpropers;
3197 count.modifiedExclusion = numModifiedExclusions;
3198 count.exclusion = numExclusions;
3199 count.crossterm = numCrossterms;
3200 count.thole = numTholes;
3201 count.aniso = numAnisos;
3202 count.oneFourNbThole = numOneFourNbTholes;
3207 TupleData ComputeBondedCUDAKernel::getData() {
3210 data.angle = angles;
3211 data.dihedral = dihedrals;
3212 data.improper = impropers;
3213 data.modifiedExclusion = modifiedExclusions;
3214 data.exclusion = exclusions;
3215 data.crossterm = crossterms;
3216 data.thole = tholes;
3217 data.aniso = anisos;
3218 data.oneFourNbThole = oneFourNbTholes;
3223 size_t ComputeBondedCUDAKernel::reallocateTupleBuffer(
3224 const TupleCounts countIn,
3227 const int numBondsWA = warpAlign(countIn.bond);
3228 const int numAnglesWA = warpAlign(countIn.angle);
3229 const int numDihedralsWA = warpAlign(countIn.dihedral);
3230 const int numImpropersWA = warpAlign(countIn.improper);
3231 const int numModifiedExclusionsWA = warpAlign(countIn.modifiedExclusion);
3232 const int numExclusionsWA = warpAlign(countIn.exclusion);
3233 const int numCrosstermsWA = warpAlign(countIn.crossterm);
3234 const int numTholesWA = warpAlign(countIn.thole);
3235 const int numAnisosWA = warpAlign(countIn.aniso);
3236 const int numOneFourNbTholesWA = warpAlign(countIn.oneFourNbThole);
3238 const size_t sizeTot = numBondsWA*sizeof(CudaBond)
3239 + numAnglesWA*sizeof(CudaAngle)
3240 + numDihedralsWA*sizeof(CudaDihedral)
3241 + numImpropersWA*sizeof(CudaDihedral)
3242 + numModifiedExclusionsWA*sizeof(CudaExclusion)
3243 + numExclusionsWA*sizeof(CudaExclusion)
3244 + numCrosstermsWA*sizeof(CudaCrossterm)
3245 + numTholesWA*sizeof(CudaThole)
3246 + numAnisosWA*sizeof(CudaAniso)
3247 + numOneFourNbTholesWA*sizeof(CudaOneFourNbThole);
3249 reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, kTupleOveralloc);
3253 bonds = (CudaBond *)&tupleData[pos];
3254 pos += numBondsWA*sizeof(CudaBond);
3256 angles = (CudaAngle* )&tupleData[pos];
3257 pos += numAnglesWA*sizeof(CudaAngle);
3259 dihedrals = (CudaDihedral* )&tupleData[pos];
3260 pos += numDihedralsWA*sizeof(CudaDihedral);
3262 impropers = (CudaDihedral* )&tupleData[pos];
3263 pos += numImpropersWA*sizeof(CudaDihedral);
3265 modifiedExclusions = (CudaExclusion* )&tupleData[pos];
3266 pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
3268 exclusions = (CudaExclusion* )&tupleData[pos];
3269 pos += numExclusionsWA*sizeof(CudaExclusion);
3271 crossterms = (CudaCrossterm* )&tupleData[pos];
3272 pos += numCrosstermsWA*sizeof(CudaCrossterm);
3274 tholes = (CudaThole *)&tupleData[pos];
3275 pos += numTholesWA*sizeof(CudaThole);
3277 anisos = (CudaAniso *)&tupleData[pos];
3278 pos += numAnisosWA*sizeof(CudaAniso);
3280 oneFourNbTholes = (CudaOneFourNbThole *)&tupleData[pos];
3281 pos += numOneFourNbTholesWA*sizeof(CudaOneFourNbThole);
3286 void ComputeBondedCUDAKernel::update(
3287 const int numBondsIn,
3288 const int numAnglesIn,
3289 const int numDihedralsIn,
3290 const int numImpropersIn,
3291 const int numModifiedExclusionsIn,
3292 const int numExclusionsIn,
3293 const int numCrosstermsIn,
3294 const int numTholesIn,
3295 const int numAnisosIn,
3296 const int numOneFourNbTholesIn,
3297 const char* h_tupleData,
3298 cudaStream_t stream) {
3300 numBonds = numBondsIn;
3301 numAngles = numAnglesIn;
3302 numDihedrals = numDihedralsIn;
3303 numImpropers = numImpropersIn;
3304 numModifiedExclusions = numModifiedExclusionsIn;
3305 numExclusions = numExclusionsIn;
3306 numCrossterms = numCrosstermsIn;
3307 numTholes = numTholesIn;
3308 numAnisos = numAnisosIn;
3309 numOneFourNbTholes = numOneFourNbTholesIn;
3311 const size_t sizeTot = reallocateTupleBuffer(getTupleCounts(), stream);
3313 copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
3316 void ComputeBondedCUDAKernel::updateAtomBuffer(
3317 const int atomStorageSize,
3320 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3324 // Return stride for force array
3326 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
3327 #ifdef USE_STRIDED_FORCE
3328 // Align stride to 256 bytes
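// Worked example (assuming sizeof(FORCE_TYPE) == 4): atomStorageSize = 1000 gives
// ((4000 - 1)/256 + 1)*256 = 4096 bytes, i.e. a stride of 1024 elements.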
3329 return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
3336 // Return size of single force array
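// A single force array holds three FORCE_TYPE planes (x, y and z components),
// each of length getForceStride() in the strided case.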
3338 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
3339 #ifdef USE_STRIDED_FORCE
3340 return (3*getForceStride(atomStorageSize));
3342 return (3*atomStorageSize);
3347 // Return size of all the force arrays
3349 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
3351 int forceSize = getForceSize(atomStorageSize);
3353 if (numModifiedExclusions > 0 || numExclusions > 0) {
3355 // All three force arrays [normal, nbond, slow]
3358 // Two force arrays [normal, nbond]
3367 // Compute bonded forces
3369 void ComputeBondedCUDAKernel::bondedForce(
3370 const double scale14, const int atomStorageSize,
3371 const bool doEnergy, const bool doVirial, const bool doSlow,
3373 const float3 lata, const float3 latb, const float3 latc,
3374 const float cutoff2, const float r2_delta, const int r2_delta_expc,
3375 const CudaNBConstants nbConstants,
3376 const float4* h_xyzq, FORCE_TYPE* h_forces,
3377 double *h_energies_virials, bool atomsChanged, bool CUDASOAintegrate,
3378 bool useDeviceMigration, cudaStream_t stream) {
3380 int forceStorageSize = getAllForceSize(atomStorageSize, true);
3381 int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
3382 int forceStride = getForceStride(atomStorageSize);
3384 int forceSize = getForceSize(atomStorageSize);
3385 int startNbond = forceSize;
3386 int startSlow = 2*forceSize;
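// The three force arrays are stored back to back in the same buffer:
// normal forces at offset 0, nonbonded-modified forces at startNbond,
// and slow forces at startSlow.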
3387 // Re-allocate coordinate and force arrays if necessary
3388 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3389 reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
3391 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3393 // numBonds bondForce 2
3394 // numAngles angleForce 3
3395 // numDihedrals diheForce 4
3396 // numImpropers diheForce 4
3397 // numExclusions exclusionForce 2
3398 // numCrossterms crosstermForce 8
3399 // numModifiedExclusions modifiedExclusionForce 4
3400 // numTholes tholeForce 4
3401 // numAnisos anisoForce 5
3402 // numOneFourNbTholes oneFourNbTholeForce 4
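// Each stored contribution occupies three FORCE_TYPE slots (x, y, z), hence
// the factor of 3 below; the per-tuple multipliers above are the number of
// per-atom force contributions each tuple type emits.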
3403 int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4
       + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4
       + numTholes * 4 + numAnisos * 5 + numOneFourNbTholes * 4);
3404 reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
3405 reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
3406 reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
3407 int* forceListStartsNbond = forceListStarts + atomStorageSize;
3408 int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
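// forceListStarts holds three per-atom linked-list heads (normal, nbond,
// slow); resetting the counter and setting every head to -1 marks all lists
// as empty before the force kernels append to them.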
3410 clear_device_array<int>(forceListCounter, 1, stream);
3411 cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
3413 int* forceListStartsNbond = NULL;
3414 int* forceListStartsSlow = NULL;
3417 #ifdef NODEGROUP_FORCE_REGISTER
3418 if(CUDASOAintegrate){
3419 if(atomsChanged && !useDeviceMigration) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3420 }else copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3422 copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3424 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
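// With atomic force accumulation the output arrays are accumulated into
// directly, so they must be cleared before the kernels launch.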
3425 clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
3427 if (doEnergy || doVirial ) {
3428 clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
3431 // Check if we are doing alchemical free energy calculation
3432 // Is checking alchOn required?
3433 const bool doFEP = (alchFlags.alchOn) && (alchFlags.alchFepOn);
3434 const bool doTI = (alchFlags.alchOn) && (alchFlags.alchThermIntOn);
3436 float one_scale14 = (float)(1.0 - scale14);
3437 bool doElect = (one_scale14 != 0.0f);
3439 // If doSlow = false, these exclusions are not calculated
3440 int numExclusionsDoSlow = doSlow ? numExclusions : 0;
3442 int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3444 int numBondsTB = (numBonds + nthread - 1)/nthread;
3445 int numAnglesTB = (numAngles + nthread - 1)/nthread;
3446 int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
3447 int numImpropersTB = (numImpropers + nthread - 1)/nthread;
3448 int numExclusionsTB = (numExclusionsDoSlow + nthread - 1)/nthread;
3449 int numCrosstermsTB = (numCrossterms + nthread - 1)/nthread;
3450 int numTholesTB = (numTholes + nthread - 1)/nthread;
3451 int numAnisosTB = (numAnisos + nthread - 1)/nthread;
3452 int numOneFourNbTholesTB = doElect ? (numOneFourNbTholes + nthread - 1)/nthread : 0;
3454 int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
3455 numExclusionsTB + numCrosstermsTB + numTholesTB + numAnisosTB + numOneFourNbTholesTB;
3458 // printf("%d %d %d %d %d %d nblock %d\n",
3459 // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
3461 int max_nblock = deviceCUDA->getMaxNumBlocks();
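// Launch the bonded-force kernel in chunks of at most max_nblock thread
// blocks; start is the index of the first tuple block handled by each chunk.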
3464 while (start < nblock)
3466 int nleft = nblock - start;
3467 int nblock_use = min(max_nblock, nleft);
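// TABLE_PARAMS selects the lookup-table arguments passed to the kernel:
// plain device arrays when USE_TABLE_ARRAYS and NAMD_HIP are both defined,
// texture objects otherwise.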
3469 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3470 #define TABLE_PARAMS cudaNonbondedTables.get_r2_table(), \
3471 cudaNonbondedTables.getExclusionTable()
3473 #define TABLE_PARAMS cudaNonbondedTables.get_r2_table_tex(), \
3474 cudaNonbondedTables.getExclusionTableTex()
3477 #if defined(USE_TABLE_ARRAYS)
3478 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI) \
3479 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI> \
3480 <<< nblock_use, nthread, shmem_size, stream >>> \
3481 (start, numBonds, bonds, bondValues, \
3482 numAngles, angles, angleValues, \
3483 numDihedrals, dihedrals, dihedralValues, \
3484 numImpropers, impropers, improperValues, \
3485 numExclusionsDoSlow, exclusions, \
3486 numCrossterms, crossterms, crosstermValues, \
3487 numTholes, tholes, \
3488 numAnisos, anisos, \
3489 numOneFourNbTholes, oneFourNbTholes, \
3490 cutoff2, one_scale14, \
3491 r2_delta, r2_delta_expc, \
3493 xyzq, forceStride, \
3495 forces, &forces[startSlow], \
3496 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3499 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI) \
3500 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI> \
3501 <<< nblock_use, nthread, shmem_size, stream >>> \
3502 (start, numBonds, bonds, bondValues, \
3503 numAngles, angles, angleValues, \
3504 numDihedrals, dihedrals, dihedralValues, \
3505 numImpropers, impropers, improperValues, \
3506 numExclusionsDoSlow, exclusions, \
3507 numCrossterms, crossterms, crosstermValues, \
3508 numTholes, tholes, \
3509 numAnisos, anisos, \
3510 numOneFourNbTholes, oneFourNbTholes, \
3511 cutoff2, one_scale14, \
3512 r2_delta, r2_delta_expc, \
3513 cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
3514 cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
3515 xyzq, forceStride, \
3517 forces, &forces[startSlow], \
3518 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3521 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI) CALL(0, 0, 0, 0, 0);
3522 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI) CALL(0, 1, 0, 0, 0);
3523 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI) CALL(1, 0, 0, 0, 0);
3524 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI) CALL(1, 1, 0, 0, 0);
3526 if (!doEnergy && !doVirial && !doElect && doFEP && !doTI) CALL(0, 0, 0, 1, 0);
3527 if (!doEnergy && doVirial && !doElect && doFEP && !doTI) CALL(0, 1, 0, 1, 0);
3528 if ( doEnergy && !doVirial && !doElect && doFEP && !doTI) CALL(1, 0, 0, 1, 0);
3529 if ( doEnergy && doVirial && !doElect && doFEP && !doTI) CALL(1, 1, 0, 1, 0);
3531 if (!doEnergy && !doVirial && !doElect && !doFEP && doTI) CALL(0, 0, 0, 0, 1);
3532 if (!doEnergy && doVirial && !doElect && !doFEP && doTI) CALL(0, 1, 0, 0, 1);
3533 if ( doEnergy && !doVirial && !doElect && !doFEP && doTI) CALL(1, 0, 0, 0, 1);
3534 if ( doEnergy && doVirial && !doElect && !doFEP && doTI) CALL(1, 1, 0, 0, 1);
3536 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI) CALL(0, 0, 1, 0, 0);
3537 if (!doEnergy && doVirial && doElect && !doFEP && !doTI) CALL(0, 1, 1, 0, 0);
3538 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI) CALL(1, 0, 1, 0, 0);
3539 if ( doEnergy && doVirial && doElect && !doFEP && !doTI) CALL(1, 1, 1, 0, 0);
3541 if (!doEnergy && !doVirial && doElect && doFEP && !doTI) CALL(0, 0, 1, 1, 0);
3542 if (!doEnergy && doVirial && doElect && doFEP && !doTI) CALL(0, 1, 1, 1, 0);
3543 if ( doEnergy && !doVirial && doElect && doFEP && !doTI) CALL(1, 0, 1, 1, 0);
3544 if ( doEnergy && doVirial && doElect && doFEP && !doTI) CALL(1, 1, 1, 1, 0);
3546 if (!doEnergy && !doVirial && doElect && !doFEP && doTI) CALL(0, 0, 1, 0, 1);
3547 if (!doEnergy && doVirial && doElect && !doFEP && doTI) CALL(0, 1, 1, 0, 1);
3548 if ( doEnergy && !doVirial && doElect && !doFEP && doTI) CALL(1, 0, 1, 0, 1);
3549 if ( doEnergy && doVirial && doElect && !doFEP && doTI) CALL(1, 1, 1, 0, 1);
3551 // Can't enable both FEP and TI, don't expand the following functions.
3553 // if (!doEnergy && !doVirial && doFEP && doTI) CALL(0, 0, 1, 1);
3554 // if (!doEnergy &&  doVirial && doFEP && doTI) CALL(0, 1, 1, 1);
3555 // if ( doEnergy && !doVirial && doFEP && doTI) CALL(1, 0, 1, 1);
3556 // if ( doEnergy &&  doVirial && doFEP && doTI) CALL(1, 1, 1, 1);
3561 cudaCheck(cudaGetLastError());
3563 start += nblock_use;
3566 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3567 nblock = (numModifiedExclusions + nthread - 1)/nthread;
3570 while (start < nblock)
3572 int nleft = nblock - start;
3573 int nblock_use = min(max_nblock, nleft);
3575 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3576 #define TABLE_PARAMS \
3577 cudaNonbondedTables.getExclusionVdwCoefTable(), \
3578 cudaNonbondedTables.getModifiedExclusionForceTable(), \
3579 cudaNonbondedTables.getModifiedExclusionEnergyTable()
3581 #define TABLE_PARAMS \
3582 cudaNonbondedTables.getExclusionVdwCoefTable(), \
3583 cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
3584 cudaNonbondedTables.getModifiedExclusionForceTableTex(), \
3585 cudaNonbondedTables.getModifiedExclusionEnergyTableTex()
3588 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE) \
3589 modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE> \
3590 <<< nblock_use, nthread, shmem_size, stream >>> \
3591 (start, numModifiedExclusions, modifiedExclusions, \
3592 doSlow, one_scale14, cutoff2, nbConstants, \
3593 cudaNonbondedTables.getVdwCoefTableWidth(), \
3595 xyzq, forceStride, lata, latb, latc, \
3596 &forces[startNbond], &forces[startSlow], \
3597 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
3602 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 0, 0, 0, 0, 1);
3603 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 1, 0, 0, 0, 1);
3604 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 0, 0, 0, 0, 1);
3605 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 1, 0, 0, 0, 1);
3607 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 0, 1, 0, 0, 1);
3608 if (!doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 1, 1, 0, 0, 1);
3609 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 0, 1, 0, 0, 1);
3610 if ( doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 1, 1, 0, 0, 1);
3612 if (!doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 0, 0, 1, 0, 1);
3613 if (!doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 1, 0, 1, 0, 1);
3614 if ( doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 0, 0, 1, 0, 1);
3615 if ( doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 1, 0, 1, 0, 1);
3617 if (!doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 0, 1, 1, 0, 1);
3618 if (!doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 1, 1, 1, 0, 1);
3619 if ( doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 0, 1, 1, 0, 1);
3620 if ( doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 1, 1, 1, 0, 1);
3622 if (!doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 0, 0, 0, 1, 1);
3623 if (!doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 1, 0, 0, 1, 1);
3624 if ( doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 0, 0, 0, 1, 1);
3625 if ( doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 1, 0, 0, 1, 1);
3627 if (!doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 0, 1, 0, 1, 1);
3628 if (!doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 1, 1, 0, 1, 1);
3629 if ( doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 0, 1, 0, 1, 1);
3630 if ( doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 1, 1, 0, 1, 1);
3632 // Disabling doTable is only supported when FEP and TI are both off
3633 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 0, 0, 0, 0);
3634 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 0, 0, 0, 0);
3635 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 0, 0, 0, 0);
3636 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 0, 0, 0, 0);
3638 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 1, 0, 0, 0);
3639 if (!doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 1, 0, 0, 0);
3640 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 1, 0, 0, 0);
3641 if ( doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 1, 0, 0, 0);
3643 // Can't enable both FEP and TI, don't expand the following functions.
3645 // if (!doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(0, 0, 0, 1, 1);
3646 // if (!doEnergy &&  doVirial && !doElect && doFEP && doTI) CALL(0, 1, 0, 1, 1);
3647 // if ( doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(1, 0, 0, 1, 1);
3648 // if ( doEnergy &&  doVirial && !doElect && doFEP && doTI) CALL(1, 1, 0, 1, 1);
3650 // if (!doEnergy && !doVirial &&  doElect && doFEP && doTI) CALL(0, 0, 1, 1, 1);
3651 // if (!doEnergy &&  doVirial &&  doElect && doFEP && doTI) CALL(0, 1, 1, 1, 1);
3652 // if ( doEnergy && !doVirial &&  doElect && doFEP && doTI) CALL(1, 0, 1, 1, 1);
3653 // if ( doEnergy &&  doVirial &&  doElect && doFEP && doTI) CALL(1, 1, 1, 1, 1);
3657 cudaCheck(cudaGetLastError());
3659 start += nblock_use;
3661 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3662 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3663 nblock = (atomStorageSize + nthread - 1)/nthread;
3666 while (start < nblock)
3668 int nleft = nblock - start;
3669 int nblock_use = min(max_nblock, nleft);
3671 // cudaCheck(hipDeviceSynchronize());
3672 // auto t0 = std::chrono::high_resolution_clock::now();
3674 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3675 start, atomStorageSize, forceStride,
3676 forceList, forceListStarts, forceListNexts,
3678 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3679 start, atomStorageSize, forceStride,
3680 forceList, forceListStartsNbond, forceListNexts,
3681 &forces[startNbond]);
3683 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3684 start, atomStorageSize, forceStride,
3685 forceList, forceListStartsSlow, forceListNexts,
3686 &forces[startSlow]);
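// Each gather pass walks the per-atom singly linked lists built by the force
// kernels (forceListStarts -> forceListNexts) and sums the queued
// contributions from forceList into the corresponding strided force array.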
3688 cudaCheck(cudaGetLastError());
3690 // cudaCheck(hipStreamSynchronize(stream));
3691 // auto t1 = std::chrono::high_resolution_clock::now();
3692 // std::chrono::duration<double> diff1 = t1 - t0;
3693 // std::cout << "gatherBondedForcesKernel";
3694 // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
3696 start += nblock_use;
3700 #ifdef NODEGROUP_FORCE_REGISTER
3701 if((atomsChanged && !useDeviceMigration) || !CUDASOAintegrate){
3702 copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3704 // XXX TODO: ERASE THIS AFTERWARDS
3705 // this is not numAtoms, this is something else
3706 // will print the force inside the compute and afterwards
3707 //FILE* pos_nb_atoms = fopen("compute_b_dforce.txt", "w");
3708 //fprintf(pos_nb_atoms, "Calculating %d positions\n", tlKernel.getCudaPatchesSize());
3709 // positions are wrong here.
3710 //for(int i = 0; i < atomStorageSize; i++){
3711 //for(int i = 29415; i < 29895; i++){
3712 //  fprintf(pos_nb_atoms, "%2.4lf\n", h_forcesDP[i]);
3714 //fclose(pos_nb_atoms);
3719 copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3721 if (doEnergy || doVirial) {
3722 if (ATOMIC_BINS > 1) {
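// Energies/virials are accumulated into ATOMIC_BINS replicated copies to
// spread atomic updates across bins; fold them back into bin 0 before the
// copy to the host.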
3723 // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
3724 reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
3726 // virial copy, is this necessary?
3727 copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);