SequencerCUDAKernel.cu
1 #include "common.h"
2 #ifdef NAMD_CUDA
3 
4 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <cub/cub.cuh>
6 #else
7 #include <namd_cub/cub.cuh>
8 #endif
9 
10 #endif //NAMD_CUDA
11 
12 #ifdef NAMD_HIP //NAMD_HIP
13 #include <hip/hip_runtime.h>
14 #include <hipcub/hipcub.hpp>
15 #define cub hipcub
16 #endif
17 
18 #include "HipDefines.h"
19 #include "SequencerCUDAKernel.h"
20 #include "MShakeKernel.h"
21 
22 #ifdef NODEGROUP_FORCE_REGISTER
23 
24 #define OVERALLOC 2.0f
25 
26 void NAMD_die(const char *);
27 
28 
29 __constant__ double c_pairlistTrigger;
30 __constant__ double c_pairlistGrow;
31 __constant__ double c_pairlistShrink;
32 
33 __device__ void reset_atomic_counter(unsigned int *counter) {
34  counter[0] = 0;
35  __threadfence();
36 }
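37 
// NOTE: a minimal sketch (names illustrative, not used directly by the kernels
// below) of the "last block done" pattern that a counter like this supports, as
// in the disabled device-queue block further down: each block atomically bumps a
// grid-wide counter when it finishes, and the last block does the final work and
// resets the counter for the next launch.
#if 0
__global__ void lastBlockDoneSketch(unsigned int* counter) {
  __shared__ bool isLastBlockDone;
  // ... per-block work happens here ...
  __threadfence();  // make this block's results visible before signaling
  if (threadIdx.x == 0) {
    unsigned int value = atomicInc(counter, gridDim.x);
    isLastBlockDone = (value == gridDim.x - 1);
  }
  __syncthreads();
  if (isLastBlockDone && threadIdx.x == 0) {
    // ... grid-wide finalization happens here ...
    reset_atomic_counter(counter);
  }
}
#endif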
37 
38 
39 template <int T_MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
40 __global__ void accumulateForceToSOAKernelMGPU(
41  const int numPatches,
42  CudaLocalRecord* localRecords,
43  const double * __restrict f_bond,
44  const double * __restrict f_bond_nbond,
45  const double * __restrict f_bond_slow,
46  int forceStride,
47  const float4 * __restrict f_nbond,
48  const float4 * __restrict f_nbond_slow,
49  const CudaForce* __restrict f_slow,
50  double * __restrict d_f_global_x,
51  double * __restrict d_f_global_y,
52  double * __restrict d_f_global_z,
53  double * __restrict d_f_normal_x,
54  double * __restrict d_f_normal_y,
55  double * __restrict d_f_normal_z,
56  double * __restrict d_f_nbond_x,
57  double * __restrict d_f_nbond_y,
58  double * __restrict d_f_nbond_z,
59  double * __restrict d_f_slow_x,
60  double * __restrict d_f_slow_y,
61  double * __restrict d_f_slow_z,
62  const int * __restrict patchUnsortOrder,
63  const Lattice lattice,
64  unsigned int** __restrict deviceQueues,
65  unsigned int* __restrict queueCounters,
66  unsigned int* __restrict tbcatomic
67 ) {
68  __shared__ CudaLocalRecord s_record;
69  using AccessType = int32_t;
70  AccessType* s_record_buffer = (AccessType*) &s_record;
71 
72  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
73  // Read in the CudaLocalRecord using multiple threads; the record is loaded as 32-bit words spread across the block.
74  #pragma unroll 1
75  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
76  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
77  }
78  __syncthreads();
79 
80  const int numAtoms = s_record.numAtoms;
81  const int offset = s_record.bufferOffset;
82  const int offsetNB = s_record.bufferOffsetNBPad;
83 
84  double f_slow_x, f_slow_y, f_slow_z;
85  double f_nbond_x, f_nbond_y, f_nbond_z;
86  double f_normal_x, f_normal_y, f_normal_z;
87 
88  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
89  if (T_MAX_FORCE_NUMBER >= 1) {
90  int unorder = patchUnsortOrder[offset + i];
91  // gather from sorted nonbonded force array
92  float4 fnbond = f_nbond[offsetNB + unorder];
93  // set (medium) nonbonded force accumulators
94  f_nbond_x = (double)fnbond.x;
95  f_nbond_y = (double)fnbond.y;
96  f_nbond_z = (double)fnbond.z;
97  if (T_MAX_FORCE_NUMBER == 2) {
98  // gather from sorted nonbonded slow force array
99  float4 fnbslow = f_nbond_slow[offsetNB + unorder];
100  // accumulate slow force contributions from nonbonded calculation
101  f_slow_x = (double)fnbslow.x;
102  f_slow_y = (double)fnbslow.y;
103  f_slow_z = (double)fnbslow.z;
104  }
105  }
106  // gather from strided bond force array
107  // set (fast) normal force accumulators
108  if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
109  // Global forces are stored in d_f_global; add them to the bonded force
110  f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
111  f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
112  f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
113  if (T_DOCUDAGLOBAL) {
114  d_f_global_x[offset + i] = 0;
115  d_f_global_y[offset + i] = 0;
116  d_f_global_z[offset + i] = 0;
117  }
118  }
119  else{
120  f_normal_x = f_bond[offset + i];
121  f_normal_y = f_bond[offset + i + forceStride];
122  f_normal_z = f_bond[offset + i + 2*forceStride];
123  }
124  if (T_MAX_FORCE_NUMBER >= 1) {
125  // gather from strided bond nonbonded force array
126  // accumulate (medium) nonbonded force accumulators
127  f_nbond_x += f_bond_nbond[offset + i];
128  f_nbond_y += f_bond_nbond[offset + i + forceStride];
129  f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
130  if (T_MAX_FORCE_NUMBER == 2) {
131  // gather from strided bond slow force array
132  // accumulate slow force accumulators
133  f_slow_x += f_bond_slow[offset + i];
134  f_slow_y += f_bond_slow[offset + i + forceStride];
135  f_slow_z += f_bond_slow[offset + i + 2*forceStride];
136  }
137  }
138  // set normal, nonbonded, and slow SOA force buffers
139  d_f_normal_x[offset + i] = f_normal_x;
140  d_f_normal_y[offset + i] = f_normal_y;
141  d_f_normal_z[offset + i] = f_normal_z;
142  if (T_MAX_FORCE_NUMBER >= 1) {
143  d_f_nbond_x[offset + i] = f_nbond_x;
144  d_f_nbond_y[offset + i] = f_nbond_y;
145  d_f_nbond_z[offset + i] = f_nbond_z;
146  if (T_MAX_FORCE_NUMBER == 2){
147  d_f_slow_x[offset + i] = f_slow_x;
148  d_f_slow_y[offset + i] = f_slow_y;
149  d_f_slow_z[offset + i] = f_slow_z;
150  }
151  }
152  }
153  __syncthreads();
154  }
155 }
156 
157 // DMC This was in the accumulateForceToSOAKernelMGPU (commented out)
158 #if 0
159  __syncthreads();
160  if(threadIdx.x == 0){
161  // Need another value here
162  unsigned int value = atomicInc(&tbcatomic[4], gridDim.x);
163  isLastBlockDone = (value == (gridDim.x - 1));
164  }
165  __syncthreads();
166  if(isLastBlockDone){
167  // thread0 flags everything
168  if(threadIdx.x == 0){
169  for(int i = 0; i < nDev; i++ ){
170  if (i == devID) continue;
171  unsigned int value = atomicInc(& (queueCounters[i]), nDev);
172  printf(" Device[%d] queue[%d] pos[%d]\n", devID, i, value);
173  // deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
174  deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
175  __threadfence_system();
176  }
177  // sets tcbCounter back to zero
178  tbcatomic[4] = 0;
179  printf(" GPU[%d] finished accumulating SOA buffers\n", devID);
180  }
181  }
182 #endif
183 
184 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
185 __global__ void accumulateForceToSOAKernel(
186  const int numPatches,
187  CudaLocalRecord* localRecords,
188  const double * __restrict f_bond,
189  const double * __restrict f_bond_nbond,
190  const double * __restrict f_bond_slow,
191  int forceStride,
192  const float4 * __restrict f_nbond,
193  const float4 * __restrict f_nbond_slow,
194  const CudaForce* __restrict f_slow,
195  double * __restrict d_f_global_x,
196  double * __restrict d_f_global_y,
197  double * __restrict d_f_global_z,
198  double * __restrict d_f_normal_x,
199  double * __restrict d_f_normal_y,
200  double * __restrict d_f_normal_z,
201  double * __restrict d_f_nbond_x,
202  double * __restrict d_f_nbond_y,
203  double * __restrict d_f_nbond_z,
204  double * __restrict d_f_slow_x,
205  double * __restrict d_f_slow_y,
206  double * __restrict d_f_slow_z,
207  const int * __restrict patchUnsortOrder,
208  const Lattice lattice
209 ) {
210  __shared__ CudaLocalRecord s_record;
211  using AccessType = int32_t;
212  AccessType* s_record_buffer = (AccessType*) &s_record;
213 
214  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
215  // Read in the CudaLocalRecord using multiple threads; the record is loaded as 32-bit words spread across the block.
216  #pragma unroll 1
217  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
218  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
219  }
220  __syncthreads();
221 
222  const int numAtoms = s_record.numAtoms;
223  const int offset = s_record.bufferOffset;
224  const int offsetNB = s_record.bufferOffsetNBPad;
225 
226  double f_slow_x, f_slow_y, f_slow_z;
227  double f_nbond_x, f_nbond_y, f_nbond_z;
228  double f_normal_x, f_normal_y, f_normal_z;
229 
230  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
231  if (MAX_FORCE_NUMBER == 2) {
232  CudaForce f = f_slow[offset + i];
233  double3 f_scaled = lattice.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
234  // set slow force accumulators
235  f_slow_x = f_scaled.x;
236  f_slow_y = f_scaled.y;
237  f_slow_z = f_scaled.z;
238  }
239  if (MAX_FORCE_NUMBER >= 1) {
240  int unorder = patchUnsortOrder[offset + i];
241  // gather from sorted nonbonded force array
242  float4 fnbond = f_nbond[offsetNB + unorder];
243  // set (medium) nonbonded force accumulators
244  f_nbond_x = (double)fnbond.x;
245  f_nbond_y = (double)fnbond.y;
246  f_nbond_z = (double)fnbond.z;
247  if (MAX_FORCE_NUMBER == 2) {
248  // gather from sorted nonbonded slow force array
249  float4 fnbslow = f_nbond_slow[offsetNB + unorder];
250  // accumulate slow force contributions from nonbonded calculation
251  f_slow_x += (double)fnbslow.x;
252  f_slow_y += (double)fnbslow.y;
253  f_slow_z += (double)fnbslow.z;
254  }
255  }
256  // gather from strided bond force array
257  // set (fast) normal force accumulators
258  if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
259  // Global forces are stored in d_f_global; add them to the bonded force
260  f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
261  f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
262  f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
263  if (T_DOCUDAGLOBAL) {
264  d_f_global_x[offset + i] = 0;
265  d_f_global_y[offset + i] = 0;
266  d_f_global_z[offset + i] = 0;
267  }
268  }
269  else
270  {
271  f_normal_x = f_bond[offset + i];
272  f_normal_y = f_bond[offset + i + forceStride];
273  f_normal_z = f_bond[offset + i + 2*forceStride];
274  }
275 
276  if (MAX_FORCE_NUMBER >= 1) {
277  // gather from strided bond nonbonded force array
278  // accumulate (medium) nonbonded force accumulators
279  f_nbond_x += f_bond_nbond[offset + i];
280  f_nbond_y += f_bond_nbond[offset + i + forceStride];
281  f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
282  if (MAX_FORCE_NUMBER == 2) {
283  // gather from strided bond slow force array
284  // accumulate slow force accumulators
285  f_slow_x += f_bond_slow[offset + i];
286  f_slow_y += f_bond_slow[offset + i + forceStride];
287  f_slow_z += f_bond_slow[offset + i + 2*forceStride];
288  }
289  }
290  // set normal, nonbonded, and slow SOA force buffers
291  d_f_normal_x[offset + i] = f_normal_x;
292  d_f_normal_y[offset + i] = f_normal_y;
293  d_f_normal_z[offset + i] = f_normal_z;
294  if (MAX_FORCE_NUMBER >= 1) {
295  d_f_nbond_x[offset + i] = f_nbond_x;
296  d_f_nbond_y[offset + i] = f_nbond_y;
297  d_f_nbond_z[offset + i] = f_nbond_z;
298  if (MAX_FORCE_NUMBER == 2) {
299  d_f_slow_x[offset + i] = f_slow_x;
300  d_f_slow_y[offset + i] = f_slow_y;
301  d_f_slow_z[offset + i] = f_slow_z;
302  }
303  }
304  }
305  __syncthreads();
306  }
307 }
308 
309 __global__ void accumulatePMEForces(
310  const int numAtoms,
311  const CudaForce* f_slow,
312  double* d_f_slow_x,
313  double* d_f_slow_y,
314  double* d_f_slow_z,
315  const int* patchOffsets,
316  const Lattice lat
317 ){
318  int tid = threadIdx.x + (blockDim.x * blockIdx.x);
319  double f_slow_x, f_slow_y, f_slow_z;
320 
321  if(tid < numAtoms){
322  CudaForce f = f_slow[tid];
323 
324  double3 f_scaled = lat.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
325  f_slow_x = f_scaled.x;
326  f_slow_y = f_scaled.y;
327  f_slow_z = f_scaled.z;
328 
329  d_f_slow_x[tid] += f_slow_x;
330  d_f_slow_y[tid] += f_slow_y;
331  d_f_slow_z[tid] += f_slow_z;
332  }
333 }
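334 
// NOTE: illustrative launch sketch for the atom-based kernel above, assuming
// ATOM_BLOCKS is the per-block thread count referenced elsewhere in this file;
// this wrapper does not exist in SequencerCUDAKernel and is shown only to make
// the one-thread-per-atom launch shape explicit.
#if 0
static void launchAccumulatePMEForcesSketch(
    int numAtoms, const CudaForce* f_slow,
    double* d_f_slow_x, double* d_f_slow_y, double* d_f_slow_z,
    const int* patchOffsets, const Lattice lat, cudaStream_t stream) {
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  accumulatePMEForces<<<grid, ATOM_BLOCKS, 0, stream>>>(
      numAtoms, f_slow, d_f_slow_x, d_f_slow_y, d_f_slow_z, patchOffsets, lat);
}
#endif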
334 
335 void SequencerCUDAKernel::accumulateForceToSOA(
336  const int doGlobal,
337  const int doCudaGlobal,
338  const int maxForceNumber,
339  const int numPatches,
340  const int nDevices,
341  CudaLocalRecord* localRecords,
342  const double* f_bond,
343  const double* f_bond_nbond,
344  const double* f_bond_slow,
345  int forceStride,
346  const float4* f_nbond,
347  const float4* f_nbond_slow,
348  const CudaForce* f_slow,
349  double* d_f_global_x,
350  double* d_f_global_y,
351  double* d_f_global_z,
352  double* d_f_normal_x,
353  double* d_f_normal_y,
354  double* d_f_normal_z,
355  double* d_f_nbond_x,
356  double* d_f_nbond_y,
357  double* d_f_nbond_z,
358  double* d_f_slow_x,
359  double* d_f_slow_y,
360  double* d_f_slow_z,
361  const int* patchUnsortOrder,
362  const Lattice lattice,
363  unsigned int** deviceQueues,
364  unsigned int* queueCounters,
365  unsigned int* tbcatomic,
366  cudaStream_t stream
367 ) {
368  // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
369  const int options = (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal;
370  if (nDevices > 1) {
371 #define CALL_MGPU(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
372  accumulateForceToSOAKernelMGPU<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
373  <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
374  ( numPatches, localRecords, \
375  f_bond, f_bond_nbond, f_bond_slow, forceStride, \
376  f_nbond, f_nbond_slow, f_slow, \
377  d_f_global_x, d_f_global_y, d_f_global_z, \
378  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
379  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
380  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
381  patchUnsortOrder, lattice, \
382  deviceQueues, queueCounters, tbcatomic \
383  )
384  switch (options) {
385  case ((0<<2) + (0<<1) + (0<<0)): CALL_MGPU(0, 0, 0); break;
386  case ((0<<2) + (0<<1) + (1<<0)): CALL_MGPU(0, 0, 1); break;
387  case ((0<<2) + (1<<1) + (1<<0)): CALL_MGPU(0, 1, 1); break;
388  case ((0<<2) + (1<<1) + (0<<0)): CALL_MGPU(0, 1, 0); break;
389  case ((1<<2) + (0<<1) + (0<<0)): CALL_MGPU(1, 0, 0); break;
390  case ((1<<2) + (0<<1) + (1<<0)): CALL_MGPU(1, 0, 1); break;
391  case ((1<<2) + (1<<1) + (1<<0)): CALL_MGPU(1, 1, 1); break;
392  case ((1<<2) + (1<<1) + (0<<0)): CALL_MGPU(1, 1, 0); break;
393  case ((2<<2) + (0<<1) + (0<<0)): CALL_MGPU(2, 0, 0); break;
394  case ((2<<2) + (0<<1) + (1<<0)): CALL_MGPU(2, 0, 1); break;
395  case ((2<<2) + (1<<1) + (1<<0)): CALL_MGPU(2, 1, 1); break;
396  case ((2<<2) + (1<<1) + (0<<0)): CALL_MGPU(2, 1, 0); break;
397  default: {
398  const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
399  NAMD_bug(error.c_str());
400  }
401  }
402 #undef CALL_MGPU
403  } else {
404 #define CALL(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
405  accumulateForceToSOAKernel<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
406  <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
407  ( numPatches, localRecords, \
408  f_bond, f_bond_nbond, f_bond_slow, forceStride, \
409  f_nbond, f_nbond_slow, f_slow, \
410  d_f_global_x, d_f_global_y, d_f_global_z, \
411  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
412  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
413  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
414  patchUnsortOrder, lattice \
415  )
416  switch (options) {
417  case ((0<<2) + (0<<1) + (0<<0)): CALL(0, 0, 0); break;
418  case ((0<<2) + (0<<1) + (1<<0)): CALL(0, 0, 1); break;
419  case ((0<<2) + (1<<1) + (1<<0)): CALL(0, 1, 1); break;
420  case ((0<<2) + (1<<1) + (0<<0)): CALL(0, 1, 0); break;
421  case ((1<<2) + (0<<1) + (0<<0)): CALL(1, 0, 0); break;
422  case ((1<<2) + (0<<1) + (1<<0)): CALL(1, 0, 1); break;
423  case ((1<<2) + (1<<1) + (1<<0)): CALL(1, 1, 1); break;
424  case ((1<<2) + (1<<1) + (0<<0)): CALL(1, 1, 0); break;
425  case ((2<<2) + (0<<1) + (0<<0)): CALL(2, 0, 0); break;
426  case ((2<<2) + (0<<1) + (1<<0)): CALL(2, 0, 1); break;
427  case ((2<<2) + (1<<1) + (1<<0)): CALL(2, 1, 1); break;
428  case ((2<<2) + (1<<1) + (0<<0)): CALL(2, 1, 0); break;
429  default: {
430  const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
431  NAMD_bug(error.c_str());
432  }
433  }
434 #undef CALL
435  }
436 }
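437 
// NOTE: minimal sketch of the option encoding used by the switch above
// (maxForceNumber in bits 2 and up, doGlobal in bit 1, doCudaGlobal in bit 0).
// The helper below is hypothetical and only mirrors the case labels.
#if 0
constexpr int encodeSOAOptions(int maxForceNumber, int doGlobal, int doCudaGlobal) {
  return (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal;
}
static_assert(encodeSOAOptions(2, 1, 0) == ((2<<2) + (1<<1) + (0<<0)),
              "encoding matches the case labels above");
#endif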
437 
438 template <int T_MAX_FORCE_NUMBER>
439 __global__ void mergeForcesFromPeersKernel(
440  const int numPatches,
441  const int devID, // GPU ID
442  CudaLocalRecord* localRecords,
443  CudaPeerRecord* peerRecords,
444  const Lattice lat,
445  double** __restrict f_normal_x,
446  double** __restrict f_normal_y,
447  double** __restrict f_normal_z,
448  double** __restrict f_nbond_x,
449  double** __restrict f_nbond_y,
450  double** __restrict f_nbond_z,
451  double** __restrict f_slow_x,
452  double** __restrict f_slow_y,
453  double** __restrict f_slow_z,
454  const CudaForce* __restrict pmeForces
455 ) {
456  __shared__ CudaLocalRecord s_record;
457  using AccessType = int32_t;
458  AccessType* s_record_buffer = (AccessType*) &s_record;
459 
460  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
461  // Read in the CudaLocalRecord using multiple threads; the record is loaded as 32-bit words spread across the block.
462  #pragma unroll 1
463  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
464  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
465  }
466  __syncthreads();
467 
468  const int numAtoms = s_record.numAtoms;
469  const int dstOffset = s_record.bufferOffset;
470  const int numPeerRecords = s_record.numPeerRecord;
471  const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
472 
473  const int inlinePeers = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
474 
475  for (int peer = 0; peer < inlinePeers; peer++) {
476  const int srcDevice = s_record.inline_peers[peer].deviceIndex;
477  const int srcOffset = s_record.inline_peers[peer].bufferOffset;
478 
479  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
480  double f_x = f_normal_x[srcDevice][srcOffset + i];
481  double f_y = f_normal_y[srcDevice][srcOffset + i];
482  double f_z = f_normal_z[srcDevice][srcOffset + i];
483 
484  f_normal_x[devID][dstOffset + i] += f_x;
485  f_normal_y[devID][dstOffset + i] += f_y;
486  f_normal_z[devID][dstOffset + i] += f_z;
487 
488  if(T_MAX_FORCE_NUMBER >= 1){
489  // We don't need the nonbonded offset here since
490  // this isn't the actual nonbonded buffer
491  f_x = f_nbond_x[srcDevice][srcOffset + i];
492  f_y = f_nbond_y[srcDevice][srcOffset + i];
493  f_z = f_nbond_z[srcDevice][srcOffset + i];
494 
495  f_nbond_x[devID][dstOffset + i] += f_x;
496  f_nbond_y[devID][dstOffset + i] += f_y;
497  f_nbond_z[devID][dstOffset + i] += f_z;
498 
499  if(T_MAX_FORCE_NUMBER == 2){
500  f_x = f_slow_x[srcDevice][srcOffset + i];
501  f_y = f_slow_y[srcDevice][srcOffset + i];
502  f_z = f_slow_z[srcDevice][srcOffset + i];
503 
504  f_slow_x[devID][dstOffset + i] += f_x;
505  f_slow_y[devID][dstOffset + i] += f_y;
506  f_slow_z[devID][dstOffset + i] += f_z;
507  }
508  }
509  }
510  }
511 
512  for (int peer = inlinePeers; peer < numPeerRecords; peer++) {
513  const int srcDevice = peerRecords[peerRecordStartIndexs + peer].deviceIndex;
514  const int srcOffset = peerRecords[peerRecordStartIndexs + peer].bufferOffset;
515 
516  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
517  double f_x = f_normal_x[srcDevice][srcOffset + i];
518  double f_y = f_normal_y[srcDevice][srcOffset + i];
519  double f_z = f_normal_z[srcDevice][srcOffset + i];
520 
521  f_normal_x[devID][dstOffset + i] += f_x;
522  f_normal_y[devID][dstOffset + i] += f_y;
523  f_normal_z[devID][dstOffset + i] += f_z;
524 
525  if(T_MAX_FORCE_NUMBER >= 1){
526  // We don't need the nonbonded offset here since
527  // this isn't the actual nonbonded buffer
528  f_x = f_nbond_x[srcDevice][srcOffset + i];
529  f_y = f_nbond_y[srcDevice][srcOffset + i];
530  f_z = f_nbond_z[srcDevice][srcOffset + i];
531 
532  f_nbond_x[devID][dstOffset + i] += f_x;
533  f_nbond_y[devID][dstOffset + i] += f_y;
534  f_nbond_z[devID][dstOffset + i] += f_z;
535 
536  if(T_MAX_FORCE_NUMBER == 2){
537  f_x = f_slow_x[srcDevice][srcOffset + i];
538  f_y = f_slow_y[srcDevice][srcOffset + i];
539  f_z = f_slow_z[srcDevice][srcOffset + i];
540 
541  f_slow_x[devID][dstOffset + i] += f_x;
542  f_slow_y[devID][dstOffset + i] += f_y;
543  f_slow_z[devID][dstOffset + i] += f_z;
544  }
545  }
546  }
547  }
548 
549  // Merge PME forces here instead of in the force-fetching kernel
550  if(T_MAX_FORCE_NUMBER == 2){
551  for(int i = threadIdx.x; i < numAtoms; i += blockDim.x){
552  CudaForce f = pmeForces[dstOffset + i];
553 
554  double3 f_scaled = lat.scale_force(
555  Vector((double)f.x, (double)f.y, (double)f.z));
556  f_slow_x[devID][dstOffset + i] += f_scaled.x;
557  f_slow_y[devID][dstOffset + i] += f_scaled.y;
558  f_slow_z[devID][dstOffset + i] += f_scaled.z;
559  }
560  }
561  __syncthreads();
562  }
563 }
564 
565 void SequencerCUDAKernel::mergeForcesFromPeers(
566  const int devID,
567  const int maxForceNumber,
568  const Lattice lat,
569  const int numPatchesHomeAndProxy,
570  const int numPatchesHome,
571  double** f_normal_x,
572  double** f_normal_y,
573  double** f_normal_z,
574  double** f_nbond_x,
575  double** f_nbond_y,
576  double** f_nbond_z,
577  double** f_slow_x,
578  double** f_slow_y,
579  double** f_slow_z,
580  const CudaForce* pmeForces,
581  CudaLocalRecord* localRecords,
582  CudaPeerRecord* peerRecords,
583  std::vector<int>& atomCounts,
584  cudaStream_t stream
585 ){
586  // patch-based launch: one thread block per home patch
587  //int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS; // atom-based alternative
588  int grid = (numPatchesHome);
589  int blocks = PATCH_BLOCKS;
590  // (a single-thread grid can be substituted here for debugging)
591 
592  int offset = 0;
593  for (int i = 0; i < devID; i++) {
594  offset += atomCounts[i];
595  }
596 
597  switch (maxForceNumber) {
598  case 0:
599  mergeForcesFromPeersKernel<0><<<grid, blocks, 0, stream>>>(
600  numPatchesHome, devID, localRecords, peerRecords,
601  lat,
602  f_normal_x, f_normal_y, f_normal_z,
603  f_nbond_x, f_nbond_y, f_nbond_z,
604  f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
605  );
606  break;
607  case 1:
608  mergeForcesFromPeersKernel<1><<<grid, blocks, 0, stream>>>(
609  numPatchesHome, devID, localRecords, peerRecords,
610  lat,
611  f_normal_x, f_normal_y, f_normal_z,
612  f_nbond_x, f_nbond_y, f_nbond_z,
613  f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
614  );
615  break;
616  case 2:
617  mergeForcesFromPeersKernel<2><<<grid, blocks, 0, stream>>>(
618  numPatchesHome, devID, localRecords, peerRecords,
619  lat,
620  f_normal_x, f_normal_y, f_normal_z,
621  f_nbond_x, f_nbond_y, f_nbond_z,
622  f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
623  );
624  break;
625  }
626 }
627 
628 // JM NOTE: This is a fused version of accumulateForceToSOA + addForceToMomentum.
629 // addForceToMomentum barely has any math and is memory-bandwidth bound, so
630 // fusing these kernels lets us keep the forces in registers and
631 // reuse their values for the velocity update
632 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL, int DO_FIXED>
633 __global__ void accumulateForceKick(
634  const int numPatches,
635  CudaLocalRecord* localRecords,
636  const double * __restrict f_bond,
637  const double * __restrict f_bond_nbond,
638  const double * __restrict f_bond_slow,
639  int forceStride,
640  const float4 * __restrict f_nbond,
641  const float4 * __restrict f_nbond_slow,
642  const CudaForce* __restrict f_slow,
643  double * __restrict d_f_global_x,
644  double * __restrict d_f_global_y,
645  double * __restrict d_f_global_z,
646  double * __restrict d_f_normal_x,
647  double * __restrict d_f_normal_y,
648  double * __restrict d_f_normal_z,
649  double * __restrict d_f_nbond_x,
650  double * __restrict d_f_nbond_y,
651  double * __restrict d_f_nbond_z,
652  double * __restrict d_f_slow_x,
653  double * __restrict d_f_slow_y,
654  double * __restrict d_f_slow_z,
655  const int * __restrict patchUnsortOrder,
656  const Lattice lattice,
657  double *d_vel_x,
658  double *d_vel_y,
659  double *d_vel_z,
660  const double * __restrict recipMass,
661  const int* __restrict atomFixed,
662  const double dt_normal,
663  const double dt_nbond,
664  const double dt_slow,
665  const double scaling
666 ) {
667  __shared__ CudaLocalRecord s_record;
668  using AccessType = int32_t;
669  AccessType* s_record_buffer = (AccessType*) &s_record;
670 
671  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
672  // Read in the CudaLocalRecord using multiple threads; the record is loaded as 32-bit words spread across the block.
673  #pragma unroll 1
674  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
675  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
676  }
677  __syncthreads();
678 
679  const int numAtoms = s_record.numAtoms;
680  const int offset = s_record.bufferOffset;
681  const int offsetNB = s_record.bufferOffsetNBPad;
682 
683  double f_slow_x, f_slow_y, f_slow_z;
684  double f_nbond_x, f_nbond_y, f_nbond_z;
685  double f_normal_x, f_normal_y, f_normal_z;
686 
687  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
688  f_slow_x = 0.0;
689  f_slow_y = 0.0;
690  f_slow_z = 0.0;
691  f_nbond_x = 0.0;
692  f_nbond_y = 0.0;
693  f_nbond_z = 0.0;
694  f_normal_x = 0.0;
695  f_normal_y = 0.0;
696  f_normal_z = 0.0;
697 
698  if (MAX_FORCE_NUMBER == 2) {
699  CudaForce f = f_slow[offset + i];
700 
701  double3 f_scaled = lattice.scale_force(
702  Vector((double)f.x, (double)f.y, (double)f.z));
703 
704  // set slow force accumulators
705  f_slow_x = f_scaled.x;
706  f_slow_y = f_scaled.y;
707  f_slow_z = f_scaled.z;
708  }
709  if (MAX_FORCE_NUMBER >= 1) {
710  int unorder = patchUnsortOrder[offset + i];
711  // gather from sorted nonbonded force array
712  float4 fnbond = f_nbond[offsetNB + unorder];
713  // set (medium) nonbonded force accumulators
714  f_nbond_x = (double)fnbond.x;
715  f_nbond_y = (double)fnbond.y;
716  f_nbond_z = (double)fnbond.z;
717  if (MAX_FORCE_NUMBER == 2) {
718  // gather from sorted nonbonded slow force array
719  float4 fnbslow = f_nbond_slow[offsetNB + unorder];
720  // accumulate slow force contributions from nonbonded calculation
721  f_slow_x += (double)fnbslow.x;
722  f_slow_y += (double)fnbslow.y;
723  f_slow_z += (double)fnbslow.z;
724  }
725  }
726  // gather from strided bond force array
727  // set (fast) normal force accumulators
728  if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
729  // Global forces are stored in d_f_global; add them to the bonded force
730  f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
731  f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
732  f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
733  if (T_DOCUDAGLOBAL) {
734  d_f_global_x[offset + i] = 0;
735  d_f_global_y[offset + i] = 0;
736  d_f_global_z[offset + i] = 0;
737  }
738  }
739  else
740  {
741 
742  f_normal_x = f_bond[offset + i];
743  f_normal_y = f_bond[offset + i + forceStride];
744  f_normal_z = f_bond[offset + i + 2*forceStride];
745  }
746 
747  if (MAX_FORCE_NUMBER >= 1) {
748  // gather from strided bond nonbonded force array
749  // accumulate (medium) nonbonded force accumulators
750  f_nbond_x += f_bond_nbond[offset + i];
751  f_nbond_y += f_bond_nbond[offset + i + forceStride];
752  f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
753  if (MAX_FORCE_NUMBER == 2) {
754  // gather from strided bond slow force array
755  // accumulate slow force accumulators
756  f_slow_x += f_bond_slow[offset + i];
757  f_slow_y += f_bond_slow[offset + i + forceStride];
758  f_slow_z += f_bond_slow[offset + i + 2*forceStride];
759  }
760  }
761  // set normal, nonbonded, and slow SOA force buffers
762  d_f_normal_x[offset + i] = f_normal_x;
763  d_f_normal_y[offset + i] = f_normal_y;
764  d_f_normal_z[offset + i] = f_normal_z;
765  if (MAX_FORCE_NUMBER >= 1) {
766  d_f_nbond_x[offset + i] = f_nbond_x;
767  d_f_nbond_y[offset + i] = f_nbond_y;
768  d_f_nbond_z[offset + i] = f_nbond_z;
769  if (MAX_FORCE_NUMBER == 2) {
770  d_f_slow_x[offset + i] = f_slow_x;
771  d_f_slow_y[offset + i] = f_slow_y;
772  d_f_slow_z[offset + i] = f_slow_z;
773  }
774  }
775 
776  /* Velocity updates */
777  double rp = recipMass[offset + i];
778  double vx, vy, vz;
779  vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
780  vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
781  vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
782  if (!DO_FIXED) {
783  d_vel_x[offset + i] += vx;
784  d_vel_y[offset + i] += vy;
785  d_vel_z[offset + i] += vz;
786  } else {
787  if (!atomFixed[offset + i]) {
788  d_vel_x[offset + i] += vx;
789  d_vel_y[offset + i] += vy;
790  d_vel_z[offset + i] += vz;
791  } else {
792  d_vel_x[offset + i] = 0;
793  d_vel_y[offset + i] = 0;
794  d_vel_z[offset + i] = 0;
795  }
796  }
797  }
798  __syncthreads();
799  }
800 }
801 
802 
803 #if 0
804 template <int MAX_FORCE_NUMBER>
805 __global__ void accumulateForceKick(
806  const double * __restrict f_bond,
807  const double * __restrict f_bond_nbond,
808  const double * __restrict f_bond_slow,
809  int forceStride,
810  const PatchRecord * __restrict bond_pr,
811  const float4 * __restrict f_nbond,
812  const float4 * __restrict f_nbond_slow,
813  const CudaPatchRecord * __restrict nbond_pr,
814  const CudaForce * __restrict f_slow,
815  double *d_f_normal_x,
816  double *d_f_normal_y,
817  double *d_f_normal_z,
818  double *d_f_nbond_x,
819  double *d_f_nbond_y,
820  double *d_f_nbond_z,
821  double *d_f_slow_x,
822  double *d_f_slow_y,
823  double *d_f_slow_z,
824  double *d_vel_x,
825  double *d_vel_y,
826  double *d_vel_z,
827  const double * __restrict recipMass,
828  const double dt_normal,
829  const double dt_nbond,
830  const double dt_slow,
831  const double scaling,
832  const int * __restrict patchIDs,
833  const int * __restrict patchOffsets,
834  const int * __restrict patchUnsortOrder,
835  const CudaLattice lat)
836 {
837  int tid = threadIdx.x;
838  int stride = blockDim.x;
839 
840  __shared__ int sh_patchID;
841  __shared__ int sh_patchOffset;
842  // number of atoms per patch should be same no matter
843  // which data structure we access it through
844  __shared__ int sh_natoms;
845  __shared__ int sh_bondForceOffset;
846  __shared__ int sh_nbondForceOffset;
847 
848  // int patchID;
849  int patchOffset;
850  // number of atoms per patch should be same no matter
851  // which data structure we access it through
852  int natoms;
853  int bondForceOffset;
854  int nbondForceOffset;
855 
856  if(threadIdx.x == 0){
857  sh_patchID = patchIDs[blockIdx.x];
858  sh_patchOffset = patchOffsets[blockIdx.x];
859  // number of atoms per patch should be same no matter
860  // which data structure we access it through
861  sh_natoms = bond_pr[sh_patchID].numAtoms;
862  sh_bondForceOffset = bond_pr[sh_patchID].atomStart;
863  if(MAX_FORCE_NUMBER >= 1){
864  sh_nbondForceOffset = nbond_pr[sh_patchID].atomStart;
865  }
866  }
867 
868  __syncthreads();
869 
870  // pull stuff to registers
871  // patchID = sh_patchID;
872  patchOffset = sh_patchOffset;
873  natoms = sh_natoms;
874  bondForceOffset = sh_bondForceOffset;
875  nbondForceOffset = (MAX_FORCE_NUMBER >= 1) ? sh_nbondForceOffset : 0;
876 
877  double r1x, r1y, r1z, r2x, r2y, r2z, r3x, r3y, r3z;
878 
879  if (MAX_FORCE_NUMBER == 2) {
880  r1x = lat.b1.x;
881  r1y = lat.b1.y;
882  r1z = lat.b1.z;
883  r2x = lat.b2.x;
884  r2y = lat.b2.y;
885  r2z = lat.b2.z;
886  r3x = lat.b3.x;
887  r3y = lat.b3.y;
888  r3z = lat.b3.z;
889  }
890 
891  double f_slow_x, f_slow_y, f_slow_z;
892  double f_nbond_x, f_nbond_y, f_nbond_z;
893  double f_normal_x, f_normal_y, f_normal_z;
894  double vx, vy, vz;
895 
896  for (int i = tid; i < natoms; i += stride) {
897  f_slow_x = 0.0;
898  f_slow_y = 0.0;
899  f_slow_z = 0.0;
900  f_nbond_x = 0.0;
901  f_nbond_y = 0.0;
902  f_nbond_z = 0.0;
903  f_normal_x = 0.0;
904  f_normal_y = 0.0;
905  f_normal_z = 0.0;
906 
907  if(T_DOGLOBAL) {
908  // Global forces are stored in d_f_normal; just need to add the bonded force
909  f_normal_x = d_f_normal_x[patchOffset + i];
910  f_normal_y = d_f_normal_y[patchOffset + i];
911  f_normal_z = d_f_normal_z[patchOffset + i];
912  }
913 
914  if (MAX_FORCE_NUMBER == 2) {
915  CudaForce f = f_slow[patchOffset + i];
916  double fx = (double) f.x;
917  double fy = (double) f.y;
918  double fz = (double) f.z;
919  // set slow force accumulators
920  f_slow_x = (fx*r1x + fy*r2x + fz*r3x);
921  f_slow_y = (fx*r1y + fy*r2y + fz*r3y);
922  f_slow_z = (fx*r1z + fy*r2z + fz*r3z);
923  }
924  if (MAX_FORCE_NUMBER >= 1) {
925  int unorder = patchUnsortOrder[patchOffset + i];
926  // gather from sorted nonbonded force array
927  float4 fnbond = f_nbond[nbondForceOffset + unorder];
928  // set (medium) nonbonded force accumulators
929  f_nbond_x = (double)fnbond.x;
930  f_nbond_y = (double)fnbond.y;
931  f_nbond_z = (double)fnbond.z;
932  if (MAX_FORCE_NUMBER == 2) {
933  // gather from sorted nonbonded slow force array
934  float4 fnbslow = f_nbond_slow[nbondForceOffset + unorder];
935  // accumulate slow force contributions from nonbonded calculation
936  f_slow_x += (double)fnbslow.x;
937  f_slow_y += (double)fnbslow.y;
938  f_slow_z += (double)fnbslow.z;
939  }
940  }
941  // gather from strided bond force array
942  // set (fast) normal force accumulators
943  f_normal_x += f_bond[bondForceOffset + i];
944  f_normal_y += f_bond[bondForceOffset + i + forceStride];
945  f_normal_z += f_bond[bondForceOffset + i + 2*forceStride];
946  if (MAX_FORCE_NUMBER >= 1) {
947  // gather from strided bond nonbonded force array
948  // accumulate (medium) nonbonded force accumulators
949  f_nbond_x += f_bond_nbond[bondForceOffset + i];
950  f_nbond_y += f_bond_nbond[bondForceOffset + i + forceStride];
951  f_nbond_z += f_bond_nbond[bondForceOffset + i + 2*forceStride];
952  if (MAX_FORCE_NUMBER == 2) {
953  // gather from strided bond slow force array
954  // accumulate slow force accumulators
955  f_slow_x += f_bond_slow[bondForceOffset + i];
956  f_slow_y += f_bond_slow[bondForceOffset + i + forceStride];
957  f_slow_z += f_bond_slow[bondForceOffset + i + 2*forceStride];
958  }
959  }
960  // set normal, nonbonded, and slow SOA force buffers
961  d_f_normal_x[patchOffset + i] = f_normal_x;
962  d_f_normal_y[patchOffset + i] = f_normal_y;
963  d_f_normal_z[patchOffset + i] = f_normal_z;
964 
965  if (MAX_FORCE_NUMBER >= 1) {
966  d_f_nbond_x[patchOffset + i] = f_nbond_x;
967  d_f_nbond_y[patchOffset + i] = f_nbond_y;
968  d_f_nbond_z[patchOffset + i] = f_nbond_z;
969  if (MAX_FORCE_NUMBER == 2) {
970  d_f_slow_x[patchOffset + i] = f_slow_x;
971  d_f_slow_y[patchOffset + i] = f_slow_y;
972  d_f_slow_z[patchOffset + i] = f_slow_z;
973  }
974  }
975 
976  /* Velocity updates */
977  double rp = recipMass[patchOffset + i];
978  vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
979  vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
980  vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
981 
982  d_vel_x[patchOffset + i] += vx;
983  d_vel_y[patchOffset + i] += vy;
984  d_vel_z[patchOffset + i] += vz;
985  }
986 }
987 #endif
988 
989 
990 void SequencerCUDAKernel::accumulate_force_kick(
991  const bool doFixedAtoms,
992  const int doGlobal,
993  const int doCudaGlobal,
994  const int maxForceNumber,
995  const int numPatches,
996  CudaLocalRecord* localRecords,
997  const double* f_bond,
998  const double* f_bond_nbond,
999  const double* f_bond_slow,
1000  int forceStride,
1001  const float4* f_nbond,
1002  const float4* f_nbond_slow,
1003  const CudaForce* f_slow,
1004  double* d_f_global_x,
1005  double* d_f_global_y,
1006  double* d_f_global_z,
1007  double* d_f_normal_x,
1008  double* d_f_normal_y,
1009  double* d_f_normal_z,
1010  double* d_f_nbond_x,
1011  double* d_f_nbond_y,
1012  double* d_f_nbond_z,
1013  double* d_f_slow_x,
1014  double* d_f_slow_y,
1015  double* d_f_slow_z,
1016  double* d_vel_x,
1017  double* d_vel_y,
1018  double* d_vel_z,
1019  const double* recipMass,
1020  const int* d_atomFixed,
1021  const double dt_normal,
1022  const double dt_nbond,
1023  const double dt_slow,
1024  const double scaling,
1025  const int* patchUnsortOrder,
1026  const Lattice lattice,
1027  cudaStream_t stream
1028 ) {
1029  // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
1030  const int options = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms);
1031 #define CALL(MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED) \
1032  accumulateForceKick<MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED> \
1033  <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
1034  ( numPatches, localRecords, \
1035  f_bond, f_bond_nbond, f_bond_slow, forceStride, \
1036  f_nbond, f_nbond_slow, f_slow, \
1037  d_f_global_x, d_f_global_y, d_f_global_z, \
1038  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
1039  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
1040  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
1041  patchUnsortOrder, lattice, \
1042  d_vel_x, d_vel_y, d_vel_z, recipMass, \
1043  d_atomFixed, dt_normal, dt_nbond, dt_slow, scaling)
1044  /*
1045  * Generated by the following python script:
1046  * #!/usr/bin/env python3
1047  * for maxForceNumber in range(3):
1048  * for doGlobal in range(2):
1049  * for doCudaGlobal in range(2):
1050  * for doFixedAtoms in range(2):
1051  * option = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms)
1052  * print(f' case {option}: CALL ({maxForceNumber}, {doGlobal}, {doCudaGlobal}, {doFixedAtoms}); break;')
1053  */
1054  switch (options) {
1055  case 0: CALL (0, 0, 0, 0); break;
1056  case 1: CALL (0, 0, 0, 1); break;
1057  case 2: CALL (0, 0, 1, 0); break;
1058  case 3: CALL (0, 0, 1, 1); break;
1059  case 4: CALL (0, 1, 0, 0); break;
1060  case 5: CALL (0, 1, 0, 1); break;
1061  case 6: CALL (0, 1, 1, 0); break;
1062  case 7: CALL (0, 1, 1, 1); break;
1063  case 8: CALL (1, 0, 0, 0); break;
1064  case 9: CALL (1, 0, 0, 1); break;
1065  case 10: CALL (1, 0, 1, 0); break;
1066  case 11: CALL (1, 0, 1, 1); break;
1067  case 12: CALL (1, 1, 0, 0); break;
1068  case 13: CALL (1, 1, 0, 1); break;
1069  case 14: CALL (1, 1, 1, 0); break;
1070  case 15: CALL (1, 1, 1, 1); break;
1071  case 16: CALL (2, 0, 0, 0); break;
1072  case 17: CALL (2, 0, 0, 1); break;
1073  case 18: CALL (2, 0, 1, 0); break;
1074  case 19: CALL (2, 0, 1, 1); break;
1075  case 20: CALL (2, 1, 0, 0); break;
1076  case 21: CALL (2, 1, 0, 1); break;
1077  case 22: CALL (2, 1, 1, 0); break;
1078  case 23: CALL (2, 1, 1, 1); break;
1079  default: {
1080  const std::string error =
1081  "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " +
1082  std::to_string(maxForceNumber) +
1083  ", doGlobal = " + std::to_string(doGlobal) +
1084  ", doFixedAtoms = " + std::to_string(doFixedAtoms) +
1085  ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
1086  NAMD_bug(error.c_str());
1087  }
1088  }
1089 #undef CALL
1090 }
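1091 
// NOTE: minimal sketch of the stand-alone velocity kick that accumulateForceKick
// folds into the force accumulation (see the JM NOTE above): per atom it reads a
// few force components and the reciprocal mass and does almost no arithmetic,
// which is why fusing it avoids an extra round trip through device memory.
// Names are illustrative, not an existing NAMD kernel.
#if 0
__global__ void kickOnlySketch(
  const int numAtoms,
  const double* __restrict f_x, const double* __restrict f_y, const double* __restrict f_z,
  const double* __restrict recipMass,
  const double scaling, const double dt,
  double* vel_x, double* vel_y, double* vel_z) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numAtoms) {
    const double s = scaling * dt * recipMass[i];
    vel_x[i] += f_x[i] * s;
    vel_y[i] += f_y[i] * s;
    vel_z[i] += f_z[i] * s;
  }
}
#endif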
1091 
1092 template <bool doSlow>
1093 __global__ void copyFNBondToSOA(float4* __restrict__ f_nbond,
1094  float4* __restrict__ f_nbond_slow,
1095  double* f_nbond_x,
1096  double* f_nbond_y,
1097  double* f_nbond_z,
1098  double* f_slow_x,
1099  double* f_slow_y,
1100  double* f_slow_z,
1101  const int* __restrict__ patchIDs,
1102  const int* __restrict__ patchOffsets,
1103  const int* __restrict__ patchUnsortOrder,
1104  const CudaPatchRecord* __restrict__ nbondIndexPerPatch)
1105 {
1106  int unorder;
1107  int stride = blockDim.x;
1108  int pid = patchIDs[blockIdx.x];
1109 
1110  // nbondIndexPerPatch is organized per local patch ID on the compute.
1111  // It is also device-wide, which means it may hold more patches than
1112  // PatchMap, and its ordering is different, so the global PatchID cannot
1113  // be used directly to index nbondIndexPerPatch; something else is needed.
1114  // Each patchID from PatchMap needs to map onto an entry of nbondIndexPerPatch,
1115  // which means patchIDs alone are not enough: a different ordering, taken
1116  // straight from the computes, is required. nbondIndexPerPatch is the same for
1117  // both multi- and single-PE runs; what changes is the force storage.
1118  // The only thing that needs updating is the use of the local compute index
1119  // instead of the global patchID.
1120  // The local compute index is therefore required.
1121  // Patches never migrate, so this data structure should never change.
1122  // The mapping can be handled by
1123  // building an index in PatchMap with the ordering used inside the computes.
1124  int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1125  int patchOffset = patchOffsets[blockIdx.x];
1126  int natom = nbondIndexPerPatch[pid].numAtoms;
1127  for(int i = threadIdx.x; i < natom; i+= stride){
1128  unorder = patchUnsortOrder[patchOffset + i];
1129  float4 force = f_nbond[globalForceOffset + unorder];
1130  // SOA GLOBAL
1131  f_nbond_x[patchOffset + i] = (double)force.x;
1132  f_nbond_y[patchOffset + i] = (double)force.y;
1133  f_nbond_z[patchOffset + i] = (double)force.z;
1134 
1135  if(doSlow){
1136  // XXX Accumulation here works because each f_slow_* buffer
1137  // is first initialized in copyFSlowToSOA.
1138  float4 f_nb_slow = f_nbond_slow[globalForceOffset + unorder];
1139  f_slow_x[patchOffset + i] += (double)f_nb_slow.x;
1140  f_slow_y[patchOffset + i] += (double)f_nb_slow.y;
1141  f_slow_z[patchOffset + i] += (double)f_nb_slow.z;
1142  }
1143  }
1144 }
1145 
1146 void SequencerCUDAKernel::copy_nbond_forces(int numPatches,
1147  float4* f_nbond,
1148  float4* f_nbond_slow,
1149  double* f_nbond_x,
1150  double* f_nbond_y,
1151  double* f_nbond_z,
1152  double* f_slow_x,
1153  double* f_slow_y,
1154  double* f_slow_z,
1155  const int* patchIDs,
1156  const int* patchOffsets,
1157  const int* patchUnsortOrder,
1158  const CudaPatchRecord* pr,
1159  const bool doSlow,
1160  cudaStream_t stream){
1161 
1162  if(doSlow){
1163  copyFNBondToSOA<1><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1164  f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1165  patchIDs, patchOffsets, patchUnsortOrder, pr);
1166  }else {
1167  copyFNBondToSOA<0><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1168  f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1169  patchIDs, patchOffsets, patchUnsortOrder, pr);
1170  }
1171 }
1172 
1173 #if 0
1174 // this function will fuse copyFNBond, copyFbond, addForceToMomentum and updateVelocities (hopefully)
1175 // XXX TODO: missing slow forces support
1176 template<bool T_DONBOND, bool T_DOSLOW>
1177 __global__ void fetchForcesAndUpdateVelocitiesKernel(
1178  const double scaling,
1179  double dt_normal,
1180  double dt_nbond,
1181  double dt_slow,
1182  /* copyFbondToSOA parameters*/
1183  const double* __restrict__ f_bond, // forces from bond compute
1184  const double* __restrict__ f_bond_nbond, // forces from bond compute
1185  double* __restrict__ f_bond_x,
1186  double* __restrict__ f_bond_y,
1187  double* __restrict__ f_bond_z,
1188  double* __restrict__ f_nbond_x,
1189  double* __restrict__ f_nbond_y,
1190  double* __restrict__ f_nbond_z,
1191  double* __restrict__ f_slow_x,
1192  double* __restrict__ f_slow_y,
1193  double* __restrict__ f_slow_z,
1194  int forceStride,
1195  const PatchRecord* __restrict__ b_pr,
1196  const int* __restrict__ patchIDs,
1197  const int* __restrict__ patchOffsets,
1198  /* copyFNbondToSOA parameters*/
1199  const float4* __restrict__ f_nbond, // forces from bbond compute
1200  const float4* __restrict__ f_nbond_slow, // forces from bond compute
1201  const int* __restrict__ patchSortOrder,
1202  const CudaPatchRecord* __restrict__ nbondIndexPerPatch,
1203  /* copyFSlowToSOA */
1204  const CudaForce* __restrict__ f_slow,
1205  const int* __restrict__ patchPositions,
1206  const int* __restrict__ pencilPatchIndex,
1207  const int* __restrict__ patchOffsets,
1208  const int* __restrict__ patchIDs,
1209  const int* __restrict__ slow_patchIDs,
1210  const CudaLattice* __restrict__ lattices,
1211  /* addForceToMomentum */
1212  const double * __restrict recipMass,
1213  double * __restrict vel_x,
1214  double * __restrict vel_y,
1215  double * __restrict vel_z,
1216  /* updateRigidArrays */
1217  const double * __restrict pos_x,
1218  const double * __restrict pos_y,
1219  const double * __restrict pos_z,
1220  double * __restrict velNew_x,
1221  double * __restrict velNew_y,
1222  double * __restrict velNew_z,
1223  double * __restrict posNew_x,
1224  double * __restrict posNew_y,
1225  double * __restrict posNew_z
1226  ){
1227  int order;
1228  int stride = blockDim.x;
1229  int pid = patchIDs[blockIdx.x];
1230 
1231  int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1232  int b_forceOffset = b_pr[pid].atomStart;
1233 
1234  int patchOffset = patchOffsets[blockIdx.x];
1235  int natom = nbondIndexPerPatch[pid].numAtoms;
1236 
1237  float4 f_b;
1238  float4 f_nb;
1239  float4 f_s;
1240  for(int i = threadIdx.x; i < natom; i += stride){
1241  f_b = {0};
1242  if(T_DONBOND) f_nb = {0};
1243  if(T_DOSLOW) f_s = {0};
1244  // fetch the bonded forces first
1245  f_b.x = f_bond[b_forceOffset + i];
1246  f_b.y = f_bond[b_forceOffset + i + forceStride];
1247  f_b.z = f_bond[b_forceOffset + i + 2*forceStride];
1248 
1249  if(T_DONBOND){
1250  f_nb = f_nbond[globalForceOffset + i];
1251  f_nb.x += f_bond_nbond[b_forceOffset + i];
1252  f_nb.y += f_bond_nbond[b_forceOffset + i + forceStride];
1253  f_nb.z += f_bond_nbond[b_forceOffset + i + 2*forceStride];
1254  }
1255  // addForceToMomentum now
1256  // this striding is not good
1257  float vx = vel_x[patchOffset + i];
1258  float vy = vel_y[patchOffset + i];
1259  float vz = vel_z[patchOffset + i];
1260  float rmass = recipMass[patchOffset + i];
1261  switch(maxForceNumber){
1262  // XXX TODO: Case 2 for slow forces
1263  case 1:
1264  dt_nbond *= scaling;
1265  vx += f_nb.x * rmass * dt_nbond;
1266  vy += f_nb.y * rmass * dt_nbond;
1267  vz += f_nb.z * rmass * dt_nbond;
1268  case 0:
1269  dt_normal *= scaling;
1270  vx += f_b.x * rmass * dt_normal;
1271  vy += f_b.y * rmass * dt_normal;
1272  vz += f_b.z * rmass * dt_normal;
1273  }
1274  // that's it
1275  // updateRigidArrays
1276 
1277  posNew_x[patchOffset + i] = pos_x[i] + (vx * dt);
1278  posNew_y[patchOffset + i] = pos_y[i] + (vy * dt);
1279  posNew_z[patchOffset + i] = pos_z[i] + (vz * dt);
1280  vel_x[patchOffset + i] = vx;
1281  vel_y[patchOffset + i] = vy;
1282  vel_z[patchOffset + i] = vz;
1283  velNew_x[patchOffset + i] = vx;
1284  velNew_y[patchOffset + i] = vy;
1285  velNew_z[patchOffset + i] = vz;
1286  f_normal_x[patchOffset + i] = f_b.x;
1287  f_normal_y[patchOffset + i] = f_b.y;
1288  f_normal_z[patchOffset + i] = f_b.z;
1289 
1290  if(T_DONBOND){
1291  order = patchSortOrder[patchOffset + i];
1292  f_nbond_x[patchOffset + order] = f_nb.x;
1293  f_nbond_y[patchOffset + order] = f_nb.y;
1294  f_nbond_z[patchOffset + order] = f_nb.z;
1295  }
1296  }
1297 }
1298 
1299 void SequencerCUDAKernel::fetchForcesAndUpdateVelocities(int numPatches,
1300  const bool doNbond,
1301  const bool doSlow,
1302  const double scaling,
1303  double dt_normal,
1304  double dt_nbond,
1305  double dt_slow,
1306  const int maxForceNumber,
1307  /* copyFbondToSOA parameters*/
1308  const double* __restrict__ f_bond, // forces from bond compute
1309  const double* __restrict__ f_bond_nbond, // forces from bond compute
1310  double* f_bond_x,
1311  double* f_bond_y,
1312  double* f_bond_z,
1313  double* f_nbond_x,
1314  double* f_nbond_y,
1315  double* f_nbond_z,
1316  double* f_slow_x,
1317  double* f_slow_y,
1318  double* f_slow_z,
1319  int forceStride,
1320  const PatchRecord* b_pr,
1321  const int* patchIDs,
1322  const int* patchOffsets,
1323  /* copyFNbondToSOA parameters*/
1324  const float4* f_nbond, // forces from bbond compute
1325  const float4* f_nbond_slow, // forces from bond compute
1326  const int* patchSortOrder,
1327  const CudaPatchRecord* nbondIndexPerPatch,
1328  /* copyFSlowToSOA */
1329  const CudaForce* f_slow,
1330  const int* patchPositions,
1331  const int* pencilPatchIndex,
1332  const int* patchOffsets,
1333  const int* patchIDs,
1334  const int* slow_patchIDs,
1335  const CudaLattice* lattices,
1336  /* addForceToMomentum */
1337  const double * recipMass,
1338  double * vel_x,
1339  double * vel_y,
1340  double * vel_z,
1341  /* updateRigidArrays */
1342  const double * pos_x,
1343  const double * pos_y,
1344  const double * pos_z,
1345  double * velNew_x,
1346  double * velNew_y,
1347  double * velNew_z,
1348  double * posNew_x,
1349  double * posNew_y,
1350  double * posNew_z,
1351  cudaStream_t stream){
1352 
1353  // reduce the amount of arguments in this function
1354  int blocks = 128;
1355  int grid = numPatches;
1356 
1357  // XXX TODO finish this
1358  if(doNbond){
1359  if(doSlow){
1360  fetchForcesAndUpdateVelocities<true, true><<<numPatches, 128, 0, stream>>>();
1361  }else{
1362  fetchForcesAndUpdateVelocities<true, false><<<numPatches, 128, 0, stream>>>();
1363  }
1364  }else{
1365  fetchForcesAndUpdateVelocities<false, false><<<numPatches, 128, 0, stream>>>();
1366  }
1367 }
1368 #endif
1369 
1370 // JM: I need a function to do the pairlistCheck
1371 template<bool T_ISPERIODIC>
1372 __global__ void pairListMarginCheckKernel(
1373  const int numPatches,
1374  CudaLocalRecord* localRecords,
1375  const double* __restrict pos_x,
1376  const double* __restrict pos_y,
1377  const double* __restrict pos_z,
1378  const double* __restrict pos_old_x,
1379  const double* __restrict pos_old_y,
1380  const double* __restrict pos_old_z,
1381  const double3* __restrict awayDists, // for margin check
1382  const Lattice lattice,
1383  const Lattice latticeOld,
1384  const double3* __restrict patchMins,
1385  const double3* __restrict patchMaxes,
1386  const double3* __restrict patchCenter,
1387  const CudaMInfo* __restrict mInfo,
1388  unsigned int* __restrict tbcatomic,
1389  double* patchMaxAtomMovement,
1390  double* h_patchMaxAtomMovement,
1391  double* patchNewTolerance,
1392  double* h_patchNewTolerance,
1393  const double minSize,
1394  const double cutoff,
1395  const double sysdima,
1396  const double sysdimb,
1397  const double sysdimc,
1398  unsigned int* __restrict h_marginViolations,
1399  unsigned int* __restrict h_periodicCellSmall,
1400  const bool rescalePairlistTolerance
1401 ) {
1402  __shared__ CudaLocalRecord s_record;
1403  using AccessType = int32_t;
1404  AccessType* s_record_buffer = (AccessType*) &s_record;
1405 
1406  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1407  // Read in the CudaLocalRecord using multiple threads; the record is loaded as 32-bit words spread across the block.
1408  #pragma unroll 1
1409  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1410  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1411  }
1412  __syncthreads();
1413 
1414  const int pid = s_record.patchID;
1415 
1416 
1417  // Each patch has its own max_cd,
1418  // and since there is one block per patch, it is computed here.
1419  // The eight corners of the patch need to be checked; it might not pay off to
1420  // parallelize that, so it is done once per patch (by thread 0 below).
1421  double cd, max_cd, max_pd;
1422  __shared__ double minx, miny, minz, maxx, maxy, maxz;
1423  double3 corner, corner_unscaled, corner2_unscaled;
1424  double3 mValue;
1425  double3 s;
1426 
1427  const int stride = blockDim.x;
1428 
1429  const int ind = patchIndex;
1430  double3 min = patchMins[ind];
1431  double3 max = patchMaxes[ind];
1432  double3 aDist = awayDists[ind];
1433 
1434  const int start = s_record.bufferOffset;
1435  const int numAtoms = s_record.numAtoms;
1436  double3 center = patchCenter[ind];
1437 
1438  double3 center_cur = lattice.unscale(center);
1439  double3 center_old = latticeOld.unscale(center);
1440 
1441  center_cur.x -= center_old.x;
1442  center_cur.y -= center_old.y;
1443  center_cur.z -= center_old.z;
1444 
1445  max_cd = 0.f;
1446  max_pd = 0.f;
1447  if(threadIdx.x == 0){
1448  if (rescalePairlistTolerance) patchNewTolerance[ind] *= (1.0f - c_pairlistShrink);
1449 
1450  // doMargincheck -> checks if the periodic cell has become too small:
1451 
1452  // Those values are in registers to begin with, so the overhead here is not that large;
1453  // if this condition is triggered, the simulation will stop
1454  mValue.x = (0.5) * (aDist.x - cutoff / sysdima);
1455  mValue.y = (0.5) * (aDist.y - cutoff / sysdimb);
1456  mValue.z = (0.5) * (aDist.z - cutoff / sysdimc);
1457  minx = min.x - mValue.x;
1458  miny = min.y - mValue.y;
1459  minz = min.z - mValue.z;
1460  maxx = max.x + mValue.x;
1461  maxy = max.y + mValue.y;
1462  maxz = max.z + mValue.z;
1463 
1464  for(int i = 0; i < 2; i++){
1465  for(int j =0 ; j< 2; j++){
1466  for(int k = 0; k < 2; k++){
1467  corner.x = (i ? min.x : max.x);
1468  corner.y = (j ? min.y : max.y);
1469  corner.z = (k ? min.z : max.z);
1470  corner_unscaled = lattice.unscale(corner);
1471  corner2_unscaled = latticeOld.unscale(corner);
1472  corner.x = (corner_unscaled.x - corner2_unscaled.x) - center_cur.x;
1473  corner.y = (corner_unscaled.y - corner2_unscaled.y) - center_cur.y;
1474  corner.z = (corner_unscaled.z - corner2_unscaled.z) - center_cur.z;
1475  cd = corner.x * corner.x +
1476  corner.y * corner.y + corner.z * corner.z;
1477  if (cd > max_cd) max_cd = cd;
1478  }
1479  }
1480  }
1481  }
1482  __syncthreads();
1483 
1484  // Alright, we have max_cd for each patch; now get max_pd.
1485  // This needs the number of atoms per patch and the atom start offset,
1486  // which are already available here as numAtoms and start
1487  // read from the shared local record above.
1488  unsigned int mc = 0;
1489  for(int i = threadIdx.x; i < numAtoms; i += stride){
1490  // Margin violations can also be checked here - this needs the away distances (awayDists)
1491  double3 pos;
1492  pos.x = pos_x[start + i];
1493  pos.y = pos_y[start + i];
1494  pos.z = pos_z[start + i];
1495  s = lattice.scale(pos);
1496  if(T_ISPERIODIC){
1497  // This is true if the system in periodic in A, B and C
1498  // if any of these clauses are true, atom needs to migrate
1499  mc += ((s.x < minx || s.x >= maxx) || (s.y < miny || s.y >= maxy) || (s.z < minz || s.z >= maxz)) ? 1 : 0;
1500  }else{
1501  int xdev, ydev, zdev;
1502  // Ok, so if the system is not periodic, we need to access the mInfo data structure to determine migrations
1503  if (s.x < minx) xdev = 0;
1504  else if (maxx <= s.x) xdev = 2;
1505  else xdev = 1;
1506 
1507  if (s.y < miny) ydev = 0;
1508  else if (maxy <= s.y) ydev = 2;
1509  else ydev = 1;
1510 
1511  if (s.z < minz) zdev = 0;
1512  else if (maxz <= s.z) zdev = 2;
1513  else zdev = 1;
1514 
1515  if(xdev != 1 || ydev != 1 || zdev != 1){
1516  // we check if any *dev are different than zero to prevent this horrible global memory access here
1517  int destPatch = mInfo[ind].destPatchID[xdev][ydev][zdev];
1518  if(destPatch != -1 && destPatch != pid) mc += 1; // atom needs to migrate
1519  }
1520  }
1521  corner.x = (pos.x - pos_old_x[start + i]) - center_cur.x;
1522  corner.y = (pos.y - pos_old_y[start + i]) - center_cur.y;
1523  corner.z = (pos.z - pos_old_z[start + i]) - center_cur.z;
1524  cd = corner.x * corner.x +
1525  corner.y * corner.y + corner.z * corner.z;
1526  if(cd > max_pd) max_pd = cd;
1527  }
1528  // JM NOTE: The atomic add to host memory is bad, but if you have margin violations the simulation is going badly
1529  // to begin with
1530  if (mc != 0) {
1531  atomicAdd(h_marginViolations, mc);
1532  }
1533 
1534  typedef cub::BlockReduce<BigReal, PATCH_BLOCKS> BlockReduce;
1535  __shared__ typename BlockReduce::TempStorage temp_storage;
1536 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
1537  max_pd = BlockReduce(temp_storage).Reduce(max_pd, cub::Max());
1538 #else
1539  max_pd = BlockReduce(temp_storage).Reduce(max_pd, cuda::maximum());
1540 #endif
1541  if(threadIdx.x == 0){
1542  max_pd = sqrt(max_cd) + sqrt(max_pd);
1543  // this might not be needed since the host-mapped values are updated anyway
1544  patchMaxAtomMovement[ind] = max_pd;
1545  if(max_pd > (1.f - c_pairlistTrigger) * patchNewTolerance[ind]){
1546  patchNewTolerance[ind] *= (1.f + c_pairlistGrow);
1547  }
1548  if(max_pd > patchNewTolerance[ind]){
1549  patchNewTolerance[ind] = max_pd / (1.f - c_pairlistTrigger);
1550  }
1551  // printf("patchNewTolerance[%d] = %lf %lf\n", ind,
1552  // patchNewTolerance[ind], patchMaxAtomMovement[ind]);
1553  h_patchMaxAtomMovement[ind] = max_pd; // Host-mapped update
1554  h_patchNewTolerance[ind] = patchNewTolerance[ind];
1555  }
1556 
1557  // Checks if periodic cell has become too small and flags it
1558  if(threadIdx.x == 0){
1559  if ( ( aDist.x*sysdima < minSize*0.9999 ) ||
1560  ( aDist.y*sysdimb < minSize*0.9999 ) ||
1561  ( aDist.z*sysdimc < minSize*0.9999 ) ||
1562  ( mValue.x < -0.0001 ) ||
1563  ( mValue.y < -0.0001 ) ||
1564  ( mValue.z < -0.0001)) {
1565  *h_periodicCellSmall = 1;
1566  }
1567  }
1568  }
1569 }
1570 
1571 void SequencerCUDAKernel::PairListMarginCheck(const int numPatches,
1572  CudaLocalRecord* localRecords,
1573  const double* pos_x,
1574  const double* pos_y,
1575  const double* pos_z,
1576  const double* pos_old_x,
1577  const double* pos_old_y,
1578  const double* pos_old_z,
1579  const double3* awayDists, // for margin check
1580  const Lattice lattices,
1581  const Lattice lattices_old,
1582  const double3* __restrict patchMins,
1583  const double3* __restrict patchMaxes,
1584  const double3* __restrict patchCenter,
1585  const CudaMInfo* __restrict mInfo,
1586  unsigned int* __restrict tbcatomic,
1587  const double pairlistTrigger,
1588  const double pairlistGrow,
1589  const double pairlistShrink,
1590  double* __restrict patchMaxAtomMovement,
1591  double* __restrict h_patchMaxAtomMovement,
1592  double* __restrict patchNewTolerance,
1593  double* __restrict h_patchNewTolerance,
1594  const double minSize,
1595  const double cutoff,
1596  const double sysdima,
1597  const double sysdimb,
1598  const double sysdimc,
1599  unsigned int* h_marginViolations,
1600  unsigned int* h_periodicCellSmall,
1601  const bool rescalePairlistTolerance,
1602  const bool isPeriodic,
1603  cudaStream_t stream){
1604  int grid = numPatches;
1605 
1606  if(!this->intConstInit){
1607  this->intConstInit = true;
1608  cudaCheck(cudaMemcpyToSymbol(c_pairlistTrigger, &pairlistTrigger, sizeof(double)));
1609  cudaCheck(cudaMemcpyToSymbol(c_pairlistGrow, &pairlistGrow, sizeof(double)));
1610  cudaCheck(cudaMemcpyToSymbol(c_pairlistShrink, &pairlistShrink, sizeof(double)));
1611  }
1612 
1613  if(isPeriodic){
1614  pairListMarginCheckKernel<true><<<grid, PATCH_BLOCKS, 0, stream>>>(
1615  numPatches, localRecords,
1616  pos_x, pos_y, pos_z, pos_old_x,
1617  pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1618  patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1619  patchMaxAtomMovement, h_patchMaxAtomMovement,
1620  patchNewTolerance, h_patchNewTolerance,
1621  minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1622  rescalePairlistTolerance);
1623  }
1624  else{
1625  pairListMarginCheckKernel<false><<<grid, PATCH_BLOCKS, 0, stream>>>(
1626  numPatches, localRecords,
1627  pos_x, pos_y, pos_z, pos_old_x,
1628  pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1629  patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1630  patchMaxAtomMovement, h_patchMaxAtomMovement,
1631  patchNewTolerance, h_patchNewTolerance,
1632  minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1633  rescalePairlistTolerance);
1634  }
1635 }
1636 
1637 template <bool doNbond, bool doSlow>
1638 __global__ void copyFBondToSOA(double *f_bond,
1639  double *f_bond_nbond,
1640  double *f_bond_slow,
1641  double *f_bond_x,
1642  double *f_bond_y,
1643  double *f_bond_z,
1644  double *f_nbond_x,
1645  double *f_nbond_y,
1646  double *f_nbond_z,
1647  double *f_slow_x,
1648  double *f_slow_y,
1649  double *f_slow_z,
1650  const int forceStride,
1651  const PatchRecord *pr,
1652  const int *patchIDs,
1653  const int *patchOffsets)
1654 {
1655  // Each block handles one patch: copy the strided bonded force buffers
1656  // into the SOA arrays, accumulating the nonbonded and slow contributions
1657  // when requested.
1658  // forceOffset (= PatchRecord::atomStart) indexes the strided bonded force
1659  // buffers, while patchOffset indexes the destination SOA arrays.
1660  int stride = blockDim.x;
1661  int patchID = patchIDs[blockIdx.x];
1662  // The PatchRecord array is indexed by patchID here; it is assumed that
1663  // this indexing remains valid regardless of the PE count (+p).
1664  int natoms = pr[patchID].numAtoms;
1665  int forceOffset = pr[patchID].atomStart;
1666  int patchOffset = patchOffsets[blockIdx.x];
1667 
1668  for(int i = threadIdx.x; i < natoms; i+=stride){
1669  // destination: patch-local SOA index; source: strided bonded force buffer
1670  f_bond_x[patchOffset + i] = f_bond[forceOffset + i];
1671  f_bond_y[patchOffset + i] = f_bond[forceOffset + i + forceStride];
1672  f_bond_z[patchOffset + i] = f_bond[forceOffset + i + 2*forceStride];
1673 
1674  if(doNbond){
1675  // XXX Accumulation here works because each f_nbond_* buffer
1676  // is first initialized in copyFNBondToSOA.
1677  f_nbond_x[patchOffset + i] += f_bond_nbond[forceOffset + i];
1678  f_nbond_y[patchOffset + i] += f_bond_nbond[forceOffset + i + forceStride];
1679  f_nbond_z[patchOffset + i] += f_bond_nbond[forceOffset + i + 2*forceStride];
1680  }
1681 
1682  if(doSlow){
1683  // XXX Accumulation here works because each f_slow_* buffer
1684  // is first initialized in copyFSlowToSOA.
1685  f_slow_x[patchOffset + i] += f_bond_slow[forceOffset + i];
1686  f_slow_y[patchOffset + i] += f_bond_slow[forceOffset + i + forceStride];
1687  f_slow_z[patchOffset + i] += f_bond_slow[forceOffset + i + 2*forceStride];
1688  }
1689  }
1690 }
1691 
1692 void SequencerCUDAKernel::copy_bond_forces(int numPatches,
1693  double* f_bond,
1694  double* f_bond_nbond,
1695  double* f_bond_slow,
1696  double* f_bond_x,
1697  double* f_bond_y,
1698  double* f_bond_z,
1699  double* f_nbond_x,
1700  double* f_nbond_y,
1701  double* f_nbond_z,
1702  double* f_slow_x,
1703  double* f_slow_y,
1704  double* f_slow_z,
1705  int forceStride, //if stridedForces
1706  PatchRecord* pr,
1707  const int* patchIDs,
1708  const int* patchOffsets,
1709  bool doNbond,
1710  bool doSlow,
1711  cudaStream_t stream)
1712 {
1713  if(doSlow){
1714  copyFBondToSOA<true, true><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1715  f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1716  f_nbond_x, f_nbond_y, f_nbond_z,
1717  f_slow_x, f_slow_y, f_slow_z, forceStride,
1718  pr, patchIDs, patchOffsets);
1719  }else if(doNbond){
1720  copyFBondToSOA<true, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1721  f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1722  f_nbond_x, f_nbond_y, f_nbond_z,
1723  f_slow_x, f_slow_y, f_slow_z, forceStride,
1724  pr, patchIDs, patchOffsets);
1725  }else{
1726  copyFBondToSOA<false, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1727  f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1728  f_nbond_x, f_nbond_y, f_nbond_z,
1729  f_slow_x, f_slow_y, f_slow_z, forceStride,
1730  pr, patchIDs, patchOffsets);
1731  }
1732 }
1733 
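// Copies slow (PME) forces from the AOS CudaForce buffer into SOA arrays,
// applying the lattice scale_force transform. Values are assigned (=), not
// accumulated; bonded slow contributions are added later in copyFBondToSOA.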
1734 __global__ void copyFSlowToSOA(const CudaForce* __restrict__ f_slow,
1735  double* f_slow_x,
1736  double* f_slow_y,
1737  double* f_slow_z,
1738  const int* __restrict__ patchOffsets,
1739  const Lattice* __restrict__ lattices)
1740 {
1741  int tid = threadIdx.x;
1742  int stride = blockDim.x;
1743  int patchOffset = patchOffsets[blockIdx.x];
1744  int numAtoms = patchOffsets[blockIdx.x + 1] - patchOffset;
1745 
1746  Lattice lat = lattices[0];
1747 
1748  for(int i = tid; i < numAtoms; i += stride){
1749  CudaForce f = f_slow[patchOffset + i];
1750 
1751  double3 f_scaled = lat.scale_force(
1752  Vector((double)f.x, (double)f.y, (double)f.z));
1753 
1754  // XXX Instead of accumulating slow force (+=), set them here (=)!
1755  f_slow_x[patchOffset + i] = f_scaled.x;
1756  f_slow_y[patchOffset + i] = f_scaled.y;
1757  f_slow_z[patchOffset + i] = f_scaled.z;
1758  }
1759 }
1760 
1761 void SequencerCUDAKernel::copy_slow_forces(int numPatches,
1762  const CudaForce* f_slow,
1763  double* f_slow_x,
1764  double* f_slow_y,
1765  double* f_slow_z,
1766  const int* d_patchOffsets,
1767  const Lattice *lattices,
1768  cudaStream_t stream){
1769 
1770  copyFSlowToSOA<<<numPatches, PATCH_BLOCKS, 0, stream>>>(f_slow,
1771  f_slow_x, f_slow_y, f_slow_z, d_patchOffsets, lattices);
1772 }
1773 
1774 __forceinline__ __device__ void zero_cudaTensor(cudaTensor &v)
1775 {
1776  v.xx = 0.0;
1777  v.xy = 0.0;
1778  v.xz = 0.0;
1779  v.yx = 0.0;
1780  v.yy = 0.0;
1781  v.yz = 0.0;
1782  v.zx = 0.0;
1783  v.zy = 0.0;
1784  v.zz = 0.0;
1785 }
1786 
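// Kick step: accumulates the per-level force contributions (slow, nonbonded,
// normal -- the switch intentionally falls through), scales by scaling/mass,
// and applies the result to the velocities. DO_VEL_RESCALING folds in
// stochastic velocity rescaling; DO_FIXED zeroes the velocity of fixed atoms.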
1787 template <int MAX_FORCE_NUMBER, bool DO_VEL_RESCALING, bool DO_FIXED>
1788 __global__ void addForceToMomentumKernel(const double scaling,
1789  double dt_normal,
1790  double dt_nbond,
1791  double dt_slow,
1792  double velrescaling, // for stochastic velocity rescaling
1793  const double * __restrict recipMass,
1794  const double * __restrict f_normal_x,
1795  const double * __restrict f_normal_y,
1796  const double * __restrict f_normal_z,
1797  const double * __restrict f_nbond_x,
1798  const double * __restrict f_nbond_y,
1799  const double * __restrict f_nbond_z,
1800  const double * __restrict f_slow_x,
1801  const double * __restrict f_slow_y,
1802  const double * __restrict f_slow_z,
1803  double * __restrict vel_x,
1804  double * __restrict vel_y,
1805  double * __restrict vel_z,
1806  const int * __restrict atomFixed,
1807  const int numAtoms)
1808 {
1809  int i = threadIdx.x + blockIdx.x*blockDim.x;
1810  if (i < numAtoms) {
1811 #if 0
1812  switch (maxForceNumber) {
1813  case 2:
1814  dt_slow *= scaling;
1815  vel_x[i] += f_slow_x[i] * recipMass[i] * dt_slow;
1816  vel_y[i] += f_slow_y[i] * recipMass[i] * dt_slow;
1817  vel_z[i] += f_slow_z[i] * recipMass[i] * dt_slow;
1818  // fall through because we will always have the "faster" forces
1819  case 1:
1820  dt_nbond *= scaling;
1821  vel_x[i] += f_nbond_x[i] * recipMass[i] * dt_nbond;
1822  vel_y[i] += f_nbond_y[i] * recipMass[i] * dt_nbond;
1823  vel_z[i] += f_nbond_z[i] * recipMass[i] * dt_nbond;
1824  // fall through because we will always have the "faster" forces
1825  case 0:
1826  dt_normal *= scaling;
1827  vel_x[i] += f_normal_x[i] * recipMass[i] * dt_normal;
1828  vel_y[i] += f_normal_y[i] * recipMass[i] * dt_normal;
1829  vel_z[i] += f_normal_z[i] * recipMass[i] * dt_normal;
1830  }
1831 #else
1832  double vx = 0;
1833  double vy = 0;
1834  double vz = 0;
1835  switch (MAX_FORCE_NUMBER) {
1836  case 2:
1837  vx += f_slow_x[i] * dt_slow;
1838  vy += f_slow_y[i] * dt_slow;
1839  vz += f_slow_z[i] * dt_slow;
1840  // fall through because we will always have the "faster" forces
1841  case 1:
1842  vx += f_nbond_x[i] * dt_nbond;
1843  vy += f_nbond_y[i] * dt_nbond;
1844  vz += f_nbond_z[i] * dt_nbond;
1845  // fall through because we will always have the "faster" forces
1846  case 0:
1847  vx += f_normal_x[i] * dt_normal;
1848  vy += f_normal_y[i] * dt_normal;
1849  vz += f_normal_z[i] * dt_normal;
1850  }
1851  vx *= scaling * recipMass[i];
1852  vy *= scaling * recipMass[i];
1853  vz *= scaling * recipMass[i];
1854  if (!DO_FIXED) {
1855  if (DO_VEL_RESCALING) {
1856  vel_x[i] = velrescaling*vel_x[i] + vx;
1857  vel_y[i] = velrescaling*vel_y[i] + vy;
1858  vel_z[i] = velrescaling*vel_z[i] + vz;
1859  }
1860  else {
1861  vel_x[i] += vx;
1862  vel_y[i] += vy;
1863  vel_z[i] += vz;
1864  }
1865  } else { // fixed atoms
1866  if (!atomFixed[i]) {
1867  if (DO_VEL_RESCALING) {
1868  vel_x[i] = velrescaling*vel_x[i] + vx;
1869  vel_y[i] = velrescaling*vel_y[i] + vy;
1870  vel_z[i] = velrescaling*vel_z[i] + vz;
1871  }
1872  else {
1873  vel_x[i] += vx;
1874  vel_y[i] += vy;
1875  vel_z[i] += vz;
1876  }
1877  } else {
1878 
1879  vel_x[i] = 0;
1880  vel_y[i] = 0;
1881  vel_z[i] = 0;
1882  }
1883  }
1884 #endif
1885  }
1886 }
1887 
1888 // JM: This sets the cudaAtom vector from the nonbonded compute
1889 template <bool t_doNbond, bool t_doHomePatches>
1890 __global__ void setComputePositionsKernel(
1891  CudaLocalRecord* localRecords,
1892  CudaPeerRecord* peerRecords,
1893  const double3* __restrict patchCenter,
1894  const int* __restrict patchSortOrder,
1895  const int numPatches,
1896  const unsigned int devID,
1897  const Lattice lat,
1898  const double charge_scaling,
1899  const double* __restrict d_pos_x,
1900  const double* __restrict d_pos_y,
1901  const double* __restrict d_pos_z,
1902  const float* __restrict charges,
1903  double** __restrict d_peer_pos_x,
1904  double** __restrict d_peer_pos_y,
1905  double** __restrict d_peer_pos_z,
1906  float4* __restrict nb_atoms,
1907  float4* __restrict b_atoms,
1908  float4* __restrict s_atoms
1909 ) {
1910 
1911  __shared__ CudaLocalRecord s_record;
1912  using AccessType = int32_t;
1913  AccessType* s_record_buffer = (AccessType*) &s_record;
1914 
1915  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1916  // Read in the CudaLocalRecord using multiple threads
1917  #pragma unroll 1
1918  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1919  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1920  }
1921  __syncthreads();
1922 
1923  double3 center;
1924  //int soapid = globalToLocalID[record.patchID];
1925  center = patchCenter[patchIndex];
1926  double3 ucenter = lat.unscale(center);
1927 
1928  const int numAtoms = s_record.numAtoms;
1929  const int dstOffset = s_record.bufferOffset;
1930  const int dstOffsetNB = s_record.bufferOffsetNBPad;
1931 
1932  int srcDevice, srcOffset;
1933  const double *srcPosX, *srcPosY, *srcPosZ;
1934  if (t_doHomePatches) {
1935  srcDevice = devID;
1936  srcOffset = dstOffset;
1937  srcPosX = d_pos_x;
1938  srcPosY = d_pos_y;
1939  srcPosZ = d_pos_z;
1940  } else {
1941  srcDevice = s_record.inline_peers[0].deviceIndex;
1942  srcOffset = s_record.inline_peers[0].bufferOffset;
1943  srcPosX = d_peer_pos_x[srcDevice];
1944  srcPosY = d_peer_pos_y[srcDevice];
1945  srcPosZ = d_peer_pos_z[srcDevice];
1946  }
1947 
1948  float4 atom;
1949  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1950  const int order = patchSortOrder[dstOffset + i]; // Should this be order or unorder?
1951  atom.x = srcPosX[srcOffset + order] - ucenter.x;
1952  atom.y = srcPosY[srcOffset + order] - ucenter.y;
1953  atom.z = srcPosZ[srcOffset + order] - ucenter.z;
1954  atom.w = charges[dstOffset + order] * charge_scaling;
1955 
1956  b_atoms[dstOffset + order] = atom;
1957  if (t_doNbond) nb_atoms[dstOffsetNB + i] = atom;
1958  }
1959 
1960  if (t_doNbond) {
1961  if (threadIdx.x / WARPSIZE == 0) {
1962  const int to_write = (((numAtoms+32-1)/32)*32) - numAtoms; // WARPSIZE
1963 
1964  float4 lastAtom;
1965  const int order = patchSortOrder[dstOffset + numAtoms - 1]; // Should this be order or unorder?
1966  lastAtom.x = srcPosX[srcOffset + order] - ucenter.x;
1967  lastAtom.y = srcPosY[srcOffset + order] - ucenter.y;
1968  lastAtom.z = srcPosZ[srcOffset + order] - ucenter.z;
1969  lastAtom.w = charges[dstOffset + order] * charge_scaling;
1970 
1971  if (threadIdx.x < to_write) {
1972  nb_atoms[dstOffsetNB+numAtoms+threadIdx.x] = lastAtom;
1973  }
1974  }
1975  }
1976  __syncthreads();
1977  }
1978 }
1979 
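// Stages atoms into the PME input buffer: positions are converted to
// fractional (scaled) coordinates, optionally wrapped into [0,1), and charges
// are multiplied by charge_scaling. For alchemical runs (FEP/TI) each atom is
// replicated across the per-grid slices of s_atoms, with its charge zeroed in
// the grids its partition does not contribute to.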
1980 template<bool t_doHomePatches, bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
1981 __global__ void setComputePositionsKernel_PME (
1982  const double* __restrict d_pos_x,
1983  const double* __restrict d_pos_y,
1984  const double* __restrict d_pos_z,
1985  const float* __restrict charges,
1986  double** __restrict d_peer_pos_x,
1987  double** __restrict d_peer_pos_y,
1988  double** __restrict d_peer_pos_z,
1989  float** __restrict d_peer_charge,
1990  int** __restrict d_peer_partition,
1991  const int* __restrict partition,
1992  const double charge_scaling,
1993  const int* __restrict s_patchPositions,
1994  const int* __restrict s_pencilPatchIndex,
1995  const int* __restrict s_patchIDs,
1996  const Lattice lat,
1997  float4* __restrict s_atoms,
1998  int numTotalAtoms,
1999  int* __restrict s_partition,
2000 // NEW INPUT
2001  const int peerDevice,
2002  const int numAtoms,
2003  const int offset,
2004  const bool handleBoundary
2005 ) {
2006 
2007  float4 foo;
2008 
2009  const int pmeBufferOffset = offset;
2010  const int srcBufferOffset = 0;
2011  const int srcDevice = peerDevice;
2012 
2013  // double precision calculation
2014  double px, py, pz, wx, wy, wz, q;
2015  int part;
2016  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numAtoms; i += blockDim.x * gridDim.x) {
2017  // this gets atoms in the same order as the PmeCuda code path
2018  if (t_doHomePatches) {
2019  q = (double)(charges[srcBufferOffset + i]);
2020  px = d_pos_x[srcBufferOffset + i];
2021  py = d_pos_y[srcBufferOffset + i];
2022  pz = d_pos_z[srcBufferOffset + i];
2023  if (t_doFEP || t_doTI) {
2024  part = partition[srcBufferOffset + i];
2025  }
2026  } else {
2027  q = (double)(d_peer_charge[srcDevice][srcBufferOffset + i]);
2028  px = d_peer_pos_x[srcDevice][srcBufferOffset + i];
2029  py = d_peer_pos_y[srcDevice][srcBufferOffset + i];
2030  pz = d_peer_pos_z[srcDevice][srcBufferOffset + i];
2031  if (t_doFEP || t_doTI) {
2032  part = d_peer_partition[srcDevice][srcBufferOffset + i];
2033  }
2034  }
2035 
2036  double3 w_vec = lat.scale(Vector(px, py, pz));
2037  wx = w_vec.x;
2038  wy = w_vec.y;
2039  wz = w_vec.z;
2040 
2041  if (handleBoundary) {
2042  wx = (wx - (floor(wx + 0.5) - 0.5));
2043  wy = (wy - (floor(wy + 0.5) - 0.5));
2044  wz = (wz - (floor(wz + 0.5) - 0.5));
2045  }
2046 
2047  foo.x = (float) wx;
2048  foo.y = (float) wy;
2049  foo.z = (float) wz;
2050 
2051  if (handleBoundary) {
2052  foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2053  foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2054  foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2055  }
2056 
2057  foo.w = (float) (charge_scaling * q);
2058  if (!t_doFEP && !t_doTI) {
2059  s_atoms[pmeBufferOffset + i] = foo;
2060  }
2061  else { // alchemical multiple grids
2062  float4 foo_zero_charge = foo;
2063  foo_zero_charge.w = 0.0f;
2064  s_partition[pmeBufferOffset + i] = part;
2065  /* grid 0 grid 1 grid 2 grid 3 grid 4
2066  * non-alch (p = 0) Y Y N N Y
2067  * appearing (p = 1) Y N Y N N
2068  * disappearing (p = 2) N Y N Y N
2069  * Requirements of grids:
2070  * 1. t_doFEP || t_doTI : grid 0, grid 1
2071  * 2. t_doAlchDecouple : grid 2, grid 3
2072  * 3. t_doAlchSoftCore || t_doTI: grid 4
2073  * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2074  * Although the zero-charge entries in the extra grids do not change on non-migration steps,
2075  * the copying cannot be avoided because the atom positions can still change.
2076  * The non-zero charges may also change if they come from a QM engine or a new kind of force field,
2077  * so these branches are unavoidable even on non-migration steps.
2078  */
2079  switch (part) {
2080  // non-alch atoms
2081  case 0: {
2082  s_atoms[pmeBufferOffset + i] = foo;
2083  s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2084  if (t_doAlchDecouple) {
2085  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2086  s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2087  if (t_doAlchSoftCore || t_doTI) {
2088  s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo;
2089  }
2090  } else {
2091  if (t_doAlchSoftCore || t_doTI) {
2092  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2093  }
2094  }
2095  break;
2096  }
2097  // appearing atoms
2098  case 1: {
2099  s_atoms[pmeBufferOffset + i] = foo;
2100  s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo_zero_charge;
2101  if (t_doAlchDecouple) {
2102  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2103  s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2104  if (t_doAlchSoftCore || t_doTI) {
2105  s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2106  }
2107  } else {
2108  if (t_doAlchSoftCore || t_doTI) {
2109  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2110  }
2111  }
2112  break;
2113  }
2114  // disappearing atoms
2115  case 2: {
2116  s_atoms[pmeBufferOffset + i] = foo_zero_charge;
2117  s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2118  if (t_doAlchDecouple) {
2119  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2120  s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo;
2121  if (t_doAlchSoftCore || t_doTI) {
2122  s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2123  }
2124  } else {
2125  if (t_doAlchSoftCore || t_doTI) {
2126  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2127  }
2128  }
2129  break;
2130  }
2131  }
2132  }
2133  }
2134 }
2135 
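// Host wrapper: launches the position-staging kernels for home patches and,
// on multi-GPU runs, for proxy patches. For PME it packs the five boolean
// template flags (home, doFEP, doTI, doAlchDecouple, doAlchSoftCore) into a
// 5-bit selector and dispatches the matching instantiation via the CALL macro.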
2136 void SequencerCUDAKernel::set_compute_positions(
2137  const int devID,
2138  const bool isPmeDevice,
2139  const int nDev,
2140  const int numPatchesHomeAndProxy,
2141  const int numPatchesHome,
2142  const bool doNbond,
2143  const bool doSlow,
2144  const bool doFEP,
2145  const bool doTI,
2146  const bool doAlchDecouple,
2147  const bool doAlchSoftCore,
2148  const bool handleBoundary,
2149  const double* d_pos_x,
2150  const double* d_pos_y,
2151  const double* d_pos_z,
2152 #ifndef NAMD_NCCL_ALLREDUCE
2153  double** d_peer_pos_x,
2154  double** d_peer_pos_y,
2155  double** d_peer_pos_z,
2156  float** d_peer_charge,
2157  int** d_peer_partition,
2158 #endif
2159  const float* charges,
2160  const int* partition,
2161  const double charge_scaling,
2162  const double3* patchCenter,
2163  const int* s_patchPositions,
2164  const int* s_pencilPatchIndex,
2165  const int* s_patchIDs,
2166  const int* patchSortOrder,
2167  const Lattice lattice,
2168  float4* nb_atoms,
2169  float4* b_atoms,
2170  float4* s_atoms,
2171  int* s_partition,
2172  int numTotalAtoms,
2173  CudaLocalRecord* localRecords,
2174  CudaPeerRecord* peerRecords,
2175  std::vector<int>& atomCounts,
2176  cudaStream_t stream)
2177 {
2178  // Launch Local Set Compute Positions
2179  if(doNbond){
2180  setComputePositionsKernel<true, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2181  localRecords, peerRecords, patchCenter, patchSortOrder,
2182  numPatchesHome, devID, lattice, charge_scaling,
2183  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2184  nb_atoms, b_atoms, s_atoms
2185  );
2186  } else {
2187  setComputePositionsKernel<false, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2188  localRecords, peerRecords, patchCenter, patchSortOrder,
2189  numPatchesHome, devID, lattice, charge_scaling,
2190  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2191  nb_atoms, b_atoms, s_atoms
2192  );
2193  }
2194 
2195  // Launch Peer Set Computes
2196  if (nDev != 1) {
2197  const int numProxyPatches = numPatchesHomeAndProxy - numPatchesHome;
2198  if(doNbond){
2199  setComputePositionsKernel<true, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2200  localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2201  numProxyPatches, devID, lattice, charge_scaling,
2202  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2203  nb_atoms, b_atoms, s_atoms
2204  );
2205  } else {
2206  setComputePositionsKernel<false, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2207  localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2208  numProxyPatches, devID, lattice, charge_scaling,
2209  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2210  nb_atoms, b_atoms, s_atoms
2211  );
2212  }
2213  }
2214 
2215  // Launch PME setComputes
2216  #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
2217  setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
2218  <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
2219  d_pos_x, d_pos_y, d_pos_z, charges, \
2220  d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
2221  d_peer_partition, partition, charge_scaling, \
2222  s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
2223  lattice, s_atoms, numTotalAtoms, s_partition, \
2224  i, numAtoms, offset, handleBoundary \
2225  );
2226  // Only run when PME long-range electrostatics is enabled (doSlow is true).
2227  // The partition array is needed for alchemical calculations.
2228  if(doSlow && isPmeDevice) {
2229  int offset = 0;
2230  for (int i = 0; i < nDev; i++) {
2231  const bool home = (i == devID);
2232  const int numAtoms = atomCounts[i];
2233  const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
2234  const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
2235 
2236  switch (options) {
2237  case 0: CALL(0, 0, 0, 0, 0); break;
2238  case 1: CALL(0, 0, 0, 0, 1); break;
2239  case 2: CALL(0, 0, 0, 1, 0); break;
2240  case 3: CALL(0, 0, 0, 1, 1); break;
2241  case 4: CALL(0, 0, 1, 0, 0); break;
2242  case 5: CALL(0, 0, 1, 0, 1); break;
2243  case 6: CALL(0, 0, 1, 1, 0); break;
2244  case 7: CALL(0, 0, 1, 1, 1); break;
2245  case 8: CALL(0, 1, 0, 0, 0); break;
2246  case 9: CALL(0, 1, 0, 0, 1); break;
2247  case 10: CALL(0, 1, 0, 1, 0); break;
2248  case 11: CALL(0, 1, 0, 1, 1); break;
2249  case 12: CALL(0, 1, 1, 0, 0); break;
2250  case 13: CALL(0, 1, 1, 0, 1); break;
2251  case 14: CALL(0, 1, 1, 1, 0); break;
2252  case 15: CALL(0, 1, 1, 1, 1); break;
2253  case 16: CALL(1, 0, 0, 0, 0); break;
2254  case 17: CALL(1, 0, 0, 0, 1); break;
2255  case 18: CALL(1, 0, 0, 1, 0); break;
2256  case 19: CALL(1, 0, 0, 1, 1); break;
2257  case 20: CALL(1, 0, 1, 0, 0); break;
2258  case 21: CALL(1, 0, 1, 0, 1); break;
2259  case 22: CALL(1, 0, 1, 1, 0); break;
2260  case 23: CALL(1, 0, 1, 1, 1); break;
2261  case 24: CALL(1, 1, 0, 0, 0); break;
2262  case 25: CALL(1, 1, 0, 0, 1); break;
2263  case 26: CALL(1, 1, 0, 1, 0); break;
2264  case 27: CALL(1, 1, 0, 1, 1); break;
2265  case 28: CALL(1, 1, 1, 0, 0); break;
2266  case 29: CALL(1, 1, 1, 0, 1); break;
2267  case 30: CALL(1, 1, 1, 1, 0); break;
2268  case 31: CALL(1, 1, 1, 1, 1); break;
2269  default:
2270  NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
2271  }
2272 
2273  offset += numAtoms;
2274  }
2275  }
2276  #undef CALL
2277 }
2278 
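// Single-buffer variant of the PME staging kernel: wraps positions into
// fractional coordinates, scales charges, and performs the same alchemical
// multi-grid replication as setComputePositionsKernel_PME.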
2279 template <bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
2280 __global__ void setPmePositionsKernel(
2281  double charge_scaling,
2282  const Lattice lattice,
2283  const double *pos_x,
2284  const double *pos_y,
2285  const double *pos_z,
2286  const float *charges,
2287  const int* partition,
2288  float4 *s_atoms,
2289  int* s_atoms_partition,
2290  int numAtoms)
2291 {
2292  int i = threadIdx.x + blockIdx.x*blockDim.x;
2293  if (i < numAtoms) {
2294  Lattice lat = lattice;
2295  float4 foo;
2296  double wx, wy, wz, q;
2297  q = (double)(charges[i]);
2298 
2299  double3 w_vec = lat.scale(Vector(pos_x[i], pos_y[i], pos_z[i]));
2300 
2301  wx = w_vec.x;
2302  wy = w_vec.y;
2303  wz = w_vec.z;
2304  wx = (wx - (floor(wx + 0.5) - 0.5));
2305  wy = (wy - (floor(wy + 0.5) - 0.5));
2306  wz = (wz - (floor(wz + 0.5) - 0.5));
2307  foo.x = (float) wx;
2308  foo.y = (float) wy;
2309  foo.z = (float) wz;
2310  foo.w = (float) (charge_scaling * q);
2311  foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2312  foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2313  foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2314  if (!t_doFEP && !t_doTI) {
2315  s_atoms[i] = foo;
2316  }
2317  else { // alchemical multiple grids
2318  float4 foo_zero_charge = foo;
2319  foo_zero_charge.w = 0.0f;
2320  s_atoms_partition[i] = partition[i];
2321  /* grid 0 grid 1 grid 2 grid 3 grid 4
2322  * non-alch (p = 0) Y Y N N Y
2323  * appearing (p = 1) Y N Y N N
2324  * disappearing (p = 2) N Y N Y N
2325  * Requirements of grids:
2326  * 1. t_doFEP || t_doTI : grid 0, grid 1
2327  * 2. t_doAlchDecouple : grid 2, grid 3
2328  * 3. t_doAlchSoftCore || t_doTI: grid 4
2329  * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2330  */
2331  switch (partition[i]) {
2332  // non-alch atoms
2333  case 0: {
2334  s_atoms[i] = foo;
2335  s_atoms[i + numAtoms] = foo;
2336  if (t_doAlchDecouple) {
2337  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2338  s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2339  if (t_doAlchSoftCore || t_doTI) {
2340  s_atoms[i + 4 * numAtoms] = foo;
2341  }
2342  } else {
2343  if (t_doAlchSoftCore || t_doTI) {
2344  s_atoms[i + 2 * numAtoms] = foo;
2345  }
2346  }
2347  break;
2348  }
2349  // appearing atoms
2350  case 1: {
2351  s_atoms[i] = foo;
2352  s_atoms[i + numAtoms] = foo_zero_charge;
2353  if (t_doAlchDecouple) {
2354  s_atoms[i + 2 * numAtoms] = foo;
2355  s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2356  if (t_doAlchSoftCore || t_doTI) {
2357  s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2358  }
2359  } else {
2360  if (t_doAlchSoftCore || t_doTI) {
2361  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2362  }
2363  }
2364  break;
2365  }
2366  // disappearing atoms
2367  case 2: {
2368  s_atoms[i] = foo_zero_charge;
2369  s_atoms[i + numAtoms] = foo;
2370  if (t_doAlchDecouple) {
2371  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2372  s_atoms[i + 3 * numAtoms] = foo;
2373  if (t_doAlchSoftCore || t_doTI) {
2374  s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2375  }
2376  } else {
2377  if (t_doAlchSoftCore || t_doTI) {
2378  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2379  }
2380  }
2381  break;
2382  }
2383  }
2384  }
2385  }
2386 }
2387 
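// Flags runaway atoms: any atom whose squared speed exceeds maxvel2
// atomically increments *killme, which the host checks to detect a
// diverging simulation.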
2388 __global__ void maximumMoveKernel(const double maxvel2,
2389  const double * __restrict vel_x,
2390  const double * __restrict vel_y,
2391  const double * __restrict vel_z,
2392  int *killme,
2393  const int numAtoms)
2394 {
2395  int i = threadIdx.x + blockIdx.x*blockDim.x;
2396  if (i < numAtoms) {
2397  double vel2 = vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i];
2398  if (vel2 > maxvel2) {
2399  // If this ever happens, we're already screwed, so performance does not matter
2400  atomicAdd(killme, 1);
2401  }
2402  }
2403 }
2404 
2405 void SequencerCUDAKernel::maximumMove(const double maxvel2,
2406  const double *vel_x,
2407  const double *vel_y,
2408  const double *vel_z,
2409  int *killme,
2410  const int numAtoms,
2411  cudaStream_t stream)
2412 {
2413  //cudaCheck(cudaMemset(killme, 0, sizeof(int)));
2414  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2415  maximumMoveKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
2416  maxvel2, vel_x, vel_y, vel_z, killme, numAtoms);
2417 }
2418 
2419 void SequencerCUDAKernel::addForceToMomentum(
2420  bool doFixedAtoms,
2421  const double scaling,
2422  double dt_normal,
2423  double dt_nbond,
2424  double dt_slow,
2425  double velrescaling, // for stochastic velocity rescaling
2426  const double *recipMass,
2427  const double *f_normal_x,
2428  const double *f_normal_y,
2429  const double *f_normal_z,
2430  const double *f_nbond_x,
2431  const double *f_nbond_y,
2432  const double *f_nbond_z,
2433  const double *f_slow_x,
2434  const double *f_slow_y,
2435  const double *f_slow_z,
2436  double *vel_x,
2437  double *vel_y,
2438  double *vel_z,
2439  const int *atomFixed,
2440  int numAtoms,
2441  int maxForceNumber,
2442  cudaStream_t stream)
2443 {
2444  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2445  const bool doVelRescaling = (velrescaling != 1.0);
2446  int kernelCalled = 0;
2447 #define CALL(MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED) \
2448  do { \
2449  addForceToMomentumKernel<MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED> \
2450  <<<grid, ATOM_BLOCKS, 0, stream>>>( \
2451  scaling, dt_normal, dt_nbond, dt_slow, velrescaling, \
2452  recipMass, f_normal_x, f_normal_y, f_normal_z, \
2453  f_nbond_x, f_nbond_y, f_nbond_z, \
2454  f_slow_x, f_slow_y, f_slow_z, \
2455  vel_x, vel_y, vel_z, atomFixed, \
2456  numAtoms ); \
2457  kernelCalled++; \
2458  } while (0)
2459  switch (maxForceNumber) {
2460  case 2: {
2461  if (doVelRescaling && doFixedAtoms) CALL(2, true, true);
2462  if (doVelRescaling && !doFixedAtoms) CALL(2, true, false);
2463  if (!doVelRescaling && !doFixedAtoms) CALL(2, false, false);
2464  if (!doVelRescaling && doFixedAtoms) CALL(2, false, true);
2465  break;
2466  }
2467  case 1: {
2468  if (doVelRescaling && doFixedAtoms) CALL(1, true, true);
2469  if (doVelRescaling && !doFixedAtoms) CALL(1, true, false);
2470  if (!doVelRescaling && !doFixedAtoms) CALL(1, false, false);
2471  if (!doVelRescaling && doFixedAtoms) CALL(1, false, true);
2472  break;
2473  }
2474  case 0: {
2475  if (doVelRescaling && doFixedAtoms) CALL(0, true, true);
2476  if (doVelRescaling && !doFixedAtoms) CALL(0, true, false);
2477  if (!doVelRescaling && !doFixedAtoms) CALL(0, false, false);
2478  if (!doVelRescaling && doFixedAtoms) CALL(0, false, true);
2479  break;
2480  }
2481  }
2482 #undef CALL
2483  if (kernelCalled != 1) NAMD_bug("Incorrect number of calls to kernel in SequencerCUDAKernel::addForceToMomentum!\n");
2484  // cudaCheck(cudaGetLastError());
2485 }
2486 
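// Drift step: pos += vel * dt, skipping fixed atoms when DO_FIXED is set.
// When copyPos is true the updated positions are also mirrored into the
// host-mapped h_pos_{x,y,z} buffers.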
2487 template <bool copyPos, bool DO_FIXED>
2488 __global__ void addVelocityToPositionKernel(
2489  const double dt,
2490  const int* __restrict atomFixed,
2491  const double * __restrict vel_x,
2492  const double * __restrict vel_y,
2493  const double * __restrict vel_z,
2494  double * __restrict pos_x,
2495  double * __restrict pos_y,
2496  double * __restrict pos_z,
2497  double * __restrict h_pos_x, // host-mapped vectors
2498  double * __restrict h_pos_y,
2499  double * __restrict h_pos_z,
2500  int numAtoms)
2501 {
2502  int i = threadIdx.x + blockIdx.x*blockDim.x;
2503  double x,y,z;
2504  if (i < numAtoms) {
2505  x = pos_x[i];
2506  y = pos_y[i];
2507  z = pos_z[i];
2508  if (DO_FIXED) {
2509  if (!atomFixed[i]) {
2510  x += vel_x[i] * dt;
2511  y += vel_y[i] * dt;
2512  z += vel_z[i] * dt;
2513  }
2514  } else {
2515  x += vel_x[i] * dt;
2516  y += vel_y[i] * dt;
2517  z += vel_z[i] * dt;
2518  }
2519  pos_x[i] = x;
2520  pos_y[i] = y;
2521  pos_z[i] = z;
2522  if(copyPos){
2523  h_pos_x[i] = x;
2524  h_pos_y[i] = y;
2525  h_pos_z[i] = z;
2526  }
2527  }
2528 }
2529 
2530 void SequencerCUDAKernel::addVelocityToPosition(
2531  bool doFixedAtoms,
2532  const double dt,
2533  const int *atomFixed,
2534  const double *vel_x,
2535  const double *vel_y,
2536  const double *vel_z,
2537  double *pos_x,
2538  double *pos_y,
2539  double *pos_z,
2540  double *h_pos_x,
2541  double *h_pos_y,
2542  double *h_pos_z,
2543  int numAtoms,
2544  bool copyPositions,
2545  cudaStream_t stream)
2546 {
2547  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2548 #define CALL(copyPos, DO_FIXED) \
2549  addVelocityToPositionKernel<copyPos, DO_FIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
2550  dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z, \
2551  h_pos_x, h_pos_y, h_pos_z, numAtoms)
2552  if ( copyPositions && doFixedAtoms) CALL(true, true);
2553  else if (!copyPositions && doFixedAtoms) CALL(false, true);
2554  else if ( copyPositions && !doFixedAtoms) CALL(true, false);
2555  else if (!copyPositions && !doFixedAtoms) CALL(false, false);
2556  else NAMD_bug("No kernel was called in SequencerCUDAKernel::addVelocityToPosition!\n");
2557 #undef CALL
2558  //cudaCheck(cudaGetLastError());
2559 }
2560 
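// Prepares the arrays used by the rigid-bond constraint step:
// posNew = pos + vel * dt (fixed atoms keep their current position) and
// velNew = vel.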
2561 template <bool doFixed>
2562 __global__ void updateRigidArraysKernel(
2563  const double dt,
2564  const int * __restrict atomFixed,
2565  const double * __restrict vel_x,
2566  const double * __restrict vel_y,
2567  const double * __restrict vel_z,
2568  const double * __restrict pos_x,
2569  const double * __restrict pos_y,
2570  const double * __restrict pos_z,
2571  double * __restrict velNew_x,
2572  double * __restrict velNew_y,
2573  double * __restrict velNew_z,
2574  double * __restrict posNew_x,
2575  double * __restrict posNew_y,
2576  double * __restrict posNew_z,
2577  int numAtoms)
2578 {
2579  int i = threadIdx.x + blockIdx.x*blockDim.x;
2580  if (i < numAtoms) {
2581  double vx = vel_x[i];
2582  double vy = vel_y[i];
2583  double vz = vel_z[i];
2584  if (!doFixed) {
2585  posNew_x[i] = pos_x[i] + (vx * dt);
2586  posNew_y[i] = pos_y[i] + (vy * dt);
2587  posNew_z[i] = pos_z[i] + (vz * dt);
2588  } else {
2589  if (!atomFixed[i]) {
2590  posNew_x[i] = pos_x[i] + (vx * dt);
2591  posNew_y[i] = pos_y[i] + (vy * dt);
2592  posNew_z[i] = pos_z[i] + (vz * dt);
2593  } else {
2594  posNew_x[i] = pos_x[i];
2595  posNew_y[i] = pos_y[i];
2596  posNew_z[i] = pos_z[i];
2597  }
2598  }
2599  velNew_x[i] = vx;
2600  velNew_y[i] = vy;
2601  velNew_z[i] = vz;
2602  }
2603 }
2604 
2605 
2606 // JM NOTE: Optimize this kernel further to improve global memory access pattern
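// For the parent atom of each hydrogen group (hydrogenGroupSize > 0) this
// computes the group's mass-weighted center and writes it to cm_{x,y,z} of
// every atom in the group.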
2607 __global__ void centerOfMassKernel(
2608  const double * __restrict coor_x,
2609  const double * __restrict coor_y,
2610  const double * __restrict coor_z,
2611  double * __restrict cm_x, // center of mass is atom-sized
2612  double * __restrict cm_y,
2613  double * __restrict cm_z,
2614  const float * __restrict mass,
2615  const int * __restrict hydrogenGroupSize,
2616  const int numAtoms)
2617 {
2618  int i = threadIdx.x + blockIdx.x*blockDim.x;
2619  if (i < numAtoms) {
2620  int hgs = hydrogenGroupSize[i];
2621  int j;
2622  if (hgs > 0) {
2623  // missing fixed atoms
2624  BigReal m_cm = 0;
2625  BigReal reg_cm_x = 0;
2626  BigReal reg_cm_y = 0;
2627  BigReal reg_cm_z = 0;
2628  for ( j = i; j < (i+hgs); ++j ) {
2629  m_cm += mass[j];
2630  reg_cm_x += mass[j] * coor_x[j];
2631  reg_cm_y += mass[j] * coor_y[j];
2632  reg_cm_z += mass[j] * coor_z[j];
2633  }
2634  BigReal inv_m_cm = 1.0/m_cm;
2635  reg_cm_x *= inv_m_cm;
2636  reg_cm_y *= inv_m_cm;
2637  reg_cm_z *= inv_m_cm;
2638 
2639  for(j = i ; j < (i + hgs); j++){
2640  cm_x[j] = reg_cm_x;
2641  cm_y[j] = reg_cm_y;
2642  cm_z[j] = reg_cm_z;
2643  }
2644  }
2645  }
2646 }
2647 
2648 void SequencerCUDAKernel::centerOfMass(
2649  const double *coor_x,
2650  const double *coor_y,
2651  const double *coor_z,
2652  double *cm_x,
2653  double *cm_y,
2654  double *cm_z,
2655  const float* mass,
2656  const int* hydrogenGroupSize,
2657  const int numAtoms,
2658  cudaStream_t stream
2659 ){
2660  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2661  // launch the center-of-mass kernel
2662  centerOfMassKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(coor_x, coor_y, coor_z,
2663  cm_x, cm_y, cm_z, mass, hydrogenGroupSize, numAtoms);
2664 }
2665 
2666 
2667 void SequencerCUDAKernel::updateRigidArrays(
2668  const bool doFixed,
2669  const double dt,
2670  const int *atomFixed,
2671  const double *vel_x,
2672  const double *vel_y,
2673  const double *vel_z,
2674  const double *pos_x,
2675  const double *pos_y,
2676  const double *pos_z,
2677  double *velNew_x,
2678  double *velNew_y,
2679  double *velNew_z,
2680  double *posNew_x,
2681  double *posNew_y,
2682  double *posNew_z,
2683  int numAtoms,
2684  cudaStream_t stream)
2685 {
2686  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2687  if (doFixed)
2688  updateRigidArraysKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2689  dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2690  velNew_x, velNew_y, velNew_z,
2691  posNew_x, posNew_y, posNew_z, numAtoms);
2692  else
2693  updateRigidArraysKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2694  dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2695  velNew_x, velNew_y, velNew_z,
2696  posNew_x, posNew_y, posNew_z, numAtoms);
2697  // cudaCheck(cudaGetLastError());
2698 }
2699 
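// Accumulates kinetic-energy and virial contributions for the half-step
// reductions: per-atom m*v*v terms plus "internal" terms taken relative to
// the hydrogen-group center-of-mass velocities (vcm_*). Results are
// block-reduced, binned into ATOMIC_BINS device accumulators, and the last
// block folds the bins and writes the host-mapped outputs.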
2700 template<int BLOCK_SIZE, bool DO_FIXED>
2701 __global__ void submitHalfKernel(
2702  // const int* __restrict atomFixed,
2703  const double * __restrict vel_x,
2704  const double * __restrict vel_y,
2705  const double * __restrict vel_z,
2706  const double * __restrict vcm_x,
2707  const double * __restrict vcm_y,
2708  const double * __restrict vcm_z,
2709  const float * __restrict mass,
2710  BigReal *kineticEnergy,
2711  BigReal *intKineticEnergy,
2712  cudaTensor *virial,
2713  cudaTensor *intVirialNormal,
2714  BigReal *h_kineticEnergy,
2715  BigReal *h_intKineticEnergy,
2716  cudaTensor *h_virial,
2717  cudaTensor *h_intVirialNormal,
2718  int *hydrogenGroupSize,
2719  int numAtoms,
2720  unsigned int* tbcatomic)
2721 {
2722  BigReal k = 0, intK = 0;
2723  cudaTensor v, intV;
2724  zero_cudaTensor(v);
2725  zero_cudaTensor(intV);
2726  int i = threadIdx.x + blockIdx.x*blockDim.x;
2727  int totaltb = gridDim.x;
2728  __shared__ bool isLastBlockDone;
2729 
2730 
2731  if(threadIdx.x == 0){
2732  isLastBlockDone = 0;
2733  }
2734 
2735  __syncthreads();
2736 
2737  if (i < numAtoms) {
2738  k = mass[i] *
2739  (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
2740  v.xx = mass[i] * vel_x[i] * vel_x[i];
2741  if (DO_FIXED) {
2742  v.xy = mass[i] * vel_x[i] * vel_y[i];
2743  v.xz = mass[i] * vel_x[i] * vel_z[i];
2744  }
2745  v.yx = mass[i] * vel_y[i] * vel_x[i];
2746  v.yy = mass[i] * vel_y[i] * vel_y[i];
2747  if (DO_FIXED) {
2748  v.yz = mass[i] * vel_y[i] * vel_z[i];
2749  }
2750  v.zx = mass[i] * vel_z[i] * vel_x[i];
2751  v.zy = mass[i] * vel_z[i] * vel_y[i];
2752  v.zz = mass[i] * vel_z[i] * vel_z[i];
2753 
2754 #if 0
2755  int hgs = hydrogenGroupSize[i];
2756  if (hgs > 0) {
2757  BigReal m_cm = 0;
2758  BigReal v_cm_x = 0;
2759  BigReal v_cm_y = 0;
2760  BigReal v_cm_z = 0;
2761  for (int j = i; j < (i+hgs); j++) {
2762  m_cm += mass[j];
2763  v_cm_x += mass[j] * vel_x[j];
2764  v_cm_y += mass[j] * vel_y[j];
2765  v_cm_z += mass[j] * vel_z[j];
2766  }
2767  BigReal recip_m_cm = 1.0 / m_cm;
2768  v_cm_x *= recip_m_cm;
2769  v_cm_y *= recip_m_cm;
2770  v_cm_z *= recip_m_cm;
2771 
2772  for (int j = i; j < (i+hgs); j++) {
2773  BigReal dv_x = vel_x[j] - v_cm_x;
2774  BigReal dv_y = vel_y[j] - v_cm_y;
2775  BigReal dv_z = vel_z[j] - v_cm_z;
2776  intK += mass[j] *
2777  (vel_x[j] * dv_x + vel_y[j] * dv_y + vel_z[j] * dv_z);
2778  intV.xx += mass[j] * vel_x[j] * dv_x;
2779  intV.xy += mass[j] * vel_x[j] * dv_y;
2780  intV.xz += mass[j] * vel_x[j] * dv_z;
2781  intV.yx += mass[j] * vel_y[j] * dv_x;
2782  intV.yy += mass[j] * vel_y[j] * dv_y;
2783  intV.yz += mass[j] * vel_y[j] * dv_z;
2784  intV.zx += mass[j] * vel_z[j] * dv_x;
2785  intV.zy += mass[j] * vel_z[j] * dv_y;
2786  intV.zz += mass[j] * vel_z[j] * dv_z;
2787  }
2788  }
2789 #else
2790  // JM: New version with centers of mass calculated by a separate kernel
2791  BigReal v_cm_x = vcm_x[i];
2792  BigReal v_cm_y = vcm_y[i];
2793  BigReal v_cm_z = vcm_z[i];
2794  BigReal dv_x = vel_x[i] - v_cm_x;
2795  BigReal dv_y = vel_y[i] - v_cm_y;
2796  BigReal dv_z = vel_z[i] - v_cm_z;
2797  intK += mass[i] *
2798  (vel_x[i] * dv_x + vel_y[i] * dv_y + vel_z[i] * dv_z);
2799  intV.xx += mass[i] * vel_x[i] * dv_x;
2800  intV.xy += mass[i] * vel_x[i] * dv_y;
2801  intV.xz += mass[i] * vel_x[i] * dv_z;
2802  intV.yx += mass[i] * vel_y[i] * dv_x;
2803  intV.yy += mass[i] * vel_y[i] * dv_y;
2804  intV.yz += mass[i] * vel_y[i] * dv_z;
2805  intV.zx += mass[i] * vel_z[i] * dv_x;
2806  intV.zy += mass[i] * vel_z[i] * dv_y;
2807  intV.zz += mass[i] * vel_z[i] * dv_z;
2808 #endif
2809  }
2810 
2811  typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
2812  __shared__ typename BlockReduce::TempStorage temp_storage;
2813  k = BlockReduce(temp_storage).Sum(k);
2814  __syncthreads();
2815  v.xx = BlockReduce(temp_storage).Sum(v.xx);
2816  __syncthreads();
2817  if (DO_FIXED) {
2818  v.xy = BlockReduce(temp_storage).Sum(v.xy);
2819  __syncthreads();
2820  v.xz = BlockReduce(temp_storage).Sum(v.xz);
2821  __syncthreads();
2822  }
2823  v.yx = BlockReduce(temp_storage).Sum(v.yx);
2824  __syncthreads();
2825  v.yy = BlockReduce(temp_storage).Sum(v.yy);
2826  __syncthreads();
2827  if (DO_FIXED) {
2828  v.yz = BlockReduce(temp_storage).Sum(v.yz);
2829  __syncthreads();
2830  }
2831  v.zx = BlockReduce(temp_storage).Sum(v.zx);
2832  __syncthreads();
2833  v.zy = BlockReduce(temp_storage).Sum(v.zy);
2834  __syncthreads();
2835  v.zz = BlockReduce(temp_storage).Sum(v.zz);
2836  __syncthreads();
2837  intK = BlockReduce(temp_storage).Sum(intK);
2838  __syncthreads();
2839  intV.xx = BlockReduce(temp_storage).Sum(intV.xx);
2840  __syncthreads();
2841  intV.xy = BlockReduce(temp_storage).Sum(intV.xy);
2842  __syncthreads();
2843  intV.xz = BlockReduce(temp_storage).Sum(intV.xz);
2844  __syncthreads();
2845  intV.yx = BlockReduce(temp_storage).Sum(intV.yx);
2846  __syncthreads();
2847  intV.yy = BlockReduce(temp_storage).Sum(intV.yy);
2848  __syncthreads();
2849  intV.yz = BlockReduce(temp_storage).Sum(intV.yz);
2850  __syncthreads();
2851  intV.zx = BlockReduce(temp_storage).Sum(intV.zx);
2852  __syncthreads();
2853  intV.zy = BlockReduce(temp_storage).Sum(intV.zy);
2854  __syncthreads();
2855  intV.zz = BlockReduce(temp_storage).Sum(intV.zz);
2856  __syncthreads();
2857 
2858  if (threadIdx.x == 0) {
2859  const int bin = blockIdx.x % ATOMIC_BINS;
2860 
2861  atomicAdd(&kineticEnergy[bin], k);
2862  atomicAdd(&virial[bin].xx, v.xx);
2863  if (DO_FIXED) {
2864  atomicAdd(&virial[bin].xy, v.xy);
2865  atomicAdd(&virial[bin].xz, v.xz);
2866  }
2867  atomicAdd(&virial[bin].yx, v.yx);
2868  atomicAdd(&virial[bin].yy, v.yy);
2869  if (DO_FIXED) {
2870  atomicAdd(&virial[bin].yz, v.yz);
2871  }
2872  atomicAdd(&virial[bin].zx, v.zx);
2873  atomicAdd(&virial[bin].zy, v.zy);
2874  atomicAdd(&virial[bin].zz, v.zz);
2875  atomicAdd(&intKineticEnergy[bin], intK);
2876  atomicAdd(&intVirialNormal[bin].xx, intV.xx);
2877  atomicAdd(&intVirialNormal[bin].xy, intV.xy);
2878  atomicAdd(&intVirialNormal[bin].xz, intV.xz);
2879  atomicAdd(&intVirialNormal[bin].yx, intV.yx);
2880  atomicAdd(&intVirialNormal[bin].yy, intV.yy);
2881  atomicAdd(&intVirialNormal[bin].yz, intV.yz);
2882  atomicAdd(&intVirialNormal[bin].zx, intV.zx);
2883  atomicAdd(&intVirialNormal[bin].zy, intV.zy);
2884  atomicAdd(&intVirialNormal[bin].zz, intV.zz);
2885 
2886  __threadfence();
2887  unsigned int value = atomicInc(&tbcatomic[1], totaltb);
2888  isLastBlockDone = (value == (totaltb -1));
2889  }
2890  __syncthreads();
2891 
2892 #if 1
2893  if(isLastBlockDone){
2894  if(threadIdx.x < ATOMIC_BINS){
2895  const int bin = threadIdx.x;
2896 
2897  double k = kineticEnergy[bin];
2898  double intK = intKineticEnergy[bin];
2899  cudaTensor v = virial[bin];
2900  cudaTensor intV = intVirialNormal[bin];
2901 
2902  // sets device scalars back to zero
2903  kineticEnergy[bin] = 0.0;
2904  intKineticEnergy[bin] = 0.0;
2905  zero_cudaTensor(virial[bin]);
2906  zero_cudaTensor(intVirialNormal[bin]);
2907 
2908  if(ATOMIC_BINS > 1){
2909  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2910  typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
2911  __shared__ typename WarpReduce::TempStorage tempStorage;
2912  __shared__ typename WarpReduceT::TempStorage tempStorageT;
2913 
2914  k = WarpReduce(tempStorage).Sum(k);
2915  intK = WarpReduce(tempStorage).Sum(intK);
2916  v = WarpReduceT(tempStorageT).Sum(v);
2917  intV = WarpReduceT(tempStorageT).Sum(intV);
2918  }
2919 
2920  if(threadIdx.x == 0){
2921  h_kineticEnergy[0] = k;
2922  h_intKineticEnergy[0] = intK;
2923  h_virial[0] = v;
2924  h_intVirialNormal[0] = intV;
2925 
2926  //resets atomic counter
2927  reset_atomic_counter(&tbcatomic[1]);
2928  }
2929  }
2930  }
2931 #endif
2932 }
2933 
2934 void SequencerCUDAKernel::submitHalf(
2935  const bool doFixedAtoms,
2936  const double *vel_x,
2937  const double *vel_y,
2938  const double *vel_z,
2939  const double *vcm_x,
2940  const double *vcm_y,
2941  const double *vcm_z,
2942  const float *mass,
2943  BigReal *kineticEnergy,
2944  BigReal *intKineticEnergy,
2945  cudaTensor *virial,
2946  cudaTensor *intVirialNormal,
2947  BigReal *h_kineticEnergy,
2948  BigReal *h_intKineticEnergy,
2949  cudaTensor *h_virial,
2950  cudaTensor *h_intVirialNormal,
2951  int *hydrogenGroupSize,
2952  int numAtoms,
2953  unsigned int* tbcatomic,
2954  cudaStream_t stream)
2955 {
2956  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2957  if (doFixedAtoms)
2958  submitHalfKernel<ATOM_BLOCKS, true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2959  vel_x, vel_y, vel_z,
2960  vcm_x, vcm_y, vcm_z, mass,
2961  kineticEnergy, intKineticEnergy,
2962  virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2963  h_virial, h_intVirialNormal,
2964  hydrogenGroupSize, numAtoms, tbcatomic);
2965  else
2966  submitHalfKernel<ATOM_BLOCKS, false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2967  vel_x, vel_y, vel_z,
2968  vcm_x, vcm_y, vcm_z, mass,
2969  kineticEnergy, intKineticEnergy,
2970  virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2971  h_virial, h_intVirialNormal,
2972  hydrogenGroupSize, numAtoms, tbcatomic);
2973  //cudaCheck(cudaGetLastError());
2974 }
2975 
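// Pressure rescaling with useGroupPressure: the center of mass of each
// hydrogen group is transformed by 'factor' about 'origin' and the whole
// group is rigidly translated by the resulting displacement.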
2976 __global__ void scaleCoordinateUseGroupPressureKernel(
2977  double * __restrict pos_x,
2978  double * __restrict pos_y,
2979  double * __restrict pos_z,
2980  float *mass,
2981  int *hydrogenGroupSize,
2982  cudaTensor factor,
2983  cudaVector origin,
2984  int numAtoms)
2985 {
2986  int i = threadIdx.x + blockIdx.x * blockDim.x;
2987  if (i < numAtoms) {
2988  int hgs = hydrogenGroupSize[i];
2989  int j;
2990  if (hgs > 0) {
2991  // missing fixed atoms implementation
2992  BigReal m_cm = 0;
2993  BigReal r_cm_x = 0;
2994  BigReal r_cm_y = 0;
2995  BigReal r_cm_z = 0;
2996  // calculate the center of mass
2997  for ( j = i; j < (i+hgs); ++j ) {
2998  m_cm += mass[j];
2999  r_cm_x += mass[j] * pos_x[j];
3000  r_cm_y += mass[j] * pos_y[j];
3001  r_cm_z += mass[j] * pos_z[j];
3002  }
3003  BigReal inv_m_cm = 1.0/m_cm;
3004  r_cm_x *= inv_m_cm;
3005  r_cm_y *= inv_m_cm;
3006  r_cm_z *= inv_m_cm;
3007  // scale the center of mass with factor
3008  // shift to origin
3009  double tx = r_cm_x - origin.x;
3010  double ty = r_cm_y - origin.y;
3011  double tz = r_cm_z - origin.z;
3012  // apply transformation
3013  double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3014  double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3015  double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3016  // shift back
3017  new_r_cm_x += origin.x;
3018  new_r_cm_y += origin.y;
3019  new_r_cm_z += origin.z;
3020  // translation vector from old COM and new COM
3021  double delta_r_cm_x = new_r_cm_x - r_cm_x;
3022  double delta_r_cm_y = new_r_cm_y - r_cm_y;
3023  double delta_r_cm_z = new_r_cm_z - r_cm_z;
3024  // shift the hydrogen group with translation vector
3025  for (j = i; j < (i+hgs); j++) {
3026  pos_x[j] += delta_r_cm_x;
3027  pos_y[j] += delta_r_cm_y;
3028  pos_z[j] += delta_r_cm_z;
3029  }
3030  }
3031  }
3032 }
3033 
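// Pressure rescaling without group pressure: each atom position is
// transformed by 'factor' about 'origin' independently.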
3034 __global__ void scaleCoordinateNoGroupPressureKernel(
3035  double * __restrict pos_x,
3036  double * __restrict pos_y,
3037  double * __restrict pos_z,
3038  cudaTensor factor,
3039  cudaVector origin,
3040  int numAtoms)
3041 {
3042  int i = threadIdx.x + blockIdx.x * blockDim.x;
3043  if (i < numAtoms) {
3044  // missing fixed atoms implementation
3045  // shift to origin
3046  double tx = pos_x[i] - origin.x;
3047  double ty = pos_y[i] - origin.y;
3048  double tz = pos_z[i] - origin.z;
3049  // apply transformation
3050  double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3051  double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3052  double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3053  // shift back
3054  pos_x[i] = ftx + origin.x;
3055  pos_y[i] = fty + origin.y;
3056  pos_z[i] = ftz + origin.z;
3057  }
3058 }
3059 
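// Builds the inverse permutation: idOrder[globalAtomID] -> local SOA index.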
3060 __global__ void SetAtomIndexOrderKernel(
3061  int *id,
3062  int *idOrder,
3063  int numAtoms)
3064 {
3065  int i = threadIdx.x + blockIdx.x * blockDim.x;
3066  if (i < numAtoms) {
3067  int atomIndex = id[i];
3068  idOrder[atomIndex] = i;
3069  }
3070 }
3071 
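// Rescales whole molecules about their geometric centers. Blocks
// [0, numLargeMolecules) each handle one large molecule using a block
// reduction; the remaining blocks handle one small molecule per thread.
// Coordinates are unwrapped with the old lattice, shifted by the scaled
// geometric-center displacement, and rewrapped with the new lattice.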
3072 __global__ void scaleCoordinateUsingGCKernel(
3073  double* __restrict pos_x,
3074  double* __restrict pos_y,
3075  double* __restrict pos_z,
3076  const int* __restrict idOrder,
3077  const int* __restrict moleculeStartIndex,
3078  const int* __restrict moleculeAtom,
3079  const cudaTensor factor,
3080  const cudaVector origin,
3081  const Lattice oldLattice,
3082  const Lattice newLattice,
3083  const char3* __restrict transform,
3084  const int numMolecules,
3085  const int numLargeMolecules)
3086 {
3087  // missing fixed atoms implementation
3088  int startIdx, endIdx, i, j, jmapped, atomIndex;
3089  double3 position, r_gc, new_r_gc, delta_r_gc;
3090  char3 tr;
3091  r_gc.x = 0; r_gc.y = 0; r_gc.z = 0;
3092  if (blockIdx.x < numLargeMolecules){ //large molecule case
3093  i = blockIdx.x;
3094  startIdx = moleculeStartIndex[i];
3095  endIdx = moleculeStartIndex[i + 1];
3096  __shared__ double3 sh_gc;
3097  double inv_length = 1.0/(double)(endIdx - startIdx);
3098  // calculate the geometric center
3099  for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3100  atomIndex = moleculeAtom[j];
3101  jmapped = idOrder[atomIndex];
3102  tr = transform[jmapped];
3103  position.x = pos_x[jmapped];
3104  position.y = pos_y[jmapped];
3105  position.z = pos_z[jmapped];
3106  //Unwrap the coordinate with oldLattice
3107  position = oldLattice.reverse_transform(position ,tr);
3108  r_gc.x += position.x;
3109  r_gc.y += position.y;
3110  r_gc.z += position.z;
3111  }
3112  __syncthreads();
3113  typedef cub::BlockReduce<double, 64> BlockReduce;
3114  __shared__ typename BlockReduce::TempStorage temp_storage;
3115 
3116  r_gc.x = BlockReduce(temp_storage).Sum(r_gc.x);
3117  __syncthreads();
3118  r_gc.y = BlockReduce(temp_storage).Sum(r_gc.y);
3119  __syncthreads();
3120  r_gc.z = BlockReduce(temp_storage).Sum(r_gc.z);
3121  __syncthreads();
3122 
3123  if (threadIdx.x == 0) {
3124  sh_gc.x = r_gc.x * inv_length;
3125  sh_gc.y = r_gc.y * inv_length;
3126  sh_gc.z = r_gc.z * inv_length;
3127  }
3128  __syncthreads();
3129 
3130  // scale the geometric center with factor
3131  // shift to origin
3132  double tx = sh_gc.x - origin.x;
3133  double ty = sh_gc.y - origin.y;
3134  double tz = sh_gc.z - origin.z;
3135  // apply transformation
3136  new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3137  new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3138  new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3139  // shift back
3140  new_r_gc.x += origin.x;
3141  new_r_gc.y += origin.y;
3142  new_r_gc.z += origin.z;
3143  // translation vector from old GC to new GC
3144  delta_r_gc.x = new_r_gc.x - sh_gc.x;
3145  delta_r_gc.y = new_r_gc.y - sh_gc.y;
3146  delta_r_gc.z = new_r_gc.z - sh_gc.z;
3147 
3148  // shift the atoms in molecule with translation vector
3149  for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3150  atomIndex = moleculeAtom[j];
3151  jmapped = idOrder[atomIndex];
3152  tr = transform[jmapped];
3153  position.x = pos_x[jmapped];
3154  position.y = pos_y[jmapped];
3155  position.z = pos_z[jmapped];
3156  //Unwrap the coordinate with oldLattice
3157  position = oldLattice.reverse_transform(position, tr);
3158  position.x += delta_r_gc.x;
3159  position.y += delta_r_gc.y;
3160  position.z += delta_r_gc.z;
3161  // wrap the coordinate with new lattice parameter
3162  position = newLattice.apply_transform(position, tr);
3163  pos_x[jmapped] = position.x;
3164  pos_y[jmapped] = position.y;
3165  pos_z[jmapped] = position.z;
3166  }
3167  } else { //Small molecule
3168  // numLargeMolecules block has been assigned to large molecule
3169  i = numLargeMolecules + threadIdx.x +
3170  (blockIdx.x - numLargeMolecules) * blockDim.x;
3171 
3172  if (i < numMolecules) {
3173  startIdx = moleculeStartIndex[i];
3174  endIdx = moleculeStartIndex[i+1];
3175  double inv_length = 1.0/(double)(endIdx - startIdx);
3176 
3177  // calculate the geometric center
3178  for ( j = startIdx; j < endIdx; j++ ) {
3179  atomIndex = moleculeAtom[j];
3180  jmapped = idOrder[atomIndex];
3181  tr = transform[jmapped];
3182  position.x = pos_x[jmapped];
3183  position.y = pos_y[jmapped];
3184  position.z = pos_z[jmapped];
3185  //Unwrap the coordinate with oldLattice
3186  position = oldLattice.reverse_transform(position, tr);
3187  r_gc.x += position.x;
3188  r_gc.y += position.y;
3189  r_gc.z += position.z;
3190  }
3191 
3192  r_gc.x *= inv_length;
3193  r_gc.y *= inv_length;
3194  r_gc.z *= inv_length;
3195 
3196  // scale the geometric center with factor
3197  // shift to origin
3198  double tx = r_gc.x - origin.x;
3199  double ty = r_gc.y - origin.y;
3200  double tz = r_gc.z - origin.z;
3201  // apply transformation
3202  new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3203  new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3204  new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3205  // shift back
3206  new_r_gc.x += origin.x;
3207  new_r_gc.y += origin.y;
3208  new_r_gc.z += origin.z;
3209  // translation vector from old GC to new GC
3210  delta_r_gc.x = new_r_gc.x - r_gc.x;
3211  delta_r_gc.y = new_r_gc.y - r_gc.y;
3212  delta_r_gc.z = new_r_gc.z - r_gc.z;
3213 
3214  // shift the atoms in molecule with translation vector
3215  for (j = startIdx; j < endIdx; j++) {
3216  atomIndex = moleculeAtom[j];
3217  jmapped = idOrder[atomIndex];
3218 
3219  tr = transform[jmapped];
3220  position.x = pos_x[jmapped];
3221  position.y = pos_y[jmapped];
3222  position.z = pos_z[jmapped];
3223  // unwrap the coordinate with oldLattice
3224  position = oldLattice.reverse_transform(position, tr);
3225  position.x += delta_r_gc.x;
3226  position.y += delta_r_gc.y;
3227  position.z += delta_r_gc.z;
3228  // wrap the coordinate with new lattice parameter
3229  position = newLattice.apply_transform(position, tr);
3230  pos_x[jmapped] = position.x;
3231  pos_y[jmapped] = position.y;
3232  pos_z[jmapped] = position.z;
3233  }
3234  }
3235  }
3236 }
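// The kernel above rescales coordinates one molecule at a time: unwrap each atom with the
// old lattice, average to get the geometric center, map that center through the affine
// "factor" tensor about "origin", rigidly translate every atom by the center's displacement,
// and rewrap with the new lattice. A minimal host-side sketch of the center transformation,
// using plain illustrative structs rather than the NAMD cudaTensor/cudaVector types:
#if 0
struct Vec3    { double x, y, z; };
struct Tensor3 { double xx, xy, xz, yx, yy, yz, zx, zy, zz; };

// new_gc = origin + factor * (gc - origin); every atom is then shifted by (new_gc - gc)
static Vec3 rescaleCenter(const Vec3 gc, const Tensor3 f, const Vec3 origin) {
  const double tx = gc.x - origin.x;
  const double ty = gc.y - origin.y;
  const double tz = gc.z - origin.z;
  Vec3 out;
  out.x = f.xx*tx + f.xy*ty + f.xz*tz + origin.x;
  out.y = f.yx*tx + f.yy*ty + f.yz*tz + origin.y;
  out.z = f.zx*tx + f.zy*ty + f.zz*tz + origin.z;
  return out;
}
#endif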
3237 
3238 template <bool doFixed>
3239 __global__ void langevinPistonUseGroupPressureKernel(
3240  const int* __restrict atomFixed,
3241  const int* __restrict groupFixed,
3242  const Lattice lattice,
3243  const char3* transform,
3244  const double* __restrict fixedPosition_x,
3245  const double* __restrict fixedPosition_y,
3246  const double* __restrict fixedPosition_z,
3247  double * __restrict pos_x,
3248  double * __restrict pos_y,
3249  double * __restrict pos_z,
3250  double * __restrict vel_x,
3251  double * __restrict vel_y,
3252  double * __restrict vel_z,
3253  float *mass,
3254  int *hydrogenGroupSize,
3255  cudaTensor factor,
3256  cudaVector origin,
3257  double velFactor_x,
3258  double velFactor_y,
3259  double velFactor_z,
3260  int numAtoms)
3261 {
3262  int i = threadIdx.x + blockIdx.x*blockDim.x;
3263  if (i < numAtoms) {
3264  int hgs = hydrogenGroupSize[i];
3265  int j;
3266  if (hgs > 0) {
3267  if (doFixed) {
3268  if (groupFixed[i]) {
3269  for ( j = i; j < (i+hgs); ++j ) {
3270  const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3271  const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3272  pos_x[j] = pos_trans.x;
3273  pos_y[j] = pos_trans.y;
3274  pos_z[j] = pos_trans.z;
3275  }
3276  return;
3277  }
3278  }
3279  BigReal m_cm = 0;
3280  BigReal r_cm_x = 0;
3281  BigReal r_cm_y = 0;
3282  BigReal r_cm_z = 0;
3283  BigReal v_cm_x = 0;
3284  BigReal v_cm_y = 0;
3285  BigReal v_cm_z = 0;
3286  for ( j = i; j < (i+hgs); ++j ) {
3287  if (doFixed) {
3288  if (atomFixed[j]) continue;
3289  }
3290  m_cm += mass[j];
3291  r_cm_x += mass[j] * pos_x[j];
3292  r_cm_y += mass[j] * pos_y[j];
3293  r_cm_z += mass[j] * pos_z[j];
3294  v_cm_x += mass[j] * vel_x[j];
3295  v_cm_y += mass[j] * vel_y[j];
3296  v_cm_z += mass[j] * vel_z[j];
3297  }
3298  BigReal inv_m_cm = 1.0/m_cm;
3299  r_cm_x *= inv_m_cm;
3300  r_cm_y *= inv_m_cm;
3301  r_cm_z *= inv_m_cm;
3302 
3303  double tx = r_cm_x - origin.x;
3304  double ty = r_cm_y - origin.y;
3305  double tz = r_cm_z - origin.z;
3306  double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3307  double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3308  double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3309  new_r_cm_x += origin.x;
3310  new_r_cm_y += origin.y;
3311  new_r_cm_z += origin.z;
3312 
3313  double delta_r_cm_x = new_r_cm_x - r_cm_x;
3314  double delta_r_cm_y = new_r_cm_y - r_cm_y;
3315  double delta_r_cm_z = new_r_cm_z - r_cm_z;
3316  v_cm_x *= inv_m_cm;
3317  v_cm_y *= inv_m_cm;
3318  v_cm_z *= inv_m_cm;
3319  double delta_v_cm_x = ( velFactor_x - 1 ) * v_cm_x;
3320  double delta_v_cm_y = ( velFactor_y - 1 ) * v_cm_y;
3321  double delta_v_cm_z = ( velFactor_z - 1 ) * v_cm_z;
3322  for (j = i; j < (i+hgs); j++) {
3323  if (doFixed) {
3324  if (atomFixed[j]) {
3325  const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3326  const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3327  pos_x[j] = pos_trans.x;
3328  pos_y[j] = pos_trans.y;
3329  pos_z[j] = pos_trans.z;
3330  continue;
3331  }
3332  }
3333  pos_x[j] += delta_r_cm_x;
3334  pos_y[j] += delta_r_cm_y;
3335  pos_z[j] += delta_r_cm_z;
3336  vel_x[j] += delta_v_cm_x;
3337  vel_y[j] += delta_v_cm_y;
3338  vel_z[j] += delta_v_cm_z;
3339  }
3340  }
3341 
3342  }
3343 }
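// In the group-pressure variant above, each hydrogen group is moved rigidly: the group's
// center of mass is mapped through "factor" about "origin" and every non-fixed atom is
// shifted by the resulting COM displacement, while the COM velocity change
// (velFactor - 1) * v_cm is added to each atom's velocity. Fully fixed groups are simply
// restored to their stored reference positions (wrapped with the lattice), and fixed atoms
// are skipped in the COM sums.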
3344 
3345 template <bool doFixed>
3346 __global__ void langevinPistonNoGroupPressureKernel(
3347  const int* __restrict atomFixed,
3348  const Lattice lattice,
3349  const char3* transform,
3350  const double* __restrict fixedPosition_x,
3351  const double* __restrict fixedPosition_y,
3352  const double* __restrict fixedPosition_z,
3353  double * __restrict pos_x,
3354  double * __restrict pos_y,
3355  double * __restrict pos_z,
3356  double * __restrict vel_x,
3357  double * __restrict vel_y,
3358  double * __restrict vel_z,
3359  cudaTensor factor,
3360  cudaVector origin,
3361  double velFactor_x,
3362  double velFactor_y,
3363  double velFactor_z,
3364  int numAtoms)
3365 {
3366  int i = threadIdx.x + blockIdx.x*blockDim.x;
3367  if (i < numAtoms) {
3368  if (doFixed) {
3369  if (atomFixed[i]) {
3370  const double3 pos_origin{fixedPosition_x[i], fixedPosition_y[i], fixedPosition_z[i]};
3371  const auto pos_trans = lattice.apply_transform(pos_origin, transform[i]);
3372  pos_x[i] = pos_trans.x;
3373  pos_y[i] = pos_trans.y;
3374  pos_z[i] = pos_trans.z;
3375  return;
3376  }
3377  }
3378  double tx = pos_x[i] - origin.x;
3379  double ty = pos_y[i] - origin.y;
3380  double tz = pos_z[i] - origin.z;
3381  double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3382  double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3383  double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3384  pos_x[i] = ftx + origin.x;
3385  pos_y[i] = fty + origin.y;
3386  pos_z[i] = ftz + origin.z;
3387  vel_x[i] *= velFactor_x;
3388  vel_y[i] *= velFactor_y;
3389  vel_z[i] *= velFactor_z;
3390  }
3391 }
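// Without group pressure, the barostat acts per atom: each position is mapped through
// "factor" about "origin" and each velocity component is scaled by the corresponding
// velFactor. Fixed atoms are restored to their reference positions and otherwise left
// untouched.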
3392 
3393 void SequencerCUDAKernel::scaleCoordinateWithFactor(
3394  double *pos_x,
3395  double *pos_y,
3396  double *pos_z,
3397  float *mass,
3398  int *hydrogenGroupSize,
3399  cudaTensor factor,
3400  cudaVector origin,
3401  int useGroupPressure,
3402  int numAtoms,
3403  cudaStream_t stream)
3404 {
3405  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3406  if (useGroupPressure) {
3407  scaleCoordinateUseGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3408  pos_x, pos_y, pos_z, mass, hydrogenGroupSize, factor, origin, numAtoms);
3409  } else {
3410  scaleCoordinateNoGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3411  pos_x, pos_y, pos_z, factor, origin, numAtoms);
3412  }
3413 }
3414 
3415 
3416 void SequencerCUDAKernel::SetAtomIndexOrder(
3417  int *id,
3418  int *idOrder,
3419  int numAtoms,
3420  cudaStream_t stream)
3421 {
3422  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3423  SetAtomIndexOrderKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3424  id, idOrder, numAtoms);
3425 }
3426 
3427 void SequencerCUDAKernel::scaleCoordinateUsingGC(
3428  double *pos_x,
3429  double *pos_y,
3430  double *pos_z,
3431  const int *idOrder,
3432  const int *moleculeStartIndex,
3433  const int *moleculeAtom,
3434  const cudaTensor factor,
3435  const cudaVector origin,
3436  const Lattice oldLattice,
3437  const Lattice newLattice,
3438  const char3 *transform,
3439  const int numMolecules,
3440  const int numLargeMolecules,
3441  cudaStream_t stream)
3442 {
3443  // each thread computes the geometric center of one molecule with fewer than
3444  // 128 atoms, while each molecule with 128 atoms or more is handled by a
3445  // full thread block
3446  int numThreadsPerBlock = 64;
3447  int grid = ((numMolecules - numLargeMolecules + numThreadsPerBlock - 1) /
3448  numThreadsPerBlock) + numLargeMolecules;
3449  scaleCoordinateUsingGCKernel<<<grid, numThreadsPerBlock, 0, stream>>>(
3450  pos_x, pos_y, pos_z, idOrder, moleculeStartIndex,
3451  moleculeAtom, factor, origin, oldLattice, newLattice,
3452  transform, numMolecules, numLargeMolecules);
3453 
3454 }
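// Launch geometry: blocks 0 .. numLargeMolecules-1 each handle one large molecule
// (block-wide reduction of its geometric center), and the remaining blocks handle the
// small molecules one per thread. For example, with numMolecules = 1000,
// numLargeMolecules = 4 and 64 threads per block, grid = 4 + ceil(996 / 64) = 4 + 16 = 20.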
3455 
3456 void SequencerCUDAKernel::langevinPiston(
3457  const bool doFixedAtoms,
3458  const int* atomFixed,
3459  const int* groupFixed,
3460  const char3* transform,
3461  const Lattice lattice,
3462  const double* fixedPosition_x,
3463  const double* fixedPosition_y,
3464  const double* fixedPosition_z,
3465  double *pos_x,
3466  double *pos_y,
3467  double *pos_z,
3468  double *vel_x,
3469  double *vel_y,
3470  double *vel_z,
3471  float *mass,
3472  int *hydrogenGroupSize,
3473  cudaTensor factor,
3474  cudaVector origin,
3475  double velFactor_x,
3476  double velFactor_y,
3477  double velFactor_z,
3478  int useGroupPressure,
3479  int numAtoms,
3480  cudaStream_t stream)
3481 {
3482  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3483  #if 0
3484  double *h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3485  double *h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3486  double *h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3487 
3488  double *h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3489  double *h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3490  double *h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3491 
3492  copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3493  copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3494  copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3495 
3496  copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3497  copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3498  copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3499 
3500  fprintf(stderr, "velFactors = %lf %lf %lf\n",
3501  velFactor_x, velFactor_y, velFactor_z);
3502  for(int i = 0; i < numAtoms; i++){
3503  fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3504  h_pos_x[i], h_pos_y[i], h_pos_z[i],
3505  h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3506  }
3507  free(h_pos_x);
3508  free(h_pos_y);
3509  free(h_pos_z);
3510  free(h_vel_x);
3511  free(h_vel_y);
3512  free(h_vel_z);
3513  #endif
3514 
3515  if (useGroupPressure) {
3516  if (doFixedAtoms) {
3517  langevinPistonUseGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3518  atomFixed, groupFixed, lattice, transform,
3519  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3520  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3521  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3522  } else {
3523  langevinPistonUseGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3524  atomFixed, groupFixed, lattice, transform,
3525  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3526  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3527  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3528  }
3529  } else {
3530  if (doFixedAtoms) {
3531  langevinPistonNoGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3532  atomFixed, lattice, transform,
3533  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3534  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3535  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3536  } else {
3537  langevinPistonNoGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3538  atomFixed, lattice, transform,
3539  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3540  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3541  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3542  }
3543  }
3544  //cudaCheck(cudaGetLastError());
3545 #if 0
3546  h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3547  h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3548  h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3549 
3550  h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3551  h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3552  h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3553 
3554  copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3555  copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3556  copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3557 
3558  copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3559  copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3560  copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3561 
3562  for(int i = 0; i < numAtoms; i++){
3563  fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3564  h_pos_x[i], h_pos_y[i], h_pos_z[i],
3565  h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3566  }
3567 #endif
3568 }
3569 
3570 template <int BLOCK_SIZE>
3571 __global__ void submitReduction1Kernel(
3572  double * __restrict pos_x,
3573  double * __restrict pos_y,
3574  double * __restrict pos_z,
3575  double * __restrict vel_x,
3576  double * __restrict vel_y,
3577  double * __restrict vel_z,
3578  float * __restrict mass,
3579  // TODO: wrap these scalars in a struct
3580  BigReal *kineticEnergy,
3581  BigReal *momentum_x,
3582  BigReal *momentum_y,
3583  BigReal *momentum_z,
3584  BigReal *angularMomentum_x,
3585  BigReal *angularMomentum_y,
3586  BigReal *angularMomentum_z,
3587  BigReal origin_x,
3588  BigReal origin_y,
3589  BigReal origin_z,
3590  BigReal *h_kineticEnergy,
3591  BigReal *h_momentum_x,
3592  BigReal *h_momentum_y,
3593  BigReal *h_momentum_z,
3594  BigReal *h_angularMomentum_x,
3595  BigReal *h_angularMomentum_y,
3596  BigReal *h_angularMomentum_z,
3597  unsigned int* tbcatomic,
3598  int numAtoms)
3599 {
3600  BigReal m_x = 0, m_y = 0, m_z = 0,
3601  a_x = 0, a_y = 0, a_z = 0;
3602  int i = threadIdx.x + blockIdx.x*blockDim.x;
3603  int totaltb = gridDim.x;
3604  __shared__ bool isLastBlockDone;
3605 
3606  if(threadIdx.x == 0){
3607  isLastBlockDone = 0;
3608  }
3609  __syncthreads();
3610 
3611  if (i < numAtoms) {
3612  // scalar kineticEnergy += mass[i] * dot_product(vel[i], vel[i])
3613  // k += mass[i] *
3614  // (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
3615 
3616  // vector momentum += mass[i] * vel[i]
3617  m_x += mass[i] * vel_x[i];
3618  m_y += mass[i] * vel_y[i];
3619  m_z += mass[i] * vel_z[i];
3620 
3621  // vector dpos = pos[i] - origin
3622  BigReal dpos_x = pos_x[i] - origin_x;
3623  BigReal dpos_y = pos_y[i] - origin_y;
3624  BigReal dpos_z = pos_z[i] - origin_z;
3625 
3626  // vector angularMomentum += mass[i] * cross_product(dpos, vel[i])
3627  a_x += mass[i] * (dpos_y*vel_z[i] - dpos_z*vel_y[i]);
3628  a_y += mass[i] * (dpos_z*vel_x[i] - dpos_x*vel_z[i]);
3629  a_z += mass[i] * (dpos_x*vel_y[i] - dpos_y*vel_x[i]);
3630  }
3631  typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
3632  __shared__ typename BlockReduce::TempStorage temp_storage;
3633  // k = BlockReduce(temp_storage).Sum(k);
3634  // __syncthreads();
3635 
3636  m_x = BlockReduce(temp_storage).Sum(m_x);
3637  __syncthreads();
3638  m_y = BlockReduce(temp_storage).Sum(m_y);
3639  __syncthreads();
3640  m_z = BlockReduce(temp_storage).Sum(m_z);
3641  __syncthreads();
3642  a_x = BlockReduce(temp_storage).Sum(a_x);
3643  __syncthreads();
3644  a_y = BlockReduce(temp_storage).Sum(a_y);
3645  __syncthreads();
3646  a_z = BlockReduce(temp_storage).Sum(a_z);
3647  __syncthreads();
3648 
3649  if (threadIdx.x == 0) {
3650  const int bin = blockIdx.x % ATOMIC_BINS;
3651 
3652  // atomicAdd(&kineticEnergy[bin], k);
3653  atomicAdd(&momentum_x[bin], m_x);
3654  atomicAdd(&momentum_y[bin], m_y);
3655  atomicAdd(&momentum_z[bin], m_z);
3656  atomicAdd(&angularMomentum_x[bin], a_x);
3657  atomicAdd(&angularMomentum_y[bin], a_y);
3658  atomicAdd(&angularMomentum_z[bin], a_z);
3659  __threadfence();
3660  unsigned int value = atomicInc(&tbcatomic[0], totaltb);
3661  isLastBlockDone = (value == (totaltb-1));
3662  }
3663 
3664  __syncthreads();
3665 
3666 #if 1
3667  if(isLastBlockDone){
3668  if(threadIdx.x < ATOMIC_BINS){
3669  const int bin = threadIdx.x;
3670 
3671  double m_x = momentum_x[bin];
3672  double m_y = momentum_y[bin];
3673  double m_z = momentum_z[bin];
3674  double a_x = angularMomentum_x[bin];
3675  double a_y = angularMomentum_y[bin];
3676  double a_z = angularMomentum_z[bin];
3677 
3678  // sets device scalars back to zero
3679  kineticEnergy[0] = 0.0;
3680  momentum_x[bin] = 0.0;
3681  momentum_y[bin] = 0.0;
3682  momentum_z[bin] = 0.0;
3683  angularMomentum_x[bin] = 0.0;
3684  angularMomentum_y[bin] = 0.0;
3685  angularMomentum_z[bin] = 0.0;
3686 
3687  if(ATOMIC_BINS > 1){
3688  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
3689  __shared__ typename WarpReduce::TempStorage tempStorage;
3690 
3691  m_x = WarpReduce(tempStorage).Sum(m_x);
3692  m_y = WarpReduce(tempStorage).Sum(m_y);
3693  m_z = WarpReduce(tempStorage).Sum(m_z);
3694  a_x = WarpReduce(tempStorage).Sum(a_x);
3695  a_y = WarpReduce(tempStorage).Sum(a_y);
3696  a_z = WarpReduce(tempStorage).Sum(a_z);
3697  }
3698 
3699  if(threadIdx.x == 0){
3700  h_momentum_x[0] = m_x;
3701  h_momentum_y[0] = m_y;
3702  h_momentum_z[0] = m_z;
3703  h_angularMomentum_x[0] = a_x;
3704  h_angularMomentum_y[0] = a_y;
3705  h_angularMomentum_z[0] = a_z;
3706 
3707  //resets atomic counter
3708  reset_atomic_counter(&tbcatomic[0]);
3709  }
3710  }
3711  }
3712 #endif
3713 }
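// submitReduction1Kernel uses a grid-wide "last block done" reduction: each block reduces
// its partial momenta with CUB, folds them into ATOMIC_BINS device accumulators with
// atomicAdd, and the final block (detected with atomicInc on tbcatomic[0]) collapses the
// bins and writes the totals to host-mapped memory. A standalone sketch of the same
// pattern with a single accumulator instead of ATOMIC_BINS (illustrative only, not the
// NAMD interface; assumes a block size of 128):
#if 0
__global__ void gridSum(const double* in, double* d_partial, double* h_result,
                        unsigned int* counter, int n) {
  typedef cub::BlockReduce<double, 128> BlockReduce;
  __shared__ typename BlockReduce::TempStorage tmp;
  __shared__ bool lastBlockDone;
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  double v = (i < n) ? in[i] : 0.0;
  v = BlockReduce(tmp).Sum(v);
  if (threadIdx.x == 0) {
    atomicAdd(d_partial, v);                     // fold this block's sum into device memory
    __threadfence();                             // make it visible to the other blocks
    const unsigned int done = atomicInc(counter, gridDim.x);
    lastBlockDone = (done == gridDim.x - 1);     // true only in the last block to finish
  }
  __syncthreads();
  if (lastBlockDone && threadIdx.x == 0) {
    h_result[0] = d_partial[0];                  // publish the result to host-mapped memory
    d_partial[0] = 0.0;                          // reset accumulator and counter
    counter[0] = 0;
  }
}
#endif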
3714 
3715 
3716 // JM: does addForceToMomentum, maximumMove, and addVelocityToPosition
3717 template <bool DO_VEL_RESCALING, bool DO_FIXED>
3718 __global__ void velocityVerlet1Kernel(
3719  const int step,
3720  const double scaling,
3721  const double dt_normal,
3722  const double dt_nbond,
3723  const double dt_slow,
3724  const double velrescaling, // for stochastic velocity rescaling
3725  const double* __restrict recipMass,
3726  double* __restrict vel_x,
3727  double* __restrict vel_y,
3728  double* __restrict vel_z,
3729  const double maxvel2,
3730  int* h_killme,
3731  double* __restrict pos_x,
3732  double* __restrict pos_y,
3733  double* __restrict pos_z,
3734  double* h_pos_x,
3735  double* h_pos_y,
3736  double* h_pos_z,
3737  double* __restrict f_normal_x,
3738  double* __restrict f_normal_y,
3739  double* __restrict f_normal_z,
3740  double* __restrict f_nbond_x,
3741  double* __restrict f_nbond_y,
3742  double* __restrict f_nbond_z,
3743  double* __restrict f_slow_x,
3744  double* __restrict f_slow_y,
3745  double* __restrict f_slow_z,
3746  const int* atomFixed,
3747  const int numAtoms,
3748  const int maxForceNumber
3749  ){
3750  // fusion of addForceToMomentum, maximumMove, addVelocityToPosition
3751  double dt, dt_b, dt_nb, dt_s;
3752  int i = threadIdx.x + blockIdx.x*blockDim.x;
3753 
3754  if(i < numAtoms){
3755  dt = dt_normal;
3756  double velx = vel_x[i];
3757  double vely = vel_y[i];
3758  double velz = vel_z[i];
3759  if (DO_VEL_RESCALING) {
3760  velx *= velrescaling;
3761  vely *= velrescaling;
3762  velz *= velrescaling;
3763  }
3764  // JM NOTE: these need to be patch-centered
3765  double posx = pos_x[i];
3766  double posy = pos_y[i];
3767  double posz = pos_z[i];
3768  double rmass = recipMass[i];
3769  /* addForceToMomentum*/
3770  // keep velocities in registers so I can access them faster when calculating positions!
3771  switch(maxForceNumber){
3772  case 2:
3773  dt_s = dt_slow * scaling;
3774  velx += f_slow_x[i] * rmass * dt_s;
3775  vely += f_slow_y[i] * rmass * dt_s;
3776  velz += f_slow_z[i] * rmass * dt_s;
3777  // f_slow_x[i] = 0.0;
3778  // f_slow_y[i] = 0.0;
3779  // f_slow_z[i] = 0.0;
3780  case 1:
3781  dt_nb = dt_nbond * scaling;
3782  velx += f_nbond_x[i] * rmass * dt_nb;
3783  vely += f_nbond_y[i] * rmass * dt_nb;
3784  velz += f_nbond_z[i] * rmass * dt_nb;
3785  // f_nbond_x[i] = 0.0;
3786  // f_nbond_y[i] = 0.0;
3787  // f_nbond_z[i] = 0.0;
3788  case 0:
3789  dt_b = dt_normal * scaling;
3790  velx += f_normal_x[i] * rmass * dt_b;
3791  vely += f_normal_y[i] * rmass * dt_b;
3792  velz += f_normal_z[i] * rmass * dt_b;
3793  // f_normal_x[i] = 0.0;
3794  // f_normal_y[i] = 0.0;
3795  // f_normal_z[i] = 0.0;
3796  }
3797 
3798  // --------------------------------------------------------
3799 
3800  // -- MaximumMove --
3801  double vel2 = velx * velx + vely * vely + velz * velz;
3802  if(vel2 > maxvel2) atomicAdd(h_killme, 1); // flags a too-fast atom; dynamics is aborted later, so performance does not matter here
3803  // --------------------------------------------------------
3804 
3805  // -- AddVelocityToPosition --
3806  // ---------------------------------------------------------
3807  if (!DO_FIXED) {
3808  posx += velx * dt;
3809  posy += vely * dt;
3810  posz += velz * dt;
3811  pos_x[i] = posx;
3812  pos_y[i] = posy;
3813  pos_z[i] = posz;
3814  vel_x[i] = velx;
3815  vel_y[i] = vely;
3816  vel_z[i] = velz;
3817  } else {
3818  // Haochuan: for fixed atoms, we need to clear their velocities (see HomePatch::addForceToMomentum)
3819  if (!atomFixed[i]) {
3820  posx += velx * dt;
3821  posy += vely * dt;
3822  posz += velz * dt;
3823  pos_x[i] = posx;
3824  pos_y[i] = posy;
3825  pos_z[i] = posz;
3826  vel_x[i] = velx;
3827  vel_y[i] = vely;
3828  vel_z[i] = velz;
3829  } else {
3830  vel_x[i] = 0;
3831  vel_y[i] = 0;
3832  vel_z[i] = 0;
3833  }
3834  }
3835  }
3836 
3837 }
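// Per atom, the fused kernel above performs the half-kick
//   v += (scaling * dt_level / m) * F_level
// for every active force level (slow, nonbonded, normal, selected by maxForceNumber via
// switch fall-through), followed by the drift x += dt_normal * v. Atoms whose squared
// speed exceeds maxvel2 only increment h_killme so the host can abort the run, and when
// DO_FIXED is set, fixed atoms keep their positions and have their velocities cleared.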
3838 
3839 void SequencerCUDAKernel::velocityVerlet1(
3840  const bool doFixedAtoms,
3841  const int step,
3842  const double scaling,
3843  const double dt_normal,
3844  const double dt_nbond,
3845  const double dt_slow,
3846  const double velrescaling, // for stochastic velocity rescaling
3847  const double *recipMass,
3848  double* vel_x,
3849  double* vel_y,
3850  double* vel_z,
3851  double maxvel2,
3852  int* h_killme,
3853  double* pos_x,
3854  double* pos_y,
3855  double* pos_z,
3856  double* h_pos_x,
3857  double* h_pos_y,
3858  double* h_pos_z,
3859  double* f_normal_x,
3860  double* f_normal_y,
3861  double* f_normal_z,
3862  double* f_nbond_x,
3863  double* f_nbond_y,
3864  double* f_nbond_z,
3865  double* f_slow_x,
3866  double* f_slow_y,
3867  double* f_slow_z,
3868  const int* atomFixed,
3869  const int numAtoms,
3870  const int maxForceNumber,
3871  cudaStream_t stream)
3872 {
3873  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3874  const bool doVelRescaling = (velrescaling != 1.0);
3875 #define CALL(DO_VEL_RESCALING, DO_FIXED) \
3876  velocityVerlet1Kernel<DO_VEL_RESCALING, DO_FIXED> \
3877  <<<grid, ATOM_BLOCKS, 0, stream>>>( \
3878  step, scaling, dt_normal, dt_nbond, dt_slow, velrescaling, recipMass, \
3879  vel_x, vel_y, vel_z, maxvel2, h_killme, \
3880  pos_x, pos_y, pos_z, h_pos_x, h_pos_y, h_pos_z, \
3881  f_normal_x, f_normal_y, f_normal_z, \
3882  f_nbond_x, f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z, \
3883  atomFixed, numAtoms, maxForceNumber \
3884  )
3885  if ( doVelRescaling && doFixedAtoms) CALL(true, true);
3886  else if ( doVelRescaling && !doFixedAtoms) CALL(true, false);
3887  else if (!doVelRescaling && doFixedAtoms) CALL(false, true);
3888  else if (!doVelRescaling && !doFixedAtoms) CALL(false, false);
3889  else {
3890  NAMD_bug("No kernel was called in SequencerCUDAKernel::velocityVerlet1!\n");
3891  }
3892 #undef CALL
3893 }
3894 
3895 void SequencerCUDAKernel::submitReduction1(
3896  double *pos_x,
3897  double *pos_y,
3898  double *pos_z,
3899  double *vel_x,
3900  double *vel_y,
3901  double *vel_z,
3902  float *mass,
3903  BigReal *kineticEnergy,
3904  BigReal *momentum_x,
3905  BigReal *momentum_y,
3906  BigReal *momentum_z,
3907  BigReal *angularMomentum_x,
3908  BigReal *angularMomentum_y,
3909  BigReal *angularMomentum_z,
3910  BigReal origin_x,
3911  BigReal origin_y,
3912  BigReal origin_z,
3913  BigReal *h_kineticEnergy,
3914  BigReal *h_momentum_x,
3915  BigReal *h_momentum_y,
3916  BigReal *h_momentum_z,
3917  BigReal *h_angularMomentum_x,
3918  BigReal *h_angularMomentum_y,
3919  BigReal *h_angularMomentum_z,
3920  unsigned int* tbcatomic,
3921  int numAtoms,
3922  cudaStream_t stream)
3923 {
3924  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3925  submitReduction1Kernel<ATOM_BLOCKS><<<grid, ATOM_BLOCKS, 0, stream>>>(
3926  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass,
3927  kineticEnergy, momentum_x, momentum_y, momentum_z,
3928  angularMomentum_x, angularMomentum_y, angularMomentum_z,
3929  origin_x, origin_y, origin_z, h_kineticEnergy, h_momentum_x, h_momentum_y,
3930  h_momentum_z, h_angularMomentum_x, h_angularMomentum_y, h_angularMomentum_z,
3931  tbcatomic, numAtoms);
3932  //cudaCheck(cudaGetLastError());
3933 }
3934 
3935 template <int BLOCK_SIZE, bool DO_FIXED, bool DO_MTS>
3936 __global__ void submitReduction2Kernel(
3937  const int* __restrict atomFixed,
3938  const double * __restrict pos_x,
3939  const double * __restrict pos_y,
3940  const double * __restrict pos_z,
3941  const double * __restrict vel_x,
3942  const double * __restrict vel_y,
3943  const double * __restrict vel_z,
3944  const double * __restrict rcm_x,
3945  const double * __restrict rcm_y,
3946  const double * __restrict rcm_z,
3947  const double * __restrict vcm_x,
3948  const double * __restrict vcm_y,
3949  const double * __restrict vcm_z,
3950  const double * __restrict f_normal_x,
3951  const double * __restrict f_normal_y,
3952  const double * __restrict f_normal_z,
3953  const double * __restrict f_nbond_x,
3954  const double * __restrict f_nbond_y,
3955  const double * __restrict f_nbond_z,
3956  const double * __restrict f_slow_x,
3957  const double * __restrict f_slow_y,
3958  const double * __restrict f_slow_z,
3959  const float * __restrict mass,
3960  const int * __restrict hydrogenGroupSize,
3961  BigReal *kineticEnergy,
3962  BigReal *h_kineticEnergy,
3963  BigReal *intKineticEnergy,
3964  BigReal *h_intKineticEnergy,
3965  cudaTensor *intVirialNormal,
3966  cudaTensor *intVirialNbond,
3967  cudaTensor *intVirialSlow,
3968  cudaTensor *h_intVirialNormal,
3969  cudaTensor *h_intVirialNbond,
3970  cudaTensor *h_intVirialSlow,
3971  cudaTensor* rigidVirial,
3972  cudaTensor* h_rigidVirial,
3973  unsigned int *tbcatomic,
3974  const int numAtoms,
3975  const int maxForceNumber)
3976 {
3977  BigReal K = 0;
3978  BigReal intK = 0;
3979  cudaTensor intVNormal, intVNbond, intVSlow;
3980  zero_cudaTensor(intVNormal);
3981  if (DO_MTS) {
3982  zero_cudaTensor(intVNbond);
3983  zero_cudaTensor(intVSlow);
3984  }
3985  int i = threadIdx.x + blockIdx.x*blockDim.x;
3986  __shared__ bool isLastBlockDone;
3987  int totaltb = gridDim.x;
3988 
3989  if(threadIdx.x == 0){
3990  isLastBlockDone = false;
3991  }
3992  __syncthreads();
3993 
3994  if (i < numAtoms) {
3995 #if 0
3996  int hgs = hydrogenGroupSize[i];
3997  // let's see, I have the hydrogenGroupSize, but that's not what I need;
3998  if (hgs > 0) {
3999  int j;
4000  BigReal m_cm = 0;
4001  BigReal r_cm_x = 0;
4002  BigReal r_cm_y = 0;
4003  BigReal r_cm_z = 0;
4004  BigReal v_cm_x = 0;
4005  BigReal v_cm_y = 0;
4006  BigReal v_cm_z = 0;
4007  for ( j = i; j < (i+hgs); ++j ) {
4008  r_mass[j -i ] = mass[j];
4009  r_pos[j - i].x = pos_x[j];
4010  r_pos[j - i].y = pos_y[j];
4011  r_pos[j - i].z = pos_z[j];
4012  r_vel[j - i].x = vel_x[j];
4013  r_vel[j - i].y = vel_y[j];
4014  r_vel[j - i].z = vel_z[j];
4015  // m_cm += mass[j];
4016  // r_cm_x += mass[j] * pos_x[j];
4017  // r_cm_y += mass[j] * pos_y[j];
4018  // r_cm_z += mass[j] * pos_z[j];
4019  // v_cm_x += mass[j] * vel_x[j];
4020  // v_cm_y += mass[j] * vel_y[j];
4021  // v_cm_z += mass[j] * vel_z[j];
4022  m_cm += r_mass[j - i];
4023  r_cm_x += r_mass[j - i] * r_pos[j-i].x;
4024  r_cm_y += r_mass[j - i] * r_pos[j-i].y;
4025  r_cm_z += r_mass[j - i] * r_pos[j-i].z;
4026  v_cm_x += r_mass[j - i] * r_vel[j-i].x;
4027  v_cm_y += r_mass[j - i] * r_vel[j-i].y;
4028  v_cm_z += r_mass[j - i] * r_vel[j-i].z;
4029  }
4030  BigReal inv_m_cm = 1.0/m_cm;
4031  r_cm_x *= inv_m_cm;
4032  r_cm_y *= inv_m_cm;
4033  r_cm_z *= inv_m_cm;
4034  v_cm_x *= inv_m_cm;
4035  v_cm_y *= inv_m_cm;
4036  v_cm_z *= inv_m_cm;
4037 
4038  // XXX removed pairInteraction
4039  for ( j = i; j < (i+hgs); ++j ) {
4040  // XXX removed fixed atoms
4041 
4042  // vector vel[j] used twice below
4043  BigReal v_x = r_vel[j-i].x;
4044  BigReal v_y = r_vel[j-i].y;
4045  BigReal v_z = r_vel[j-i].z;
4046 
4047  // vector dv = vel[j] - v_cm
4048  BigReal dv_x = v_x - v_cm_x;
4049  BigReal dv_y = v_y - v_cm_y;
4050  BigReal dv_z = v_z - v_cm_z;
4051 
4052  // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4053  //intK += mass[j] *
4054  // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4055  intK += r_mass[j-i] *
4056  (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4057 
4058  // vector dr = pos[j] - r_cm
4059  // BigReal dr_x = pos_x[j] - r_cm_x;
4060  // BigReal dr_y = pos_y[j] - r_cm_y;
4061  // BigReal dr_z = pos_z[j] - r_cm_z;
4062 
4063  BigReal dr_x = r_pos[j -i].x - r_cm_x;
4064  BigReal dr_y = r_pos[j -i].y - r_cm_y;
4065  BigReal dr_z = r_pos[j -i].z - r_cm_z;
4066 
4067  // tensor intVirialNormal += outer_product(f_normal[j], dr)
4068 
4069  // we're not going to make this function any faster if we don't fix
4070  // the global memory access pattern
4071  intVNormal.xx += f_normal_x[j] * dr_x;
4072  intVNormal.xy += f_normal_x[j] * dr_y;
4073  intVNormal.xz += f_normal_x[j] * dr_z;
4074  intVNormal.yx += f_normal_y[j] * dr_x;
4075  intVNormal.yy += f_normal_y[j] * dr_y;
4076  intVNormal.yz += f_normal_y[j] * dr_z;
4077  intVNormal.zx += f_normal_z[j] * dr_x;
4078  intVNormal.zy += f_normal_z[j] * dr_y;
4079  intVNormal.zz += f_normal_z[j] * dr_z;
4080 
4081  if (maxForceNumber >= 1) {
4082  // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4083  intVNbond.xx += f_nbond_x[j] * dr_x;
4084  intVNbond.xy += f_nbond_x[j] * dr_y;
4085  intVNbond.xz += f_nbond_x[j] * dr_z;
4086  intVNbond.yx += f_nbond_y[j] * dr_x;
4087  intVNbond.yy += f_nbond_y[j] * dr_y;
4088  intVNbond.yz += f_nbond_y[j] * dr_z;
4089  intVNbond.zx += f_nbond_z[j] * dr_x;
4090  intVNbond.zy += f_nbond_z[j] * dr_y;
4091  intVNbond.zz += f_nbond_z[j] * dr_z;
4092  }
4093 
4094  if (maxForceNumber >= 2) {
4095  // tensor intVirialSlow += outer_product(f_slow[j], dr)
4096  intVSlow.xx += f_slow_x[j] * dr_x;
4097  intVSlow.xy += f_slow_x[j] * dr_y;
4098  intVSlow.xz += f_slow_x[j] * dr_z;
4099  intVSlow.yx += f_slow_y[j] * dr_x;
4100  intVSlow.yy += f_slow_y[j] * dr_y;
4101  intVSlow.yz += f_slow_y[j] * dr_z;
4102  intVSlow.zx += f_slow_z[j] * dr_x;
4103  intVSlow.zy += f_slow_z[j] * dr_y;
4104  intVSlow.zz += f_slow_z[j] * dr_z;
4105  }
4106  }
4107  }
4108 #else
4109  // Haochuan: I need to skip the fixed atoms
4110  if (!DO_FIXED || (DO_FIXED && !atomFixed[i])) {
4111  BigReal r_cm_x = rcm_x[i];
4112  BigReal r_cm_y = rcm_y[i];
4113  BigReal r_cm_z = rcm_z[i];
4114  BigReal v_cm_x = vcm_x[i];
4115  BigReal v_cm_y = vcm_y[i];
4116  BigReal v_cm_z = vcm_z[i];
4117 
4118  BigReal v_x = vel_x[i];
4119  BigReal v_y = vel_y[i];
4120  BigReal v_z = vel_z[i];
4121 
4122  // vector dv = vel[j] - v_cm
4123  BigReal dv_x = v_x - v_cm_x;
4124  BigReal dv_y = v_y - v_cm_y;
4125  BigReal dv_z = v_z - v_cm_z;
4126 
4127  // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4128  //intK += mass[j] *
4129  // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4130  K += mass[i] *
4131  (v_x * v_x + v_y*v_y + v_z*v_z);
4132  intK += mass[i] *
4133  (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4134 
4135  // vector dr = pos[j] - r_cm
4136  // BigReal dr_x = pos_x[j] - r_cm_x;
4137  // BigReal dr_y = pos_y[j] - r_cm_y;
4138  // BigReal dr_z = pos_z[j] - r_cm_z;
4139 
4140  BigReal dr_x = pos_x[i] - r_cm_x;
4141  BigReal dr_y = pos_y[i] - r_cm_y;
4142  BigReal dr_z = pos_z[i] - r_cm_z;
4143 
4144  // tensor intVirialNormal += outer_product(f_normal[j], dr)
4145 
4146  // we're not going to make this function any faster if we don't fix
4147  // the global memory access pattern
4148  intVNormal.xx += f_normal_x[i] * dr_x;
4149  intVNormal.xy += f_normal_x[i] * dr_y;
4150  intVNormal.xz += f_normal_x[i] * dr_z;
4151  intVNormal.yx += f_normal_y[i] * dr_x;
4152  intVNormal.yy += f_normal_y[i] * dr_y;
4153  intVNormal.yz += f_normal_y[i] * dr_z;
4154  intVNormal.zx += f_normal_z[i] * dr_x;
4155  intVNormal.zy += f_normal_z[i] * dr_y;
4156  intVNormal.zz += f_normal_z[i] * dr_z;
4157 
4158  if (maxForceNumber >= 1) {
4159  // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4160  // Haochuan: For an MTS simulation we have to calculate intVNbond separately
4161  if (DO_MTS) {
4162  intVNbond.xx += f_nbond_x[i] * dr_x;
4163  intVNbond.xy += f_nbond_x[i] * dr_y;
4164  intVNbond.xz += f_nbond_x[i] * dr_z;
4165  intVNbond.yx += f_nbond_y[i] * dr_x;
4166  intVNbond.yy += f_nbond_y[i] * dr_y;
4167  intVNbond.yz += f_nbond_y[i] * dr_z;
4168  intVNbond.zx += f_nbond_z[i] * dr_x;
4169  intVNbond.zy += f_nbond_z[i] * dr_y;
4170  intVNbond.zz += f_nbond_z[i] * dr_z;
4171  } else {
4172  // If this is not an MTS simulation, then we can sum the internal virials into a single one (intVNormal)
4173  intVNormal.xx += f_nbond_x[i] * dr_x;
4174  intVNormal.xy += f_nbond_x[i] * dr_y;
4175  intVNormal.xz += f_nbond_x[i] * dr_z;
4176  intVNormal.yx += f_nbond_y[i] * dr_x;
4177  intVNormal.yy += f_nbond_y[i] * dr_y;
4178  intVNormal.yz += f_nbond_y[i] * dr_z;
4179  intVNormal.zx += f_nbond_z[i] * dr_x;
4180  intVNormal.zy += f_nbond_z[i] * dr_y;
4181  intVNormal.zz += f_nbond_z[i] * dr_z;
4182  }
4183  }
4184 
4185  if (maxForceNumber >= 2) {
4186  // tensor intVirialSlow += outer_product(f_slow[j], dr)
4187  if (DO_MTS) {
4188  intVSlow.xx += f_slow_x[i] * dr_x;
4189  intVSlow.xy += f_slow_x[i] * dr_y;
4190  intVSlow.xz += f_slow_x[i] * dr_z;
4191  intVSlow.yx += f_slow_y[i] * dr_x;
4192  intVSlow.yy += f_slow_y[i] * dr_y;
4193  intVSlow.yz += f_slow_y[i] * dr_z;
4194  intVSlow.zx += f_slow_z[i] * dr_x;
4195  intVSlow.zy += f_slow_z[i] * dr_y;
4196  intVSlow.zz += f_slow_z[i] * dr_z;
4197  } else {
4198  intVNormal.xx += f_slow_x[i] * dr_x;
4199  intVNormal.xy += f_slow_x[i] * dr_y;
4200  intVNormal.xz += f_slow_x[i] * dr_z;
4201  intVNormal.yx += f_slow_y[i] * dr_x;
4202  intVNormal.yy += f_slow_y[i] * dr_y;
4203  intVNormal.yz += f_slow_y[i] * dr_z;
4204  intVNormal.zx += f_slow_z[i] * dr_x;
4205  intVNormal.zy += f_slow_z[i] * dr_y;
4206  intVNormal.zz += f_slow_z[i] * dr_z;
4207  }
4208  }
4209  }
4210 #endif
4211  }
4212 
4213  typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
4214  __shared__ typename BlockReduce::TempStorage temp_storage;
4215  // XXX TODO: hand-writing these reductions might avoid much of the overhead
4216  // of the repeated CUB block reductions and __syncthreads() calls
4217 
4218  K = BlockReduce(temp_storage).Sum(K);
4219  __syncthreads();
4220  intK = BlockReduce(temp_storage).Sum(intK);
4221  __syncthreads();
4222  intVNormal.xx = BlockReduce(temp_storage).Sum(intVNormal.xx);
4223  __syncthreads();
4224  intVNormal.xy = BlockReduce(temp_storage).Sum(intVNormal.xy);
4225  __syncthreads();
4226  intVNormal.xz = BlockReduce(temp_storage).Sum(intVNormal.xz);
4227  __syncthreads();
4228  intVNormal.yx = BlockReduce(temp_storage).Sum(intVNormal.yx);
4229  __syncthreads();
4230  intVNormal.yy = BlockReduce(temp_storage).Sum(intVNormal.yy);
4231  __syncthreads();
4232  intVNormal.yz = BlockReduce(temp_storage).Sum(intVNormal.yz);
4233  __syncthreads();
4234  intVNormal.zx = BlockReduce(temp_storage).Sum(intVNormal.zx);
4235  __syncthreads();
4236  intVNormal.zy = BlockReduce(temp_storage).Sum(intVNormal.zy);
4237  __syncthreads();
4238  intVNormal.zz = BlockReduce(temp_storage).Sum(intVNormal.zz);
4239  __syncthreads();
4240  if (DO_MTS) {
4241  if (maxForceNumber >= 1) {
4242  intVNbond.xx = BlockReduce(temp_storage).Sum(intVNbond.xx);
4243  __syncthreads();
4244  intVNbond.xy = BlockReduce(temp_storage).Sum(intVNbond.xy);
4245  __syncthreads();
4246  intVNbond.xz = BlockReduce(temp_storage).Sum(intVNbond.xz);
4247  __syncthreads();
4248  intVNbond.yx = BlockReduce(temp_storage).Sum(intVNbond.yx);
4249  __syncthreads();
4250  intVNbond.yy = BlockReduce(temp_storage).Sum(intVNbond.yy);
4251  __syncthreads();
4252  intVNbond.yz = BlockReduce(temp_storage).Sum(intVNbond.yz);
4253  __syncthreads();
4254  intVNbond.zx = BlockReduce(temp_storage).Sum(intVNbond.zx);
4255  __syncthreads();
4256  intVNbond.zy = BlockReduce(temp_storage).Sum(intVNbond.zy);
4257  __syncthreads();
4258  intVNbond.zz = BlockReduce(temp_storage).Sum(intVNbond.zz);
4259  __syncthreads();
4260  }
4261  if (maxForceNumber >= 2) {
4262  intVSlow.xx = BlockReduce(temp_storage).Sum(intVSlow.xx);
4263  __syncthreads();
4264  intVSlow.xy = BlockReduce(temp_storage).Sum(intVSlow.xy);
4265  __syncthreads();
4266  intVSlow.xz = BlockReduce(temp_storage).Sum(intVSlow.xz);
4267  __syncthreads();
4268  intVSlow.yx = BlockReduce(temp_storage).Sum(intVSlow.yx);
4269  __syncthreads();
4270  intVSlow.yy = BlockReduce(temp_storage).Sum(intVSlow.yy);
4271  __syncthreads();
4272  intVSlow.yz = BlockReduce(temp_storage).Sum(intVSlow.yz);
4273  __syncthreads();
4274  intVSlow.zx = BlockReduce(temp_storage).Sum(intVSlow.zx);
4275  __syncthreads();
4276  intVSlow.zy = BlockReduce(temp_storage).Sum(intVSlow.zy);
4277  __syncthreads();
4278  intVSlow.zz = BlockReduce(temp_storage).Sum(intVSlow.zz);
4279  __syncthreads();
4280  }
4281  }
4282 
4283 
4284  if (threadIdx.x == 0) {
4285  const int bin = blockIdx.x % ATOMIC_BINS;
4286 
4287  atomicAdd(&kineticEnergy[bin], K);
4288  atomicAdd(&intKineticEnergy[bin], intK);
4289  atomicAdd(&intVirialNormal[bin].xx, intVNormal.xx);
4290  atomicAdd(&intVirialNormal[bin].xy, intVNormal.xy);
4291  atomicAdd(&intVirialNormal[bin].xz, intVNormal.xz);
4292  atomicAdd(&intVirialNormal[bin].yx, intVNormal.yx);
4293  atomicAdd(&intVirialNormal[bin].yy, intVNormal.yy);
4294  atomicAdd(&intVirialNormal[bin].yz, intVNormal.yz);
4295  atomicAdd(&intVirialNormal[bin].zx, intVNormal.zx);
4296  atomicAdd(&intVirialNormal[bin].zy, intVNormal.zy);
4297  atomicAdd(&intVirialNormal[bin].zz, intVNormal.zz);
4298  if (DO_MTS) {
4299  if (maxForceNumber >= 1) {
4300  atomicAdd(&intVirialNbond[bin].xx, intVNbond.xx);
4301  atomicAdd(&intVirialNbond[bin].xy, intVNbond.xy);
4302  atomicAdd(&intVirialNbond[bin].xz, intVNbond.xz);
4303  atomicAdd(&intVirialNbond[bin].yx, intVNbond.yx);
4304  atomicAdd(&intVirialNbond[bin].yy, intVNbond.yy);
4305  atomicAdd(&intVirialNbond[bin].yz, intVNbond.yz);
4306  atomicAdd(&intVirialNbond[bin].zx, intVNbond.zx);
4307  atomicAdd(&intVirialNbond[bin].zy, intVNbond.zy);
4308  atomicAdd(&intVirialNbond[bin].zz, intVNbond.zz);
4309  }
4310  if (maxForceNumber >= 2) {
4311  atomicAdd(&intVirialSlow[bin].xx, intVSlow.xx);
4312  atomicAdd(&intVirialSlow[bin].xy, intVSlow.xy);
4313  atomicAdd(&intVirialSlow[bin].xz, intVSlow.xz);
4314  atomicAdd(&intVirialSlow[bin].yx, intVSlow.yx);
4315  atomicAdd(&intVirialSlow[bin].yy, intVSlow.yy);
4316  atomicAdd(&intVirialSlow[bin].yz, intVSlow.yz);
4317  atomicAdd(&intVirialSlow[bin].zx, intVSlow.zx);
4318  atomicAdd(&intVirialSlow[bin].zy, intVSlow.zy);
4319  atomicAdd(&intVirialSlow[bin].zz, intVSlow.zz);
4320  }
4321  }
4322  __threadfence();
4323  unsigned int value = atomicInc(&tbcatomic[3], totaltb);
4324  isLastBlockDone = (value == (totaltb -1 ));
4325  }
4326  __syncthreads();
4327  // the last block folds the per-bin partial sums and writes the internal pressure
4328  // terms to host-mapped memory; this host-mapped update is suspected to be a
4329  // bottleneck, although that has not been confirmed by profiling
4330  if(isLastBlockDone){
4331  if(threadIdx.x < ATOMIC_BINS){
4332  const int bin = threadIdx.x;
4333 
4334  double k = kineticEnergy[bin];
4335  double intK = intKineticEnergy[bin];
4336  cudaTensor intVNormal = intVirialNormal[bin];
4337  if (DO_MTS) {
4338  if (maxForceNumber >= 1) {
4339  intVNbond = intVirialNbond[bin];
4340  }
4341  if (maxForceNumber >= 2) {
4342  intVSlow = intVirialSlow[bin];
4343  }
4344  }
4345  cudaTensor rigidV = rigidVirial[bin];
4346 
4347  // sets device scalars back to zero
4348  kineticEnergy[bin] = 0.0;
4349  intKineticEnergy[bin] = 0.0;
4350  zero_cudaTensor(intVirialNormal[bin]);
4351  zero_cudaTensor(intVirialNbond[bin]);
4352  zero_cudaTensor(intVirialSlow[bin]);
4353  zero_cudaTensor(rigidVirial[bin]);
4354 
4355  if(ATOMIC_BINS > 1){
4356  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
4357  typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
4358  __shared__ typename WarpReduce::TempStorage tempStorage;
4359  __shared__ typename WarpReduceT::TempStorage tempStorageT;
4360 
4361  // According to https://nvidia.github.io/cccl/cub/developer_overview.html#temporary-storage-usage,
4362  // I should use __syncwarp() here.
4363  k = WarpReduce(tempStorage).Sum(k); NAMD_WARP_SYNC(WARP_FULL_MASK);
4364  intK = WarpReduce(tempStorage).Sum(intK); NAMD_WARP_SYNC(WARP_FULL_MASK);
4365  intVNormal = WarpReduceT(tempStorageT).Sum(intVNormal); NAMD_WARP_SYNC(WARP_FULL_MASK);
4366  rigidV = WarpReduceT(tempStorageT).Sum(rigidV); NAMD_WARP_SYNC(WARP_FULL_MASK);
4367  if (DO_MTS) {
4368  if (maxForceNumber >= 1) {
4369  intVNbond = WarpReduceT(tempStorageT).Sum(intVNbond); NAMD_WARP_SYNC(WARP_FULL_MASK);
4370  }
4371  if (maxForceNumber >= 2) {
4372  intVSlow = WarpReduceT(tempStorageT).Sum(intVSlow); NAMD_WARP_SYNC(WARP_FULL_MASK);
4373  }
4374  }
4375  }
4376 
4377  if(threadIdx.x == 0){
4378  h_kineticEnergy[0] = k;
4379  h_intKineticEnergy[0] = intK;
4380  h_intVirialNormal[0] = intVNormal;
4381  if (DO_MTS) {
4382  if (maxForceNumber >= 1) {
4383  h_intVirialNbond[0] = intVNbond;
4384  }
4385  if (maxForceNumber >= 2) {
4386  h_intVirialSlow[0] = intVSlow;
4387  }
4388  }
4389  h_rigidVirial[0] = rigidV;
4390 
4391  //resets atomic counter
4392  reset_atomic_counter(&tbcatomic[3]);
4393  }
4394  }
4395  }
4396 }
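// submitReduction2Kernel accumulates, per atom (skipping fixed atoms when DO_FIXED):
//   K    += m * |v|^2                 (kinetic energy, without the 1/2 factor)
//   intK += m * v . (v - v_cm)        (internal kinetic energy)
//   intV += outer(F, r - r_cm)        (internal virial, one tensor per force level for MTS,
//                                      folded into intVNormal otherwise)
// and then uses the same block-reduce / atomic-bin / last-block pattern as
// submitReduction1Kernel to deliver the totals to host-mapped memory.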
4397 
4398 void SequencerCUDAKernel::submitReduction2(
4399  const bool doFixedAtoms,
4400  const int* atomFixed,
4401  const double *pos_x,
4402  const double *pos_y,
4403  const double *pos_z,
4404  const double *vel_x,
4405  const double *vel_y,
4406  const double *vel_z,
4407  const double *rcm_x,
4408  const double *rcm_y,
4409  const double *rcm_z,
4410  const double *vcm_x,
4411  const double *vcm_y,
4412  const double *vcm_z,
4413  const double *f_normal_x,
4414  const double *f_normal_y,
4415  const double *f_normal_z,
4416  const double *f_nbond_x,
4417  const double *f_nbond_y,
4418  const double *f_nbond_z,
4419  const double *f_slow_x,
4420  const double *f_slow_y,
4421  const double *f_slow_z,
4422  float *mass,
4423  int *hydrogenGroupSize,
4424  BigReal *kineticEnergy,
4425  BigReal *h_kineticEnergy,
4426  BigReal *intKineticEnergy,
4427  BigReal *h_intKineticEnergy,
4428  cudaTensor *intVirialNormal,
4429  cudaTensor *intVirialNbond,
4430  cudaTensor *intVirialSlow,
4431  cudaTensor *h_intVirialNormal,
4432  cudaTensor *h_intVirialNbond,
4433  cudaTensor *h_intVirialSlow,
4434  cudaTensor* rigidVirial,
4435  cudaTensor* h_rigidVirial,
4436  unsigned int *tbcatomic,
4437  int numAtoms,
4438  int maxForceNumber,
4439  const bool doMTS,
4440  cudaStream_t stream)
4441 {
4442 
4443  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4444 #define CALL(DO_FIXED, DO_MTS) \
4445  submitReduction2Kernel<ATOM_BLOCKS, DO_FIXED, DO_MTS><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4446  atomFixed, pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, \
4447  rcm_x, rcm_y, rcm_z, vcm_x, vcm_y, vcm_z, \
4448  f_normal_x, f_normal_y, f_normal_z, \
4449  f_nbond_x, f_nbond_y, f_nbond_z, \
4450  f_slow_x, f_slow_y, f_slow_z, \
4451  mass, hydrogenGroupSize, \
4452  kineticEnergy, h_kineticEnergy, \
4453  intKineticEnergy, h_intKineticEnergy, \
4454  intVirialNormal, intVirialNbond, intVirialSlow, \
4455  h_intVirialNormal, h_intVirialNbond, h_intVirialSlow, rigidVirial, \
4456  h_rigidVirial, tbcatomic, numAtoms, maxForceNumber)
4457  if (doFixedAtoms) {
4458  if (doMTS) CALL(true, true);
4459  else CALL(true, false);
4460  }
4461  else if (!doFixedAtoms) {
4462  if (doMTS) CALL(false, true);
4463  else CALL(false, false);
4464  }
4465  else NAMD_bug("No kernel was called in SequencerCUDAKernel::submitReduction2!\n");
4466 #undef CALL
4467  // cudaCheck(cudaGetLastError());
4468 }
4469 
4470 __global__ void langevinVelocitiesBBK1Kernel(
4471  BigReal timestep,
4472  const float * __restrict langevinParam,
4473  double * __restrict vel_x,
4474  double * __restrict vel_y,
4475  double * __restrict vel_z,
4476  int numAtoms)
4477 {
4478  BigReal dt = timestep * (0.001 * TIMEFACTOR);
4479  int i = threadIdx.x + blockIdx.x*blockDim.x;
4480  if (i < numAtoms) {
4481  BigReal dt_gamma = dt * langevinParam[i];
4482  BigReal scaling = 1. - 0.5 * dt_gamma;
4483  vel_x[i] *= scaling;
4484  vel_y[i] *= scaling;
4485  vel_z[i] *= scaling;
4486  }
4487 }
4488 
4489 void SequencerCUDAKernel::langevinVelocitiesBBK1(
4490  BigReal timestep,
4491  const float *langevinParam,
4492  double *vel_x,
4493  double *vel_y,
4494  double *vel_z,
4495  int numAtoms,
4496  cudaStream_t stream)
4497 {
4498  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4499  langevinVelocitiesBBK1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4500  timestep, langevinParam, vel_x, vel_y, vel_z, numAtoms);
4501  // cudaCheck(cudaGetLastError());
4502 }
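// BBK first half-step: each velocity is damped by (1 - dt*gamma/2), where gamma is the
// per-atom langevinParam and dt is the timestep scaled by 0.001*TIMEFACTOR.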
4503 
4504 __global__ void langevinVelocitiesBBK2Kernel(
4505  BigReal timestep,
4506  const float * __restrict langScalVelBBK2,
4507  const float * __restrict langScalRandBBK2,
4508  const float * __restrict gaussrand_x,
4509  const float * __restrict gaussrand_y,
4510  const float * __restrict gaussrand_z,
4511  double * __restrict vel_x,
4512  double * __restrict vel_y,
4513  double * __restrict vel_z,
4514  const int numAtoms)
4515 {
4516  int i = threadIdx.x + blockIdx.x*blockDim.x;
4517  if (i < numAtoms) {
4518  vel_x[i] += gaussrand_x[i] * langScalRandBBK2[i];
4519  vel_y[i] += gaussrand_y[i] * langScalRandBBK2[i];
4520  vel_z[i] += gaussrand_z[i] * langScalRandBBK2[i];
4521  vel_x[i] *= langScalVelBBK2[i];
4522  vel_y[i] *= langScalVelBBK2[i];
4523  vel_z[i] *= langScalVelBBK2[i];
4524  }
4525 }
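// BBK second half-step: v = (v + xi * langScalRandBBK2) * langScalVelBBK2, where xi is a
// unit Gaussian per component, langScalRandBBK2 is the precomputed per-atom noise amplitude
// and langScalVelBBK2 is the precomputed damping factor (presumably 1/(1 + dt*gamma/2)).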
4526 
4527 
4528 
4529 void SequencerCUDAKernel::langevinVelocitiesBBK2(
4530  BigReal timestep,
4531  const float *langScalVelBBK2,
4532  const float *langScalRandBBK2,
4533  float *gaussrand_x,
4534  float *gaussrand_y,
4535  float *gaussrand_z,
4536  double *vel_x,
4537  double *vel_y,
4538  double *vel_z,
4539  const int numAtoms,
4540  const int numAtomsGlobal,
4541  const int stride,
4542  curandGenerator_t gen,
4543  cudaStream_t stream)
4544 {
4545 
4546  // For some reason, if n == nAtomsDevice + 1, this gives me a misaligned address
4547  // Generating the full array on gaussrand without striding for multiple GPU simulations for now
4548 
4549  // set the stream on the (cu/hip)RAND generator so the random-number generation below runs on our stream
4550  curandCheck(curandSetStream(gen, stream));
4551  // the buffers passed to curandGenerateNormal must have even length,
4552  // so choose n as the smallest even number >= numAtomsGlobal;
4553  // going one past numAtomsGlobal is safe because the allocation is larger
4554  int n = (numAtomsGlobal + 1) & (~1);
4555 
4556  curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4557  curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4558  curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4559  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4560  langevinVelocitiesBBK2Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4561  timestep, langScalVelBBK2, langScalRandBBK2,
4562  gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride,
4563  vel_x, vel_y, vel_z, numAtoms);
4564  // cudaCheck(cudaGetLastError());
4565 }
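// The even-length rounding above is n = (numAtomsGlobal + 1) & ~1: for example,
// numAtomsGlobal = 10001 gives n = 10002, while 10000 stays 10000. One shared array is
// filled per component, and "stride" selects this device's slice of it.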
4566 
4567 template <bool doFixed>
4568 __global__ void reassignVelocitiesKernel(
4569  const BigReal timestep,
4570  const int* __restrict atomFixed,
4571  const float * __restrict gaussrand_x,
4572  const float * __restrict gaussrand_y,
4573  const float * __restrict gaussrand_z,
4574  double * __restrict vel_x,
4575  double * __restrict vel_y,
4576  double * __restrict vel_z,
4577  const double* __restrict d_recipMass,
4578  const BigReal kbT,
4579  const int numAtoms)
4580 {
4581  //! note we are NOT supporting LES (locally enhanced sampling) here
4582  //! hence no switch on the partition to apply tempfactor
4583 
4584  /* */
4585  int i = threadIdx.x + blockIdx.x*blockDim.x;
4586  if (i < numAtoms) {
4587  if (doFixed) {
4588  if (atomFixed[i]) {
4589  vel_x[i] = 0.0;
4590  vel_y[i] = 0.0;
4591  vel_z[i] = 0.0;
4592  } else {
4593  vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4594  vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4595  vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4596  }
4597  } else {
4598  vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4599  vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4600  vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4601  }
4602  }
4603  //__syncthreads();
4604 }
4605 
4606 void SequencerCUDAKernel::reassignVelocities(
4607  const BigReal timestep,
4608  const bool doFixedAtoms,
4609  const int* atomFixed,
4610  float *gaussrand_x,
4611  float *gaussrand_y,
4612  float *gaussrand_z,
4613  double *vel_x,
4614  double *vel_y,
4615  double *vel_z,
4616  const double* d_recipMass,
4617  const BigReal kbT,
4618  const int numAtoms,
4619  const int numAtomsGlobal,
4620  const int stride,
4621  curandGenerator_t gen,
4622  cudaStream_t stream)
4623 {
4624 
4625  curandCheck(curandSetStream(gen, stream));
4626  // the buffers passed to curandGenerateNormal must have even length,
4627  // so choose n as the smallest even number >= numAtomsGlobal;
4628  // going one past numAtomsGlobal is safe because the allocation is larger
4629  int n = (numAtomsGlobal + 1) & (~1);
4630 
4631  curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4632  curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4633  curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4634  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4635 #define CALL(DOFIXED) \
4636  reassignVelocitiesKernel<DOFIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4637  timestep, atomFixed, \
4638  gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride, \
4639  vel_x, vel_y, vel_z, d_recipMass, kbT, numAtoms)
4640  if (doFixedAtoms) CALL(true);
4641  else CALL(false);
4642 #undef CALL
4643  cudaCheck(cudaStreamSynchronize(stream));
4644 
4645 }
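// Velocity reassignment draws each component from a Maxwell-Boltzmann distribution,
// v_i = xi * sqrt(kbT / m_i) (the kernel uses kbT * recipMass), with fixed atoms zeroed.
// The stream is synchronized afterwards so the freshly generated Gaussians and the new
// velocities are complete before the caller proceeds.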
4646 
4647 
4648 
4649 __global__ void updateVelocities(const int nRattles,
4650  const int nSettles,
4651  const float invdt,
4652  const int * __restrict settleList,
4653  const CudaRattleElem * __restrict rattleList,
4654  const double * __restrict pos_x,
4655  const double * __restrict pos_y,
4656  const double * __restrict pos_z,
4657  const double * __restrict posNew_x,
4658  const double * __restrict posNew_y,
4659  const double * __restrict posNew_z,
4660  double * velNew_x,
4661  double * velNew_y,
4662  double * velNew_z){
4663 
4664  int tid = threadIdx.x + blockIdx.x*blockDim.x;
4665  int stride = gridDim.x*blockDim.x;
4666 
4667 #if 0
4668  for(int i = tid; i < nSettles; i += stride){
4669  int ig = settleList[i];
4670  //Now I update the position
4671  //Updates three atoms in the settleList
4672  velNew_x[ig] = (posNew_x[ig] - pos_x[ig]) * invdt;
4673  velNew_y[ig] = (posNew_y[ig] - pos_y[ig]) * invdt;
4674  velNew_z[ig] = (posNew_z[ig] - pos_z[ig]) * invdt;
4675 
4676  velNew_x[ig+1] = (posNew_x[ig+1] - pos_x[ig+1]) * invdt;
4677  velNew_y[ig+1] = (posNew_y[ig+1] - pos_y[ig+1]) * invdt;
4678  velNew_z[ig+1] = (posNew_z[ig+1] - pos_z[ig+1]) * invdt;
4679 
4680  velNew_x[ig+2] = (posNew_x[ig+2] - pos_x[ig+2]) * invdt;
4681  velNew_y[ig+2] = (posNew_y[ig+2] - pos_y[ig+2]) * invdt;
4682  velNew_z[ig+2] = (posNew_z[ig+2] - pos_z[ig+2]) * invdt;
4683  }
4684 #endif
4685 
4686  for(int i = tid; i < nRattles; i += stride){
4687  int ig = rattleList[i].ig;
4688  int icnt = rattleList[i].icnt;
4689  bool fixed[4];
4690  fixed[0] = false; fixed[1] = false;
4691  fixed[2] = false; fixed[3] = false;
4692  for(int j = 0; j < icnt; j++){
4693  //Gets two positions
4694  int ia = rattleList[i].params[j].ia;
4695  int ib = rattleList[i].params[j].ib;
4696  //check whether each atom's velocity has already been updated for this group
4697  if(!fixed[ia]){
4698  velNew_x[ig+ia] = (posNew_x[ig+ia] - pos_x[ig+ia]) * invdt;
4699  velNew_y[ig+ia] = (posNew_y[ig+ia] - pos_y[ig+ia]) * invdt;
4700  velNew_z[ig+ia] = (posNew_z[ig+ia] - pos_z[ig+ia]) * invdt;
4701  fixed[ia] = true;
4702  }
4703  if(!fixed[ib]){
4704  velNew_x[ig+ib] = (posNew_x[ig+ib] - pos_x[ig+ib]) * invdt;
4705  velNew_y[ig+ib] = (posNew_y[ig+ib] - pos_y[ig+ib]) * invdt;
4706  velNew_z[ig+ib] = (posNew_z[ig+ib] - pos_z[ig+ib]) * invdt;
4707  fixed[ib] = true;
4708  }
4709  }
4710  }
4711 }
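// For atoms touched by RATTLE constraints, the corrected velocity is recovered from the
// constrained and unconstrained positions as v = (posNew - pos) * invdt; the fixed[]
// bookkeeping ensures each atom in a group is updated at most once. The SETTLE branch
// above is currently compiled out (#if 0).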
4712 
4713 //JM: Function that adds a correction to the bonded forces
4714 template<bool doEnergy, bool doFixed>
4715 __global__ void addRattleForce(int numAtoms,
4716  const double invdt,
4717  const int* __restrict atomFixed,
4718  const float * __restrict mass,
4719  const double * __restrict pos_x,
4720  const double * __restrict pos_y,
4721  const double * __restrict pos_z,
4722  double * __restrict vel_x,
4723  double * __restrict vel_y,
4724  double * __restrict vel_z,
4725  double * __restrict velNew_x,
4726  double * __restrict velNew_y,
4727  double * __restrict velNew_z,
4728  double * __restrict f_normal_x,
4729  double * __restrict f_normal_y,
4730  double * __restrict f_normal_z,
4731  cudaTensor* __restrict virial,
4732  cudaTensor* __restrict h_virial,
4733  unsigned int* tbcatomic){
4734 
4735  cudaTensor lVirial;
4736  // zero_cudaTensor(lVirial);
4737  int i = threadIdx.x + blockIdx.x*blockDim.x;
4738 
4739  double df[3];
4740  double pos[3];
4741  lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
4742  lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
4743  lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
4744  //for(int i = tid; i < numAtoms; i += stride){
4745  if(i < numAtoms){
4746  df[0] = (velNew_x[i] - vel_x[i]) * ((double)mass[i] * invdt);
4747  df[1] = (velNew_y[i] - vel_y[i]) * ((double)mass[i] * invdt);
4748  df[2] = (velNew_z[i] - vel_z[i]) * ((double)mass[i] * invdt);
4749  if(doEnergy){
4750  pos[0] = pos_x[i]; pos[1] = pos_y[i]; pos[2] = pos_z[i];
4751  lVirial.xx += df[0] * pos[0];
4752  lVirial.xy += df[0] * pos[1];
4753  lVirial.xz += df[0] * pos[2];
4754  lVirial.yx += df[1] * pos[0];
4755  lVirial.yy += df[1] * pos[1];
4756  lVirial.yz += df[1] * pos[2];
4757  lVirial.zx += df[2] * pos[0];
4758  lVirial.zy += df[2] * pos[1];
4759  lVirial.zz += df[2] * pos[2];
4760  }
4761 
4762  //Updates force and velocities
4763  f_normal_x[i] += df[0];
4764  f_normal_y[i] += df[1];
4765  f_normal_z[i] += df[2];
4766 
4767  // instead of copying I can swap the pointers
4768  vel_x[i] = velNew_x[i];
4769  vel_y[i] = velNew_y[i];
4770  vel_z[i] = velNew_z[i];
4771 
4772  }
4773  // Make sure that every thread in the block has its lVirial set,
4774  // then block-reduce each component of the virial tensor
4775  // to get consistent values.
4776  // The virial tensor is supposed to be symmetric.
4777  // XXX TODO: Hand-write the reductions to avoid the repeated __syncthreads()
4778  if(doEnergy){
4779  typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
4780  __shared__ typename BlockReduce::TempStorage temp_storage;
4781  lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
4782  if (doFixed) {
4783  lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
4784  lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
4785  }
4786  lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
4787  lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
4788  __syncthreads();
4789  if (doFixed) {
4790  lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
4791  }
4792  lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
4793  lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
4794  lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
4795 
4796  // Every block now has a locally reduced virial;
4797  // thread 0 of each block atomically adds it into the global (binned) virial
4798  if(threadIdx.x == 0){
4799  const int bin = blockIdx.x % ATOMIC_BINS;
4800 
4801  atomicAdd(&virial[bin].xx, lVirial.xx);
4802  if (doFixed) {
4803  atomicAdd(&virial[bin].xy, lVirial.xy);
4804  atomicAdd(&virial[bin].xz, lVirial.xz);
4805  }
4806  atomicAdd(&virial[bin].yx, lVirial.yx);
4807  atomicAdd(&virial[bin].yy, lVirial.yy);
4808  if (doFixed) {
4809  atomicAdd(&virial[bin].yz, lVirial.yz);
4810  }
4811  atomicAdd(&virial[bin].zx, lVirial.zx);
4812  atomicAdd(&virial[bin].zy, lVirial.zy);
4813  atomicAdd(&virial[bin].zz, lVirial.zz);
4814  }
4815  }
4816 }
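// The constraint correction added above is dF = m * (vNew - v) / dt per atom; it is folded
// into f_normal, the constrained velocities replace the old ones, and (when doEnergy is
// set) the virial contribution outer(dF, r) is block-reduced and atomically accumulated
// into the binned virial tensor. Note that the off-diagonal xy/xz/yz sums are only reduced
// and accumulated when doFixed is set.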
4817 
4818 
4819 template <bool fillIndexes>
4820 __global__ void buildRattleParams(const int size,
4821  const float dt,
4822  const float invdt,
4823  const int *hgi,
4824  //Do I need new vectors for holding the rattle forces? Let's do this for now
4825  const float * __restrict mass,
4826  const int * __restrict hydrogenGroupSize,
4827  const float * __restrict rigidBondLength,
4828  const int * __restrict atomFixed,
4829  CudaRattleElem* rattleList,
4830  int *rattleIndexes)
4831 {
4832  int tid = threadIdx.x + blockIdx.x*blockDim.x;
4833  int stride = gridDim.x * blockDim.x;
4834  float rmass[10];
4835  float dsq[10];
4836  int fixed[10];
4837  int ial[10];
4838  int ibl[10];
4839  int ig, hgs, i;
4840  bool anyfixed;
4841 
4842  for(int k = tid; k < size; k += stride){
4843  if(!fillIndexes){
4844  i = rattleIndexes[k];
4845  if(i == -1) break;
4846  }else i = k;
4847  ig = hgi[i];
4848  hgs = hydrogenGroupSize[ig];
4849  if(hgs == 1) continue; // early exit: single-atom group, nothing to constrain
4850  int icnt = 0;
4851  anyfixed = false;
4852  for(int j = 0; j < hgs; j++){
4853  rmass[j] = (mass[ig + j] > 0.f ? __frcp_rn(mass[ig+j]) : 0.f);
4854  fixed[j] = atomFixed[ig+j];
4855  if(fixed[j]) {
4856  anyfixed = true;
4857  rmass[j] = 0.f;
4858  }
4859  }
4860  float tmp = rigidBondLength[ig];
4861  // A water (nonzero rigidBondLength on the parent atom) is handled by SETTLE, so skip it unless some atom in the group is fixed
4862  if(tmp > 0.f){
4863  if(!anyfixed) continue;
4864  if( !(fixed[1] && fixed[2]) ){
4865  dsq[icnt] = tmp*tmp;
4866  ial[icnt] = 1;
4867  ibl[icnt++] = 2;
4868  }
4869  }
4870  if(fillIndexes){
4871  rattleIndexes[i] = i;
4872  continue;
4873  }else{
4874  for(int j = 1; j < hgs; j++){
4875  if((tmp = rigidBondLength[ig+j]) > 0){
4876  if( !(fixed[0] && fixed[j])){
4877  dsq[icnt] = tmp * tmp;
4878  ial[icnt] = 0;
4879  ibl[icnt++] = j;
4880  }
4881  }
4882  }
4883  // XXX TODO: each rattle group writes its full parameter set here;
4884  // improve this memory access pattern later
4885  rattleList[k].ig = ig;
4886  rattleList[k].icnt = icnt;
4887  for(int j = 0; j < icnt; j++){
4888  int ia = ial[j];
4889  int ib = ibl[j];
4890  rattleList[k].params[j].ia = ia;
4891  rattleList[k].params[j].ib = ib;
4892  rattleList[k].params[j].dsq = dsq[j];
4893  rattleList[k].params[j].rma = rmass[ia];
4894  rattleList[k].params[j].rmb = rmass[ib];
4895  }
4896  }
4897  }
4898 }
4899 
4900 void buildRattleLists(
4901  const bool doFixedAtoms,
4902  const int natoms,
4903  const float dt,
4904  const float invdt,
4905  const float *mass,
4906  const int *hydrogenGroupSize,
4907  const float *rigidBondLength,
4908  const int *atomFixed,
4909  int **settleList,
4910  size_t& settleListSize,
4911  int **consFailure,
4912  size_t& consFailureSize,
4913  CudaRattleElem **rattleList,
4914  size_t& rattleListSize,
4915  int &nSettles,
4916  int &nRattles,
4917  int*& d_nHG,
4918  int*& d_nSettles,
4919  int*& d_nRattles,
4920  int*& hgi,
4921  size_t& hgi_size,
4922  char*& d_rattleList_temp_storage,
4923  size_t& temp_storage_bytes,
4924  int*& rattleIndexes,
4925  size_t& rattleIndexes_size,
4926  bool first,
4927  cudaStream_t stream){
4928  size_t nHG = 0;
4929 
4930  //thrust::device_ptr<const int> h_dev = thrust::device_pointer_cast(hydrogenGroupSize)
4931  size_t st_hgi_size = (size_t)(hgi_size);
4932  reallocate_device<int>(&hgi, &st_hgi_size, natoms, OVERALLOC);
4933  hgi_size = (size_t)(st_hgi_size);
4934  size_t temp_length = 1;
4935  reallocate_device<int>(&d_nHG, &temp_length, 1);
4936  reallocate_device<int>(&d_nSettles, &temp_length, 1);
4937  reallocate_device<int>(&d_nRattles, &temp_length, 1);
4938 
4939  copy_DtoD<int>(hydrogenGroupSize, hgi, natoms, stream);
4940  size_t new_storage_bytes = 0;
4941  // Removes zeroes from HGI array
4942  // Pass NULL to first CUB call to get temp buffer size
4943  cudaCheck(cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, hgi,
4944  d_nHG, natoms, notZero(), stream));
4945  reallocate_device<char>(&d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4946  // DC: There were issues where reallocate wanted int types and CUB wanted unsigned ints
4947  new_storage_bytes = temp_storage_bytes;
4948  cudaCheck( cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, hgi,
4949  d_nHG, natoms, notZero(), stream));
4950 
4951  int temp;
4952  copy_DtoH<int>(d_nHG, &temp, 1, stream );
4953  nHG = temp;
4954 
4955  cudaCheck(cudaStreamSynchronize(stream));
4956 
4957  // Exclusive scan to build an array of indices
4958  new_storage_bytes = 0;
4959  cub::DeviceScan::ExclusiveSum(NULL,
4960  new_storage_bytes, hgi, hgi, nHG, stream);
4961  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4962  new_storage_bytes = temp_storage_bytes;
4963  cub::DeviceScan::ExclusiveSum(d_rattleList_temp_storage, new_storage_bytes,
4964  hgi, hgi, nHG, stream);
4965 
4966  size_t st_tempListSize = (size_t)(settleListSize);
4967  reallocate_device<int>(settleList, &st_tempListSize, nHG, OVERALLOC);
4968  settleListSize = (size_t)(st_tempListSize);
4969 
4970  st_tempListSize = (size_t)(rattleListSize);
4971  reallocate_device<CudaRattleElem>(rattleList, &st_tempListSize, nHG, OVERALLOC);
4972  rattleListSize = (size_t)(st_tempListSize);
4973 
4974  st_tempListSize = (size_t)(consFailureSize);
4975  reallocate_device<int>(consFailure, &st_tempListSize, nHG, OVERALLOC);
4976  consFailureSize = (size_t)(st_tempListSize);
4977 
4978  st_tempListSize = (size_t)(rattleIndexes_size);
4979  reallocate_device<int>(&rattleIndexes, &st_tempListSize, nHG, OVERALLOC);
4980  rattleIndexes_size = (size_t)(st_tempListSize);
4981 
4982  // Flagging arrays
4983  cudaCheck(cudaMemsetAsync(*rattleList, 0, sizeof(CudaRattleElem)*nHG, stream));
4984  cudaCheck(cudaMemsetAsync(rattleIndexes,-1, sizeof(int)*nHG, stream));
4985  cudaCheck(cudaMemsetAsync(*settleList, 0, sizeof(int)*nHG, stream));
4986 
4987  ///thrust::device_vector<int> hgi(natoms);
4988  new_storage_bytes = 0;
4989  if (doFixedAtoms) {
4990  isWater<true> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
4991  // builds SettleList with isWater functor
4992  cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
4993  d_nSettles, nHG, comp, stream);
4994  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4995  new_storage_bytes = temp_storage_bytes;
4996  cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
4997  d_nSettles, nHG, comp, stream);
4998  } else {
4999  isWater<false> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
5000  // builds SettleList with isWater functor
5001  cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
5002  d_nSettles, nHG, comp, stream);
5003  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5004  new_storage_bytes = temp_storage_bytes;
5005  cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
5006  d_nSettles, nHG, comp, stream);
5007  }
5008 
5009  // nSettles now holds the number of SETTLE (water) groups
5010  copy_DtoH<int>(d_nSettles, &nSettles, 1, stream);
5011  // fprintf(stderr, "nSettles = %d", nSettles);
5012 
5013  // First pass (fillIndexes = true): flag hydrogen groups that may need RATTLE in rattleIndexes
5014  buildRattleParams<true><<<128, 128, 0, stream>>>(nHG, dt, invdt,
5015  hgi, mass, hydrogenGroupSize,
5016  rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5017  cudaCheck(cudaGetLastError());
5018 
5019  // Compact rattleIndexes by removing the -1 entries
5020  new_storage_bytes = 0;
5021  cub::DeviceSelect::If(NULL, new_storage_bytes,
5022  rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5023  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5024  new_storage_bytes = temp_storage_bytes;
5025  cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes,
5026  rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5027  copy_DtoH<int>(d_nRattles, &nRattles, 1, stream);
5028 
5029  // Second pass (fillIndexes = false): compute RATTLE parameters for the compacted index list
5030  buildRattleParams<false><<<128, 128, 0, stream>>>(nRattles, dt, invdt,
5031  hgi, mass, hydrogenGroupSize,
5032  rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5033 
5034  cudaCheck(cudaGetLastError());
5035 }
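// [Editor's sketch] buildRattleLists above relies on the standard CUB two-phase idiom:
// each cub::DeviceSelect / cub::DeviceScan routine is first called with a NULL temporary
// buffer so that it only reports the required scratch size, the buffer is (re)allocated,
// and the same call is repeated to do the real work. The self-contained helper below
// shows the idiom in isolation, compacting the positive entries of an int array; the
// names PositivePredSketch / compactPositiveSketch are hypothetical and only for
// illustration (the real code above uses reallocate_device and its own predicates).
struct PositivePredSketch {
  __host__ __device__ bool operator()(const int x) const { return x > 0; }
};
static void compactPositiveSketch(const int* d_in, int* d_out, int* d_num_selected,
                                  const int n, cudaStream_t stream)
{
  void* d_temp = NULL;
  size_t temp_bytes = 0;
  // First call: d_temp == NULL, only temp_bytes is computed
  cub::DeviceSelect::If(NULL, temp_bytes, d_in, d_out, d_num_selected,
                        n, PositivePredSketch(), stream);
  cudaCheck(cudaMalloc(&d_temp, temp_bytes));
  // Second call: performs the actual stream compaction
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_selected,
                        n, PositivePredSketch(), stream);
  cudaCheck(cudaFree(d_temp));
}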
5036 
5037 // XXX TODO: JM: Memory access pattern in this function is bad, since we have to
5038 // access memory by the hydrogen groups. Improve it later
5039 
5040 void SequencerCUDAKernel::rattle1(
5041  const bool doFixedAtoms,
5042  const bool doEnergy,
5043  const bool pressure, // pressure is only false for startRun1 and startRun2 right now
5044  const int numAtoms,
5045  const double dt,
5046  const double invdt,
5047  const double tol2,
5048  double *vel_x,
5049  double *vel_y,
5050  double *vel_z,
5051  double *pos_x,
5052  double *pos_y,
5053  double *pos_z,
5054  double *velNew_x,
5055  double *velNew_y,
5056  double *velNew_z,
5057  double *posNew_x,
5058  double *posNew_y,
5059  double *posNew_z,
5060  double *f_normal_x,
5061  double *f_normal_y,
5062  double *f_normal_z,
5063  const int *hydrogenGroupSize,
5064  const float *rigidBondLength,
5065  const float *mass,
5066  const int *atomFixed,
5067  int **settleList,
5068  size_t& settleListSize,
5069  int **consFailure_d,
5070  size_t& consFailureSize,
5071  CudaRattleElem **rattleList,
5072  size_t& rattleListSize,
5073  int *nSettle,
5074  int *nRattle,
5075  cudaTensor *virial,
5076  cudaTensor *h_virial,
5077  unsigned int* tbcatomic,
5078  int migration,
5079  SettleParameters *sp,
5080  bool first,
5081  int* consFailure,
5082  const WaterModel water_model,
5083  cudaStream_t stream)
5084  {
5085  //Calls the necessary functions to enforce rigid bonds
5086  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5087  int maxiter = 10;
5088  cudaCheck(cudaStreamSynchronize(stream));
5089  if(migration || !firstRattleDone){
5090  // (Re)build the SETTLE and RATTLE lists after migration or on first use
5091  buildRattleLists(
5092  doFixedAtoms, numAtoms, dt, invdt, mass, hydrogenGroupSize,
5093  rigidBondLength, atomFixed, settleList, settleListSize,
5094  consFailure_d, consFailureSize, rattleList, rattleListSize,
5095  *nSettle, *nRattle,
5096  d_nHG, d_nSettles, d_nRattles, hgi, hgi_size,
5097  d_rattleList_temp_storage, temp_storage_bytes,
5098  rattleIndexes, rattleIndexes_size,
5099  first,stream);
5100  firstRattleDone = true;
5101  }
5102 
5103 #if 1
5104  // if (1){
5105  this->updateRigidArrays(
5106  doFixedAtoms, dt, atomFixed,
5107  vel_x, vel_y, vel_z,
5108  pos_x, pos_y, pos_z,
5109  velNew_x, velNew_y, velNew_z,
5110  posNew_x, posNew_y, posNew_z,
5111  numAtoms, stream);
5112 
5113  #if 1
5114  // if(*nSettle != 0){
5115  if(*nSettle != 0){
5116  Settle(doEnergy,
5117  numAtoms, dt, invdt, *nSettle,
5118  vel_x, vel_y, vel_z,
5119  pos_x, pos_y, pos_z,
5120  velNew_x, velNew_y, velNew_z,
5121  posNew_x, posNew_y, posNew_z,
5122  f_normal_x, f_normal_y, f_normal_z, virial, mass,
5123  hydrogenGroupSize, rigidBondLength, atomFixed,
5124  *settleList, sp, water_model, stream);
5125  }
5126 
5127 
5128  if(*nRattle != 0){
5129  MSHAKE_CUDA(
5130  doEnergy, doFixedAtoms, *rattleList, *nRattle, hydrogenGroupSize,
5131  atomFixed, pos_x, pos_y, pos_z,
5132  posNew_x, posNew_y, posNew_z,
5133  vel_x, vel_y, vel_z,
5134  velNew_x, velNew_y,velNew_z,
5135  f_normal_x, f_normal_y, f_normal_z, virial, mass,
5136  invdt, tol2, maxiter, *consFailure_d,
5137  consFailure, stream);
5138  }
5139  #else
5140  Rattle1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5141  dt, invdt, *nSettle,
5142  vel_x, vel_y, vel_z,
5143  pos_x, pos_y, pos_z,
5144  velNew_x, velNew_y, velNew_z,
5145  posNew_x, posNew_y, posNew_z,
5146  f_normal_x, f_normal_y, f_normal_z,
5147  virial, mass,
5148  hydrogenGroupSize, rigidBondLength, atomFixed,
5149  *settleList, sp,
5150  *rattleList, *nRattle, tol2, maxiter, *consFailure_d);
5151  #endif
5152 
5153 
5154  if(invdt == 0){
5155  // only for the first time step
5156  copy_DtoD<double>(posNew_x, pos_x, numAtoms, stream);
5157  copy_DtoD<double>(posNew_y, pos_y, numAtoms, stream);
5158  copy_DtoD<double>(posNew_z, pos_z, numAtoms, stream);
5159  cudaCheck(cudaStreamSynchronize(stream));
5160  }else if (pressure == 0){ // pressure is zero for the first timesteps
5161  copy_DtoD<double>(velNew_x, vel_x, numAtoms, stream);
5162  copy_DtoD<double>(velNew_y, vel_y, numAtoms, stream);
5163  copy_DtoD<double>(velNew_z, vel_z, numAtoms, stream);
5164  cudaCheck(cudaStreamSynchronize(stream));
5165  }else{
5166 #define CALL(DOENERGY, DOFIXED) \
5167  addRattleForce<DOENERGY, DOFIXED><<<grid, ATOM_BLOCKS, 0 ,stream>>>( \
5168  numAtoms, invdt, atomFixed, mass, \
5169  pos_x, pos_y, pos_z, \
5170  vel_x, vel_y, vel_z, \
5171  velNew_x, velNew_y, velNew_z, \
5172  f_normal_x, f_normal_y, f_normal_z, virial, h_virial, tbcatomic)
5173  if (doEnergy && doFixedAtoms) CALL(true, true);
5174  else if (doEnergy && !doFixedAtoms) CALL(true, false);
5175  else if (!doEnergy && doFixedAtoms) CALL(false, true);
5176  else if (!doEnergy && !doFixedAtoms) CALL(false, false);
5177  else NAMD_bug("addRattleForce was not called in SequencerCUDAKernel::rattle1!\n");
5178 #undef CALL
5179  }
5180  // }else{
5181 #else
5182  // Calls a fused kernel without any tensor update
5183  // let's calculate the number of settle blocks
5184  int nSettleBlocks = (*nSettle + 128 -1 ) / 128;
5185  int nShakeBlocks = (*nRattle + 128 -1 ) / 128;
5186  // int nTotalBlocks = (nSettleBlocks + nShakeBlocks);
5187  if (doFixedAtoms) NAMD_bug("CallRattle1Kernel does not support fixed atoms!\n");
5188  CallRattle1Kernel(
5189  numAtoms, dt, invdt, *nSettle,
5190  vel_x, vel_y, vel_z,
5191  pos_x, pos_y, pos_z,
5192  f_normal_x, f_normal_y, f_normal_z,
5193  mass, hydrogenGroupSize, rigidBondLength, atomFixed,
5194  *settleList, sp,
5195  *rattleList, *nRattle, tol2, maxiter, *consFailure_d, nSettleBlocks,
5196  nShakeBlocks, water_model, stream
5197  );
5198 #endif
5199  // }
5200 }
5201 
5202 
5203 template <bool T_NORMALIZED, bool T_DOENERGY>
5204 __global__ void eFieldKernel(
5205  const int numAtoms,
5206  const double3 eField,
5207  const double eFieldOmega,
5208  const double eFieldPhi,
5209  const double t,
5210  const Lattice lat,
5211  const char3* __restrict transform,
5212  const float* __restrict charges,
5213  const double* __restrict pos_x,
5214  const double* __restrict pos_y,
5215  const double* __restrict pos_z,
5216  double* __restrict f_normal_x,
5217  double* __restrict f_normal_y,
5218  double* __restrict f_normal_z,
5219  double3* __restrict d_extForce,
5220  cudaTensor* __restrict d_extVirial,
5221  double* __restrict d_extEnergy,
5222  double3* __restrict h_extForce,
5223  cudaTensor* __restrict h_extVirial,
5224  double* __restrict h_extEnergy,
5225  unsigned int* tbcatomic)
5226 {
5227  int i = threadIdx.x + blockIdx.x*blockDim.x;
5228  double3 eField1;
5229 
5230  // This can all be moved outside of the kernel
5231 
5232  double fx, fy, fz;
5233  double eCos = cos(eFieldOmega * t - eFieldPhi);
5234  double charge = (i < numAtoms) ? charges[i] : 0.0; // guard threads past numAtoms against an out-of-bounds read
5235  double3 r_extForce = {0, 0, 0};
5236  double r_extEnergy = 0;
5237  cudaTensor r_extVirial = {0};
5238  int totaltb = gridDim.x;
5239  __shared__ bool isLastBlockDone;
5240  eField1.x = eCos * eField.x;
5241  eField1.y = eCos * eField.y;
5242  eField1.z = eCos * eField.z;
5243 
5244  if(threadIdx.x == 0) {
5245  isLastBlockDone = 0;
5246  }
5247 
5248  __syncthreads();
5249 
5250  if(i < numAtoms){
5251  // This can be moved outside of the kernel
5252  if(T_NORMALIZED){
5253  eField1 = Vector(lat.a_r()* (Vector)eField1, lat.b_r()* (Vector)eField1, lat.c_r()* (Vector)eField1);
5254  }
5255 
5256  fx = charge * eField1.x;
5257  fy = charge * eField1.y;
5258  fz = charge * eField1.z;
5259 
5260  f_normal_x[i] += fx;
5261  f_normal_y[i] += fy;
5262  f_normal_z[i] += fz;
5263 
5264  if(T_DOENERGY){
5265  const char3 tr = transform[i];
5266  double3 vpos;
5267  vpos.x = pos_x[i];
5268  vpos.y = pos_y[i];
5269  vpos.z = pos_z[i];
5270 
5271  // each thread computes its contribution to the energy, extForce and extVirial
5272  vpos = lat.reverse_transform(vpos, tr);
5273  double3 o = lat.origin();
5274  r_extEnergy -= (fx * (vpos.x - o.x)) + (fy * (vpos.y - o.y)) + (fz * (vpos.z - o.z));
5275 
5276  if(!T_NORMALIZED){
5277  r_extForce.x = fx;
5278  r_extForce.y = fy;
5279  r_extForce.z = fz;
5280 
5281  // outer product to calculate a virial here
5282  r_extVirial.xx = fx * vpos.x;
5283  r_extVirial.xy = fx * vpos.y;
5284  r_extVirial.xz = fx * vpos.z;
5285  r_extVirial.yx = fy * vpos.x;
5286  r_extVirial.yy = fy * vpos.y;
5287  r_extVirial.yz = fy * vpos.z;
5288  r_extVirial.zx = fz * vpos.x;
5289  r_extVirial.zy = fz * vpos.y;
5290  r_extVirial.zz = fz * vpos.z;
5291  }
5292  }
5293  }
5294 
5295  __syncthreads();
5296  if(T_DOENERGY){
5297  // use cub to reduce energies
5298  typedef cub::BlockReduce<double, ATOM_BLOCKS> BlockReduce;
5299  __shared__ typename BlockReduce::TempStorage temp_storage;
5300  r_extEnergy = BlockReduce(temp_storage).Sum(r_extEnergy);
5301  __syncthreads();
5302  if(!T_NORMALIZED){
5303  // Do the remaining reductions
5304  // external force
5305  r_extForce.x = BlockReduce(temp_storage).Sum(r_extForce.x);
5306  __syncthreads();
5307  r_extForce.y = BlockReduce(temp_storage).Sum(r_extForce.y);
5308  __syncthreads();
5309  r_extForce.z = BlockReduce(temp_storage).Sum(r_extForce.z);
5310  __syncthreads();
5311  // external virial
5312  r_extVirial.xx = BlockReduce(temp_storage).Sum(r_extVirial.xx);
5313  __syncthreads();
5314  r_extVirial.xy = BlockReduce(temp_storage).Sum(r_extVirial.xy);
5315  __syncthreads();
5316  r_extVirial.xz = BlockReduce(temp_storage).Sum(r_extVirial.xz);
5317  __syncthreads();
5318  r_extVirial.yx = BlockReduce(temp_storage).Sum(r_extVirial.yx);
5319  __syncthreads();
5320  r_extVirial.yy = BlockReduce(temp_storage).Sum(r_extVirial.yy);
5321  __syncthreads();
5322  r_extVirial.yz = BlockReduce(temp_storage).Sum(r_extVirial.yz);
5323  __syncthreads();
5324  r_extVirial.zx = BlockReduce(temp_storage).Sum(r_extVirial.zx);
5325  __syncthreads();
5326  r_extVirial.zy = BlockReduce(temp_storage).Sum(r_extVirial.zy);
5327  __syncthreads();
5328  r_extVirial.zz = BlockReduce(temp_storage).Sum(r_extVirial.zz);
5329  __syncthreads();
5330  }
5331 
5332  // Thread 0 of each block atomically accumulates the block results into global memory
5333  if(threadIdx.x == 0 ){
5334  const int bin = blockIdx.x % ATOMIC_BINS;
5335 
5336  // printf("adding %lf to d_extEnergy\n", r_extEnergy);
5337  atomicAdd(&d_extEnergy[bin], r_extEnergy);
5338 
5339  if(!T_NORMALIZED){
5340  // add force and virial as well
5341  atomicAdd(&d_extForce[bin].x, r_extForce.x);
5342  atomicAdd(&d_extForce[bin].y, r_extForce.y);
5343  atomicAdd(&d_extForce[bin].z, r_extForce.z);
5344 
5345  atomicAdd(&d_extVirial[bin].xx, r_extVirial.xx);
5346  atomicAdd(&d_extVirial[bin].xy, r_extVirial.xy);
5347  atomicAdd(&d_extVirial[bin].xz, r_extVirial.xz);
5348 
5349  atomicAdd(&d_extVirial[bin].yx, r_extVirial.yx);
5350  atomicAdd(&d_extVirial[bin].yy, r_extVirial.yy);
5351  atomicAdd(&d_extVirial[bin].yz, r_extVirial.yz);
5352 
5353  atomicAdd(&d_extVirial[bin].zx, r_extVirial.zx);
5354  atomicAdd(&d_extVirial[bin].zy, r_extVirial.zy);
5355  atomicAdd(&d_extVirial[bin].zz, r_extVirial.zz);
5356  }
5357 
5358  __threadfence();
5359  unsigned int value = atomicInc(&tbcatomic[2], totaltb);
5360  isLastBlockDone = (value == (totaltb -1));
5361 
5362  }
5363  __syncthreads();
5364  if(isLastBlockDone){
5365  if(threadIdx.x < ATOMIC_BINS){
5366  const int bin = threadIdx.x;
5367 
5368  double e = d_extEnergy[bin];
5369  double3 f;
5370  cudaTensor v;
5371  if(!T_NORMALIZED){
5372  f = d_extForce[bin];
5373  v = d_extVirial[bin];
5374  }
5375 
5376  // sets device scalars back to zero
5377  d_extEnergy[bin] = 0.0;
5378  if(!T_NORMALIZED){
5379  d_extForce[bin] = make_double3(0.0, 0.0, 0.0);
5380  zero_cudaTensor(d_extVirial[bin]);
5381  }
5382 
5383  if(ATOMIC_BINS > 1){
5384  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
5385  typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
5386  __shared__ typename WarpReduce::TempStorage tempStorage;
5387  __shared__ typename WarpReduceT::TempStorage tempStorageT;
5388 
5389  e = WarpReduce(tempStorage).Sum(e);
5390  if(!T_NORMALIZED){
5391  f.x = WarpReduce(tempStorage).Sum(f.x);
5392  f.y = WarpReduce(tempStorage).Sum(f.y);
5393  f.z = WarpReduce(tempStorage).Sum(f.z);
5394  v = WarpReduceT(tempStorageT).Sum(v);
5395  }
5396  }
5397 
5398  if(threadIdx.x == 0){
5399  h_extEnergy[0] = e;
5400  if(!T_NORMALIZED){
5401  h_extForce[0] = f;
5402  h_extVirial[0] = v;
5403  }
5404 
5405  //resets atomic counter
5406  reset_atomic_counter(&(tbcatomic[2]));
5407  }
5408  }
5409  }
5410  }
5411 }
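// [Editor's sketch] eFieldKernel above completes a grid-wide reduction inside a single
// launch: thread 0 of each block publishes its partial result, issues __threadfence(),
// and atomicInc()s a counter; the block that observes the final count knows all other
// blocks have finished and performs the final fold plus the counter reset. The kernel
// below is a minimal, hypothetical version of that pattern for summing an array
// (lastBlockSumSketch is not part of NAMD); partial must hold one double per block and
// count must start at zero.
__global__ void lastBlockSumSketch(
  const double* __restrict in, const int n,
  double* __restrict partial,     // one slot per block
  double* __restrict result,      // single output value
  unsigned int* __restrict count) // zero-initialized completion counter
{
  typedef cub::BlockReduce<double, ATOM_BLOCKS> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  __shared__ bool isLastBlockDone;
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  double v = (i < n) ? in[i] : 0.0;
  v = BlockReduce(temp_storage).Sum(v);
  if (threadIdx.x == 0) {
    partial[blockIdx.x] = v;  // publish this block's partial sum
    __threadfence();          // make it visible before signalling completion
    const unsigned int done = atomicInc(count, gridDim.x);
    isLastBlockDone = (done == gridDim.x - 1);
  }
  __syncthreads();
  if (isLastBlockDone) {
    double s = 0.0;
    for (int b = threadIdx.x; b < (int)gridDim.x; b += blockDim.x) s += partial[b];
    s = BlockReduce(temp_storage).Sum(s);
    if (threadIdx.x == 0) {
      result[0] = s;
      count[0] = 0;  // reset the counter for the next launch
    }
  }
}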
5412 
5413 // JM NOTE: Apply external electric field to every atom in the system equally
5414 void SequencerCUDAKernel::apply_Efield(
5415  const int numAtoms,
5416  const bool normalized,
5417  const bool doEnergy,
5418  const double3 eField,
5419  const double eFieldOmega,
5420  const double eFieldPhi,
5421  const double t,
5422  const Lattice lat,
5423  const char3* transform,
5424  const float* charges,
5425  const double* pos_x,
5426  const double* pos_y,
5427  const double* pos_z,
5428  double* f_normal_x,
5429  double* f_normal_y,
5430  double* f_normal_z,
5431  double3* d_extForce,
5432  cudaTensor* d_extVirial,
5433  double* d_extEnergy,
5434  double3* h_extForce,
5435  cudaTensor* h_extVirial,
5436  double* h_extEnergy,
5437  unsigned int* tbcatomic,
5438  cudaStream_t stream
5439 ){
5440  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5441  if(normalized){
5442  if(doEnergy) {
5443  eFieldKernel<true, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5444  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5445  pos_x, pos_y, pos_z,
5446  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5447  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5448  }else{
5449  eFieldKernel<true, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5450  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5451  pos_x, pos_y, pos_z,
5452  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5453  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5454  }
5455  }else{
5456  if(doEnergy) {
5457  eFieldKernel<false, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5458  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5459  pos_x, pos_y, pos_z,
5460  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5461  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5462  }else{
5463  eFieldKernel<false, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5464  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5465  pos_x, pos_y, pos_z,
5466  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5467  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5468  }
5469  }
5470 }
5471 
5472 SequencerCUDAKernel::SequencerCUDAKernel() {
5473  firstRattleDone = false;
5474  intConstInit = false;
5475 
5476  d_nHG = NULL;
5477  d_nSettles = NULL;
5478  d_nRattles = NULL;
5479 
5480  hgi = NULL;
5481  hgi_size = 0;
5482 
5483  d_rattleList_temp_storage = NULL;
5484  temp_storage_bytes = 0;
5485 
5486  rattleIndexes = NULL;
5487  rattleIndexes_size = 0;
5488 }
5489 
5490 SequencerCUDAKernel::~SequencerCUDAKernel() {
5491  deallocate_device<int>(&d_nHG);
5492  deallocate_device<int>(&d_nSettles);
5493  deallocate_device<int>(&d_nRattles);
5494  deallocate_device<int>(&hgi);
5495  deallocate_device<int>(&rattleIndexes);
5496  deallocate_device<char>(&d_rattleList_temp_storage);
5497 }
5498 
5499 
5500 
5501 #if 1
5502 
5503 
5504 
5505 
5506 void SequencerCUDAKernel::set_pme_positions(
5507  const int devID,
5508  const bool isPmeDevice,
5509  const int nDev,
5510  const int numPatchesHomeAndProxy,
5511  const int numPatchesHome,
5512  const bool doNbond,
5513  const bool doSlow,
5514  const bool doFEP,
5515  const bool doTI,
5516  const bool doAlchDecouple,
5517  const bool doAlchSoftCore,
5518  const bool handleBoundary,
5519  const double* d_pos_x,
5520  const double* d_pos_y,
5521  const double* d_pos_z,
5522 #ifndef NAMD_NCCL_ALLREDUCE
5523  double** d_peer_pos_x,
5524  double** d_peer_pos_y,
5525  double** d_peer_pos_z,
5526  float** d_peer_charge,
5527  int** d_peer_partition,
5528 #endif
5529  const float* charges,
5530  const int* partition,
5531  const double charge_scaling,
5532  const double3* patchCenter,
5533  const int* s_patchPositions,
5534  const int* s_pencilPatchIndex,
5535  const int* s_patchIDs,
5536  const int* patchSortOrder,
5537  const Lattice lattice,
5538  float4* nb_atoms,
5539  float4* b_atoms,
5540  float4* s_atoms,
5541  int* s_partition,
5542  int numTotalAtoms,
5543  CudaLocalRecord* localRecords,
5544  CudaPeerRecord* peerRecords,
5545  std::vector<int>& atomCounts,
5546  cudaStream_t stream
5547 ) {
5548  // Launch PME setComputes
5549  #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
5550  setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
5551  <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
5552  d_pos_x, d_pos_y, d_pos_z, charges, \
5553  d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
5554  d_peer_partition, partition, charge_scaling, \
5555  s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
5556  lattice, s_atoms, numTotalAtoms, s_partition, \
5557  i, numAtoms, offset, handleBoundary \
5558  );
5559  // Only when PME long-range electrostatics is enabled (doSlow is true).
5560  // The partition is needed for alchemical calculations.
5561  if(doSlow && isPmeDevice) {
5562  int offset = 0;
5563  for (int i = 0; i < nDev; i++) {
5564  const bool home = (i == devID);
5565  const int numAtoms = atomCounts[i];
5566  const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
5567  const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
5568 
5569  switch (options) {
5570  case 0: CALL(0, 0, 0, 0, 0); break;
5571  case 1: CALL(0, 0, 0, 0, 1); break;
5572  case 2: CALL(0, 0, 0, 1, 0); break;
5573  case 3: CALL(0, 0, 0, 1, 1); break;
5574  case 4: CALL(0, 0, 1, 0, 0); break;
5575  case 5: CALL(0, 0, 1, 0, 1); break;
5576  case 6: CALL(0, 0, 1, 1, 0); break;
5577  case 7: CALL(0, 0, 1, 1, 1); break;
5578  case 8: CALL(0, 1, 0, 0, 0); break;
5579  case 9: CALL(0, 1, 0, 0, 1); break;
5580  case 10: CALL(0, 1, 0, 1, 0); break;
5581  case 11: CALL(0, 1, 0, 1, 1); break;
5582  case 12: CALL(0, 1, 1, 0, 0); break;
5583  case 13: CALL(0, 1, 1, 0, 1); break;
5584  case 14: CALL(0, 1, 1, 1, 0); break;
5585  case 15: CALL(0, 1, 1, 1, 1); break;
5586  case 16: CALL(1, 0, 0, 0, 0); break;
5587  case 17: CALL(1, 0, 0, 0, 1); break;
5588  case 18: CALL(1, 0, 0, 1, 0); break;
5589  case 19: CALL(1, 0, 0, 1, 1); break;
5590  case 20: CALL(1, 0, 1, 0, 0); break;
5591  case 21: CALL(1, 0, 1, 0, 1); break;
5592  case 22: CALL(1, 0, 1, 1, 0); break;
5593  case 23: CALL(1, 0, 1, 1, 1); break;
5594  case 24: CALL(1, 1, 0, 0, 0); break;
5595  case 25: CALL(1, 1, 0, 0, 1); break;
5596  case 26: CALL(1, 1, 0, 1, 0); break;
5597  case 27: CALL(1, 1, 0, 1, 1); break;
5598  case 28: CALL(1, 1, 1, 0, 0); break;
5599  case 29: CALL(1, 1, 1, 0, 1); break;
5600  case 30: CALL(1, 1, 1, 1, 0); break;
5601  case 31: CALL(1, 1, 1, 1, 1); break;
5602  default:
5603  NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
5604  }
5605 
5606  offset += numAtoms;
5607  }
5608  }
5609  #undef CALL
5610 }
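// [Editor's sketch] set_pme_positions above (like several other launchers in this file)
// packs its runtime booleans into a single integer and switches on it, so that every
// combination selects a distinct compile-time template instantiation of the kernel.
// Below is a minimal, hypothetical two-flag version of that dispatch idiom
// (dispatchSketchKernel / dispatchSketch are not part of NAMD).
template <bool A, bool B>
__global__ void dispatchSketchKernel(int* out) {
  // each instantiation is a separate kernel specialized at compile time
  if (threadIdx.x == 0 && blockIdx.x == 0) out[0] = (A ? 2 : 0) + (B ? 1 : 0);
}
static void dispatchSketch(const bool a, const bool b, int* d_out, cudaStream_t stream) {
  const int options = (a << 1) + b;  // pack the flags into one index
  switch (options) {
    case 0: dispatchSketchKernel<false, false><<<1, 32, 0, stream>>>(d_out); break;
    case 1: dispatchSketchKernel<false, true ><<<1, 32, 0, stream>>>(d_out); break;
    case 2: dispatchSketchKernel<true,  false><<<1, 32, 0, stream>>>(d_out); break;
    case 3: dispatchSketchKernel<true,  true ><<<1, 32, 0, stream>>>(d_out); break;
  }
}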
5611 
5612 template <bool doGlobal, bool doForcesOutput>
5613 __global__ void copyForcesToHostSOAKernel(
5614  const int numPatches,
5615  CudaLocalRecord* localRecords,
5616  const int maxForceNumber,
5617  const double* d_f_normal_x,
5618  const double* d_f_normal_y,
5619  const double* d_f_normal_z,
5620  const double* d_f_nbond_x,
5621  const double* d_f_nbond_y,
5622  const double* d_f_nbond_z,
5623  const double* d_f_slow_x,
5624  const double* d_f_slow_y,
5625  const double* d_f_slow_z,
5626  PatchDataSOA* d_HostPatchDataSOA
5627 ) {
5628  __shared__ CudaLocalRecord s_record;
5629  using AccessType = int32_t;
5630  AccessType* s_record_buffer = (AccessType*) &s_record;
5631 
5632  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5633  // Read in the CudaLocalRecord using multiple threads. This should
5634  #pragma unroll 1
5635  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5636  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5637  }
5638  __syncthreads();
5639 
5640  const int numAtoms = s_record.numAtoms;
5641  const int offset = s_record.bufferOffset;
5642 
5643  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5644  if (maxForceNumber >= 0) {
5645  d_HostPatchDataSOA[patchIndex].f_normal_x[i] = d_f_normal_x[offset + i];
5646  d_HostPatchDataSOA[patchIndex].f_normal_y[i] = d_f_normal_y[offset + i];
5647  d_HostPatchDataSOA[patchIndex].f_normal_z[i] = d_f_normal_z[offset + i];
5648  }
5649  if (maxForceNumber >= 1) {
5650  if (doGlobal) {
5651  d_HostPatchDataSOA[patchIndex].f_saved_nbond_x[i] = d_f_nbond_x[offset + i];
5652  d_HostPatchDataSOA[patchIndex].f_saved_nbond_y[i] = d_f_nbond_y[offset + i];
5653  d_HostPatchDataSOA[patchIndex].f_saved_nbond_z[i] = d_f_nbond_z[offset + i];
5654  }
5655  if (doForcesOutput) {
5656  d_HostPatchDataSOA[patchIndex].f_nbond_x[i] = d_f_nbond_x[offset + i];
5657  d_HostPatchDataSOA[patchIndex].f_nbond_y[i] = d_f_nbond_y[offset + i];
5658  d_HostPatchDataSOA[patchIndex].f_nbond_z[i] = d_f_nbond_z[offset + i];
5659  }
5660  }
5661  if (maxForceNumber >= 2) {
5662  if (doGlobal) {
5663  d_HostPatchDataSOA[patchIndex].f_saved_slow_x[i] = d_f_slow_x[offset + i];
5664  d_HostPatchDataSOA[patchIndex].f_saved_slow_y[i] = d_f_slow_y[offset + i];
5665  d_HostPatchDataSOA[patchIndex].f_saved_slow_z[i] = d_f_slow_z[offset + i];
5666  }
5667  if (doForcesOutput) {
5668  d_HostPatchDataSOA[patchIndex].f_slow_x[i] = d_f_slow_x[offset + i];
5669  d_HostPatchDataSOA[patchIndex].f_slow_y[i] = d_f_slow_y[offset + i];
5670  d_HostPatchDataSOA[patchIndex].f_slow_z[i] = d_f_slow_z[offset + i];
5671  }
5672  }
5673 
5674  }
5675  __syncthreads();
5676  }
5677 }
5678 
5679 void SequencerCUDAKernel::copyForcesToHostSOA(
5680  const int numPatches,
5681  CudaLocalRecord* localRecords,
5682  const int maxForceNumber,
5683  const double* d_f_normal_x,
5684  const double* d_f_normal_y,
5685  const double* d_f_normal_z,
5686  const double* d_f_nbond_x,
5687  const double* d_f_nbond_y,
5688  const double* d_f_nbond_z,
5689  const double* d_f_slow_x,
5690  const double* d_f_slow_y,
5691  const double* d_f_slow_z,
5692  PatchDataSOA* d_HostPatchDataSOA,
5693  const bool doGlobal,
5694  const bool doForcesOutput,
5695  cudaStream_t stream
5696 ) {
5697 #define CALL(GLOBAL, FORCESOUTPUT) \
5698  do {copyForcesToHostSOAKernel<GLOBAL, FORCESOUTPUT><<<numPatches, PATCH_BLOCKS, 0, stream>>>( \
5699  numPatches, \
5700  localRecords, \
5701  maxForceNumber, \
5702  d_f_normal_x, \
5703  d_f_normal_y, \
5704  d_f_normal_z, \
5705  d_f_nbond_x, \
5706  d_f_nbond_y, \
5707  d_f_nbond_z, \
5708  d_f_slow_x, \
5709  d_f_slow_y, \
5710  d_f_slow_z, \
5711  d_HostPatchDataSOA \
5712  );} while (0)
5713  if (doGlobal && !doForcesOutput) CALL(true, false);
5714  else if (doGlobal && doForcesOutput) CALL(true, true);
5715  else if (!doGlobal && doForcesOutput) CALL(false, true);
5716  else NAMD_bug("No kernel is called in copyForcesToHostSOA");
5717 #undef CALL
5718 }
5719 
5720 __global__ void copyPositionsToHostSOAKernel(
5721  const int numPatches,
5722  CudaLocalRecord* localRecords,
5723  const double* pos_x,
5724  const double* pos_y,
5725  const double* pos_z,
5726  PatchDataSOA* d_HostPatchDataSOA
5727 ) {
5728  __shared__ CudaLocalRecord s_record;
5729  using AccessType = int32_t;
5730  AccessType* s_record_buffer = (AccessType*) &s_record;
5731 
5732  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5733  // Read in the CudaLocalRecord using multiple threads. This should
5734  #pragma unroll 1
5735  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5736  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5737  }
5738  __syncthreads();
5739 
5740  const int numAtoms = s_record.numAtoms;
5741  const int offset = s_record.bufferOffset;
5742 
5743  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5744  d_HostPatchDataSOA[patchIndex].pos_x[i] = pos_x[offset + i];
5745  d_HostPatchDataSOA[patchIndex].pos_y[i] = pos_y[offset + i];
5746  d_HostPatchDataSOA[patchIndex].pos_z[i] = pos_z[offset + i];
5747  }
5748  __syncthreads();
5749  }
5750 }
5751 
5752 void SequencerCUDAKernel::copyPositionsToHostSOA(
5753  const int numPatches,
5754  CudaLocalRecord* localRecords,
5755  const double* pos_x,
5756  const double* pos_y,
5757  const double* pos_z,
5758  PatchDataSOA* d_HostPatchDataSOA,
5759  cudaStream_t stream
5760 ) {
5761  copyPositionsToHostSOAKernel<<<numPatches, PATCH_BLOCKS, 0, stream>>>(
5762  numPatches,
5763  localRecords,
5764  pos_x, pos_y, pos_z,
5765  d_HostPatchDataSOA
5766  );
5767 }
5768 
5769  // Adapted from HomePatch::redistrib_lp_water_force
5770 /* Redistribute forces from the massless lonepair charge particle onto
5771  * the other atoms of the water.
5772  *
5773  * This is done using the same algorithm that CHARMM uses for TIP4P lone pairs.
5774  *
5775  * Pass by reference the forces (O H1 H2 LP) to be modified,
5776  * pass by constant reference the corresponding positions.
5777  */
5778 template <bool doVirial>
5779 __device__ void redistrib_lp_water_force(
5780  double f_ox[3], double f_h1[3], double f_h2[3], double f_lp[3],
5781  const double p_ox[3], const double p_h1[3], const double p_h2[3],
5782  const double p_lp[3], cudaTensor& virial) {
5783  // Accumulate force adjustments
5784  double fad_ox[3] = {};
5785  double fad_h[3] = {};
5786  // Calculate the radial component of the force and add it to the oxygen
5787  const double r_ox_lp[3] = {p_lp[0] - p_ox[0],
5788  p_lp[1] - p_ox[1],
5789  p_lp[2] - p_ox[2]};
5790  double invlen2_r_ox_lp = rnorm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
5791  invlen2_r_ox_lp *= invlen2_r_ox_lp;
5792  const double rad_factor =
5793  (f_lp[0] * r_ox_lp[0] + f_lp[1] * r_ox_lp[1] + f_lp[2] * r_ox_lp[2]) * invlen2_r_ox_lp;
5794  const double f_rad[3] = {r_ox_lp[0] * rad_factor,
5795  r_ox_lp[1] * rad_factor,
5796  r_ox_lp[2] * rad_factor};
5797  fad_ox[0] += f_rad[0];
5798  fad_ox[1] += f_rad[1];
5799  fad_ox[2] += f_rad[2];
5800  // Calculate the angular component
5801  const double r_hcom_ox[3] = {
5802  p_ox[0] - ( (p_h1[0] + p_h2[0]) * 0.5 ),
5803  p_ox[1] - ( (p_h1[1] + p_h2[1]) * 0.5 ),
5804  p_ox[2] - ( (p_h1[2] + p_h2[2]) * 0.5 )};
5805  const double f_ang[3] = {
5806  f_lp[0] - f_rad[0],
5807  f_lp[1] - f_rad[1],
5808  f_lp[2] - f_rad[2]};
5809  // now split this component onto the other atoms
5810  const double len_r_ox_lp = norm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
5811  const double invlen_r_hcom_ox = rnorm3d(
5812  r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]);
5813  const double oxcomp =
5814  (norm3d(r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]) - len_r_ox_lp) *
5815  invlen_r_hcom_ox;
5816  const double hydcomp = 0.5 * len_r_ox_lp * invlen_r_hcom_ox;
5817  fad_ox[0] += (f_ang[0] * oxcomp);
5818  fad_ox[1] += (f_ang[1] * oxcomp);
5819  fad_ox[2] += (f_ang[2] * oxcomp);
5820  fad_h[0] += (f_ang[0] * hydcomp);
5821  fad_h[1] += (f_ang[1] * hydcomp);
5822  fad_h[2] += (f_ang[2] * hydcomp);
5823  if (doVirial) {
5824  virial.xx = fad_ox[0] * p_ox[0] + fad_h[0] * p_h1[0] + fad_h[0] * p_h2[0] - f_lp[0] * p_lp[0];
5825  virial.xy = fad_ox[0] * p_ox[1] + fad_h[0] * p_h1[1] + fad_h[0] * p_h2[1] - f_lp[0] * p_lp[1];
5826  virial.xz = fad_ox[0] * p_ox[2] + fad_h[0] * p_h1[2] + fad_h[0] * p_h2[2] - f_lp[0] * p_lp[2];
5827  virial.yx = fad_ox[1] * p_ox[0] + fad_h[1] * p_h1[0] + fad_h[1] * p_h2[0] - f_lp[1] * p_lp[0];
5828  virial.yy = fad_ox[1] * p_ox[1] + fad_h[1] * p_h1[1] + fad_h[1] * p_h2[1] - f_lp[1] * p_lp[1];
5829  virial.yz = fad_ox[1] * p_ox[2] + fad_h[1] * p_h1[2] + fad_h[1] * p_h2[2] - f_lp[1] * p_lp[2];
5830  virial.zx = fad_ox[2] * p_ox[0] + fad_h[2] * p_h1[0] + fad_h[2] * p_h2[0] - f_lp[2] * p_lp[0];
5831  virial.zy = fad_ox[2] * p_ox[1] + fad_h[2] * p_h1[1] + fad_h[2] * p_h2[1] - f_lp[2] * p_lp[1];
5832  virial.zz = fad_ox[2] * p_ox[2] + fad_h[2] * p_h1[2] + fad_h[2] * p_h2[2] - f_lp[2] * p_lp[2];
5833  }
5834  f_lp[0] = 0;
5835  f_lp[1] = 0;
5836  f_lp[2] = 0;
5837  f_ox[0] += fad_ox[0];
5838  f_ox[1] += fad_ox[1];
5839  f_ox[2] += fad_ox[2];
5840  f_h1[0] += fad_h[0];
5841  f_h1[1] += fad_h[1];
5842  f_h1[2] += fad_h[2];
5843  f_h2[0] += fad_h[0];
5844  f_h2[1] += fad_h[1];
5845  f_h2[2] += fad_h[2];
5846 }
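// [Editor's note] A step the comment block above leaves implicit: the redistribution
// conserves the total force on the water. With
//   oxcomp = (|r_hcom_ox| - |r_ox_lp|) / |r_hcom_ox| and
//   hydcomp = 0.5 * |r_ox_lp| / |r_hcom_ox|
// we have oxcomp + 2*hydcomp = 1, so
//   fad_ox + 2*fad_h = f_rad + (oxcomp + 2*hydcomp)*f_ang = f_rad + f_ang = f_lp,
// i.e. exactly the force removed from the lone pair reappears on O, H1 and H2.
// The virial term accumulated under doVirial accounts for the force now being applied
// at the O/H1/H2 positions instead of at the lone-pair site.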
5847 
5848 template <bool doVirial>
5849 __global__ void redistributeTip4pForcesKernel2(
5850  double* d_f_x,
5851  double* d_f_y,
5852  double* d_f_z,
5853  cudaTensor* d_virial,
5854  const double* d_pos_x,
5855  const double* d_pos_y,
5856  const double* d_pos_z,
5857  const float* d_mass,
5858  const int numAtoms)
5859 {
5860  const int i = threadIdx.x + blockIdx.x * blockDim.x;
5861  cudaTensor lVirial = {0};
5862  if (i < numAtoms) {
5863  if (d_mass[i] < 0.01f) {
5864  double f_ox[3] = {d_f_x[i-3], d_f_y[i-3], d_f_z[i-3]};
5865  double f_h1[3] = {d_f_x[i-2], d_f_y[i-2], d_f_z[i-2]};
5866  double f_h2[3] = {d_f_x[i-1], d_f_y[i-1], d_f_z[i-1]};
5867  double f_lp[3] = {d_f_x[i], d_f_y[i], d_f_z[i]};
5868  const double p_ox[3] = {d_pos_x[i-3], d_pos_y[i-3], d_pos_z[i-3]};
5869  const double p_h1[3] = {d_pos_x[i-2], d_pos_y[i-2], d_pos_z[i-2]};
5870  const double p_h2[3] = {d_pos_x[i-1], d_pos_y[i-1], d_pos_z[i-1]};
5871  const double p_lp[3] = {d_pos_x[i], d_pos_y[i], d_pos_z[i]};
5872  redistrib_lp_water_force<doVirial>(
5873  f_ox, f_h1, f_h2, f_lp, p_ox, p_h1, p_h2, p_lp, lVirial);
5874  // copy the force back
5875  d_f_x[i-3] = f_ox[0];
5876  d_f_x[i-2] = f_h1[0];
5877  d_f_x[i-1] = f_h2[0];
5878  d_f_x[i] = f_lp[0];
5879  d_f_y[i-3] = f_ox[1];
5880  d_f_y[i-2] = f_h1[1];
5881  d_f_y[i-1] = f_h2[1];
5882  d_f_y[i] = f_lp[1];
5883  d_f_z[i-3] = f_ox[2];
5884  d_f_z[i-2] = f_h1[2];
5885  d_f_z[i-1] = f_h2[2];
5886  d_f_z[i] = f_lp[2];
5887  }
5888  }
5889  typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
5890  __shared__ typename BlockReduce::TempStorage temp_storage;
5891  if (doVirial) {
5892  lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx);
5893  __syncthreads();
5894  lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy);
5895  __syncthreads();
5896  lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz);
5897  __syncthreads();
5898  lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx);
5899  __syncthreads();
5900  lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
5901  __syncthreads();
5902  lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz);
5903  __syncthreads();
5904  lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx);
5905  __syncthreads();
5906  lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy);
5907  __syncthreads();
5908  lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz);
5909  __syncthreads();
5910  if (threadIdx.x == 0) {
5911  atomicAdd(&(d_virial->xx), lVirial.xx);
5912  atomicAdd(&(d_virial->xy), lVirial.xy);
5913  atomicAdd(&(d_virial->xz), lVirial.xz);
5914  atomicAdd(&(d_virial->yx), lVirial.yx);
5915  atomicAdd(&(d_virial->yy), lVirial.yy);
5916  atomicAdd(&(d_virial->yz), lVirial.yz);
5917  atomicAdd(&(d_virial->zx), lVirial.zx);
5918  atomicAdd(&(d_virial->zy), lVirial.zy);
5919  atomicAdd(&(d_virial->zz), lVirial.zz);
5920  }
5921  }
5922 }
5923 
5924 void SequencerCUDAKernel::redistributeTip4pForces(
5925  double* d_f_normal_x,
5926  double* d_f_normal_y,
5927  double* d_f_normal_z,
5928  double* d_f_nbond_x,
5929  double* d_f_nbond_y,
5930  double* d_f_nbond_z,
5931  double* d_f_slow_x,
5932  double* d_f_slow_y,
5933  double* d_f_slow_z,
5934  cudaTensor* d_virial_normal,
5935  cudaTensor* d_virial_nbond,
5936  cudaTensor* d_virial_slow,
5937  const double* d_pos_x,
5938  const double* d_pos_y,
5939  const double* d_pos_z,
5940  const float* d_mass,
5941  const int numAtoms,
5942  const int doVirial,
5943  const int maxForceNumber,
5944  cudaStream_t stream
5945 ) {
5946  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5947  switch (maxForceNumber) {
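  // NOTE: intentional fall-through; every force level from maxForceNumber down to 0
  // is redistributed (slow, then nonbonded, then normal)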
5948  case 2:
5949  if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5950  else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5951  case 1:
5952  if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5953  else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5954  case 0:
5955  if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5956  else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5957  }
5958 }
5959 
5960 __forceinline__ __device__ void calcVirial(
5961  const int i, const double factor,
5962  const double* __restrict f_x, const double* __restrict f_y, const double* __restrict f_z,
5963  const double* __restrict r_x, const double* __restrict r_y, const double* __restrict r_z,
5964  cudaTensor& tensor_out) {
5965  tensor_out.xx = factor * f_x[i] * r_x[i];
5966  tensor_out.xy = factor * f_x[i] * r_y[i];
5967  tensor_out.xz = factor * f_x[i] * r_z[i];
5968  tensor_out.yx = factor * f_y[i] * r_x[i];
5969  tensor_out.yy = factor * f_y[i] * r_y[i];
5970  tensor_out.yz = factor * f_y[i] * r_z[i];
5971  tensor_out.zx = factor * f_z[i] * r_x[i];
5972  tensor_out.zy = factor * f_z[i] * r_y[i];
5973  tensor_out.zz = factor * f_z[i] * r_z[i];
5974 }
5975 
5976 template <bool doNbond, bool doSlow>
5977 __global__ void calcFixVirialKernel(
5978  const int numAtoms,
5979  const int* __restrict d_atomFixed,
5980  const double* __restrict d_fixedPosition_x,
5981  const double* __restrict d_fixedPosition_y,
5982  const double* __restrict d_fixedPosition_z,
5983  const double* __restrict d_f_normal_x,
5984  const double* __restrict d_f_normal_y,
5985  const double* __restrict d_f_normal_z,
5986  const double* __restrict d_f_nbond_x,
5987  const double* __restrict d_f_nbond_y,
5988  const double* __restrict d_f_nbond_z,
5989  const double* __restrict d_f_slow_x,
5990  const double* __restrict d_f_slow_y,
5991  const double* __restrict d_f_slow_z,
5992  cudaTensor* __restrict d_virial_normal,
5993  cudaTensor* __restrict d_virial_nbond,
5994  cudaTensor* __restrict d_virial_slow,
5995  double3* __restrict d_extForce_normal,
5996  double3* __restrict d_extForce_nbond,
5997  double3* __restrict d_extForce_slow)
5998 {
5999  const int i = threadIdx.x + blockIdx.x * blockDim.x;
6000  cudaTensor vir_normal = {0};
6001  cudaTensor vir_nbond = {0};
6002  cudaTensor vir_slow = {0};
6003  double3 f_sum_normal = {0, 0, 0};
6004  double3 f_sum_nbond;
6005  if (doNbond) {
6006  f_sum_nbond = {0, 0, 0};
6007  }
6008  double3 f_sum_slow;
6009  if (doSlow) {
6010  f_sum_slow = {0, 0, 0};
6011  }
6012  if (i < numAtoms) {
6013  if (d_atomFixed[i]) {
6014  calcVirial(i, -1.0, d_f_normal_x, d_f_normal_y, d_f_normal_z,
6015  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_normal);
6016  f_sum_normal.x = -1.0 * d_f_normal_x[i];
6017  f_sum_normal.y = -1.0 * d_f_normal_y[i];
6018  f_sum_normal.z = -1.0 * d_f_normal_z[i];
6019  if (doNbond) {
6020  calcVirial(i, -1.0, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
6021  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_nbond);
6022  f_sum_nbond.x = -1.0 * d_f_nbond_x[i];
6023  f_sum_nbond.y = -1.0 * d_f_nbond_y[i];
6024  f_sum_nbond.z = -1.0 * d_f_nbond_z[i];
6025  }
6026  if (doSlow) {
6027  calcVirial(i, -1.0, d_f_slow_x, d_f_slow_y, d_f_slow_z,
6028  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_slow);
6029  f_sum_slow.x = -1.0 * d_f_slow_x[i];
6030  f_sum_slow.y = -1.0 * d_f_slow_y[i];
6031  f_sum_slow.z = -1.0 * d_f_slow_z[i];
6032  }
6033  }
6034  }
6035  // Haochuan: is it OK to use lambda to reduce the cudaTensor as a whole instead of component-wise??
6036  typedef cub::BlockReduce<cudaTensor, ATOM_BLOCKS> BlockReduceCudaTensor;
6037  __shared__ typename BlockReduceCudaTensor::TempStorage temp_storage;
6038  vir_normal = BlockReduceCudaTensor(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
6039  __syncthreads();
6040  if (doNbond) {
6041  vir_nbond = BlockReduceCudaTensor(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
6042  __syncthreads();
6043  }
6044  if (doSlow) {
6045  vir_slow = BlockReduceCudaTensor(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
6046  __syncthreads();
6047  }
6048  typedef cub::BlockReduce<double3, ATOM_BLOCKS> BlockReduceDouble3;
6049  __shared__ typename BlockReduceDouble3::TempStorage temp_storage2;
6050  f_sum_normal = BlockReduceDouble3(temp_storage2).Reduce(f_sum_normal, [](const double3& a, const double3& b){return a+b;});
6051  __syncthreads();
6052  if (doNbond) {
6053  f_sum_nbond = BlockReduceDouble3(temp_storage2).Reduce(f_sum_nbond, [](const double3& a, const double3& b){return a+b;});
6054  __syncthreads();
6055  }
6056  if (doSlow) {
6057  f_sum_slow = BlockReduceDouble3(temp_storage2).Reduce(f_sum_slow, [](const double3& a, const double3& b){return a+b;});
6058  __syncthreads();
6059  }
6060  if (threadIdx.x == 0) {
6061  atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
6062  atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
6063  atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
6064  atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
6065  atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
6066  atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
6067  atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
6068  atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
6069  atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
6070  atomicAdd(&(d_extForce_normal->x), f_sum_normal.x);
6071  atomicAdd(&(d_extForce_normal->y), f_sum_normal.y);
6072  atomicAdd(&(d_extForce_normal->z), f_sum_normal.z);
6073  if (doNbond) {
6074  atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
6075  atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
6076  atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
6077  atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
6078  atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
6079  atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
6080  atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
6081  atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
6082  atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
6083  atomicAdd(&(d_extForce_nbond->x), f_sum_nbond.x);
6084  atomicAdd(&(d_extForce_nbond->y), f_sum_nbond.y);
6085  atomicAdd(&(d_extForce_nbond->z), f_sum_nbond.z);
6086  }
6087  if (doSlow) {
6088  atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
6089  atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
6090  atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
6091  atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
6092  atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
6093  atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
6094  atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
6095  atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
6096  atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
6097  atomicAdd(&(d_extForce_slow->x), f_sum_slow.x);
6098  atomicAdd(&(d_extForce_slow->y), f_sum_slow.y);
6099  atomicAdd(&(d_extForce_slow->z), f_sum_slow.z);
6100  }
6101  }
6102 }
6103 
6104 __global__ void copyForcesToDeviceKernel(
6105  const int numAtoms,
6106  const double* d_f_nbond_x,
6107  const double* d_f_nbond_y,
6108  const double* d_f_nbond_z,
6109  const double* d_f_slow_x,
6110  const double* d_f_slow_y,
6111  const double* d_f_slow_z,
6112  double* d_f_saved_nbond_x,
6113  double* d_f_saved_nbond_y,
6114  double* d_f_saved_nbond_z,
6115  double* d_f_saved_slow_x,
6116  double* d_f_saved_slow_y,
6117  double* d_f_saved_slow_z,
6118  const int maxForceNumber
6119 ) {
6120  const int i = threadIdx.x + blockIdx.x * blockDim.x;
6121  if (i < numAtoms) {
6122  if (maxForceNumber >= 2) {
6123  d_f_saved_slow_x[i] = d_f_slow_x[i];
6124  d_f_saved_slow_y[i] = d_f_slow_y[i];
6125  d_f_saved_slow_z[i] = d_f_slow_z[i];
6126  }
6127  if (maxForceNumber >= 1) {
6128  d_f_saved_nbond_x[i] = d_f_nbond_x[i];
6129  d_f_saved_nbond_y[i] = d_f_nbond_y[i];
6130  d_f_saved_nbond_z[i] = d_f_nbond_z[i];
6131  }
6132  }
6133 }
6134 
6135 void SequencerCUDAKernel::calcFixVirial(
6136  const int maxForceNumber,
6137  const int numAtoms,
6138  const int* d_atomFixed,
6139  const double* d_fixedPosition_x,
6140  const double* d_fixedPosition_y,
6141  const double* d_fixedPosition_z,
6142  const double* d_f_normal_x,
6143  const double* d_f_normal_y,
6144  const double* d_f_normal_z,
6145  const double* d_f_nbond_x,
6146  const double* d_f_nbond_y,
6147  const double* d_f_nbond_z,
6148  const double* d_f_slow_x,
6149  const double* d_f_slow_y,
6150  const double* d_f_slow_z,
6151  cudaTensor* d_virial_normal,
6152  cudaTensor* d_virial_nbond,
6153  cudaTensor* d_virial_slow,
6154  double3* d_extForce_normal,
6155  double3* d_extForce_nbond,
6156  double3* d_extForce_slow,
6157  cudaStream_t stream)
6158 {
6159  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
6160 #define CALL(doNbond, doSlow) \
6161  calcFixVirialKernel<doNbond, doSlow><<<grid, ATOM_BLOCKS, 0, stream>>>( \
6162  numAtoms, d_atomFixed, \
6163  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, \
6164  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
6165  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
6166  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
6167  d_virial_normal, d_virial_nbond, d_virial_slow, \
6168  d_extForce_normal, d_extForce_nbond, d_extForce_slow \
6169  );
6170  switch (maxForceNumber) {
6171  case 0: CALL(false, false); break;
6172  case 1: CALL(true, false); break;
6173  case 2: CALL(true, true); break;
6174  default: NAMD_bug("No kernel is called in SequencerCUDAKernel::calcFixVirial!\n");
6175  }
6176 #undef CALL
6177 }
6178 
6179 void SequencerCUDAKernel::copyForcesToDevice(
6180  const int numAtoms,
6181  const double* d_f_nbond_x,
6182  const double* d_f_nbond_y,
6183  const double* d_f_nbond_z,
6184  const double* d_f_slow_x,
6185  const double* d_f_slow_y,
6186  const double* d_f_slow_z,
6187  double* d_f_saved_nbond_x,
6188  double* d_f_saved_nbond_y,
6189  double* d_f_saved_nbond_z,
6190  double* d_f_saved_slow_x,
6191  double* d_f_saved_slow_y,
6192  double* d_f_saved_slow_z,
6193  const int maxForceNumber,
6194  cudaStream_t stream
6195 ) {
6196  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
6197  copyForcesToDeviceKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
6198  numAtoms, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
6199  d_f_slow_x, d_f_slow_y, d_f_slow_z,
6200  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
6201  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z,
6202  maxForceNumber
6203  );
6204 }
6205 
6206 #endif
6207 #endif // NODEGROUP_FORCE_REGISTER