NAMD
MigrationCUDAKernel.cu
1 #ifdef NAMD_CUDA
2 #if __CUDACC_VER_MAJOR__ >= 11
3 #include <cub/cub.cuh>
4 #else
5 #include <namd_cub/cub.cuh>
6 #endif
7 
8 #include <thrust/iterator/transform_iterator.h>
9 #include <thrust/device_vector.h>
10 
11 #else // NAMD_HIP
12 #include <hip/hip_runtime.h>
13 #include <hipcub/hipcub.hpp>
14 #define cub hipcub
15 #endif
16 
17 #include "HipDefines.h"
18 
19 #include "MigrationCUDAKernel.h"
20 #include "CudaComputeNonbondedKernel.h"
21 #include "CudaComputeNonbondedKernel.hip.h"
22 
23 #ifdef NODEGROUP_FORCE_REGISTER
24 
25 #define MAX_VALUE 2147483647
26 #define BIG_FLOAT 1e20
27 #define SMALL_FLOAT -1e20
28 
29 __device__ __forceinline__ void singleBisect(
30  const int bit_pos,
31  const int bit_total,
32  const double min_dim,
33  const double max_dim,
34  const double* current_dim,
35  const int numAtoms,
36  unsigned int* s_indexBuffer,
37  unsigned int (&thread_start)[MigrationCUDAKernel::kValuesPerThread],
38  unsigned int (&thread_end)[MigrationCUDAKernel::kValuesPerThread],
39  unsigned int (&thread_out)[MigrationCUDAKernel::kValuesPerThread],
40  unsigned int (&thread_keys)[MigrationCUDAKernel::kValuesPerThread],
41  unsigned int (&thread_values)[MigrationCUDAKernel::kValuesPerThread]
42 ) {
43  constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
44  typedef cub::BlockRadixSort<unsigned int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, unsigned int> BlockRadixSort;
45  __shared__ typename BlockRadixSort::TempStorage temp_sort;
46 
47 
48  for (int i = 0; i < kValuesPerThread; i++) {
49  const int idx = kValuesPerThread * threadIdx.x + i;
50  if (idx < numAtoms) {
51  const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
52  thread_keys[i] = thread_out[i] | pos_norm;
53  thread_values[i] = idx;
54  } else {
55  thread_keys[i] = ~0;
56  thread_values[i] = ~0;
57  }
58  }
59  __syncthreads();
60 
61  BlockRadixSort(temp_sort).Sort(thread_keys, thread_values);
62  __syncthreads();
63 
64  for (int i = 0; i < kValuesPerThread; i++) {
65  const int idx = kValuesPerThread * threadIdx.x + i;
66  if (thread_values[i] < numAtoms) {
67  s_indexBuffer[thread_values[i]] = idx;
68  }
69  }
70  __syncthreads();
71 
72  // Get sort index
73  for (int i = 0; i < kValuesPerThread; i++) {
74  const int idx = kValuesPerThread * threadIdx.x + i;
75  if (idx < numAtoms) {
76  const int newIndex = s_indexBuffer[idx];
77 
78  int split_factor = 32;
79  int split = -1;
80  while (split == -1) {
81  if ( thread_start[i]/split_factor < (thread_end[i]-1)/split_factor ) {
82  split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
83  }
84  split_factor /= 2;
85  if (split_factor == 1){
86  split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
87  }
88  }
89 
90  if (newIndex >= split) {
91  thread_out[i] |= 1 << (30 - bit_pos);
92  thread_start[i] = split;
93  } else {
94  thread_end[i] = split;
95  }
96  if (bit_pos == bit_total - 1) {
97  const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
98  thread_out[i] |= pos_norm;
99  }
100  }
101  }
102  __syncthreads();
103 }
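// Editor's note: a minimal host-side sketch (not part of NAMD) of the key
// layout produced by singleBisect() above. The high bits accumulate the
// bisection decisions (bit 30 downward), while the low 18 bits hold the
// coordinate normalized into [0, 2^18). The helper name is hypothetical.
static inline unsigned int exampleBisectKey(
    double coord, double min_dim, double max_dim,
    unsigned int path_bits  // bisection bits accumulated so far (thread_out[i])
) {
  const unsigned int pos_norm =
      (unsigned int)((coord - min_dim) / (max_dim - min_dim) * (1 << 18));
  return path_bits | pos_norm;  // same packing as thread_keys[i] above
}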
104 
105 
106 __global__ void computeBisect(
107  const int numPatches,
108  const int numGrids,
109  const CudaLocalRecord* localRecords,
110  const double3* patchMin,
111  const double3* patchMax,
112  const double* pos_x,
113  const double* pos_y,
114  const double* pos_z,
115  int* sortOrder,
116  int* sortIndex
117 ) {
118  constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
119 
120  __shared__ CudaLocalRecord s_record;
121  using AccessType = int32_t;
122  AccessType* s_record_buffer = (AccessType*) &s_record;
123 
124  typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
125  __shared__ typename BlockReduce::TempStorage temp_storage;
126 
127  __shared__ double3 s_pmin;
128  __shared__ double3 s_pmax;
129 
130 
131  __shared__ unsigned int s_indexBuffer[MigrationCUDAKernel::kMaxAtomsPerPatch];
132 
133  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
134  #pragma unroll
135  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
136  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
137  }
138  __syncthreads();
139 
140  const int numAtoms = s_record.numAtoms;
141  const int offset = s_record.bufferOffset;
142 
143  // Compute the thread-local min/max values
144  double3 pmin, pmax;
145  pmin.x = BIG_FLOAT;
146  pmin.y = BIG_FLOAT;
147  pmin.z = BIG_FLOAT;
148  pmax.x = SMALL_FLOAT;
149  pmax.y = SMALL_FLOAT;
150  pmax.z = SMALL_FLOAT;
151  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
152  pmin.x = fmin(pmin.x, pos_x[offset + i]);
153  pmin.y = fmin(pmin.y, pos_y[offset + i]);
154  pmin.z = fmin(pmin.z, pos_z[offset + i]);
155  pmax.x = fmax(pmax.x, pos_x[offset + i]);
156  pmax.y = fmax(pmax.y, pos_y[offset + i]);
157  pmax.z = fmax(pmax.z, pos_z[offset + i]);
158  }
159  __syncthreads();
160 
161  // Compute the thread-block min/max values
162 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
163  auto minOp = cub::Min();
164  auto maxOp = cub::Max();
165 #else
166  auto minOp = cuda::minimum<double>();
167  auto maxOp = cuda::maximum<double>();
168 #endif
169  pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, minOp);
170  __syncthreads();
171  pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, minOp);
172  __syncthreads();
173  pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, minOp);
174  __syncthreads();
175 
176  pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, maxOp);
177  __syncthreads();
178  pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, maxOp);
179  __syncthreads();
180  pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, maxOp);
181  __syncthreads();
182 
183  if (threadIdx.x == 0) {
184  s_pmin = pmin;
185  s_pmax = pmax;
186  }
187  __syncthreads();
188 
189  pmin = s_pmin;
190  pmax = s_pmax;
191 
192 
193  unsigned int thread_start[kValuesPerThread];
194  unsigned int thread_end[kValuesPerThread];
195  unsigned int thread_out[kValuesPerThread];
196  unsigned int thread_keys[kValuesPerThread];
197  unsigned int thread_values[kValuesPerThread];
198 
199  for (int i = 0; i < kValuesPerThread; i++) {
200  thread_out[i] = 0;
201  thread_start[i] = 0;
202  thread_end[i] = numAtoms;
203  }
204 
205  double diff_x = pmax.x - pmin.x;
206  double diff_y = pmax.y - pmin.y;
207  double diff_z = pmax.z - pmin.z;
208  int bit = 0;
209  const int num_iters = 3;
210  for (int i = 0; i < num_iters; i++) {
211  if (diff_x > diff_y && diff_y > diff_z) {
212  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
213  thread_start, thread_end, thread_out, thread_keys, thread_values);
214  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
215  thread_start, thread_end, thread_out, thread_keys, thread_values);
216  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
217  thread_start, thread_end, thread_out, thread_keys, thread_values);
218  } else if (diff_x > diff_z && diff_z > diff_y) {
219  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
220  thread_start, thread_end, thread_out, thread_keys, thread_values);
221  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
222  thread_start, thread_end, thread_out, thread_keys, thread_values);
223  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
224  thread_start, thread_end, thread_out, thread_keys, thread_values);
225  } else if (diff_y > diff_x && diff_x > diff_z) {
226  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
227  thread_start, thread_end, thread_out, thread_keys, thread_values);
228  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
229  thread_start, thread_end, thread_out, thread_keys, thread_values);
230  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
231  thread_start, thread_end, thread_out, thread_keys, thread_values);
232  } else if (diff_y > diff_z && diff_z > diff_x) {
233  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
234  thread_start, thread_end, thread_out, thread_keys, thread_values);
235  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
236  thread_start, thread_end, thread_out, thread_keys, thread_values);
237  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
238  thread_start, thread_end, thread_out, thread_keys, thread_values);
239  } else if (diff_z > diff_x && diff_x > diff_y) {
240  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
241  thread_start, thread_end, thread_out, thread_keys, thread_values);
242  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
243  thread_start, thread_end, thread_out, thread_keys, thread_values);
244  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
245  thread_start, thread_end, thread_out, thread_keys, thread_values);
246  } else {
247  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
248  thread_start, thread_end, thread_out, thread_keys, thread_values);
249  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
250  thread_start, thread_end, thread_out, thread_keys, thread_values);
251  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
252  thread_start, thread_end, thread_out, thread_keys, thread_values);
253  }
254  }
255 
256  for (int i = 0; i < kValuesPerThread; i++) {
257  const int idx = kValuesPerThread * threadIdx.x + i;
258  if (idx < numAtoms) {
259  sortIndex[offset + idx] = thread_out[i];
260  sortOrder[offset + idx] = idx;
261  }
262  }
263  __syncthreads();
264 
265  }
266 }
267 
268 
269 __device__ __forceinline__ int simple_grid(const int numGrids, const int x, const int y, const int z) {
270  const int index = x + numGrids * (y + numGrids * z);
271  return index;
272 }
273 
274 __device__ __forceinline__ int snake_grid(const int numGrids, const int x, const int y, const int z) {
275  int index = numGrids * numGrids * x;
276  if (x % 2 == 0) {
277  index += numGrids * y;
278  } else {
279  index += numGrids * (numGrids - 1 - y);
280  }
281  if (y % 2 == x % 2) {
282  index += z;
283  } else {
284  index += (numGrids - 1 - z);
285  }
286  return index;
287 }
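// Editor's note: a worked example (not part of NAMD) of the snake ordering.
// For numGrids = 2, snake_grid() visits the 8 sub-cells in the order
//   (0,0,0)->0, (0,0,1)->1, (0,1,1)->2, (0,1,0)->3,
//   (1,1,0)->4, (1,1,1)->5, (1,0,1)->6, (1,0,0)->7
// i.e. a serpentine path in which consecutive indices are spatially adjacent,
// whereas simple_grid() can place non-adjacent cells at consecutive indices
// (e.g. (1,0,0)->1 followed by (0,1,0)->2).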
288 
289 
290 /**
291  * \brief Computes the nonbonded index of atoms for optimal clustering
292  *
293  * \par
294  * Each thread-block is assigned to a patch, and does the following:
295  * - First, the minimum and maximum coordinates of the atoms in the patch are computed.
296  *   Note that using patchMin and patchMax directly does not produce the same results;
297  *   it is unclear whether this is due to migration coordinates or the lattice.
298  * - Second, the nonbonded index of each atom is computed and stored.
299  *   The exact spatial hashing algorithm still needs investigation.
300  *
301  */
302 __global__ void computeNonbondedIndex(
303  const int numPatches,
304  const int numGrids,
305  const CudaLocalRecord* localRecords,
306  const double3* patchMin,
307  const double3* patchMax,
308  const double* pos_x,
309  const double* pos_y,
310  const double* pos_z,
311  int* sortOrder,
312  int* sortIndex
313 ) {
314  __shared__ CudaLocalRecord s_record;
315  using AccessType = int32_t;
316  AccessType* s_record_buffer = (AccessType*) &s_record;
317 
318  typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
319  __shared__ typename BlockReduce::TempStorage temp_storage;
320 
321  __shared__ double3 s_pmin;
322  __shared__ double3 s_pmax;
323 
324  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
325  #pragma unroll
326  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
327  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
328  }
329  __syncthreads();
330 
331  const int numAtoms = s_record.numAtoms;
332  const int offset = s_record.bufferOffset;
333 
334  // Compute the thread-local min/max values
335  double3 pmin, pmax;
336  pmin.x = BIG_FLOAT;
337  pmin.y = BIG_FLOAT;
338  pmin.z = BIG_FLOAT;
339  pmax.x = SMALL_FLOAT;
340  pmax.y = SMALL_FLOAT;
341  pmax.z = SMALL_FLOAT;
342  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
343  pmin.x = fmin(pmin.x, pos_x[offset + i]);
344  pmin.y = fmin(pmin.y, pos_y[offset + i]);
345  pmin.z = fmin(pmin.z, pos_z[offset + i]);
346  pmax.x = fmax(pmax.x, pos_x[offset + i]);
347  pmax.y = fmax(pmax.y, pos_y[offset + i]);
348  pmax.z = fmax(pmax.z, pos_z[offset + i]);
349  }
350  __syncthreads();
351 
352  // Compute the thread-block min/max values
353 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
354  auto minOp = cub::Min();
355  auto maxOp = cub::Max();
356 #else
357  auto minOp = cuda::minimum<double>();
358  auto maxOp = cuda::maximum<double>();
359 #endif
360  pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, minOp);
361  __syncthreads();
362  pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, minOp);
363  __syncthreads();
364  pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, minOp);
365  __syncthreads();
366 
367  pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, maxOp);
368  __syncthreads();
369  pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, maxOp);
370  __syncthreads();
371  pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, maxOp);
372  __syncthreads();
373 
374  if (threadIdx.x == 0) {
375  s_pmin = pmin;
376  s_pmax = pmax;
377  }
378  __syncthreads();
379 
380  pmin = s_pmin;
381  pmax = s_pmax;
382 
383  // Compute the sort index
384  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
385  const double x = pos_x[offset + i];
386  const double y = pos_y[offset + i];
387  const double z = pos_z[offset + i];
388 
389  // Compute subpatch index
390  int idx_x = (int)((x - pmin.x) / (pmax.x - pmin.x) * ((double) numGrids));
391  int idx_y = (int)((y - pmin.y) / (pmax.y - pmin.y) * ((double) numGrids));
392  int idx_z = (int)((z - pmin.z) / (pmax.z - pmin.z) * ((double) numGrids));
393  idx_x = min(max(idx_x, 0), numGrids-1);
394  idx_y = min(max(idx_y, 0), numGrids-1);
395  idx_z = min(max(idx_z, 0), numGrids-1);
396 
397  // Compute sort index
398  const int block_index = snake_grid(numGrids, idx_x, idx_y, idx_z);
399  const double z_norm = (z - pmin.z) / (pmax.z - pmin.z);
400 
401  const int reverse = ((idx_y % 2) == (idx_x % 2));
402  int inner_index;
403  if (reverse) {
404  inner_index = (unsigned int) (z_norm * (1 << 16));
405  } else {
406  inner_index = ~((unsigned int) (z_norm * (1 << 16)));
407  }
408 
409  sortIndex[offset + i] = (block_index << 17) + inner_index;
410  sortOrder[offset + i] = i;
411  }
412  __syncthreads();
413  }
414 }
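// Editor's note: hedged summary (not part of NAMD) of the key built above.
// Each atom's sortIndex is approximately
//   key = (snake_grid(numGrids, idx_x, idx_y, idx_z) << 17) + inner_index
// where inner_index is the z position normalized to 16 bits, bit-inverted on
// alternating (idx_x, idx_y) columns so that successive sub-cells are walked
// in opposite z directions. Sorting by this key therefore clusters atoms by
// sub-cell and orders them along a serpentine path through the patch.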
415 
416 /**
417  * \brief Sorts atoms into the nonbonded ordering based on previously computed indices
418  *
419  * \par
420  * Uses CUB's block-level sort algorithm to generate the nonbonded ordering based on the
421  * previously computed spatial hashing. It generates both a forward and a backward mapping
422  *
423  */
424 __global__ void sortAtomsKernel(
425  const int numPatches,
426  const CudaLocalRecord* localRecords,
427  int* sortOrder,
428  int* unsortOrder,
429  int* sortIndex
430 ) {
431  __shared__ CudaLocalRecord s_record;
432  using AccessType = int32_t;
433  AccessType* s_record_buffer = (AccessType*) &s_record;
434 
435  typedef cub::BlockRadixSort<int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, int> BlockRadixSort;
436  __shared__ typename BlockRadixSort::TempStorage temp_storage;
437 
438  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
439  // Read in the CudaLocalRecord using multiple threads. This should
440  #pragma unroll 1
441  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
442  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
443  }
444  __syncthreads();
445 
446  const int numAtoms = s_record.numAtoms;
447  const int offset = s_record.bufferOffset;
448 
449  int thread_keys[MigrationCUDAKernel::kValuesPerThread];
450  int thread_values[MigrationCUDAKernel::kValuesPerThread];
451  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
452  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
453  if (idx < numAtoms) {
454  thread_keys[i] = sortIndex[offset + idx];
455  thread_values[i] = sortOrder[offset + idx];
456  } else {
457  thread_keys[i] = MAX_VALUE;
458  thread_values[i] = -1;
459  }
460  }
461  __syncthreads();
462 
463  BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
464  __syncthreads();
465 
466  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
467  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
468  if (thread_keys[i] != MAX_VALUE) {
469  unsortOrder[offset + thread_values[i]] = idx;
470  sortOrder[offset + idx] = thread_values[i];
471  sortIndex[offset + idx] = thread_keys[i];
472  }
473  }
474  __syncthreads();
475  }
476 }
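// Editor's note: illustrative invariants (not part of NAMD) that hold within a
// patch after sortAtomsKernel(), writing i for an original index and j for a
// sorted index (both relative to the patch's bufferOffset):
//   sortOrder[j]   == i   // sorted position  -> original position
//   unsortOrder[i] == j   // original position -> sorted position
// so unsortOrder[sortOrder[j]] == j and sortOrder[unsortOrder[i]] == i.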
477 
478 
479 __global__ void printSortOrder(
480  const int numPatches,
481  const CudaLocalRecord* localRecords,
482  int* sortOrder,
483  const double* pos_x,
484  const double* pos_y,
485  const double* pos_z
486 ) {
487  __shared__ CudaLocalRecord s_record;
488  using AccessType = int32_t;
489  AccessType* s_record_buffer = (AccessType*) &s_record;
490 
491  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
492  // Read in the CudaLocalRecord using multiple threads. This should
493  #pragma unroll 1
494  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
495  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
496  }
497  __syncthreads();
498 
499  const int numAtoms = s_record.numAtoms;
500  const int offset = s_record.bufferOffset;
501 
502  if (patchIndex == 0) {
503  if (threadIdx.x == 0) {
504  for (int i = 0; i < numAtoms; i ++) {
505  printf("%d %d %f %f %f\n", i, sortOrder[offset + i], pos_x[offset + i], pos_y[offset+i], pos_z[offset + i]);
506  }
507  }
508  }
509  __syncthreads();
510  }
511 }
512 
513 void MigrationCUDAKernel::sortAtoms(
514  const int numPatches,
515  const int numAtoms,
516  const CudaLocalRecord* records,
517  const double3* patchMin,
518  const double3* patchMax,
519  const double* pos_x,
520  const double* pos_y,
521  const double* pos_z,
522  int* sortOrder,
523  int* unsortOrder,
524  int* sortIndex,
525  cudaStream_t stream
526 ) {
527  constexpr int numThreads = kSortNumThreads;
528  const int numBlocks = numPatches;
529 
530  // Knob for the spatial hashing
531  const int numGrid = 4;
532 
533  computeBisect<<<numBlocks, numThreads, 0, stream>>>(
534 // computeNonbondedIndex<<<numBlocks, numThreads, 0, stream>>>(
535  numPatches,
536  numGrid,
537  records,
538  patchMin,
539  patchMax,
540  pos_x,
541  pos_y,
542  pos_z,
543  sortOrder,
544  sortIndex
545  );
546 
547  sortAtomsKernel<<<numBlocks, numThreads, 0, stream>>>(
548  numPatches,
549  records,
550  sortOrder,
551  unsortOrder,
552  sortIndex
553  );
554 
555 #if 0
556  printSortOrder<<<numBlocks, numThreads, 0, stream>>>(
557  numPatches,
558  records,
559  sortOrder,
560  pos_x,
561  pos_y,
562  pos_z
563  );
564 #endif
565 }
566 
567 
568 /**
569  * \brief Computes the derived quantities for SoA data
570  *
571  * \par
572  * Some data isn't stored in the AoS data. This kernel computes quantities that are
573  * used elsewhere in the calculation, such as the reciprocal mass.
574  *
575  */
576 __global__ void calculateDerivedKernel(
577  const int numPatches,
578  const CudaLocalRecord* localRecords,
579  float* mass,
580  double* recipMass
581 ) {
582  __shared__ CudaLocalRecord s_record;
583  using AccessType = int32_t;
584  AccessType* s_record_buffer = (AccessType*) &s_record;
585 
586  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
587  // Read in the CudaLocalRecord using multiple threads. This should
588  #pragma unroll 1
589  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
590  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
591  }
592  __syncthreads();
593 
594  const int numAtoms = s_record.numAtoms;
595  const int offset = s_record.bufferOffset;
596 
597  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
598  const float m = mass[offset + i];
599  recipMass[offset + i] = (m > 0) ? 1.0f / m : 0.0;
600  }
601  __syncthreads();
602  }
603 }
604 
605 /**
606  * \brief Computes the Langevin derived quantities for SoA data
607  *
608  * \par
609  * Some data isn't stored in the AoS data. This kernel computes quantities that are
610  * used elsewhere in the Langevin calculation, such as langScalRandBBK2.
611  *
612  */
613 template<bool kDoAlch>
614 __global__ void calculateDerivedLangevinKernel(
615  const int numPatches,
616  const CudaLocalRecord* localRecords,
617  const double dt,
618  const double kbT,
619  const double tempFactor,
620  double* recipMass,
621  int* partition,
622  float* langevinParam,
623  float* langScalVelBBK2,
624  float* langScalRandBBK2
625 ) {
626  __shared__ CudaLocalRecord s_record;
627  using AccessType = int32_t;
628  AccessType* s_record_buffer = (AccessType*) &s_record;
629 
630  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
631  // Read in the CudaLocalRecord using multiple threads. This should
632  #pragma unroll 1
633  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
634  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
635  }
636  __syncthreads();
637 
638  const int numAtoms = s_record.numAtoms;
639  const int offset = s_record.bufferOffset;
640 
641  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
642  const double dt_gamma = dt * langevinParam[offset + i];
643  const float invm = recipMass[offset + i];
644  const double factor = (kDoAlch && partition[offset + i]) ? tempFactor : 1.0;
645  langScalRandBBK2[offset + i] = (float) sqrt( 2.0 * dt_gamma * kbT * factor * invm);
646  langScalVelBBK2[offset + i] = (float) (1.0 / (1.0 + 0.5 * dt_gamma));
647  }
648  __syncthreads();
649  }
650 }
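// Editor's note: a host-side restatement (not part of NAMD) of the per-atom
// quantities computed in calculateDerivedLangevinKernel(), kept here as a
// reference for the BBK2 scaling factors. The helper name is hypothetical.
static inline void exampleLangevinScaling(
    double dt, double kbT, double tempFactor,
    bool scaledPartition,  // kDoAlch && partition[i] != 0
    double gamma,          // langevinParam[i]
    double invMass,        // recipMass[i]
    float& langScalRandBBK2, float& langScalVelBBK2
) {
  const double dt_gamma = dt * gamma;
  const double factor = scaledPartition ? tempFactor : 1.0;
  langScalRandBBK2 = (float) sqrt(2.0 * dt_gamma * kbT * factor * invMass);
  langScalVelBBK2  = (float) (1.0 / (1.0 + 0.5 * dt_gamma));
}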
651 
652 
653 
654 /**
655  * \brief Copies AoS data into SoA buffers
656  *
657  * \par
658  * During migration, atomic data is stored as AoS. After migration, we must copy
659  * the data to SoA buffers for the rest of the calculation. Since the AoS atomic data
660  * is 128 bytes, we can do this with efficient memory access using shared memory as a
661  * buffer for the transpose
662  *
663  * TODO Remove ugly if statement for writing buffers
664  *
665  */
666 template<bool kDoAlch, bool kDoLangevin>
667 __global__ void copy_AoS_to_SoAKernel(
668  const int numPatches,
669  const CudaLocalRecord* localRecords,
670  const FullAtom* atomdata_AoS,
671  double* vel_x,
672  double* vel_y,
673  double* vel_z,
674  double* pos_x,
675  double* pos_y,
676  double* pos_z,
677  float* mass,
678  float* charge,
679  int* id,
680  int* vdwType,
681  int* hydrogenGroupSize,
682  int* migrationGroupSize,
683  int* atomFixed,
684  float* rigidBondLength,
685  char3* transform,
686  int* partition,
687  float* langevinParam
688 ) {
689  constexpr int kAtomsPerBuffer = MigrationCUDAKernel::kAtomsPerBuffer;
690  constexpr int kNumSoABuffers = MigrationCUDAKernel::kNumSoABuffers;
691 
692  __shared__ CudaLocalRecord s_record;
693  using AccessType = int32_t;
694  AccessType* s_record_buffer = (AccessType*) &s_record;
695 
696  // FullAtom contains "Vectors" which initialize to 99999. This isn't allowed for shared memory,
697  // so we suppress this warning since the code doesn't rely on the dynamic initialization
698 #ifdef NAMD_CUDA
699  #pragma diag_suppress static_var_with_dynamic_init
700  __shared__ FullAtom s_atombuffer[kAtomsPerBuffer]; // For a 128 byte cache line
701 #else // NAMD_HIP
702  // NVCC might allow the code above but initializations on shared memory
703  // are not allowed on clang
704  extern __shared__ FullAtom s_atombuffer[];
705 #endif
706 
707  const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
708  const int wid = threadIdx.x / WARPSIZE;
709  const int tid = threadIdx.x % WARPSIZE;
710 
711  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
712  // Read in the CudaLocalRecord using multiple threads. This should
713  #pragma unroll 1
714  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
715  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
716  }
717  __syncthreads();
718 
719  const int numAtoms = s_record.numAtoms;
720  const int numAtomsPad = (s_record.numAtoms + kAtomsPerBuffer - 1) / kAtomsPerBuffer;
721  const int offset = s_record.bufferOffset;
722 
723  for (int i = 0; i < numAtomsPad; i++) {
724  const int atomOffset = i * kAtomsPerBuffer;
725  // Load atoms. 1 warp per atom
726  for (int atom_idx = wid; atom_idx < kAtomsPerBuffer; atom_idx += warps_per_threadblock) {
727  if (atomOffset + atom_idx < numAtoms) {
728  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
729  int32_t *s_int = (int32_t*) &(s_atombuffer[atom_idx]);
730  int32_t *g_int = (int32_t*) &(atomdata_AoS[offset + atomOffset + atom_idx]);
731  s_int[tid] = g_int[tid];
732  }
733  }
734  }
735  __syncthreads();
736 
737  // Store atoms in SoA. 1 thread per atom
738  for (int buffer = wid; buffer < kNumSoABuffers; buffer += warps_per_threadblock) {
739  if (atomOffset + tid < numAtoms) {
740  if (buffer == 0) vel_x [offset + atomOffset + tid] = ((FullAtom)(s_atombuffer[tid])).velocity.x;
741  if (buffer == 1) vel_y [offset + atomOffset + tid] = s_atombuffer[tid].velocity.y;
742  if (buffer == 2) vel_z [offset + atomOffset + tid] = s_atombuffer[tid].velocity.z;
743  if (buffer == 3) pos_x [offset + atomOffset + tid] = s_atombuffer[tid].position.x;
744  if (buffer == 4) pos_y [offset + atomOffset + tid] = s_atombuffer[tid].position.y;
745  if (buffer == 5) pos_z [offset + atomOffset + tid] = s_atombuffer[tid].position.z;
746  if (buffer == 6) mass [offset + atomOffset + tid] = s_atombuffer[tid].mass;
747  if (buffer == 7) charge [offset + atomOffset + tid] = s_atombuffer[tid].charge;
748  if (buffer == 8) id [offset + atomOffset + tid] = s_atombuffer[tid].id;
749  if (buffer == 9) vdwType [offset + atomOffset + tid] = s_atombuffer[tid].vdwType;
750  if (buffer == 10) hydrogenGroupSize [offset + atomOffset + tid] = s_atombuffer[tid].hydrogenGroupSize;
751  if (buffer == 11) migrationGroupSize[offset + atomOffset + tid] = s_atombuffer[tid].migrationGroupSize;
752  if (buffer == 12) atomFixed [offset + atomOffset + tid] = s_atombuffer[tid].atomFixed;
753  if (buffer == 13) rigidBondLength [offset + atomOffset + tid] = s_atombuffer[tid].rigidBondLength;
754  if (buffer == 14) transform [offset + atomOffset + tid] = s_atombuffer[tid].transform;
755  if (kDoAlch &&
756  buffer == 15) partition [offset + atomOffset + tid] = s_atombuffer[tid].partition;
757  if (kDoLangevin &&
758  buffer == 16) langevinParam [offset + atomOffset + tid] = s_atombuffer[tid].langevinParam;
759  }
760  }
761  __syncthreads();
762  }
763  __syncthreads();
764  }
765 }
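// Editor's note: descriptive summary (not part of NAMD) of the transpose
// pattern above. Atoms are staged in tiles of kAtomsPerBuffer: in the load
// phase each warp copies one atom's FullAtom record as consecutive 32-bit
// words (one word per lane, coalesced reads from global memory); in the store
// phase each warp handles one SoA buffer and writes that field for all staged
// atoms (lane t writes atom t), giving coalesced stores as well.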
766 
767 
768 void MigrationCUDAKernel::copy_AoS_to_SoA(
769  const int numPatches,
770  const bool alchOn,
771  const bool langevinOn,
772  const double dt,
773  const double kbT,
774  const double tempFactor,
775  const CudaLocalRecord* records,
776  const FullAtom* atomdata_AoS,
777  double* recipMass,
778  double* vel_x,
779  double* vel_y,
780  double* vel_z,
781  double* pos_x,
782  double* pos_y,
783  double* pos_z,
784  float* mass,
785  float* charge,
786  int* id,
787  int* vdwType,
788  int* hydrogenGroupSize,
789  int* migrationGroupSize,
790  int* atomFixed,
791  float* rigidBondLength,
792  char3* transform,
793  int* partition,
794  float* langevinParam,
795  float* langScalVelBBK2,
796  float* langScalRandBBK2,
797  cudaStream_t stream
798 ) {
799  constexpr int numThreads = kSortNumThreads;
800  const int numBlocks = numPatches;
801 
802 #ifdef NAMD_CUDA
803  constexpr size_t sizeatoms = 0;
804 #else
805  constexpr size_t sizeatoms = MigrationCUDAKernel::kAtomsPerBuffer*sizeof(FullAtom);
806 #endif
807 
808  #define CALL(alchOn, langevinOn) do { \
809  if (numBlocks) { \
810  copy_AoS_to_SoAKernel<alchOn, langevinOn><<<numBlocks, numThreads, sizeatoms, stream>>>( \
811  numPatches, \
812  records, \
813  atomdata_AoS, \
814  vel_x, vel_y, vel_z, \
815  pos_x, pos_y, pos_z, \
816  mass, charge, \
817  id, vdwType, \
818  hydrogenGroupSize, \
819  migrationGroupSize, \
820  atomFixed, \
821  rigidBondLength, \
822  transform, \
823  partition, \
824  langevinParam \
825  ); \
826  } \
827  } while (0);
828 
829  if (alchOn && langevinOn) {
830  CALL(true, true);
831  } else if (!alchOn && langevinOn) {
832  CALL(false, true);
833  } else if (alchOn && !langevinOn) {
834  CALL(true, false);
835  } else {
836  CALL(false, false);
837  }
838 
839  #undef CALL
840 
841  calculateDerivedKernel<<<numBlocks, numThreads, 0, stream>>>(
842  numPatches,
843  records,
844  mass,
845  recipMass
846  );
847 
848  // This needs the recipMass
849  if (langevinOn) {
850  if (alchOn) {
851  calculateDerivedLangevinKernel<true><<<numBlocks, numThreads, 0, stream>>>(
852  numPatches,
853  records,
854  dt, kbT, tempFactor,
855  recipMass,
856  partition,
857  langevinParam,
858  langScalVelBBK2,
859  langScalRandBBK2
860  );
861  } else {
862  calculateDerivedLangevinKernel<false><<<numBlocks, numThreads, 0, stream>>>(
863  numPatches,
864  records,
865  dt, kbT, tempFactor,
866  recipMass,
867  partition,
868  langevinParam,
869  langScalVelBBK2,
870  langScalRandBBK2
871  );
872  }
873  }
874 }
875 
876 
877 /**
878  * \brief Computes the solvent index within the patch
879  *
880  * \par
881  * Within a patch, the atoms must be sorted such that solvent atoms are at the end of the patch.
882  * This sorting must be stable.
883  *
884  * We do this by using one scan to compute the indices of the solute atoms, and then another
885  * scan to compute the indices of the solvent atoms.
886  *
887  */
888 __global__
889 void computeSolventIndexKernel(
890  const int numPatches,
891  const CudaLocalRecord* localRecords,
892  const FullAtom* atomdata_AoS_in,
893  int* sortIndex
894 ) {
895  __shared__ CudaLocalRecord s_record;
896  using AccessType = int32_t;
897  AccessType* s_record_buffer = (AccessType*) &s_record;
898 
899  typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
900  __shared__ typename BlockScan::TempStorage temp_storage;
901 
902  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
903  // Read in the CudaLocalRecord using multiple threads. This should
904  #pragma unroll 1
905  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
906  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
907  }
908  __syncthreads();
909 
910  const int numAtoms = s_record.numAtoms;
911  const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
912 
913  int thread_input[MigrationCUDAKernel::kValuesPerThread];
914  int thread_soluteIndex[MigrationCUDAKernel::kValuesPerThread];
915  int thread_solventIndex[MigrationCUDAKernel::kValuesPerThread];
916  int numSolute;
917 
918  // Load the isWater value
919  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
920  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
921  if (idx < numAtoms) {
922  thread_input[i] = 1 - atomdata_AoS_in[offsetIn + idx].isWater;
923  } else {
924  thread_input[i] = 0;
925  }
926  }
927  __syncthreads();
928 
929  // Compute the prefix sum of solvent
930  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_soluteIndex, numSolute);
931  __syncthreads();
932 
933  // Flip input data
934  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
935  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
936  if (idx < numAtoms) {
937  thread_input[i] = 1 - thread_input[i];
938  } else {
939  thread_input[i] = 0;
940  }
941  }
942  __syncthreads();
943 
944  // Compute the prefix sum of water
945  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_solventIndex);
946  __syncthreads();
947 
948  // write index
949  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
950  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
951  if (idx < numAtoms) {
952  if (thread_input[i]) {
953  sortIndex[offsetIn + idx] = numSolute + thread_solventIndex[i];
954  } else {
955  sortIndex[offsetIn + idx] = thread_soluteIndex[i];
956  }
957  }
958  }
959  __syncthreads();
960  }
961 }
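// Editor's note: a minimal host-side sketch (not part of NAMD) of the stable
// solute/solvent partition computed above with two exclusive prefix sums.
// The helper and its "isWater" argument are hypothetical stand-ins for
// FullAtom::isWater.
static inline void exampleSolventIndex(const int* isWater, int numAtoms, int* sortIndex) {
  int numSolute = 0;
  // First scan: solute atoms keep their relative order at the front.
  for (int i = 0; i < numAtoms; i++) {
    if (!isWater[i]) sortIndex[i] = numSolute++;
  }
  // Second scan: solvent atoms follow, also in their original relative order.
  int solventSeen = 0;
  for (int i = 0; i < numAtoms; i++) {
    if (isWater[i]) sortIndex[i] = numSolute + solventSeen++;
  }
}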
962 
963 /**
964  * \brief Sorts atoms into solute-solvent ordering based on previous compute indices
965  *
966  * \par
967  * Using the previously computed indices, this kernel moves the atomic data in AoS format
968  * into the desired order. Note, this kernel copies the atomic data from the scratch buffer
969  * into the true atomic AoS buffer. The actual migration data movement copies into the scratch
970  * buffer and this kernel copies back.
971  *
972  * Since the AoS data is ~128 bytes per atom, each warp moves a single atom.
973  *
974  */
975 __global__ void sortSolventKernel(
976  const int numPatches,
977  const CudaLocalRecord* localRecords,
978  const FullAtom* atomdata_AoS_in,
979  FullAtom* atomdata_AoS_out,
980  const int* sortIndex
981 ) {
982  __shared__ CudaLocalRecord s_record;
983  using AccessType = int32_t;
984  AccessType* s_record_buffer = (AccessType*) &s_record;
985 
986  const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
987  const int wid = threadIdx.x / WARPSIZE;
988  const int tid = threadIdx.x % WARPSIZE;
989 
990  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
991  // Read in the CudaLocalRecord using multiple threads. This should
992  #pragma unroll 1
993  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
994  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
995  }
996  __syncthreads();
997 
998  const int numAtoms = s_record.numAtoms;
999 
1000  // This was causing issues with CUDA11.3. Needed to explicitly make the offset
1001  // 64-bit
1002 #if 0
1003  const int64_t offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1004  const int64_t offset = s_record.bufferOffset;
1005 #else
1006  const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1007  const int offset = s_record.bufferOffset;
1008 #endif
1009 
1010  for (int i = wid; i < numAtoms; i+=warps_per_threadblock) {
1011  const int dst_idx = sortIndex[offsetIn + i];
1012  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1013  int32_t *src_int = (int32_t*) &(atomdata_AoS_in[offsetIn + i]);
1014  int32_t *dst_int = (int32_t*) &(atomdata_AoS_out[offset + dst_idx]);
1015  dst_int[tid] = src_int[tid];
1016  }
1017 #if defined(NAMD_HIP)
1018  NAMD_WARP_SYNC(WARP_FULL_MASK);
1019 #else
1020  WARP_SYNC(WARP_FULL_MASK);
1021 #endif
1022  }
1023 
1024  __syncthreads();
1025  }
1026 }
1027 
1028 void MigrationCUDAKernel::sortSolventAtoms(
1029  const int numPatches,
1030  const CudaLocalRecord* records,
1031  const FullAtom* atomdata_AoS_in,
1032  FullAtom* atomdata_AoS_out,
1033  int* sortIndex,
1034  cudaStream_t stream
1035 ) {
1036  constexpr int numThreads = kSortNumThreads;
1037  const int numBlocks = numPatches;
1038 
1039  computeSolventIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1040  numPatches,
1041  records,
1042  atomdata_AoS_in,
1043  sortIndex
1044  );
1045 
1046  sortSolventKernel<<<numBlocks, numThreads, 0, stream>>>(
1047  numPatches,
1048  records,
1049  atomdata_AoS_in,
1050  atomdata_AoS_out,
1051  sortIndex
1052  );
1053 }
1054 
1055 /**
1056  * \brief Computes the migration group index
1057  *
1058  * \par
1059  * Atom migration must occur at the level of migration groups. Some atoms are
1060  * moved together as a group, e.g. the hydrogens of a water molecule are moved based on the oxygen's position
1061  *
1062  * This kernel computes the number of migration groups per patch as well as their starting
1063  * indices. This will be used later during migration. It does this with a scan operation
1064  *
1065  */
1066 __global__ void computeMigrationGroupIndexKernel(
1067  const int numPatches,
1068  CudaLocalRecord* localRecords,
1069  const int* migrationGroupSize,
1070  int* migrationGroupIndex
1071 ) {
1072  __shared__ CudaLocalRecord s_record;
1073  using AccessType = int32_t;
1074  AccessType* s_record_buffer = (AccessType*) &s_record;
1075 
1076  typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1077  __shared__ typename BlockScan::TempStorage temp_storage;
1078 
1079  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1080  // Read in the CudaLocalRecord using multiple threads. This should
1081  #pragma unroll 1
1082  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1083  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1084  }
1085  __syncthreads();
1086 
1087  const int numAtoms = s_record.numAtoms;
1088  const int offset = s_record.bufferOffset;
1089 
1090  int thread_size[MigrationCUDAKernel::kValuesPerThread];
1091  int thread_index[MigrationCUDAKernel::kValuesPerThread];
1092  int numGroups;
1093 
1094  // Load the migration group size
1095  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1096  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1097  if (idx < numAtoms) {
1098  thread_size[i] = migrationGroupSize[offset + idx] ? 1 : 0;
1099  } else {
1100  thread_size[i] = 0;
1101  }
1102  }
1103  __syncthreads();
1104 
1105  // Compute the prefix sum of migration group starts
1106  BlockScan(temp_storage).ExclusiveSum(thread_size, thread_index, numGroups);
1107  __syncthreads();
1108 
1109  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1110  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1111  if (thread_size[i] != 0) {
1112  migrationGroupIndex[offset + thread_index[i]] = idx;
1113  }
1114  }
1115  __syncthreads();
1116 
1117  if (threadIdx.x == 0) {
1118  localRecords[patchIndex].numMigrationGroups = numGroups;
1119  }
1120  __syncthreads();
1121  }
1122 }
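// Editor's note: a host-side sketch (not part of NAMD) of what the kernel
// above produces. An atom with a nonzero migrationGroupSize starts a group;
// the compacted list of those starting indices becomes migrationGroupIndex,
// and the group count is stored in numMigrationGroups. Helper is hypothetical.
static inline int exampleGroupIndex(const int* migrationGroupSize, int numAtoms, int* groupStart) {
  int numGroups = 0;
  for (int i = 0; i < numAtoms; i++) {
    if (migrationGroupSize[i] != 0) {
      groupStart[numGroups++] = i;  // starting atom index of this migration group
    }
  }
  return numGroups;
}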
1123 
1124 
1125 void MigrationCUDAKernel::computeMigrationGroupIndex(
1126  const int numPatches,
1127  CudaLocalRecord* records,
1128  const int* migrationGroupSize,
1129  int* migrationGroupIndex,
1130  cudaStream_t stream
1131 ) {
1132  constexpr int numThreads = kSortNumThreads;
1133  const int numBlocks = numPatches;
1134 
1135  computeMigrationGroupIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1136  numPatches,
1137  records,
1138  migrationGroupSize,
1139  migrationGroupIndex
1140  );
1141 }
1142 
1143 /**
1144  * \brief Computes the transformed positions of migrated atoms
1145  *
1146  * \par
1147  * When atoms move between patches, we need to do a transform on the position based
1148  * on the lattice. This logic is largely copied from HomePatch.C. This is only done
1149  * on the atoms that have actually moved between patches. This uses the numAtomsLocal
1150  * field of the CudaLocalRecord
1151  *
1152  * Note this kernel could likely be optimized: threads that are not assigned to the
1153  * parent of a migration group do no work. However, not that many atoms actually migrate.
1154  *
1155  */
1156 __global__ void transformMigratedPositionsKernel(
1157  const int numPatches,
1158  const CudaLocalRecord* localRecords,
1159  const double3* patchCenter,
1160  FullAtom* atomdata_AoS,
1161  const Lattice lattice
1162 ) {
1163  __shared__ CudaLocalRecord s_record;
1164  using AccessType = int32_t;
1165  AccessType* s_record_buffer = (AccessType*) &s_record;
1166 
1167  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1168  // Read in the CudaLocalRecord using multiple threads. This should
1169  #pragma unroll 1
1170  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1171  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1172  }
1173  __syncthreads();
1174 
1175  const int offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1176  const int numAtoms = s_record.numAtoms;
1177  const int numAtomsLocal = s_record.numAtomsLocal;
1178  const int numAtomsMigrated = numAtoms - numAtomsLocal;
1179  const double3 center = patchCenter[patchIndex];
1180 
1181  for (int i = threadIdx.x; i < numAtomsMigrated; i += blockDim.x) {
1182  const int startingIndex = numAtomsLocal + i;
1183  const int migrationSize = atomdata_AoS[offset + startingIndex].migrationGroupSize;
1184  const int hydrogenSize = atomdata_AoS[offset + startingIndex].hydrogenGroupSize;
1185 
1186  if (migrationSize == 0) continue;
1187 
1188  Transform parent_transform;
1189  if (migrationSize != hydrogenSize) {
1190  double3 c_pos = make_double3(0.0, 0.0, 0.0);
1191  int c = 0;
1192  int tmp_hydrogenGroupSize = hydrogenSize;
1193  for (int j = 0; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1194  c_pos = c_pos + (double3) atomdata_AoS[offset + startingIndex + j].position;
1195  c++;
1196  tmp_hydrogenGroupSize = atomdata_AoS[offset + startingIndex + j].hydrogenGroupSize;
1197  }
1198  c_pos.x = c_pos.x / ((double) c);
1199  c_pos.y = c_pos.y / ((double) c);
1200  c_pos.z = c_pos.z / ((double) c);
1201 
1202  parent_transform = atomdata_AoS[offset+startingIndex].transform;
1203  c_pos = lattice.nearest(c_pos, center, &parent_transform);
1204 
1205  double3 pos = atomdata_AoS[offset+startingIndex].position;
1206  Transform transform = atomdata_AoS[offset+startingIndex].transform;
1207 
1208  pos = lattice.reverse_transform(pos, transform);
1209  pos = lattice.apply_transform(pos, parent_transform);
1210 
1211  atomdata_AoS[offset+startingIndex].transform = parent_transform;
1212  atomdata_AoS[offset+startingIndex].position = pos;
1213  } else {
1214 
1215  double3 pos = atomdata_AoS[offset+startingIndex].position;
1216  Transform transform = atomdata_AoS[offset+startingIndex].transform;
1217 
1218  pos = lattice.nearest(pos, center, &transform);
1219  parent_transform = transform;
1220 
1221  atomdata_AoS[offset+startingIndex].transform = transform;
1222  atomdata_AoS[offset+startingIndex].position = pos;
1223  }
1224  for (int j = 1; j < migrationSize; j++) {
1225  double3 pos = atomdata_AoS[offset+startingIndex + j].position;
1226  Transform transform = atomdata_AoS[offset+startingIndex +j].transform;
1227 
1228  pos = lattice.reverse_transform(pos, transform);
1229  pos = lattice.apply_transform(pos, parent_transform);
1230 
1231  atomdata_AoS[offset+startingIndex+j].transform = parent_transform;
1232  atomdata_AoS[offset+startingIndex+j].position = pos;
1233  }
1234  }
1235  __syncthreads();
1236  }
1237 }
1238 
1239 void MigrationCUDAKernel::transformMigratedPositions(
1240  const int numPatches,
1241  const CudaLocalRecord* localRecords,
1242  const double3* patchCenter,
1243  FullAtom* atomdata_AoS,
1244  const Lattice lattice,
1245  cudaStream_t stream
1246 ) {
1247  constexpr int numThreads = kSortNumThreads;
1248  const int numBlocks = numPatches;
1249 
1250  transformMigratedPositionsKernel<<<numBlocks, numThreads, 0, stream>>>(
1251  numPatches,
1252  localRecords,
1253  patchCenter,
1254  atomdata_AoS,
1255  lattice
1256  );
1257 }
1258 
1259 
1260 /**
1261  * \brief Computes the new location of a migration group
1262  *
1263  * \par
1264  * Computes the migration destination for each migration group. This info includes:
1265  * - Destination patch ID
1266  * - Destination patch local index (this is the home patch's index)
1267  * - Destination device
1268  *
1269  * While the information is computed at the migration group level, it is stored for each atom
1270  * in a migration group, so some of the later data movement can happen at the per atom level
1271  *
1272  * Note the migrationDestination structure will eventually include the atom's new index within a
1273  * patch, but that is determined later
1274  *
1275  */
1276 __global__ void
1277 computeMigrationDestinationKernel(
1278  const int numPatches,
1279  CudaLocalRecord* localRecords,
1280  const Lattice lattice,
1281  const CudaMInfo* mInfo,
1282  const int* patchToDeviceMap,
1283  const int* globalToLocalMap,
1284  const double3* patchMin,
1285  const double3* patchMax,
1286  const int* hydrogenGroupSize,
1287  const int* migrationGroupSize,
1288  const int* migrationGroupIndex,
1289  const double* pos_x,
1290  const double* pos_y,
1291  const double* pos_z,
1292  int4* migrationDestination
1293 ) {
1294  __shared__ CudaLocalRecord s_record;
1295  using AccessType = int32_t;
1296  AccessType* s_record_buffer = (AccessType*) &s_record;
1297 
1298  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1299  // Read in the CudaLocalRecord using multiple threads. This should
1300  #pragma unroll 1
1301  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1302  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1303  }
1304 
1305  // Clear migration atom count
1306  if (threadIdx.x == 0) {
1307  localRecords[patchIndex].numAtomsNew = 0;
1308  }
1309  __syncthreads();
1310 
1311  const int numMigrationGroups = s_record.numMigrationGroups;
1312  const int offset = s_record.bufferOffset;
1313  const double3 min = patchMin[patchIndex];
1314  const double3 max = patchMax[patchIndex];
1315  CudaMInfo migration_info = mInfo[patchIndex];
1316 
1317  int xdev, ydev, zdev;
1318 
1319  for (int i = threadIdx.x; i < numMigrationGroups; i += blockDim.x) {
1320 
1321  const int startingIndex = migrationGroupIndex[offset + i];
1322  const int migrationSize = migrationGroupSize[offset + startingIndex];
1323  const int hydrogenSize = hydrogenGroupSize[offset + startingIndex];
1324 
1325  double3 pos;
1326  pos.x = pos_x[offset + startingIndex];
1327  pos.y = pos_y[offset + startingIndex];
1328  pos.z = pos_z[offset + startingIndex];
1329 
1330  if (migrationSize != hydrogenSize) {
1331  int c = 1;
1332  int tmp_hydrogenGroupSize = 1;
1333  for (int j = hydrogenSize; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1334  pos.x = pos.x + pos_x[offset + startingIndex + j];
1335  pos.y = pos.y + pos_y[offset + startingIndex + j];
1336  pos.z = pos.z + pos_z[offset + startingIndex + j];
1337  c++;
1338  tmp_hydrogenGroupSize = hydrogenGroupSize[offset + startingIndex + j];
1339  }
1340  pos.x = pos.x / ((double) c);
1341  pos.y = pos.y / ((double) c);
1342  pos.z = pos.z / ((double) c);
1343  }
1344 
1345  double3 s = lattice.scale(pos);
1346 
1347  if (s.x < min.x) xdev = 0;
1348  else if (max.x <= s.x) xdev = 2;
1349  else xdev = 1;
1350 
1351  if (s.y < min.y) ydev = 0;
1352  else if (max.y <= s.y) ydev = 2;
1353  else ydev = 1;
1354 
1355  if (s.z < min.z) zdev = 0;
1356  else if (max.z <= s.z) zdev = 2;
1357  else zdev = 1;
1358 
1359  int dest_patch = migration_info.destPatchID[xdev][ydev][zdev];
1360  int dest_device = patchToDeviceMap[dest_patch];
1361  int dest_localID = globalToLocalMap[dest_patch];
1362  int4 dest_info;
1363  dest_info.x = dest_patch;
1364  dest_info.y = dest_device;
1365  dest_info.z = dest_localID;
1366  dest_info.w = 0;
1367 
1368  for (int j = 0; j < migrationSize; j++) {
1369  migrationDestination[offset + startingIndex + j] = dest_info;
1370  }
1371  }
1372  __syncthreads();
1373  }
1374 }
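// Editor's note: a minimal sketch (not part of NAMD) of the per-axis
// classification used above on lattice-scaled coordinates: 0 means below the
// patch, 1 inside, 2 above, and the triple (xdev, ydev, zdev) indexes
// CudaMInfo::destPatchID[3][3][3]. The helper name is hypothetical.
static inline int exampleAxisDev(double s, double patch_min, double patch_max) {
  if (s < patch_min) return 0;
  if (patch_max <= s) return 2;
  return 1;
}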
1375 
1376 void MigrationCUDAKernel::computeMigrationDestination(
1377  const int numPatches,
1378  CudaLocalRecord* localRecords,
1379  const Lattice lattice,
1380  const CudaMInfo* mInfo,
1381  const int* patchToDeviceMap,
1382  const int* globalToLocalMap,
1383  const double3* patchMin,
1384  const double3* patchMax,
1385  const int* hydrogenGroupSize,
1386  const int* migrationGroupSize,
1387  const int* migrationGroupIndex,
1388  const double* pos_x,
1389  const double* pos_y,
1390  const double* pos_z,
1391  int4* migrationDestination,
1392  cudaStream_t stream
1393 ) {
1394  constexpr int numThreads = kSortNumThreads;
1395  const int numBlocks = numPatches;
1396 
1397  computeMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1398  numPatches,
1399  localRecords,
1400  lattice,
1401  mInfo,
1402  patchToDeviceMap,
1403  globalToLocalMap,
1404  patchMin,
1405  patchMax,
1406  hydrogenGroupSize,
1407  migrationGroupSize,
1408  migrationGroupIndex,
1409  pos_x, pos_y, pos_z,
1410  migrationDestination
1411  );
1412 }
1413 
1414 
1415 /**
1416  * \brief Updates AoS data structure before migration
1417  *
1418  * \par
1419  * Copies the necessary data from the SoA buffers to the AoS buffers
1420  * This is just the positions and velocities
1421  *
1422  * TODO optimize this kernel
1423  *
1424  */
1425 __global__ void update_AoSKernel(
1426  const int numPatches,
1427  const CudaLocalRecord* localRecords,
1428  FullAtom* atomdata_AoS,
1429  const double* vel_x,
1430  const double* vel_y,
1431  const double* vel_z,
1432  const double* pos_x,
1433  const double* pos_y,
1434  const double* pos_z
1435 ) {
1436  __shared__ CudaLocalRecord s_record;
1437  using AccessType = int32_t;
1438  AccessType* s_record_buffer = (AccessType*) &s_record;
1439 
1440  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1441  // Read in the CudaLocalRecord using multiple threads. This should
1442  #pragma unroll 1
1443  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1444  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1445  }
1446  __syncthreads();
1447 
1448  const int numAtoms = s_record.numAtoms;
1449  const int offset = s_record.bufferOffset;
1450 
1451  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1452  double3 pos;
1453  pos.x = pos_x[offset + i];
1454  pos.y = pos_y[offset + i];
1455  pos.z = pos_z[offset + i];
1456 
1457  double3 vel;
1458  vel.x = vel_x[offset + i];
1459  vel.y = vel_y[offset + i];
1460  vel.z = vel_z[offset + i];
1461 
1462  atomdata_AoS[offset + i].position = pos;
1463  atomdata_AoS[offset + i].velocity = vel;
1464  }
1465  __syncthreads();
1466  }
1467 }
1468 
1469 void MigrationCUDAKernel::update_AoS(
1470  const int numPatches,
1471  const CudaLocalRecord* records,
1472  FullAtom* atomdata_AoS,
1473  const double* vel_x,
1474  const double* vel_y,
1475  const double* vel_z,
1476  const double* pos_x,
1477  const double* pos_y,
1478  const double* pos_z,
1479  cudaStream_t stream
1480 ) {
1481  constexpr int numThreads = kSortNumThreads;
1482  const int numBlocks = numPatches;
1483 
1484  update_AoSKernel<<<numBlocks, numThreads, 0, stream>>>(
1485  numPatches,
1486  records,
1487  atomdata_AoS,
1488  vel_x, vel_y, vel_z,
1489  pos_x, pos_y, pos_z
1490  );
1491 }
1492 
1493 
1494 /**
1495  * \brief Copies atoms that aren't "moving" into the scratch buffers
1496  *
1497  * \par
1498  * This kernel copies the atoms that are not moving to a new patch into that
1499  * patch's scratch buffer. It uses a scan operation to eliminate the need for
1500  * atomic operations when computing an atom's new location.
1501  *
1502  * This scan will not change the order of the atoms within a migration group
1503  *
1504  * Note this will set the localID of the atom in the migrationDestination
1505  * Note because AoS atomic data is ~128 bytes, each warp moves 1 atom
1506  *
1507  */
1508 __global__ void performLocalMigrationKernel(
1509  const int numPatches,
1510  CudaLocalRecord* localRecords,
1511  const FullAtom* atomdata_AoS_in,
1512  FullAtom* atomdata_AoS_out,
1513  int4* migrationDestination
1514 ) {
1515  __shared__ CudaLocalRecord s_record;
1516  using AccessType = int32_t;
1517  AccessType* s_record_buffer = (AccessType*) &s_record;
1518 
1519  typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1520  __shared__ typename BlockScan::TempStorage temp_storage;
1521 
1522  __shared__ int s_local_index[MigrationCUDAKernel::kSortNumThreads * MigrationCUDAKernel::kValuesPerThread];
1523 
1524  const int warps_per_threadblock = blockDim.x / WARPSIZE;
1525  const int wid = threadIdx.x / WARPSIZE;
1526  const int tid = threadIdx.x % WARPSIZE;
1527 
1528  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1529  // Read in the CudaLocalRecord using multiple threads. This should
1530  #pragma unroll 1
1531  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1532  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1533  }
1534  __syncthreads();
1535 
1536  const int patchID = s_record.patchID;
1537  const int numAtoms = s_record.numAtoms;
1538  const int offset = s_record.bufferOffset;
1539  const int dst_offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1540 
1541  __syncthreads();
1542  // Load in if the atom is staying in the same patch
1543  int thread_input[MigrationCUDAKernel::kValuesPerThread];
1544  int thread_output[MigrationCUDAKernel::kValuesPerThread];
1545  int numLocal;
1546  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1547  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1548  if (idx < numAtoms && migrationDestination[offset + idx].x == patchID) {
1549  thread_input[i] = 1;
1550  } else {
1551  thread_input[i] = 0;
1552  }
1553  }
1554  __syncthreads();
1555 
1556  // Compute the prefix sum of local
1557  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_output, numLocal);
1558  __syncthreads();
1559 
1560  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1561  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1562  if (idx < numAtoms && thread_input[i]) {
1563  s_local_index[thread_output[i]] = idx;
1564  }
1565  }
1566  if (threadIdx.x == 0) {
1567  localRecords[patchIndex].numAtomsLocal = numLocal;
1568  localRecords[patchIndex].numAtomsNew = numLocal;
1569  }
1570  __syncthreads();
1571 
1572  // We can do this at the atom level instead of the migrationGroup level
1573  // because the migrationDestinations take that into account and
1574  // the prefix sum means we do not change the ordering of local atoms
1575  for (int i = wid; i < numLocal; i += warps_per_threadblock) {
1576  const int src_atom_idx = s_local_index[i];
1577  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1578  int32_t *src_int = (int32_t*) &(atomdata_AoS_in [offset + src_atom_idx]);
1579  int32_t *dst_int = (int32_t*) &(atomdata_AoS_out [dst_offset + i]);
1580  dst_int[tid] = src_int[tid];
1581  }
1582  if (tid == 0) {
1583  migrationDestination[offset + src_atom_idx].w = i;
1584  }
1585 #if defined(NAMD_HIP)
1586  NAMD_WARP_SYNC(WARP_FULL_MASK);
1587 #else
1588  WARP_SYNC(WARP_FULL_MASK);
1589 #endif
1590  }
1591  __syncthreads();
1592  }
1593 }
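// Editor's note: descriptive summary (not part of NAMD) of the state after
// performLocalMigrationKernel(): numAtomsLocal and numAtomsNew hold the count
// of atoms that stay in the patch, those atoms are compacted (in their
// original relative order) at the front of the patch's scratch buffer, and
// migrationDestination[..].w records each staying atom's new slot index.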
1594 
1595 void MigrationCUDAKernel::performLocalMigration(
1596  const int numPatches,
1597  CudaLocalRecord* records,
1598  const FullAtom* atomdata_AoS_in,
1599  FullAtom* atomdata_AoS_out,
1600  int4* migrationDestination,
1601  cudaStream_t stream
1602 ) {
1603  constexpr int numThreads = kSortNumThreads;
1604  const int numBlocks = numPatches;
1605 
1606  performLocalMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1607  numPatches,
1608  records,
1609  atomdata_AoS_in,
1610  atomdata_AoS_out,
1611  migrationDestination
1612  );
1613 }
1614 
1615 
1616 /**
1617  * \brief Moves the migrating atoms into their new patches
1618  *
1619  * \par
1620  * Copies the atoms which have moved into new patches
1621  * into those patches' scratch buffers. This uses atomic operations to
1622  * determine the new index of each migration group. This happens at the migration
1623  * group level so atoms within a migration group stay in the same order.
1624  *
1625  * Note because not many atoms move, the use of atomics isn't too expensive
1626  * Note because AoS atomic data is ~128 bytes, each warp moves 1 atom
1627  *
1628  */
1629 __global__ void performMigrationKernel(
1630  const int numPatches,
1631  CudaLocalRecord* localRecords,
1632  CudaLocalRecord** peer_records,
1633  const FullAtom* local_atomdata_AoS,
1634  FullAtom** peer_atomdata_AoS,
1635  const int* migrationGroupSize,
1636  const int* migrationGroupIndex,
1637  int4* migrationDestination
1638 ) {
1639  __shared__ CudaLocalRecord s_record;
1640  using AccessType = int32_t;
1641  AccessType* s_record_buffer = (AccessType*) &s_record;
1642 
1643  const int warps_per_threadblock = blockDim.x / WARPSIZE;
1644  const int wid = threadIdx.x / WARPSIZE;
1645  const int tid = threadIdx.x % WARPSIZE;
1646 
1647  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1648  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy
1649  #pragma unroll 1
1650  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1651  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1652  }
1653  __syncthreads();
1654 
1655  const int numMigrationGroups = s_record.numMigrationGroups;
1656  const int offset = s_record.bufferOffset;
1657  const int patchID = s_record.patchID;
1658 
1659  __syncthreads();
1660 
1661  for (int i = wid; i < numMigrationGroups; i += warps_per_threadblock) {
1662  const int startingIndex = migrationGroupIndex[offset + i];
1663 
1664  const int size = migrationGroupSize[offset + startingIndex];
1665 
1666  const int4 migrationInfo = migrationDestination[offset + startingIndex];
1667  const int dst_patchID = migrationInfo.x;
1668  const int dst_device = migrationInfo.y;
1669  const int dst_localID = migrationInfo.z;
1670  const int dst_offset = dst_localID * MigrationCUDAKernel::kMaxAtomsPerPatch;
1671 
1672  if (dst_patchID == patchID) continue;
1673 
1674  // Get index via atomic operation
1675  int dst_atom_idx;
1676  if (tid == 0) {
1677  int* counter_index = &(peer_records[dst_device][dst_localID].numAtomsNew);
1678 #if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
1679  dst_atom_idx = atomicAdd_system(counter_index, size);
1680 #else
1681  // support single-GPU Maxwell
1682  dst_atom_idx = atomicAdd(counter_index, size);
1683 #endif
1684  }
1685  dst_atom_idx = WARP_SHUFFLE(WARP_FULL_MASK, dst_atom_idx, 0, WARPSIZE);
1686 
1687 
1688  FullAtom* remote_atomdata_AoS = peer_atomdata_AoS[dst_device];
1689  for (int j = 0; j < size; j++) {
1690  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1691  int32_t *src_int = (int32_t*) &(local_atomdata_AoS [offset + startingIndex + j]);
1692  int32_t *dst_int = (int32_t*) &(remote_atomdata_AoS [dst_offset + dst_atom_idx + j]);
1693  dst_int[tid] = src_int[tid];
1694  }
1695  if (tid == 0) {
1696  migrationDestination[offset + startingIndex + j].w = dst_atom_idx + j;
1697  }
1698 #if defined(NAMD_HIP)
1699  NAMD_WARP_SYNC(WARP_FULL_MASK);
1700 #else
1701  WARP_SYNC(WARP_FULL_MASK);
1702 #endif
1703  }
1704  }
1705 
1706  __syncthreads();
1707  }
1708 }
1709 
1710 void MigrationCUDAKernel::performMigration(
1711  const int numPatches,
1712  CudaLocalRecord* records,
1713  CudaLocalRecord** peer_records,
1714  const FullAtom* local_atomdata_AoS,
1715  FullAtom** peer_atomdata_AoS,
1716  const int* migrationGroupSize,
1717  const int* migrationGroupIndex,
1718  int4* migrationDestination,
1719  cudaStream_t stream
1720 ) {
1721  constexpr int numThreads = kSortNumThreads;
1722  const int numBlocks = numPatches;
1723 
1724  performMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1725  numPatches,
1726  records,
1727  peer_records,
1728  local_atomdata_AoS,
1729  peer_atomdata_AoS,
1730  migrationGroupSize,
1731  migrationGroupIndex,
1732  migrationDestination
1733  );
1734 }
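
// The kernel above reserves destination slots with a single atomic performed by
// lane 0 of the warp, broadcasts the reserved index to the other lanes, and then
// copies the ~128-byte AoS record 4 bytes per lane. A minimal sketch of that
// reserve-and-copy pattern for one record per warp (illustrative only, not part
// of NAMD; assumes the record is a multiple of 4 bytes and at most 4*WARPSIZE
// bytes, and uses plain atomicAdd for a device-local counter, whereas the
// kernel above uses atomicAdd_system because its counter may live on a peer
// device):
__device__ __forceinline__ void warpAppendRecord(
  int* counter,           // running count of destination slots already used
  const int32_t* src,     // source record, viewed as 4-byte words
  int32_t* dstBase,       // destination record array, viewed as 4-byte words
  const int recordWords   // words per record (record bytes / 4)
) {
  const int lane = threadIdx.x % WARPSIZE;
  int slot = 0;
  if (lane == 0) slot = atomicAdd(counter, 1);             // one atomic per warp
  slot = WARP_SHUFFLE(WARP_FULL_MASK, slot, 0, WARPSIZE);  // broadcast to all lanes
  if (lane < recordWords) {                                // extra lanes stay idle
    dstBase[slot * recordWords + lane] = src[lane];
  }
#if defined(NAMD_HIP)
  NAMD_WARP_SYNC(WARP_FULL_MASK);
#else
  WARP_SYNC(WARP_FULL_MASK);
#endif
}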
1735 
1736 
1737 /**
1738  * \brief Updates migrationDestination array based on solute-solvent order
1739  *
1740  * \par
1741  * The migrationDestination structure is used later by the tuple migration, so
1742  * we need to keep it accurate after the solute-solvent sorting. This uses peer
1743  * access to update each atom's new index within its new patch.
1744  *
1745  */
1746 __global__ void updateMigrationDestinationKernel(
1747  const int numAtomsHome,
1748  int4* migrationDestination,
1749  int** d_peer_sortSoluteIndex
1750 ) {
1751  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1752  for (int i = tid; i < numAtomsHome; i += blockDim.x * gridDim.x) {
1753  const int4 dest = migrationDestination[i];
1754  const int dst_device = dest.y;
1755  const int dst_patchID = dest.z;
1756  const int dst_patchOffset = MigrationCUDAKernel::kMaxAtomsPerPatch * dst_patchID;
1757  const int inputIndex = dest.w;
1758 
1759  const int outputIndex = d_peer_sortSoluteIndex[dst_device][dst_patchOffset + inputIndex];
1760  migrationDestination[i].w = outputIndex;
1761  }
1762 }
1763 
1764 void MigrationCUDAKernel::updateMigrationDestination(
1765  const int numAtomsHome,
1766  int4* migrationDestination,
1767  int** d_peer_sortSoluteIndex,
1768  cudaStream_t stream
1769 ) {
1770  constexpr int numThreads = kSortNumThreads;
1771  const int numBlocks = (numAtomsHome + kSortNumThreads - 1) / kSortNumThreads;
1772 
1773  updateMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1774  numAtomsHome,
1775  migrationDestination,
1776  d_peer_sortSoluteIndex
1777  );
1778 }
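
// For reference, each int4 entry of migrationDestination packs four fields, as
// used by the kernels in this section: .x is the destination patch's global ID,
// .y the device that owns it, .z its local patch index on that device, and .w
// the atom's slot within the destination patch. A hypothetical helper (not part
// of NAMD) that simply names those fields:
struct MigrationDestExample {
  int patchID;          // .x  global ID of the destination patch
  int deviceIndex;      // .y  device that owns the destination patch
  int localPatchIndex;  // .z  destination patch's index local to that device
  int indexInPatch;     // .w  atom's slot within the destination patch
};

__device__ __forceinline__ MigrationDestExample unpackMigrationDest(const int4 d) {
  MigrationDestExample r;
  r.patchID = d.x;
  r.deviceIndex = d.y;
  r.localPatchIndex = d.z;
  r.indexInPatch = d.w;
  return r;
}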
1779 
1780 
1781 /**
1782  * \brief Copies static per-atom data to proxy patches
1783  *
1784  * \par
1785  * Copies per-atom data that does not change (e.g. IDs, vdW types, charges) to the proxy patches.
1786  *
1787  */
1788 template <bool doAlch>
1789 __global__ void copyDataToProxiesKernel(
1790  const int deviceIndex,
1791  const int numPatchesHome,
1792  const int numPatchesHomeAndProxy,
1793  const CudaLocalRecord* localRecords,
1794  int** peer_id,
1795  int** peer_vdwType,
1796  int** peer_sortOrder,
1797  int** peer_unsortOrder,
1798  float** peer_charge,
1799  int** peer_partition,
1800  double3** peer_patchCenter
1801 ) {
1802  __shared__ CudaLocalRecord s_record;
1803  using AccessType = int32_t;
1804  AccessType* s_record_buffer = (AccessType*) &s_record;
1805 
1806  for (int patchIndex = blockIdx.x + numPatchesHome; patchIndex < numPatchesHomeAndProxy; patchIndex += gridDim.x) {
1807  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy
1808  #pragma unroll 1
1809  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1810  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1811  }
1812  __syncthreads();
1813 
1814  const int numAtoms = s_record.numAtoms;
1815  const int dstOffset = s_record.bufferOffset;
1816  const int srcDevice = s_record.inline_peers[0].deviceIndex;
1817  const int srcOffset = s_record.inline_peers[0].bufferOffset;
1818  const int srcPatchIndex = s_record.inline_peers[0].patchIndex;
1819 
1820  // TODO this data is probably on the host somewhere...
1821  // And it probably doesn't change????
1822  if (threadIdx.x == 0) {
1823  peer_patchCenter[deviceIndex][patchIndex] = peer_patchCenter[srcDevice][srcPatchIndex];
1824  }
1825  __syncthreads();
1826 
1827  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1828  peer_id [deviceIndex][dstOffset + i] = peer_id[srcDevice][srcOffset + i];
1829  peer_vdwType [deviceIndex][dstOffset + i] = peer_vdwType[srcDevice][srcOffset + i];
1830  peer_sortOrder [deviceIndex][dstOffset + i] = peer_sortOrder[srcDevice][srcOffset + i];
1831  peer_unsortOrder[deviceIndex][dstOffset + i] = peer_unsortOrder[srcDevice][srcOffset + i];
1832  peer_charge [deviceIndex][dstOffset + i] = peer_charge[srcDevice][srcOffset + i];
1833  if (doAlch) {
1834  peer_partition[deviceIndex][dstOffset + i] = peer_partition[srcDevice][srcOffset + i];
1835  }
1836  }
1837  __syncthreads();
1838  }
1839 }
1840 
1841 void MigrationCUDAKernel::copyDataToProxies(
1842  const int deviceIndex,
1843  const int numPatchesHome,
1844  const int numPatchesHomeAndProxy,
1845  const CudaLocalRecord* records,
1846  int** peer_id,
1847  int** peer_vdwType,
1848  int** peer_sortOrder,
1849  int** peer_unsortOrder,
1850  float** peer_charge,
1851  int** peer_partition,
1852  double3** peer_patchCenter,
1853  bool doAlch,
1854  cudaStream_t stream
1855 ) {
1856  constexpr int numThreads = kSortNumThreads;
1857  const int numPatches = numPatchesHomeAndProxy - numPatchesHome;
1858  const int numBlocks = numPatches;
1859 
1860  if (doAlch) {
1861  copyDataToProxiesKernel<true><<<numBlocks, numThreads, 0, stream>>>(
1862  deviceIndex,
1863  numPatchesHome, numPatchesHomeAndProxy,
1864  records,
1865  peer_id, peer_vdwType, peer_sortOrder,
1866  peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1867  );
1868  }
1869  else {
1870  copyDataToProxiesKernel<false><<<numBlocks, numThreads, 0, stream>>>(
1871  deviceIndex,
1872  numPatchesHome, numPatchesHomeAndProxy,
1873  records,
1874  peer_id, peer_vdwType, peer_sortOrder,
1875  peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1876  );
1877  }
1878 }
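
// The doAlch template parameter above turns what would be a per-atom runtime
// branch into a compile-time one: the host wrapper selects the specialization
// once, so the false case carries no partition copy at all. A minimal sketch of
// this bool-template dispatch pattern (illustrative names, not part of NAMD):
template <bool kDoExtra>
__global__ void copyWithOptionalExtraKernel(const float* in, float* out, float* extra, const int n) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {
    out[i] = in[i];
    if (kDoExtra) extra[i] = in[i];  // compiled out entirely when kDoExtra == false
  }
}

void launchCopyWithOptionalExtra(
  const bool doExtra, const float* in, float* out, float* extra,
  const int n, cudaStream_t stream
) {
  const int numThreads = 128;
  const int numBlocks = (n + numThreads - 1) / numThreads;
  if (doExtra) {
    copyWithOptionalExtraKernel<true><<<numBlocks, numThreads, 0, stream>>>(in, out, extra, n);
  } else {
    copyWithOptionalExtraKernel<false><<<numBlocks, numThreads, 0, stream>>>(in, out, extra, n);
  }
}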
1879 
1880 /**
1881  * \brief Copies migrationDestination to proxy patches
1882  *
1883  * \par
1884  * This copies the migrationDestination to proxies to be used in tuple migration.
1885  * It needs to use the old bufferOffsets and atom count, so it is separate from the
1886  * other copyDataToProxies kernel. Additionally, we do this as a put operation to
1887  * avoid a node barrier. The home patch produces this data and then needs to write it
1888  * to the proxy patch; if the device which owns the proxy patch were doing a get
1889  * operation, we would need a node barrier to make sure this data was ready to be read.
1890  * By using a put operation, a device-local synchronization (i.e. a new kernel launch)
1891  * is enough to make sure the data is ready.
1892  *
1893  */
1894 __global__ void copyMigrationDestinationToProxiesKernel(
1895  const int deviceIndex,
1896  const int numPatchesHome,
1897  const int numPatchesHomeAndProxy,
1898  const CudaLocalRecord* localRecords,
1899  const CudaPeerRecord* peerRecords,
1900  int4** peer_migrationDestination
1901 ) {
1902  __shared__ CudaLocalRecord s_record;
1903  using AccessType = int32_t;
1904  AccessType* s_record_buffer = (AccessType*) &s_record;
1905 
1906  for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
1907  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy
1908  #pragma unroll 1
1909  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1910  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1911  }
1912  __syncthreads();
1913 
1914  const int numAtoms = s_record.numAtomsNew;
1915  const int dstOffset = s_record.bufferOffsetOld;
1916  const int numPeerRecords = s_record.numPeerRecord;
1917  const int peerRecordStartIndex = s_record.peerRecordStartIndex;
1918 
1919  const int inlineRemotes = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
1920 
1921  for (int remote = 0; remote < inlineRemotes; remote++) {
1922  const int srcDevice = s_record.inline_peers[remote].deviceIndex;
1923  const int srcOffset = s_record.inline_peers[remote].bufferOffset;
1924  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1925  peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1926  }
1927  }
1928 
1929  for (int remote = inlineRemotes; remote < numPeerRecords; remote++) {
1930  const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
1931  const int srcOffset = peerRecords[peerRecordStartIndex + remote].bufferOffset;
1932  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1933  peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1934  }
1935  }
1936  __syncthreads();
1937  }
1938 }
1939 
1940 void MigrationCUDAKernel::copyMigrationDestinationToProxies(
1941  const int deviceIndex,
1942  const int numPatchesHome,
1943  const int numPatchesHomeAndProxy,
1944  const CudaLocalRecord* records,
1945  const CudaPeerRecord* peerRecords,
1946  int4** peer_migrationDestination,
1947  cudaStream_t stream
1948 ) {
1949  constexpr int numThreads = kSortNumThreads;
1950  const int numBlocks = numPatchesHome;
1951 
1952  copyMigrationDestinationToProxiesKernel<<<numBlocks, numThreads, 0, stream>>>(
1953  deviceIndex,
1954  numPatchesHome, numPatchesHomeAndProxy,
1955  records,
1956  peerRecords,
1957  peer_migrationDestination
1958  );
1959 }
1960 
1961 /**
1962  * \brief Updates the CudaLocalRecord data structure
1963  *
1964  * \par
1965  * This updates the number of atoms and buffer offsets of the patches. This should only
1966  * be run on the home patches since it uses peer access to update remote structures.
1967  *
1968  */
1969 __global__ void updateLocalRecordsKernel(
1970  const int numPatchesHome,
1971  CudaLocalRecord* localRecords,
1972  CudaLocalRecord** peer_records,
1973  const CudaPeerRecord* peerRecords
1974 ) {
1975  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1976  for (int i = tid; i < numPatchesHome; i += blockDim.x * gridDim.x) {
1977  const int numAtomsOld = localRecords[i].numAtoms;
1978  const int numAtoms = localRecords[i].numAtomsNew;
1979  const int numAtomsNBPad = CudaComputeNonbondedKernel::computeAtomPad(numAtoms);
1980  localRecords[i].numAtoms = numAtoms;
1981  localRecords[i].numAtomsNBPad = numAtomsNBPad;
1982  localRecords[i].bufferOffsetOld = localRecords[i].bufferOffset;
1983  localRecords[i].numAtomsNew = numAtomsOld;
1984 
1985  const int numPeerRecord = localRecords[i].numPeerRecord;
1986  const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
1987  // TODO use inline remotes??
1988 
1989  for (int remote = 0; remote < numPeerRecord; remote++) {
1990  const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
1991  const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
1992  peer_records[srcDevice][srcOffset].numAtoms = numAtoms;
1993  peer_records[srcDevice][srcOffset].numAtomsNBPad = numAtomsNBPad;
1994  }
1995  }
1996 }
1997 
1998 void MigrationCUDAKernel::updateLocalRecords(
1999  const int numPatchesHome,
2000  CudaLocalRecord* records,
2001  CudaLocalRecord** peer_records,
2002  const CudaPeerRecord* peerRecords,
2003  cudaStream_t stream
2004 ) {
2005  const int numThreads = kSortNumThreads;
2006  const int numBlocks = (numPatchesHome + kSortNumThreads - 1) / kSortNumThreads;
2007 
2008  updateLocalRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
2009  numPatchesHome,
2010  records,
2011  peer_records,
2012  peerRecords
2013  );
2014 
2015 }
2016 
2017 
2018 struct RecordToNumAtoms {
2019  __host__ __device__ __forceinline__
2020  int operator()(const CudaLocalRecord &a) const {
2021  return a.numAtoms;
2022  }
2023 };
2024 
2025 struct RecordToNumAtomsNBPad {
2026  __host__ __device__ __forceinline__
2027  int operator()(const CudaLocalRecord &a) const {
2028  return a.numAtomsNBPad;
2029  }
2030 };
2031 
2032 /**
2033  * \brief Updates the buffer offsets based on scratch patchOffsets
2034  *
2035  */
2036 __global__ void updateLocalRecordsOffsetKernel(
2037  const int numPatchesHomeAndProxy,
2038  CudaLocalRecord* localRecords,
2039  int* patchOffsets,
2040  int* patchOffsetsNB
2041 ) {
2042  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2043  for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2044  localRecords[i].bufferOffset = patchOffsets[i];
2045  localRecords[i].bufferOffsetNBPad = patchOffsetsNB[i];
2046  }
2047 }
2048 
2049 void MigrationCUDAKernel::updateLocalRecordsOffset(
2050  const int numPatchesHomeAndProxy,
2051  CudaLocalRecord* records,
2052  cudaStream_t stream
2053 ) {
2054  const int numThreads = kSortNumThreads;
2055  const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2056  size_t temp = patchDeviceScan_alloc;
2057 
2058  //
2059  // Integration Offsets
2060  //
2061  RecordToNumAtoms conversion_op;
2062 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
2063  using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2064 #else
2065  using InputIter = thrust::transform_iterator<RecordToNumAtoms, CudaLocalRecord*, int>;
2066 #endif
2067  InputIter iter(records, conversion_op);
2068 
2069  cub::DeviceScan::ExclusiveSum<InputIter>(
2070  d_patchDeviceScan_scratch, temp,
2071  iter, d_patchOffset_temp, numPatchesHomeAndProxy, stream
2072  );
2073 
2074  //
2075  // Nonbonded Offsets
2076  //
2077  RecordToNumAtomsNBPad conversion_op_nb;
2078 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
2079  using InputIterNB = cub::TransformInputIterator<int, RecordToNumAtomsNBPad, CudaLocalRecord*>;
2080 #else
2081  using InputIterNB = thrust::transform_iterator<RecordToNumAtomsNBPad, CudaLocalRecord*, int>;
2082 #endif
2083  InputIterNB iterNB(records, conversion_op_nb);
2084 
2085  cub::DeviceScan::ExclusiveSum<InputIterNB>(
2086  d_patchDeviceScan_scratch, temp,
2087  iterNB, d_patchOffsetNB_temp, numPatchesHomeAndProxy, stream
2088  );
2089 
2090  //
2091  // Update AoS data
2092  //
2093  updateLocalRecordsOffsetKernel<<<numBlocks, numThreads, 0, stream>>>(
2094  numPatchesHomeAndProxy,
2095  records,
2096  d_patchOffset_temp,
2097  d_patchOffsetNB_temp
2098  );
2099 }
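
// cub::DeviceScan calls always come in two phases: a first call with a null
// temp-storage pointer only reports the required scratch size, and a second
// call with allocated scratch performs the scan. allocateScratch() at the end
// of this file does the query phase for the transform-iterator scans above; the
// sketch below shows the same pattern on a plain int array (illustrative only,
// not part of NAMD), using the allocate_device/deallocate_device helpers this
// file already uses:
void exclusiveSumExample(const int* d_in, int* d_out, const int n, cudaStream_t stream) {
  char* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;

  // Phase 1: query the required temporary storage size
  cub::DeviceScan::ExclusiveSum((void*) NULL, temp_storage_bytes, d_in, d_out, n, stream);

  allocate_device<char>(&d_temp_storage, temp_storage_bytes);

  // Phase 2: run the scan with the allocated scratch space
  cub::DeviceScan::ExclusiveSum((void*) d_temp_storage, temp_storage_bytes, d_in, d_out, n, stream);

  deallocate_device<char>(&d_temp_storage);
}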
2100 
2101 /**
2102  * \brief Updates the remote records on this device
2103  *
2104  */
2105 __global__ void updatePeerRecordsKernel(
2106  const int numPatchesHomeAndProxy,
2107  CudaLocalRecord* localRecords,
2108  CudaLocalRecord** peer_records,
2109  CudaPeerRecord* peerRecords
2110 ) {
2111  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2112  for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2113  const int numPeerRecord = localRecords[i].numPeerRecord;
2114  const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
2115 
2116  for (int remote = 0; remote < numPeerRecord; remote++) {
2117  const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
2118  const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
2119 
2120  const int bufferOffset = peer_records[srcDevice][srcOffset].bufferOffset;
2121  const int bufferOffsetNBPad = peer_records[srcDevice][srcOffset].bufferOffsetNBPad;
2122 
2123  peerRecords[peerRecordStartIndex + remote].bufferOffset = bufferOffset;
2124  peerRecords[peerRecordStartIndex + remote].bufferOffsetNBPad = bufferOffsetNBPad;
2125 
2126  if (remote < CudaLocalRecord::num_inline_peer) {
2127  localRecords[i].inline_peers[remote].bufferOffset = bufferOffset;
2128  localRecords[i].inline_peers[remote].bufferOffsetNBPad = bufferOffsetNBPad;
2129  }
2130  }
2131  }
2132 }
2133 
2134 void MigrationCUDAKernel::updatePeerRecords(
2135  const int numPatchesHomeAndProxy,
2136  CudaLocalRecord* records,
2137  CudaLocalRecord** peer_records,
2138  CudaPeerRecord* peerRecords,
2139  cudaStream_t stream
2140 ) {
2141  const int numThreads = kSortNumThreads;
2142  const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2143 
2144  updatePeerRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
2145  numPatchesHomeAndProxy,
2146  records,
2147  peer_records,
2148  peerRecords
2149  );
2150 }
2151 
2152 MigrationCUDAKernel::MigrationCUDAKernel() {
2153  d_patchOffset_temp = NULL;
2154  d_patchOffsetNB_temp = NULL;
2155 
2156  patchDeviceScan_alloc = 0;
2157  d_patchDeviceScan_scratch = NULL;
2158 }
2159 
2160 MigrationCUDAKernel::~MigrationCUDAKernel() {
2161  if (d_patchOffset_temp != NULL) deallocate_device<int>(&d_patchOffset_temp);
2162  if (d_patchOffsetNB_temp != NULL) deallocate_device<int>(&d_patchOffsetNB_temp);
2163  if (d_patchDeviceScan_scratch != NULL) deallocate_device<char>(&d_patchDeviceScan_scratch);
2164 }
2165 
2166 void MigrationCUDAKernel::allocateScratch(const int numPatchesHomeAndProxy) {
2167  allocate_device<int>(&d_patchOffset_temp, numPatchesHomeAndProxy);
2168  allocate_device<int>(&d_patchOffsetNB_temp, numPatchesHomeAndProxy);
2169 
2170  // Fake call to CUB to get required size
2171  RecordToNumAtoms conversion_op;
2172 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
2173  using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2174 #else
2175  using InputIter = thrust::transform_iterator<RecordToNumAtoms, CudaLocalRecord*, int>;
2176 #endif
2177  InputIter iter(NULL, conversion_op);
2178 
2179  void *d_temp_storage = NULL;
2180  size_t temp_storage_bytes = 0;
2181 
2182  cub::DeviceScan::ExclusiveSum<InputIter>(
2183  d_temp_storage, temp_storage_bytes,
2184  iter, d_patchOffset_temp, numPatchesHomeAndProxy
2185  );
2186 
2187  patchDeviceScan_alloc = temp_storage_bytes;
2188  allocate_device<char>(&d_patchDeviceScan_scratch, patchDeviceScan_alloc);
2189 }
2190 #endif // NODEGROUP_FORCE_REGISTER
2191