2 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <namd_cub/cub.cuh>
8 #include <thrust/iterator/transform_iterator.h>
9 #include <thrust/device_vector.h>
12 #include <hip/hip_runtime.h>
13 #include <hipcub/hipcub.hpp>
17 #include "HipDefines.h"
19 #include "MigrationCUDAKernel.h"
20 #include "CudaComputeNonbondedKernel.h"
21 #include "CudaComputeNonbondedKernel.hip.h"
23 #ifdef NODEGROUP_FORCE_REGISTER
25 #define MAX_VALUE 2147483647
26 #define BIG_FLOAT 1e20
27 #define SMALL_FLOAT -1e20
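// singleBisect (below) performs one step of a recursive coordinate bisection used by
// computeBisect to build spatial sort keys. Roughly, each atom carries a running key in
// thread_out[] recording previous bisection decisions; atoms are block-sorted by
// (running key | normalized coordinate along current_dim), the resulting rank is scattered
// through s_indexBuffer, and atoms that land at or above the split point of their current
// [thread_start, thread_end) interval get the next key bit set (bit 30 - bit_pos) and their
// interval narrowed. The split point is rounded to a multiple of split_factor, which appears
// to keep partitions aligned to warp-sized tiles. On the final bisection (bit_pos ==
// bit_total - 1) the normalized coordinate itself is appended to the low bits of the key so
// atoms within a leaf partition remain spatially ordered.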
29 __device__ __forceinline__ void singleBisect(
34 const double* current_dim,
36 unsigned int* s_indexBuffer,
37 unsigned int (&thread_start)[MigrationCUDAKernel::kValuesPerThread],
38 unsigned int (&thread_end)[MigrationCUDAKernel::kValuesPerThread],
39 unsigned int (&thread_out)[MigrationCUDAKernel::kValuesPerThread],
40 unsigned int (&thread_keys)[MigrationCUDAKernel::kValuesPerThread],
41 unsigned int (&thread_values)[MigrationCUDAKernel::kValuesPerThread]
43 constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
44 typedef cub::BlockRadixSort<unsigned int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, unsigned int> BlockRadixSort;
45 __shared__ typename BlockRadixSort::TempStorage temp_sort;
48 for (int i = 0; i < kValuesPerThread; i++) {
49 const int idx = kValuesPerThread * threadIdx.x + i;
51 const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
52 thread_keys[i] = thread_out[i] | pos_norm;
53 thread_values[i] = idx;
56 thread_values[i] = ~0;
61 BlockRadixSort(temp_sort).Sort(thread_keys, thread_values);
64 for (int i = 0; i < kValuesPerThread; i++) {
65 const int idx = kValuesPerThread * threadIdx.x + i;
66 if (thread_values[i] < numAtoms) {
67 s_indexBuffer[thread_values[i]] = idx;
73 for (int i = 0; i < kValuesPerThread; i++) {
74 const int idx = kValuesPerThread * threadIdx.x + i;
76 const int newIndex = s_indexBuffer[idx];
78 int split_factor = 32;
81 if ( thread_start[i]/split_factor < (thread_end[i]-1)/split_factor ) {
82 split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
85 if (split_factor == 1){
86 split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
90 if (newIndex >= split) {
91 thread_out[i] |= 1 << (30 - bit_pos);
92 thread_start[i] = split;
94 thread_end[i] = split;
96 if (bit_pos == bit_total - 1) {
97 const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
98 thread_out[i] |= pos_norm;
106 __global__ void computeBisect(
107 const int numPatches,
109 const CudaLocalRecord* localRecords,
110 const double3* patchMin,
111 const double3* patchMax,
118 constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
120 __shared__ CudaLocalRecord s_record;
121 using AccessType = int32_t;
122 AccessType* s_record_buffer = (AccessType*) &s_record;
124 typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
125 __shared__ typename BlockReduce::TempStorage temp_storage;
127 __shared__ double3 s_pmin;
128 __shared__ double3 s_pmax;
131 __shared__ unsigned int s_indexBuffer[MigrationCUDAKernel::kMaxAtomsPerPatch];
133 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
135 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
136 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
140 const int numAtoms = s_record.numAtoms;
141 const int offset = s_record.bufferOffset;
143 // Compute the thread-local min/max values
148 pmax.x = SMALL_FLOAT;
149 pmax.y = SMALL_FLOAT;
150 pmax.z = SMALL_FLOAT;
151 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
152 pmin.x = fmin(pmin.x, pos_x[offset + i]);
153 pmin.y = fmin(pmin.y, pos_y[offset + i]);
154 pmin.z = fmin(pmin.z, pos_z[offset + i]);
155 pmax.x = fmax(pmax.x, pos_x[offset + i]);
156 pmax.y = fmax(pmax.y, pos_y[offset + i]);
157 pmax.z = fmax(pmax.z, pos_z[offset + i]);
161 // Compute the thread-block min/max values
162 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
163 auto minOp = cub::Min();
164 auto maxOp = cub::Max();
166 auto minOp = cuda::minimum<double>();
167 auto maxOp = cuda::maximum<double>();
169 pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, minOp);
171 pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, minOp);
173 pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, minOp);
176 pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, maxOp);
178 pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, maxOp);
180 pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, maxOp);
183 if (threadIdx.x == 0) {
193 unsigned int thread_start[kValuesPerThread];
194 unsigned int thread_end[kValuesPerThread];
195 unsigned int thread_out[kValuesPerThread];
196 unsigned int thread_keys[kValuesPerThread];
197 unsigned int thread_values[kValuesPerThread];
199 for (int i = 0; i < kValuesPerThread; i++) {
202 thread_end[i] = numAtoms;
205 double diff_x = pmax.x - pmin.x;
206 double diff_y = pmax.y - pmin.y;
207 double diff_z = pmax.z - pmin.z;
209 const int num_iters = 3;
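// Three rounds over the three dimensions, i.e. num_iters * 3 = 9 bisection bits in total,
// matching the bit_total argument passed to singleBisect. Within each round the dimensions
// are bisected from largest to smallest patch extent.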
210 for (int i = 0; i < num_iters; i++) {
211 if (diff_x > diff_y && diff_y > diff_z) {
212 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
213 thread_start, thread_end, thread_out, thread_keys, thread_values);
214 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
215 thread_start, thread_end, thread_out, thread_keys, thread_values);
216 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
217 thread_start, thread_end, thread_out, thread_keys, thread_values);
218 } else if (diff_x > diff_z && diff_z > diff_y) {
219 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
220 thread_start, thread_end, thread_out, thread_keys, thread_values);
221 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
222 thread_start, thread_end, thread_out, thread_keys, thread_values);
223 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
224 thread_start, thread_end, thread_out, thread_keys, thread_values);
225 } else if (diff_y > diff_x && diff_x > diff_z) {
226 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
227 thread_start, thread_end, thread_out, thread_keys, thread_values);
228 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
229 thread_start, thread_end, thread_out, thread_keys, thread_values);
230 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
231 thread_start, thread_end, thread_out, thread_keys, thread_values);
232 } else if (diff_y > diff_z && diff_z > diff_x) {
233 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
234 thread_start, thread_end, thread_out, thread_keys, thread_values);
235 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
236 thread_start, thread_end, thread_out, thread_keys, thread_values);
237 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
238 thread_start, thread_end, thread_out, thread_keys, thread_values);
239 } else if (diff_z > diff_x && diff_x > diff_y) {
240 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
241 thread_start, thread_end, thread_out, thread_keys, thread_values);
242 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
243 thread_start, thread_end, thread_out, thread_keys, thread_values);
244 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
245 thread_start, thread_end, thread_out, thread_keys, thread_values);
247 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
248 thread_start, thread_end, thread_out, thread_keys, thread_values);
249 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
250 thread_start, thread_end, thread_out, thread_keys, thread_values);
251 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
252 thread_start, thread_end, thread_out, thread_keys, thread_values);
256 for (int i = 0; i < kValuesPerThread; i++) {
257 const int idx = kValuesPerThread * threadIdx.x + i;
258 if (idx < numAtoms) {
259 sortIndex[offset + idx] = thread_out[i];
260 sortOrder[offset + idx] = idx;
269 __device__ __forceinline__ int simple_grid(const int numGrids, const int x, const int y, const int z) {
270 const int index = x + numGrids * (y + numGrids * z);
274 __device__ __forceinline__ int snake_grid(const int numGrids, const int x, const int y, const int z) {
275 int index = numGrids * numGrids * x;
277 index += numGrids * y;
279 index += numGrids * (numGrids - 1 - y);
281 if (y % 2 == x % 2) {
284 index += (numGrids - 1 - z);
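// In effect, snake_grid maps an (x, y, z) sub-cell to a linear index using a serpentine
// (boustrophedon) traversal: the direction along y and z alternates with the parity of the
// outer indices, so consecutive indices correspond to spatially adjacent sub-cells, unlike
// the plain nested ordering of simple_grid above.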
291 * \brief Computes the nonbonded index of atoms for optimal clustering
294 * Each thread-block is assigned to a patch, and does the following:
295 * - First, the minimum and maximum coordinates of the patch are computed.
296 * The precomputed patchMin and patchMax don't produce the same results; I'm not sure
297 * whether this is because of migration coordinates or something with the lattice...
298 * - Second, the nonbonded index of each atom is computed and stored
299 * The exact spatial hashing algorithm needs investigation
302 __global__ void computeNonbondedIndex(
303 const int numPatches,
305 const CudaLocalRecord* localRecords,
306 const double3* patchMin,
307 const double3* patchMax,
314 __shared__ CudaLocalRecord s_record;
315 using AccessType = int32_t;
316 AccessType* s_record_buffer = (AccessType*) &s_record;
318 typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
319 __shared__ typename BlockReduce::TempStorage temp_storage;
321 __shared__ double3 s_pmin;
322 __shared__ double3 s_pmax;
324 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
326 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
327 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
331 const int numAtoms = s_record.numAtoms;
332 const int offset = s_record.bufferOffset;
334 // Compute the thread-local min/max values
339 pmax.x = SMALL_FLOAT;
340 pmax.y = SMALL_FLOAT;
341 pmax.z = SMALL_FLOAT;
342 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
343 pmin.x = fmin(pmin.x, pos_x[offset + i]);
344 pmin.y = fmin(pmin.y, pos_y[offset + i]);
345 pmin.z = fmin(pmin.z, pos_z[offset + i]);
346 pmax.x = fmax(pmax.x, pos_x[offset + i]);
347 pmax.y = fmax(pmax.y, pos_y[offset + i]);
348 pmax.z = fmax(pmax.z, pos_z[offset + i]);
352 // Compute the thread-block min/max values
353 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
354 auto minOp = cub::Min();
355 auto maxOp = cub::Max();
357 auto minOp = cuda::minimum<double>();
358 auto maxOp = cuda::maximum<double>();
360 pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, minOp);
362 pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, minOp);
364 pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, minOp);
367 pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, maxOp);
369 pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, maxOp);
371 pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, maxOp);
374 if (threadIdx.x == 0) {
383 // Compute the sort index
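// The sort key packs the snake_grid sub-cell index into the high bits (shifted left by 17)
// and an ordering along z within the sub-cell, derived from the normalized z coordinate,
// into the low bits. The within-cell direction is reversed in alternating cells (see the
// reverse flag below) so the overall ordering snakes through the patch and neighboring keys
// stay spatially close.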
384 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
385 const double x = pos_x[offset + i];
386 const double y = pos_y[offset + i];
387 const double z = pos_z[offset + i];
389 // Compute subpatch index
390 int idx_x = (int)((x - pmin.x) / (pmax.x - pmin.x) * ((double) numGrids));
391 int idx_y = (int)((y - pmin.y) / (pmax.y - pmin.y) * ((double) numGrids));
392 int idx_z = (int)((z - pmin.z) / (pmax.z - pmin.z) * ((double) numGrids));
393 idx_x = min(max(idx_x, 0), numGrids-1);
394 idx_y = min(max(idx_y, 0), numGrids-1);
395 idx_z = min(max(idx_z, 0), numGrids-1);
397 // Compute sort index
398 const int block_index = snake_grid(numGrids, idx_x, idx_y, idx_z);
399 const double z_norm = (z - pmin.z) / (pmax.z - pmin.z);
401 const int reverse = ((idx_y % 2) == (idx_x % 2));
404 inner_index = (unsigned int) (z_norm * (1 << 16));
406 inner_index = ~((unsigned int) (z_norm * (1 << 16)));
409 sortIndex[offset + i] = (block_index << 17) + inner_index;
410 sortOrder[offset + i] = i;
417 * \brief Sorts atoms into the nonbonded ordering based on previously computed indices
420 * Uses CUB's block-level sort algorithm to generate the nonbonded ordering based on the
421 * previously computed spatial hashing. It generates both a forward and a backward mapping
424 __global__ void sortAtomsKernel(
425 const int numPatches,
426 const CudaLocalRecord* localRecords,
431 __shared__ CudaLocalRecord s_record;
432 using AccessType = int32_t;
433 AccessType* s_record_buffer = (AccessType*) &s_record;
435 typedef cub::BlockRadixSort<int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, int> BlockRadixSort;
436 __shared__ typename BlockRadixSort::TempStorage temp_storage;
438 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
439 // Read in the CudaLocalRecord using multiple threads. This should
441 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
442 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
446 const int numAtoms = s_record.numAtoms;
447 const int offset = s_record.bufferOffset;
449 int thread_keys[MigrationCUDAKernel::kValuesPerThread];
450 int thread_values[MigrationCUDAKernel::kValuesPerThread];
451 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
452 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
453 if (idx < numAtoms) {
454 thread_keys[i] = sortIndex[offset + idx];
455 thread_values[i] = sortOrder[offset + idx];
457 thread_keys[i] = MAX_VALUE;
458 thread_values[i] = -1;
463 BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
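// After the sort, idx is the atom's new (sorted) position and thread_values[i] its original
// position, so sortOrder maps sorted index -> original index and unsortOrder maps original
// index -> sorted index.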
466 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
467 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
468 if (thread_keys[i] != MAX_VALUE) {
469 unsortOrder[offset + thread_values[i]] = idx;
470 sortOrder[offset + idx] = thread_values[i];
471 sortIndex[offset + idx] = thread_keys[i];
479 __global__ void printSortOrder(
480 const int numPatches,
481 const CudaLocalRecord* localRecords,
487 __shared__ CudaLocalRecord s_record;
488 using AccessType = int32_t;
489 AccessType* s_record_buffer = (AccessType*) &s_record;
491 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
492 // Read in the CudaLocalRecord using multiple threads. This should
494 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
495 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
499 const int numAtoms = s_record.numAtoms;
500 const int offset = s_record.bufferOffset;
502 if (patchIndex == 0) {
503 if (threadIdx.x == 0) {
504 for (int i = 0; i < numAtoms; i ++) {
505 printf("%d %d %f %f %f\n", i, sortOrder[offset + i], pos_x[offset + i], pos_y[offset+i], pos_z[offset + i]);
513 void MigrationCUDAKernel::sortAtoms(
514 const int numPatches,
516 const CudaLocalRecord* records,
517 const double3* patchMin,
518 const double3* patchMax,
527 constexpr int numThreads = kSortNumThreads;
528 const int numBlocks = numPatches;
530 // Knob for the spatial hashing
531 const int numGrid = 4;
533 computeBisect<<<numBlocks, numThreads, 0, stream>>>(
534 // computeNonbondedIndex<<<numBlocks, numThreads, 0, stream>>>(
547 sortAtomsKernel<<<numBlocks, numThreads, 0, stream>>>(
556 printSortOrder<<<numBlocks, numThreads, 0, stream>>>(
569 * \brief Computes the derived quantities for SoA data
572 * Some data isn't stored in the AoS data. This kernel computes quantities that are
573 * used elsewhere in the calculation, such as the reciprocal mass
576 __global__ void calculateDerivedKernel(
577 const int numPatches,
578 const CudaLocalRecord* localRecords,
582 __shared__ CudaLocalRecord s_record;
583 using AccessType = int32_t;
584 AccessType* s_record_buffer = (AccessType*) &s_record;
586 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
587 // Read in the CudaLocalRecord using multiple threads. This should
589 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
590 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
594 const int numAtoms = s_record.numAtoms;
595 const int offset = s_record.bufferOffset;
597 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
598 const float m = mass[offset + i];
599 recipMass[offset + i] = (m > 0) ? 1.0f / m : 0.0f;
606 * \brief Computes the Langevin derived quantities for SoA data
609 * Some data isn't stored in the AoS data. This kernel computes quantities that are
610 * used elsewhere in the Langevin calculation, such as langScalRandBBK2
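 *
 * Specifically, with dt_gamma = dt * langevinParam:
 *   langScalRandBBK2 = sqrt(2 * dt_gamma * kbT * factor / m)
 *   langScalVelBBK2  = 1 / (1 + dt_gamma / 2)
 * where factor is tempFactor for alchemically partitioned atoms and 1 otherwise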
613 template<bool kDoAlch>
614 __global__ void calculateDerivedLangevinKernel(
615 const int numPatches,
616 const CudaLocalRecord* localRecords,
619 const double tempFactor,
622 float* langevinParam,
623 float* langScalVelBBK2,
624 float* langScalRandBBK2
626 __shared__ CudaLocalRecord s_record;
627 using AccessType = int32_t;
628 AccessType* s_record_buffer = (AccessType*) &s_record;
630 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
631 // Read in the CudaLocalRecord using multiple threads. This should
633 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
634 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
638 const int numAtoms = s_record.numAtoms;
639 const int offset = s_record.bufferOffset;
641 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
642 const double dt_gamma = dt * langevinParam[offset + i];
643 const float invm = recipMass[offset + i];
644 const double factor = (kDoAlch && partition[offset + i]) ? tempFactor : 1.0;
645 langScalRandBBK2[offset + i] = (float) sqrt( 2.0 * dt_gamma * kbT * factor * invm);
646 langScalVelBBK2[offset + i] = (float) (1.0 / (1.0 + 0.5 * dt_gamma));
655 * \brief Copies AoS data into SoA buffers
658 * During migration, atomic data is stored as AoS. After migration, we must copy
659 * the data to SoA buffers for the rest of the calculation. Since the AoS atomic data
660 * is 128 bytes, we can do this with efficient memory access using shared memory as a
661 * buffer for the transpose
663 * TODO Remove ugly if statement for writing buffers
666 template<bool kDoAlch, bool kDoLangevin>
667 __global__ void copy_AoS_to_SoAKernel(
668 const int numPatches,
669 const CudaLocalRecord* localRecords,
670 const FullAtom* atomdata_AoS,
681 int* hydrogenGroupSize,
682 int* migrationGroupSize,
684 float* rigidBondLength,
689 constexpr int kAtomsPerBuffer = MigrationCUDAKernel::kAtomsPerBuffer;
690 constexpr int kNumSoABuffers = MigrationCUDAKernel::kNumSoABuffers;
692 __shared__ CudaLocalRecord s_record;
693 using AccessType = int32_t;
694 AccessType* s_record_buffer = (AccessType*) &s_record;
696 // FullAtom contains "Vectors" which initialize to 99999. This isn't allowed for shared memory,
697 // so we suppress this warning since the code doesn't rely on the dynamic initialization
699 #pragma diag_suppress static_var_with_dynamic_init
700 __shared__ FullAtom s_atombuffer[kAtomsPerBuffer]; // For a 128 byte cache line
702 // NVCC might allow the code above, but initialization of shared memory
703 // is not allowed with clang
704 extern __shared__ FullAtom s_atombuffer[];
707 const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
708 const int wid = threadIdx.x / WARPSIZE;
709 const int tid = threadIdx.x % WARPSIZE;
711 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
712 // Read in the CudaLocalRecord using multiple threads. This should
714 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
715 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
719 const int numAtoms = s_record.numAtoms;
720 const int numAtomsPad = (s_record.numAtoms + kAtomsPerBuffer - 1) / kAtomsPerBuffer;
721 const int offset = s_record.bufferOffset;
723 for (int i = 0; i < numAtomsPad; i++) {
724 const int atomOffset = i * kAtomsPerBuffer;
725 // Load atoms. 1 warp per atom
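// Each lane copies one 4-byte word of the FullAtom struct, so a warp moves up to
// WARPSIZE * 4 bytes (128 bytes with 32-thread warps) per atom in a coalesced fashion;
// lanes whose word would fall past sizeof(FullAtom) do nothing.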
726 for (int atom_idx = wid; atom_idx < kAtomsPerBuffer; atom_idx += warps_per_threadblock) {
727 if (atomOffset + atom_idx < numAtoms) {
728 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
729 int32_t *s_int = (int32_t*) &(s_atombuffer[atom_idx]);
730 int32_t *g_int = (int32_t*) &(atomdata_AoS[offset + atomOffset + atom_idx]);
731 s_int[tid] = g_int[tid];
737 // Store atoms in SoA. 1 thread per atom
738 for (int buffer = wid; buffer < kNumSoABuffers; buffer += warps_per_threadblock) {
739 if (atomOffset + tid < numAtoms) {
740 if (buffer == 0) vel_x [offset + atomOffset + tid] = s_atombuffer[tid].velocity.x;
741 if (buffer == 1) vel_y [offset + atomOffset + tid] = s_atombuffer[tid].velocity.y;
742 if (buffer == 2) vel_z [offset + atomOffset + tid] = s_atombuffer[tid].velocity.z;
743 if (buffer == 3) pos_x [offset + atomOffset + tid] = s_atombuffer[tid].position.x;
744 if (buffer == 4) pos_y [offset + atomOffset + tid] = s_atombuffer[tid].position.y;
745 if (buffer == 5) pos_z [offset + atomOffset + tid] = s_atombuffer[tid].position.z;
746 if (buffer == 6) mass [offset + atomOffset + tid] = s_atombuffer[tid].mass;
747 if (buffer == 7) charge [offset + atomOffset + tid] = s_atombuffer[tid].charge;
748 if (buffer == 8) id [offset + atomOffset + tid] = s_atombuffer[tid].id;
749 if (buffer == 9) vdwType [offset + atomOffset + tid] = s_atombuffer[tid].vdwType;
750 if (buffer == 10) hydrogenGroupSize [offset + atomOffset + tid] = s_atombuffer[tid].hydrogenGroupSize;
751 if (buffer == 11) migrationGroupSize[offset + atomOffset + tid] = s_atombuffer[tid].migrationGroupSize;
752 if (buffer == 12) atomFixed [offset + atomOffset + tid] = s_atombuffer[tid].atomFixed;
753 if (buffer == 13) rigidBondLength [offset + atomOffset + tid] = s_atombuffer[tid].rigidBondLength;
754 if (buffer == 14) transform [offset + atomOffset + tid] = s_atombuffer[tid].transform;
756 buffer == 15) partition [offset + atomOffset + tid] = s_atombuffer[tid].partition;
758 buffer == 16) langevinParam [offset + atomOffset + tid] = s_atombuffer[tid].langevinParam;
768 void MigrationCUDAKernel::copy_AoS_to_SoA(
769 const int numPatches,
771 const bool langevinOn,
774 const double tempFactor,
775 const CudaLocalRecord* records,
776 const FullAtom* atomdata_AoS,
788 int* hydrogenGroupSize,
789 int* migrationGroupSize,
791 float* rigidBondLength,
794 float* langevinParam,
795 float* langScalVelBBK2,
796 float* langScalRandBBK2,
799 constexpr int numThreads = kSortNumThreads;
800 const int numBlocks = numPatches;
803 constexpr size_t sizeatoms = 0;
805 constexpr size_t sizeatoms = MigrationCUDAKernel::kAtomsPerBuffer*sizeof(FullAtom);
808 #define CALL(alchOn, langevinOn) do { \
810 copy_AoS_to_SoAKernel<alchOn, langevinOn><<<numBlocks, numThreads, sizeatoms, stream>>>( \
814 vel_x, vel_y, vel_z, \
815 pos_x, pos_y, pos_z, \
819 migrationGroupSize, \
829 if (alchOn && langevinOn) {
831 } else if (!alchOn && langevinOn) {
833 } else if (alchOn && !langevinOn) {
841 calculateDerivedKernel<<<numBlocks, numThreads, 0, stream>>>(
848 // This needs the recipMass
851 calculateDerivedLangevinKernel<true><<<numBlocks, numThreads, 0, stream>>>(
862 calculateDerivedLangevinKernel<false><<<numBlocks, numThreads, 0, stream>>>(
878 * \brief Computes the solvent index within the patch
881 * Within a patch, the atoms must be sorted such that solvent atoms are at the end of the patch. This sorting
884 * We do this by using a scan to compute the indices of the solute atoms, and then another scan to compute
885 * the indices of the solvent atoms.
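 *
 * For example, with isWater = [0, 1, 0, 1] the first scan over (1 - isWater) gives solute
 * indices [0, -, 1, -] and numSolute = 2, the second scan over isWater gives solvent
 * offsets [-, 0, -, 1], and the resulting sortIndex is [0, 2, 1, 3].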
889 void computeSolventIndexKernel(
890 const int numPatches,
891 const CudaLocalRecord* localRecords,
892 const FullAtom* atomdata_AoS_in,
895 __shared__ CudaLocalRecord s_record;
896 using AccessType = int32_t;
897 AccessType* s_record_buffer = (AccessType*) &s_record;
899 typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
900 __shared__ typename BlockScan::TempStorage temp_storage;
902 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
903 // Read in the CudaLocalRecord using multiple threads. This should
905 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
906 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
910 const int numAtoms = s_record.numAtoms;
911 const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
913 int thread_input[MigrationCUDAKernel::kValuesPerThread];
914 int thread_soluteIndex[MigrationCUDAKernel::kValuesPerThread];
915 int thread_solventIndex[MigrationCUDAKernel::kValuesPerThread];
918 // Load the isWater value
919 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
920 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
921 if (idx < numAtoms) {
922 thread_input[i] = 1 - atomdata_AoS_in[offsetIn + idx].isWater;
929 // Compute the prefix sum of solute atoms
930 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_soluteIndex, numSolute);
934 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
935 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
936 if (idx < numAtoms) {
937 thread_input[i] = 1 - thread_input[i];
944 // Compute the prefix sum of water
945 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_solventIndex);
949 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
950 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
951 if (idx < numAtoms) {
952 if (thread_input[i]) {
953 sortIndex[offsetIn + idx] = numSolute + thread_solventIndex[i];
955 sortIndex[offsetIn + idx] = thread_soluteIndex[i];
964 * \brief Sorts atoms into solute-solvent ordering based on previous compute indices
967 * Using the previously computed indices, this kernel moves the atomic data in AoS format
968 * into the desired order. Note, this kernel copies the atomic data from the scratch buffer
969 * into the true atomic AoS buffer. The actual migration data movement copies into the scratch
970 * buffer and this kernel copies back.
972 * Since the AoS data is ~128 bytes per atom, each warp moves a single atom
975 __global__ void sortSolventKernel(
976 const int numPatches,
977 const CudaLocalRecord* localRecords,
978 const FullAtom* atomdata_AoS_in,
979 FullAtom* atomdata_AoS_out,
982 __shared__ CudaLocalRecord s_record;
983 using AccessType = int32_t;
984 AccessType* s_record_buffer = (AccessType*) &s_record;
986 const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
987 const int wid = threadIdx.x / WARPSIZE;
988 const int tid = threadIdx.x % WARPSIZE;
990 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
991 // Read in the CudaLocalRecord using multiple threads. This should
993 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
994 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
998 const int numAtoms = s_record.numAtoms;
1000 // This was causing issues with CUDA11.3. Needed to explicitly make the offset
1003 const int64_t offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1004 const int64_t offset = s_record.bufferOffset;
1006 const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1007 const int offset = s_record.bufferOffset;
1010 for (int i = wid; i < numAtoms; i+=warps_per_threadblock) {
1011 const int dst_idx = sortIndex[offsetIn + i];
1012 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1013 int32_t *src_int = (int32_t*) &(atomdata_AoS_in[offsetIn + i]);
1014 int32_t *dst_int = (int32_t*) &(atomdata_AoS_out[offset + dst_idx]);
1015 dst_int[tid] = src_int[tid];
1017 #if defined(NAMD_HIP)
1018 NAMD_WARP_SYNC(WARP_FULL_MASK);
1020 WARP_SYNC(WARP_FULL_MASK);
1028 void MigrationCUDAKernel::sortSolventAtoms(
1029 const int numPatches,
1030 const CudaLocalRecord* records,
1031 const FullAtom* atomdata_AoS_in,
1032 FullAtom* atomdata_AoS_out,
1036 constexpr int numThreads = kSortNumThreads;
1037 const int numBlocks = numPatches;
1039 computeSolventIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1046 sortSolventKernel<<<numBlocks, numThreads, 0, stream>>>(
1056 * \brief Computes the migration group index
1059 * Atom migration must occur at the level of migration groups. Some atoms are
1060 * moved together, i.e. the hydrogens of a water molecule are moved based on the oxygen's position
1062 * This kernel computes the number of migration groups per patch as well as their starting
1063 * indices. This will be used later during migration. It does this with a scan operation
1066 __global__ void computeMigrationGroupIndexKernel(
1067 const int numPatches,
1068 CudaLocalRecord* localRecords,
1069 const int* migrationGroupSize,
1070 int* migrationGroupIndex
1072 __shared__ CudaLocalRecord s_record;
1073 using AccessType = int32_t;
1074 AccessType* s_record_buffer = (AccessType*) &s_record;
1076 typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1077 __shared__ typename BlockScan::TempStorage temp_storage;
1079 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1080 // Read in the CudaLocalRecord using multiple threads. This should
1082 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1083 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1087 const int numAtoms = s_record.numAtoms;
1088 const int offset = s_record.bufferOffset;
1090 int thread_size[MigrationCUDAKernel::kValuesPerThread];
1091 int thread_index[MigrationCUDAKernel::kValuesPerThread];
1094 // Load the migration group size
1095 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1096 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1097 if (idx < numAtoms) {
1098 thread_size[i] = migrationGroupSize[offset + idx] ? 1 : 0;
1105 // Compute the prefix sum of migration groups
1106 BlockScan(temp_storage).ExclusiveSum(thread_size, thread_index, numGroups);
1109 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1110 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1111 if (thread_size[i] != 0) {
1112 migrationGroupIndex[offset + thread_index[i]] = idx;
1117 if (threadIdx.x == 0) {
1118 localRecords[patchIndex].numMigrationGroups = numGroups;
1125 void MigrationCUDAKernel::computeMigrationGroupIndex(
1126 const int numPatches,
1127 CudaLocalRecord* records,
1128 const int* migrationGroupSize,
1129 int* migrationGroupIndex,
1132 constexpr int numThreads = kSortNumThreads;
1133 const int numBlocks = numPatches;
1135 computeMigrationGroupIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1144 * \brief Computes the transformed positions of migrated atoms
1147 * When atoms move between patches, we need to do a transform on the position based
1148 * on the lattice. This logic is largely copied from HomePatch.C. This is only done
1149 * on the atoms that have actually moved between patches. This uses the numAtomsLocal
1150 * field of the CudaLocalRecord
1152 * Note this kernel could likely be optimized. Threads that are not assigned to the
1153 * parent of a migration group do no work, but not that many atoms actually migrate
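 *
 * For a migration group larger than its first hydrogen group, the parent transform is
 * obtained by wrapping the geometric center of the group's hydrogen-group parent atoms to
 * the nearest image of the patch center; for a group that is a single hydrogen group, the
 * parent atom's own position is wrapped directly. The same parent transform is then applied
 * to every atom in the group (undoing each atom's previous transform first) so the group
 * stays intact.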
1156 __global__ void transformMigratedPositionsKernel(
1157 const int numPatches,
1158 const CudaLocalRecord* localRecords,
1159 const double3* patchCenter,
1160 FullAtom* atomdata_AoS,
1161 const Lattice lattice
1163 __shared__ CudaLocalRecord s_record;
1164 using AccessType = int32_t;
1165 AccessType* s_record_buffer = (AccessType*) &s_record;
1167 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1168 // Read in the CudaLocalRecord using multiple threads. This should
1170 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1171 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1175 const int offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1176 const int numAtoms = s_record.numAtoms;
1177 const int numAtomsLocal = s_record.numAtomsLocal;
1178 const int numAtomsMigrated = numAtoms - numAtomsLocal;
1179 const double3 center = patchCenter[patchIndex];
1181 for (int i = threadIdx.x; i < numAtomsMigrated; i += blockDim.x) {
1182 const int startingIndex = numAtomsLocal + i;
1183 const int migrationSize = atomdata_AoS[offset + startingIndex].migrationGroupSize;
1184 const int hydrogenSize = atomdata_AoS[offset + startingIndex].hydrogenGroupSize;
1186 if (migrationSize == 0) continue;
1188 Transform parent_transform;
1189 if (migrationSize != hydrogenSize) {
1190 double3 c_pos = make_double3(0.0, 0.0, 0.0);
1192 int tmp_hydrogenGroupSize = hydrogenSize;
1193 for (int j = 0; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1194 c_pos = c_pos + (double3) atomdata_AoS[offset + startingIndex + j].position;
1196 tmp_hydrogenGroupSize = atomdata_AoS[offset + startingIndex + j].hydrogenGroupSize;
1198 c_pos.x = c_pos.x / ((double) c);
1199 c_pos.y = c_pos.y / ((double) c);
1200 c_pos.z = c_pos.z / ((double) c);
1202 parent_transform = atomdata_AoS[offset+startingIndex].transform;
1203 c_pos = lattice.nearest(c_pos, center, &parent_transform);
1205 double3 pos = atomdata_AoS[offset+startingIndex].position;
1206 Transform transform = atomdata_AoS[offset+startingIndex].transform;
1208 pos = lattice.reverse_transform(pos, transform);
1209 pos = lattice.apply_transform(pos, parent_transform);
1211 atomdata_AoS[offset+startingIndex].transform = parent_transform;
1212 atomdata_AoS[offset+startingIndex].position = pos;
1215 double3 pos = atomdata_AoS[offset+startingIndex].position;
1216 Transform transform = atomdata_AoS[offset+startingIndex].transform;
1218 pos = lattice.nearest(pos, center, &transform);
1219 parent_transform = transform;
1221 atomdata_AoS[offset+startingIndex].transform = transform;
1222 atomdata_AoS[offset+startingIndex].position = pos;
1224 for (int j = 1; j < migrationSize; j++) {
1225 double3 pos = atomdata_AoS[offset+startingIndex + j].position;
1226 Transform transform = atomdata_AoS[offset+startingIndex +j].transform;
1228 pos = lattice.reverse_transform(pos, transform);
1229 pos = lattice.apply_transform(pos, parent_transform);
1231 atomdata_AoS[offset+startingIndex+j].transform = parent_transform;
1232 atomdata_AoS[offset+startingIndex+j].position = pos;
1239 void MigrationCUDAKernel::transformMigratedPositions(
1240 const int numPatches,
1241 const CudaLocalRecord* localRecords,
1242 const double3* patchCenter,
1243 FullAtom* atomdata_AoS,
1244 const Lattice lattice,
1247 constexpr int numThreads = kSortNumThreads;
1248 const int numBlocks = numPatches;
1250 transformMigratedPositionsKernel<<<numBlocks, numThreads, 0, stream>>>(
1261 * \brief Computes the new location of a migration group
1264 * Computes the migration destination for each migration group. This info includes:
1265 * - Destination patch ID
1266 * - Destination patch local index (this is the home patches index)
1267 * - Destination device
1269 * While the information is computed at the migration group level, it is stored for each atom
1270 * in a migration group, so some of the later data movement can happen at the per atom level
1272 * Note the migrationDestination structure will eventually include the atom's new index within a
1273 * patch, but that is determined later
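 *
 * The destination is found by scaling the migration group's position (the average over its
 * hydrogen-group parent atoms when the group spans multiple hydrogen groups) into lattice
 * coordinates and comparing it against patchMin/patchMax, which yields a 3x3x3 neighbor
 * index (0 = below, 1 = inside, 2 = above per dimension) used to look up destPatchID in the
 * patch's CudaMInfo.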
1277 computeMigrationDestinationKernel(
1278 const int numPatches,
1279 CudaLocalRecord* localRecords,
1280 const Lattice lattice,
1281 const CudaMInfo* mInfo,
1282 const int* patchToDeviceMap,
1283 const int* globalToLocalMap,
1284 const double3* patchMin,
1285 const double3* patchMax,
1286 const int* hydrogenGroupSize,
1287 const int* migrationGroupSize,
1288 const int* migrationGroupIndex,
1289 const double* pos_x,
1290 const double* pos_y,
1291 const double* pos_z,
1292 int4* migrationDestination
1294 __shared__ CudaLocalRecord s_record;
1295 using AccessType = int32_t;
1296 AccessType* s_record_buffer = (AccessType*) &s_record;
1298 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1299 // Read in the CudaLocalRecord using multiple threads. This should
1301 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1302 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1305 // Clear migration atom count
1306 if (threadIdx.x == 0) {
1307 localRecords[patchIndex].numAtomsNew = 0;
1311 const int numMigrationGroups = s_record.numMigrationGroups;
1312 const int offset = s_record.bufferOffset;
1313 const double3 min = patchMin[patchIndex];
1314 const double3 max = patchMax[patchIndex];
1315 CudaMInfo migration_info = mInfo[patchIndex];
1317 int xdev, ydev, zdev;
1319 for (int i = threadIdx.x; i < numMigrationGroups; i += blockDim.x) {
1321 const int startingIndex = migrationGroupIndex[offset + i];
1322 const int migrationSize = migrationGroupSize[offset + startingIndex];
1323 const int hydrogenSize = hydrogenGroupSize[offset + startingIndex];
1326 pos.x = pos_x[offset + startingIndex];
1327 pos.y = pos_y[offset + startingIndex];
1328 pos.z = pos_z[offset + startingIndex];
1330 if (migrationSize != hydrogenSize) {
1332 int tmp_hydrogenGroupSize = 1;
1333 for (int j = hydrogenSize; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1334 pos.x = pos.x + pos_x[offset + startingIndex + j];
1335 pos.y = pos.y + pos_y[offset + startingIndex + j];
1336 pos.z = pos.z + pos_z[offset + startingIndex + j];
1338 tmp_hydrogenGroupSize = hydrogenGroupSize[offset + startingIndex + j];
1340 pos.x = pos.x / ((double) c);
1341 pos.y = pos.y / ((double) c);
1342 pos.z = pos.z / ((double) c);
1345 double3 s = lattice.scale(pos);
1347 if (s.x < min.x) xdev = 0;
1348 else if (max.x <= s.x) xdev = 2;
1351 if (s.y < min.y) ydev = 0;
1352 else if (max.y <= s.y) ydev = 2;
1355 if (s.z < min.z) zdev = 0;
1356 else if (max.z <= s.z) zdev = 2;
1359 int dest_patch = migration_info.destPatchID[xdev][ydev][zdev];
1360 int dest_device = patchToDeviceMap[dest_patch];
1361 int dest_localID = globalToLocalMap[dest_patch];
1363 dest_info.x = dest_patch;
1364 dest_info.y = dest_device;
1365 dest_info.z = dest_localID;
1368 for (int j = 0; j < migrationSize; j++) {
1369 migrationDestination[offset + startingIndex + j] = dest_info;
1376 void MigrationCUDAKernel::computeMigrationDestination(
1377 const int numPatches,
1378 CudaLocalRecord* localRecords,
1379 const Lattice lattice,
1380 const CudaMInfo* mInfo,
1381 const int* patchToDeviceMap,
1382 const int* globalToLocalMap,
1383 const double3* patchMin,
1384 const double3* patchMax,
1385 const int* hydrogenGroupSize,
1386 const int* migrationGroupSize,
1387 const int* migrationGroupIndex,
1388 const double* pos_x,
1389 const double* pos_y,
1390 const double* pos_z,
1391 int4* migrationDestination,
1394 constexpr int numThreads = kSortNumThreads;
1395 const int numBlocks = numPatches;
1397 computeMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1408 migrationGroupIndex,
1409 pos_x, pos_y, pos_z,
1410 migrationDestination
1416 * \brief Updates AoS data structure before migration
1419 * Copies the necessary data from the SoA buffers to the AoS buffers
1420 * This is just the positions and velocities
1422 * TODO optimize this kernel
1425 __global__ void update_AoSKernel(
1426 const int numPatches,
1427 const CudaLocalRecord* localRecords,
1428 FullAtom* atomdata_AoS,
1429 const double* vel_x,
1430 const double* vel_y,
1431 const double* vel_z,
1432 const double* pos_x,
1433 const double* pos_y,
1436 __shared__ CudaLocalRecord s_record;
1437 using AccessType = int32_t;
1438 AccessType* s_record_buffer = (AccessType*) &s_record;
1440 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1441 // Read in the CudaLocalRecord using multiple threads. This should
1443 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1444 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1448 const int numAtoms = s_record.numAtoms;
1449 const int offset = s_record.bufferOffset;
1451 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1453 pos.x = pos_x[offset + i];
1454 pos.y = pos_y[offset + i];
1455 pos.z = pos_z[offset + i];
1458 vel.x = vel_x[offset + i];
1459 vel.y = vel_y[offset + i];
1460 vel.z = vel_z[offset + i];
1462 atomdata_AoS[offset + i].position = pos;
1463 atomdata_AoS[offset + i].velocity = vel;
1469 void MigrationCUDAKernel::update_AoS(
1470 const int numPatches,
1471 const CudaLocalRecord* records,
1472 FullAtom* atomdata_AoS,
1473 const double* vel_x,
1474 const double* vel_y,
1475 const double* vel_z,
1476 const double* pos_x,
1477 const double* pos_y,
1478 const double* pos_z,
1481 constexpr int numThreads = kSortNumThreads;
1482 const int numBlocks = numPatches;
1484 update_AoSKernel<<<numBlocks, numThreads, 0, stream>>>(
1488 vel_x, vel_y, vel_z,
1495 * \brief Copies atoms that aren't "moving" into the scratch buffers
1498 * This kernel will copy the atoms that are not moving to a new patch into that
1499 * patch's scratch buffers. This uses a scan operation to eliminate the need for
1500 * atomic operations to compute each atom's new location.
1502 * This scan will not change the order of the atoms within a migration group
1504 * Note this will set the localID of the atom in the migrationDestination
1505 * Note because AoS atomic data is ~128 bytes, each warp moves 1 atom
1508 __global__ void performLocalMigrationKernel(
1509 const int numPatches,
1510 CudaLocalRecord* localRecords,
1511 const FullAtom* atomdata_AoS_in,
1512 FullAtom* atomdata_AoS_out,
1513 int4* migrationDestination
1515 __shared__ CudaLocalRecord s_record;
1516 using AccessType = int32_t;
1517 AccessType* s_record_buffer = (AccessType*) &s_record;
1519 typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1520 __shared__ typename BlockScan::TempStorage temp_storage;
1522 __shared__ int s_local_index[MigrationCUDAKernel::kSortNumThreads * MigrationCUDAKernel::kValuesPerThread];
1524 const int warps_per_threadblock = blockDim.x / WARPSIZE;
1525 const int wid = threadIdx.x / WARPSIZE;
1526 const int tid = threadIdx.x % WARPSIZE;
1528 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1529 // Read in the CudaLocalRecord using multiple threads. This should
1531 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1532 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1536 const int patchID = s_record.patchID;
1537 const int numAtoms = s_record.numAtoms;
1538 const int offset = s_record.bufferOffset;
1539 const int dst_offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1542 // Load whether the atom is staying in the same patch
1543 int thread_input[MigrationCUDAKernel::kValuesPerThread];
1544 int thread_output[MigrationCUDAKernel::kValuesPerThread];
1546 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1547 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1548 if (idx < numAtoms && migrationDestination[offset + idx].x == patchID) {
1549 thread_input[i] = 1;
1551 thread_input[i] = 0;
1556 // Compute the prefix sum of local atoms
1557 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_output, numLocal);
1560 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1561 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1562 if (idx < numAtoms && thread_input[i]) {
1563 s_local_index[thread_output[i]] = idx;
1566 if (threadIdx.x == 0) {
1567 localRecords[patchIndex].numAtomsLocal = numLocal;
1568 localRecords[patchIndex].numAtomsNew = numLocal;
1572 // We can do this at the atom level instead of the migrationGroup level
1573 // because the migrationDestinations take that into account and
1574 // the prefix sum means we do not change the ordering of local atoms
1575 for (int i = wid; i < numLocal; i += warps_per_threadblock) {
1576 const int src_atom_idx = s_local_index[i];
1577 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1578 int32_t *src_int = (int32_t*) &(atomdata_AoS_in [offset + src_atom_idx]);
1579 int32_t *dst_int = (int32_t*) &(atomdata_AoS_out [dst_offset + i]);
1580 dst_int[tid] = src_int[tid];
1583 migrationDestination[offset + src_atom_idx].w = i;
1585 #if defined(NAMD_HIP)
1586 NAMD_WARP_SYNC(WARP_FULL_MASK);
1588 WARP_SYNC(WARP_FULL_MASK);
1595 void MigrationCUDAKernel::performLocalMigration(
1596 const int numPatches,
1597 CudaLocalRecord* records,
1598 const FullAtom* atomdata_AoS_in,
1599 FullAtom* atomdata_AoS_out,
1600 int4* migrationDestination,
1603 constexpr int numThreads = kSortNumThreads;
1604 const int numBlocks = numPatches;
1606 performLocalMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1611 migrationDestination
1617 * \brief Moves the migrating atoms into their new patches
1620 * Copies the atoms which have moved into new patches
1621 * into those patches' scratch buffers. This will use atomic operations to
1622 * determine the new index of each migration group. This happens at the migration
1623 * group level so atoms within a migration group stay in the same order.
1625 * Note because not many atoms move, the use of atomics isn't too expensive
1626 * Note because AoS atomic data is ~128 bytes, each warp moves 1 atom
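 *
 * For each migrating group, one lane of the warp reserves size slots in the destination
 * patch's numAtomsNew counter with a system-wide atomic add on the destination device, the
 * base index is broadcast to the rest of the warp with a shuffle, and the warp then copies
 * the group's atoms into the destination patch's scratch buffer through the peer pointers.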
1629 __global__ void performMigrationKernel(
1630 const int numPatches,
1631 CudaLocalRecord* localRecords,
1632 CudaLocalRecord** peer_records,
1633 const FullAtom* local_atomdata_AoS,
1634 FullAtom** peer_atomdata_AoS,
1635 const int* migrationGroupSize,
1636 const int* migrationGroupIndex,
1637 int4* migrationDestination
1639 __shared__ CudaLocalRecord s_record;
1640 using AccessType = int32_t;
1641 AccessType* s_record_buffer = (AccessType*) &s_record;
1643 const int warps_per_threadblock = blockDim.x / WARPSIZE;
1644 const int wid = threadIdx.x / WARPSIZE;
1645 const int tid = threadIdx.x % WARPSIZE;
1647 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1648 // Read in the CudaLocalRecord using multiple threads. This should
1650 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1651 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1655 const int numMigrationGroups = s_record.numMigrationGroups;
1656 const int offset = s_record.bufferOffset;
1657 const int patchID = s_record.patchID;
1661 for (int i = wid; i < numMigrationGroups; i += warps_per_threadblock) {
1662 const int startingIndex = migrationGroupIndex[offset + i];
1664 const int size = migrationGroupSize[offset + startingIndex];
1666 const int4 migrationInfo = migrationDestination[offset + startingIndex];
1667 const int dst_patchID = migrationInfo.x;
1668 const int dst_device = migrationInfo.y;
1669 const int dst_localID = migrationInfo.z;
1670 const int dst_offset = dst_localID * MigrationCUDAKernel::kMaxAtomsPerPatch;
1672 if (dst_patchID == patchID) continue;
1674 // Get index via atomic operation
1677 int* counter_index = &(peer_records[dst_device][dst_localID].numAtomsNew);
1678 #if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
1679 dst_atom_idx = atomicAdd_system(counter_index, size);
1681 // support single-GPU Maxwell
1682 dst_atom_idx = atomicAdd(counter_index, size);
1685 dst_atom_idx = WARP_SHUFFLE(WARP_FULL_MASK, dst_atom_idx, 0, WARPSIZE);
1688 FullAtom* remote_atomdata_AoS = peer_atomdata_AoS[dst_device];
1689 for (int j = 0; j < size; j++) {
1690 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1691 int32_t *src_int = (int32_t*) &(local_atomdata_AoS [offset + startingIndex + j]);
1692 int32_t *dst_int = (int32_t*) &(remote_atomdata_AoS [dst_offset + dst_atom_idx + j]);
1693 dst_int[tid] = src_int[tid];
1696 migrationDestination[offset + startingIndex + j].w = dst_atom_idx + j;
1698 #if defined(NAMD_HIP)
1699 NAMD_WARP_SYNC(WARP_FULL_MASK);
1701 WARP_SYNC(WARP_FULL_MASK);
1710 void MigrationCUDAKernel::performMigration(
1711 const int numPatches,
1712 CudaLocalRecord* records,
1713 CudaLocalRecord** peer_records,
1714 const FullAtom* local_atomdata_AoS,
1715 FullAtom** peer_atomdata_AoS,
1716 const int* migrationGroupSize,
1717 const int* migrationGroupIndex,
1718 int4* migrationDestination,
1721 constexpr int numThreads = kSortNumThreads;
1722 const int numBlocks = numPatches;
1724 performMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1731 migrationGroupIndex,
1732 migrationDestination
1738 * \brief Updates migrationDestination array based on solute-solvent order
1741 * The migrationDestination structure is used later by the tuple migration, so
1742 * we need to keep it accurate after the solute-solvent sorting. This uses peer
1743 * access to update each atom's new index within its new patch.
1746 __global__ void updateMigrationDestinationKernel(
1747 const int numAtomsHome,
1748 int4* migrationDestination,
1749 int** d_peer_sortSoluteIndex
1751 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1752 for (int i = tid; i < numAtomsHome; i += blockDim.x * gridDim.x) {
1753 const int4 dest = migrationDestination[i];
1754 const int dst_device = dest.y;
1755 const int dst_patchID = dest.z;
1756 const int dst_patchOffset = MigrationCUDAKernel::kMaxAtomsPerPatch * dst_patchID;
1757 const int inputIndex = dest.w;
1759 const int outputIndex = d_peer_sortSoluteIndex[dst_device][dst_patchOffset + inputIndex];
1760 migrationDestination[i].w = outputIndex;
1764 void MigrationCUDAKernel::updateMigrationDestination(
1765 const int numAtomsHome,
1766 int4* migrationDestination,
1767 int** d_peer_sortSoluteIndex,
1770 constexpr int numThreads = kSortNumThreads;
1771 const int numBlocks = (numAtomsHome + kSortNumThreads - 1) / kSortNumThreads;
1773 updateMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1775 migrationDestination,
1776 d_peer_sortSoluteIndex
1782 * \brief Copies static atomic data to proxy patches
1785 * Copies atomic data that does not change to proxy patches.
1788 template <bool doAlch>
1789 __global__ void copyDataToProxiesKernel(
1790 const int deviceIndex,
1791 const int numPatchesHome,
1792 const int numPatchesHomeAndProxy,
1793 const CudaLocalRecord* localRecords,
1796 int** peer_sortOrder,
1797 int** peer_unsortOrder,
1798 float** peer_charge,
1799 int** peer_partition,
1800 double3** peer_patchCenter
1802 __shared__ CudaLocalRecord s_record;
1803 using AccessType = int32_t;
1804 AccessType* s_record_buffer = (AccessType*) &s_record;
1806 for (int patchIndex = blockIdx.x + numPatchesHome; patchIndex < numPatchesHomeAndProxy; patchIndex += gridDim.x) {
1807 // Read in the CudaLocalRecord using multiple threads. This should
1809 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1810 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1814 const int numAtoms = s_record.numAtoms;
1815 const int dstOffset = s_record.bufferOffset;
1816 const int srcDevice = s_record.inline_peers[0].deviceIndex;
1817 const int srcOffset = s_record.inline_peers[0].bufferOffset;
1818 const int srcPatchIndex = s_record.inline_peers[0].patchIndex;
1820 // TODO this data is probably on the host somewhere...
1821 // And it probably doesn't change????
1822 if (threadIdx.x == 0) {
1823 peer_patchCenter[deviceIndex][patchIndex] = peer_patchCenter[srcDevice][srcPatchIndex];
1827 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1828 peer_id [deviceIndex][dstOffset + i] = peer_id[srcDevice][srcOffset + i];
1829 peer_vdwType [deviceIndex][dstOffset + i] = peer_vdwType[srcDevice][srcOffset + i];
1830 peer_sortOrder [deviceIndex][dstOffset + i] = peer_sortOrder[srcDevice][srcOffset + i];
1831 peer_unsortOrder[deviceIndex][dstOffset + i] = peer_unsortOrder[srcDevice][srcOffset + i];
1832 peer_charge [deviceIndex][dstOffset + i] = peer_charge[srcDevice][srcOffset + i];
1834 peer_partition[deviceIndex][dstOffset + i] = peer_partition[srcDevice][srcOffset + i];
1841 void MigrationCUDAKernel::copyDataToProxies(
1842 const int deviceIndex,
1843 const int numPatchesHome,
1844 const int numPatchesHomeAndProxy,
1845 const CudaLocalRecord* records,
1848 int** peer_sortOrder,
1849 int** peer_unsortOrder,
1850 float** peer_charge,
1851 int** peer_partition,
1852 double3** peer_patchCenter,
1856 constexpr int numThreads = kSortNumThreads;
1857 const int numPatches = numPatchesHomeAndProxy - numPatchesHome;
1858 const int numBlocks = numPatches;
1861 copyDataToProxiesKernel<true><<<numBlocks, numThreads, 0, stream>>>(
1863 numPatchesHome, numPatchesHomeAndProxy,
1865 peer_id, peer_vdwType, peer_sortOrder,
1866 peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1870 copyDataToProxiesKernel<false><<<numBlocks, numThreads, 0, stream>>>(
1872 numPatchesHome, numPatchesHomeAndProxy,
1874 peer_id, peer_vdwType, peer_sortOrder,
1875 peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1881 * \brief Copies migrationDestination to proxy patches
1884 * This copies the migrationDestination to proxies to be used in tuple migration
1885 * This needs to use the old bufferOffsets and atom count so it is separate from the
1886 * other copyDataToProxies kernel. Additionally, we do this as a put operation to
1887 * avoid a node barrier. The home patch produces this data and then it needs to write it
1888 * to the proxy patch; if the device which owns the proxy patch were doing a get operation,
1889 * we'd need a node barrier to make sure this data was ready to be read. By using a
1890 * put operation, we can do a device-local synchronization (i.e. a new kernel launch) to
1891 * make sure the data is ready
1894 __global__ void copyMigrationDestinationToProxiesKernel(
1895 const int deviceIndex,
1896 const int numPatchesHome,
1897 const int numPatchesHomeAndProxy,
1898 const CudaLocalRecord* localRecords,
1899 const CudaPeerRecord* peerRecords,
1900 int4** peer_migrationDestination
1902 __shared__ CudaLocalRecord s_record;
1903 using AccessType = int32_t;
1904 AccessType* s_record_buffer = (AccessType*) &s_record;
1906 for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
1907 // Read in the CudaLocalRecord using multiple threads. This should
1909 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1910 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1914 const int numAtoms = s_record.numAtomsNew;
1915 const int dstOffset = s_record.bufferOffsetOld;
1916 const int numPeerRecords = s_record.numPeerRecord;
1917 const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
1919 const int inlineRemotes = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
1921 for (int remote = 0; remote < inlineRemotes; remote++) {
1922 const int srcDevice = s_record.inline_peers[remote].deviceIndex;
1923 const int srcOffset = s_record.inline_peers[remote].bufferOffset;
1924 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1925 peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1929 for (int remote = inlineRemotes; remote < numPeerRecords; remote++) {
1930 const int srcDevice = peerRecords[peerRecordStartIndexs + remote].deviceIndex;
1931 const int srcOffset = peerRecords[peerRecordStartIndexs + remote].bufferOffset;
1932 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1933 peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1940 void MigrationCUDAKernel::copyMigrationDestinationToProxies(
1941 const int deviceIndex,
1942 const int numPatchesHome,
1943 const int numPatchesHomeAndProxy,
1944 const CudaLocalRecord* records,
1945 const CudaPeerRecord* peerRecords,
1946 int4** peer_migrationDestination,
1949 constexpr int numThreads = kSortNumThreads;
1950 const int numBlocks = numPatchesHome;
1952 copyMigrationDestinationToProxiesKernel<<<numBlocks, numThreads, 0, stream>>>(
1954 numPatchesHome, numPatchesHomeAndProxy,
1957 peer_migrationDestination
1962 * \brief Updates the CudaLocalRecord data structure
1965 * This updates the number of atoms and buffer offsets of the patches. This should only
1966 * be run on the home patches since it uses peer access to update remote structures
1969 __global__ void updateLocalRecordsKernel(
1970 const int numPatchesHome,
1971 CudaLocalRecord* localRecords,
1972 CudaLocalRecord** peer_records,
1973 const CudaPeerRecord* peerRecords
1975 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1976 for (int i = tid; i < numPatchesHome; i += blockDim.x * gridDim.x) {
1977 const int numAtomsOld = localRecords[i].numAtoms;
1978 const int numAtoms = localRecords[i].numAtomsNew;
1979 const int numAtomsNBPad = CudaComputeNonbondedKernel::computeAtomPad(numAtoms);
1980 localRecords[i].numAtoms = numAtoms;
1981 localRecords[i].numAtomsNBPad = numAtomsNBPad;
1982 localRecords[i].bufferOffsetOld = localRecords[i].bufferOffset;
1983 localRecords[i].numAtomsNew = numAtomsOld;
1985 const int numPeerRecord = localRecords[i].numPeerRecord;
1986 const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
1987 // TODO use inline remotes??
1989 for (int remote = 0; remote < numPeerRecord; remote++) {
1990 const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
1991 const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
1992 peer_records[srcDevice][srcOffset].numAtoms = numAtoms;
1993 peer_records[srcDevice][srcOffset].numAtomsNBPad = numAtomsNBPad;
1998 void MigrationCUDAKernel::updateLocalRecords(
1999 const int numPatchesHome,
2000 CudaLocalRecord* records,
2001 CudaLocalRecord** peer_records,
2002 const CudaPeerRecord* peerRecords,
2005 const int numThreads = kSortNumThreads;
2006 const int numBlocks = (numPatchesHome + kSortNumThreads - 1) / kSortNumThreads;
2008 updateLocalRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
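// A standalone trace (hypothetical values, assumed padding rule) of the field
// rotation performed above for one patch that shrinks from 100 to 97 atoms
// during migration: the post-migration count becomes numAtoms, the pre-migration
// offset is preserved in bufferOffsetOld, and the pre-migration count is parked
// in numAtomsNew, which is what copyMigrationDestinationToProxiesKernel reads
// together with bufferOffsetOld when it needs the old layout.
#include <cstdio>

struct RecordSketch { int numAtoms, numAtomsNew, numAtomsNBPad, bufferOffset, bufferOffsetOld; };

static inline void traceRecordRotationSketch() {
  RecordSketch r = {100, 97, 0, 512, 0};              // state before the update
  const int numAtomsOld = r.numAtoms;
  r.numAtoms        = r.numAtomsNew;                  // 97: post-migration count
  r.numAtomsNBPad   = ((r.numAtoms + 31) / 32) * 32;  // 128: assumed warp-multiple pad
  r.bufferOffsetOld = r.bufferOffset;                 // 512: pre-migration offset kept
  r.numAtomsNew     = numAtomsOld;                    // 100: pre-migration count parked
  printf("numAtoms=%d NBPad=%d offsetOld=%d numAtomsNew=%d\n",
         r.numAtoms, r.numAtomsNBPad, r.bufferOffsetOld, r.numAtomsNew);
}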
2018 struct RecordToNumAtoms {
2019 __host__ __device__ __forceinline__
2020 int operator()(const CudaLocalRecord &a) const { return a.numAtoms; }
2025 struct RecordTonumAtomsNBPad {
2026 __host__ __device__ __forceinline__
2027 int operator()(const CudaLocalRecord &a) const {
2028 return a.numAtomsNBPad;
2033 * \brief Updates the buffer offsets based on scratch patchOffsets
2036 __global__ void updateLocalRecordsOffsetKernel(
2037 const int numPatchesHomeAndProxy,
2038 CudaLocalRecord* localRecords,
2042 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2043 for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2044 localRecords[i].bufferOffset = patchOffsets[i];
2045 localRecords[i].bufferOffsetNBPad = patchOffsetsNB[i];
2049 void MigrationCUDAKernel::updateLocalRecordsOffset(
2050 const int numPatchesHomeAndProxy,
2051 CudaLocalRecord* records,
2054 const int numThreads = kSortNumThreads;
2055 const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2056 size_t temp = patchDeviceScan_alloc;
2059 // Integration Offsets
2061 RecordToNumAtoms conversion_op;
2062 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
2063 using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2065 using InputIter = thrust::transform_iterator<RecordToNumAtoms, CudaLocalRecord*, int>;
2067 InputIter iter(records, conversion_op);
2069 cub::DeviceScan::ExclusiveSum<InputIter>(
2070 d_patchDeviceScan_scratch, temp,
2071 iter, d_patchOffset_temp, numPatchesHomeAndProxy, stream
2075 // Nonbonded Offsets
2077 RecordTonumAtomsNBPad conversion_op_nb;
2078 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
2079 using InputIterNB = cub::TransformInputIterator<int, RecordTonumAtomsNBPad, CudaLocalRecord*>;
2081 using InputIterNB = thrust::transform_iterator<RecordTonumAtomsNBPad, CudaLocalRecord*, int>;
2083 InputIterNB iterNB(records, conversion_op_nb);
2085 cub::DeviceScan::ExclusiveSum<InputIterNB>(
2086 d_patchDeviceScan_scratch, temp,
2087 iterNB, d_patchOffsetNB_temp, numPatchesHomeAndProxy, stream
2093 updateLocalRecordsOffsetKernel<<<numBlocks, numThreads, 0, stream>>>(
2094 numPatchesHomeAndProxy,
2097 d_patchOffsetNB_temp
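// A minimal sketch (hypothetical record type, not NAMD code; relies on the CUB
// headers already included at the top of this file) of the transform-iterator +
// ExclusiveSum pattern used above: per-patch counts of {3, 5, 2} scan to offsets
// {0, 3, 8}, i.e. each patch's buffer starts at the sum of all preceding counts.
// The first ExclusiveSum call with a null temp-storage pointer only reports the
// scratch size, the same convention allocateScratch uses below. Newer CCCL
// builds would use thrust::transform_iterator instead, as the #if above shows.
struct CountOfSketch {
  __host__ __device__ __forceinline__
  int operator()(const int2 &rec) const { return rec.x; }  // .x holds the atom count
};

void exclusiveOffsetsSketch(const int2* d_recs, int* d_offsets, int n, cudaStream_t stream) {
  CountOfSketch op;
  cub::TransformInputIterator<int, CountOfSketch, const int2*> iter(d_recs, op);

  void*  d_temp = nullptr;
  size_t bytes  = 0;
  cub::DeviceScan::ExclusiveSum(d_temp, bytes, iter, d_offsets, n, stream);  // sizing pass
  cudaMalloc(&d_temp, bytes);
  cub::DeviceScan::ExclusiveSum(d_temp, bytes, iter, d_offsets, n, stream);  // actual scan
  cudaFree(d_temp);
}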
2102 * \brief Updates the remote records on this device
2105 __global__ void updatePeerRecordsKernel(
2106 const int numPatchesHomeAndProxy,
2107 CudaLocalRecord* localRecords,
2108 CudaLocalRecord** peer_records,
2109 CudaPeerRecord* peerRecords
2111 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2112 for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2113 const int numPeerRecord = localRecords[i].numPeerRecord;
2114 const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
2116 for (int remote = 0; remote < numPeerRecord; remote++) {
2117 const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
2118 const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
2120 const int bufferOffset = peer_records[srcDevice][srcOffset].bufferOffset;
2121 const int bufferOffsetNBPad = peer_records[srcDevice][srcOffset].bufferOffsetNBPad;
2123 peerRecords[peerRecordStartIndex + remote].bufferOffset = bufferOffset;
2124 peerRecords[peerRecordStartIndex + remote].bufferOffsetNBPad = bufferOffsetNBPad;
2126 if (remote < CudaLocalRecord::num_inline_peer) {
2127 localRecords[i].inline_peers[remote].bufferOffset = bufferOffset;
2128 localRecords[i].inline_peers[remote].bufferOffsetNBPad = bufferOffsetNBPad;
2134 void MigrationCUDAKernel::updatePeerRecords(
2135 const int numPatchesHomeAndProxy,
2136 CudaLocalRecord* records,
2137 CudaLocalRecord** peer_records,
2138 CudaPeerRecord* peerRecords,
2141 const int numThreads = kSortNumThreads;
2142 const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2144 updatePeerRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
2145 numPatchesHomeAndProxy,
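// A minimal sketch (hypothetical types, not the NAMD structures) of the gather
// performed above: each record lists its peers as (device, patchIndex) pairs;
// the kernel follows the per-device pointer table to read that peer's freshly
// scanned offset, stores it in the flat peer array, and mirrors the first few
// entries inline so later kernels can skip the extra indirection.
struct PeerRefSketch  { int device; int patchIndex; int cachedOffset; };
struct LocalRecSketch { int numPeers; int peerStart; int inlineOffset[4]; };

__global__ void gatherPeerOffsetsSketch(
  LocalRecSketch* local,
  PeerRefSketch* peers,
  int** peerOffsetTables,   // one peer-accessible offset array per device
  const int numRecords
) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i >= numRecords) return;
  for (int r = 0; r < local[i].numPeers; r++) {
    PeerRefSketch& p = peers[local[i].peerStart + r];
    const int offset = peerOffsetTables[p.device][p.patchIndex];  // remote read
    p.cachedOffset = offset;                                      // flat peer array
    if (r < 4) local[i].inlineOffset[r] = offset;                 // inline mirror
  }
}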
2152 MigrationCUDAKernel::MigrationCUDAKernel() {
2153 d_patchOffset_temp = NULL;
2154 d_patchOffsetNB_temp = NULL;
2156 patchDeviceScan_alloc = 0;
2157 d_patchDeviceScan_scratch = NULL;
2160 MigrationCUDAKernel::~MigrationCUDAKernel() {
2161 if (d_patchOffset_temp != NULL) deallocate_device<int>(&d_patchOffset_temp);
2162 if (d_patchOffsetNB_temp != NULL) deallocate_device<int>(&d_patchOffsetNB_temp);
2163 if (d_patchDeviceScan_scratch != NULL) deallocate_device<char>(&d_patchDeviceScan_scratch);
2166 void MigrationCUDAKernel::allocateScratch(const int numPatchesHomeAndProxy) {
2167 allocate_device<int>(&d_patchOffset_temp, numPatchesHomeAndProxy);
2168 allocate_device<int>(&d_patchOffsetNB_temp, numPatchesHomeAndProxy);
2170 // Fake call to CUB to get required size
2171 RecordToNumAtoms conversion_op;
2172 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
2173 using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2175 using InputIter = thrust::transform_iterator<RecordToNumAtoms, CudaLocalRecord*, int>;
2177 InputIter iter(NULL, conversion_op);
2179 void *d_temp_storage = NULL;
2180 size_t temp_storage_bytes = 0;
2182 cub::DeviceScan::ExclusiveSum<InputIter>(
2183 d_temp_storage, temp_storage_bytes,
2184 iter, d_patchOffset_temp, numPatchesHomeAndProxy
2187 patchDeviceScan_alloc = temp_storage_bytes;
2188 allocate_device<char>(&d_patchDeviceScan_scratch, patchDeviceScan_alloc);
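// Generic sketch (not NAMD-specific; relies on the CUB headers included at the
// top of this file) of the sizing convention used by the "fake" call above:
// invoking a DeviceScan entry point with a null temp-storage pointer performs no
// work and only writes the required byte count, so the scratch buffer can be
// allocated once here and reused by the real scans in updateLocalRecordsOffset.
size_t queryExclusiveSumTempBytesSketch(const int* d_in, int* d_out, int n) {
  void*  d_temp = nullptr;
  size_t bytes  = 0;
  cub::DeviceScan::ExclusiveSum(d_temp, bytes, d_in, d_out, n);  // sizing pass only
  return bytes;
}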
2190 #endif // NODEGROUP_FORCE_REGISTER