2 // #include <algorithm>
6 #if __CUDACC_VER_MAJOR__ >= 11
7 #include <cub/device/device_radix_sort.cuh>
8 #include <cub/device/device_scan.cuh>
11 #include <namd_cub/device/device_radix_sort.cuh>
12 #include <namd_cub/device/device_scan.cuh>
13 #include <namd_cub/cub.cuh>
17 #include "CudaUtils.h"
18 #include "CudaTileListKernel.h"
19 #include "CudaComputeNonbondedKernel.h"
20 #include "DeviceCUDA.h"
22 #if defined(NAMD_CUDA)
25 #define __thread __declspec(thread)
28 extern __thread DeviceCUDA *deviceCUDA;
30 #define OVERALLOC 2.0f
32 #if __CUDA_ARCH__ < 350
36 void NAMD_die(const char *);
39 // Calculate the number of lists that contribute to each patch
41 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
42 const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
44 for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
46 int2 patchInd = tileLists[i].patchInd;
47 atomicAdd(&patchNumLists[patchInd.x], 1);
48 if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
54 // Write patchNumList back to the tile lists and
55 // collect empty patches into emptyPatches[0 ... numEmptyPatches-1]
57 __global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
58 TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
59 const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
61 for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
63 int2 patchInd = tileLists[i].patchInd;
64 int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
65 tileLists[i].patchNumList = patchNumList;
68 for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
70 if (patchNumLists[i] == 0) {
71 int ind = atomicAdd(numEmptyPatches, 1);
72 emptyPatches[ind] = i;
79 // Builds a sort key that pushes zero-depth tile lists to the end while otherwise preserving the order
81 __global__ void buildRemoveZerosSortKey(const int numTileLists,
82 const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
84 for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
86 int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
87 sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
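// Worked example (illustrative): with numTileLists = 4 and shifted depths
// {3, 0, 2, 0}, the keys become {0, 4, 2, 4}. Sorting by these keys in
// ascending order pushes the zero-depth lists to the end while keeping the
// non-empty lists in their original relative order.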
92 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
93 const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
94 const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
96 for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
98 int icompute = tileLists[itileList].icompute;
99 int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
100 int i = icompute*maxTileListLen + (depth - 1);
101 sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
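// Illustrative example: a tile list belonging to compute 2 with depth 5 picks
// up the precomputed key sortKeys[2*maxTileListLen + 4]; an empty list gets
// 0x7fffffff and therefore sorts to the very end.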
107 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
108 unsigned int* __restrict__ keys, int* __restrict__ vals) {
110 // NOTE: blockDim.x = width
112 for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
114 int i = base + threadIdx.x;
115 typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
116 __shared__ typename BlockRadixSort::TempStorage tempStorage;
117 unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
118 int val[1] = {(i < n) ? vals[i] : 0};
119 BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
129 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
130 const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
131 const int* __restrict__ tileListOrderSrc,
132 const unsigned int* __restrict__ tileListDepthSrc,
133 int* __restrict__ tileListOrderDst,
134 unsigned int* __restrict__ tileListDepthDst) {
136 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
138 int j = outputOrder[numTileListsSrc - i - 1];
139 if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
140 int k = tileListPos[i];
141 tileListDepthDst[k] = tileListDepthSrc[j];
142 tileListOrderDst[k] = j; //tileListOrderSrc[j];
148 // Convert tileListDepth into 0/1 occupancy flags (non-empty/empty), gathered in reverse outputOrder
150 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
151 const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
152 unsigned int* __restrict__ tileListDepthDst) {
154 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
156 int j = outputOrder[numTileLists - i - 1];
157 tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
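// Worked example (illustrative): with outputOrder = {2, 0, 1} and shifted
// depths {5, 0, 7}, the reversed traversal visits lists 1, 0, 2 and writes the
// occupancy flags {0, 1, 1}; the exclusive scan of these flags, computed later
// in sortTileLists(), gives the compacted slots that storeInReverse() above
// then fills, dropping the empty list.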
162 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
163 int2* __restrict__ minmaxListLen) {
166 val.x = maxTileListLen+1;
168 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
170 minmaxListLen[i] = val;
176 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
178 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
179 const TileList* __restrict__ tileListsSrc,
180 const int* __restrict__ tileListOrderDst,
181 const unsigned int* __restrict__ tileListDepthDst,
182 int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
184 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
186 int k = tileListOrderDst[i];
187 int icompute = tileListsSrc[k].icompute;
188 int depth = tileListDepthDst[i] & 65535;
189 // depth is in range [1 ... maxTileListLen]
190 int j = icompute*maxTileListLen + (depth-1);
192 int2 minmax = minmaxListLen[icompute];
193 int2 minmaxOrig = minmax;
194 if (minmax.x > depth) minmax.x = depth;
195 if (minmax.y < depth) minmax.y = depth;
196 if (minmax.x != minmaxOrig.x) {
197 atomicMin(&minmaxListLen[icompute].x, minmax.x);
199 if (minmax.y != minmaxOrig.y) {
200 atomicMax(&minmaxListLen[icompute].y, minmax.y);
206 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
207 const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
209 for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
210 const int wid = threadIdx.x % WARPSIZE;
211 int2 minmax = minmaxListLen[i];
212 int minlen = minmax.x;
213 int maxlen = minmax.y;
214 // minlen, maxlen are in range [1 ... maxTileListLen]
215 // provided compute i appeared as some tileListsSrc[].icompute above
216 if ( maxlen < minlen ) {
218 maxlen = maxTileListLen;
220 unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
221 unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
222 unsigned int aveKey = (maxKey + minKey)/2;
223 for (int j=wid;j < minlen-1;j+=WARPSIZE) {
224 sortKeys[i*maxTileListLen + j] = minKey;
226 for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
227 sortKeys[i*maxTileListLen + j] = maxKey;
229 for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
230 if (sortKeys[i*maxTileListLen + j] == 0) {
231 sortKeys[i*maxTileListLen + j] = aveKey;
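// Worked example (illustrative): if compute i only produced keys for depths
// 3..6, with minKey = 10 at depth 3 and maxKey = 40 at depth 6, the entries
// for depths 1..2 are filled with 10, those for depths above 6 with 40, and
// any entries in between that are still zero get the average key 25.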
239 // Calculate bounding boxes for sets of WARPSIZE=32 atoms
241 #define BOUNDINGBOXKERNEL_NUM_WARP 8
243 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
244 BoundingBox* __restrict__ boundingBoxes) {
246 const int warpId = threadIdx.x / WARPSIZE;
247 const int wid = threadIdx.x % WARPSIZE;
249 // Loop with warp-aligned index to avoid warp-divergence
250 for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
252 const int i = iwarp + wid;
253 // Bounding box index
254 const int ibb = i/WARPSIZE;
256 float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
258 volatile float3 minxyz, maxxyz;
260 typedef cub::WarpReduce<float> WarpReduce;
261 __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
262 #if (NAMD_CCCL_MAJOR_VERSION <= 2)
263 auto minOp = cub::Min();
264 auto maxOp = cub::Max();
266 auto minOp = cuda::minimum<double>();
267 auto maxOp = cuda::maximum<double>();
269 minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, minOp);
270 minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, minOp);
271 minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, minOp);
272 maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, maxOp);
273 maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, maxOp);
274 maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, maxOp);
277 BoundingBox boundingBox;
278 boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
279 boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
280 boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
281 boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
282 boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
283 boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
284 boundingBoxes[ibb] = boundingBox;
291 // Returns a lower bound for the squared distance between two bounding boxes
293 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
294 float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
295 float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
296 float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
297 float r2 = dx*dx + dy*dy + dz*dz;
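// Worked example (illustrative): two 1-D boxes centered at 0 and 5 with
// half-widths 1 and 2 give dx = max(0, |0-5| - 1 - 2) = 2, so no atom pair
// between the boxes can be closer than 2 (squared distance 4). Overlapping
// boxes give 0, i.e. the bound never rejects a pair that is within the cutoff.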
303 // Performs warp-level exclusive sum on a shared memory array:
304 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
305 // sh_in and sh_out can point to same array
306 // Returns the total sum
308 template <typename T>
309 __device__ __forceinline__
310 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
311 const int wid = threadIdx.x % WARPSIZE;
312 volatile int blockOffset = 0;
313 for (int iblock=0;iblock < n;iblock += WARPSIZE) {
314 // Size of the exclusive sum
315 int blockLen = min(WARPSIZE, n-iblock);
316 // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
317 typedef cub::WarpScan<int> WarpScan;
318 __shared__ typename WarpScan::TempStorage tempStorage;
319 int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
320 WarpScan(tempStorage).ExclusiveSum(data, data);
321 // Shift by block offset
324 int last = (int)sh_in[iblock + blockLen-1];
326 if (wid < blockLen) sh_out[iblock + wid] = data;
327 // block offset = last term of the exclusive sum + last value
328 blockOffset = sh_out[iblock + blockLen-1] + last;
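// Usage sketch (illustrative only, not from the original source): given a
// hypothetical shared-memory array sh_count[] of per-item counts filled by
// the warp,
//
//   int total = shWarpExclusiveSum(n, sh_count, sh_count);  // in-place scan
//
// leaves the output offset of item k in sh_count[k] and returns the total,
// matching the contract documented above.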
334 #define TILELISTKERNELNEW_NUM_WARP 4
337 // NOTE: Executed on a single thread block
339 template<int nthread>
340 __global__ void calcTileListPosKernel(const int numComputes,
341 const CudaComputeRecord* __restrict__ computes,
342 const CudaPatchRecord* __restrict__ patches,
343 int* __restrict__ tilePos) {
345 typedef cub::BlockScan<int, nthread> BlockScan;
347 __shared__ typename BlockScan::TempStorage tempStorage;
348 __shared__ int shTilePos0;
350 if (threadIdx.x == nthread-1) {
354 for (int base=0;base < numComputes;base+=nthread) {
355 int k = base + threadIdx.x;
357 int numTiles1 = (k < numComputes) ?
358 (CudaComputeNonbondedKernel::computeNumTiles(patches[computes[k].patchInd.x].numAtoms, WARPSIZE)) : 0;
359 // Calculate positions in tile list and jtile list
361 BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
363 // Store into global memory
364 if (k < numComputes) {
365 tilePos[k] = shTilePos0 + tilePosVal;
369 // Store block end position
370 if (threadIdx.x == nthread-1) {
371 shTilePos0 += tilePosVal + numTiles1;
377 template<int nthread>
378 __global__ void updatePatchesKernel(const int numComputes,
379 const int* __restrict__ tilePos,
380 const CudaComputeRecord* __restrict__ computes,
381 const CudaPatchRecord* __restrict__ patches,
382 TileList* __restrict__ tileLists) {
384 const int tid = threadIdx.x % nthread;
386 // nthread threads take care of one compute
387 for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
389 CudaComputeRecord compute = computes[k];
390 float3 offsetXYZ = compute.offsetXYZ;
391 int2 patchInd = compute.patchInd;
392 int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(patches[patchInd.x].numAtoms);
393 int itileList0 = tilePos[k];
394 for (int i=tid;i < numTiles1;i+=nthread) {
395 tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
396 tileLists[itileList0 + i].patchInd = patchInd;
397 tileLists[itileList0 + i].icompute = k;
403 __host__ __device__ __forceinline__
404 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
407 maxTileListLen*sizeof(char)
413 buildTileListsBBKernel(const int numTileLists,
414 TileList* __restrict__ tileLists,
415 const CudaPatchRecord* __restrict__ patches,
416 const int* __restrict__ tileListPos,
417 const float3 lata, const float3 latb, const float3 latc,
418 const float cutoff2, const int maxTileListLen,
419 const BoundingBox* __restrict__ boundingBoxes,
420 int* __restrict__ tileJatomStart,
421 const int tileJatomStartSize,
422 unsigned int* __restrict__ tileListDepth,
423 int* __restrict__ tileListOrder,
424 PatchPairRecord* __restrict__ patchPairs,
425 TileListStat* __restrict__ tileListStat) {
427 extern __shared__ char sh_buffer[];
428 int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
429 int pos = threadIdx.x*sizePerThread;
430 volatile char* sh_tile = (char*)&sh_buffer[pos];
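// Each thread thus owns its own sizePerThread-byte strip of the dynamic
// shared-memory buffer (one char per possible j-tile, assuming the per-thread
// size is just maxTileListLen bytes as suggested above); sh_tile[] is this
// thread's private scratch for the tile list it builds.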
432 // Loop with warp-aligned index to avoid warp-divergence
433 for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
435 // Use one thread per tile list
436 const int wid = threadIdx.x % WARPSIZE;
437 const int itileList = iwarp + wid;
440 int itileListLen = 0;
441 CudaPatchRecord patch1;
442 CudaPatchRecord patch2;
448 if (itileList < numTileLists) {
449 offsetXYZ = tileLists[itileList].offsetXYZ;
450 patchInd = tileLists[itileList].patchInd;
451 icompute = tileLists[itileList].icompute;
453 i = itileList - tileListPos[icompute];
455 float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
456 float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
457 float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
459 // DH - set zeroShift flag if magnitude of shift vector is zero
460 bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
463 patch1 = patches[patchInd.x];
464 patch2 = patches[patchInd.y];
465 // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
466 numTiles2 = CudaComputeNonbondedKernel::computeNumTiles(patch2.numAtoms);
467 int tileStart1 = patch1.atomStart/WARPSIZE;
468 int tileStart2 = patch2.atomStart/WARPSIZE;
470 // DH - self requires that zeroShift is also set
471 bool self = zeroShift && (tileStart1 == tileStart2);
473 // Load i-atom data (and shift coordinates)
474 BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
475 boundingBoxI.x += shx;
476 boundingBoxI.y += shy;
477 boundingBoxI.z += shz;
479 for (int j=0;j < numTiles2;j++) {
481 if (!self || j >= i) {
482 BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
483 float r2bb = distsq(boundingBoxI, boundingBoxJ);
484 if (r2bb < cutoff2) {
491 tileListDepth[itileList] = (unsigned int)itileListLen;
492 tileListOrder[itileList] = itileList;
495 typedef cub::WarpScan<int> WarpScan;
496 __shared__ typename WarpScan::TempStorage tempStorage;
497 int active = (itileListLen > 0);
499 WarpScan(tempStorage).ExclusiveSum(active, activePos);
501 WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
503 int jtileStart, numJtiles;
504 // Last thread in the warp knows the total number
505 if (wid == WARPSIZE-1) {
506 atomicAdd(&tileListStat->numTileLists, activePos + active);
507 numJtiles = itileListPos + itileListLen;
508 jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
510 numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
511 jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
512 if (jtileStart + numJtiles > tileJatomStartSize) {
513 // tileJatomStart out of memory, exit
514 if (wid == 0) tileListStat->tilesSizeExceeded = true;
518 int jStart = itileListPos;
519 int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
520 if (wid == WARPSIZE-1) jEnd = numJtiles;
522 if (itileListLen > 0) {
525 TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
526 TLtmp.jtileStart = jtileStart + jStart;
527 TLtmp.jtileEnd = jtileStart + jEnd - 1;
528 TLtmp.patchInd = patchInd;
529 TLtmp.offsetXYZ = offsetXYZ;
530 TLtmp.icompute = icompute;
531 // TLtmp.patchNumList.x = 0;
532 // TLtmp.patchNumList.y = 0;
533 tileLists[itileList] = TLtmp;
535 PatchPairRecord patchPair;
536 patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
537 patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
538 patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
539 patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
540 patchPairs[itileList] = patchPair;
543 int jtile = jtileStart + jStart;
544 for (int j=0;j < numTiles2;j++) {
546 tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
557 #define REPACKTILELISTSKERNEL_NUM_WARP 32
559 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
560 const int* __restrict__ tileListOrder,
561 const int* __restrict__ jtiles,
562 const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
563 const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
564 const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
565 const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
567 const int wid = threadIdx.x % WARPSIZE;
569 // One warp does one tile list
570 for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
572 int j = tileListOrder[i];
573 int start = tileListPos[i];
574 int end = tileListPos[i+1]-1;
575 if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
577 int startOld = __ldg(&tileListsSrc[j].jtileStart);
578 int endOld = __ldg(&tileListsSrc[j].jtileEnd);
579 int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
581 offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
582 offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
583 offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
584 int2 patchInd = tileListsSrc[j].patchInd;
585 int icompute = __ldg(&tileListsSrc[j].icompute);
588 tileList.iatomStart = iatomStart;
589 tileList.offsetXYZ = offsetXYZ;
590 tileList.jtileStart = start;
591 tileList.jtileEnd = end;
592 tileList.patchInd = patchInd;
593 tileList.icompute = icompute;
594 tileListsDst[i] = tileList;
597 if (jtiles == NULL) {
598 // No jtiles, simple copy will do
600 for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
601 if (jtileOld + wid <= endOld) {
602 tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
605 if (tileExclsSrc != NULL) {
607 for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
608 tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
613 for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
614 int t = jtileOld + wid;
615 int jtile = (t <= endOld) ? jtiles[t] : 0;
618 typedef cub::WarpScan<int> WarpScan;
619 __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
620 int warpId = threadIdx.x / WARPSIZE;
622 WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
624 if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
626 if (tileExclsSrc != NULL) {
627 unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
629 // k = index of thread that has data
630 // Promote k to 64-bit integer. DMC doesn't think this is necessary
631 // but it resolves issues for CUDA 11.6
633 int64_t k = __ffs(b) - 1;
635 int k = __ffs(b) - 1;
637 tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
638 // remove 1 bit and advance jtile0
639 b ^= ((unsigned int)1 << k);
643 jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
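// Illustrative example of the ballot-based compaction: if the warp ballot is
// b = 0b1011, lanes 0, 1 and 3 carry a surviving j-tile; the __ffs loop above
// visits them in that order when exclusions are copied, and jtile0 advances by
// __popc(b) = 3 in total before the next WARPSIZE-wide chunk is processed.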
652 // NOTE: Executed on a single thread block
653 // oobKey = out-of-bounds key value
655 #define SORTTILELISTSKERNEL_NUM_THREAD 512
656 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
657 template <typename keyT, typename valT, bool ascend>
658 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
659 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
660 const int begin_bit, const int end_bit, const keyT oobKey,
661 keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
662 valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
664 typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
665 SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
667 typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
668 SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
670 typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
671 SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;
674 typename BlockLoad::TempStorage load;
675 typename BlockLoadU::TempStorage loadU;
676 typename BlockRadixSort::TempStorage sort;
679 keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
680 valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
682 BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
684 BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
688 BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
690 BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
692 cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
693 cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
696 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
697 unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
699 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
701 int j = tileListOrder[i];
702 tileListDepthDst[i] = tileListDepthSrc[j];
708 // Bit shift tileListDepth so that only lower 16 bits are used
710 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
711 unsigned int* __restrict__ tileListDepth) {
713 for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
715 unsigned int a = tileListDepth[itileList];
718 tileListDepth[itileList] = a;
723 // ##############################################################################################
724 // ##############################################################################################
725 // ##############################################################################################
727 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
728 deviceID(deviceID), doStreaming(doStreaming) {
730 cudaCheck(cudaSetDevice(deviceID));
741 cudaComputesSize = 0;
743 patchNumLists = NULL;
744 patchNumListsSize = 0;
747 emptyPatchesSize = 0;
748 h_emptyPatches = NULL;
749 h_emptyPatchesSize = 0;
767 tileJatomStart1 = NULL;
768 tileJatomStart1Size = 0;
769 tileJatomStart2 = NULL;
770 tileJatomStart2Size = 0;
772 boundingBoxes = NULL;
773 boundingBoxesSize = 0;
775 tileListDepth1 = NULL;
776 tileListDepth1Size = 0;
777 tileListDepth2 = NULL;
778 tileListDepth2Size = 0;
780 tileListOrder1 = NULL;
781 tileListOrder1Size = 0;
782 tileListOrder2 = NULL;
783 tileListOrder2Size = 0;
795 allocate_device<TileListStat>(&d_tileListStat, 1);
796 allocate_host<TileListStat>(&h_tileListStat, 1);
809 tileListsGBIS = NULL;
810 tileListsGBISSize = 0;
812 tileJatomStartGBIS = NULL;
813 tileJatomStartGBISSize = 0;
815 tileListVirialEnergy = NULL;
816 tileListVirialEnergySize = 0;
820 numTileListsGBIS = 0;
825 doOutputOrder = false;
827 minmaxListLen = NULL;
828 minmaxListLenSize = 0;
834 cudaCheck(cudaEventCreate(&tileListStatEvent));
835 tileListStatEventRecord = false;
838 CudaTileListKernel::~CudaTileListKernel() {
839 cudaCheck(cudaSetDevice(deviceID));
840 deallocate_device<TileListStat>(&d_tileListStat);
841 deallocate_host<TileListStat>(&h_tileListStat);
843 if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
844 if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
845 if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
846 if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
847 if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
849 if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
850 if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
851 if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
852 if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
853 if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
854 if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
855 if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
856 if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
857 if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
858 if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
859 if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
860 if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
861 if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
862 if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
863 if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
864 if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
865 if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
866 if (jtiles != NULL) deallocate_device<int>(&jtiles);
867 if (tilePos != NULL) deallocate_device<int>(&tilePos);
869 if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
870 if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
872 if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
874 if (xyzq != NULL) deallocate_device<float4>(&xyzq);
875 if (part != NULL) deallocate_device<char>(&part);
877 if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
878 if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
880 cudaCheck(cudaEventDestroy(tileListStatEvent));
883 void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
884 clear_device_array<int>(jtiles, numJtiles, stream);
887 void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
888 // Clear tileListStat; patchReadyQueueCount is then set equal to the number of empty patches
889 memset(h_tileListStat, 0, sizeof(TileListStat));
890 h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
891 copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
894 void CudaTileListKernel::finishTileList(cudaStream_t stream) {
895 copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
896 cudaCheck(cudaEventRecord(tileListStatEvent, stream));
897 tileListStatEventRecord = true;
900 void CudaTileListKernel::updateComputes(const int numComputesIn,
901 const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
903 numComputes = numComputesIn;
905 reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
906 copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
908 if (doStreaming) doOutputOrder = true;
911 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
912 const TileList* d_tileLists, cudaStream_t stream) {
914 TileList* h_tileLists = new TileList[numTileLists];
915 copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
916 cudaCheck(cudaStreamSynchronize(stream));
917 FILE* handle = fopen(filename,"wt");
918 for (int itileList=0;itileList < numTileLists;itileList++) {
919 TileList tmp = h_tileLists[itileList];
920 fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
921 tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
922 tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
925 delete [] h_tileLists;
928 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
929 const int* d_tileJatomStart, cudaStream_t stream) {
931 int* h_tileJatomStart = new int[numJtiles];
932 copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
933 cudaCheck(cudaStreamSynchronize(stream));
934 FILE* handle = fopen(filename,"wt");
935 for (int i=0;i < numJtiles;i++) {
936 fprintf(handle, "%d\n", h_tileJatomStart[i]);
939 delete [] h_tileJatomStart;
943 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
945 return std::pair<int, int>(p.second, p.first);
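// flip_pair() is used below with std::transform to turn the per-window
// {atomStart -> counter} map into a counter-keyed multimap, so the printf
// diagnostics can list atom starts ordered by how often they recur within one
// width-sized window of tile lists.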
948 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
949 const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
951 const int shCacheSize = 10;
952 TileList* h_tileLists = new TileList[numTileLists];
953 int* h_tileJatomStart = new int[numJtiles];
954 copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
955 copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
956 cudaCheck(cudaStreamSynchronize(stream));
959 for (int i=0;i < numTileLists;i+=width) {
960 int jend = min(i + width, numTileLists);
961 std::map<int, int> atomStartMap;
962 std::map<int, int>::iterator it;
963 atomStartMap.clear();
964 for (int j=i;j < jend;j++) {
965 TileList tmp = h_tileLists[j];
966 int iatomStart = tmp.iatomStart;
967 it = atomStartMap.find(iatomStart);
968 if (it == atomStartMap.end()) {
970 atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
975 int jtileStart = tmp.jtileStart;
976 int jtileEnd = tmp.jtileEnd;
977 ntotal += (jtileEnd - jtileStart + 1) + 1;
978 for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
979 int jatomStart = h_tileJatomStart[jtile];
980 it = atomStartMap.find(jatomStart);
981 if (it == atomStartMap.end()) {
983 atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
988 jatomStart |= (65535 << 16);
989 h_tileJatomStart[jtile] = jatomStart;
991 iatomStart |= (65535 << 16);
992 tmp.iatomStart = iatomStart;
993 h_tileLists[j] = tmp;
995 ncache += atomStartMap.size();
996 std::multimap<int, int> imap;
998 std::multimap<int, int>::iterator imap_it;
999 std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
1001 printf("%d %d\n", ntotal, (int)imap.size());
1002 for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
1003 if (imap_it->first != 0)
1004 printf("(%d %d)\n", imap_it->first, imap_it->second);
1008 printf("ntotal %d ncache %d\n", ntotal, ncache);
1009 copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
1010 copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
1011 cudaCheck(cudaStreamSynchronize(stream));
1012 delete [] h_tileLists;
1013 delete [] h_tileJatomStart;
1017 void CudaTileListKernel::prepareBuffers(
1018 int atomStorageSizeIn, int numPatchesIn,
1019 const CudaPatchRecord* h_cudaPatches,
1022 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSizeIn, OVERALLOC);
1024 // Copy cudaPatches to device
1025 reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatchesIn);
1026 copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatchesIn, stream);
1029 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
1030 const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
1031 const float3 lata, const float3 latb, const float3 latc,
1032 const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
1033 const float plcutoff2In, const size_t maxShmemPerBlock,
1034 cudaStream_t stream, const bool atomsChanged,
1035 const bool allocatePart, bool CUDASOAintegrator, bool deviceMigration) {
1037 numPatches = numPatchesIn;
1038 atomStorageSize = atomStorageSizeIn;
1039 maxTileListLen = maxTileListLenIn;
1040 plcutoff2 = plcutoff2In;
1043 // Re-allocate patchNumLists
1044 reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1045 reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1046 reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1049 // Re-allocate (tileLists1, patchPairs1
1050 reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1051 reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1053 // Copy cudaPatches to device
1054 // If DeviceMigration is on then this happens in prepareBuffers()
1055 if (!CUDASOAintegrator || !deviceMigration) {
1056 reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1057 copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1060 // Re-allocate temporary storage
1061 reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1062 // Calculate tile list positions (tilePos)
1066 calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
1067 (numComputes, cudaComputes, cudaPatches, tilePos);
1068 cudaCheck(cudaGetLastError());
1071 // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1074 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/32)+1);
1075 updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
1076 (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1077 cudaCheck(cudaGetLastError());
1080 // ---------------------------------------------------------------------------------------------
1083 // NOTE: tileListDepth2 and tileListOrder2 must have at least the same size as
1084 // tileListDepth1 and tileListOrder1 since they're used in sorting
1085 reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1086 reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1088 // Allocate with +1 to include last term in the exclusive sum
1089 reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1091 reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1093 // deviceMigration could be true with CUDASOAintegrator as false if minimization is being
1094 // performed. So we want this to be true if CUDASOAintegrator is false or if deviceMigration
1096 if (!CUDASOAintegrator || !deviceMigration) {
1097 if(atomsChanged) reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1098 if(!CUDASOAintegrator || atomsChanged) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1101 // This should be allocated for FEP only
1102 if(allocatePart) reallocate_device<char>(&part, &partSize, atomStorageSize, OVERALLOC);
1104 // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1106 int numBoundingBoxes = atomStorageSize/WARPSIZE;
1107 reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1109 int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1110 int nthread = WARPSIZE*nwarp;
1111 int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1112 buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1113 cudaCheck(cudaGetLastError());
1117 int nwarp = TILELISTKERNELNEW_NUM_WARP;
1118 int nthread = WARPSIZE*nwarp;
1119 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1121 int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1122 if(shmem_size > maxShmemPerBlock){
1123 NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1126 // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1127 // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1128 // re-allocation only happens when the size is exceeded.
1129 h_tileListStat->tilesSizeExceeded = true;
1130 int reallocCount = 0;
1131 while (h_tileListStat->tilesSizeExceeded) {
1132 reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1134 clearTileListStat(stream);
1135 // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1137 buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1138 (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1139 lata, latb, latc, plcutoff2, maxTileListLen,
1140 boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1141 tileListDepth1, tileListOrder1, patchPairs1,
1144 cudaCheck(cudaGetLastError());
1146 // get (numTileLists, numJtiles, tilesSizeExceeded)
1147 copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1148 cudaCheck(cudaStreamSynchronize(stream));
1149 numJtiles = h_tileListStat->numJtiles;
1151 if (h_tileListStat->tilesSizeExceeded) {
1153 if (reallocCount > 1) {
1154 NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1160 numTileLists = h_tileListStat->numTileLists;
1162 reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1165 // Re-allocate tileListVirialEnergy.
1166 // NOTE: Since numTileLists here is an upper estimate (it's based on bounding boxes),
1167 // we're guaranteed to have enough space
1168 reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1170 reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1171 reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1172 reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1173 reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1174 reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1176 int numTileListsSrc = numTileListsPrev;
1177 int numJtilesSrc = numJtiles;
1178 int numTileListsDst = numTileLists;
1179 int numJtilesDst = numJtiles;
1185 numTileListsSrc, numJtilesSrc,
1186 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1187 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1188 PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1189 numTileListsDst, numJtilesDst,
1190 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1191 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1192 PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1195 // Set active buffer to 2
1198 if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1202 // Returns integer log2(a) rounded up
1206 // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1208 while (a >>= 1) k++;
1215 void CudaTileListKernel::sortTileLists(
1216 const bool useJtiles,
1217 const int begin_bit, const bool highDepthBitsSetIn,
1219 const int numTileListsSrc, const int numJtilesSrc,
1220 PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1221 PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1222 PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1224 const int numTileListsDst, const int numJtilesDst,
1225 PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1226 PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1227 PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1228 cudaStream_t stream) {
1230 bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1232 // if (numTileListsDst == 0)
1233 // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1235 // Check that the array sizes are adequate
1236 if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1237 numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1238 (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1239 (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1240 NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1242 if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1243 numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1244 (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1245 (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1246 NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1248 if (begin_bit != 0 && begin_bit != 16)
1249 NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1251 // Number of bits needed in the sort
1252 int num_bit = ilog2(maxTileListLen);
1254 NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1255 int end_bit = begin_bit + num_bit;
1259 // ----------------------------------------------------------------------------------------
1260 if (doOutputOrder && useJtiles) {
1261 // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1262 // NOTE: This is done very infrequently, typically only once when the MD run starts.
1264 // Calculate position from depth
1266 // -----------------------------------------------------------------------------
1267 // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1268 // -----------------------------------------------------------------------------
1272 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1273 bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1274 (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1275 cudaCheck(cudaGetLastError());
1278 reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1280 // --------------------------------------------------------------------
1281 // Compute itileList positions to store tileLists
1282 // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
1283 // --------------------------------------------------------------------
1286 cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1287 (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1288 // Make sure tempStorage doesn't remain NULL
1289 if (size == 0) size = 128;
1290 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1291 size = tempStorageSize;
1292 cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1293 (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
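// The two ExclusiveSum calls above follow the standard CUB two-phase pattern:
// the first call, with a NULL temporary-storage pointer, only queries the
// required scratch size; the second call performs the actual scan using the
// (re)allocated tempStorage buffer.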
1297 // Store in reverse order from outputOrder
1300 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1301 storeInReverse <<< nblock, nthread, 0, stream >>>
1302 (numTileListsSrc, begin_bit, outputOrder, tileListPos,
1303 tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1304 tileListOrderDst.ptr, tileListDepthDst.ptr);
1305 cudaCheck(cudaGetLastError());
1310 maxTileListLen_sortKeys = maxTileListLen;
1312 reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1313 clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1315 // Re-allocate and initialize minmaxListLen
1317 reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1320 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1321 initMinMaxListLen <<< nblock, nthread, 0, stream >>>
1322 (numComputes, maxTileListLen, minmaxListLen);
1323 cudaCheck(cudaGetLastError());
1326 // Build sortKeys and calculate minmaxListLen
1329 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1330 buildSortKeys <<< nblock, nthread, 0, stream >>>
1331 (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1332 tileListDepthDst.ptr, minmaxListLen, sortKeys);
1333 cudaCheck(cudaGetLastError());
1335 // Maximum value in sortKeys[] is numTileListsDst - 1
1336 sortKeys_endbit = ilog2(numTileListsDst);
1339 // Fill in missing sortKeys using minmaxListLen
1342 int nwarp = nthread/WARPSIZE;
1343 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1344 fillSortKeys <<< nblock, nthread, 0, stream >>>
1345 (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1346 cudaCheck(cudaGetLastError());
1351 doOutputOrder = false;
1353 } else if (doOutputOrder) {
1354 // outputOrder will be produced in the next pairlist non-bonded kernel.
1355 // This time just remove zero-length lists
1356 // NOTE: This is done very infrequently, typically only once when the MD run starts.
1358 int endbit_tmp = ilog2(numTileListsSrc);
1362 reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1363 reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1366 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1367 buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
1368 (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1369 cudaCheck(cudaGetLastError());
1372 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1374 // Short list, sort within a single thread block
1375 int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1378 unsigned int oobKey = numTileListsSrc;
1379 sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1380 (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1381 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1382 cudaCheck(cudaGetLastError());
1386 // Long list, sort on multiple thread blocks
1388 cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1389 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1390 numTileListsSrc, 0, endbit_tmp, stream));
1391 // Make sure tempStorage doesn't remain NULL
1392 if (size == 0) size = 128;
1393 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1394 size = tempStorageSize;
1395 cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1396 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1397 numTileListsSrc, 0, endbit_tmp, stream));
1400 // Re-order tileListDepth using tileListOrderDst
1403 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1404 reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1405 (numTileListsDst, tileListOrderDst.ptr,
1406 tileListDepthSrc.ptr, tileListDepthDst.ptr);
1407 cudaCheck(cudaGetLastError());
1411 // This is done during regular MD cycle
1413 if (sortKeys_endbit <= 0)
1414 NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1418 reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1419 reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1422 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1423 setupSortKey <<< nblock, nthread, 0, stream >>>
1424 (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1425 cudaCheck(cudaGetLastError());
1430 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1433 // Short list, sort within a single thread block
1434 int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1437 unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1438 sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1439 (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1440 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1441 cudaCheck(cudaGetLastError());
1445 // Long list, sort on multiple thread blocks
1447 cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1448 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1449 numTileListsSrc, 0, sortKeys_endbit, stream));
1450 // Make sure tempStorage doesn't remain NULL
1451 if (size == 0) size = 128;
1452 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1453 size = tempStorageSize;
1454 cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1455 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1456 numTileListsSrc, 0, sortKeys_endbit, stream));
1459 // Re-order tileListDepth using tileListOrderDst
1462 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1463 reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1464 (numTileListsDst, tileListOrderDst.ptr,
1465 tileListDepthSrc.ptr, tileListDepthDst.ptr);
1466 cudaCheck(cudaGetLastError());
1472 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1473 localSort<32> <<< nblock, nthread, 0, stream >>>
1474 (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1475 cudaCheck(cudaGetLastError());
1477 // No need to shift any more
1478 doShiftDown = false;
1482 // ----------------------------------------------------------------------------------------
1487 // --------------------------------------------------------------------
1488 // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1489 // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1490 // --------------------------------------------------------------------
1491 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1493 // Short list, sort within a single thread block
1494 int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1497 sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
1498 (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1499 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1500 cudaCheck(cudaGetLastError());
1505 // Long list, sort on multiple thread blocks
1507 cudaCheck(cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1508 tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1509 numTileListsSrc, begin_bit, end_bit, stream));
1510 // Make sure tempStorage doesn't remain NULL
1511 if (size == 0) size = 128;
1512 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1513 size = tempStorageSize;
1514 cudaCheck(cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1515 tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1516 numTileListsSrc, begin_bit, end_bit, stream));
1520 // -----------------------------------------------------------------------------
1521 // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1522 // -----------------------------------------------------------------------------
1526 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1527 bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1528 (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1529 cudaCheck(cudaGetLastError());
1532 // Allocate with +1 to include last term in the exclusive sum
1533 reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1535 // --------------------------------------------------------------------
1536 // Compute itileList positions to store tileLists
1537 // ExclusiveSum(tileListDepthDst[0...numTileListsDst+1])
1538 // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1539 // since this is an exclusive sum. But with this trick,
1540 // tileListPos[numTileListsDst] will contain the total number
1542 // --------------------------------------------------------------------
1545 cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1546 (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1547 // Make sure tempStorage doesn't remain NULL
1548 if (size == 0) size = 128;
1549 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1550 size = tempStorageSize;
1551 // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
1552 // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1553 cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1554 (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
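// Worked example (illustrative): depths {3, 1, 2} scanned over 4 entries give
// tileListPos = {0, 3, 4, 6}; the extra entry tileListPos[3] = 6 is the total
// number of j-tiles, which repackTileListsKernel() below reads as
// tileListPos[i+1] for the last tile list.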
1557 // --------------------------------------------------------------------
1559 // tileListsDst[0 ... numTileListsDst-1], tileJatomStartDst[0 ... numJtilesDst-1]
1561 // tileJatomStartDst[]
1562 // tileExclsDst[0 ... numJtilesDst-1]
1563 // --------------------------------------------------------------------
1565 int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1566 int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1567 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1569 repackTileListsKernel <<< nblock, nthread, 0, stream >>>
1570 (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1571 (useJtiles) ? jtiles : NULL,
1572 tileListsSrc.ptr, tileListsDst.ptr,
1573 patchPairsSrc.ptr, patchPairsDst.ptr,
1574 tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1575 tileExclsSrc.ptr, tileExclsDst.ptr);
1576 cudaCheck(cudaGetLastError());
1579 // Count the number of tileLists that contribute to each patch
1582 clear_device_array<int>(patchNumLists, numPatches, stream);
1584 // Fill in patchNumLists[0...numPatches-1]
1586 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1587 calcPatchNumLists <<< nblock, nthread, 0, stream >>>
1588 (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1589 cudaCheck(cudaGetLastError());
1591 // Use emptyPatches[numPatches] as the count variable
1592 clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1594 // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1595 // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1596 setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
1597 (numTileListsDst, tileListsDst.ptr, patchNumLists,
1598 numPatches, &emptyPatches[numPatches], emptyPatches);
1599 cudaCheck(cudaGetLastError());
1601 // Copy emptyPatches[0 ... numPatches] to host
1602 copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1603 cudaCheck(cudaStreamSynchronize(stream));
1604 numEmptyPatches = h_emptyPatches[numPatches];
1610 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
1612 void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
1613 // Store previous number of active lists
1614 int numTileListsPrev = numTileLists;
1616 // Wait for finishTileList() to stop copying
1617 if (!tileListStatEventRecord)
1618 NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1619 cudaCheck(cudaEventSynchronize(tileListStatEvent));
1621 // Get numTileLists, numTileListsGBIS, and numExcluded
1623 numTileLists = h_tileListStat->numTileLists;
1624 numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1625 numExcluded = h_tileListStat->numExcluded;
1628 // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1629 // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1630 sortTileLists(true, 0, true,
1631 numTileListsPrev, numJtiles,
1632 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1633 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1634 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1635 numTileLists, numJtiles,
1636 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1637 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1638 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1641 // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1642 // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1643 // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1645 // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1648 // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
1649 // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1652 // GBIS is used => produce a second tile list
1653 // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1654 reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1655 reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1657 sortTileLists(true, 16, true,
1658 numTileListsPrev, numJtiles,
1659 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1660 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1661 PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1662 numTileListsGBIS, numJtiles,
1663 PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1664 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1665 PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1669 // Set active buffer to be 1
1676 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1678 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1681 if (!doStreaming || !doOutputOrder)
1684 // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1685 // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1686 sortTileLists(false, 0, true,
1687 numTileLists, numJtiles,
1688 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1689 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1690 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1691 numTileLists, numJtiles,
1692 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1693 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1694 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1697 // Set active buffer to be 2
1703 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
1704 if (len > tileListVirialEnergySize) {
1705 NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1707 tileListVirialEnergyLength = len;
1710 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
1711 if (len > tileListVirialEnergySize) {
1712 NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1714 tileListVirialEnergyGBISLength = len;