NAMD
CudaTileListKernel.cu
1 // #include <map>
2 // #include <algorithm>
3 
4 #if defined(NAMD_CUDA)
5 #include <cuda.h>
6 #if __CUDACC_VER_MAJOR__ >= 11
7 #include <cub/device/device_radix_sort.cuh>
8 #include <cub/device/device_scan.cuh>
9 #include <cub/cub.cuh>
10 #else
11 #include <namd_cub/device/device_radix_sort.cuh>
12 #include <namd_cub/device/device_scan.cuh>
13 #include <namd_cub/cub.cuh>
14 #endif
15 #endif // NAMD_CUDA
16 
17 #include "CudaUtils.h"
18 #include "CudaTileListKernel.h"
19 #include "CudaComputeNonbondedKernel.h"
20 #include "DeviceCUDA.h"
21 
22 #if defined(NAMD_CUDA)
23 
24 #ifdef WIN32
25 #define __thread __declspec(thread)
26 #endif
27 
28 extern __thread DeviceCUDA *deviceCUDA;
29 
30 #define OVERALLOC 2.0f
31 
32 #if __CUDA_ARCH__ < 350
33 #define __ldg *
34 #endif
35 
36 void NAMD_die(const char *);
37 
38 //
39 // Calculate the number of lists that contribute to each patch
40 //
41 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
42  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
43 
44  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
45  {
46  int2 patchInd = tileLists[i].patchInd;
47  atomicAdd(&patchNumLists[patchInd.x], 1);
48  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
49  }
50 
51 }
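// NOTE: patchNumLists[] is assumed to be cleared to zero before this kernel is
// launched; each tile list then increments the count of its i-patch and, for
// pair lists (patchInd.x != patchInd.y), also of its j-patch via atomicAdd.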
52 
53 //
54  // Write patchNumList back to the tile lists and
55  // collect empty patches into emptyPatches[0 ... numEmptyPatches-1]
56 //
57 __global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
58  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
59  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
60 
61  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
62  {
63  int2 patchInd = tileLists[i].patchInd;
64  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
65  tileLists[i].patchNumList = patchNumList;
66  }
67 
68  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
69  {
70  if (patchNumLists[i] == 0) {
71  int ind = atomicAdd(numEmptyPatches, 1);
72  emptyPatches[ind] = i;
73  }
74  }
75 
76 }
77 
78 //
79 // Builds a sort key that removes zeros but keeps the order otherwise the same
80 //
81 __global__ void buildRemoveZerosSortKey(const int numTileLists,
82  const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
83 
84  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
85  {
86  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
87  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
88  }
89 
90 }
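// Illustration of the key construction above (assuming the ascending key sort
// that follows in sortTileLists): with depths {3, 0, 2, 0} and numTileLists = 4
// the keys become {0, 4, 2, 4}, so the zero-depth lists sort past the valid
// range while the nonzero lists keep their original relative order.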
91 
92 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
93  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
94  const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
95 
96  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
97  {
98  int icompute = tileLists[itileList].icompute;
99  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
100  int i = icompute*maxTileListLen + (depth - 1);
101  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
102  }
103 
104 }
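// NOTE: sortKeys[] is indexed by (icompute, depth-1) and was filled by
// buildSortKeys/fillSortKeys on an earlier pass; zero-depth lists receive the
// maximum key 0x7fffffff so an ascending sort pushes them to the end.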
105 
106 template <int width>
107 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
108  unsigned int* __restrict__ keys, int* __restrict__ vals) {
109 
110  // NOTE: blockDim.x = width
111 
112  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
113  {
114  int i = base + threadIdx.x;
115  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
116  __shared__ typename BlockRadixSort::TempStorage tempStorage;
117  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
118  int val[1] = {(i < n) ? vals[i] : 0};
119  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
120  if (i < n) {
121  keys[i] = key[0];
122  vals[i] = val[0];
123  }
124  BLOCK_SYNC;
125  }
126 
127 }
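// NOTE: localSort must be launched with blockDim.x == width (the caller uses
// localSort<32> with 32 threads per block), and it writes back only the
// ((key >> begin_bit) & 65535) part of each key, so callers must not rely on
// the upper key bits afterwards.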
128 
129 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
130  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
131  const int* __restrict__ tileListOrderSrc,
132  const unsigned int* __restrict__ tileListDepthSrc,
133  int* __restrict__ tileListOrderDst,
134  unsigned int* __restrict__ tileListDepthDst) {
135 
136  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
137  {
138  int j = outputOrder[numTileListsSrc - i - 1];
139  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
140  int k = tileListPos[i];
141  tileListDepthDst[k] = tileListDepthSrc[j];
142  tileListOrderDst[k] = j; //tileListOrderSrc[j];
143  }
144  }
145 }
146 
147 //
148 // Flag tile lists that have nonzero depth: writes 1 (else 0) per list, in reverse outputOrder
149 //
150 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
151  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
152  unsigned int* __restrict__ tileListDepthDst) {
153 
154  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
155  {
156  int j = outputOrder[numTileLists - i - 1];
157  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
158  }
159 
160 }
161 
162 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
163  int2* __restrict__ minmaxListLen) {
164 
165  int2 val;
166  val.x = maxTileListLen+1;
167  val.y = 0;
168  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
169  {
170  minmaxListLen[i] = val;
171  }
172 
173 }
174 
175 //
176 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
177 //
178 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
179  const TileList* __restrict__ tileListsSrc,
180  const int* __restrict__ tileListOrderDst,
181  const unsigned int* __restrict__ tileListDepthDst,
182  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
183 
184  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
185  {
186  int k = tileListOrderDst[i];
187  int icompute = tileListsSrc[k].icompute;
188  int depth = tileListDepthDst[i] & 65535;
189  // depth is in range [1 ... maxTileListLen]
190  int j = icompute*maxTileListLen + (depth-1);
191  sortKeys[j] = i;
192  int2 minmax = minmaxListLen[icompute];
193  int2 minmaxOrig = minmax;
194  if (minmax.x > depth) minmax.x = depth;
195  if (minmax.y < depth) minmax.y = depth;
196  if (minmax.x != minmaxOrig.x) {
197  atomicMin(&minmaxListLen[icompute].x, minmax.x);
198  }
199  if (minmax.y != minmaxOrig.y) {
200  atomicMax(&minmaxListLen[icompute].y, minmax.y);
201  }
202  }
203 
204 }
205 
206 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
207  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
208 
209  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
210  const int wid = threadIdx.x % WARPSIZE;
211  int2 minmax = minmaxListLen[i];
212  int minlen = minmax.x;
213  int maxlen = minmax.y;
214  // minlen, maxlen are in range [1 ... maxTileListLen]
215  // as long as compute i appears in tileListsSrc[].icompute above (otherwise maxlen < minlen)
216  if ( maxlen < minlen ) {
217  minlen = 1;
218  maxlen = maxTileListLen;
219  }
220  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
221  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
222  unsigned int aveKey = (maxKey + minKey)/2;
223  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
224  sortKeys[i*maxTileListLen + j] = minKey;
225  }
226  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
227  sortKeys[i*maxTileListLen + j] = maxKey;
228  }
229  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
230  if (sortKeys[i*maxTileListLen + j] == 0) {
231  sortKeys[i*maxTileListLen + j] = aveKey;
232  }
233  }
234  }
235 
236 }
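// NOTE: after buildSortKeys only the (icompute, depth) slots that actually occur
// hold keys; fillSortKeys pads the rest of each compute's row with the nearest
// valid key (minKey below minlen, maxKey above maxlen, their average for any
// remaining zero slots) so that setupSortKey never reads an unset entry.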
237 
238 //
239 // Calculate bounding boxes for sets of WARPSIZE=32 atoms
240 //
241 #define BOUNDINGBOXKERNEL_NUM_WARP 8
242 __global__ void
243 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
244  BoundingBox* __restrict__ boundingBoxes) {
245 
246  const int warpId = threadIdx.x / WARPSIZE;
247  const int wid = threadIdx.x % WARPSIZE;
248 
249  // Loop with warp-aligned index to avoid warp-divergence
250  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
251  // Full atom index
252  const int i = iwarp + wid;
253  // Bounding box index
254  const int ibb = i/WARPSIZE;
255 
256  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
257 
258  volatile float3 minxyz, maxxyz;
259 
260  typedef cub::WarpReduce<float> WarpReduce;
261  __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
262 #if (NAMD_CCCL_MAJOR_VERSION <= 2)
263  auto minOp = cub::Min();
264  auto maxOp = cub::Max();
265 #else
266  auto minOp = cuda::minimum<float>();
267  auto maxOp = cuda::maximum<float>();
268 #endif
269  minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, minOp);
270  minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, minOp);
271  minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, minOp);
272  maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, maxOp);
273  maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, maxOp);
274  maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, maxOp);
275 
276  if (wid == 0) {
277  BoundingBox boundingBox;
278  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
279  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
280  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
281  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
282  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
283  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
284  boundingBoxes[ibb] = boundingBox;
285  }
286  }
287 
288 }
289 
290 //
291 // Returns a lower bound for the squared distance between two bounding boxes
292 //
293 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
294  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
295  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
296  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
297  float r2 = dx*dx + dy*dy + dz*dz;
298  return r2;
299 }
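// Worked example: for boxes whose centers are 10.0 apart along x with
// half-widths wx = 2.0 and 3.0, dx = max(0, 10 - 2 - 3) = 5 and distsq()
// returns 25.0; overlapping boxes give 0.0. The result is therefore a lower
// bound on the smallest squared atom-atom distance between the two tiles,
// which is what the cutoff2 comparison below relies on.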
300 
301 #if 0
302 //
303 // Performs warp-level exclusive sum on a shared memory array:
304 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
305 // sh_in and sh_out can point to same array
306 // Returns the total sum
307 //
308 template <typename T>
309 __device__ __forceinline__
310 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
311  const int wid = threadIdx.x % WARPSIZE;
312  volatile int blockOffset = 0;
313  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
314  // Size of the exclusive sum
315  int blockLen = min(WARPSIZE, n-iblock);
316  // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
317  typedef cub::WarpScan<int> WarpScan;
318  __shared__ typename WarpScan::TempStorage tempStorage;
319  int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
320  WarpScan(tempStorage).ExclusiveSum(data, data);
321  // Shift by block offset
322  data += blockOffset;
323  // Save last value
324  int last = (int)sh_in[iblock + blockLen-1];
325  // Write output
326  if (wid < blockLen) sh_out[iblock + wid] = data;
327  // block offset = last term of the exclusive sum + last value
328  blockOffset = sh_out[iblock + blockLen-1] + last;
329  }
330  return blockOffset;
331 }
332 #endif
333 
334 #define TILELISTKERNELNEW_NUM_WARP 4
335 
336 //
337 // NOTE: Executed on a single thread block
338 //
339 template<int nthread>
340 __global__ void calcTileListPosKernel(const int numComputes,
341  const CudaComputeRecord* __restrict__ computes,
342  const CudaPatchRecord* __restrict__ patches,
343  int* __restrict__ tilePos) {
344 
345  typedef cub::BlockScan<int, nthread> BlockScan;
346 
347  __shared__ typename BlockScan::TempStorage tempStorage;
348  __shared__ int shTilePos0;
349 
350  if (threadIdx.x == nthread-1) {
351  shTilePos0 = 0;
352  }
353 
354  for (int base=0;base < numComputes;base+=nthread) {
355  int k = base + threadIdx.x;
356 
357  int numTiles1 = (k < numComputes) ?
358  (CudaComputeNonbondedKernel::computeNumTiles(patches[computes[k].patchInd.x].numAtoms, WARPSIZE)) : 0;
359  // Calculate positions in tile list and jtile list
360  int tilePosVal;
361  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
362 
363  // Store into global memory
364  if (k < numComputes) {
365  tilePos[k] = shTilePos0 + tilePosVal;
366  }
367 
368  BLOCK_SYNC;
369  // Store block end position
370  if (threadIdx.x == nthread-1) {
371  shTilePos0 += tilePosVal + numTiles1;
372  }
373  }
374 }
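// NOTE: shTilePos0 carries the running total across successive nthread-wide
// chunks of computes, so on exit tilePos[] holds the exclusive prefix sum of
// per-compute i-tile counts; this only works because the kernel runs on a
// single thread block, as noted above.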
375 
376 
377 template<int nthread>
378 __global__ void updatePatchesKernel(const int numComputes,
379  const int* __restrict__ tilePos,
380  const CudaComputeRecord* __restrict__ computes,
381  const CudaPatchRecord* __restrict__ patches,
382  TileList* __restrict__ tileLists) {
383 
384  const int tid = threadIdx.x % nthread;
385 
386  // nthread threads take care of one compute
387  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
388  {
389  CudaComputeRecord compute = computes[k];
390  float3 offsetXYZ = compute.offsetXYZ;
391  int2 patchInd = compute.patchInd;
392  int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(patches[patchInd.x].numAtoms);
393  int itileList0 = tilePos[k];
394  for (int i=tid;i < numTiles1;i+=nthread) {
395  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
396  tileLists[itileList0 + i].patchInd = patchInd;
397  tileLists[itileList0 + i].icompute = k;
398  }
399  }
400 
401 }
402 
403 __host__ __device__ __forceinline__
404 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
405  // Size in bytes
406  int size = (
407  maxTileListLen*sizeof(char)
408  );
409  return size;
410 }
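// The dynamic shared memory of buildTileListsBBKernel is sized as
// sizePerThread * blockDim.x: one byte per possible j-tile per thread (the
// sh_tile[] flags below). For example, with maxTileListLen = 64 and the
// 4-warp launch (128 threads) used in buildTileLists, the launch needs
// 64 * 128 = 8192 bytes of shared memory.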
411 
412 __global__ void
413 buildTileListsBBKernel(const int numTileLists,
414  TileList* __restrict__ tileLists,
415  const CudaPatchRecord* __restrict__ patches,
416  const int* __restrict__ tileListPos,
417  const float3 lata, const float3 latb, const float3 latc,
418  const float cutoff2, const int maxTileListLen,
419  const BoundingBox* __restrict__ boundingBoxes,
420  int* __restrict__ tileJatomStart,
421  const int tileJatomStartSize,
422  unsigned int* __restrict__ tileListDepth,
423  int* __restrict__ tileListOrder,
424  PatchPairRecord* __restrict__ patchPairs,
425  TileListStat* __restrict__ tileListStat) {
426 
427  extern __shared__ char sh_buffer[];
428  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
429  int pos = threadIdx.x*sizePerThread;
430  volatile char* sh_tile = (char*)&sh_buffer[pos];
431 
432  // Loop with warp-aligned index to avoid warp-divergence
433  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
434 
435  // Use one thread per tile list
436  const int wid = threadIdx.x % WARPSIZE;
437  const int itileList = iwarp + wid;
438 
439  int i;
440  int itileListLen = 0;
441  CudaPatchRecord patch1;
442  CudaPatchRecord patch2;
443  float3 offsetXYZ;
444  int2 patchInd;
445  int numTiles2;
446  int icompute;
447 
448  if (itileList < numTileLists) {
449  offsetXYZ = tileLists[itileList].offsetXYZ;
450  patchInd = tileLists[itileList].patchInd;
451  icompute = tileLists[itileList].icompute;
452  // Get i-column
453  i = itileList - tileListPos[icompute];
454 
455  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
456  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
457  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
458 
459  // DH - set zeroShift flag if magnitude of shift vector is zero
460  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
461 
462  // Load patches
463  patch1 = patches[patchInd.x];
464  patch2 = patches[patchInd.y];
465  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
466  numTiles2 = CudaComputeNonbondedKernel::computeNumTiles(patch2.numAtoms);
467  int tileStart1 = patch1.atomStart/WARPSIZE;
468  int tileStart2 = patch2.atomStart/WARPSIZE;
469 
470  // DH - self requires that zeroShift is also set
471  bool self = zeroShift && (tileStart1 == tileStart2);
472 
473  // Load i-atom data (and shift coordinates)
474  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
475  boundingBoxI.x += shx;
476  boundingBoxI.y += shy;
477  boundingBoxI.z += shz;
478 
479  for (int j=0;j < numTiles2;j++) {
480  sh_tile[j] = 0;
481  if (!self || j >= i) {
482  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
483  float r2bb = distsq(boundingBoxI, boundingBoxJ);
484  if (r2bb < cutoff2) {
485  sh_tile[j] = 1;
486  itileListLen++;
487  }
488  }
489  }
490 
491  tileListDepth[itileList] = (unsigned int)itileListLen;
492  tileListOrder[itileList] = itileList;
493  }
494 
495  typedef cub::WarpScan<int> WarpScan;
496  __shared__ typename WarpScan::TempStorage tempStorage;
497  int active = (itileListLen > 0);
498  int activePos;
499  WarpScan(tempStorage).ExclusiveSum(active, activePos);
500  int itileListPos;
501  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
502 
503  int jtileStart, numJtiles;
504  // Last thread in the warp knows the total number
505  if (wid == WARPSIZE-1) {
506  atomicAdd(&tileListStat->numTileLists, activePos + active);
507  numJtiles = itileListPos + itileListLen;
508  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
509  }
510  numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
511  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
512  if (jtileStart + numJtiles > tileJatomStartSize) {
513  // tileJatomStart out of memory, exit
514  if (wid == 0) tileListStat->tilesSizeExceeded = true;
515  return;
516  }
517 
518  int jStart = itileListPos;
519  int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
520  if (wid == WARPSIZE-1) jEnd = numJtiles;
521 
522  if (itileListLen > 0) {
523  // Setup tileLists[]
524  TileList TLtmp;
525  TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
526  TLtmp.jtileStart = jtileStart + jStart;
527  TLtmp.jtileEnd = jtileStart + jEnd - 1;
528  TLtmp.patchInd = patchInd;
529  TLtmp.offsetXYZ = offsetXYZ;
530  TLtmp.icompute = icompute;
531  // TLtmp.patchNumList.x = 0;
532  // TLtmp.patchNumList.y = 0;
533  tileLists[itileList] = TLtmp;
534  // PatchPair
535  PatchPairRecord patchPair;
536  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
537  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
538  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
539  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
540  patchPairs[itileList] = patchPair;
541 
542  // Write tiles
543  int jtile = jtileStart + jStart;
544  for (int j=0;j < numTiles2;j++) {
545  if (sh_tile[j]) {
546  tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
547  jtile++;
548  }
549  }
550 
551  }
552 
553  }
554 
555 }
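// Summary: each thread of buildTileListsBBKernel owns one tile list and marks
// in sh_tile[] every j-tile whose bounding box lies within the pairlist cutoff
// (cutoff2) of the shifted i-tile box; a warp-level exclusive sum then packs
// the surviving j-tile starts into tileJatomStart[], while tileListStat
// accumulates the global list and j-tile counts and tilesSizeExceeded asks the
// host to grow tileJatomStart and retry.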
556 
557 #define REPACKTILELISTSKERNEL_NUM_WARP 32
558 __global__ void
559 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
560  const int* __restrict__ tileListOrder,
561  const int* __restrict__ jtiles,
562  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
563  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
564  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
565  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
566 
567  const int wid = threadIdx.x % WARPSIZE;
568 
569  // One warp does one tile list
570  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
571  {
572  int j = tileListOrder[i];
573  int start = tileListPos[i];
574  int end = tileListPos[i+1]-1;
575  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
576  // TileList
577  int startOld = __ldg(&tileListsSrc[j].jtileStart);
578  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
579  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
580  float3 offsetXYZ;
581  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
582  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
583  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
584  int2 patchInd = tileListsSrc[j].patchInd;
585  int icompute = __ldg(&tileListsSrc[j].icompute);
586  if (wid == 0) {
587  TileList tileList;
588  tileList.iatomStart = iatomStart;
589  tileList.offsetXYZ = offsetXYZ;
590  tileList.jtileStart = start;
591  tileList.jtileEnd = end;
592  tileList.patchInd = patchInd;
593  tileList.icompute = icompute;
594  tileListsDst[i] = tileList;
595  }
596 
597  if (jtiles == NULL) {
598  // No jtiles, simple copy will do
599  int jtile = start;
600  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
601  if (jtileOld + wid <= endOld) {
602  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
603  }
604  }
605  if (tileExclsSrc != NULL) {
606  int jtile = start;
607  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
608  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
609  }
610  }
611  } else {
612  int jtile0 = start;
613  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
614  int t = jtileOld + wid;
615  int jtile = (t <= endOld) ? jtiles[t] : 0;
616  jtile >>= begin_bit;
617  jtile &= 65535;
618  typedef cub::WarpScan<int> WarpScan;
619  __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
620  int warpId = threadIdx.x / WARPSIZE;
621  int jtilePos;
622  WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
623 
624  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
625 
626  if (tileExclsSrc != NULL) {
627  unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
628  while (b != 0) {
629  // k = index of thread that has data
630  // Promote k to 64-bit integer. DMC doesn't think this is necessary
631  // but it resolves issues for CUDA 11.6
632 #if 1
633  int64_t k = __ffs(b) - 1;
634 #else
635  int k = __ffs(b) - 1;
636 #endif
637  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
638  // remove 1 bit and advance jtile0
639  b ^= ((unsigned int)1 << k);
640  jtile0++;
641  }
642  } else {
643  jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
644  }
645  }
646  }
647  }
648 
649 }
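// NOTE: when jtiles != NULL each entry holds, after the begin_bit shift and
// 16-bit mask, a flag marking whether that j-tile is kept; the warp scan
// compacts the kept j-tile starts (and their exclusion masks) into consecutive
// slots beginning at tileListPos[i], so tileListPos is expected to be the
// exclusive prefix sum of the new per-list depths.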
650 
651 //
652 // NOTE: Executed on a single thread block
653 // oobKey = out-of-bounds key value
654 //
655 #define SORTTILELISTSKERNEL_NUM_THREAD 512
656 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
657 template <typename keyT, typename valT, bool ascend>
658 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
659 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
660  const int begin_bit, const int end_bit, const keyT oobKey,
661  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
662  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
663 
664  typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
665  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
666 
667  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
668  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
669 
670  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
671  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;
672 
673  __shared__ union {
674  typename BlockLoad::TempStorage load;
675  typename BlockLoadU::TempStorage loadU;
676  typename BlockRadixSort::TempStorage sort;
677  } tempStorage;
678 
679  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
680  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
681 
682  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
683  BLOCK_SYNC;
684  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
685  BLOCK_SYNC;
686 
687  if (ascend)
688  BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
689  else
690  BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
691 
692  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
693  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
694 }
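// NOTE: this single-block sort handles at most SORTTILELISTSKERNEL_NUM_THREAD *
// SORTTILELISTSKERNEL_ITEMS_PER_THREAD = 512 * 22 = 11264 tile lists; larger
// inputs take the cub::DeviceRadixSort path in sortTileLists(). Key slots past
// numTileListsSrc are padded with oobKey, which the callers choose so that the
// padding sorts after the real entries.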
695 
696 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
697  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
698 
699  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
700  {
701  int j = tileListOrder[i];
702  tileListDepthDst[i] = tileListDepthSrc[j];
703  }
704 
705 }
706 
707 //
708 // Bit shift tileListDepth so that only lower 16 bits are used
709 //
710 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
711  unsigned int* __restrict__ tileListDepth) {
712 
713  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
714  {
715  unsigned int a = tileListDepth[itileList];
716  a >>= begin_bit;
717  a &= 65535;
718  tileListDepth[itileList] = a;
719  }
720 
721 }
722 
723 // ##############################################################################################
724 // ##############################################################################################
725 // ##############################################################################################
726 
727 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
728 deviceID(deviceID), doStreaming(doStreaming) {
729 
730  cudaCheck(cudaSetDevice(deviceID));
731 
732  activeBuffer = 1;
733 
734  numPatches = 0;
735  numComputes = 0;
736 
737  cudaPatches = NULL;
738  cudaPatchesSize = 0;
739 
740  cudaComputes = NULL;
741  cudaComputesSize = 0;
742 
743  patchNumLists = NULL;
744  patchNumListsSize = 0;
745 
746  emptyPatches = NULL;
747  emptyPatchesSize = 0;
748  h_emptyPatches = NULL;
749  h_emptyPatchesSize = 0;
750  numEmptyPatches = 0;
751 
752  sortKeySrc = NULL;
753  sortKeySrcSize = 0;
754  sortKeyDst = NULL;
755  sortKeyDstSize = 0;
756 
757  tileLists1 = NULL;
758  tileLists1Size = 0;
759  tileLists2 = NULL;
760  tileLists2Size = 0;
761 
762  patchPairs1 = NULL;
763  patchPairs1Size = 0;
764  patchPairs2 = NULL;
765  patchPairs2Size = 0;
766 
767  tileJatomStart1 = NULL;
768  tileJatomStart1Size = 0;
769  tileJatomStart2 = NULL;
770  tileJatomStart2Size = 0;
771 
772  boundingBoxes = NULL;
773  boundingBoxesSize = 0;
774 
775  tileListDepth1 = NULL;
776  tileListDepth1Size = 0;
777  tileListDepth2 = NULL;
778  tileListDepth2Size = 0;
779 
780  tileListOrder1 = NULL;
781  tileListOrder1Size = 0;
782  tileListOrder2 = NULL;
783  tileListOrder2Size = 0;
784 
785  tileExcls1 = NULL;
786  tileExcls1Size = 0;
787  tileExcls2 = NULL;
788  tileExcls2Size = 0;
789 
790  xyzq = NULL;
791  xyzqSize = 0;
792  part = NULL;
793  partSize = 0;
794 
795  allocate_device<TileListStat>(&d_tileListStat, 1);
796  allocate_host<TileListStat>(&h_tileListStat, 1);
797 
798  tileListPos = NULL;
799  tileListPosSize = 0;
800  tempStorage = NULL;
801  tempStorageSize = 0;
802 
803  jtiles = NULL;
804  jtilesSize = 0;
805 
806  tilePos = NULL;
807  tilePosSize = 0;
808 
809  tileListsGBIS = NULL;
810  tileListsGBISSize = 0;
811 
812  tileJatomStartGBIS = NULL;
813  tileJatomStartGBISSize = 0;
814 
815  tileListVirialEnergy = NULL;
816  tileListVirialEnergySize = 0;
817 
818  atomStorageSize = 0;
819  numTileLists = 0;
820  numTileListsGBIS = 0;
821  numJtiles = 1;
822 
823  outputOrder = NULL;
824  outputOrderSize = 0;
825  doOutputOrder = false;
826 
827  minmaxListLen = NULL;
828  minmaxListLenSize = 0;
829 
830  sortKeys = NULL;
831  sortKeysSize = 0;
832  sortKeys_endbit = 0;
833 
834  cudaCheck(cudaEventCreate(&tileListStatEvent));
835  tileListStatEventRecord = false;
836 }
837 
838 CudaTileListKernel::~CudaTileListKernel() {
839  cudaCheck(cudaSetDevice(deviceID));
840  deallocate_device<TileListStat>(&d_tileListStat);
841  deallocate_host<TileListStat>(&h_tileListStat);
842  //
843  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
844  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
845  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
846  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
847  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
848  //
849  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
850  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
851  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
852  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
853  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
854  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
855  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
856  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
857  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
858  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
859  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
860  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
861  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
862  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
863  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
864  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
865  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
866  if (jtiles != NULL) deallocate_device<int>(&jtiles);
867  if (tilePos != NULL) deallocate_device<int>(&tilePos);
868 
869  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
870  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
871 
872  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
873 
874  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
875  if (part != NULL) deallocate_device<char>(&part);
876 
877  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
878  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
879 
880  cudaCheck(cudaEventDestroy(tileListStatEvent));
881 }
882 
883 void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
884  clear_device_array<int>(jtiles, numJtiles, stream);
885 }
886 
887 void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
888  // Clear tileListStat; patchReadyQueueCount is set to the number of empty patches
889  memset(h_tileListStat, 0, sizeof(TileListStat));
890  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
891  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
892 }
893 
894 void CudaTileListKernel::finishTileList(cudaStream_t stream) {
895  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
896  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
897  tileListStatEventRecord = true;
898 }
899 
900 void CudaTileListKernel::updateComputes(const int numComputesIn,
901  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
902 
903  numComputes = numComputesIn;
904 
905  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
906  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
907 
908  if (doStreaming) doOutputOrder = true;
909 }
910 
911 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
912  const TileList* d_tileLists, cudaStream_t stream) {
913 
914  TileList* h_tileLists = new TileList[numTileLists];
915  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
916  cudaCheck(cudaStreamSynchronize(stream));
917  FILE* handle = fopen(filename,"wt");
918  for (int itileList=0;itileList < numTileLists;itileList++) {
919  TileList tmp = h_tileLists[itileList];
920  fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
921  tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
922  tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
923  }
924  fclose(handle);
925  delete [] h_tileLists;
926 }
927 
928 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
929  const int* d_tileJatomStart, cudaStream_t stream) {
930 
931  int* h_tileJatomStart = new int[numJtiles];
932  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
933  cudaCheck(cudaStreamSynchronize(stream));
934  FILE* handle = fopen(filename,"wt");
935  for (int i=0;i < numJtiles;i++) {
936  fprintf(handle, "%d\n", h_tileJatomStart[i]);
937  }
938  fclose(handle);
939  delete [] h_tileJatomStart;
940 }
941 
942 /*
943 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
944 {
945  return std::pair<int, int>(p.second, p.first);
946 }
947 
948 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
949  const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
950 
951  const int shCacheSize = 10;
952  TileList* h_tileLists = new TileList[numTileLists];
953  int* h_tileJatomStart = new int[numJtiles];
954  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
955  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
956  cudaCheck(cudaStreamSynchronize(stream));
957  int ntotal = 0;
958  int ncache = 0;
959  for (int i=0;i < numTileLists;i+=width) {
960  int jend = min(i + width, numTileLists);
961  std::map<int, int> atomStartMap;
962  std::map<int, int>::iterator it;
963  atomStartMap.clear();
964  for (int j=i;j < jend;j++) {
965  TileList tmp = h_tileLists[j];
966  int iatomStart = tmp.iatomStart;
967  it = atomStartMap.find(iatomStart);
968  if (it == atomStartMap.end()) {
969  // Insert new
970  atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
971  } else {
972  // Increase counter
973  it->second--;
974  }
975  int jtileStart = tmp.jtileStart;
976  int jtileEnd = tmp.jtileEnd;
977  ntotal += (jtileEnd - jtileStart + 1) + 1;
978  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
979  int jatomStart = h_tileJatomStart[jtile];
980  it = atomStartMap.find(jatomStart);
981  if (it == atomStartMap.end()) {
982  // Insert new
983  atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
984  } else {
985  // Increase counter
986  it->second--;
987  }
988  jatomStart |= (65535 << 16);
989  h_tileJatomStart[jtile] = jatomStart;
990  }
991  iatomStart |= (65535 << 16);
992  tmp.iatomStart = iatomStart;
993  h_tileLists[j] = tmp;
994  }
995  ncache += atomStartMap.size();
996  std::multimap<int, int> imap;
997  imap.clear();
998  std::multimap<int, int>::iterator imap_it;
999  std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
1000  if (i < 400) {
1001  printf("%d %d\n", ntotal, imap.size());
1002  for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
1003  if (imap_it->first != 0)
1004  printf("(%d %d)\n", imap_it->first, imap_it->second);
1005  }
1006  }
1007  }
1008  printf("ntotal %d ncache %d\n", ntotal, ncache);
1009  copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
1010  copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
1011  cudaCheck(cudaStreamSynchronize(stream));
1012  delete [] h_tileLists;
1013  delete [] h_tileJatomStart;
1014 }
1015 */
1016 
1017 void CudaTileListKernel::prepareBuffers(
1018  int atomStorageSizeIn, int numPatchesIn,
1019  const CudaPatchRecord* h_cudaPatches,
1020  cudaStream_t stream
1021 ) {
1022  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSizeIn, OVERALLOC);
1023 
1024  // Copy cudaPatches to device
1025  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatchesIn);
1026  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatchesIn, stream);
1027 }
1028 
1029 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
1030  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
1031  const float3 lata, const float3 latb, const float3 latc,
1032  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
1033  const float plcutoff2In, const size_t maxShmemPerBlock,
1034  cudaStream_t stream, const bool atomsChanged,
1035  const bool allocatePart, bool CUDASOAintegrator, bool deviceMigration) {
1036 
1037  numPatches = numPatchesIn;
1038  atomStorageSize = atomStorageSizeIn;
1039  maxTileListLen = maxTileListLenIn;
1040  plcutoff2 = plcutoff2In;
1041 
1042  if (doStreaming) {
1043  // Re-allocate patchNumLists
1044  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1045  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1046  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1047  }
1048 
1049  // Re-allocate (tileLists1, patchPairs1)
1050  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1051  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1052 
1053  // Copy cudaPatches to device
1054  // If DeviceMigration is on then this happens in prepareBuffers()
1055  if (!CUDASOAintegrator || !deviceMigration) {
1056  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1057  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1058  }
1059 
1060  // Re-allocate temporary storage
1061  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1062  // Calculate tile list positions (tilePos)
1063  {
1064  int nthread = 1024;
1065  int nblock = 1;
1066  calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
1067  (numComputes, cudaComputes, cudaPatches, tilePos);
1068  cudaCheck(cudaGetLastError());
1069  }
1070 
1071  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1072  {
1073  int nthread = 512;
1074  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/32)+1);
1075  updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
1076  (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1077  cudaCheck(cudaGetLastError());
1078  }
1079 
1080  // ---------------------------------------------------------------------------------------------
1081 
1082 
1083  // NOTE: tileListDepth2 and tileListOrder2 must be at least as large as
1084  // tileListDepth1 and tileListOrder1 since they're used in sorting
1085  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1086  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1087 
1088  // Allocate with +1 to include last term in the exclusive sum
1089  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1090 
1091  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1092 
1093  // deviceMigration can be true while CUDASOAintegrator is false if minimization is being
1094  // performed, so this branch must run whenever CUDASOAintegrator is false or
1095  // deviceMigration is false
1096  if (!CUDASOAintegrator || !deviceMigration) {
1097  if(atomsChanged) reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1098  if(!CUDASOAintegrator || atomsChanged) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1099  }
1100 
1101  // This should be allocated for FEP only
1102  if(allocatePart) reallocate_device<char>(&part, &partSize, atomStorageSize, OVERALLOC);
1103 
1104  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1105  {
1106  int numBoundingBoxes = atomStorageSize/WARPSIZE;
1107  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1108 
1109  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1110  int nthread = WARPSIZE*nwarp;
1111  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1112  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1113  cudaCheck(cudaGetLastError());
1114  }
1115 
1116  {
1117  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1118  int nthread = WARPSIZE*nwarp;
1119  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1120 
1121  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1122  if(shmem_size > maxShmemPerBlock){
1123  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1124  }
1125 
1126  // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1127  // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1128  // re-allocation only happens when the size is exceeded.
1129  h_tileListStat->tilesSizeExceeded = true;
1130  int reallocCount = 0;
1131  while (h_tileListStat->tilesSizeExceeded) {
1132  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1133 
1134  clearTileListStat(stream);
1135  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1136 
1137  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1138  (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1139  lata, latb, latc, plcutoff2, maxTileListLen,
1140  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1141  tileListDepth1, tileListOrder1, patchPairs1,
1142  d_tileListStat);
1143 
1144  cudaCheck(cudaGetLastError());
1145 
1146  // get (numATileLists, numJtiles, tilesSizeExceeded)
1147  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1148  cudaCheck(cudaStreamSynchronize(stream));
1149  numJtiles = h_tileListStat->numJtiles;
1150 
1151  if (h_tileListStat->tilesSizeExceeded) {
1152  reallocCount++;
1153  if (reallocCount > 1) {
1154  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1155  }
1156  }
1157 
1158  }
1159 
1160  numTileLists = h_tileListStat->numTileLists;
1161 
1162  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1163  }
1164 
1165  // Re-allocate tileListVirialEnergy.
1166  // NOTE: numTileLists here is an upper estimate (it's based on bounding boxes),
1167  // so we're guaranteed to have enough space
1168  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1169 
1170  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1171  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1172  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1173  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1174  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1175 
1176  int numTileListsSrc = numTileListsPrev;
1177  int numJtilesSrc = numJtiles;
1178  int numTileListsDst = numTileLists;
1179  int numJtilesDst = numJtiles;
1180 
1181  // Sort tiles
1182  sortTileLists(
1183  false,
1184  0, false,
1185  numTileListsSrc, numJtilesSrc,
1186  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1187  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1188  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1189  numTileListsDst, numJtilesDst,
1190  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1191  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1192  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1193  stream);
1194 
1195  // Set active buffer to 2
1196  setActiveBuffer(2);
1197 
1198  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1199 }
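// Overview of the pairlist build above: (1) calcTileListPosKernel and
// updatePatchesKernel lay out one tile list per i-tile of every compute,
// (2) buildBoundingBoxesKernel builds one bounding box per WARPSIZE atoms,
// (3) buildTileListsBBKernel culls j-tiles by bounding-box distance, growing
// tileJatomStart once if the initial guess for numJtiles was too small, and
// (4) sortTileLists drops empty lists and reorders the rest (by depth when not
// streaming) before the non-bonded kernel consumes them.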
1200 
1201 //
1202 // Returns the number of bits needed to represent a, i.e. floor(log2(a)) + 1 for a >= 1
1203 //
1204 int ilog2(int a) {
1205  // if (a < 0)
1206  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1207  int k = 1;
1208  while (a >>= 1) k++;
1209  return k;
1210 }
1211 
1212 //
1213 // Sort tile lists
1214 //
1215 void CudaTileListKernel::sortTileLists(
1216  const bool useJtiles,
1217  const int begin_bit, const bool highDepthBitsSetIn,
1218  // Source
1219  const int numTileListsSrc, const int numJtilesSrc,
1220  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1221  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1222  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1223  // Destination
1224  const int numTileListsDst, const int numJtilesDst,
1225  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1226  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1227  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1228  cudaStream_t stream) {
1229 
1230  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1231 
1232  // if (numTileListsDst == 0)
1233  // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1234 
1235  // Check that the array sizes are adequate
1236  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1237  numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1238  (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1239  (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1240  NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1241 
1242  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1243  numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1244  (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1245  (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1246  NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1247 
1248  if (begin_bit != 0 && begin_bit != 16)
1249  NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1250 
1251  // Number of bits needed in the sort
1252  int num_bit = ilog2(maxTileListLen);
1253  if (num_bit > 16)
1254  NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1255  int end_bit = begin_bit + num_bit;
1256 
1257  if (doStreaming)
1258  {
1259  // ----------------------------------------------------------------------------------------
1260  if (doOutputOrder && useJtiles) {
1261  // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1262  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1263 
1264  // Calculate position from depth
1265  {
1266  // -----------------------------------------------------------------------------
1267  // Flag nonzero-depth tile lists into tileListDepthDst (0/1, in reverse outputOrder)
1268  // -----------------------------------------------------------------------------
1269  if (doShiftDown)
1270  {
1271  int nthread = 1024;
1272  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1273  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1274  (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1275  cudaCheck(cudaGetLastError());
1276  }
1277 
1278  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1279 
1280  // --------------------------------------------------------------------
1281  // Compute itileList positions to store tileLists
1282  // ExclusiveSum(tileListDepthDst[0...numTileListsSrc-1])
1283  // --------------------------------------------------------------------
1284  {
1285  size_t size = 0;
1286  cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1287  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1288  // Make sure tempStorage doesn't remain NULL
1289  if (size == 0) size = 128;
1290  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1291  size = tempStorageSize;
1292  cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1293  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1294  }
1295  }
1296 
1297  // Store in reverse order from outputOrder
1298  {
1299  int nthread = 1024;
1300  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1301  storeInReverse <<< nblock, nthread, 0, stream >>>
1302  (numTileListsSrc, begin_bit, outputOrder, tileListPos,
1303  tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1304  tileListOrderDst.ptr, tileListDepthDst.ptr);
1305  cudaCheck(cudaGetLastError());
1306  }
1307 
1308  // Build sortKeys
1309  {
1310  maxTileListLen_sortKeys = maxTileListLen;
1311 
1312  reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1313  clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1314 
1315  // Re-allocate and initialize minmaxListLen
1316  {
1317  reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1318 
1319  int nthread = 1024;
1320  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1321  initMinMaxListLen <<< nblock, nthread, 0, stream >>>
1322  (numComputes, maxTileListLen, minmaxListLen);
1323  cudaCheck(cudaGetLastError());
1324  }
1325 
1326  // Build sortKeys and calculate minmaxListLen
1327  {
1328  int nthread = 1024;
1329  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1330  buildSortKeys <<< nblock, nthread, 0, stream >>>
1331  (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1332  tileListDepthDst.ptr, minmaxListLen, sortKeys);
1333  cudaCheck(cudaGetLastError());
1334 
1335  // Maximum value in sortKeys[] is numTileListsDst - 1
1336  sortKeys_endbit = ilog2(numTileListsDst);
1337  }
1338 
1339  // Fill in missing sortKeys using minmaxListLen
1340  {
1341  int nthread = 1024;
1342  int nwarp = nthread/WARPSIZE;
1343  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1344  fillSortKeys <<< nblock, nthread, 0, stream >>>
1345  (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1346  cudaCheck(cudaGetLastError());
1347  }
1348 
1349  }
1350 
1351  doOutputOrder = false;
1352 
1353  } else if (doOutputOrder) {
1354  // outputOrder will be produced by the next pairlist non-bonded kernel.
1355  // This time just remove zero-length lists
1356  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1357 
1358  int endbit_tmp = ilog2(numTileListsSrc);
1359 
1360  // Remove zeros
1361  {
1362  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1363  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1364 
1365  int nthread = 1024;
1366  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1367  buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
1368  (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1369  cudaCheck(cudaGetLastError());
1370  }
1371 
1372  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1373  {
1374  // Short list, sort within a single thread block
1375  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1376  int nblock = 1;
1377 
1378  unsigned int oobKey = numTileListsSrc;
1379  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1380  (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1381  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1382  cudaCheck(cudaGetLastError());
1383  }
1384  else
1385  {
1386  // Long list, sort on multiple thread blocks
1387  size_t size = 0;
1388  cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1389  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1390  numTileListsSrc, 0, endbit_tmp, stream));
1391  // Make sure tempStorage doesn't remain NULL
1392  if (size == 0) size = 128;
1393  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1394  size = tempStorageSize;
1395  cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1396  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1397  numTileListsSrc, 0, endbit_tmp, stream));
1398  }
1399 
1400  // Re-order tileListDepth using tileListOrderDst
1401  {
1402  int nthread = 1024;
1403  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1404  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1405  (numTileListsDst, tileListOrderDst.ptr,
1406  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1407  cudaCheck(cudaGetLastError());
1408  }
1409 
1410  } else {
1411  // This is done during the regular MD cycle
1412 
1413  if (sortKeys_endbit <= 0)
1414  NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1415 
1416  // Setup sort keys
1417  {
1418  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1419  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1420 
1421  int nthread = 1024;
1422  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1423  setupSortKey <<< nblock, nthread, 0, stream >>>
1424  (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1425  cudaCheck(cudaGetLastError());
1426 
1427  }
1428 
1429  // Global sort
1430  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1431  // if (false)
1432  {
1433  // Short list, sort within a single thread block
1434  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1435  int nblock = 1;
1436 
1437  unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1438  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1439  (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1440  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1441  cudaCheck(cudaGetLastError());
1442  }
1443  else
1444  {
1445  // Long list, sort on multiple thread blocks
1446  size_t size = 0;
1447  cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1448  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1449  numTileListsSrc, 0, sortKeys_endbit, stream));
1450  // Make sure tempStorage doesn't remain NULL
1451  if (size == 0) size = 128;
1452  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1453  size = tempStorageSize;
1454  cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1455  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1456  numTileListsSrc, 0, sortKeys_endbit, stream));
1457  }
1458 
1459  // Re-order tileListDepth using tileListOrderDst
1460  {
1461  int nthread = 1024;
1462  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1463  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1464  (numTileListsDst, tileListOrderDst.ptr,
1465  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1466  cudaCheck(cudaGetLastError());
1467  }
1468 
1469  // Local sort
1470  {
1471  int nthread = 32;
1472  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1473  localSort<32> <<< nblock, nthread, 0, stream >>>
1474  (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1475  cudaCheck(cudaGetLastError());
1476 
1477  // No need to shift any more
1478  doShiftDown = false;
1479  }
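  // localSort<32> appears to re-sort the lists within small fixed-size windows
  // (32 threads per block) using the depth bits [begin_bit, begin_bit+num_bit),
  // and to leave the depths already shifted into the low 16 bits; that is why
  // doShiftDown is cleared here and the bit-shift pass further below is skipped.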
1480 
1481  }
1482  // ----------------------------------------------------------------------------------------
1483 
1484  } // (doStreaming)
1485  else
1486  {
1487  // --------------------------------------------------------------------
1488  // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1489  // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1490  // --------------------------------------------------------------------
1491  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1492  {
1493  // Short list, sort within a single thread block
1494  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1495  int nblock = 1;
1496 
1497  sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
1498  (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1499  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1500  cudaCheck(cudaGetLastError());
1501 
1502  }
1503  else
1504  {
1505  // Long list, sort on multiple thread blocks
1506  size_t size = 0;
1507  cudaCheck(cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1508  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1509  numTileListsSrc, begin_bit, end_bit, stream));
1510  // Make sure tempStorage doesn't remain NULL
1511  if (size == 0) size = 128;
1512  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1513  size = tempStorageSize;
1514  cudaCheck(cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1515  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1516  numTileListsSrc, begin_bit, end_bit, stream));
1517  }
1518  }
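  // SortPairsDescending orders the tile lists by decreasing depth, so the
  // lists with the most tiles come first; this presumably improves load
  // balancing when the nonbonded kernel later walks the lists in order.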
1519 
1520  // -----------------------------------------------------------------------------
1521  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1522  // -----------------------------------------------------------------------------
1523  if (doShiftDown)
1524  {
1525  int nthread = 1024;
1526  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1527  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1528  (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1529  cudaCheck(cudaGetLastError());
1530  }
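  // After this kernel each tileListDepthDst entry contains only the depth in
  // its low 16 bits (shifted down from begin_bit), which is the form expected
  // by the exclusive sum below (note the cast to int*).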
1531 
1532  // Allocate with +1 to include last term in the exclusive sum
1533  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1534 
1535  // --------------------------------------------------------------------
1536  // Compute itileList positions to store tileLists
1537  // ExclusiveSum over tileListDepthDst[0 ... numTileListsDst], i.e. numTileListsDst+1 elements
1538  // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1539  // since this is an exclusive sum. But with this trick,
1540  // tileListPos[numTileListsDst] will contain the total number
1541  // of tile lists
1542  // --------------------------------------------------------------------
1543  {
1544  size_t size = 0;
1545  cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1546  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1547  // Make sure tempStorage doesn't remain NULL
1548  if (size == 0) size = 128;
1549  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1550  size = tempStorageSize;
1551  // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
1552  // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1553  cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1554  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1555  }
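  // Small worked example of the +1 trick: for depths {3, 1, 2} and
  // numTileListsDst = 3, scanning 4 elements yields tileListPos = {0, 3, 4, 6};
  // the extra entry tileListPos[3] = 6 equals the sum of all depths, and
  // tileListDepthDst[3] itself is never read by the exclusive scan.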
1556 
1557  // --------------------------------------------------------------------
1558  // Re-package to
1559  // tileListsDst[0 ... numTileListsDst-1]
1560  // patchPairsDst[]
1561  // tileJatomStartDst[0 ... numJtilesDst-1]
1562  // tileExclsDst[0 ... numJtilesDst-1]
1563  // --------------------------------------------------------------------
1564  {
1565  int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1566  int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1567  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1568 
1569  repackTileListsKernel <<< nblock, nthread, 0, stream >>>
1570  (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1571  (useJtiles) ? jtiles : NULL,
1572  tileListsSrc.ptr, tileListsDst.ptr,
1573  patchPairsSrc.ptr, patchPairsDst.ptr,
1574  tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1575  tileExclsSrc.ptr, tileExclsDst.ptr);
1576  cudaCheck(cudaGetLastError());
1577  }
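  // repackTileListsKernel gathers the tile lists in sorted order
  // (tileListOrderDst) and writes them, together with their per-tile data
  // (j-atom starts, exclusions, and patch pairs when present), into compact
  // destination arrays at the offsets given by tileListPos. The launch is
  // sized per warp (nblock uses nwarp), i.e. one warp handles one tile list.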
1578 
1579  // Count the number of tileLists that contribute to each patch
1580  if (doStreaming)
1581  {
1582  clear_device_array<int>(patchNumLists, numPatches, stream);
1583 
1584  // Fill in patchNumLists[0...numPatches-1]
1585  int nthread = 512;
1586  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1587  calcPatchNumLists <<< nblock, nthread, 0, stream >>>
1588  (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1589  cudaCheck(cudaGetLastError());
1590 
1591  // Use emptyPatches[numPatches] as the count variable
1592  clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1593 
1594  // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1595  // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1596  setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
1597  (numTileListsDst, tileListsDst.ptr, patchNumLists,
1598  numPatches, &emptyPatches[numPatches], emptyPatches);
1599  cudaCheck(cudaGetLastError());
1600 
1601  // Copy emptyPatches[0 ... numPatches] to host
1602  copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1603  cudaCheck(cudaStreamSynchronize(stream));
1604  numEmptyPatches = h_emptyPatches[numPatches];
1605  }
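  // Resulting layout of emptyPatches: entries [0 ... numEmptyPatches-1] hold
  // the indices of patches with no contributing tile lists, and the extra slot
  // emptyPatches[numPatches] holds the count itself; that is why the
  // device-to-host copy above transfers numPatches+1 integers.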
1606 
1607 }
1608 
1609 //
1610 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying the tile list statistics to the host.
1611 //
1612 void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
1613  // Store previous number of active lists
1614  int numTileListsPrev = numTileLists;
1615 
1616  // Wait for finishTileList() to stop copying
1617  if (!tileListStatEventRecord)
1618  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1619  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1620 
1621  // Get numTileLists, numTileListsGBIS, and numExcluded
1622  {
1623  numTileLists = h_tileListStat->numTileLists;
1624  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1625  numExcluded = h_tileListStat->numExcluded;
1626  }
1627 
1628  // Sort {tileLists2, tileJatomStart2, tileExcls2} => {tileLists1, tileJatomStart1, tileExcls1}
1629  // VdW tile list in {tileLists1, tileJatomStart1, tileExcls1}
1630  sortTileLists(true, 0, true,
1631  numTileListsPrev, numJtiles,
1632  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1633  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1634  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1635  numTileLists, numJtiles,
1636  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1637  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1638  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1639  stream);
1640 
1641  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1642  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1643  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1644 
1645  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1646 
1647  // NOTE:
1648  // Only {tileLists1, tileJatomStart1, tileExcls1} are used from here on;
1649  // the rest ({tileListDepth1, tileListOrder1, patchPairs1}) may be re-used by the GBIS sorting.
1650 
1651  if (doGBIS) {
1652  // GBIS is used => produce a second tile list
1653  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1654  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1655  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1656 
1657  sortTileLists(true, 16, true,
1658  numTileListsPrev, numJtiles,
1659  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1660  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1661  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1662  numTileListsGBIS, numJtiles,
1663  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1664  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1665  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1666  stream);
1667  }
1668 
1669  // Set active buffer to be 1
1670  setActiveBuffer(1);
1671 
1672 }
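//
// Buffer flow in reSortTileLists(): the refined lists are sorted from buffer
// set 2 ({tileLists2, tileJatomStart2, tileExcls2}) into buffer set 1, and,
// when GBIS is enabled, a second sortTileLists() pass over the same source
// (second argument 16, presumably the begin bit of the GBIS depths) builds
// {tileListsGBIS, tileJatomStartGBIS}. Buffer 1 then becomes the active set
// via setActiveBuffer(1).
//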
1673 
1674 /*
1675 //
1676 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1677 //
1678 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1679  return;
1680 
1681  if (!doStreaming || !doOutputOrder)
1682  return;
1683 
1684  // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1685  // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1686  sortTileLists(false, 0, true,
1687  numTileLists, numJtiles,
1688  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1689  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1690  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1691  numTileLists, numJtiles,
1692  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1693  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1694  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1695  stream);
1696 
1697  // Set active buffer to be 2
1698  setActiveBuffer(2);
1699 
1700 }
1701 */
1702 
1703 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
1704  if (len > tileListVirialEnergySize) {
1705  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1706  }
1707  tileListVirialEnergyLength = len;
1708 }
1709 
1710 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
1711  if (len > tileListVirialEnergySize) {
1712  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1713  }
1714  tileListVirialEnergyGBISLength = len;
1715 }
1716 
1717 #endif // NAMD_CUDA