NAMD
SequencerCUDA.C
1 #include "ComputeLonepairsCUDA.h"
2 #include "CudaUtils.h"
3 #include "Molecule.h"
4 #include "ReductionMgr.h"
5 #include "SequencerCUDA.h"
6 #include "ComputeNonbondedUtil.h"
7 #include "DeviceCUDA.h"
8 #include "SimParameters.h"
9 #include "TestArray.h"
10 #include "ComputeRestraintsCUDA.h"
11 #include "ComputeGridForceCUDA.h"
12 #include "ComputeConsForceCUDA.h"
13 #include "NamdEventsProfiling.h"
14 #include "AtomMap.h"
15 #include "common.h"
16 #include <algorithm> // std::fill()
18 //#define DEBUGM
19 //#define MIN_DEBUG_LEVEL 3
20 #ifdef NODEGROUP_FORCE_REGISTER
21 extern __thread DeviceCUDA *deviceCUDA;
22 
23 #if 1
24 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
25  size_t offset = 0; \
26  for (int i = 0; i < numPatchesHome; i++) { \
27  PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
28  const int numPatchAtoms = current.numAtoms; \
29  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
30  offset += numPatchAtoms; \
31  } \
32  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
33  } while(0);
34 
35 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
36  size_t offset = 0; \
37  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
38  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
39  const int numPatchAtoms = current.numAtoms; \
40  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
41  offset += numPatchAtoms; \
42  } \
43  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
44  } while(0);
45 
46 #else
47 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
48  size_t offset = 0; \
49  for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
50  PatchDataSOA& current = elem->patch->patchDataSOA; \
51  const int numPatchAtoms = current.numAtoms; \
52  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
53  offset += numPatchAtoms; \
54  } \
55  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
56  } while(0);
57 #endif
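// Illustrative usage (a hedged sketch, not part of the original listing): each
// SOA field is gathered from the per-patch PatchDataSOA buffers and uploaded
// with one macro invocation, e.g. for the pos_x/d_pos_x and charge/d_charge
// member buffers allocated in reallocateArrays():
//
//   AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_x, double, stream);
//   AGGREGATE_HOME_ATOMS_TO_DEVICE(charge, float, stream);
//
// The macro pastes "d_" onto the field name, so both fieldName and d_fieldName
// must exist as members, and PatchDataSOA must expose the same field name.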
58 
59 // Registers this device's SOA device pointers in the host-side patchData tables so peer devices can access them
60 void SequencerCUDA::registerSOAPointersToHost(){
61  // fprintf(stderr, "Dev[%d] setting memories\n", deviceID);
62  patchData->h_soa_pos_x[deviceIndex] = this->d_pos_x;
63  patchData->h_soa_pos_y[deviceIndex] = this->d_pos_y;
64  patchData->h_soa_pos_z[deviceIndex] = this->d_pos_z;
65 
66  patchData->h_soa_vel_x[deviceIndex] = this->d_vel_x;
67  patchData->h_soa_vel_y[deviceIndex] = this->d_vel_y;
68  patchData->h_soa_vel_z[deviceIndex] = this->d_vel_z;
69 
70  patchData->h_soa_fb_x[deviceIndex] = this->d_f_normal_x;
71  patchData->h_soa_fb_y[deviceIndex] = this->d_f_normal_y;
72  patchData->h_soa_fb_z[deviceIndex] = this->d_f_normal_z;
73 
74  patchData->h_soa_fn_x[deviceIndex] = this->d_f_nbond_x;
75  patchData->h_soa_fn_y[deviceIndex] = this->d_f_nbond_y;
76  patchData->h_soa_fn_z[deviceIndex] = this->d_f_nbond_z;
77 
78  patchData->h_soa_fs_x[deviceIndex] = this->d_f_slow_x;
79  patchData->h_soa_fs_y[deviceIndex] = this->d_f_slow_y;
80  patchData->h_soa_fs_z[deviceIndex] = this->d_f_slow_z;
81 
82  patchData->h_soa_id[deviceIndex] = this->d_idMig;
83  patchData->h_soa_vdwType[deviceIndex] = this->d_vdwType;
84  patchData->h_soa_sortOrder[deviceIndex] = this->d_sortOrder;
85  patchData->h_soa_unsortOrder[deviceIndex] = this->d_unsortOrder;
86  patchData->h_soa_charge[deviceIndex] = this->d_charge;
87  patchData->h_soa_patchCenter[deviceIndex] = this->d_patchCenter;
88  patchData->h_soa_migrationDestination[deviceIndex] = this->d_migrationDestination;
89  patchData->h_soa_sortSoluteIndex[deviceIndex] = this->d_sortSoluteIndex;
90 
91  patchData->h_soa_partition[deviceIndex] = this->d_partition;
92 
93  patchData->h_atomdata_AoS[deviceIndex] = (FullAtom*) this->d_atomdata_AoS_in;
94  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
95 }
96 
97 void SequencerCUDA::copySOAHostRegisterToDevice(){
98  // Gathers the SOA device pointers registered on the host by every device and copies those pointer tables to this device
99  // NOTE: This needs to be called only when ALL masterPEs have safely called ::registerSOAPointersToHost()
100  cudaCheck(cudaSetDevice(deviceID));
101  copy_HtoD<double*>(patchData->h_soa_pos_x, this->d_peer_pos_x, nDevices, stream);
102  copy_HtoD<double*>(patchData->h_soa_pos_y, this->d_peer_pos_y, nDevices, stream);
103  copy_HtoD<double*>(patchData->h_soa_pos_z, this->d_peer_pos_z, nDevices, stream);
104 
105  copy_HtoD<float*>(patchData->h_soa_charge, this->d_peer_charge, nDevices, stream);
106  if (simParams->alchOn) {
107  copy_HtoD<int*>(patchData->h_soa_partition, this->d_peer_partition, nDevices, stream);
108  }
109 
110  copy_HtoD<double*>(patchData->h_soa_vel_x, this->d_peer_vel_x, nDevices, stream);
111  copy_HtoD<double*>(patchData->h_soa_vel_y, this->d_peer_vel_y, nDevices, stream);
112  copy_HtoD<double*>(patchData->h_soa_vel_z, this->d_peer_vel_z, nDevices, stream);
113 
114  copy_HtoD<double*>(patchData->h_soa_fb_x, this->d_peer_fb_x, nDevices, stream);
115  copy_HtoD<double*>(patchData->h_soa_fb_y, this->d_peer_fb_y, nDevices, stream);
116  copy_HtoD<double*>(patchData->h_soa_fb_z, this->d_peer_fb_z, nDevices, stream);
117 
118  copy_HtoD<double*>(patchData->h_soa_fn_x, this->d_peer_fn_x, nDevices, stream);
119  copy_HtoD<double*>(patchData->h_soa_fn_y, this->d_peer_fn_y, nDevices, stream);
120  copy_HtoD<double*>(patchData->h_soa_fn_z, this->d_peer_fn_z, nDevices, stream);
121 
122  copy_HtoD<double*>(patchData->h_soa_fs_x, this->d_peer_fs_x, nDevices, stream);
123  copy_HtoD<double*>(patchData->h_soa_fs_y, this->d_peer_fs_y, nDevices, stream);
124  copy_HtoD<double*>(patchData->h_soa_fs_z, this->d_peer_fs_z, nDevices, stream);
125 
126  copy_HtoD<int4*>(patchData->h_soa_migrationDestination, this->d_peer_migrationDestination, nDevices, stream);
127  copy_HtoD<int*>(patchData->h_soa_sortSoluteIndex, this->d_peer_sortSoluteIndex, nDevices, stream);
128 
129  copy_HtoD<int*>(patchData->h_soa_id, this->d_peer_id, nDevices, stream);
130  copy_HtoD<int*>(patchData->h_soa_vdwType, this->d_peer_vdwType, nDevices, stream);
131  copy_HtoD<int*>(patchData->h_soa_sortOrder, this->d_peer_sortOrder, nDevices, stream);
132  copy_HtoD<int*>(patchData->h_soa_unsortOrder, this->d_peer_unsortOrder, nDevices, stream);
133  copy_HtoD<double3*>(patchData->h_soa_patchCenter, this->d_peer_patchCenter, nDevices, stream);
134 
135  copy_HtoD<FullAtom*>(patchData->h_atomdata_AoS, this->d_peer_atomdata, nDevices, stream);
136  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);
137 
138  // aggregate device pointers
139  for(int i = 0; i < this->nDevices; i++)
140  h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
141  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
142 }
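// A minimal sketch of the intended call ordering (hedged; the real driver code
// lives elsewhere, and "seq" here stands for a SequencerCUDA instance): every
// master PE publishes its device pointers first, all master PEs synchronize,
// and only then is the pointer table copied down, since this copy reads slots
// written by peer devices.
//
//   seq->registerSOAPointersToHost();    // on each master PE
//   // ... barrier across all master PEs ...
//   seq->copySOAHostRegisterToDevice();  // safe: every h_soa_* slot is filled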
143 
144 void SequencerCUDA::printSOAPositionsAndVelocities() {
145  BigReal* h_pos_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
146  BigReal* h_pos_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
147  BigReal* h_pos_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
148 
149  BigReal* h_vel_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
150  BigReal* h_vel_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
151  BigReal* h_vel_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
152 
153  // DMC thinks this condition was a holdover from the NCCL code path
154  if(false && mGpuOn){
155  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
156  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
157  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);
158  }else{
159  copy_DtoH_sync<BigReal>(d_pos_x, h_pos_x, numAtomsHome);
160  copy_DtoH_sync<BigReal>(d_pos_y, h_pos_y, numAtomsHome);
161  copy_DtoH_sync<BigReal>(d_pos_z, h_pos_z, numAtomsHome);
162  }
163 
164  copy_DtoH_sync<BigReal>(d_vel_x, h_vel_x, numAtomsHome);
165  copy_DtoH_sync<BigReal>(d_vel_y, h_vel_y, numAtomsHome);
166  copy_DtoH_sync<BigReal>(d_vel_z, h_vel_z, numAtomsHome);
167 
168  CmiLock(this->patchData->printlock);
169  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
170  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
171  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
172  for(int i =0 ; i < numPatchesHome; i++){
173  CudaLocalRecord record = localPatches[i];
174  const int patchID = record.patchID;
175  const int stride = record.bufferOffset;
176  const int numPatchAtoms = record.numAtoms;
177  PatchDataSOA& current = homePatches[i]->patchDataSOA;
178 
179  fprintf(stderr, "Patch [%d]:\n", patchID);
180  for(int j = 0; j < numPatchAtoms; j++){
181  fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
182  h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
183  h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
184  }
185 
186  }
187  CmiUnlock(this->patchData->printlock);
188 
189  free(h_pos_x);
190  free(h_pos_y);
191  free(h_pos_z);
192  free(h_vel_x);
193  free(h_vel_y);
194  free(h_vel_z);
195 }
196 
197 void SequencerCUDA::printSOAForces(char *prefix) {
198  BigReal* h_f_normal_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
199  BigReal* h_f_normal_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
200  BigReal* h_f_normal_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
201 
202  BigReal* h_f_nbond_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
203  BigReal* h_f_nbond_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
204  BigReal* h_f_nbond_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
205 
206  BigReal* h_f_slow_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
207  BigReal* h_f_slow_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
208  BigReal* h_f_slow_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
209 
210  copy_DtoH_sync<BigReal>(d_f_normal_x, h_f_normal_x, numAtomsHome);
211  copy_DtoH_sync<BigReal>(d_f_normal_y, h_f_normal_y, numAtomsHome);
212  copy_DtoH_sync<BigReal>(d_f_normal_z, h_f_normal_z, numAtomsHome);
213 
214  copy_DtoH_sync<BigReal>(d_f_nbond_x, h_f_nbond_x, numAtomsHome);
215  copy_DtoH_sync<BigReal>(d_f_nbond_y, h_f_nbond_y, numAtomsHome);
216  copy_DtoH_sync<BigReal>(d_f_nbond_z, h_f_nbond_z, numAtomsHome);
217 
218  copy_DtoH_sync<BigReal>(d_f_slow_x, h_f_slow_x, numAtomsHome);
219  copy_DtoH_sync<BigReal>(d_f_slow_y, h_f_slow_y, numAtomsHome);
220  copy_DtoH_sync<BigReal>(d_f_slow_z, h_f_slow_z, numAtomsHome);
221 
222  // Now print the per-patch forces, optionally writing each patch to its own file
223  CmiLock(this->patchData->printlock);
224  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
225  char fname[100];
226  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
227  for(int i =0 ; i < numPatchesHome; i++){
228  CudaLocalRecord record = localPatches[i];
229  const int patchID = record.patchID;
230  const int stride = record.bufferOffset;
231  const int numPatchAtoms = record.numAtoms;
232  FILE *outfile=stderr;
233  if(prefix!=NULL)
234  {
235  snprintf(fname,100, "%s-patch-%d", prefix, patchID);
236  outfile = fopen(fname, "w");
237  }
238  fprintf(outfile, "Patch [%d]:\n", patchID);
239  for(int j = 0; j < numPatchAtoms; j++){
240  fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
241  h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
242  h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
243  h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j] );
244  }
245  if(prefix!=NULL) fclose(outfile);
246  }
247 
248  CmiUnlock(this->patchData->printlock);
249 
250  free(h_f_normal_x);
251  free(h_f_normal_y);
252  free(h_f_normal_z);
253 
254  free(h_f_nbond_x);
255  free(h_f_nbond_y);
256  free(h_f_nbond_z);
257 
258  free(h_f_slow_x);
259  free(h_f_slow_y);
260  free(h_f_slow_z);
261 
262 }
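// Usage note (illustrative): a NULL prefix prints every patch to stderr, while
// a non-NULL prefix writes one file per patch named "<prefix>-patch-<patchID>".
//
//   printSOAForces(NULL);              // dump all patches to stderr
//   printSOAForces((char*)"step100");  // writes step100-patch-<id> files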
263 
264 SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID,
265  SimParameters *const sim_Params) {
266  if (CkpvAccess(SequencerCUDA_instance) == 0) {
267  CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
268  }
269  return CkpvAccess(SequencerCUDA_instance);
270 }
271 
272 SequencerCUDA::SequencerCUDA(const int deviceID_ID,
273  SimParameters *const sim_Params):
274  deviceID(deviceID_ID), simParams(sim_Params)
275 {
276  restraintsKernel = NULL;
277  SMDKernel = NULL;
278  groupRestraintsKernel = NULL;
279  gridForceKernel = NULL;
280  consForceKernel = NULL;
281  lonepairsKernel = nullptr;
282 #if 1
283  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
284  patchData = cpdata.ckLocalBranch();
285 #endif
286  initialize();
287  CUDASequencerKernel = new SequencerCUDAKernel();
288  CUDAMigrationKernel = new MigrationCUDAKernel();
289  num_used_grids = simParams->alchGetNumOfPMEGrids();
290  used_grids.resize(num_used_grids, 0);
291  if (simParams->alchFepOn) {
292  // at least two grids are used
293  used_grids[0] = 0;
294  used_grids[1] = 1;
295  // if alchDecouple then two more grids are used
296  if (simParams->alchDecouple) {
297  used_grids[2] = 2;
298  used_grids[3] = 3;
299  // an extra for soft-core potential
300  if (simParams->alchElecLambdaStart > 0) {
301  used_grids[4] = 4;
302  }
303  } else {
304  // in this case alchDecouple is false
305  // but if there is still soft-core potential
306  // then a total of 3 grids are used
307  // mark the last grid for soft-core potential
308  if (simParams->alchElecLambdaStart > 0) {
309  used_grids[2] = 4;
310  }
311  }
312  }
313  if (simParams->alchThermIntOn) {
314  used_grids[0] = 0;
315  used_grids[1] = 1;
316  // in TI, no matter whether soft-core potential is used
317  // it has at least three grids
318  if (simParams->alchDecouple) {
319  used_grids[2] = 2;
320  used_grids[3] = 3;
321  used_grids[4] = 4;
322  } else {
323  used_grids[2] = 4;
324  }
325  }
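  // Worked examples of the grid mapping above (derived from the branches, not
  // an exhaustive table): FEP with alchDecouple and alchElecLambdaStart > 0
  // uses grids {0,1,2,3,4}; FEP without alchDecouple but with a soft-core
  // start uses {0,1,4}; TI uses {0,1,2,3,4} with alchDecouple and {0,1,4}
  // without it.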
326  rescalePairlistTolerance = false;
327 }
328 
329 SequencerCUDA::~SequencerCUDA(){
330  cudaCheck(cudaSetDevice(deviceID));
331  deallocateArrays();
332  deallocateStaticArrays();
333  deallocate_device<SettleParameters>(&sp);
334  deallocate_device<int>(&settleList);
335  deallocate_device<CudaRattleElem>(&rattleList);
336  deallocate_device<int>(&d_consFailure);
337  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
338  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
339  if (restraintsKernel != NULL) delete restraintsKernel;
340  if(SMDKernel != NULL) delete SMDKernel;
341  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
342  if (gridForceKernel != NULL) delete gridForceKernel;
343  if (lonepairsKernel != nullptr) delete lonepairsKernel;
344  if (consForceKernel != NULL) delete consForceKernel;
345 #if 0
346  cudaCheck(cudaStreamDestroy(stream));
347 #endif
348  cudaCheck(cudaStreamDestroy(stream2));
349  curandCheck(curandDestroyGenerator(curandGen));
350  CmiDestroyLock(printlock);
351 
352 }
353 
354 void SequencerCUDA::zeroScalars(){
355  numAtomsHomeAndProxyAllocated = 0;
356  numAtomsHomeAllocated = 0;
357  buildRigidLists = true;
358  numPatchesCheckedIn = 0;
359  numPatchesReady= 0;
360 }
361 
362 void SequencerCUDA::initialize(){
363  cudaCheck(cudaSetDevice(deviceID));
364  nDevices = deviceCUDA->getNumDevice() + 1 *deviceCUDA->isGpuReservedPme();
365  deviceIndex = deviceCUDA->getDeviceIndex();
366 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
367  int leastPriority, greatestPriority;
368  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
369 #if 0
370  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
371 #else
372  stream = (CkMyPe() == deviceCUDA->getMasterPe()) ? patchData->devData[deviceIndex].nbond_stream : 0;
373 #endif
374  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
375 #else
376  cudaCheck(cudaStreamCreate(&stream));
377  cudaCheck(cudaStreamCreate(&stream2));
378 #endif
379  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
380  // each PE's seed needs to be different
381  unsigned long long seed = simParams->randomSeed + CkMyPe();
382  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));
383 
384  numAtomsHomeAllocated = 0;
385  numAtomsHomeAndProxyAllocated = 0;
386 
387  totalMarginViolations = 0;
388  buildRigidLists = true;
389  numPatchesCheckedIn = 0;
390  numPatchesReady= 0;
391  PatchMap* patchMap = PatchMap::Object();
392 #if 1
393  numPatchesGlobal = patchMap->numPatches();
394 #else
395  numPatchesGlobal = patchMap->homePatchList()->size();
396 #endif
397  mGpuOn = nDevices > 1;
398  // Great, now allocate the queue
399  // allocate_device<unsigned int>(&deviceQueue, nDevices);
400  // cudaCheck(cudaMemset(deviceQueue, 999, sizeof(unsigned int)* nDevices));
401  // Allocates and registers the local queue in patchData
402  // patchData->d_queues[deviceIndex] = deviceQueue;
403 
404  printlock = CmiCreateLock();
405 
406  const int numPes = CkNumPes();
407  if (!mGpuOn) {
408  atomMapList.resize(numPes);
409  }
410 
411  //Allocates SOA registers
412  allocate_device<double*>(&d_peer_pos_x, nDevices);
413  allocate_device<double*>(&d_peer_pos_y, nDevices);
414  allocate_device<double*>(&d_peer_pos_z, nDevices);
415  allocate_device<float*>(&d_peer_charge, nDevices);
416  if (simParams->alchOn) {
417  allocate_device<int*>(&d_peer_partition, nDevices);
418  }
419  allocate_device<double*>(&d_peer_vel_x, nDevices);
420  allocate_device<double*>(&d_peer_vel_y, nDevices);
421  allocate_device<double*>(&d_peer_vel_z, nDevices);
422 
423  if (simParams->fixedAtomsOn) {
424  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
425  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
426  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
427  allocate_device<double3>(&d_fixForceNormal, 1);
428  allocate_device<double3>(&d_fixForceNbond, 1);
429  allocate_device<double3>(&d_fixForceSlow, 1);
430  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
431  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
432  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
433  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
434  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
435  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
436  }
437 
438  allocate_device<double*>(&d_peer_fb_x, nDevices);
439  allocate_device<double*>(&d_peer_fb_y, nDevices);
440  allocate_device<double*>(&d_peer_fb_z, nDevices);
441  allocate_device<double*>(&d_peer_fn_x, nDevices);
442  allocate_device<double*>(&d_peer_fn_y, nDevices);
443  allocate_device<double*>(&d_peer_fn_z, nDevices);
444  allocate_device<double*>(&d_peer_fs_x, nDevices);
445  allocate_device<double*>(&d_peer_fs_y, nDevices);
446  allocate_device<double*>(&d_peer_fs_z, nDevices);
447 
448  allocate_device<int4*>(&d_peer_migrationDestination, nDevices);
449  allocate_device<int*>(&d_peer_sortSoluteIndex, nDevices);
450  allocate_device<int*>(&d_peer_id, nDevices);
451  allocate_device<int*>(&d_peer_vdwType, nDevices);
452  allocate_device<int*>(&d_peer_sortOrder, nDevices);
453  allocate_device<int*>(&d_peer_unsortOrder, nDevices);
454  allocate_device<double3*>(&d_peer_patchCenter, nDevices);
455 
456  allocate_device<FullAtom*>(&d_peer_atomdata, nDevices);
457  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);
458 
459  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
460 
461  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
462  // Patch-related host datastructures
463  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
464  allocate_host<double3>(&patchCenter, numPatchesGlobal);
465  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
466  allocate_host<int>(&patchToDeviceMap,numPatchesGlobal);
467  allocate_host<double3>(&awayDists, numPatchesGlobal);
468  allocate_host<double3>(&patchMin, numPatchesGlobal);
469  allocate_host<double3>(&patchMax, numPatchesGlobal);
470  //allocate_host<Lattice>(&lattices, numPatchesGlobal);
471  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal); // only needed for langevin
472  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
473  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
474  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);
475 
476  //Patch-related device datastructures
477  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
478  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
479  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
480  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
481  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
482  allocate_device<double3>(&d_patchCenter, numPatchesGlobal);
483  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
484  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
485  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
486  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
487  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);
488 
489  // Allocate device memory for scalar accumulators
490  allocate_device<int>(&d_killme, 1);
491  allocate_device<char>(&d_barrierFlag, 1);
492  allocate_device<unsigned int>(&d_tbcatomic, 5);
493  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
494  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
495  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
496  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
497  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
498  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
499  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
500  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
501  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
502  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
503  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
504  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
505  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);
506  // for lone pairs
507  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
508  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
509  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);
510  //space for globalmaster forces on device
511  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
512  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
513  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);
514 
515  allocate_device<SettleParameters>(&sp, 1);
516 
517  //allocates host scalars in host-mapped, pinned memory
518  allocate_host<int>(&killme, 1);
519  allocate_host<BigReal>(&kineticEnergy, 1);
520  allocate_host<BigReal>(&intKineticEnergy, 1);
521  allocate_host<BigReal>(&kineticEnergy_half, 1);
522  allocate_host<BigReal>(&intKineticEnergy_half, 1);
523  allocate_host<BigReal>(&momentum_x, 1);
524  allocate_host<BigReal>(&momentum_y, 1);
525  allocate_host<BigReal>(&momentum_z, 1);
526  allocate_host<BigReal>(&angularMomentum_x, 1);
527  allocate_host<BigReal>(&angularMomentum_y, 1);
528  allocate_host<BigReal>(&angularMomentum_z, 1);
529  allocate_host<int>(&consFailure, 1);
530  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
531  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
532  allocate_host<unsigned int>(&h_marginViolations, 1);
533  allocate_host<unsigned int>(&h_periodicCellSmall, 1);
534 
535  // allocates host cudaTensors in host-mapped, pinned memory
536  allocate_host<cudaTensor>(&virial, 1);
537  allocate_host<cudaTensor>(&virial_half, 1);
538  allocate_host<cudaTensor>(&intVirialNormal, 1);
539  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
540  allocate_host<cudaTensor>(&intVirialNbond, 1);
541  allocate_host<cudaTensor>(&intVirialSlow, 1);
542  allocate_host<cudaTensor>(&rigidVirial, 1);
543  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
544  allocate_host<cudaTensor>(&lpVirialNormal, 1);
545  allocate_host<cudaTensor>(&lpVirialNbond, 1);
546  allocate_host<cudaTensor>(&lpVirialSlow, 1);
547 
548  // These arrays will get allocated when total forces of the GPU global calculations are requested
549  d_f_saved_nbond_x = nullptr;
550  d_f_saved_nbond_y = nullptr;
551  d_f_saved_nbond_z = nullptr;
552  d_f_saved_slow_x = nullptr;
553  d_f_saved_slow_y = nullptr;
554  d_f_saved_slow_z = nullptr;
555 
556  //Sets values for scalars
557  *kineticEnergy = 0.0;
558  *intKineticEnergy = 0.0;
559  *kineticEnergy_half = 0.0;
560  *intKineticEnergy_half = 0.0;
561  *momentum_x = 0.0;
562  *momentum_y = 0.0;
563  *momentum_z = 0.0;
564  *angularMomentum_x = 0.0;
565  *angularMomentum_y = 0.0;
566  *angularMomentum_z = 0.0;
567  *consFailure = 0;
568  *killme = 0;
569 
570  // JM: Basic infrastructure to time kernels
571  // XXX TODO: Add timing functions to be switched on/off for each of these
572  t_total = 0;
573  t_vverlet = 0;
574  t_pairlistCheck = 0;
575  t_setComputePositions = 0;
576  t_accumulateForceKick = 0;
577  t_rattle = 0;
578  t_submitHalf = 0;
579  t_submitReductions1 = 0;
580  t_submitReductions2 = 0;
581 
582  cudaEventCreate(&eventStart);
583  cudaEventCreate(&eventStop);
584  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
585  cudaCheck(cudaMemset(d_kineticEnergy, 0, sizeof(BigReal)));
586  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
587  cudaCheck(cudaMemset(d_momentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
588  cudaCheck(cudaMemset(d_momentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
589  cudaCheck(cudaMemset(d_momentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
590  cudaCheck(cudaMemset(d_angularMomentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
591  cudaCheck(cudaMemset(d_angularMomentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
592  cudaCheck(cudaMemset(d_angularMomentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
593  cudaCheck(cudaMemset(d_intKineticEnergy, 0, ATOMIC_BINS * sizeof(BigReal)));
594  cudaCheck(cudaMemset(d_virial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
595  cudaCheck(cudaMemset(d_rigidVirial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
596  cudaCheck(cudaMemset(d_intVirialNormal, 0, ATOMIC_BINS * sizeof(cudaTensor)));
597  cudaCheck(cudaMemset(d_intVirialNbond, 0, ATOMIC_BINS * sizeof(cudaTensor)));
598  cudaCheck(cudaMemset(d_intVirialSlow, 0, ATOMIC_BINS * sizeof(cudaTensor)));
599  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
600  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
601  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
602  cudaCheck(cudaMemset(d_extVirial, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(cudaTensor)));
603  cudaCheck(cudaMemset(d_extForce, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double3)));
604  cudaCheck(cudaMemset(d_extEnergy, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double)));
605 
606  memset(h_marginViolations, 0, sizeof(unsigned int));
607  memset(h_periodicCellSmall, 0, sizeof(unsigned int));
608  memset(virial, 0, sizeof(cudaTensor));
609  memset(rigidVirial, 0, sizeof(cudaTensor));
610  memset(intVirialNormal, 0, sizeof(cudaTensor));
611  memset(intVirialNbond, 0, sizeof(cudaTensor));
612  memset(intVirialSlow, 0, sizeof(cudaTensor));
613  memset(lpVirialNormal, 0, sizeof(cudaTensor));
614  memset(lpVirialNbond, 0, sizeof(cudaTensor));
615  memset(lpVirialSlow, 0, sizeof(cudaTensor));
616  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);
617 
618  settleList = NULL;
619  settleListSize = 0;
620  rattleList = NULL;
621  rattleListSize = 0;
622  d_consFailure = NULL;
623  d_consFailureSize = 0;
624 
626 
627  // JM: I can bundle the CompAtom and CudaAtomList pointers from the
628  // patches here and fill the Host-arrays after integration. :]
629  // if single-gpu simulation, numPatchesHome is the same as numPatchesGlobal
630  numPatchesHome = numPatchesGlobal;
631 
632 
633  int count = 0;
634  // Single-GPU case
635  if(!mGpuOn){
636  // JM NOTE: This works for a single device only, but it's fast if we want to have multiple PEs sharing it
637  if(deviceCUDA->getMasterPe() == CkMyPe()){
638  for(int i = 0; i < numPes; i++) {
639  atomMapList[i] = AtomMap::ObjectOnPe(i);
640  PatchMap* patchMap = PatchMap::ObjectOnPe(i);
641  int npatch = patchMap->numPatches();
642  for (int j = 0; j < npatch; j++) {
643  HomePatch *patch = patchMap->homePatch(j);
644  // JM NOTE: This data structure can be preserved through migration steps
645  if(patch != NULL) {
646  patchList.push_back(patch);
647  patchNewTolerance[count++] =
648  0.5 * ( simParams->pairlistDist - simParams->cutoff );
649  numAtomsHomeAndProxy += patchMap->patch(j)->getNumAtoms();
650 
651  patchData->devData[deviceIndex].patches.push_back(patch);
652  patchListHomeAndProxy.push_back(patch);
653  }
654  }
655  }
656  }
657  }else{
658  // Multi-device case
659  /* The logic here is a bit trickier than in the single-GPU case.
660  Each GPU only allocates space for its home patches and any required proxy patches.
661  This eliminates the possibility of using an NCCL-based all-reduce for communication,
662  but DMC thinks this is okay...
663  */
664  if(deviceCUDA->getMasterPe() == CkMyPe()){
665  // Haochuan: For multi-GPU cases, we need to find the atom maps from
666  // all "non-master PEs" that use the same device as the master PE.
667  // 1. Get the current device ID.
668  const int currentDevice = deviceCUDA->getDeviceIDforPe(CkMyPe());
669  // 2. Iterate over all PEs and find those that have the same device ID as this master PE
670  for (int i = 0; i < numPes; ++i) {
671  if (deviceCUDA->getDeviceIDforPe(i) == currentDevice) {
672  atomMapList.push_back(AtomMap::ObjectOnPe(i));
673  }
674  }
675  for(int i = 0; i < deviceCUDA->getNumPesSharingDevice(); i++){
676  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(i)); // patch map of the i-th PE sharing this device
677  for (int j = 0; j < pm->numPatches(); j++) {
678  // Aggregates the patches in a separate data structure for now
679  HomePatch *patch = pm->homePatch(j);
680  if(patch != NULL) {
681  patchData->devData[deviceIndex].patches.push_back(patch);
682  }
683  }
684  }
685 
686  // if MGPU is On, we also try to set up peer access
688 #ifdef NAMD_NCCL_ALLREDUCE
689  deviceCUDA->setNcclUniqueId(patchData->ncclId);
690  deviceCUDA->setupNcclComm();
691 #endif
692 
693  }
694  }
695  PatchMap *pm = PatchMap::Object();
696  isPeriodic = (pm->periodic_a() && pm->periodic_b() && pm->periodic_c());
697 
698  // XXX TODO decide how the biasing methods will work in the future -
699  // for multiple devices this will be a problem, how to solve it?
700  if (simParams->constraintsOn) {
701  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,
702  stream, mGpuOn);
703  }
704  if (simParams->SMDOn) {
705  if(deviceCUDA->getMasterPe() == CkMyPe()){
706  SMDKernel = new ComputeSMDCUDA(patchList, simParams->SMDk, simParams->SMDk2,
707  simParams->SMDVel, make_double3(simParams->SMDDir.x, simParams->SMDDir.y, simParams->SMDDir.z),
708  simParams->SMDOutputFreq, simParams->firstTimestep, simParams->SMDFile,
710  Node::Object()->molecule->numAtoms, nDevices, deviceIndex, mGpuOn);
711  // need to set smd atom lists in the mgpu case
712  if(mGpuOn)
713  {
714  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
716  SMDKernel->initPeerCOM(cudaMgr->curSMDCOM, stream);
717  }
718  }
719  }
720  if (simParams->groupRestraintsOn) {
721  if(deviceCUDA->getMasterPe() == CkMyPe()){
722  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
723  simParams->groupRestraints, mGpuOn, nDevices, deviceIndex);
724  if(mGpuOn)
725  {
726  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
727  groupRestraintsKernel->initPeerCOM(stream);
728  }
729  }
730  }
731  if (simParams->mgridforceOn || simParams->gridforceOn ){
732  if(deviceCUDA->getMasterPe() == CkMyPe()){
733  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
734  }
735  }
736  if (Node::Object()->molecule->is_lonepairs_psf) {
737  lonepairsKernel = new ComputeLonepairsCUDA();
738  }
739  if (simParams->consForceOn){
740  if(deviceCUDA->getMasterPe() == CkMyPe()){
741  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,mGpuOn);
742  }
743  }
744 
745 }
746 
747 /* Events like ScriptTcl changes to global data require us to rebuild
748  some of the data structures used by some of the kernels, e.g., the
749  gridForceKernel needs to change when the grid, or grid-related
750  data, is changed.
751 
752  Technically these may all be in use and updated at different
753  intervals, but that is a weird edge case that doesn't justify
754  adding a whole other sync path. If that happens, we'll be doing
755  this slightly too often. Such a hypothetical user has already
756  bought in to periodically sacrificing performance to radically
757  change the system being simulated at run time in several
758  different ways on different triggers, so as long as this overhead
759  is substantially better than a full restart, it probably doesn't
760  matter.
761  */
762 
763 void SequencerCUDA::updateDeviceKernels()
764 {
765  if (simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn){
766  // clean out the old and bring in the new if we're a master PE
767  if(deviceCUDA->getMasterPe() == CkMyPe()){
768  if(patchData->updateCounter.fetch_sub(1)>=1)
769  {
770  if(gridForceKernel!=NULL)
771  {
772  delete gridForceKernel;
773  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
774  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
775  }
776  if(consForceKernel!=NULL)
777  {
778  delete consForceKernel;
779  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,
780  mGpuOn);
781  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
782  }
783  }
784  }
785  }
786 }
787 
788 bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy)
789 {
790  cudaCheck(cudaSetDevice(deviceID));
791  const float OVERALLOC = 1.5f;
792 
793  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated ) {
794  return false;
795  }
796 
797  // Before deallocating the d_f_saved_nbond_* and d_f_saved_slow_* buffers,
798  // record whether they were previously allocated so they can be re-allocated at the new size below
799  bool realloc_gpu_saved_force = false;
800  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
801  realloc_gpu_saved_force = true;
802  }
803 
804 
805  deallocateArrays();
806 
807  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
808  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
809 
810  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
811  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
812  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
813  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
814  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
815  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
816  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
817  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
818  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
819  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
820  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
821  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);
822  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
823  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
824  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
825  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);
826  }
827  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
828  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
829  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);
830 
831  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
832  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
833  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
834  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
835  allocate_host<char3>(&transform, numAtomsHomeAllocated);
836  allocate_host<float>(&mass, numAtomsHomeAllocated);
837  if (simParams->alchOn) {
838  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);
839  }
840  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
841  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
842  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);
843 
844  // array buffers for the pseudorandom normal distribution must have even length;
845  // choose n to be the smallest even number >= numAtomsHomeAllocated,
846  // which guarantees that n is even and never smaller than the allocated atom count
847  int n = (numAtomsHomeAllocated + 1) & (~1);
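  // e.g. (7 + 1) & ~1 == 8 and (8 + 1) & ~1 == 8, so odd counts round up by one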
848  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
849  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
850  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
851  if (simParams->fixedAtomsOn) {
852  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
853  allocate_host(&fixedPosition_x, numAtomsHomeAllocated);
854  allocate_host(&fixedPosition_y, numAtomsHomeAllocated);
855  allocate_host(&fixedPosition_z, numAtomsHomeAllocated);
856  }
857  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
858 
859 
860  if (simParams->useDeviceMigration) {
861  allocate_host<int>(&idMig, numAtomsHomeAllocated);
862  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
863  }
864 
865  allocate_device<double>(&d_f_raw, 9 * numAtomsHomeAndProxyAllocated); // Total number of force buffers
866  d_f_normal_x = &d_f_raw[numAtomsHomeAndProxyAllocated*0];
867  d_f_normal_y = &d_f_raw[numAtomsHomeAndProxyAllocated*1];
868  d_f_normal_z = &d_f_raw[numAtomsHomeAndProxyAllocated*2];
869  d_f_nbond_x = &d_f_raw[numAtomsHomeAndProxyAllocated*3];
870  d_f_nbond_y = &d_f_raw[numAtomsHomeAndProxyAllocated*4];
871  d_f_nbond_z = &d_f_raw[numAtomsHomeAndProxyAllocated*5];
872  d_f_slow_x = &d_f_raw[numAtomsHomeAndProxyAllocated*6];
873  d_f_slow_y = &d_f_raw[numAtomsHomeAndProxyAllocated*7];
874  d_f_slow_z = &d_f_raw[numAtomsHomeAndProxyAllocated*8];
875  allocate_device<double>(&d_pos_raw, 3 * numAtomsHomeAndProxyAllocated);
876  d_pos_x = &d_pos_raw[numAtomsHomeAndProxyAllocated*0];
877  d_pos_y = &d_pos_raw[numAtomsHomeAndProxyAllocated*1];
878  d_pos_z = &d_pos_raw[numAtomsHomeAndProxyAllocated*2];
879  allocate_device<float>(&d_charge, numAtomsHomeAndProxyAllocated);
880  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
881  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
882  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
883  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);
884  }
885  if (realloc_gpu_saved_force) {
886  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
887  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
888  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
889  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
890  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
891  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
892  }
893  allocate_device<int>(&d_sortOrder, numAtomsHomeAndProxyAllocated);
894  allocate_device<int>(&d_unsortOrder, numAtomsHomeAndProxyAllocated);
895 
896  // allocate memory for backup forces in MC barostat
897  if (simParams->monteCarloPressureOn) {
898  // Total number of backup force and positions buffers
899  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
900  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
901  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
902  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
903  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
904  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
905  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
906  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
907  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
908  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
909  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
910  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
911  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
912  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];
913 
914  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
915  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
916  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
917  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
918  // we can use molecule_size + 1, rather than atom size
919  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);
920  }
921 
922  if (simParams->alchOn) {
923  allocate_device<int>(&d_partition, numAtomsHomeAndProxyAllocated);
924  }
925 
926  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
927  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
928  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
929  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
930  allocate_device<double>(&d_vel_x, numAtomsHomeAllocated);
931  allocate_device<double>(&d_vel_y, numAtomsHomeAllocated);
932  allocate_device<double>(&d_vel_z, numAtomsHomeAllocated);
933  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
934  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
935  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
936  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
937  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
938  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
939  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
940  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
941  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
942  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
943  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
944  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
945  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
946  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);
947 
948  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
949  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
950  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
951  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
952  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
953  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
954  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
955  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
956  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
957  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
958  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
959  if (simParams->fixedAtomsOn) {
960  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
961  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
962  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
963  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);
964  }
965 
966  if (simParams->useDeviceMigration) {
967  allocate_device<int>(&d_idMig, numAtomsHomeAndProxyAllocated);
968  allocate_device<int>(&d_vdwType, numAtomsHomeAndProxyAllocated);
969  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
970  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
971  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
972  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
973  }
974 
975  // These arrays get allocated only when total forces of the GPU global calculations are requested;
     // reset them to nullptr only if they were not just re-allocated at the new size above
     if (!realloc_gpu_saved_force) {
976  d_f_saved_nbond_x = nullptr;
977  d_f_saved_nbond_y = nullptr;
978  d_f_saved_nbond_z = nullptr;
979  d_f_saved_slow_x = nullptr;
980  d_f_saved_slow_y = nullptr;
981  d_f_saved_slow_z = nullptr;
     }
982 
983  // Zero the host and device position buffers so later accumulations/reductions start from zero
984  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
985  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
986  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
987  cudaCheck(cudaMemset(d_pos_x, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
988  cudaCheck(cudaMemset(d_pos_y, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
989  cudaCheck(cudaMemset(d_pos_z, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
990  cudaCheck(cudaMemset(d_vel_x, 0 , sizeof(double)*numAtomsHomeAllocated));
991  cudaCheck(cudaMemset(d_vel_y, 0 , sizeof(double)*numAtomsHomeAllocated));
992  cudaCheck(cudaMemset(d_vel_z, 0 , sizeof(double)*numAtomsHomeAllocated));
993 
994  cudaCheck(cudaMemset(d_posNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
995  cudaCheck(cudaMemset(d_posNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
996  cudaCheck(cudaMemset(d_posNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
997  cudaCheck(cudaMemset(d_velNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
998  cudaCheck(cudaMemset(d_velNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
999  cudaCheck(cudaMemset(d_velNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
1000 
1001  return true;
1002 }
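// Hedged sketch of the caller contract implied by the return value (the actual
// call sites live elsewhere in NAMD; "seq" is a stand-in for this object): a
// true return means the device buffers now live at new addresses, so any
// cached or peer-registered pointers must be refreshed before reuse.
//
//   if (seq->reallocateArrays(nHome, nHomeAndProxy)) {
//     seq->registerSOAPointersToHost();   // republish the new d_* pointers
//     // ...then copySOAHostRegisterToDevice() again after all peers republish
//   }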
1003 
1004 void SequencerCUDA::reallocateMigrationDestination() {
1005  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
1006  allocate_device<int4>(&d_migrationDestination, numAtomsHomeAndProxyAllocated);
1007 }
1008 
1009 void SequencerCUDA::deallocateArrays() {
1010  if (numAtomsHomeAndProxyAllocated != 0) {
1011  cudaCheck(cudaSetDevice(deviceID));
1012 
1013  deallocate_host<double>(&f_normal_x);
1014  deallocate_host<double>(&f_normal_y);
1015  deallocate_host<double>(&f_normal_z);
1016  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
1017  deallocate_host<double>(&f_global_x);
1018  deallocate_host<double>(&f_global_y);
1019  deallocate_host<double>(&f_global_z);
1020  }
1021  deallocate_host<double>(&f_nbond_x);
1022  deallocate_host<double>(&f_nbond_y);
1023  deallocate_host<double>(&f_nbond_z);
1024  deallocate_host<double>(&f_slow_x);
1025  deallocate_host<double>(&f_slow_y);
1026  deallocate_host<double>(&f_slow_z);
1027  deallocate_host<double>(&pos_x);
1028  deallocate_host<double>(&pos_y);
1029  deallocate_host<double>(&pos_z);
1030  deallocate_host<float>(&charge);
1031  deallocate_host<int>(&sortOrder);
1032  deallocate_host<int>(&unsortOrder);
1033  deallocate_host<double>(&recipMass);
1034  deallocate_host<double>(&vel_x);
1035  deallocate_host<double>(&vel_y);
1036  deallocate_host<double>(&vel_z);
1037  deallocate_host<char3>(&transform);
1038  deallocate_host<float>(&mass);
1039  if (simParams->alchOn) {
1040  deallocate_host<int>(&partition);
1041  }
1042  deallocate_host<float>(&langevinParam);
1043  deallocate_host<float>(&langScalVelBBK2);
1044  deallocate_host<float>(&langScalRandBBK2);
1045 
1046  deallocate_host<int>(&hydrogenGroupSize);
1047  deallocate_host<int>(&atomFixed);
1048  if (simParams->fixedAtomsOn) {
1049  deallocate_host<int>(&groupFixed);
1050  deallocate_host<double>(&fixedPosition_x);
1051  deallocate_host<double>(&fixedPosition_y);
1052  deallocate_host<double>(&fixedPosition_z);
1053  }
1054 
1055  deallocate_host<float>(&rigidBondLength);
1056 
1057  deallocate_device<double>(&d_f_raw);
1058  deallocate_device<double>(&d_pos_raw);
1059  deallocate_device<float>(&d_charge);
1060  deallocate_device<int>(&d_sortOrder);
1061  deallocate_device<int>(&d_unsortOrder);
1062  if (simParams->alchOn) {
1063  deallocate_device<int>(&d_partition);
1064  }
1065 
1066  deallocate_device<double>(&d_posNew_raw);
1067 
1068  if (simParams->monteCarloPressureOn) {
1069  deallocate_device<double>(&d_f_rawMC);
1070  deallocate_device<double>(&d_pos_rawMC);
1071 
1072  deallocate_host<int>(&id);
1073  deallocate_device<int>(&d_id);
1074  deallocate_device<int>(&d_idOrder);
1075  deallocate_device<int>(&d_moleculeAtom);
1076  deallocate_device<int>(&d_moleculeStartIndex);
1077  }
1078 
1079  if (simParams->useDeviceMigration) {
1080  deallocate_host<int>(&idMig);
1081  deallocate_host<int>(&vdwType);
1082  deallocate_device<int>(&d_idMig);
1083  deallocate_device<int>(&d_vdwType);
1084  deallocate_device<FullAtom>(&d_atomdata_AoS);
1085  deallocate_device<int>(&d_migrationGroupSize);
1086  deallocate_device<int>(&d_migrationGroupIndex);
1087  deallocate_device<int>(&d_sortIndex);
1088  }
1089  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
1090  deallocate_device<double>(&d_f_global_x);
1091  deallocate_device<double>(&d_f_global_y);
1092  deallocate_device<double>(&d_f_global_z);
1093  }
1094  deallocate_device<double>(&d_vel_x);
1095  deallocate_device<double>(&d_vel_y);
1096  deallocate_device<double>(&d_vel_z);
1097  deallocate_device<double>(&d_recipMass);
1098  deallocate_device<char3>(&d_transform);
1099  deallocate_device<double>(&d_velNew_x);
1100  deallocate_device<double>(&d_velNew_y);
1101  deallocate_device<double>(&d_velNew_z);
1102  deallocate_device<double>(&d_posSave_x);
1103  deallocate_device<double>(&d_posSave_y);
1104  deallocate_device<double>(&d_posSave_z);
1105  deallocate_device<double>(&d_rcm_x);
1106  deallocate_device<double>(&d_rcm_y);
1107  deallocate_device<double>(&d_rcm_z);
1108  deallocate_device<double>(&d_vcm_x);
1109  deallocate_device<double>(&d_vcm_y);
1110  deallocate_device<double>(&d_vcm_z);
1111  deallocate_device<float>(&d_mass);
1112  deallocate_device<float>(&d_langevinParam);
1113  deallocate_device<float>(&d_langScalVelBBK2);
1114  deallocate_device<float>(&d_langScalRandBBK2);
1115  deallocate_device<float>(&d_gaussrand_x);
1116  deallocate_device<float>(&d_gaussrand_y);
1117  deallocate_device<float>(&d_gaussrand_z);
1118  deallocate_device<int>(&d_hydrogenGroupSize);
1119  deallocate_device<float>(&d_rigidBondLength);
1120  deallocate_device<int>(&d_atomFixed);
1121  if (simParams->fixedAtomsOn) {
1122  deallocate_device<int>(&d_groupFixed);
1123  deallocate_device<double>(&d_fixedPosition_x);
1124  deallocate_device<double>(&d_fixedPosition_y);
1125  deallocate_device<double>(&d_fixedPosition_z);
1126  }
1127  deallocate_device<double>(&d_f_saved_nbond_x);
1128  deallocate_device<double>(&d_f_saved_nbond_y);
1129  deallocate_device<double>(&d_f_saved_nbond_z);
1130  deallocate_device<double>(&d_f_saved_slow_x);
1131  deallocate_device<double>(&d_f_saved_slow_y);
1132  deallocate_device<double>(&d_f_saved_slow_z);
1133  }
1134 }
1135 
1136 void SequencerCUDA::deallocateStaticArrays() {
1137  cudaCheck(cudaSetDevice(deviceID));
1138 
1139  deallocate_host<cudaTensor>(&extVirial);
1140  deallocate_host<double3>(&extForce);
1141  deallocate_host<double>(&extEnergy);
1142  deallocate_host<unsigned int>(&h_marginViolations);
1143  deallocate_host<unsigned int>(&h_periodicCellSmall);
1144 
1145  // XXX TODO: Deallocate the additional arrays we added for shmem version
1146  deallocate_host<double3>(&awayDists);
1147  deallocate_host<double3>(&patchMin);
1148  deallocate_host<double3>(&patchMax);
1149  deallocate_host<CudaAtom*>(&cudaAtomLists);
1150  deallocate_host<double3>(&patchCenter);
1151  deallocate_host<int>(&globalToLocalID);
1152  deallocate_host<int>(&patchToDeviceMap);
1153  deallocate_host<Lattice>(&pairlist_lattices);
1154  deallocate_host<double>(&patchMaxAtomMovement);
1155  deallocate_host<double>(&patchNewTolerance);
1156  deallocate_host<CudaMInfo>(&mInfo);
1157  deallocate_host<bool*>(&h_patchRecordHasForces);
1158 
1159  deallocate_host<cudaTensor>(&lpVirialNormal);
1160  deallocate_host<cudaTensor>(&lpVirialNbond);
1161  deallocate_host<cudaTensor>(&lpVirialSlow);
1162 
1163  deallocate_device<double3>(&d_awayDists);
1164  deallocate_device<double3>(&d_patchMin);
1165  deallocate_device<double3>(&d_patchMax);
1166  deallocate_device<int>(&d_globalToLocalID);
1167  deallocate_device<int>(&d_patchToDeviceMap);
1168  deallocate_device<int>(&d_sortOrder);
1169  deallocate_device<int>(&d_unsortOrder);
1170  deallocate_device<double3>(&d_patchCenter);
1171  deallocate_device<Lattice>(&d_lattices);
1172  deallocate_device<Lattice>(&d_pairlist_lattices);
1173  deallocate_device<double>(&d_patchMaxAtomMovement);
1174  deallocate_device<double>(&d_patchNewTolerance);
1175  deallocate_device<CudaMInfo>(&d_mInfo);
1176 
1177  deallocate_device<int>(&d_killme);
1178  deallocate_device<char>(&d_barrierFlag);
1179  deallocate_device<unsigned int>(&d_tbcatomic);
1180  deallocate_device<BigReal>(&d_kineticEnergy);
1181  deallocate_device<BigReal>(&d_intKineticEnergy);
1182  deallocate_device<BigReal>(&d_momentum_x);
1183  deallocate_device<BigReal>(&d_momentum_y);
1184  deallocate_device<BigReal>(&d_momentum_z);
1185  deallocate_device<BigReal>(&d_angularMomentum_x);
1186  deallocate_device<BigReal>(&d_angularMomentum_y);
1187  deallocate_device<BigReal>(&d_angularMomentum_z);
1188  deallocate_device<cudaTensor>(&d_virial);
1189  deallocate_device<cudaTensor>(&d_intVirialNormal);
1190  deallocate_device<cudaTensor>(&d_intVirialNbond);
1191  deallocate_device<cudaTensor>(&d_intVirialSlow);
1192  deallocate_device<cudaTensor>(&d_lpVirialNormal);
1193  deallocate_device<cudaTensor>(&d_lpVirialNbond);
1194  deallocate_device<cudaTensor>(&d_lpVirialSlow);
1195  deallocate_device<cudaTensor>(&d_rigidVirial);
1196  deallocate_device<cudaTensor>(&d_extVirial);
1197  deallocate_device<double3>(&d_extForce);
1198  deallocate_device<double>(&d_extEnergy);
1199  deallocate_device<SettleParameters>(&sp);
1200  deallocate_device<unsigned int>(&deviceQueue);
1201  deallocate_device<double*>(&d_peer_pos_x);
1202  deallocate_device<double*>(&d_peer_pos_y);
1203  deallocate_device<double*>(&d_peer_pos_z);
1204  deallocate_device<float*>(&d_peer_charge);
1205  deallocate_device<int*>(&d_peer_partition);
1206  deallocate_device<double*>(&d_peer_vel_x);
1207  deallocate_device<double*>(&d_peer_vel_y);
1208  deallocate_device<double*>(&d_peer_vel_z);
1209  deallocate_device<double*>(&d_peer_fb_x);
1210  deallocate_device<double*>(&d_peer_fb_y);
1211  deallocate_device<double*>(&d_peer_fb_z);
1212  deallocate_device<double*>(&d_peer_fn_x);
1213  deallocate_device<double*>(&d_peer_fn_y);
1214  deallocate_device<double*>(&d_peer_fn_z);
1215  deallocate_device<double*>(&d_peer_fs_x);
1216  deallocate_device<double*>(&d_peer_fs_y);
1217  deallocate_device<double*>(&d_peer_fs_z);
1218 
1219  deallocate_device<bool*>(&d_patchRecordHasForces);
1220 
1221  deallocate_device(&d_fixVirialNormal);
1222  deallocate_device(&d_fixVirialNbond);
1223  deallocate_device(&d_fixVirialSlow);
1224  deallocate_device(&d_fixForceNormal);
1225  deallocate_device(&d_fixForceNbond);
1226  deallocate_device(&d_fixForceSlow);
1227 
1228  if (simParams->useDeviceMigration) {
1229  deallocate_device<FullAtom>(&d_atomdata_AoS_in);
1230  deallocate_device<int>(&d_sortSoluteIndex);
1231  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
1232  }
1233 
1234  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
1235 }
1236 
1237 void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex){
1238  CudaMInfo &m = mInfo[patchIndex];
1239  if (!p->patchMapRead) p->readPatchMap();
1240  for(int x = 0; x < 3; x++){
1241  for(int y = 0; y < 3; y++){
1242  for(int z = 0; z < 3; z++){
1243  // copies migration info over
1244  MigrationInfo *hm = p->mInfo[x][y][z];
1245  if(hm != NULL) m.destPatchID[x][y][z] = hm->destPatchID;
1246  else m.destPatchID[x][y][z] = -1; // let's flag this as -1 for now
1247  }
1248  }
1249  }
1250  if (simParams->useDeviceMigration) {
1251  m.destPatchID[1][1][1] = p->getPatchID();
1252  }
1253 }
1254 
1255 void SequencerCUDA::assembleOrderedPatchList(){
1256  // Assembles the patches on each Pe sharing a device into a device-ordered list
1257  patchList.clear();
1258 
1259  // Handle our own patches
1260  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
1261  HomePatch *p = patchData->devData[deviceIndex].patches[i];
1262  patchList.push_back(p);
1263  }
1264 
1265 
1266  // Do we really need this?
1267 #if 1
1268  patchListHomeAndProxy.clear();
1269  // Set up the list of device patches. A patch's home patch may live on any PE, so scan them all
1270  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1271  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1272  const int patchID = localPatches[i].patchID;
1273 
1274 
1275  for(int d = 0; d < CkNumPes(); d++){
1276  PatchMap* pm = PatchMap::ObjectOnPe(d);
1277  HomePatch *patch = pm->homePatch(patchID);
1278  if(patch != NULL) {
1279  patchListHomeAndProxy.push_back(patch);
1280  }
1281  }
1282  }
1283 
1284 #endif
1285 }
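// A minimal, self-contained sketch of the lookup pattern used above: for each
// patch ID in the device's local records, scan the per-PE patch maps until the
// PE that owns the home patch is found. The SimplePe type below is a
// simplified stand-in for PatchMap/HomePatch, not the real NAMD API.
#include <cstdio>
#include <utility>
#include <vector>

struct SimplePe {
  std::vector<int> homePatchIDs;   // patch IDs homed on this PE
  bool ownsHome(int patchID) const {
    for (int id : homePatchIDs) if (id == patchID) return true;
    return false;
  }
};

int main() {
  std::vector<SimplePe> pes = { {{0, 2}}, {{1, 3}} };   // two PEs sharing one device
  std::vector<int> localPatchIDs = {1, 0, 3, 2};        // device-ordered local records

  // Device-ordered "patch list": for each local record, find the owning PE.
  std::vector<std::pair<int,int>> patchList;            // (patchID, owning PE)
  for (int patchID : localPatchIDs) {
    for (int pe = 0; pe < (int)pes.size(); pe++) {
      if (pes[pe].ownsHome(patchID)) {
        patchList.push_back({patchID, pe});
        break;
      }
    }
  }
  for (const auto& p : patchList)
    printf("patch %d is homed on PE %d\n", p.first, p.second);
  return 0;
}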
1286 
1294 void SequencerCUDA::copyAoSDataToHost() {
1295  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1296  patchData = cpdata.ckLocalBranch();
1297 
1298  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1299  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1300 
1301  CUDAMigrationKernel->update_AoS(
1302  numPatchesHome,
1303  patchData->devData[deviceIndex].d_localPatches,
1304  (FullAtom*) d_atomdata_AoS,
1305  d_vel_x, d_vel_y, d_vel_z,
1306  d_pos_x, d_pos_y, d_pos_z,
1307  stream
1308  );
1309 
1310  for (int i = 0; i < integrationPatches.size(); i++) {
1311  const int numAtoms = localPatches[i].numAtoms;
1312  const int offset = localPatches[i].bufferOffset;
1313  HomePatch *patch = integrationPatches[i];
1314  patch->updateAtomCount(numAtoms, false);
1315  patch->updateAtomBuffers();
1316  FullAtomList& h_atomdata = patch->getAtomList();
1317  copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
1318  }
1319  cudaCheck(cudaStreamSynchronize(stream));
1320 }
1321 
1322 
1329 void SequencerCUDA::migrationLocalInit() {
1330  CUDAMigrationKernel->computeMigrationGroupIndex(
1331  numPatchesHome,
1332  patchData->devData[deviceIndex].d_localPatches,
1333  d_migrationGroupSize,
1334  d_migrationGroupIndex,
1335  stream
1336  );
1337 
1338  CUDAMigrationKernel->update_AoS(
1339  numPatchesHome,
1340  patchData->devData[deviceIndex].d_localPatches,
1341  (FullAtom*) d_atomdata_AoS,
1342  d_vel_x, d_vel_y, d_vel_z,
1343  d_pos_x, d_pos_y, d_pos_z,
1344  stream
1345  );
1346 
1347  CUDAMigrationKernel->computeMigrationDestination(
1348  numPatchesHome,
1349  patchData->devData[deviceIndex].d_localPatches,
1350  myLattice,
1351  d_mInfo,
1352  d_patchToDeviceMap,
1353  d_globalToLocalID,
1354  d_patchMin,
1355  d_patchMax,
1356  d_hydrogenGroupSize,
1357  d_migrationGroupSize,
1358  d_migrationGroupIndex,
1359  d_pos_x, d_pos_y, d_pos_z,
1360  d_migrationDestination,
1361  stream
1362  );
1363 
1364  CUDAMigrationKernel->performLocalMigration(
1365  numPatchesHome,
1366  patchData->devData[deviceIndex].d_localPatches,
1367  (FullAtom*) d_atomdata_AoS,
1368  (FullAtom*) d_atomdata_AoS_in,
1369  d_migrationDestination,
1370  stream
1371  );
1372 
1373  cudaCheck(cudaStreamSynchronize(stream));
1374 }
1375 
1380 void SequencerCUDA::migrationPerform() {
1381  CUDAMigrationKernel->performMigration(
1382  numPatchesHome,
1383  patchData->devData[deviceIndex].d_localPatches,
1384  d_peer_record,
1385  (FullAtom*) d_atomdata_AoS,
1386  d_peer_atomdata,
1387  d_migrationGroupSize,
1388  d_migrationGroupIndex,
1389  d_migrationDestination,
1390  stream
1391  );
1392  cudaCheck(cudaStreamSynchronize(stream));
1393 }
1394 
1395 
1396 void SequencerCUDA::migrationUpdateAtomCounts() {
1397  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1398  patchData = cpdata.ckLocalBranch();
1399 
1400  CUDAMigrationKernel->updateLocalRecords(
1401  numPatchesHome,
1402  patchData->devData[deviceIndex].d_localPatches,
1403  d_peer_record,
1404  patchData->devData[deviceIndex].d_peerPatches,
1405  stream
1406  );
1407 
1408  cudaCheck(cudaStreamSynchronize(stream));
1409 }
1410 
1411 void SequencerCUDA::migrationUpdateAtomOffsets() {
1412  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1413  patchData = cpdata.ckLocalBranch();
1414 
1415  CUDAMigrationKernel->updateLocalRecordsOffset(
1416  numPatchesHomeAndProxy,
1417  patchData->devData[deviceIndex].d_localPatches,
1418  stream
1419  );
1420 
1421  cudaCheck(cudaStreamSynchronize(stream));
1422 }
1423 
1424 void SequencerCUDA::migrationUpdateRemoteOffsets() {
1425  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1426  patchData = cpdata.ckLocalBranch();
1427 
1428  CUDAMigrationKernel->updatePeerRecords(
1429  numPatchesHomeAndProxy,
1430  patchData->devData[deviceIndex].d_localPatches,
1431  d_peer_record,
1432  patchData->devData[deviceIndex].d_peerPatches,
1433  stream
1434  );
1435 
1436  cudaCheck(cudaStreamSynchronize(stream));
1437 }
1438 
1439 void SequencerCUDA::migrationUpdateProxyDestination() {
1440  if (mGpuOn) {
1441  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1442  patchData = cpdata.ckLocalBranch();
1443 
1444  // This is implemented as a put instead of a get to avoid the need
1445  // for a node synchronization.
1446  CUDAMigrationKernel->copyMigrationDestinationToProxies(
1447  deviceIndex,
1448  numPatchesHome,
1449  numPatchesHomeAndProxy,
1450  patchData->devData[deviceIndex].d_localPatches,
1451  patchData->devData[deviceIndex].d_peerPatches,
1452  d_peer_migrationDestination,
1453  stream
1454  );
1455  }
1456 }
1457 
1458 void SequencerCUDA::copyPatchDataToHost() {
1459  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1460  patchData = cpdata.ckLocalBranch();
1461  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1462 
1463  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1464  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1465 
1466  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
1467  cudaCheck(cudaStreamSynchronize(stream));
1468 
1469 
1470  // Update Atom Counts
1471  for (int i = 0; i < numPatchesHome; i++) {
1472  HomePatch* hp = integrationPatches[i];
1473  hp->updateAtomCount(localPatches[i].numAtoms, false);
1474  }
1475  cudaCheck(cudaStreamSynchronize(stream));
1476 }
1477 
1478 // This code path is used only by device migration
1479 void SequencerCUDA::copyAtomDataToDeviceAoS() {
1480  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1481  patchData = cpdata.ckLocalBranch();
1482 
1483  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1484  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1485  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1486 
1487 
1488  for (int i = 0; i < integrationPatches.size(); i++) {
1489  const int numAtoms = localPatches[i].numAtoms;
1490  if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
1491  iout << iERROR << "The number of atoms in patch " << i << " is "
1492  << numAtoms << ", greater than the limit for GPU atom migration ("
1493  << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
1494  NAMD_bug("NAMD has stopped simulating due to the error above, "
1495  "but you could disable GPUAtomMigration and try again.\n");
1496  }
1497  const int offset = localPatches[i].bufferOffset;
1498  HomePatch *patch = integrationPatches[i];
1499  FullAtomList& h_atomdata = patch->getAtomList();
1500  copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), d_atomdata_AoS_in + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
1501  }
1502  cudaCheck(cudaStreamSynchronize(stream));
1503 }
1504 
1505 /*
1506  * Aggregates and copies various data from the host to the device.
1507  *
1508  * Some data is copied only for home patches, while other data is copied for
1509  * both home and proxy patches.
1510  *
1511  */
1512 void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
1513 
1514  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
1515  if(copyForces){
1516  switch (maxForceNumber) {
1517  case 2:
1518  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_x, double, stream);
1519  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_y, double, stream);
1520  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_z, double, stream);
1521  case 1:
1522  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_x, double, stream);
1523  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_y, double, stream);
1524  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_z, double, stream);
1525  case 0:
1526  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_x, double, stream);
1527  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_y, double, stream);
1528  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_z, double, stream);
1529  }
1530  }
1531 
1532  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);
1533  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_y, double, stream);
1534  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_z, double, stream);
1535  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_x, double, stream);
1536  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_y, double, stream);
1537  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_z, double, stream);
1538  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
1539  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(charge, float, stream);
1540  if (simParams->alchOn) {
1541  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(partition, int, stream);
1542  }
1543 
1544  if (simParams->langevinOn) {
1545  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
1546  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
1547  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);
1548  }
1549 
1550  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
1551  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
1552  if (simParams->fixedAtomsOn) {
1553  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
1554  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
1555  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
1556  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);
1557  }
1558  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(sortOrder, int, stream);
1559  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(unsortOrder, int, stream);
1560  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
1561 
1562  if (simParams->monteCarloPressureOn) {
1563  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);
1564  // set up the initial mapping from global index to local array index
1565  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
1566  }
1567 }
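// A minimal, self-contained sketch of the aggregation pattern behind the
// AGGREGATE_* macros used above: per-patch SOA arrays are packed back-to-back
// into one contiguous host buffer so that a single host-to-device copy can
// follow. MiniPatchSOA is a simplified stand-in for PatchDataSOA, and the
// actual copy_HtoD call is only indicated by a comment.
#include <cstring>
#include <vector>

struct MiniPatchSOA {
  int numAtoms;
  std::vector<double> pos_x;   // one field of the per-patch SOA
};

// Pack one field from every patch into a single contiguous staging buffer.
static size_t aggregateField(const std::vector<MiniPatchSOA>& patches,
                             std::vector<double>& staging) {
  size_t offset = 0;
  for (const MiniPatchSOA& p : patches) {
    std::memcpy(staging.data() + offset, p.pos_x.data(), p.numAtoms * sizeof(double));
    offset += p.numAtoms;
  }
  // A single copy_HtoD(staging.data(), d_field, offset, stream) would follow here.
  return offset;
}

int main() {
  std::vector<MiniPatchSOA> patches = { {2, {1.0, 2.0}}, {3, {3.0, 4.0, 5.0}} };
  size_t total = 0;
  for (const MiniPatchSOA& p : patches) total += p.numAtoms;
  std::vector<double> staging(total);
  return aggregateField(patches, staging) == total ? 0 : 1;
}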
1568 
1569 
1570 void SequencerCUDA::migrationLocalPost(int startup) {
1571  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1572  patchData = cpdata.ckLocalBranch();
1573 
1574  if (simParams->useDeviceMigration) {
1575  if (!startup) {
1576  CUDAMigrationKernel->transformMigratedPositions(
1577  numPatchesHome,
1578  patchData->devData[deviceIndex].d_localPatches,
1579  d_patchCenter,
1580  (FullAtom*) d_atomdata_AoS_in,
1581  myLattice,
1582  stream
1583  );
1584  }
1585 
1586  // sort solvent/solute data
1587  CUDAMigrationKernel->sortSolventAtoms(
1588  numPatchesHome,
1589  patchData->devData[deviceIndex].d_localPatches,
1590  (FullAtom*) d_atomdata_AoS_in,
1591  (FullAtom*) d_atomdata_AoS,
1592  d_sortSoluteIndex,
1593  stream
1594  );
1595 
1596  double dt = 1.0;
1597  double kbT = 1.0;
1598  double tempFactor = 1.0;
1599  if (simParams->langevinOn) {
1600  dt = simParams->dt * 0.001; // convert timestep to ps
1601  kbT = BOLTZMANN * simParams->langevinTemp;
1602  int lesReduceTemp = (simParams->lesOn && simParams->lesReduceTemp);
1603  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);
1604  }
1605  CUDAMigrationKernel->copy_AoS_to_SoA(
1606  numPatchesHome, simParams->alchOn,
1607  simParams->langevinOn, dt, kbT, tempFactor,
1608  patchData->devData[deviceIndex].d_localPatches,
1609  (FullAtom*) d_atomdata_AoS,
1610  d_recipMass,
1611  d_vel_x, d_vel_y, d_vel_z,
1612  d_pos_x, d_pos_y, d_pos_z,
1613  d_mass, d_charge,
1614  d_idMig, d_vdwType,
1615  d_hydrogenGroupSize, d_migrationGroupSize,
1616  d_atomFixed,
1617  d_rigidBondLength,
1618  d_transform,
1619  d_partition,
1620  d_langevinParam,
1621  d_langScalVelBBK2,
1622  d_langScalRandBBK2,
1623  stream
1624  );
1625  }
1626 
1627  // Other migration post processing steps
1628  if (simParams->useDeviceMigration) {
1629  if (simParams->monteCarloPressureOn) {
1630  CUDASequencerKernel->SetAtomIndexOrder(d_idMig, d_idOrder, numAtomsHome, stream);
1631  }
1632  }
1633 
1634  // JM: Saving positions into these buffers for pairlistCheck
1635  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
1636  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
1637  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
1638 
1639  // JM NOTE: We need to save the lattice at the beginning of the cycle
1640  // in order to use it in SequencerCUDAKernel::pairlistCheck();
1641  myLatticeOld = myLattice;
1642 
1643  // JM NOTE: Recalculates the centers of mass since we have new positions
1644  CUDASequencerKernel->centerOfMass(
1645  d_pos_x, d_pos_y, d_pos_z,
1646  d_rcm_x, d_rcm_y, d_rcm_z,
1647  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1648  CUDASequencerKernel->centerOfMass(
1649  d_vel_x, d_vel_y, d_vel_z,
1650  d_vcm_x, d_vcm_y, d_vcm_z,
1651  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1652 
1653  cudaCheck(cudaStreamSynchronize(stream));
1654 }
1655 
1656 void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
1657  if(simParams->eFieldOn || simParams->SMDOn || simParams->groupRestraintsOn ||
1658  simParams->monteCarloPressureOn || simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn ||
1659  simParams->colvarsOn ||
1660  simParams->useCudaGlobal ||
1661  simParams->tclForcesOn){
1662 
1663  // Hand-copies the transform field into a char3 SOA buffer
1664  // JM NOTE: We're copying ints into chars because "transform" is an int
1665  // field in PatchDataSOA but a signed char in FullAtom
1666  size_t offset = 0;
1667  for (int i = 0; i < numPatchesHome; i++) {
1668  PatchDataSOA& current = patchList[i]->patchDataSOA;
1669  const int numPatchAtoms = current.numAtoms;
1670  // memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type));
1671  for(int j = 0; j < numPatchAtoms; j++){
1672  transform[offset + j].x = current.transform_i[j];
1673  transform[offset + j].y = current.transform_j[j];
1674  transform[offset + j].z = current.transform_k[j];
1675  }
1676  offset += numPatchAtoms;
1677  }
1678  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);
1679  }
1680 
1681  if (!startup) {
1683  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);
1684  }
1685  if(simParams->constraintsOn) {
1686  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1687  }
1688  if(simParams->SMDOn) {
1689  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1690  }
1691  if(simParams->groupRestraintsOn) {
1692  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1693  }
1694  if(simParams->mgridforceOn || simParams->gridforceOn){
1695 
1696  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
1697  }
1698  if(simParams->consForceOn) {
1699  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1700  }
1701 
1702  }
1703 }
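// A minimal, self-contained sketch of the narrowing copy performed in
// migrationUpdateAdvancedFeatures above: per-atom transform indices stored as
// ints in the SOA layout are packed into a char3-style array before the copy
// to the device. MiniChar3 is a stand-in for CUDA's char3.
#include <vector>

struct MiniChar3 { signed char x, y, z; };

// Pack int transform indices (PatchDataSOA layout) into the signed-char layout
// used on the device. The values are small lattice image counts, so the
// narrowing conversion is assumed to be lossless.
static void packTransforms(const std::vector<int>& ti, const std::vector<int>& tj,
                           const std::vector<int>& tk, std::vector<MiniChar3>& out) {
  out.resize(ti.size());
  for (size_t a = 0; a < ti.size(); a++) {
    out[a].x = (signed char)ti[a];
    out[a].y = (signed char)tj[a];
    out[a].z = (signed char)tk[a];
  }
}

int main() {
  std::vector<int> ti = {0, 1, -1}, tj = {0, 0, 2}, tk = {1, -2, 0};
  std::vector<MiniChar3> packed;
  packTransforms(ti, tj, tk, packed);
  return packed.size() == ti.size() ? 0 : 1;
}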
1704 
1705 void SequencerCUDA::migrationUpdateDestination() {
1706  CUDAMigrationKernel->updateMigrationDestination(
1707  numAtomsHomePrev,
1708  d_migrationDestination,
1709  d_peer_sortSoluteIndex,
1710  stream
1711  );
1712 }
1713 
1714 bool SequencerCUDA::copyPatchData(
1715  const bool copyIn,
1716  const bool startup
1717 ) {
1718  bool reallocated = false;
1719  if (copyIn) {
1720  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1721  patchData = cpdata.ckLocalBranch();
1722 
1723  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1724 
1725  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1726  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1727 
1728  if (startup) {
1729  // numPatchesHomeAndProxy is set when the patch data is constructed
1730  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1731  numPatchesHome = homePatches.size();
1732  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;
1733 
1734  if (simParams->useDeviceMigration) {
1735  // Padded data structures which will not be reallocated
1736  allocate_device<FullAtom>(&d_atomdata_AoS_in, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch);
1737  allocate_device<int>(&d_sortSoluteIndex, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch);
1738  d_migrationDestination = NULL;
1739  }
1740 #if defined(NAMD_HIP)
1741  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
1742  sizeof(CudaLocalRecord)*numPatchesHomeAndProxy,
1743  hipDeviceMallocFinegrained);
1744  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
1745  sizeof(CudaPeerRecord)*peerPatches.size(),
1746  hipDeviceMallocFinegrained);
1747 #else
1748  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
1749  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
1750 #endif
1751  if (simParams->useDeviceMigration) {
1752  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);
1753  }
1754 
1755  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1756  numPatchesHomeAndProxy, stream);
1757  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1758  peerPatches.size(), stream);
1759  if(true || mGpuOn) {
1760  this->assembleOrderedPatchList();
1761  }
1762  this->copySettleParameter();
1763 
1764  for (int i = 0; i < numPatchesHome; i++) {
1765  HomePatch *patch = homePatches[i];
1766  this->copyMigrationInfo(patch, i);
1767  patchNewTolerance[i] = 0.5 * ( simParams->pairlistDist - simParams->cutoff);
1768 
1769  globalToLocalID[patch->getPatchID()] = i;
1770  patchToDeviceMap[patch->getPatchID()] = deviceIndex;
1771  }
1772  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
1773  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);
1774 
1775  // The globalToLocalID and patchToDeviceMap data structures need to be system-wide for migration.
1776  // They are also used in tuple migration, so we store them in patchData where they can be easily
1777  // accessed elsewhere
1778  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
1779  if (i == deviceIndex) continue;
1780  std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
1781  for (int j = 0; j < otherPatches.size(); j++) {
1782  HomePatch *patch = otherPatches[j];
1783  globalToLocalID[patch->getPatchID()] = j;
1784  patchToDeviceMap[patch->getPatchID()] = i;
1785  }
1786  }
1787  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
1788  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
1789  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
1790  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;
1791 
1792  // Allocate the device-side array of per-patch PatchDataSOA descriptors
1793  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);
1794  }
1795 
1796  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1797  HomePatch *patch = patchListHomeAndProxy[i];
1798  awayDists[i].x = patch->aAwayDist;
1799  awayDists[i].y = patch->bAwayDist;
1800  awayDists[i].z = patch->cAwayDist;
1801  COPY_CUDAVECTOR(patch->center, patchCenter[i]);
1802  COPY_CUDAVECTOR(patch->getMin(), patchMin[i]);
1803  COPY_CUDAVECTOR(patch->getMax(), patchMax[i]);
1804  }
1805 
1806  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
1807  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
1808  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
1809  copy_HtoD<double3>(patchCenter, d_patchCenter, numPatchesHomeAndProxy, stream);
1810 
1811  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
1812  localPatches[numPatchesHomeAndProxy-1].numAtoms;
1813 
1814  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
1815  localPatches[numPatchesHome-1].numAtoms;
1816 
1817  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);
1818 
1819 
1820  numAtomsHomePrev = numAtomsHome;
1821  numAtomsHomeAndProxy = totalAtomCount;
1822  numAtomsHome = homeAtomCount;
1823 
1824  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;
1825 
1826  if (!startup) {
1827  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1828  numPatchesHomeAndProxy, stream);
1829  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1830  peerPatches.size(), stream);
1831  }
1832  if (startup) {
1833  if (simParams->monteCarloPressureOn) {
1834  //Only at startup, copy the molecule's index info to GPU
1835  Molecule *molecule = Node::Object()->molecule;
1836  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
1837  copy_HtoD<int>(molecule->moleculeStartIndex, d_moleculeStartIndex, molecule->numMolecules + 1, stream);
1838  }
1839  }
1840  }
1841  return reallocated;
1842 }
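// A minimal, self-contained sketch of the bookkeeping used above: bufferOffset
// is an exclusive prefix sum over per-patch atom counts, so the home (or total)
// atom count is simply the last record's offset plus its count. MiniLocalRecord
// is a simplified stand-in for CudaLocalRecord.
#include <vector>

struct MiniLocalRecord { int numAtoms; int bufferOffset; };

// Assign bufferOffset as an exclusive prefix sum over patch atom counts and
// return the total number of atoms.
static int buildOffsets(std::vector<MiniLocalRecord>& records) {
  int offset = 0;
  for (MiniLocalRecord& r : records) {
    r.bufferOffset = offset;
    offset += r.numAtoms;
  }
  return offset;
}

int main() {
  std::vector<MiniLocalRecord> records = { {100, 0}, {96, 0}, {104, 0} };
  const int total = buildOffsets(records);
  // Same identity used in copyPatchData: last offset + last count == total.
  const int back = records.back().bufferOffset + records.back().numAtoms;
  return (total == back) ? 0 : 1;
}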
1843 
1844 void SequencerCUDA::copyDataToPeers(
1845  const bool copyIn
1846 ) {
1847  if (!copyIn) return;
1848  // Positions will be copied by the kernel
1849  // Forces don't need to be copied
1850  // Atom data that needs to be copied: sortOrder, unsortOrder
1851 
1852  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1853  patchData = cpdata.ckLocalBranch();
1854  if (mGpuOn) {
1855  CUDAMigrationKernel->copyDataToProxies(
1856  deviceIndex,
1857  numPatchesHome,
1858  numPatchesHomeAndProxy,
1859  patchData->devData[deviceIndex].d_localPatches,
1860  d_peer_id,
1861  d_peer_vdwType,
1862  d_peer_sortOrder,
1863  d_peer_unsortOrder,
1864  d_peer_charge,
1865  d_peer_partition,
1866  d_peer_patchCenter,
1867  simParams->alchOn,
1868  stream
1869  );
1870  }
1871  cudaCheck(cudaStreamSynchronize(stream));
1872 }
1873 
1874 void SequencerCUDA::migrationSortAtomsNonbonded() {
1875  CUDAMigrationKernel->sortAtoms(
1876  numPatchesHome, numAtomsHome,
1877  patchData->devData[deviceIndex].d_localPatches,
1878  patchMin, patchMax,
1879  d_pos_x, d_pos_y, d_pos_z,
1880  d_sortOrder,
1881  d_unsortOrder,
1882  d_sortIndex,
1883  stream
1884  );
1885 }
1886 
1887 void SequencerCUDA::maximumMove(
1888  const double maxvel2,
1889  const int numAtoms)
1890 {
1891  CUDASequencerKernel->maximumMove(
1892  maxvel2, d_vel_x, d_vel_y, d_vel_z,
1893  killme, numAtoms, stream);
1894 }
1895 
1896 void SequencerCUDA::submitHalf(
1897  int numAtoms, int part, const bool doEnergy)
1898 {
1899  //BigReal kineticEnergy;
1900  Tensor reduction_virial;
1901  //cudaTensor h_virial;
1902  //BigReal intKineticEnergy;
1903  Tensor reduction_intVirialNormal;
1904  //cudaTensor h_intVirialNormal;
1905  int hgs;
1906 
1907  if(doEnergy){
1908 #if 0
1909  cudaCheck(cudaEventRecord(eventStart,stream));
1910 #endif
1911  CUDASequencerKernel->submitHalf(
1912  simParams->fixedAtomsOn,
1913  d_vel_x, d_vel_y, d_vel_z,
1914  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
1915  d_kineticEnergy, d_intKineticEnergy,
1916  d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
1917  virial_half, intVirialNormal_half,
1918  d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);
1919 #if 0
1920  cudaCheck(cudaEventRecord(eventStop, stream));
1921  cudaCheck(cudaEventSynchronize(eventStop));
1922  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
1923  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
1924  t_submitReductions2 = 0;
1925 #endif
1926  }
1927 }
1928 
1929 void SequencerCUDA::submitReductions(
1930  BigReal origin_x,
1931  BigReal origin_y,
1932  BigReal origin_z,
1933  int marginViolations,
1934  int doEnergy,
1935  int doMomentum,
1936  int numAtomsReduction,
1937  int maxForceNumber)
1938 {
1939  // reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsReduction; //moved to launch_part2, startRun2
1940  // where do I get the margin violations?
1941  // reduction->item(REDUCTION_MARGIN_VIOLATIONS) += marginViolations;
1942  if(doEnergy){
1943  if(doMomentum){
1944  // JM NOTE: Calculates momenta if copyOut
1945  CUDASequencerKernel->submitReduction1(
1946  d_pos_x, d_pos_y, d_pos_z,
1947  d_vel_x, d_vel_y, d_vel_z, d_mass,
1948  d_kineticEnergy,
1949  d_momentum_x, d_momentum_y, d_momentum_z,
1950  d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1951  origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1952  momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1953  numAtomsReduction, stream);
1954  }
1955  Tensor regintVirialNormal;
1956  Tensor regintVirialNbond;
1957  Tensor regintVirialSlow;
1958 
1959 #if 0
1960  cudaCheck(cudaEventRecord(eventStart,stream));
1961 #endif
1962  CUDASequencerKernel->submitReduction2(
1963  simParams->fixedAtomsOn, d_atomFixed,
1964  d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
1965  d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1966  d_f_normal_x, d_f_normal_y, d_f_normal_z,
1967  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1968  d_f_slow_x, d_f_slow_y, d_f_slow_z,
1969  d_mass, d_hydrogenGroupSize,
1970  d_kineticEnergy, kineticEnergy,
1971  d_intKineticEnergy, intKineticEnergy,
1972  d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1973  intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1974  d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);
1975  // Haochuan: actually should be doVirial here
1976  if (simParams->fixedAtomsOn) {
1977  CUDASequencerKernel->calcFixVirial(
1978  maxForceNumber, numAtomsReduction, d_atomFixed,
1979  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1980  d_f_normal_x, d_f_normal_y, d_f_normal_z,
1981  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1982  d_f_slow_x, d_f_slow_y, d_f_slow_z,
1983  d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1984  d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1985  }
1986  }
1987 #if 0
1988  cudaCheck(cudaEventRecord(eventStop, stream));
1989  cudaCheck(cudaEventSynchronize(eventStop));
1990  cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1991  fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1992  t_submitReductions2 = 0;
1993 #endif
1994 }
1995 
1996 void SequencerCUDA::copySettleParameter(){
1997  // Searching for a patch that contains initialized settle parameters
1998  cudaCheck(cudaSetDevice(deviceID));
1999  if(simParams->rigidBonds != RIGID_NONE){
2000  HomePatch *patch = NULL;
2001  // PatchList contains all patches in the node, so if there's a single water in the system,
2002  // this is guaranteed to catch it
2003  for(int i = 0; i < patchList.size(); i++){
2004  if(patchList[i]->settle_initialized) {
2005  patch = patchList[i];
2006  break;
2007  }
2008  }
2009  if ( patch ) {
2010  SettleParameters h_sp;
2011  h_sp.mO = patch->settle_mO;
2012  h_sp.mH = patch->settle_mH;
2013  h_sp.mOrmT = patch->settle_mOrmT;
2014  h_sp.mHrmT = patch->settle_mHrmT;
2015  h_sp.rra = patch->settle_rra;
2016  h_sp.ra = patch->settle_ra;
2017  h_sp.rb = patch->settle_rb;
2018  h_sp.rc = patch->settle_rc;
2019  h_sp.r_om = patch->r_om;
2020  h_sp.r_ohc = patch->r_ohc;
2021 
2022  // fprintf(stderr, "SETTLEPARAMETER Found: Values %lf %lf %lf %lf %lf %lf %lf %lf\n",
2023  // h_sp.mO, h_sp.mH, h_sp.mOrmT, h_sp.mHrmT, h_sp.rra, h_sp.ra, h_sp.rb, h_sp.rc );
2024  copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
2025  }
2026  }
2027 }
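// The search above is a plain "first patch with initialized SETTLE parameters"
// scan; a compact equivalent using std::find_if is sketched below with a
// simplified MiniPatch stand-in (not the real HomePatch type).
#include <algorithm>
#include <vector>

struct MiniPatch { bool settle_initialized; double settle_mO; };

int main() {
  MiniPatch a{false, 0.0}, b{true, 15.9994};
  std::vector<MiniPatch*> patchList = { &a, &b };

  // First patch whose SETTLE parameters have been initialized, if any.
  auto it = std::find_if(patchList.begin(), patchList.end(),
                         [](const MiniPatch* p) { return p->settle_initialized; });
  MiniPatch* found = (it != patchList.end()) ? *it : nullptr;
  return found ? 0 : 1;
}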
2028 
2029 // Does rattle1_SOA
2030 void SequencerCUDA::startRun1(
2031  int maxForceNumber,
2032  const Lattice& lattice
2033 ) {
2034  // cudaCheck(cudaSetDevice(deviceID));
2035  myLattice = lattice;
2036 
2037  // JM: Enforcing rigid bonds on first iteration
2038  CUDASequencerKernel->rattle1(
2039  simParams->fixedAtomsOn, 1, 0,
2040  numAtomsHome, 0.f, 0.f,
2041  2.0 * simParams->rigidTol,
2042  d_vel_x, d_vel_y, d_vel_z,
2043  d_pos_x, d_pos_y, d_pos_z,
2044  d_velNew_x, d_velNew_y, d_velNew_z,
2045  d_posNew_x, d_posNew_y, d_posNew_z,
2046  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2047  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2048  &settleList, settleListSize, &d_consFailure,
2049  d_consFailureSize, &rattleList, rattleListSize,
2050  &nSettle, &nRattle,
2051  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2052  buildRigidLists, consFailure, simParams->watmodel, stream);
2053 
2054  this->copyPositionsAndVelocitiesToHost(1, 0);
2055  cudaCheck(cudaDeviceSynchronize());
2056 #if 0
2057  printSOAPositionsAndVelocities();
2058 #endif
2059 }
2060 
2061 void SequencerCUDA::startRun2(
2062  double dt_normal,
2063  double dt_nbond,
2064  double dt_slow,
2065  Vector origin,
2066  int doGlobal,
2067  int maxForceNumber
2068 ){
2069  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2070 
2071  // This is a patch-based kernel: each thread block handles an entire patch.
2072  // So, in the multi-GPU case, we keep non-offset pointers but deal only
2073  // with this device's subset of patches
2074 
2075  // We don't need to (and should not) zero the normal force buffer:
2076  // it stores the global forces, and the normal forces are overwritten
2077  // every step rather than accumulated
2078  // cudaCheck(cudaMemset(d_f_normal_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2079  // cudaCheck(cudaMemset(d_f_normal_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2080  // cudaCheck(cudaMemset(d_f_normal_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2081 
2082  cudaCheck(cudaMemset(d_f_nbond_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2083  cudaCheck(cudaMemset(d_f_nbond_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2084  cudaCheck(cudaMemset(d_f_nbond_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2085 
2086  cudaCheck(cudaMemset(d_f_slow_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2087  cudaCheck(cudaMemset(d_f_slow_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2088  cudaCheck(cudaMemset(d_f_slow_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2089 
2090  CUDASequencerKernel->accumulateForceToSOA(
2091  doGlobal,
2092  simParams->useCudaGlobal,
2093  maxForceNumber,
2094  numPatchesHomeAndProxy,
2095  nDevices,
2096  patchData->devData[deviceIndex].d_localPatches,
2097  patchData->devData[deviceIndex].f_bond,
2098  patchData->devData[deviceIndex].f_bond_nbond,
2099  patchData->devData[deviceIndex].f_bond_slow,
2100  patchData->devData[deviceIndex].forceStride,
2101  patchData->devData[deviceIndex].f_nbond,
2102  patchData->devData[deviceIndex].f_nbond_slow,
2103  patchData->devData[deviceIndex].f_slow,
2104  d_f_global_x,
2105  d_f_global_y,
2106  d_f_global_z,
2107  d_f_normal_x,
2108  d_f_normal_y,
2109  d_f_normal_z,
2110  d_f_nbond_x,
2111  d_f_nbond_y,
2112  d_f_nbond_z,
2113  d_f_slow_x,
2114  d_f_slow_y,
2115  d_f_slow_z,
2116  d_unsortOrder,
2117  myLattice,
2118  patchData->d_queues,
2119  patchData->d_queueCounters,
2120  d_tbcatomic,
2121  stream
2122  );
2123  if(mGpuOn){
2124  if(SMDKernel)
2125  {
2126  // compute distributed center of mass for groups of atoms
2127  SMDKernel->computeCOMMGpu(myLattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
2128  d_transform, stream);
2129  }
2130  if(groupRestraintsKernel)
2131  {
2132  groupRestraintsKernel->doCOM_mgpu(myLattice, d_transform,
2133  d_mass, d_pos_x, d_pos_y, d_pos_z,
2134  stream);
2135  }
2136  // Synchronize device before node barrier
2137  cudaCheck(cudaDeviceSynchronize());
2138  }
2139 #if 0
2140  printSOAPositionsAndVelocities();
2141 #endif
2142 }
2143 
2144 void SequencerCUDA::startRun3(
2145  double dt_normal,
2146  double dt_nbond,
2147  double dt_slow,
2148  Vector origin,
2149  const bool requestGlobalForces,
2150  int doGlobalMasterStaleForces,
2151  const bool requestForcesOutput,
2152  const bool requestGlobalForcesGPU,
2153  int maxForceNumber
2154 ){
2155  const bool doFixed = simParams->fixedAtomsOn;
2156  if(mGpuOn){
2157  // XXX TODO we need to call the force merging kernel here
2158 #if 1
2159  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2160  std::vector<int> atom_counts;
2161  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2162  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2163  }
2164  CUDASequencerKernel->mergeForcesFromPeers(
2165  deviceIndex,
2166  maxForceNumber,
2167  myLattice,
2168  numPatchesHomeAndProxy,
2169  numPatchesHome,
2170  this->d_peer_fb_x,
2171  this->d_peer_fb_y,
2172  this->d_peer_fb_z,
2173  this->d_peer_fn_x,
2174  this->d_peer_fn_y,
2175  this->d_peer_fn_z,
2176  this->d_peer_fs_x,
2177  this->d_peer_fs_y,
2178  this->d_peer_fs_z,
2179  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2180  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2181  patchData->devData[deviceIndex].d_localPatches,
2182  patchData->devData[deviceIndex].d_peerPatches,
2183  atom_counts,
2184  stream
2185  );
2186 #else
2187 
2188  // Before I call nccl, let's see the forces here
2189  // ncclAllReduce(d_f_normal_x, d_f_normal_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2190 
2191  // ncclAllReduce(d_f_normal_y, d_f_normal_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2192  // ncclAllReduce(d_f_normal_z, d_f_normal_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2193  // ncclAllReduce(d_f_nbond_x, d_f_nbond_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2194  // ncclAllReduce(d_f_nbond_y, d_f_nbond_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2195  // ncclAllReduce(d_f_nbond_z, d_f_nbond_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2196  // ncclAllReduce(d_f_slow_x, d_f_slow_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2197  // ncclAllReduce(d_f_slow_y, d_f_slow_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2198  // ncclAllReduce(d_f_slow_z, d_f_slow_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2199  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2200  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2201 #endif
2202  }
2203  if(doGlobalMasterStaleForces)
2204  {
2205  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
2206  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
2207  computeGlobalMasterVirial(
2208  numPatchesHomeAndProxy,
2209  numAtomsHome,
2210  patchData->devData[deviceIndex].d_localPatches,
2211  d_pos_x,
2212  d_pos_y,
2213  d_pos_z,
2214  d_transform,
2215  d_f_global_x,
2216  d_f_global_y,
2217  d_f_global_z,
2218  &d_extForce[EXT_GLOBALMTS],
2219  &extForce[EXT_GLOBALMTS],
2220  &d_extVirial[EXT_GLOBALMTS],
2221  &extVirial[EXT_GLOBALMTS],
2222  myLattice,
2223  d_tbcatomic,
2224  stream);
2225  }
2226 
2227  // compute external forces and store the energy and virial
2228  calculateExternalForces(simParams->firstTimestep, maxForceNumber, 1, 1);
2229 #if 0
2230  cudaCheck(cudaDeviceSynchronize());
2231  if(true || deviceID == 0){
2232  char prefix[10];
2233  snprintf(prefix, 10, "step-%d",0);
2234  this->printSOAForces(prefix);
2235  }
2236 #endif
2237 
2238  CUDASequencerKernel->addForceToMomentum(
2239  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2240  d_recipMass,
2241  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2242  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2243  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2244  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2245  numAtomsHome, maxForceNumber, stream);
2246 
2247  CUDASequencerKernel->rattle1(
2248  simParams->fixedAtomsOn,1, 0,
2249  numAtomsHome, -dt_normal, -1.0/(dt_normal),
2250  2.0 * simParams->rigidTol,
2251  d_vel_x, d_vel_y, d_vel_z,
2252  d_pos_x, d_pos_y, d_pos_z,
2253  d_velNew_x, d_velNew_y, d_velNew_z,
2254  d_posNew_x, d_posNew_y, d_posNew_z,
2255  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2256  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2257  &settleList, settleListSize, &d_consFailure,
2258  d_consFailureSize, &rattleList, rattleListSize,
2259  &nSettle, &nRattle,
2260  d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2261  true, consFailure, simParams->watmodel, stream);
2262 
2263  CUDASequencerKernel->centerOfMass(
2264  d_vel_x, d_vel_y, d_vel_z,
2265  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2266  d_hydrogenGroupSize, numAtomsHome, stream);
2267 
2268 
2269  {
2270  // SubmitHalf and its corresponding reductions
2271  submitHalf(numAtomsHome, 1, 1);
2272  // submitHalf reductions
2273  cudaCheck(cudaStreamSynchronize(stream));
2274  Tensor reduction_virial;
2275  Tensor reduction_intVirialNormal;
2276  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2277  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2278  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2279  // Haochuan: the tensor is not symmetric when there are fixed atoms
2280  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2281  reduction_virial *= 0.5;
2282  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2283  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2284  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2285  += (intKineticEnergy_half[0] * 0.25);
2286  reduction_intVirialNormal *= 0.5;
2287  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2288  reduction_intVirialNormal);
2289  }
2290 
2291  CUDASequencerKernel->addForceToMomentum(
2292  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2293  d_recipMass,
2294  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2295  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2296  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2297  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2298  numAtomsHome, maxForceNumber, stream);
2299 
2300  CUDASequencerKernel->rattle1(
2301  simParams->fixedAtomsOn, 1, 1,
2302  numAtomsHome, dt_normal, 1.0/dt_normal,
2303  2.0 * simParams->rigidTol,
2304  d_vel_x, d_vel_y, d_vel_z,
2305  d_pos_x, d_pos_y, d_pos_z,
2306  d_velNew_x, d_velNew_y, d_velNew_z,
2307  d_posNew_x, d_posNew_y, d_posNew_z,
2308  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2309  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2310  &settleList, settleListSize, &d_consFailure,
2311  d_consFailureSize, &rattleList, rattleListSize,
2312  &nSettle, &nRattle,
2313  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2314  buildRigidLists, consFailure, simParams->watmodel, stream);
2315 
2316  CUDASequencerKernel->centerOfMass(
2317  d_vel_x, d_vel_y, d_vel_z,
2318  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2319  d_hydrogenGroupSize, numAtomsHome, stream);
2320 
2321  {
2322  // JM: SubmitHalf and its corresponding reductions
2323  submitHalf(numAtomsHome, 1, 1);
2324  // submitHalf reductions
2325  cudaCheck(cudaStreamSynchronize(stream));
2326  Tensor reduction_virial;
2327  Tensor reduction_intVirialNormal;
2328  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2329  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2330  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2331  // Haochuan: the tensor is not symmetric when there are fixed atoms
2332  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2333  reduction_virial *= 0.5;
2334  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2335  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2336  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2337  += (intKineticEnergy_half[0] * 0.25);
2338  reduction_intVirialNormal *= 0.5;
2339  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2340  reduction_intVirialNormal);
2341  }
2342 
2343  CUDASequencerKernel->addForceToMomentum(
2344  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2345  d_recipMass,
2346  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2347  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2348  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2349  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2350  numAtomsHome, maxForceNumber, stream);
2351 
2352  if(requestGlobalForces || requestForcesOutput) {
2353  // store the forces for next step,
2354  // when we need it for colvars and Tcl scripting
2355  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2356  }
2357 
2358  if (requestGlobalForcesGPU) {
2359  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2360  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2361  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2362  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2363  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2364  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2365  CUDASequencerKernel->copyForcesToDevice(
2366  numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2367  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2368  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2369  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2370  // cudaCheck(cudaStreamSynchronize(stream));
2371  }
2372 
2373  CUDASequencerKernel->centerOfMass(
2374  d_vel_x, d_vel_y, d_vel_z,
2375  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2376  d_hydrogenGroupSize, numAtomsHome, stream);
2377 
2378  submitReductions(origin.x, origin.y, origin.z,
2379  marginViolations, 1,
2380  1,
2381  numAtomsHome, maxForceNumber);
2382 
2383  copyPositionsAndVelocitiesToHost(1, 0);
2384 
2385  if(consFailure[0]){
2386  // Constraint failure: abort if rigidDie is set, otherwise warn.
2387  int dieOnError = simParams->rigidDie;
2388  if(dieOnError){
2389  // Bails out
2390  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
2391  NAMD_die("constraint failure during CUDA rattle!\n");
2392  }else{
2393  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2394  }
2395  }else if(1){
2396  cudaCheck(cudaStreamSynchronize(stream));
2397  if (doGlobalMasterStaleForces) {
2398  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
2399  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
2400  // PRINT_CUDATENSOR(extVirial[EXT_GLOBALMTS], iout);
2401  }
2402  if(simParams->rigidBonds != RIGID_NONE){
2403  Tensor reduction_rigidVirial;
2404  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
2405  // Haochuan: the tensor is not symmetric when there are fixed atoms
2406  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2407  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
2408  }
2409 
2410  //submitReductions1
2411  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
2412  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2413  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
2414  Vector angularMomentum(*angularMomentum_x,
2415  *angularMomentum_y,
2416  *angularMomentum_z);
2417  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
2418  //submitReductions2
2419  Tensor regintVirialNormal;
2420  Tensor regintVirialNbond;
2421  Tensor regintVirialSlow;
2422  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
2423  if (maxForceNumber >= 1) {
2424  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
2425  }
2426  if (maxForceNumber >= 2) {
2427  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
2428  }
2429 
2430  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
2431  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
2432  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
2433  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
2434 
2435  if (simParams->fixedAtomsOn) {
2436  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2437  double3 fixForceNormal, fixForceNbond, fixForceSlow;
2438  switch (maxForceNumber) {
2439  case 2: {
2440  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2441  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2442  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
2443  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
2444  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
2445  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2446  } // intentional fallthrough
2447  case 1: {
2448  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2449  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2450  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
2451  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
2452  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
2453  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2454  } // intentional fallthrough
2455  default: {
2456  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2457  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2458  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
2459  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
2460  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
2461  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2462  }
2463  }
2464 #if 0
2465  auto printTensor = [](const cudaTensor& t, const std::string& name){
2466  CkPrintf("%s", name.c_str());
2467  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2468  "%12.5lf %12.5lf %12.5lf\n"
2469  "%12.5lf %12.5lf %12.5lf\n",
2470  t.xx, t.xy, t.xz,
2471  t.yx, t.yy, t.yz,
2472  t.zx, t.zy, t.zz);
2473  };
2474  printTensor(fixVirialNormal, "fixVirialNormal = ");
2475  printTensor(fixVirialNbond, "fixVirialNbond = ");
2476  printTensor(fixVirialSlow, "fixVirialSlow = ");
2477 #endif
2478 
2479  }
2480  }
2481 
2482 #if 0
2483  if(deviceID == 0){
2484  this->printSOAForces(NULL);
2485  }
2486 #endif
2487 
2488 #if 0
2489  printSOAPositionsAndVelocities();
2490 #endif
2491 }
2492 
2493 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2494 {
2495  // Restore the myLattice
2496  myLattice = lattice;
2497  double *temp;
2498 
2499  // Restore positions and forces
2500  temp = d_f_normal_x; d_f_normal_x = d_f_normalMC_x; d_f_normalMC_x = temp;
2501  temp = d_f_normal_y; d_f_normal_y = d_f_normalMC_y; d_f_normalMC_y = temp;
2502  temp = d_f_normal_z; d_f_normal_z = d_f_normalMC_z; d_f_normalMC_z = temp;
2503  temp = d_f_nbond_x; d_f_nbond_x = d_f_nbondMC_x; d_f_nbondMC_x = temp;
2504  temp = d_f_nbond_y; d_f_nbond_y = d_f_nbondMC_y; d_f_nbondMC_y = temp;
2505  temp = d_f_nbond_z; d_f_nbond_z = d_f_nbondMC_z; d_f_nbondMC_z = temp;
2506  temp = d_f_slow_x; d_f_slow_x = d_f_slowMC_x; d_f_slowMC_x = temp;
2507  temp = d_f_slow_y; d_f_slow_y = d_f_slowMC_y; d_f_slowMC_y = temp;
2508  temp = d_f_slow_z; d_f_slow_z = d_f_slowMC_z; d_f_slowMC_z = temp;
2509  #ifdef NAMD_NCCL_ALLREDUCE
2510  if(mGpuOn) {
2511  temp = d_posNew_x; d_posNew_x = d_posMC_x; d_posMC_x = temp;
2512  temp = d_posNew_y; d_posNew_y = d_posMC_y; d_posMC_y = temp;
2513  temp = d_posNew_z; d_posNew_z = d_posMC_z; d_posMC_z = temp;
2514  } else {
2515  temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2516  temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2517  temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2518  }
2519  #else
2520  temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2521  temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2522  temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2523  #endif
2524 }
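// The reject path above restores the saved state by exchanging buffer pointers
// rather than copying data back; std::swap expresses the same pattern. A
// minimal sketch with plain double pointers (ForceBuffers is a hypothetical
// grouping, not a NAMD type):
#include <utility>

struct ForceBuffers { double *x, *y, *z; };

// Swap the live buffers with the Monte Carlo backup buffers, so no
// device-to-device copy is needed when a volume move is rejected.
static void restoreFromBackup(ForceBuffers& live, ForceBuffers& backup) {
  std::swap(live.x, backup.x);
  std::swap(live.y, backup.y);
  std::swap(live.z, backup.z);
}

int main() {
  double a[3] = {0.0, 0.0, 0.0}, b[3] = {1.0, 1.0, 1.0};
  ForceBuffers live{a, a, a}, backup{b, b, b};
  restoreFromBackup(live, backup);   // live now points at the backed-up data
  return (live.x == b) ? 0 : 1;
}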
2525 
2526 void SequencerCUDA::monteCarloPressure_accept(
2527  const int doMigration)
2528 {
2529  // do we need to update the centers of mass?
2530  CUDASequencerKernel->centerOfMass(
2531  d_pos_x, d_pos_y, d_pos_z,
2532  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2533  d_hydrogenGroupSize, numAtomsHome, stream);
2534 
2535  // Add half step kinetic contribution to energy, intEnergy, virial, intVirial,
2536  // calculated by submitHalf in launch_part11
2537  Tensor reduction_virial;
2538  Tensor reduction_intVirialNormal;
2539  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2540  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2541  // Haochuan: the tensor is not symmetric when there are fixed atoms
2542  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2543  reduction_virial *= 0.5;
2544  reduction_intVirialNormal *= 0.5;
2545  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2546  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2547  reduction_intVirialNormal);
2548  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2549  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY) += (intKineticEnergy_half[0] * 0.25);
2550 
2551  // If this was a migration step and the move was accepted, we need to update
2552  // myLatticeOld so it can be used in SequencerCUDAKernel::pairlistCheck();
2553  if(doMigration) {
2554  myLatticeOld = myLattice;
2555  }
2556 }
2557 
2558 void SequencerCUDA::monteCarloPressure_part1(
2559  Tensor &factor,
2560  Vector &origin,
2561  Lattice &oldLattice)
2562 {
2563  // Backup positions and forces
2564  //copy_DtoD<double>(d_f_raw, d_f_rawMC, numAtoms*9, stream);
2565  copy_DtoD<double>(d_f_normal_x, d_f_normalMC_x, numAtomsHome, stream);
2566  copy_DtoD<double>(d_f_normal_y, d_f_normalMC_y, numAtomsHome, stream);
2567  copy_DtoD<double>(d_f_normal_z, d_f_normalMC_z, numAtomsHome, stream);
2568  copy_DtoD<double>(d_f_nbond_x, d_f_nbondMC_x, numAtomsHome, stream);
2569  copy_DtoD<double>(d_f_nbond_y, d_f_nbondMC_y, numAtomsHome, stream);
2570  copy_DtoD<double>(d_f_nbond_z, d_f_nbondMC_z, numAtomsHome, stream);
2571  copy_DtoD<double>(d_f_slow_x, d_f_slowMC_x, numAtomsHome, stream);
2572  copy_DtoD<double>(d_f_slow_y, d_f_slowMC_y, numAtomsHome, stream);
2573  copy_DtoD<double>(d_f_slow_z, d_f_slowMC_z, numAtomsHome, stream);
2574 #ifdef NAMD_NCCL_ALLREDUCE
2575  if(mGpuOn) {
2576  //copy_DtoD<double>(d_posNew_raw, d_pos_rawMC, numAtomsHome*3, stream);
2577  copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2578  copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2579  copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2580  } else {
2581  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2582  copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2583  copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2584  copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2585  }
2586 #else
2587  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2588  copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2589  copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2590  copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2591 #endif
2592 
2593  // Scale the old lattice by factor. We need both the old and the new lattice
2594  // to properly unwrap and wrap the atom coordinates
2595  Lattice newLattice = oldLattice;
2596  newLattice.rescale(factor);
2597  cudaTensor cuFactor;
2598  cudaVector cuOrigin;
2599  COPY_CUDATENSOR(factor, cuFactor);
2600  COPY_CUDAVECTOR(origin, cuOrigin);
2601 
2602  // Scale the coordinates using each molecule's geometric center
2603  Molecule *molecule = Node::Object()->molecule;
2604  CUDASequencerKernel->scaleCoordinateUsingGC(
2605  d_pos_x, d_pos_y, d_pos_z, d_idOrder, d_moleculeStartIndex,
2606  d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2607  d_transform, molecule->numMolecules, molecule->numLargeMolecules,
2608  stream);
2609 
2610  // Update the cuda lattice with newLattice for force calculation
2611  myLattice = newLattice;
2612 
2613  // Set up compute positions before calling the bonded and nonbonded kernels
2614  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2615  ComputeNonbondedUtil::dielectric_1);
2616  bool doNbond = patchData->flags.doNonbonded;
2617  bool doSlow = patchData->flags.doFullElectrostatics;
2618  bool doFEP = false;
2619  bool doTI = false;
2620  bool doAlchDecouple = false;
2621  bool doAlchSoftCore = false;
2622  if (simParams->alchOn) {
2623  if (simParams->alchFepOn) doFEP = true;
2624  if (simParams->alchThermIntOn) doTI = true;
2625  if (simParams->alchDecouple) doAlchDecouple = true;
2626  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2627  }
2628 
2629  if (Node::Object()->molecule->is_lonepairs_psf) {
2630  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
2631  }
2632 
2633  bool usePatchPme = false;
2634  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2635  // This checks whether the current lattice is compatible with the patch-level PME kernels.
2636  // It needs to be redone every time the lattice changes; the results are stored in
2637  // the cudaPme object, and the overall compatibility can be queried with the compatible()
2638  // function.
2639  //
2640  // The behavior of set_compute_positions differs depending on the PME kernels being used,
2641  // so that value needs to be passed to the kernel object
2643  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
2645  myLattice,
2646  getNumPatchesHome(),
2647  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
2648  getHostPatchMin(),
2649  getHostPatchMax(),
2650  getHostAwayDists()
2651  );
2652  usePatchPme = cudaPme->patchLevelPmeData.compatible();
2653  }
2654 
2655  std::vector<int> atom_counts;
2656  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2657  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2658  }
2659  CUDASequencerKernel->set_compute_positions(
2660  deviceIndex,
2662  nDevices,
2663  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2664  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2665 #ifdef NAMD_NCCL_ALLREDUCE
2666  (mGpuOn) ? d_posNew_x: d_pos_x,
2667  (mGpuOn) ? d_posNew_y: d_pos_y,
2668  (mGpuOn) ? d_posNew_z: d_pos_z,
2669 #else
2670  d_pos_x,
2671  d_pos_y,
2672  d_pos_z,
2673  d_peer_pos_x, // passes double-pointer if mgpuOn
2674  d_peer_pos_y,
2675  d_peer_pos_z,
2676  d_peer_charge,
2677  d_peer_partition,
2678 #endif
2679  d_charge, d_partition, charge_scaling,
2680  d_patchCenter,
2681  patchData->devData[deviceIndex].slow_patchPositions,
2682  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2683  d_sortOrder, newLattice,
2684  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2685  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2687  patchData->devData[deviceIndex].d_localPatches,
2688  patchData->devData[deviceIndex].d_peerPatches,
2689  atom_counts,
2690  stream);
2691 
2692  cudaCheck(cudaStreamSynchronize(stream));
2693 }
2694 
2695 void SequencerCUDA::monteCarloPressure_part2(
2696  int step,
2697  int maxForceNumber,
2698  const bool doEnergy,
2699  const bool doGlobal,
2700  const bool doVirial)
2701 {
2702  // We zero all reduction values in part1, so the atom checksum needs to be added back here
2703  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
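  // Each device adds its home-atom count to the checksum, presumably so the reduction
  // consumer can verify that the per-device counts still sum to the global atom total.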
2704 
2705  if(mGpuOn){
2706 #ifdef NAMD_NCCL_ALLREDUCE
2707  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2708 #endif
2709  }
2710  // Update the SOA force buffers
2711  CUDASequencerKernel->accumulateForceToSOA(
2712  doGlobal,
2713  simParams->useCudaGlobal,
2714  maxForceNumber,
2715  numPatchesHomeAndProxy,
2716  nDevices,
2717  patchData->devData[deviceIndex].d_localPatches,
2718  patchData->devData[deviceIndex].f_bond,
2719  patchData->devData[deviceIndex].f_bond_nbond,
2720  patchData->devData[deviceIndex].f_bond_slow,
2721  patchData->devData[deviceIndex].forceStride,
2722  patchData->devData[deviceIndex].f_nbond,
2723  patchData->devData[deviceIndex].f_nbond_slow,
2724  patchData->devData[deviceIndex].f_slow,
2725  d_f_global_x,
2726  d_f_global_y,
2727  d_f_global_z,
2728  d_f_normal_x,
2729  d_f_normal_y,
2730  d_f_normal_z,
2731  d_f_nbond_x,
2732  d_f_nbond_y,
2733  d_f_nbond_z,
2734  d_f_slow_x,
2735  d_f_slow_y,
2736  d_f_slow_z,
2737  d_unsortOrder,
2738  myLattice,
2739  patchData->d_queues,
2740  patchData->d_queueCounters,
2741  d_tbcatomic,
2742  stream
2743  );
2744  if(mGpuOn){
2745 #ifndef NAMD_NCCL_ALLREDUCE
2746  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2747  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
2748  // 0, deviceCUDA->getNcclComm(), stream);
2749  std::vector<int> atom_counts;
2750  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2751  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2752  }
2753  CUDASequencerKernel->mergeForcesFromPeers(
2754  deviceIndex,
2755  maxForceNumber,
2756  myLattice,
2757  numPatchesHomeAndProxy,
2758  numPatchesHome,
2759  this->d_peer_fb_x,
2760  this->d_peer_fb_y,
2761  this->d_peer_fb_z,
2762  this->d_peer_fn_x,
2763  this->d_peer_fn_y,
2764  this->d_peer_fn_z,
2765  this->d_peer_fs_x,
2766  this->d_peer_fs_y,
2767  this->d_peer_fs_z,
2768  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2769  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2770  patchData->devData[deviceIndex].d_localPatches,
2771  patchData->devData[deviceIndex].d_peerPatches,
2772  atom_counts,
2773  stream
2774  );
2775 #else
2776  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2777  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2778 #endif
2779  }
2780  // do external forces calculation
2781  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
2782 #if 0
2783  cudaCheck(cudaDeviceSynchronize());
2784  if(true || deviceID == 0){
2785  char prefix[10];
2786  snprintf(prefix, 10, "step-%d",step);
2787  this->printSOAForces(prefix);
2788  }
2789 #endif
2790 
2791 }
2792 
2793 void SequencerCUDA::setRescalePairlistTolerance(const bool val) {
2794  rescalePairlistTolerance = val;
2795 }
2796 
2797 void SequencerCUDA::launch_part1(
2798  int step,
2799  double dt_normal,
2800  double dt_nbond,
2801  double dt_slow,
2802  double velrescaling,
2803  const double maxvel2,
2804  Tensor &factor,
2805  Vector &origin,
2806  Lattice &lattice,
2807  int reassignVelocitiesStep,
2808  int langevinPistonStep,
2809  int berendsenPressureStep,
2810  int maxForceNumber,
2811  const int copyIn,
2812  const int savePairlists,
2813  const int usePairlists,
2814  const bool doEnergy)
2815 {
2816  PatchMap* patchMap = PatchMap::Object();
2817  // Aggregate data from all patches
2818  cudaCheck(cudaSetDevice(deviceID));
2819  this->maxvel2 = maxvel2;
2820  const bool doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
2821  const bool doFixed = simParams->fixedAtomsOn;
2822  // JM: for launch_part1:
2823  // copyIn: first call
2824  myLattice = lattice;
2825  if(reassignVelocitiesStep)
2826  {
2827  const int reassignFreq = simParams->reassignFreq;
2828  BigReal newTemp = simParams->reassignTemp;
2829  newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2830  if ( simParams->reassignIncr > 0.0 ) {
2831  if ( newTemp > simParams->reassignHold && simParams->reassignHold > 0.0 )
2832  newTemp = simParams->reassignHold;
2833  } else {
2834  if ( newTemp < simParams->reassignHold )
2835  newTemp = simParams->reassignHold;
2836  }
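  // Example of the ramp (assuming reassignFreq = 1000 and reassignIncr = 10):
  // steps 0-999 use reassignTemp, steps 1000-1999 use reassignTemp + 10, and so on,
  // until newTemp crosses reassignHold, at which point it is clamped there.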
2837  const BigReal kbT = BOLTZMANN * newTemp;
2838 
2839  CUDASequencerKernel->reassignVelocities(
2840  dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2841  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2842  d_vel_x, d_vel_y, d_vel_z,
2843  d_recipMass, kbT,
2844  numAtomsHome, numAtomsHome, 0,
2845  curandGen, stream);
2846  }
2847 
2848  // Scale the positions for the Berendsen pressure controller
2849  if(berendsenPressureStep) {
2850  cudaTensor cuFactor;
2851  cudaVector cuOrigin;
2852  COPY_CUDATENSOR(factor, cuFactor);
2853  COPY_CUDAVECTOR(origin, cuOrigin);
2854  CUDASequencerKernel->scaleCoordinateWithFactor(
2855  d_pos_x, d_pos_y, d_pos_z, d_mass, d_hydrogenGroupSize,
2856  cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2857  }
2858 
2859  if(!langevinPistonStep){
2860  // Kernels are fused here when there is no Langevin piston update this step
2861  // JM TODO: Fuse kernels for the Langevin thermostat
2862  CUDASequencerKernel->velocityVerlet1(
2863  doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2864  dt_slow, velrescaling, d_recipMass,
2865  d_vel_x, d_vel_y, d_vel_z, maxvel2, killme, d_pos_x, d_pos_y, d_pos_z,
2866  pos_x, pos_y, pos_z, d_f_normal_x, d_f_normal_y, d_f_normal_z,
2867  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z,
2868  d_atomFixed, numAtomsHome, maxForceNumber, stream);
2869  }else{
2870  // Zero-out force buffers here
2871  CUDASequencerKernel->addForceToMomentum(
2872  doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2873  d_recipMass,
2874  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2875  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2876  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2877  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2878  numAtomsHome, maxForceNumber, stream);
2879 
2880  maximumMove(maxvel2, numAtomsHome);
2881  cudaTensor cuFactor;
2882  cudaVector cuOrigin;
2883  COPY_CUDATENSOR(factor, cuFactor);
2884  COPY_CUDAVECTOR(origin, cuOrigin);
2885  double velFactor_x = namd_reciprocal(factor.xx);
2886  double velFactor_y = namd_reciprocal(factor.yy);
2887  double velFactor_z = namd_reciprocal(factor.zz);
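  // velFactor_* are the inverse diagonal cell-scaling factors; the langevinPiston
  // kernel presumably uses them to rescale velocities opposite to the coordinate
  // rescaling, mirroring the CPU Langevin piston update.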
2888 
2889  CUDASequencerKernel->addVelocityToPosition(
2890  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2891  d_vel_x, d_vel_y, d_vel_z,
2892  d_pos_x, d_pos_y, d_pos_z,
2893  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2894  CUDASequencerKernel->langevinPiston(
2895  simParams->fixedAtomsOn, d_atomFixed,
2896  d_groupFixed, d_transform, lattice,
2897  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2898  d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
2899  d_mass, d_hydrogenGroupSize,
2900  cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2901  simParams->useGroupPressure, numAtomsHome, stream);
2902  CUDASequencerKernel->addVelocityToPosition(
2903  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2904  d_vel_x, d_vel_y, d_vel_z,
2905  d_pos_x, d_pos_y, d_pos_z,
2906  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2907  }
2908  if(mGpuOn && SMDKernel)
2909  {
2910  // compute distributed center of mass for groups of atoms
2911  SMDKernel->computeCOMMGpu(lattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
2912  d_transform, stream);
2913  }
2914  if(mGpuOn && groupRestraintsKernel)
2915  {
2916  groupRestraintsKernel->doCOM_mgpu(lattice, d_transform,
2917  d_mass, d_pos_x, d_pos_y, d_pos_z,
2918  stream);
2919  }
2920 
2921  // JM: Recalculate centers of mass if doing an energy calculation or if langevinPistonOn
2922  if( (doEnergy || doVirial) ) {
2923  CUDASequencerKernel->centerOfMass(
2924  d_pos_x, d_pos_y, d_pos_z,
2925  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2926  d_hydrogenGroupSize, numAtomsHome, stream);
2927  CUDASequencerKernel->centerOfMass(
2928  d_vel_x, d_vel_y, d_vel_z,
2929  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2930  d_hydrogenGroupSize, numAtomsHome, stream);
2931  }
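  // d_rcm_* and d_vcm_* hold per-hydrogen-group centers of mass of the positions and
  // velocities; these appear to feed the internal (group-based) virial and
  // kinetic-energy terms submitted later by submitHalf.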
2932 
2933  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2934  ComputeNonbondedUtil::dielectric_1);
2935  // We need to find doNbond and doSlow for the upcoming step
2936  bool doNbond = patchData->flags.doNonbonded;
2937  bool doSlow = patchData->flags.doFullElectrostatics;
2938 
2939  bool doFEP = false;
2940  bool doTI = false;
2941  bool doAlchDecouple = false;
2942  bool doAlchSoftCore = false;
2943  if (simParams->alchOn) {
2944  if (simParams->alchFepOn) doFEP = true;
2945  if (simParams->alchThermIntOn) doTI = true;
2946  if (simParams->alchDecouple) doAlchDecouple = true;
2947  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2948  }
2949  if ( ! savePairlists ) {
2950 
2951  double minSize = simParams->patchDimension - simParams->margin;
2952  double sysdima = lattice.a_r().unit() * lattice.a();
2953  double sysdimb = lattice.b_r().unit() * lattice.b();
2954  double sysdimc = lattice.c_r().unit() * lattice.c();
2955  // Let's pass migrationInfo here
2956 
2957  CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2958  patchData->devData[deviceIndex].d_localPatches,
2959  d_pos_x, d_pos_y, d_pos_z, d_posSave_x, d_posSave_y, d_posSave_z,
2960  d_awayDists,
2961  myLattice, myLatticeOld,
2962  d_patchMin, d_patchMax, d_patchCenter,
2963  d_mInfo,
2964  d_tbcatomic, simParams->pairlistTrigger,
2965  simParams->pairlistGrow, simParams->pairlistShrink,
2966  d_patchMaxAtomMovement, patchMaxAtomMovement,
2967  d_patchNewTolerance, patchNewTolerance,
2968  minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2969  h_marginViolations,
2970  h_periodicCellSmall,
2971  rescalePairlistTolerance,
2972  isPeriodic, stream);
2973  rescalePairlistTolerance = false;
2974  }
2975  else {
2976  rescalePairlistTolerance = true;
2977  }
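  // When pairlists are being reused, PairListMarginCheck above measures how far atoms
  // have moved since the lists were built and records violations in h_marginViolations,
  // which later forces migration; on a rebuild step the tolerance is instead flagged
  // for rescaling on the next check.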
2978 
2979  if(mGpuOn){
2980  // Synchronize the device before the node barrier
2981  cudaCheck(cudaStreamSynchronize(stream));
2982  }
2983 }
2984 
2985 void SequencerCUDA::launch_part11(
2986  double dt_normal,
2987  double dt_nbond,
2988  double dt_slow,
2989  double velrescaling,
2990  const double maxvel2,
2991  Tensor &factor,
2992  Vector &origin,
2993  Lattice &lattice,
2994  int langevinPistonStep,
2995  int maxForceNumber,
2996  const int copyIn,
2997  const int savePairlists,
2998  const int usePairlists,
2999  const bool doEnergy)
3000 {
3001  const bool doVirial = simParams->langevinPistonOn;
3002  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
3003  ComputeNonbondedUtil::dielectric_1);
3004  // We need to find doNbond and doSlow for the upcoming step
3005  bool doNbond = patchData->flags.doNonbonded;
3006  bool doSlow = patchData->flags.doFullElectrostatics;
3007 
3008  bool doFEP = false;
3009  bool doTI = false;
3010  bool doAlchDecouple = false;
3011  bool doAlchSoftCore = false;
3012  if (simParams->alchOn) {
3013  if (simParams->alchFepOn) doFEP = true;
3014  if (simParams->alchThermIntOn) doTI = true;
3015  if (simParams->alchDecouple) doAlchDecouple = true;
3016  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3017  }
3018 
3019  submitHalf(numAtomsHome, 1, doEnergy || doVirial);
3020 
3021  // Updating numerical flags
3022  NAMD_EVENT_START(1, NamdProfileEvent::CPY_PATCHFLAGS);
3023  this->update_patch_flags();
3024  NAMD_EVENT_STOP(1, NamdProfileEvent::CPY_PATCHFLAGS);
3025 
3026  finish_part1(copyIn, patchList[0]->flags.savePairlists,
3027  patchList[0]->flags.usePairlists);
3028 }
3029 
3030 
3031 void SequencerCUDA::launch_set_compute_positions() {
3032 
3033  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
3034  ComputeNonbondedUtil::dielectric_1);
3035  // We need to find doNbond and doSlow for the upcoming step
3036  bool doNbond = patchData->flags.doNonbonded;
3037  bool doSlow = patchData->flags.doFullElectrostatics;
3038 
3039  bool doFEP = false;
3040  bool doTI = false;
3041  bool doAlchDecouple = false;
3042  bool doAlchSoftCore = false;
3043  if (simParams->alchOn) {
3044  if (simParams->alchFepOn) doFEP = true;
3045  if (simParams->alchThermIntOn) doTI = true;
3046  if (simParams->alchDecouple) doAlchDecouple = true;
3047  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3048  }
3049  const bool doIMD = simParams->IMDon && !(simParams->IMDignoreForces || simParams->IMDignore);
3050  bool doGlobal = simParams->tclForcesOn || simParams->colvarsOn || doIMD;
3051  // Set the positions of lone pairs before copying to NB buffers
3052  if (Node::Object()->molecule->is_lonepairs_psf) {
3053  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3054  }
3055 
3056  bool usePatchPme = false;
3057  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3058  // Check whether the current lattice is compatible with the patch-level PME kernels.
3059  // This needs to be redone every time the lattice changes; the results are stored in
3060  // the cudaPme object, and the overall compatibility can be computed with the compatible()
3061  // function.
3062  //
3063  // The behavior of set compute positions for PME differs depending on the kernels being used,
3064  // so that value needs to be passed to the kernel object.
3066  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3068  myLattice,
3069  getNumPatchesHome(),
3070  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3071  getHostPatchMin(),
3072  getHostPatchMax(),
3073  getHostAwayDists()
3074  );
3075  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3076  }
3077 
3078  if (1) {
3079  //fprintf(stderr, "calling set_compute_positions() ****************************************\n");
3080  //fprintf(stderr, "calling set_compute_positions\n");
3081  //fprintf(stderr, "doNbond=%d doSlow=%d\n", doNbond, doSlow);
3082  std::vector<int> atom_counts;
3083  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3084  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3085  }
3086  CUDASequencerKernel->set_compute_positions(
3087  deviceIndex,
3089  nDevices,
3090  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3091  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3092 #ifdef NAMD_NCCL_ALLREDUCE
3093  (mGpuOn) ? d_posNew_x: d_pos_x,
3094  (mGpuOn) ? d_posNew_y: d_pos_y,
3095  (mGpuOn) ? d_posNew_z: d_pos_z,
3096 #else
3097  d_pos_x,
3098  d_pos_y,
3099  d_pos_z,
3100  d_peer_pos_x, // passes double-pointer if mgpuOn
3101  d_peer_pos_y,
3102  d_peer_pos_z,
3103  d_peer_charge,
3104  d_peer_partition,
3105 #endif
3106  d_charge, d_partition, charge_scaling,
3107  d_patchCenter,
3108  patchData->devData[deviceIndex].slow_patchPositions,
3109  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3110  d_sortOrder, myLattice,
3111  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3112  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3114  patchData->devData[deviceIndex].d_localPatches,
3115  patchData->devData[deviceIndex].d_peerPatches,
3116  atom_counts,
3117  stream);
3118  // For global forces, copy the coordinate to host with kernel overlap
3119  if (doGlobal) {
3120  NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_POSITION);
3121  // CkPrintf("WARNING this probably needs to be changed for multihost\n");
3122  copyPositionsToHost_direct();
3123  //copyPositionsAndVelocitiesToHost(1,0);
3124  NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_POSITION);
3125  }
3126  }
3127 }
3128 
3129 void SequencerCUDA:: finish_part1( const int copyIn,
3130  const int savePairlists,
3131  const int usePairlists)
3132 {
3133  // JM: If we're not in a migration step, overlap flagging the positions
3134  // with the kernels before we synchronize the stream, to lessen the
3135  // compute launch overhead.
3136  // Hopefully we will see some overlap in this region.
3137  //
3138  // TODO: We could split this into a different function and start calling positionsReady
3139  // when we're not on a migration step, so that some of the work overlaps with the kernels.
3140  // Before we synchronize, we can clear the device memory;
3141  // this sets the tileListStat for the nbondKernel.
3142  cudaCheck(cudaStreamSynchronize(stream));
3143 
3144  // Check whether the periodic cell has become too small
3145  if(*h_periodicCellSmall){
3146  NAMD_die("Periodic cell has become too small for original patch grid!\n"
3147  "Possible solutions are to restart from a recent checkpoint,\n"
3148  "increase margin, or disable useFlexibleCell for liquid simulation.");
3149  }
3150 
3151  if (killme[0]) {
3152  const Molecule* mol = Node::Object()->molecule;
3153  // Found at least one atom that is moving too fast.
3154  // Terminating, so loop performance below doesn't matter.
3155  // Loop does not vectorize
3156  double *vel_x, *vel_y, *vel_z;
3157  std::vector<int> id;
3158  std::vector<int> patchIDofAtoms(numAtomsHome);
3159  allocate_host<double>(&vel_x, numAtomsHome);
3160  allocate_host<double>(&vel_y, numAtomsHome);
3161  allocate_host<double>(&vel_z, numAtomsHome);
3162  copy_DtoH_sync<double>(d_vel_x, vel_x, numAtomsHome);
3163  copy_DtoH_sync<double>(d_vel_y, vel_y, numAtomsHome);
3164  copy_DtoH_sync<double>(d_vel_z, vel_z, numAtomsHome);
3165  // Update the id array from patchDataSOA
3166  size_t offset = 0;
3167  // TODO: Does this work for GPU atom migration?
3168  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3169  // std::vector<int> patchIDtoIndex(numPatchesHome);
3170  for (int i = 0; i < numPatchesHome; i++) {
3171  HomePatch *patch = homePatches[i];
3172  PatchDataSOA& current = patchList[i]->patchDataSOA;
3173  const int numPatchAtoms = current.numAtoms;
3174  id.resize(numPatchAtoms + id.size());
3175  for(int j = 0; j < numPatchAtoms; j++){
3176  if (!simParams->useDeviceMigration) {
3177  id[offset + j] = current.id[j];
3178  }
3179  patchIDofAtoms[offset + j] = patch->getPatchID();
3180  }
3181  offset += numPatchAtoms;
3182  }
3183  if (simParams->useDeviceMigration) {
3184  id.resize(numAtomsHome);
3185  copy_DtoH_sync<int>(d_idMig, id.data(), numAtomsHome);
3186  }
3187  int cnt = 0;
3188  for (int i=0; i < numAtomsHome; i++) {
3189  BigReal vel2 =
3190  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3191  if (vel2 > maxvel2) {
3192  ++cnt;
3193  iout << iERROR << " velocity is "
3194  << PDBVELFACTOR * vel_x[i] << " "
3195  << PDBVELFACTOR * vel_y[i] << " "
3196  << PDBVELFACTOR * vel_z[i]
3197  << " (limit is "
3198  << ( PDBVELFACTOR * sqrt(maxvel2) ) << ", atom ID "
3199  << id[i]+1
3200  // XXX: TODO: mol->get_atomtype only works on a single node
3201  << " of type " << mol->get_atomtype(id[i])
3202  << " in patch " << patchIDofAtoms[i]
3203  << " on PE " << CkMyPe()
3204  << " with " << patchList[globalToLocalID[patchIDofAtoms[i]]]->patchDataSOA.numAtoms
3205  << " atoms)\n" << endi;
3206  }
3207  }
3208  iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3209  "; simulation has become unstable ("
3210  << cnt << " atoms on pe " << CkMyPe() << ", GPU " << deviceID << ").\n" << endi;
3211  if (simParams->crashOutputFlag & NAMD_CRASH_ATOM_TOO_FAST) {
3212  double *pos_x, *pos_y, *pos_z;
3213  allocate_host<double>(&pos_x, numAtomsHome);
3214  allocate_host<double>(&pos_y, numAtomsHome);
3215  allocate_host<double>(&pos_z, numAtomsHome);
3216  copy_DtoH_sync<double>(d_pos_x, pos_x, numAtomsHome);
3217  copy_DtoH_sync<double>(d_pos_y, pos_y, numAtomsHome);
3218  copy_DtoH_sync<double>(d_pos_z, pos_z, numAtomsHome);
3219  // Save the positions and velocities to a file for debugging
3220  const std::string outfilename =
3221  std::string(simParams->crashFilename) + "." +
3222  std::to_string(deviceIndex);
3223  std::ofstream ofs_crash_dump(outfilename.c_str());
3224  ofs_crash_dump << "atom,r_x,r_y,r_z,v_x,v_y,v_z\n";
3225  for (int i=0; i < numAtomsHome; i++) {
3226  ofs_crash_dump << id[i]+1 << ","
3227  << pos_x[i] << ","
3228  << pos_y[i] << ","
3229  << pos_z[i] << ","
3230  << PDBVELFACTOR * vel_x[i] << ","
3231  << PDBVELFACTOR * vel_y[i] << ","
3232  << PDBVELFACTOR * vel_z[i] << "\n";
3233  }
3234  ofs_crash_dump.flush();
3235  ofs_crash_dump.close();
3236  iout << iWARN << "PE " << CkMyPe() << ", GPU " << deviceID
3237  << ": the atom positions and velocities have been written to "
3238  << outfilename << "\n" << endi;
3239  deallocate_host<double>(&pos_x);
3240  deallocate_host<double>(&pos_y);
3241  deallocate_host<double>(&pos_z);
3242  }
3243  deallocate_host<double>(&vel_x);
3244  deallocate_host<double>(&vel_y);
3245  deallocate_host<double>(&vel_z);
3246  NAMD_die("SequencerCUDA: Atoms moving too fast");
3247  }
3248  else{
3249  // submitHalf reductions
3250  Tensor reduction_virial;
3251  Tensor reduction_intVirialNormal;
3252  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3253  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3254  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3255  // Haochuan: the tensor is not symmetric when there are fixed atoms
3256  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3257  reduction_virial *= 0.5;
3258  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3259  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
3260  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3261  += (intKineticEnergy_half[0] * 0.25);
3262  reduction_intVirialNormal *= 0.5;
3263  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3264  reduction_intVirialNormal);
3265  int migration = (h_marginViolations[0] != 0) ? 1 : 0; // flags migration as TRUE if a margin violation occurred
3266  // if(migration != 0 ) fprintf(stderr, "DEV[%d] = MIGRATION[%d]\n", deviceID, migration);
3267  patchData->migrationFlagPerDevice[deviceIndex] = migration; // Saves the updated migration flag
3268  h_marginViolations[0] = 0;
3269  }
3270 }
3271 
3272 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3273  bool copyOut, const int doGlobal){
3274  // CkPrintf("copy positions and velocities to host copyout %d doGlobal %d\n", copyOut, doGlobal);
3275  if(copyOut){
3276  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3277  patchData = cpdata.ckLocalBranch();
3278  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3279  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3280  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3281  const int numAtomsToCopy = numAtomsHome;
3282  copy_DtoH<double>(d_vel_x, vel_x, numAtomsToCopy, stream);
3283  copy_DtoH<double>(d_vel_y, vel_y, numAtomsToCopy, stream);
3284  copy_DtoH<double>(d_vel_z, vel_z, numAtomsToCopy, stream);
3285  if (!doGlobal) {
3286  // We already copied coordinate if we have global forces
3287  copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3288  copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3289  copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3290  }
3291  cudaCheck(cudaDeviceSynchronize());
3292 
3293  for(int i = 0; i < homePatches.size(); i++){
3294 
3295  // TODO do we need to copy proxy patches as well
3296  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3297  const int numPatchAtoms = localPatches[i].numAtoms;
3298  const int offset = localPatches[i].bufferOffset;
3299  memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3300  memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3301  memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3302  if (!doGlobal) {
3303  // We already copied coordinate if we have global forces
3304  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3305  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3306  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3307  }
3308  }
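  // The device arrays are laid out as one contiguous SOA buffer; localPatches[i].bufferOffset
  // gives each home patch's slice, which is scattered back into that patch's PatchDataSOA here.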
3309  }
3310 }
3311 
3312 
3313 void SequencerCUDA::copyPositionsToHost(){
3314  // CkPrintf("copy positions to host \n");
3315  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3316  patchData = cpdata.ckLocalBranch();
3317  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3318  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3319  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3320 
3321  const int numAtomsToCopy = numAtomsHome;
3322  // We already copied coordinate if we have global forces
3323  copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3324  copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3325  copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3326  cudaCheck(cudaDeviceSynchronize());
3327 
3328  for(int i = 0; i < homePatches.size(); i++){
3329 
3330  // TODO do we need to copy proxy patches as well
3331  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3332  const int numPatchAtoms = localPatches[i].numAtoms;
3333  const int offset = localPatches[i].bufferOffset;
3334  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3335  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3336  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3337  }
3338 }
3339 
3340 void SequencerCUDA::update_patch_flags()
3341 {
3342  // int pairlists = 1;
3343  int pairlists = (patchData->flags.step < simParams->N);
3344  for (int i=0; i < numPatchesHome; i++) {
3345  HomePatch *patch = patchList[i];
3346  patch->flags.copyIntFlags(patchData->flags); // copy global flags
3347  }
3348 }
3349 
3350 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3351  int pairlists = patchList[0]->flags.step < simParams->N;
3352  for(int i = 0; i < numPatchesHome; i++){
3353  //for(int i = 0; i < numPatches; i++){
3354  HomePatch *patch = patchList[i];
3355  Sequencer *seq = patch->sequencer;
3356  // the following logic is duplicated from Sequencer::runComputeObjects
3357  // Migration always invalidates pairlists
3358  if (doMigration) {
3359  seq->pairlistsAreValid = 0;
3360  }
3361  if (seq->pairlistsAreValid &&
3362  ( patch->flags.doFullElectrostatics || ! simParams->fullElectFrequency )
3363  && (seq->pairlistsAge > seq->pairlistsAgeLimit) ) {
3364  seq->pairlistsAreValid = 0;
3365  }
3366  patch->flags.usePairlists = pairlists || seq->pairlistsAreValid;
3367  patch->flags.savePairlists = pairlists && !seq->pairlistsAreValid;
3368  if(patch->flags.savePairlists){
3369  // We need to rebuild pairlists -> reset tolerance values
3370  patch->flags.pairlistTolerance = patchList[i]->doPairlistCheck_newTolerance; // update pairListTolerance
3371  patch->flags.maxAtomMovement = 0;
3372  patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3373  }else if(patch->flags.usePairlists){
3374  // We can keep going with the existing pairlists -> update tolerances
3375  patch->flags.maxAtomMovement = patchMaxAtomMovement[i];
3376  patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3377  }else{
3378  // End of simulation
3379  patch->flags.maxAtomMovement=99999.;
3380  patch->flags.pairlistTolerance = 0.;
3381  }
3382  }
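  // If the pairlists are (re)built this step, the current positions and lattice are
  // snapshotted below so the next PairListMarginCheck can measure displacement against
  // the configuration the lists were generated from.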
3383  if(patchList[0]->flags.savePairlists){
3384  // Backs up d_posSave_* for pairlistCheck
3385  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
3386  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
3387  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
3388  myLatticeOld = myLattice;
3389  }
3390 }
3391 
3392 void SequencerCUDA::finish_patch_flags(int migration)
3393 {
3394  for (int i=0; i < numPatchesHome; i++) {
3395  HomePatch *patch = patchList[i];
3396  Sequencer *seq = patch->sequencer;
3397  if (patch->flags.savePairlists && patch->flags.doNonbonded) {
3398  seq->pairlistsAreValid = 1;
3399  seq->pairlistsAge = 0;
3400  }
3401  if (seq->pairlistsAreValid /* && ! pressureStep */) {
3402  ++(seq->pairlistsAge);
3403  }
3404  }
3405 }
3406 
3407 
3408 void SequencerCUDA::launch_part2(
3409  const int doMCPressure,
3410  double dt_normal,
3411  double dt_nbond,
3412  double dt_slow,
3413  Vector &origin,
3414  int step,
3415  int maxForceNumber,
3416  const int langevinPistonStep,
3417  const int copyIn,
3418  const int copyOut,
3419  const int doGlobal,
3420  const bool doEnergy)
3421 {
3422  PatchMap* patchMap = PatchMap::Object();
3423  Tensor localVirial;
3424  //cudaTensor h_rigidVirial;
3425  bool doNbond = false;
3426  bool doSlow = false;
3427  cudaCheck(cudaSetDevice(deviceID));
3428  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3429  const int is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3430  // const int doVirial = langevinPistonStep;
3431  // JM: For launch_part2:
3432  // copyIn = migration steps
3433 
3434  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
3435 
3436  if(mGpuOn){
3437 #ifdef NAMD_NCCL_ALLREDUCE
3438  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3439 #endif
3440  }
3441 
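  // Fast path: when none of the extra features below are active, force accumulation and
  // the velocity kick are fused into a single kernel (accumulate_force_kick); otherwise
  // forces are only accumulated into the SOA buffers here and the kick is deferred to
  // launch_part3.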
3442  if(!simParams->langevinOn && !simParams->eFieldOn && !simParams->constraintsOn &&
3443  !simParams->SMDOn && !simParams->groupRestraintsOn && !doMCPressure &&
3444  !simParams->mgridforceOn && ! simParams->gridforceOn && !mGpuOn &&
3445  !simParams->consForceOn &&
3446  (simParams->watmodel == WaterModel::TIP3) &&
3447  (!is_lonepairs_psf)){
3448  CUDASequencerKernel->accumulate_force_kick(
3449  simParams->fixedAtomsOn,
3450  doGlobal,
3451  simParams->useCudaGlobal,
3452  maxForceNumber,
3453  numPatchesHomeAndProxy,
3454  patchData->devData[deviceIndex].d_localPatches,
3455  patchData->devData[deviceIndex].f_bond,
3456  patchData->devData[deviceIndex].f_bond_nbond,
3457  patchData->devData[deviceIndex].f_bond_slow,
3458  patchData->devData[deviceIndex].forceStride,
3459  patchData->devData[deviceIndex].f_nbond,
3460  patchData->devData[deviceIndex].f_nbond_slow,
3461  patchData->devData[deviceIndex].f_slow,
3462  d_f_global_x,
3463  d_f_global_y,
3464  d_f_global_z,
3465  d_f_normal_x,
3466  d_f_normal_y,
3467  d_f_normal_z,
3468  d_f_nbond_x,
3469  d_f_nbond_y,
3470  d_f_nbond_z,
3471  d_f_slow_x,
3472  d_f_slow_y,
3473  d_f_slow_z,
3474  d_vel_x,
3475  d_vel_y,
3476  d_vel_z,
3477  d_recipMass,
3478  d_atomFixed,
3479  dt_normal,
3480  dt_nbond,
3481  dt_slow,
3482  1.0,
3483  d_unsortOrder,
3484  myLattice,
3485  stream
3486  );
3487  }else{
3488  CUDASequencerKernel->accumulateForceToSOA(
3489  doGlobal,
3490  simParams->useCudaGlobal,
3491  maxForceNumber,
3492  numPatchesHomeAndProxy,
3493  nDevices,
3494  patchData->devData[deviceIndex].d_localPatches,
3495  patchData->devData[deviceIndex].f_bond,
3496  patchData->devData[deviceIndex].f_bond_nbond,
3497  patchData->devData[deviceIndex].f_bond_slow,
3498  patchData->devData[deviceIndex].forceStride,
3499  patchData->devData[deviceIndex].f_nbond,
3500  patchData->devData[deviceIndex].f_nbond_slow,
3501  patchData->devData[deviceIndex].f_slow,
3502  d_f_global_x,
3503  d_f_global_y,
3504  d_f_global_z,
3505  d_f_normal_x,
3506  d_f_normal_y,
3507  d_f_normal_z,
3508  d_f_nbond_x,
3509  d_f_nbond_y,
3510  d_f_nbond_z,
3511  d_f_slow_x,
3512  d_f_slow_y,
3513  d_f_slow_z,
3514  d_unsortOrder,
3515  myLattice,
3516  patchData->d_queues,
3517  patchData->d_queueCounters,
3518  d_tbcatomic,
3519  stream
3520  );
3521 
3522  }
3523 
3524  if (mGpuOn) {
3525  // Synchronize the device before the node barrier
3526  cudaCheck(cudaDeviceSynchronize());
3527  }
3528 }
3529 
3530 // launch_part2 is broken into two parts to support the MC barostat
3531 void SequencerCUDA::launch_part3(
3532  const int doMCPressure,
3533  double dt_normal,
3534  double dt_nbond,
3535  double dt_slow,
3536  Vector &origin,
3537  int step,
3538  int maxForceNumber,
3539  const bool requestGlobalForces,
3540  const int doGlobalStaleForces,
3541  const bool forceRequestedGPU,
3542  const int copyIn,
3543  const int copyOut,
3544  const bool doEnergy,
3545  const bool requestForcesOutput)
3546 {
3547  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3548  const bool doFixed = simParams->fixedAtomsOn;
3549  const double velrescaling = 1; // no rescaling
3550 
3551  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3552  simParams->SMDOn || simParams->groupRestraintsOn || requestGlobalForces || simParams->gridforceOn|| simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3553  (simParams->watmodel != WaterModel::TIP3) ||
3555  if(mGpuOn){
3556 #ifndef NAMD_NCCL_ALLREDUCE
3557  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
3558  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
3559  // 0, deviceCUDA->getNcclComm(), stream);
3560 
3561  std::vector<int> atom_counts;
3562  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3563  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3564  }
3565  CUDASequencerKernel->mergeForcesFromPeers(
3566  deviceIndex,
3567  maxForceNumber,
3568  myLattice,
3569  numPatchesHomeAndProxy,
3570  numPatchesHome,
3571  this->d_peer_fb_x,
3572  this->d_peer_fb_y,
3573  this->d_peer_fb_z,
3574  this->d_peer_fn_x,
3575  this->d_peer_fn_y,
3576  this->d_peer_fn_z,
3577  this->d_peer_fs_x,
3578  this->d_peer_fs_y,
3579  this->d_peer_fs_z,
3580  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
3581  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
3582  patchData->devData[deviceIndex].d_localPatches,
3583  patchData->devData[deviceIndex].d_peerPatches,
3584  atom_counts,
3585  stream
3586  );
3587 #else
3588  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3589  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
3590 #endif
3591  }
3592  if(doVirial && doGlobalStaleForces)
3593  {
3594  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3595  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3596  computeGlobalMasterVirial(
3597  numPatchesHomeAndProxy,
3598  numAtomsHome,
3599  patchData->devData[deviceIndex].d_localPatches,
3600  d_pos_x,
3601  d_pos_y,
3602  d_pos_z,
3603  d_transform,
3604  d_f_global_x,
3605  d_f_global_y,
3606  d_f_global_z,
3607  &d_extForce[EXT_GLOBALMTS],
3608  &extForce[EXT_GLOBALMTS],
3609  &d_extVirial[EXT_GLOBALMTS],
3610  &extVirial[EXT_GLOBALMTS],
3611  myLattice,
3612  d_tbcatomic,
3613  stream);
3614  }
3615  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
3616 #if 0
3617  cudaCheck(cudaDeviceSynchronize());
3618  if(true || deviceID == 0){
3619  char prefix[10];
3620  snprintf(prefix, 10, "step-%d",step);
3621  this->printSOAForces(prefix);
3622  }
3623 #endif
3624 
3625  }
3626 
3627  if (simParams->langevinOn) {
3628  CUDASequencerKernel->langevinVelocitiesBBK1(
3629  dt_normal, d_langevinParam, d_vel_x, d_vel_y, d_vel_z, numAtomsHome, stream);
3630  }
3631 
3632  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3633  simParams->SMDOn || simParams->groupRestraintsOn || doMCPressure || simParams->gridforceOn || simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3634  (simParams->watmodel != WaterModel::TIP3) ||
3636  CUDASequencerKernel->addForceToMomentum(
3637  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3638  d_recipMass,
3639  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3640  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3641  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3642  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3643  numAtomsHome, maxForceNumber, stream);
3644  }
3645 
3646  if (simParams->langevinOn) {
3647 
3648  // must enforce rigid bond constraints if langevin gammas differ
3649  if (simParams->rigidBonds != RIGID_NONE &&
3650  simParams->langevinGammasDiffer) {
3651  CUDASequencerKernel->rattle1(
3652  simParams->fixedAtomsOn, doEnergy || doVirial,
3653  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3654  2.0 * simParams->rigidTol,
3655  d_vel_x, d_vel_y, d_vel_z,
3656  d_pos_x, d_pos_y, d_pos_z,
3657  d_velNew_x, d_velNew_y, d_velNew_z,
3658  d_posNew_x, d_posNew_y, d_posNew_z,
3659  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3660  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3661  &settleList, settleListSize, &d_consFailure,
3662  d_consFailureSize, &rattleList, rattleListSize,
3663  &nSettle, &nRattle,
3664  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3665  buildRigidLists, consFailure, simParams->watmodel, stream);
3666  buildRigidLists = false;
3667  }
3668  CUDASequencerKernel->langevinVelocitiesBBK2(
3669  dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3670  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3671  d_vel_x, d_vel_y, d_vel_z,
3672  numAtomsHome, numAtomsHome, 0,
3673  curandGen, stream);
3674  }
3675  if(simParams->rigidBonds != RIGID_NONE){
3676  CUDASequencerKernel->rattle1(
3677  simParams->fixedAtomsOn, doEnergy || doVirial,
3678  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3679  2.0 * simParams->rigidTol,
3680  d_vel_x, d_vel_y, d_vel_z,
3681  d_pos_x, d_pos_y, d_pos_z,
3682  d_velNew_x, d_velNew_y, d_velNew_z,
3683  d_posNew_x, d_posNew_y, d_posNew_z,
3684  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3685  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3686  &settleList, settleListSize, &d_consFailure,
3687  d_consFailureSize, &rattleList, rattleListSize,
3688  &nSettle, &nRattle,
3689  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3690  buildRigidLists, consFailure, simParams->watmodel, stream);
3691  buildRigidLists = false;
3692  }
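  // rattle1 appears to build (while buildRigidLists is set) and then reuse SETTLE lists
  // for rigid waters and RATTLE lists for other constrained hydrogen groups, and it
  // accumulates the rigid-bond virial when energy or virial output is requested.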
3693 
3694  // Update velocity center of mass here
3695  if(doEnergy || doVirial){
3696  CUDASequencerKernel->centerOfMass(
3697  d_vel_x, d_vel_y, d_vel_z,
3698  d_vcm_x, d_vcm_y, d_vcm_z,
3699  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3700  }
3701 
3702  submitHalf(numAtomsHome, 2, doEnergy || doVirial);
3703 
3704  CUDASequencerKernel->addForceToMomentum(
3705  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3706  d_recipMass,
3707  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3708  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3709  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3710  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3711  numAtomsHome, maxForceNumber, stream);
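  // The -0.5 kick above presumably rewinds the velocities by half a step so that the
  // reductions below (kinetic energy, momenta) are evaluated at the full timestep; the
  // matching +0.5 half-kick is applied at the start of the next launch_part1.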
3712 
3713  if(requestGlobalForces || requestForcesOutput) {
3714  // store the forces for the next step,
3715  // when we need them for colvars and Tcl scripting
3716  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3717  }
3718 
3719  if (forceRequestedGPU) {
3720  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3721  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3722  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3723  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3724  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3725  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3726  CUDASequencerKernel->copyForcesToDevice(
3727  numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3728  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3729  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3730  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3731  // cudaCheck(cudaStreamSynchronize(stream));
3732  }
3733 
3734  //cudaCheck(cudaStreamSynchronize(stream));
3735  submitReductions(origin.x, origin.y, origin.z,
3736  marginViolations, doEnergy || doVirial,
3737  copyOut && simParams->outputMomenta != 0,
3738  numAtomsHome, maxForceNumber);
3739  // This is for collecting coordinates and velocities for output
3740  copyPositionsAndVelocitiesToHost(copyOut, 0);
3741 
3742  if(consFailure[0]){
3743  // Constraint failure. Abort.
3744  int dieOnError = simParams->rigidDie;
3745  if(dieOnError){
3746  // Bails out
3747  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
3748  NAMD_die("constraint failure during CUDA rattle!\n");
3749  }else{
3750  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3751  }
3752  }else if(doEnergy || doVirial){
3753  cudaCheck(cudaStreamSynchronize(stream));
3754  if(doVirial && doGlobalStaleForces) {
3755  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3756  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3757  }
3758  if(simParams->rigidBonds != RIGID_NONE){
3759  Tensor reduction_rigidVirial;
3760  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
3761  // Haochuan: the tensor is not symmetric when there are fixed atoms
3762  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3763  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
3764  }
3765 
3766  // SUBMITHALF reductions
3767  Tensor reduction_virial;
3768  Tensor reduction_intVirialNormal;
3769  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3770  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3771  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3772  // Haochuan: the tensor is not symmetric when there are fixed atoms
3773  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3774  reduction_virial *= 0.5;
3775  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3776 
3777  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3778  += (intKineticEnergy_half[0] * 0.25);
3779  reduction_intVirialNormal *= 0.5;
3780  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3781  reduction_intVirialNormal);
3782 
3783  //submitReductions1
3784  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
3785  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3786  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
3787  Vector angularMomentum(*angularMomentum_x,
3788  *angularMomentum_y,
3789  *angularMomentum_z);
3790  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3791  //submitReductions2
3792  Tensor regintVirialNormal;
3793  Tensor regintVirialNbond;
3794  Tensor regintVirialSlow;
3795  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
3796  if (maxForceNumber >= 1) {
3797  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
3798  }
3799  if (maxForceNumber >= 2) {
3800  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
3801  }
3802 
3803  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
3804  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
3805  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
3806  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
3807 
3808  if (simParams->fixedAtomsOn) {
3809  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3810  double3 fixForceNormal, fixForceNbond, fixForceSlow;
3811  switch (maxForceNumber) {
3812  case 2: {
3813  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3814  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3815  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
3816  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
3817  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
3818  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3819  } // intentionally fallthrough
3820  case 1: {
3821  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3822  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3823  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
3824  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
3825  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
3826  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3827  } // intentionally fallthrough
3828  default: {
3829  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3830  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3831  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
3832  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
3833  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
3834  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3835  }
3836  }
3837 #if 0
3838  auto printTensor = [](const cudaTensor& t, const std::string& name){
3839  CkPrintf("%s", name.c_str());
3840  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3841  "%12.5lf %12.5lf %12.5lf\n"
3842  "%12.5lf %12.5lf %12.5lf\n",
3843  t.xx, t.xy, t.xz,
3844  t.yx, t.yy, t.yz,
3845  t.zx, t.zy, t.zz);
3846  };
3847  printTensor(fixVirialNormal, "fixVirialNormal = ");
3848  printTensor(fixVirialNbond, "fixVirialNbond = ");
3849  printTensor(fixVirialSlow, "fixVirialSlow = ");
3850 #endif
3851  }
3852  }
3853 }
3854 
3855 // Adding this function back temporarily until GPU migration is merged
3856 #if 1
3857 // This function aggregates data within a single GPU, so we need it to copy the PME positions over
3858 void SequencerCUDA::atomUpdatePme()
3859 {
3860  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
3861  ComputeNonbondedUtil::dielectric_1);
3862  // We need to find doNbond and doSlow for the upcoming step
3863  bool doNbond = false;
3864  bool doSlow = true;
3865 
3866  bool doFEP = false;
3867  bool doTI = false;
3868  bool doAlchDecouple = false;
3869  bool doAlchSoftCore = false;
3870  if (simParams->alchOn) {
3871  if (simParams->alchFepOn) doFEP = true;
3872  if (simParams->alchThermIntOn) doTI = true;
3873  if (simParams->alchDecouple) doAlchDecouple = true;
3874  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3875  }
3876 
3877  if (Node::Object()->molecule->is_lonepairs_psf) {
3878  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3879  }
3880 
3881  bool usePatchPme = false;
3882  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3883  // Check whether the current lattice is compatible with the patch-level PME kernels.
3884  // This needs to be redone every time the lattice changes; the results are stored in
3885  // the cudaPme object, and the overall compatibility can be computed with the compatible()
3886  // function.
3887  //
3888  // The behavior of set compute positions for PME differs depending on the kernels being used,
3889  // so that value needs to be passed to the kernel object.
3891  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3893  myLattice,
3894  getNumPatchesHome(),
3895  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3896  getHostPatchMin(),
3897  getHostPatchMax(),
3898  getHostAwayDists()
3899  );
3900  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3901  }
3902 
3903  std::vector<int> atom_counts;
3904  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3905  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3906  }
3907  CUDASequencerKernel->set_pme_positions(
3908  deviceIndex,
3910  nDevices,
3911  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3912  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3913 #ifdef NAMD_NCCL_ALLREDUCE
3914  (mGpuOn) ? d_posNew_x: d_pos_x,
3915  (mGpuOn) ? d_posNew_y: d_pos_y,
3916  (mGpuOn) ? d_posNew_z: d_pos_z,
3917 #else
3918  d_pos_x,
3919  d_pos_y,
3920  d_pos_z,
3921  d_peer_pos_x, // passes double-pointer if mgpuOn
3922  d_peer_pos_y,
3923  d_peer_pos_z,
3924  d_peer_charge,
3925  d_peer_partition,
3926 #endif
3927  d_charge, d_partition, charge_scaling,
3928  d_patchCenter,
3929  patchData->devData[deviceIndex].slow_patchPositions,
3930  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3931  d_sortOrder, myLattice,
3932  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3933  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3935  patchData->devData[deviceIndex].d_localPatches,
3936  patchData->devData[deviceIndex].d_peerPatches,
3937  atom_counts,
3938  stream);
3939 
3940  cudaCheck(cudaStreamSynchronize(stream));
3941 }
3942 #endif
3943 
3944 
3945 void SequencerCUDA::sync() {
3946  cudaCheck(cudaStreamSynchronize(stream));
3947 }
3948 
3949 void SequencerCUDA::calculateExternalForces(
3950  const int step,
3951  const int maxForceNumber,
3952  const int doEnergy,
3953  const int doVirial) {
3954 
3955  const bool is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3956  const bool is_tip4_water = simParams->watmodel == WaterModel::TIP4;
3957 
3958  if (is_lonepairs_psf) {
3959  lonepairsKernel->redistributeForce(
3960  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3961  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3962  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3963  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3964  d_pos_x, d_pos_y, d_pos_z, maxForceNumber, doEnergy || doVirial, stream);
3965  }
3966  // TODO: Should I use "else if" here to follow the logic of Sequencer.C?
3967  if (is_tip4_water) {
3968  redistributeTip4pForces(maxForceNumber, doEnergy || doVirial);
3969  }
3970 
3971  if(simParams->eFieldOn){
3972  double3 efield;
3973  efield.x = simParams->eField.x;
3974  efield.y = simParams->eField.y;
3975  efield.z = simParams->eField.z;
3976 
3977  double efield_omega = TWOPI * simParams->eFieldFreq / 1000.;
3978  double efield_phi = PI/180. * simParams->eFieldPhase;
3979  double t = step * simParams->dt;
3980 
3981  CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3982  doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3983  d_charge, d_pos_x, d_pos_y, d_pos_z,
3984  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3985  &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3986  &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3987  &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3988  d_tbcatomic, stream);
3989  }
3990 
3991  if(simParams->constraintsOn){
3992  restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3993  d_pos_x, d_pos_y, d_pos_z,
3994  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3995  &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3996  &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3997  &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
3998  }
3999 
4000  if(simParams->SMDOn){
4001  SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
4002  d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform,
4003  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4004  &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
4005  &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
4006  }
4007 
4008  if(simParams->groupRestraintsOn){
4009  groupRestraintsKernel->doForce(step, doEnergy, doVirial,
4010  myLattice, d_transform,
4011  d_mass, d_pos_x, d_pos_y, d_pos_z,
4012  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4013  &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
4014  &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
4015  }
4016  if(simParams->mgridforceOn || simParams->gridforceOn){
4017  gridForceKernel->doForce(doEnergy, doVirial,
4018  myLattice, step,
4019  d_pos_x, d_pos_y, d_pos_z, d_transform,
4020  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4021  stream);
4022  }
4023  if(simParams->consForceOn){
4024  consForceKernel->doForce(myLattice, doVirial,
4025  d_pos_x, d_pos_y, d_pos_z,
4026  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4027  d_transform,
4028  &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
4029  &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
4030  }
4031 
4032  if(doEnergy || doVirial) {
4033  // Store the external forces and energy data
4034  cudaCheck(cudaStreamSynchronize(stream));
4035  if (is_lonepairs_psf || is_tip4_water) {
4036  switch (maxForceNumber) {
4037  case 2:
4038  copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
4039  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, lpVirialSlow[0]);
4040  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
4041  case 1:
4042  copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
4043  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, lpVirialNbond[0]);
4044  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
4045  case 0:
4046  copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
4047  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, lpVirialNormal[0]);
4048  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
4049  }
4050  }
4051  if(simParams->eFieldOn){
4052  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_ELEC_FIELD];
4053  if (!simParams->eFieldNormalized){
4054  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
4055  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
4056  }
4057  }
4058 
4059  if(simParams->constraintsOn){
4060  reduction->item(REDUCTION_BC_ENERGY) += extEnergy[EXT_CONSTRAINTS];
4061  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
4062  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
4063  }
4064 
4065  if(simParams->SMDOn){
4067  if(cudaMgr->reducerSMDDevice == deviceIndex)
4068  {
4069  // each SMD kernel has the total SMD energy and extForce
4070  // so only one will contribute
4071  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_SMD];
4072  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_SMD]);
4073  }
4074  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_SMD]);
4075  }
4076 
4077  if(simParams->groupRestraintsOn){
4078  // In the single-GPU case, or on the designated reducer device, only this device contributes the energy
4080  if(!mGpuOn || (deviceCUDA->getDeviceIndex() == cudaMgr->reducerGroupRestraintDevice))
4081  {
4082  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GROUP_RESTRAINTS];
4083  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
4084  }
4085  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
4086  }
4087 
4088  if(simParams->mgridforceOn || simParams->gridforceOn){
4089  // first sum across grids
4090  gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4091  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GRIDFORCE];
4092  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
4093  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
4094  gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4095  }
4096  if(simParams->consForceOn){
4097  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_CONSFORCE];
4098  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
4099  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
4100  }
4101  }
4102 }
4103 
4104 void SequencerCUDA::copyGlobalForcesToDevice(){
4105  // Copy the globalMaster forces from host to device.
4106  // Use the normal force buffer on the host to aggregate them
4107  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
4108  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
4109  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4110  // TODO: determine if this aggregation needs peer patches and should cover both home and proxy patches
4111  for(int i =0 ; i < numPatchesHome; i++){
4112  CudaLocalRecord record = localPatches[i];
4113  const int patchID = record.patchID;
4114  const int stride = record.bufferOffset;
4115  const int numPatchAtoms = record.numAtoms;
4116  PatchDataSOA& current = homePatches[i]->patchDataSOA;
4117  memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4118  memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4119  memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4120  }
4121  // copy aggregated force to device buffer
4122  copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4123  copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4124  copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4125 
4126 }
4127 
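// Snapshot every home patch's PatchDataSOA descriptor (which holds the
// host-side buffer pointers) and upload the array to d_HostPatchDataSOA, so
// device kernels such as copyForcesToHostSOA / copyPositionsToHostSOA can
// resolve the per-patch host buffers directly (presumably via pinned host
// memory). The device synchronize ensures the upload is complete before any
// kernel dereferences it.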
4128 void SequencerCUDA::updateHostPatchDataSOA() {
4129  std::vector<PatchDataSOA> host_copy(numPatchesHome);
4130  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4131 
4132  for (int i = 0; i < numPatchesHome; i++) {
4133  host_copy[i] = homePatches[i]->patchDataSOA;
4134  }
4135  copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4136  cudaCheck(cudaDeviceSynchronize());
4137 }
4138 
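// Launch a kernel that writes the normal/nonbonded/slow device force arrays
// (up to maxForceNumber) straight into the host PatchDataSOA buffers described
// by d_HostPatchDataSOA, then block on the stream so the host data is complete
// before it is consumed (e.g. by globalMaster clients or force output, as the
// doGlobal / doForcesOutput flags suggest).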
4139 void SequencerCUDA::saveForceCUDASOA_direct(
4140  const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4141  CUDASequencerKernel->copyForcesToHostSOA(
4142  numPatchesHome,
4143  patchData->devData[deviceIndex].d_localPatches,
4144  maxForceNumber,
4145  d_f_normal_x,
4146  d_f_normal_y,
4147  d_f_normal_z,
4148  d_f_nbond_x,
4149  d_f_nbond_y,
4150  d_f_nbond_z,
4151  d_f_slow_x,
4152  d_f_slow_y,
4153  d_f_slow_z,
4154  d_HostPatchDataSOA,
4155  doGlobal,
4156  doForcesOutput,
4157  stream
4158  );
4159  cudaCheck(cudaStreamSynchronize(stream));
4160 }
4161 
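// Same direct-copy pattern as saveForceCUDASOA_direct, but for coordinates:
// device positions are written into the host PatchDataSOA buffers and the
// stream is synchronized before returning.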
4162 void SequencerCUDA::copyPositionsToHost_direct() {
4163  CUDASequencerKernel->copyPositionsToHostSOA(
4164  numPatchesHome,
4165  patchData->devData[deviceIndex].d_localPatches,
4166  d_pos_x,
4167  d_pos_y,
4168  d_pos_z,
4169  d_HostPatchDataSOA,
4170  stream
4171  );
4172  cudaCheck(cudaStreamSynchronize(stream));
4173 }
4174 
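// TIP4P water carries a massless lone-pair (M) site that accumulates forces
// but is not integrated; this kernel redistributes those forces onto the
// parent atoms for every force level up to maxForceNumber, accumulating the
// corresponding virial corrections in d_lpVirialNormal/Nbond/Slow when
// doVirial is set.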
4175 void SequencerCUDA::redistributeTip4pForces(
4176  const int maxForceNumber,
4177  const int doVirial) {
4178  CUDASequencerKernel->redistributeTip4pForces(
4179  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4180  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
4181  d_f_slow_x, d_f_slow_y, d_f_slow_z,
4182  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4183  d_pos_x, d_pos_y, d_pos_z, d_mass,
4184  numAtomsHome, doVirial, maxForceNumber, stream
4185  );
4186 }
4187 
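// Device buffers for saved copies of the nonbonded and slow forces, sized to
// the home+proxy allocation; presumably used to reuse these force levels
// between evaluations under multiple time stepping.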
4188 void SequencerCUDA::allocateGPUSavedForces() {
4189  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4190  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4191  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4192  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4193  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4194  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
4195 }
4196 
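// Push the energy/virial/force values accumulated in this SequencerCUDA's
// reduction object up to the reduction system (SubmitReduction::submit).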
4197 void SequencerCUDA::submitReductionValues() {
4198  reduction->submit();
4199 }
4200 
4201 #endif // NODEGROUP_FORCE_REGISTER