NAMD
SequencerCUDA.C
1 #include "ComputeLonepairsCUDA.h"
2 #include "CudaUtils.h"
3 #include "Molecule.h"
4 #include "ReductionMgr.h"
5 #include "SequencerCUDA.h"
6 #include "ComputeNonbondedUtil.h"
7 #include "DeviceCUDA.h"
8 #include "SimParameters.h"
9 #include "TestArray.h"
10 #include "ComputeRestraintsCUDA.h"
11 #include "ComputeGridForceCUDA.h"
12 #include "ComputeConsForceCUDA.h"
13 #include "NamdEventsProfiling.h"
14 #include "AtomMap.h"
15 #include "common.h"
16 #include <algorithm> // std::fill()
18 //#define DEBUGM
19 //#define MIN_DEBUG_LEVEL 3
20 #ifdef NODEGROUP_FORCE_REGISTER
21 #if !defined(WIN64)
22 extern __thread DeviceCUDA *deviceCUDA;
23 #else
24 extern __declspec(thread) DeviceCUDA *deviceCUDA;
25 #endif
26 
27 #if 1
28 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
29  size_t offset = 0; \
30  for (int i = 0; i < numPatchesHome; i++) { \
31  PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
32  const int numPatchAtoms = current.numAtoms; \
33  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
34  offset += numPatchAtoms; \
35  } \
36  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
37  } while(0);
38 
39 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
40  size_t offset = 0; \
41  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
42  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
43  const int numPatchAtoms = current.numAtoms; \
44  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
45  offset += numPatchAtoms; \
46  } \
47  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
48  } while(0);
49 
50 #define AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(fieldName, type, stream) do { \
51  size_t offset = 0; \
52  for (int i = 0; i < numPatchesHome; i++) { \
53  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
54  const int numPatchAtoms = current.numAtoms; \
55  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
56  offset += numPatchAtoms; \
57  } \
58  copy_HtoD<type>(fieldName, coll_ ## fieldName .getDevicePtr(), numAtomsHome, stream); \
59  } while(0);
60 
61 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(fieldName, type, stream) do { \
62  size_t offset = 0; \
63  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
64  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
65  const int numPatchAtoms = current.numAtoms; \
66  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
67  offset += numPatchAtoms; \
68  } \
69  copy_HtoD<type>(fieldName, coll_ ## fieldName .getDevicePtr(), numAtomsHomeAndProxy, stream); \
70  } while(0);
71 
72 #else
73 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
74  size_t offset = 0; \
75  for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
76  PatchDataSOA& current = elem->patch->patchDataSOA; \
77  const int numPatchAtoms = current.numAtoms; \
78  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
79  offset += numPatchAtoms; \
80  } \
81  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
82  } while(0);
83 #endif
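 // Illustrative use of the aggregation macros above (not part of the original source):
 //   AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
 // gathers each home patch's SOA recipMass field into the flat host buffer recipMass
 // and copies it to the matching device buffer d_recipMass.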
84 
85 // Publishes this device's d_localPatches pointer into the shared host-side peer record
86 void SequencerCUDA::registerSOAPointersToHost(){
87  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
88 }
89 
90 void SequencerCUDA::copySOAHostRegisterToDevice(){
91  // This function gets the host-registered SOA device pointers and copies the register itself to the device
92  // NOTE: This needs to be called only when ALL masterPEs have safely called ::registerSOAPointersToHost()
93  cudaCheck(cudaSetDevice(deviceID));
94  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);
95 
96  // Workaround until CUDA compute objects have better access to these buffers
97  for(int i = 0; i < this->nDevices; i++) {
98  patchData->h_soa_sortOrder[i] = coll_sortOrder.getHostPeer()[i];
99  if (simParams->useDeviceMigration) {
100  patchData->h_soa_vdwType[i] = coll_vdwType.getHostPeer()[i];
101  patchData->h_soa_id[i] = coll_idMig.getHostPeer()[i];
102  patchData->h_soa_migrationDestination[i] = coll_migrationDestination.getHostPeer()[i];
103  }
104 
105  if (simParams->alchOn) {
106  patchData->h_soa_partition[i] = coll_partition.getHostPeer()[i];
107  }
108  }
109 
110  // aggregate device pointers
111  for(int i = 0; i < this->nDevices; i++)
112  h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
113  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
114 }
115 
116 void SequencerCUDA::printSOAPositionsAndVelocities() {
117  BigReal* h_pos_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
118  BigReal* h_pos_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
119  BigReal* h_pos_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
120 
121  BigReal* h_vel_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
122  BigReal* h_vel_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
123  BigReal* h_vel_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
124 
125  // DMC: this condition appears to be a holdover from the NCCL code path
126  if(false && mGpuOn){
127  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
128  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
129  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);
130  }else{
131  copy_DtoH_sync<BigReal>(coll_pos_x.getDevicePtr(), h_pos_x, numAtomsHome);
132  copy_DtoH_sync<BigReal>(coll_pos_y.getDevicePtr(), h_pos_y, numAtomsHome);
133  copy_DtoH_sync<BigReal>(coll_pos_z.getDevicePtr(), h_pos_z, numAtomsHome);
134  }
135 
136  copy_DtoH_sync<BigReal>(coll_vel_x.getDevicePtr(), h_vel_x, numAtomsHome);
137  copy_DtoH_sync<BigReal>(coll_vel_y.getDevicePtr(), h_vel_y, numAtomsHome);
138  copy_DtoH_sync<BigReal>(coll_vel_z.getDevicePtr(), h_vel_z, numAtomsHome);
139 
140  CmiLock(this->patchData->printlock);
141  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
142  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
143  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
144  for(int i =0 ; i < numPatchesHome; i++){
145  CudaLocalRecord record = localPatches[i];
146  const int patchID = record.patchID;
147  const int stride = record.bufferOffset;
148  const int numPatchAtoms = record.numAtoms;
149  PatchDataSOA& current = homePatches[i]->patchDataSOA;
150 
151  fprintf(stderr, "Patch [%d]:\n", patchID);
152  for(int j = 0; j < numPatchAtoms; j++){
153  fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
154  h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
155  h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
156  }
157 
158  }
159  CmiUnlock(this->patchData->printlock);
160 
161  free(h_pos_x);
162  free(h_pos_y);
163  free(h_pos_z);
164  free(h_vel_x);
165  free(h_vel_y);
166  free(h_vel_z);
167 }
168 
169 void SequencerCUDA::printSOAForces(char *prefix) {
170  BigReal* h_f_normal_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
171  BigReal* h_f_normal_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
172  BigReal* h_f_normal_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
173 
174  BigReal* h_f_nbond_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
175  BigReal* h_f_nbond_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
176  BigReal* h_f_nbond_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
177 
178  BigReal* h_f_slow_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
179  BigReal* h_f_slow_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
180  BigReal* h_f_slow_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
181 
182  copy_DtoH_sync<BigReal>(coll_f_normal_x.getDevicePtr(), h_f_normal_x, numAtomsHome);
183  copy_DtoH_sync<BigReal>(coll_f_normal_y.getDevicePtr(), h_f_normal_y, numAtomsHome);
184  copy_DtoH_sync<BigReal>(coll_f_normal_z.getDevicePtr(), h_f_normal_z, numAtomsHome);
185 
186  copy_DtoH_sync<BigReal>(coll_f_nbond_x.getDevicePtr(), h_f_nbond_x, numAtomsHome);
187  copy_DtoH_sync<BigReal>(coll_f_nbond_y.getDevicePtr(), h_f_nbond_y, numAtomsHome);
188  copy_DtoH_sync<BigReal>(coll_f_nbond_z.getDevicePtr(), h_f_nbond_z, numAtomsHome);
189 
190  copy_DtoH_sync<BigReal>(coll_f_slow_x.getDevicePtr(), h_f_slow_x, numAtomsHome);
191  copy_DtoH_sync<BigReal>(coll_f_slow_y.getDevicePtr(), h_f_slow_y, numAtomsHome);
192  copy_DtoH_sync<BigReal>(coll_f_slow_z.getDevicePtr(), h_f_slow_z, numAtomsHome);
193 
194  // Print forces patch by patch; if a prefix is given, each patch is written to its own file
195  CmiLock(this->patchData->printlock);
196  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
197  char fname[100];
198  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
199  for(int i =0 ; i < numPatchesHome; i++){
200  CudaLocalRecord record = localPatches[i];
201  const int patchID = record.patchID;
202  const int stride = record.bufferOffset;
203  const int numPatchAtoms = record.numAtoms;
204  FILE *outfile=stderr;
205  if(prefix!=NULL)
206  {
207  snprintf(fname,100, "%s-patch-%d", prefix, patchID);
208  outfile = fopen(fname, "w");
209  }
210  fprintf(outfile, "Patch [%d]:\n", patchID);
211  for(int j = 0; j < numPatchAtoms; j++){
212  fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
213  h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
214  h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
215  h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j] );
216  }
217  if(prefix!=NULL) fclose(outfile);
218  }
219 
220  CmiUnlock(this->patchData->printlock);
221 
222  free(h_f_normal_x);
223  free(h_f_normal_y);
224  free(h_f_normal_z);
225 
226  free(h_f_nbond_x);
227  free(h_f_nbond_y);
228  free(h_f_nbond_z);
229 
230  free(h_f_slow_x);
231  free(h_f_slow_y);
232  free(h_f_slow_z);
233 
234 }
235 
236 SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID,
237  SimParameters *const sim_Params) {
238  if (CkpvAccess(SequencerCUDA_instance) == 0) {
239  CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
240  }
241  return CkpvAccess(SequencerCUDA_instance);
242 }
243 
244 SequencerCUDA::SequencerCUDA(const int deviceID_ID,
245  SimParameters *const sim_Params):
246  deviceID(deviceID_ID), simParams(sim_Params)
247 {
248  restraintsKernel = NULL;
249  SMDKernel = NULL;
250  groupRestraintsKernel = NULL;
251  gridForceKernel = NULL;
252  consForceKernel = NULL;
253  lonepairsKernel = nullptr;
254 #if 1
255  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
256  patchData = cpdata.ckLocalBranch();
257 #endif
258  initialize();
259  CUDASequencerKernel = new SequencerCUDAKernel();
260  CUDAMigrationKernel = new MigrationCUDAKernel();
261  num_used_grids = simParams->alchGetNumOfPMEGrids();
262  used_grids.resize(num_used_grids, 0);
263  if (simParams->alchFepOn) {
264  // at least two grids are used
265  used_grids[0] = 0;
266  used_grids[1] = 1;
267  // if alchDecouple then two more grids are used
268  if (simParams->alchDecouple) {
269  used_grids[2] = 2;
270  used_grids[3] = 3;
271  // an extra for soft-core potential
272  if (simParams->alchElecLambdaStart > 0) {
273  used_grids[4] = 4;
274  }
275  } else {
276  // in this case alchDecouple is false
277  // but if there is still soft-core potential
278  // then a total of 3 grids are used
279  // mark the last grid for soft-core potential
280  if (simParams->alchElecLambdaStart > 0) {
281  used_grids[2] = 4;
282  }
283  }
284  }
285  if (simParams->alchThermIntOn) {
286  used_grids[0] = 0;
287  used_grids[1] = 1;
288  // in TI, no matter whether soft-core potential is used
289  // it has at least three grids
290  if (simParams->alchDecouple) {
291  used_grids[2] = 2;
292  used_grids[3] = 3;
293  used_grids[4] = 4;
294  } else {
295  used_grids[2] = 4;
296  }
297  }
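 // For example, with alchFepOn, alchDecouple off, and alchElecLambdaStart > 0,
 // the used grids map to PME grid indices {0, 1, 4}.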
298  rescalePairlistTolerance = false;
299 }
300 
301 SequencerCUDA::~SequencerCUDA(){
302  cudaCheck(cudaSetDevice(deviceID));
303  deallocateArrays();
304  deallocateStaticArrays();
305  deallocate_device<SettleParameters>(&sp);
306  deallocate_device<int>(&settleList);
307  deallocate_device<CudaRattleElem>(&rattleList);
308  deallocate_device<int>(&d_consFailure);
309  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
310  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
311  if (restraintsKernel != NULL) delete restraintsKernel;
312  if(SMDKernel != NULL) delete SMDKernel;
313  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
314  if (gridForceKernel != NULL) delete gridForceKernel;
315  if (lonepairsKernel != nullptr) delete lonepairsKernel;
316  if (consForceKernel != NULL) delete consForceKernel;
317 #if 0
318  cudaCheck(cudaStreamDestroy(stream));
319 #endif
320  cudaCheck(cudaStreamDestroy(stream2));
321  curandCheck(curandDestroyGenerator(curandGen));
322  CmiDestroyLock(printlock);
323 
324 }
325 
326 void SequencerCUDA::zeroScalars(){
327  numAtomsHomeAndProxyAllocated = 0;
328  numAtomsHomeAllocated = 0;
329  buildRigidLists = true;
330  numPatchesCheckedIn = 0;
331  numPatchesReady= 0;
332 }
333 
334 void SequencerCUDA::initialize(){
335  cudaCheck(cudaSetDevice(deviceID));
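 // One extra device is counted when a GPU is reserved exclusively for PME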
336  nDevices = deviceCUDA->getNumDevice() + 1 * deviceCUDA->isGpuReservedPme();
337  deviceIndex = deviceCUDA->getDeviceIndex();
338 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
339  int leastPriority, greatestPriority;
340  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
341 #if 0
342  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
343 #else
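 // On the master PE, reuse the nonbonded stream; other PEs fall back to the default stream (0)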
344  stream = (CkMyPe() == deviceCUDA->getMasterPe()) ? patchData->devData[deviceIndex].nbond_stream : 0;
345 #endif
346  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
347 #else
348  cudaCheck(cudaStreamCreate(&stream));
349  cudaCheck(cudaStreamCreate(&stream2));
350 #endif
351  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
352  // each PE's seed needs to be different
353  unsigned long long seed = simParams->randomSeed + CkMyPe();
354  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));
355 
356  numAtomsHomeAllocated = 0;
357  numAtomsHomeAndProxyAllocated = 0;
358 
359  totalMarginViolations = 0;
360  buildRigidLists = true;
361  numPatchesCheckedIn = 0;
362  numPatchesReady= 0;
363  PatchMap* patchMap = PatchMap::Object();
364 #if 1
365  numPatchesGlobal = patchMap->numPatches();
366 #else
367  numPatchesGlobal = patchMap->homePatchList()->size();
368 #endif
369  mGpuOn = nDevices > 1;
370  // Per-device queue allocation (currently disabled)
371  // allocate_device<unsigned int>(&deviceQueue, nDevices);
372  // cudaCheck(cudaMemset(deviceQueue, 999, sizeof(unsigned int)* nDevices));
373  // Allocates and registers the local queue in patchData
374  // patchData->d_queues[deviceIndex] = deviceQueue;
375 
376  printlock = CmiCreateLock();
377 
378  const int numPes = CkNumPes();
379  if (!mGpuOn) {
380  atomMapList.resize(numPes);
381  }
382 
383  if (simParams->fixedAtomsOn) {
384  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
385  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
386  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
387  allocate_device<double3>(&d_fixForceNormal, 1);
388  allocate_device<double3>(&d_fixForceNbond, 1);
389  allocate_device<double3>(&d_fixForceSlow, 1);
390  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
391  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
392  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
393  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
394  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
395  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
396  }
397 
398  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);
399 
400  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
401 
402  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
403  // Patch-related host datastructures
404  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
405  allocate_host<double3>(&patchCenter, numPatchesGlobal);
406  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
407  allocate_host<int>(&patchToDeviceMap,numPatchesGlobal);
408  allocate_host<double3>(&awayDists, numPatchesGlobal);
409  allocate_host<double3>(&patchMin, numPatchesGlobal);
410  allocate_host<double3>(&patchMax, numPatchesGlobal);
411  //allocate_host<Lattice>(&lattices, numPatchesGlobal);
412  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal); // only needed for langevin
413  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
414  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
415  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);
416 
417  //Patch-related device datastructures
418  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
419  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
420  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
421  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
422  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
423  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
424  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
425  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
426  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
427  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);
428 
429  // Allocate host memory for scalar variables
430  allocate_device<int>(&d_killme, 1);
431  allocate_device<char>(&d_barrierFlag, 1);
432  allocate_device<unsigned int>(&d_tbcatomic, 5);
433  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
434  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
435  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
436  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
437  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
438  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
439  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
440  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
441  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
442  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
443  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
444  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
445  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);
446  // for lone pairs
447  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
448  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
449  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);
450  //space for globalmaster forces on device
451  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
452  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
453  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);
454 
455  allocate_device<SettleParameters>(&sp, 1);
456 
457  //allocates host scalars in host-mapped, pinned memory
458  allocate_host<int>(&killme, 1);
459  allocate_host<BigReal>(&kineticEnergy, 1);
460  allocate_host<BigReal>(&intKineticEnergy, 1);
461  allocate_host<BigReal>(&kineticEnergy_half, 1);
462  allocate_host<BigReal>(&intKineticEnergy_half, 1);
463  allocate_host<BigReal>(&momentum_x, 1);
464  allocate_host<BigReal>(&momentum_y, 1);
465  allocate_host<BigReal>(&momentum_z, 1);
466  allocate_host<BigReal>(&angularMomentum_x, 1);
467  allocate_host<BigReal>(&angularMomentum_y, 1);
468  allocate_host<BigReal>(&angularMomentum_z, 1);
469  allocate_host<int>(&consFailure, 1);
470  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
471  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
472  allocate_host<unsigned int>(&h_marginViolations, 1);
473  allocate_host<unsigned int>(&h_periodicCellSmall, 1);
474 
475  // allocates host cudaTensors in host-mapped, pinned memory
476  allocate_host<cudaTensor>(&virial, 1);
477  allocate_host<cudaTensor>(&virial_half, 1);
478  allocate_host<cudaTensor>(&intVirialNormal, 1);
479  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
480  allocate_host<cudaTensor>(&intVirialNbond, 1);
481  allocate_host<cudaTensor>(&intVirialSlow, 1);
482  allocate_host<cudaTensor>(&rigidVirial, 1);
483  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
484  allocate_host<cudaTensor>(&lpVirialNormal, 1);
485  allocate_host<cudaTensor>(&lpVirialNbond, 1);
486  allocate_host<cudaTensor>(&lpVirialSlow, 1);
487 
488  // These arrays will get allocated when total forces of the GPU global calculations are requested
489  d_f_saved_nbond_x = nullptr;
490  d_f_saved_nbond_y = nullptr;
491  d_f_saved_nbond_z = nullptr;
492  d_f_saved_slow_x = nullptr;
493  d_f_saved_slow_y = nullptr;
494  d_f_saved_slow_z = nullptr;
495 
496  //Sets values for scalars
497  *kineticEnergy = 0.0;
498  *intKineticEnergy = 0.0;
499  *kineticEnergy_half = 0.0;
500  *intKineticEnergy_half = 0.0;
501  *momentum_x = 0.0;
502  *momentum_y = 0.0;
503  *momentum_z = 0.0;
504  *angularMomentum_x = 0.0;
505  *angularMomentum_y = 0.0;
506  *angularMomentum_z = 0.0;
507  *consFailure = 0;
508  *killme = 0;
509 
510  // JM: Basic infrastructure to time kernels
511  // XXX TODO: Add timing functions to be switched on/off for each of these
512  t_total = 0;
513  t_vverlet = 0;
514  t_pairlistCheck = 0;
515  t_setComputePositions = 0;
516  t_accumulateForceKick = 0;
517  t_rattle = 0;
518  t_submitHalf = 0;
519  t_submitReductions1 = 0;
520  t_submitReductions2 = 0;
521 
522  cudaEventCreate(&eventStart);
523  cudaEventCreate(&eventStop);
524  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
525  cudaCheck(cudaMemset(d_kineticEnergy, 0, sizeof(BigReal)));
526  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
527  cudaCheck(cudaMemset(d_momentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
528  cudaCheck(cudaMemset(d_momentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
529  cudaCheck(cudaMemset(d_momentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
530  cudaCheck(cudaMemset(d_angularMomentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
531  cudaCheck(cudaMemset(d_angularMomentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
532  cudaCheck(cudaMemset(d_angularMomentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
533  cudaCheck(cudaMemset(d_intKineticEnergy, 0, ATOMIC_BINS * sizeof(BigReal)));
534  cudaCheck(cudaMemset(d_virial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
535  cudaCheck(cudaMemset(d_rigidVirial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
536  cudaCheck(cudaMemset(d_intVirialNormal, 0, ATOMIC_BINS * sizeof(cudaTensor)));
537  cudaCheck(cudaMemset(d_intVirialNbond, 0, ATOMIC_BINS * sizeof(cudaTensor)));
538  cudaCheck(cudaMemset(d_intVirialSlow, 0, ATOMIC_BINS * sizeof(cudaTensor)));
539  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
540  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
541  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
542  cudaCheck(cudaMemset(d_extVirial, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(cudaTensor)));
543  cudaCheck(cudaMemset(d_extForce, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double3)));
544  cudaCheck(cudaMemset(d_extEnergy, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double)));
545 
546  memset(h_marginViolations, 0, sizeof(unsigned int));
547  memset(h_periodicCellSmall, 0, sizeof(unsigned int));
548  memset(virial, 0, sizeof(cudaTensor));
549  memset(rigidVirial, 0, sizeof(cudaTensor));
550  memset(intVirialNormal, 0, sizeof(cudaTensor));
551  memset(intVirialNbond, 0, sizeof(cudaTensor));
552  memset(intVirialSlow, 0, sizeof(cudaTensor));
553  memset(lpVirialNormal, 0, sizeof(cudaTensor));
554  memset(lpVirialNbond, 0, sizeof(cudaTensor));
555  memset(lpVirialSlow, 0, sizeof(cudaTensor));
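 // globalToLocalID entries default to -1, meaning no local patch index has been assigned yet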
556  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);
557 
558  settleList = NULL;
559  settleListSize = 0;
560  rattleList = NULL;
561  rattleListSize = 0;
562  d_consFailure = NULL;
563  d_consFailureSize = 0;
564 
566 
567  // JM: I can bundle the CompAtom and CudaAtomList pointers from the
568  // patches here and fill the Host-arrays after integration. :]
569  // if single-gpu simulation, numPatchesHome is the same as numPatchesGlobal
570  numPatchesHome = numPatchesGlobal;
571 
572 
573  int count = 0;
574  // Single-GPU case
575  if(!mGpuOn){
576  // JM NOTE: This works for a single device only, but it's fast if we want multiple PEs sharing it
577  if(deviceCUDA->getMasterPe() == CkMyPe()){
578  for(int i = 0; i < numPes; i++) {
579  atomMapList[i] = AtomMap::ObjectOnPe(i);
580  PatchMap* patchMap = PatchMap::ObjectOnPe(i);
581  int npatch = patchMap->numPatches();
582  for (int j = 0; j < npatch; j++) {
583  HomePatch *patch = patchMap->homePatch(j);
584  // JM NOTE: This data structure can be preserved through migration steps
585  if(patch != NULL) {
586  patchList.push_back(patch);
587  patchNewTolerance[count++] =
588  0.5 * ( simParams->pairlistDist - simParams->cutoff );
589  numAtomsHomeAndProxy += patchMap->patch(j)->getNumAtoms();
590 
591  patchData->devData[deviceIndex].patches.push_back(patch);
592  patchListHomeAndProxy.push_back(patch);
593  }
594  }
595  }
596  }
597  }else{
598  // Multi-device case
599  /* The logic here is trickier than in the single-GPU case.
600  Each GPU allocates space only for its home patches and any required proxy patches.
601  This rules out using an NCCL-based all-reduce for communication,
602  but DMC thinks this is okay.
603  */
604  if(deviceCUDA->getMasterPe() == CkMyPe()){
605  // Haochuan: For multi-GPU cases, we need to find the atom maps from
606  // all "non-master PEs" that use the same device as the master PE.
607  // 1. Get the current device ID.
608  const int currentDevice = deviceCUDA->getDeviceIDforPe(CkMyPe());
609  // 2. Iterate over all PEs and find those that have the same device ID as this master PE
610  for (int i = 0; i < numPes; ++i) {
611  if (deviceCUDA->getDeviceIDforPe(i) == currentDevice) {
612  atomMapList.push_back(AtomMap::ObjectOnPe(i));
613  }
614  }
615  for(int i = 0; i < deviceCUDA->getNumPesSharingDevice(); i++){
616  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(i)); // restored missing line (assumed): per-PE PatchMap for each PE sharing this device
617  for (int j = 0; j < pm->numPatches(); j++) {
618  // Aggregates the patches in a separate data structure for now
619  HomePatch *patch = pm->homePatch(j);
620  if(patch != NULL) {
621  patchData->devData[deviceIndex].patches.push_back(patch);
622  }
623  }
624  }
625 
626  // if MGPU is On, we also try to set up peer access
628 #ifdef NAMD_NCCL_ALLREDUCE
629  deviceCUDA->setNcclUniqueId(patchData->ncclId);
630  deviceCUDA->setupNcclComm();
631 #endif
632 
633  }
634  }
635  PatchMap *pm = PatchMap::Object();
636  isPeriodic = (pm->periodic_a() && pm->periodic_b() && pm->periodic_c());
637 
638  // XXX TODO decide how the biasing methods will work in the future -
639  // for multiple devices this will be a problem, how to solve it?
640  if (simParams->constraintsOn) {
641  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,
642  stream, mGpuOn);
643  }
644  if (simParams->SMDOn) {
645  if(deviceCUDA->getMasterPe() == CkMyPe()){
646  SMDKernel = new ComputeSMDCUDA(patchList, simParams->SMDk, simParams->SMDk2,
647  simParams->SMDVel, make_double3(simParams->SMDDir.x, simParams->SMDDir.y, simParams->SMDDir.z),
648  simParams->SMDOutputFreq, simParams->firstTimestep, simParams->SMDFile,
650  Node::Object()->molecule->numAtoms, nDevices, deviceIndex, mGpuOn);
651  // need to set SMD atom lists in the multi-GPU case
652  if(mGpuOn)
653  {
654  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
656  SMDKernel->initPeerCOM(cudaMgr->curSMDCOM, stream);
657  }
658  }
659  }
660  if (simParams->groupRestraintsOn) {
661  if(deviceCUDA->getMasterPe() == CkMyPe()){
662  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
663  simParams->groupRestraints, mGpuOn, nDevices, deviceIndex);
664  if(mGpuOn)
665  {
666  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
667  groupRestraintsKernel->initPeerCOM(stream);
668  }
669  }
670  }
671  if (simParams->mgridforceOn || simParams->gridforceOn ){
672  if(deviceCUDA->getMasterPe() == CkMyPe()){
673  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
674  }
675  }
676  if (Node::Object()->molecule->is_lonepairs_psf) {
677  lonepairsKernel = new ComputeLonepairsCUDA();
678  }
679  if (simParams->consForceOn){
680  if(deviceCUDA->getMasterPe() == CkMyPe()){
681  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,mGpuOn);
682  }
683  }
684 
685 }
686 
687 /* Events like ScriptTcl changes to global data require us to rebuild
688  some of the data structures used by some of the kernels; e.g., the
689  gridForceKernel needs to be rebuilt when the grid, or grid-related
690  data, is changed.
691 
692  Technically these may all be in use and updated at different
693  intervals, but that is a weird edge case that doesn't justify
694  adding a whole other sync path. If that happens, we'll be doing
695  this slightly too often. Such a hypothetical user has already
696  bought in to periodically sacrificing performance to radically
697  change the system being simulated at run time in several
698  different ways on different triggers, so as long as this overhead
699  is substantially better than a full restart, it probably doesn't
700  matter.
701  */
702 
703 void SequencerCUDA::updateDeviceKernels()
704 {
705  if (simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn){
706  // clean out the old and bring in the new if we're a master PE
707  if(deviceCUDA->getMasterPe() == CkMyPe()){
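 // fetch_sub returns the previous counter value, so the kernels are rebuilt only
 // while the update counter set by the request is still positive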
708  if(patchData->updateCounter.fetch_sub(1)>=1)
709  {
710  if(gridForceKernel!=NULL)
711  {
712  delete gridForceKernel;
713  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
714  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
715  }
716  if(consForceKernel!=NULL)
717  {
718  delete consForceKernel;
719  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,
720  mGpuOn);
721  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
722  }
723  }
724  }
725  }
726 }
727 
728 bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy)
729 {
730  cudaCheck(cudaSetDevice(deviceID));
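 // Buffers are grown only when the requested atom counts exceed current capacity;
 // capacity is over-allocated by 50% (OVERALLOC) to limit reallocation frequency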
731  const float OVERALLOC = 1.5f;
732 
733  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated ) {
734  return false;
735  }
736 
737  // Before deallocating, record whether the f_saved_nbond_* and f_saved_slow_*
738  // buffers were already allocated, so they can be reallocated at the new size below
739  bool realloc_gpu_saved_force = false;
740  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
741  realloc_gpu_saved_force = true;
742  }
743 
744 
745  deallocateArrays();
746 
747  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
748  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
749 
750  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
751  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
752  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
753  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
754  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
755  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
756  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
757  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
758  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
759  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
760  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
761  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);
762  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
763  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
764  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
765  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);
766  }
767  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
768  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
769  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);
770 
771  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
772  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
773  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
774  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
775  allocate_host<char3>(&transform, numAtomsHomeAllocated);
776  allocate_host<float>(&mass, numAtomsHomeAllocated);
777  if (simParams->alchOn) {
778  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);
779  }
780  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
781  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
782  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);
783 
784  // array buffers for pseudorandom normal distribution must be even length
785  // choose n to be the smallest even number >= numAtomsHomeAllocated,
786  // which guarantees that n is even and never smaller than the atom count
787  int n = (numAtomsHomeAllocated + 1) & (~1);
788  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
789  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
790  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
791  if (simParams->fixedAtomsOn) {
792  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
793  allocate_host(&fixedPosition_x, numAtomsHomeAllocated);
794  allocate_host(&fixedPosition_y, numAtomsHomeAllocated);
795  allocate_host(&fixedPosition_z, numAtomsHomeAllocated);
796  }
797  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
798 
799 
800  if (simParams->useDeviceMigration) {
801  allocate_host<int>(&idMig, numAtomsHomeAllocated);
802  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
803  }
804 
805  // Allocate bonded / normal forces
806  coll_f_normal_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
807  coll_f_normal_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
808  coll_f_normal_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
809  // Allocate nonbonded forces
810  coll_f_nbond_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
811  coll_f_nbond_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
812  coll_f_nbond_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
813  // Allocate slow forces
814  coll_f_slow_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
815  coll_f_slow_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
816  coll_f_slow_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
817  // Allocate positions
818  coll_pos_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
819  coll_pos_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
820  coll_pos_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
821  // Allocate velocities
822  coll_vel_x.allocate(defaultCollectiveBufferType, numAtomsHomeAllocated, SynchronousCollectiveScope::master);
823  coll_vel_y.allocate(defaultCollectiveBufferType, numAtomsHomeAllocated, SynchronousCollectiveScope::master);
824  coll_vel_z.allocate(defaultCollectiveBufferType, numAtomsHomeAllocated, SynchronousCollectiveScope::master);
825  // Allocate charges
826  coll_charge.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
827 
828  coll_sortOrder.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
829  coll_unsortOrder.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
830 
831  if (simParams->alchOn) {
832  coll_partition.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
833  }
834 
835 
836  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
837  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
838  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
839  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);
840  }
841  if (realloc_gpu_saved_force) {
842  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
843  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
844  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
845  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
846  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
847  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
848  }
849 
850  // allocate memory for backup forces in MC barostat
851  if (simParams->monteCarloPressureOn) {
852  // Total number of backup force and positions buffers
853  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
854  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
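 // Carve the nine backup force components (normal/nbond/slow x,y,z) and the three
 // backup position components out of the two raw allocations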
855  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
856  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
857  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
858  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
859  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
860  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
861  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
862  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
863  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
864  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
865  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
866  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];
867 
868  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
869  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
870  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
871  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
872  // this could be sized as the number of molecules + 1 rather than the atom count
873  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);
874  }
875 
876  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
877  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
878  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
879  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
880  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
881  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
882  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
883  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
884  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
885  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
886  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
887  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
888  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
889  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
890  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
891  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
892  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
893  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);
894 
895  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
896  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
897  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
898  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
899  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
900  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
901  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
902  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
903  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
904  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
905  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
906  if (simParams->fixedAtomsOn) {
907  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
908  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
909  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
910  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);
911  }
912 
913  if (simParams->useDeviceMigration) {
914  coll_idMig.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
915  coll_vdwType.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
916  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
917  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
918  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
919  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
920  }
921 
922  // These arrays will get allocated when total forces of the GPU global calculations are requested
923  d_f_saved_nbond_x = nullptr;
924  d_f_saved_nbond_y = nullptr;
925  d_f_saved_nbond_z = nullptr;
926  d_f_saved_slow_x = nullptr;
927  d_f_saved_slow_y = nullptr;
928  d_f_saved_slow_z = nullptr;
929 
930  // Zero the position and velocity buffers so they can be reduced into afterwards
931  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
932  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
933  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
934  cudaCheck(cudaMemset(coll_pos_x.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
935  cudaCheck(cudaMemset(coll_pos_y.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
936  cudaCheck(cudaMemset(coll_pos_z.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
937  cudaCheck(cudaMemset(coll_vel_x.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAllocated));
938  cudaCheck(cudaMemset(coll_vel_y.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAllocated));
939  cudaCheck(cudaMemset(coll_vel_z.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAllocated));
940 
941  cudaCheck(cudaMemset(d_posNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
942  cudaCheck(cudaMemset(d_posNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
943  cudaCheck(cudaMemset(d_posNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
944  cudaCheck(cudaMemset(d_velNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
945  cudaCheck(cudaMemset(d_velNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
946  cudaCheck(cudaMemset(d_velNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
947 
948  return true;
949 }
950 
951 void SequencerCUDA::reallocateMigrationDestination() {
952  coll_migrationDestination.deallocate();
953  coll_migrationDestination.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
954 }
955 
956 void SequencerCUDA::deallocateArrays() {
957  if (numAtomsHomeAndProxyAllocated != 0) {
958  cudaCheck(cudaSetDevice(deviceID));
959 
960  deallocate_host<double>(&f_normal_x);
961  deallocate_host<double>(&f_normal_y);
962  deallocate_host<double>(&f_normal_z);
963  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
964  deallocate_host<double>(&f_global_x);
965  deallocate_host<double>(&f_global_y);
966  deallocate_host<double>(&f_global_z);
967  }
968  deallocate_host<double>(&f_nbond_x);
969  deallocate_host<double>(&f_nbond_y);
970  deallocate_host<double>(&f_nbond_z);
971  deallocate_host<double>(&f_slow_x);
972  deallocate_host<double>(&f_slow_y);
973  deallocate_host<double>(&f_slow_z);
974  deallocate_host<double>(&pos_x);
975  deallocate_host<double>(&pos_y);
976  deallocate_host<double>(&pos_z);
977  deallocate_host<float>(&charge);
978  deallocate_host<int>(&sortOrder);
979  deallocate_host<int>(&unsortOrder);
980  deallocate_host<double>(&recipMass);
981  deallocate_host<double>(&vel_x);
982  deallocate_host<double>(&vel_y);
983  deallocate_host<double>(&vel_z);
984  deallocate_host<char3>(&transform);
985  deallocate_host<float>(&mass);
986  if (simParams->alchOn) {
987  deallocate_host<int>(&partition);
988  }
989  deallocate_host<float>(&langevinParam);
990  deallocate_host<float>(&langScalVelBBK2);
991  deallocate_host<float>(&langScalRandBBK2);
992 
993  deallocate_host<int>(&hydrogenGroupSize);
994  deallocate_host<int>(&atomFixed);
995  if (simParams->fixedAtomsOn) {
996  deallocate_host<int>(&groupFixed);
997  deallocate_host<double>(&fixedPosition_x);
998  deallocate_host<double>(&fixedPosition_y);
999  deallocate_host<double>(&fixedPosition_z);
1000  }
1001 
1002  deallocate_host<float>(&rigidBondLength);
1003 
1004  coll_pos_x.deallocate();
1005  coll_pos_y.deallocate();
1006  coll_pos_z.deallocate();
1007  coll_f_normal_x.deallocate();
1008  coll_f_normal_y.deallocate();
1009  coll_f_normal_z.deallocate();
1010  coll_f_nbond_x.deallocate();
1011  coll_f_nbond_y.deallocate();
1012  coll_f_nbond_z.deallocate();
1013  coll_f_slow_x.deallocate();
1014  coll_f_slow_y.deallocate();
1015  coll_f_slow_z.deallocate();
1016 
1017  coll_charge.deallocate();
1018 
1019  coll_sortOrder.deallocate();
1020  coll_unsortOrder.deallocate();
1021  if (simParams->alchOn) {
1022  coll_partition.deallocate();
1023  }
1024 
1025  deallocate_device<double>(&d_posNew_raw);
1026 
1027  if (simParams->monteCarloPressureOn) {
1028  deallocate_device<double>(&d_f_rawMC);
1029  deallocate_device<double>(&d_pos_rawMC);
1030 
1031  deallocate_host<int>(&id);
1032  deallocate_device<int>(&d_id);
1033  deallocate_device<int>(&d_idOrder);
1034  deallocate_device<int>(&d_moleculeAtom);
1035  deallocate_device<int>(&d_moleculeStartIndex);
1036  }
1037 
1038  if (simParams->useDeviceMigration) {
1039  deallocate_host<int>(&idMig);
1040  deallocate_host<int>(&vdwType);
1041  coll_idMig.deallocate();
1042  coll_vdwType.deallocate();
1043  deallocate_device<FullAtom>(&d_atomdata_AoS);
1044  deallocate_device<int>(&d_migrationGroupSize);
1045  deallocate_device<int>(&d_migrationGroupIndex);
1046  deallocate_device<int>(&d_sortIndex);
1047  }
1048  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
1049  deallocate_device<double>(&d_f_global_x);
1050  deallocate_device<double>(&d_f_global_y);
1051  deallocate_device<double>(&d_f_global_z);
1052  }
1053  coll_vel_x.deallocate();
1054  coll_vel_y.deallocate();
1055  coll_vel_z.deallocate();
1056  deallocate_device<double>(&d_recipMass);
1057  deallocate_device<char3>(&d_transform);
1058  deallocate_device<double>(&d_velNew_x);
1059  deallocate_device<double>(&d_velNew_y);
1060  deallocate_device<double>(&d_velNew_z);
1061  deallocate_device<double>(&d_posSave_x);
1062  deallocate_device<double>(&d_posSave_y);
1063  deallocate_device<double>(&d_posSave_z);
1064  deallocate_device<double>(&d_rcm_x);
1065  deallocate_device<double>(&d_rcm_y);
1066  deallocate_device<double>(&d_rcm_z);
1067  deallocate_device<double>(&d_vcm_x);
1068  deallocate_device<double>(&d_vcm_y);
1069  deallocate_device<double>(&d_vcm_z);
1070  deallocate_device<float>(&d_mass);
1071  deallocate_device<float>(&d_langevinParam);
1072  deallocate_device<float>(&d_langScalVelBBK2);
1073  deallocate_device<float>(&d_langScalRandBBK2);
1074  deallocate_device<float>(&d_gaussrand_x);
1075  deallocate_device<float>(&d_gaussrand_y);
1076  deallocate_device<float>(&d_gaussrand_z);
1077  deallocate_device<int>(&d_hydrogenGroupSize);
1078  deallocate_device<float>(&d_rigidBondLength);
1079  deallocate_device<int>(&d_atomFixed);
1080  if (simParams->fixedAtomsOn) {
1081  deallocate_device<int>(&d_groupFixed);
1082  deallocate_device<double>(&d_fixedPosition_x);
1083  deallocate_device<double>(&d_fixedPosition_y);
1084  deallocate_device<double>(&d_fixedPosition_z);
1085  }
1086  deallocate_device<double>(&d_f_saved_nbond_x);
1087  deallocate_device<double>(&d_f_saved_nbond_y);
1088  deallocate_device<double>(&d_f_saved_nbond_z);
1089  deallocate_device<double>(&d_f_saved_slow_x);
1090  deallocate_device<double>(&d_f_saved_slow_y);
1091  deallocate_device<double>(&d_f_saved_slow_z);
1092  }
1093 }
1094 
1095 void SequencerCUDA::deallocateStaticArrays() {
1096  cudaCheck(cudaSetDevice(deviceID));
1097 
1098  deallocate_host<cudaTensor>(&extVirial);
1099  deallocate_host<double3>(&extForce);
1100  deallocate_host<double>(&extEnergy);
1101  deallocate_host<unsigned int>(&h_marginViolations);
1102  deallocate_host<unsigned int>(&h_periodicCellSmall);
1103 
1104  // XXX TODO: Deallocate the additional arrays we added for shmem version
1105  deallocate_host<double3>(&awayDists);
1106  deallocate_host<double3>(&patchMin);
1107  deallocate_host<double3>(&patchMax);
1108  deallocate_host<CudaAtom*>(&cudaAtomLists);
1109  deallocate_host<double3>(&patchCenter);
1110  deallocate_host<int>(&globalToLocalID);
1111  deallocate_host<int>(&patchToDeviceMap);
1112  deallocate_host<Lattice>(&pairlist_lattices);
1113  deallocate_host<double>(&patchMaxAtomMovement);
1114  deallocate_host<double>(&patchNewTolerance);
1115  deallocate_host<CudaMInfo>(&mInfo);
1116  deallocate_host<bool*>(&h_patchRecordHasForces);
1117 
1118  deallocate_host<cudaTensor>(&lpVirialNormal);
1119  deallocate_host<cudaTensor>(&lpVirialNbond);
1120  deallocate_host<cudaTensor>(&lpVirialSlow);
1121 
1122  deallocate_device<double3>(&d_awayDists);
1123  deallocate_device<double3>(&d_patchMin);
1124  deallocate_device<double3>(&d_patchMax);
1125  deallocate_device<int>(&d_globalToLocalID);
1126  deallocate_device<int>(&d_patchToDeviceMap);
1127  deallocate_device<Lattice>(&d_lattices);
1128  deallocate_device<Lattice>(&d_pairlist_lattices);
1129  deallocate_device<double>(&d_patchMaxAtomMovement);
1130  deallocate_device<double>(&d_patchNewTolerance);
1131  deallocate_device<CudaMInfo>(&d_mInfo);
1132 
1133  deallocate_device<int>(&d_killme);
1134  deallocate_device<char>(&d_barrierFlag);
1135  deallocate_device<unsigned int>(&d_tbcatomic);
1136  deallocate_device<BigReal>(&d_kineticEnergy);
1137  deallocate_device<BigReal>(&d_intKineticEnergy);
1138  deallocate_device<BigReal>(&d_momentum_x);
1139  deallocate_device<BigReal>(&d_momentum_y);
1140  deallocate_device<BigReal>(&d_momentum_z);
1141  deallocate_device<BigReal>(&d_angularMomentum_x);
1142  deallocate_device<BigReal>(&d_angularMomentum_y);
1143  deallocate_device<BigReal>(&d_angularMomentum_z);
1144  deallocate_device<cudaTensor>(&d_virial);
1145  deallocate_device<cudaTensor>(&d_intVirialNormal);
1146  deallocate_device<cudaTensor>(&d_intVirialNbond);
1147  deallocate_device<cudaTensor>(&d_intVirialSlow);
1148  deallocate_device<cudaTensor>(&d_lpVirialNormal);
1149  deallocate_device<cudaTensor>(&d_lpVirialNbond);
1150  deallocate_device<cudaTensor>(&d_lpVirialSlow);
1151  deallocate_device<cudaTensor>(&d_rigidVirial);
1152  deallocate_device<cudaTensor>(&d_extVirial);
1153  deallocate_device<double3>(&d_extForce);
1154  deallocate_device<double>(&d_extEnergy);
1155  deallocate_device<SettleParameters>(&sp);
1156  deallocate_device<unsigned int>(&deviceQueue);
1157 
1158  deallocate_device<CudaLocalRecord*>(&d_peer_record);
1159  deallocate_device<bool*>(&d_patchRecordHasForces);
1160 
1161  deallocate_device(&d_fixVirialNormal);
1162  deallocate_device(&d_fixVirialNbond);
1163  deallocate_device(&d_fixVirialSlow);
1164  deallocate_device(&d_fixForceNormal);
1165  deallocate_device(&d_fixForceNbond);
1166  deallocate_device(&d_fixForceSlow);
1167 
1168  if (simParams->useDeviceMigration) {
1169  coll_atomdata_AoS_in.deallocate();
1170  coll_sortSoluteIndex.deallocate();
1171  coll_migrationDestination.deallocate();
1172  }
1173 
1174  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
1175 }
1176 
1177 void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex){
1178  CudaMInfo &m = mInfo[patchIndex];
1179  if (!p->patchMapRead) p->readPatchMap();
1180  for(int x = 0; x < 3; x++){
1181  for(int y = 0; y < 3; y++){
1182  for(int z = 0; z < 3; z++){
1183  // copies migration info over
1184  MigrationInfo *hm = p->mInfo[x][y][z];
1185  if(hm != NULL) m.destPatchID[x][y][z] = hm->destPatchID;
1186  else m.destPatchID[x][y][z] = -1; // let's flag this as -1 for now
1187  }
1188  }
1189  }
1190  if (simParams->useDeviceMigration) {
1191  m.destPatchID[1][1][1] = p->getPatchID();
1192  }
1193 }
1194 
1195 void SequencerCUDA::assembleOrderedPatchList(){
1196  // Assembles the patches on each Pe sharing a device into a device-ordered list
1197  patchList.clear();
1198 
1199  // Handle our own patches
1200  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
1201  HomePatch *p = patchData->devData[deviceIndex].patches[i];
1202  patchList.push_back(p);
1203  }
1204 
1205 
1206  // Do we really need this?
1207 #if 1
1208  patchListHomeAndProxy.clear();
1209  // Set up list of device patches. We need the home patches for everyone
1210  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1211  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1212  const int patchID = localPatches[i].patchID;
1213 
1214 
1215  for(int d = 0; d < CkNumPes(); d++){
1216  PatchMap* pm = PatchMap::ObjectOnPe(d);
1217  HomePatch *patch = pm->homePatch(patchID);
1218  if(patch != NULL) {
1219  patchListHomeAndProxy.push_back(patch);
1220  }
1221  }
1222  }
1223 
1224 #endif
1225 }
1226 
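 // Packs current positions and velocities into the device AoS buffer, then copies
 // each home patch's atoms back to its host FullAtomList.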
1234 void SequencerCUDA::copyAoSDataToHost() {
1235  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1236  patchData = cpdata.ckLocalBranch();
1237 
1238  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1239  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1240 
1241  CUDAMigrationKernel->update_AoS(
1242  numPatchesHome,
1243  patchData->devData[deviceIndex].d_localPatches,
1244  (FullAtom*) d_atomdata_AoS,
1245  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1246  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1247  stream
1248  );
1249 
1250  for (int i = 0; i < integrationPatches.size(); i++) {
1251  const int numAtoms = localPatches[i].numAtoms;
1252  const int offset = localPatches[i].bufferOffset;
1253  HomePatch *patch = integrationPatches[i];
1254  patch->updateAtomCount(numAtoms, false);
1255  patch->updateAtomBuffers();
1256  FullAtomList& h_atomdata = patch->getAtomList();
1257  copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
1258  }
1259  cudaCheck(cudaStreamSynchronize(stream));
1260 }
1261 
1262 
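 // Local stage of GPU-resident migration: compute migration group indices, pack
 // positions/velocities into the AoS buffer, compute each group's destination patch,
 // and perform the device-local part of the atom movement.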
1269 void SequencerCUDA::migrationLocalInit() {
1270  CUDAMigrationKernel->computeMigrationGroupIndex(
1271  numPatchesHome,
1272  patchData->devData[deviceIndex].d_localPatches,
1273  d_migrationGroupSize,
1274  d_migrationGroupIndex,
1275  stream
1276  );
1277 
1278  CUDAMigrationKernel->update_AoS(
1279  numPatchesHome,
1280  patchData->devData[deviceIndex].d_localPatches,
1281  (FullAtom*) d_atomdata_AoS,
1282  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1283  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1284  stream
1285  );
1286 
1287  CUDAMigrationKernel->computeMigrationDestination(
1288  numPatchesHome,
1289  patchData->devData[deviceIndex].d_localPatches,
1290  myLattice,
1291  d_mInfo,
1292  d_patchToDeviceMap,
1293  d_globalToLocalID,
1294  d_patchMin,
1295  d_patchMax,
1296  d_hydrogenGroupSize,
1297  d_migrationGroupSize,
1298  d_migrationGroupIndex,
1299  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1300  coll_migrationDestination.getDevicePtr(),
1301  stream
1302  );
1303 
1304  CUDAMigrationKernel->performLocalMigration(
1305  numPatchesHome,
1306  patchData->devData[deviceIndex].d_localPatches,
1307  (FullAtom*) d_atomdata_AoS,
1308  (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
1309  coll_migrationDestination.getDevicePtr(),
1310  stream
1311  );
1312 
1313  cudaCheck(cudaStreamSynchronize(stream));
1314 }
1315 
1320 void SequencerCUDA::migrationPerform() {
1321  CUDAMigrationKernel->performMigration(
1322  numPatchesHome,
1323  patchData->devData[deviceIndex].d_localPatches,
1324  d_peer_record,
1325  (FullAtom*) d_atomdata_AoS,
1326  coll_atomdata_AoS_in.getDevicePeerPtr(),
1327  d_migrationGroupSize,
1328  d_migrationGroupIndex,
1329  coll_migrationDestination.getDevicePtr(),
1330  stream
1331  );
1332  cudaCheck(cudaStreamSynchronize(stream));
1333 }
1334 
1335 
1336 void SequencerCUDA::migrationUpdateAtomCounts() {
1337  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1338  patchData = cpdata.ckLocalBranch();
1339 
1340  CUDAMigrationKernel->updateLocalRecords(
1341  numPatchesHome,
1342  patchData->devData[deviceIndex].d_localPatches,
1343  d_peer_record,
1344  patchData->devData[deviceIndex].d_peerPatches,
1345  stream
1346  );
1347 
1348  cudaCheck(cudaStreamSynchronize(stream));
1349 }
1350 
1351 void SequencerCUDA::migrationUpdateAtomOffsets() {
1352  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1353  patchData = cpdata.ckLocalBranch();
1354 
1355  CUDAMigrationKernel->updateLocalRecordsOffset(
1356  numPatchesHomeAndProxy,
1357  patchData->devData[deviceIndex].d_localPatches,
1358  stream
1359  );
1360 
1361  cudaCheck(cudaStreamSynchronize(stream));
1362 }
1363 
1364 void SequencerCUDA::migrationUpdateRemoteOffsets() {
1365  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1366  patchData = cpdata.ckLocalBranch();
1367 
1368  CUDAMigrationKernel->updatePeerRecords(
1369  numPatchesHomeAndProxy,
1370  patchData->devData[deviceIndex].d_localPatches,
1371  d_peer_record,
1372  patchData->devData[deviceIndex].d_peerPatches,
1373  stream
1374  );
1375 
1376  cudaCheck(cudaStreamSynchronize(stream));
1377 }
1378 
1379 void SequencerCUDA::migrationUpdateProxyDestination() {
1380  if (mGpuOn) {
1381  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1382  patchData = cpdata.ckLocalBranch();
1383 
1384  // This is implemented as a put instead of a get to avoid the need
1385  // for a node synchronization.
1386  CUDAMigrationKernel->copyMigrationDestinationToProxies(
1387  deviceIndex,
1388  numPatchesHome,
1389  numPatchesHomeAndProxy,
1390  patchData->devData[deviceIndex].d_localPatches,
1391  patchData->devData[deviceIndex].d_peerPatches,
1392  coll_migrationDestination.getDevicePeerPtr(),
1393  stream
1394  );
1395  }
1396 }
1397 
1398 void SequencerCUDA::copyPatchDataToHost() {
1399  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1400  patchData = cpdata.ckLocalBranch();
1401  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1402 
1403  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1404  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1405 
1406  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
1407  cudaCheck(cudaStreamSynchronize(stream));
1408 
1409 
1410  // Update Atom Counts
1411  for (int i = 0; i < numPatchesHome; i++) {
1412  HomePatch* hp = integrationPatches[i];
1413  hp->updateAtomCount(localPatches[i].numAtoms, false);
1414  }
1415  cudaCheck(cudaStreamSynchronize(stream));
1416 }
1417 
1418 // This code path is used only by device migration
1419 void SequencerCUDA::copyAtomDataToDeviceAoS() {
1420  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1421  patchData = cpdata.ckLocalBranch();
1422 
1423  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1424  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1425  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1426 
1427 
1428  for (int i = 0; i < integrationPatches.size(); i++) {
1429  const int numAtoms = localPatches[i].numAtoms;
1430  if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
1431  iout << iERROR << "The number of atoms in patch " << i << " is "
1432  << numAtoms << ", greater than the limit for GPU atom migration ("
1433  << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
1434  NAMD_bug("NAMD has stopped simulating due to the error above, "
1435  "but you could disable GPUAtomMigration and try again.\n");
1436  }
1437  const int offset = localPatches[i].bufferOffset;
1438  HomePatch *patch = integrationPatches[i];
1439  FullAtomList& h_atomdata = patch->getAtomList();
1440  copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), coll_atomdata_AoS_in.getDevicePtr() + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
1441  }
1442  cudaCheck(cudaStreamSynchronize(stream));
1443 }
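// --- Illustrative sketch (added commentary, not NAMD source) ---
// The staging buffer filled above is padded: every home patch owns a fixed slot
// of MigrationCUDAKernel::kMaxAtomsPerPatch entries, so atom j of local patch i
// lands at a simple strided index:
static inline int64_t paddedAoSIndex_sketch(int patchIndex, int atomIndex) {
  return ((int64_t) patchIndex) * MigrationCUDAKernel::kMaxAtomsPerPatch + atomIndex;
}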
1444 
1445 /*
1446  * Aggregates and copies various data from the host to the device
1447  *
1448  * Some data is only copied for home patches, while other data is copied for
1449  * home and proxy patches
1450  *
1451  */
1452 void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
1453 
1454  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
1455  if(copyForces){
1456  switch (maxForceNumber) {
1457  case 2:
1458  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_x, double, stream);
1459  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_y, double, stream);
1460  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_z, double, stream);
1461  case 1:
1462  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_x, double, stream);
1463  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_y, double, stream);
1464  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_z, double, stream);
1465  case 0:
1466  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_x, double, stream);
1467  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_y, double, stream);
1468  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_z, double, stream);
1469  }
1470  }
1471 
1472  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_x, double, stream);
1473  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_y, double, stream);
1474  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_z, double, stream);
1475  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_x, double, stream);
1476  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_y, double, stream);
1477  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_z, double, stream);
1478  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
1479  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(charge, float, stream);
1480  if (simParams->alchOn) {
1481  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(partition, int, stream);
1482  }
1483 
1484  if (simParams->langevinOn) {
1485  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
1486  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
1487  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);
1488  }
1489 
1490  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
1491  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
1492  if (simParams->fixedAtomsOn) {
1493  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
1494  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
1495  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
1496  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);
1497  }
1498  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(sortOrder, int, stream);
1499  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(unsortOrder, int, stream);
1500  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
1501 
1502  if (simParams->monteCarloPressureOn) {
1503  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);
1504  //set up initial mapping for global index to local array index
1505  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
1506  }
1507 }
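// --- Illustrative sketch (added commentary, not NAMD source) ---
// The switch over maxForceNumber in copyAtomDataToDevice() relies on intentional
// fallthrough: requesting level 2 (slow) also aggregates levels 1 (nonbonded)
// and 0 (normal). The same cumulative pattern, reduced to a standalone example:
static void copyForceLevels_sketch(int maxForceNumber) {
  switch (maxForceNumber) {
    case 2: /* aggregate and copy slow forces */      // fallthrough
    case 1: /* aggregate and copy nonbonded forces */ // fallthrough
    case 0: /* aggregate and copy normal forces */    break;
  }
}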
1508 
1509 
1510 void SequencerCUDA::migrationLocalPost(int startup) {
1511  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1512  patchData = cpdata.ckLocalBranch();
1513 
1514  if (simParams->useDeviceMigration) {
1515  if (!startup) {
1516  CUDAMigrationKernel->transformMigratedPositions(
1517  numPatchesHome,
1518  patchData->devData[deviceIndex].d_localPatches,
1519  coll_patchCenter.getDevicePtr(),
1520  (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
1521  myLattice,
1522  stream
1523  );
1524  }
1525 
1526  // sort solvent/solute data
1527  CUDAMigrationKernel->sortSolventAtoms(
1528  numPatchesHome,
1529  patchData->devData[deviceIndex].d_localPatches,
1530  (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
1531  (FullAtom*) d_atomdata_AoS,
1532  coll_sortSoluteIndex.getDevicePtr(),
1533  stream
1534  );
1535 
1536  double dt = 1.0;
1537  double kbT = 1.0;
1538  double tempFactor = 1.0;
1539  if (simParams->langevinOn) {
1540  dt = simParams->dt * 0.001; // convert timestep to ps
1541  kbT = BOLTZMANN * simParams->langevinTemp;
1542  int lesReduceTemp = (simParams->lesOn && simParams->lesReduceTemp);
1543  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);
1544  }
1545  CUDAMigrationKernel->copy_AoS_to_SoA(
1546  numPatchesHome, simParams->alchOn,
1547  simParams->langevinOn, dt, kbT, tempFactor,
1548  patchData->devData[deviceIndex].d_localPatches,
1549  (FullAtom*) d_atomdata_AoS,
1550  d_recipMass,
1551  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1552  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1553  d_mass, coll_charge.getDevicePtr(),
1554  coll_idMig.getDevicePtr(), coll_vdwType.getDevicePtr(),
1555  d_hydrogenGroupSize, d_migrationGroupSize,
1556  d_atomFixed,
1557  d_rigidBondLength,
1558  d_transform,
1559  coll_partition.getDevicePtr(),
1560  d_langevinParam,
1561  d_langScalVelBBK2,
1562  d_langScalRandBBK2,
1563  stream
1564  );
1565  }
1566 
1567  // Other migration post processing steps
1568  if (simParams->useDeviceMigration) {
1569  if (simParams->monteCarloPressureOn) {
1570  CUDASequencerKernel->SetAtomIndexOrder(coll_idMig.getDevicePtr(), d_idOrder, numAtomsHome, stream);
1571  }
1572  }
1573 
1574  // JM: Saving positions to these double buffers for pairlistCheck
1575  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posSave_x, numAtomsHome, stream);
1576  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posSave_y, numAtomsHome, stream);
1577  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posSave_z, numAtomsHome, stream);
1578 
1579  // JM NOTE: We need to save the lattice at the beginning of the cycle
1580  // in order to use it in SequencerCUDAKernel::pairlistCheck();
1581  myLatticeOld = myLattice;
1582 
1583  // JM NOTE: Recalculates the centers of mass since we have new positions
1584  CUDASequencerKernel->centerOfMass(
1585  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1586  d_rcm_x, d_rcm_y, d_rcm_z,
1587  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1588  CUDASequencerKernel->centerOfMass(
1589  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1590  d_vcm_x, d_vcm_y, d_vcm_z,
1591  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1592 
1593  cudaCheck(cudaStreamSynchronize(stream));
1594 }
1595 
1596 void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
1597  if(simParams->eFieldOn || simParams->SMDOn || simParams->groupRestraintsOn ||
1598  simParams->monteCarloPressureOn || simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn ||
1599  simParams->colvarsOn ||
1600  simParams->useCudaGlobal ||
1601  simParams->tclForcesOn){
1602 
1603  // Hand-copies the transform field into a char3 SOA buffer
1604  // JM NOTE: We're copying ints into chars because "transform" is an int
1605  // field in PatchDataSOA but a signed char in FullAtom
1606  size_t offset = 0;
1607  for (int i = 0; i < numPatchesHome; i++) {
1608  PatchDataSOA& current = patchList[i]->patchDataSOA;
1609  const int numPatchAtoms = current.numAtoms;
1610  // memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type));
1611  for(int j = 0; j < numPatchAtoms; j++){
1612  transform[offset + j].x = current.transform_i[j];
1613  transform[offset + j].y = current.transform_j[j];
1614  transform[offset + j].z = current.transform_k[j];
1615  }
1616  offset += numPatchAtoms;
1617  }
1618  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);
1619  }
1620 
1621  if (!startup) {
1623  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);
1624  }
1625  if(simParams->constraintsOn) {
1626  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1627  }
1628  if(simParams->SMDOn) {
1629  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1630  }
1631  if(simParams->groupRestraintsOn) {
1632  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1633  }
1634  if(simParams->mgridforceOn || simParams->gridforceOn){
1635 
1636  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
1637  }
1638  if(simParams->consForceOn) {
1639  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1640  }
1641 
1642  }
1643 }
1644 
1645 void SequencerCUDA::migrationUpdateDestination() {
1646  CUDAMigrationKernel->updateMigrationDestination(
1647  numAtomsHomePrev,
1648  coll_migrationDestination.getDevicePtr(),
1649  coll_sortSoluteIndex.getDevicePeerPtr(),
1650  stream
1651  );
1652 }
1653 
1654 bool SequencerCUDA::copyPatchData(
1655  const bool copyIn,
1656  const bool startup
1657 ) {
1658  bool reallocated = false;
1659  if (copyIn) {
1660  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1661  patchData = cpdata.ckLocalBranch();
1662 
1663  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1664 
1665  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1666  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1667 
1668  if (startup) {
1669  // numPatchesHomeAndProxy is set when the patch data is constructed
1670  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1671  numPatchesHome = homePatches.size();
1672  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;
1673 
1674  coll_patchCenter.allocate_no_check(defaultCollectiveBufferType, numPatchesGlobal);
1675 
1676  if (simParams->useDeviceMigration) {
1677  // Padded data structures which will not be reallocated
1678  coll_sortSoluteIndex.allocate(defaultCollectiveBufferType, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch, SynchronousCollectiveScope::master);
1679  coll_atomdata_AoS_in.allocate(defaultCollectiveBufferType, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch, SynchronousCollectiveScope::master);
1680  }
1681 #if defined(NAMD_HIP)
1682  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
1683  sizeof(CudaLocalRecord)*numPatchesHomeAndProxy,
1684  hipDeviceMallocFinegrained);
1685  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
1686  sizeof(CudaPeerRecord)*peerPatches.size(),
1687  hipDeviceMallocFinegrained);
1688 #else
1689  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
1690  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
1691 #endif
1692  if (simParams->useDeviceMigration) {
1693  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);
1694  }
1695 
1696  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1697  numPatchesHomeAndProxy, stream);
1698  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1699  peerPatches.size(), stream);
1700  if(true || mGpuOn) {
1701  this->assembleOrderedPatchList();
1702  }
1703  this->copySettleParameter();
1704 
1705  for (int i = 0; i < numPatchesHome; i++) {
1706  HomePatch *patch = homePatches[i];
1707  this->copyMigrationInfo(patch, i);
1708  patchNewTolerance[i] = 0.5 * ( simParams->pairlistDist - simParams->cutoff);
1709 
1710  globalToLocalID[patch->getPatchID()] = i;
1711  patchToDeviceMap[patch->getPatchID()] = deviceIndex;
1712  }
1713  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
1714  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);
1715 
1716  // The globalToLocalID and patchToDeviceMap data structures need to be system-wide for migration.
1717  // They are also used in tuple migration, so we add them to patchData, where they can be easily
1718  // accessed elsewhere
1719  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
1720  if (i == deviceIndex) continue;
1721  std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
1722  for (int j = 0; j < otherPatches.size(); j++) {
1723  HomePatch *patch = otherPatches[j];
1724  globalToLocalID[patch->getPatchID()] = j;
1725  patchToDeviceMap[patch->getPatchID()] = i;
1726  }
1727  }
1728  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
1729  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
1730  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
1731  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;
1732 
1733  // Allocate more data
1734  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);
1735  }
1736 
1737  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1738  HomePatch *patch = patchListHomeAndProxy[i];
1739  awayDists[i].x = patch->aAwayDist;
1740  awayDists[i].y = patch->bAwayDist;
1741  awayDists[i].z = patch->cAwayDist;
1742  COPY_CUDAVECTOR(patch->center, patchCenter[i]);
1743  COPY_CUDAVECTOR(patch->getMin(), patchMin[i]);
1744  COPY_CUDAVECTOR(patch->getMax(), patchMax[i]);
1745  }
1746 
1747  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
1748  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
1749  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
1750  copy_HtoD<double3>(patchCenter, coll_patchCenter.getDevicePtr(), numPatchesHomeAndProxy, stream);
1751 
1752  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
1753  localPatches[numPatchesHomeAndProxy-1].numAtoms;
1754 
1755  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
1756  localPatches[numPatchesHome-1].numAtoms;
1757 
1758  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);
1759 
1760 
1761  numAtomsHomePrev = numAtomsHome;
1762  numAtomsHomeAndProxy = totalAtomCount;
1763  numAtomsHome = homeAtomCount;
1764 
1765  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;
1766 
1767  if (!startup) {
1768  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1769  numPatchesHomeAndProxy, stream);
1770  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1771  peerPatches.size(), stream);
1772  }
1773  if (startup) {
1774  if (simParams->monteCarloPressureOn) {
1775  //Only at startup, copy the molecule's index info to GPU
1776  Molecule *molecule = Node::Object()->molecule;
1777  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
1778  copy_HtoD<int>(molecule->moleculeStartIndex, d_moleculeStartIndex, molecule->numMolecules + 1, stream);
1779  }
1780  }
1781  }
1782  return reallocated;
1783 }
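// --- Illustrative sketch (added commentary, not NAMD source) ---
// Because each CudaLocalRecord carries a running bufferOffset, the atom count
// covered by records [0, n) is just the last record's offset plus its own atom
// count, which is how copyPatchData() derives homeAtomCount and totalAtomCount.
static inline int atomCountOverRecords_sketch(const std::vector<CudaLocalRecord>& recs) {
  return recs.empty() ? 0 : recs.back().bufferOffset + recs.back().numAtoms;
}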
1784 
1785 void SequencerCUDA::copyDataToPeers(
1786  const bool copyIn
1787 ) {
1788  if (!copyIn) return;
1789  // Positions will be copied by the kernel
1790  // Forces don't need to be copied
1791  // Atom data that needs to be copied: sortOrder, unsortOrder
1792 
1793  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1794  patchData = cpdata.ckLocalBranch();
1795  if (mGpuOn) {
1796  CUDAMigrationKernel->copyDataToProxies(
1797  deviceIndex,
1798  numPatchesHome,
1799  numPatchesHomeAndProxy,
1800  patchData->devData[deviceIndex].d_localPatches,
1801  coll_idMig.getDevicePeerPtr(),
1802  coll_vdwType.getDevicePeerPtr(),
1803  coll_sortOrder.getDevicePeerPtr(),
1804  coll_unsortOrder.getDevicePeerPtr(),
1805  coll_charge.getDevicePeerPtr(),
1806  coll_partition.getDevicePeerPtr(),
1807  coll_patchCenter.getDevicePeerPtr(),
1808  simParams->alchOn,
1809  stream
1810  );
1811  }
1812  cudaCheck(cudaStreamSynchronize(stream));
1813 }
1814 
1815 void SequencerCUDA::migrationSortAtomsNonbonded() {
1816  CUDAMigrationKernel->sortAtoms(
1817  numPatchesHome, numAtomsHome,
1818  patchData->devData[deviceIndex].d_localPatches,
1819  patchMin, patchMax,
1820  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1821  coll_sortOrder.getDevicePtr(),
1822  coll_unsortOrder.getDevicePtr(),
1823  d_sortIndex,
1824  stream
1825  );
1826 }
1827 
1828 void SequencerCUDA::maximumMove(
1829  const double maxvel2,
1830  const int numAtoms)
1831 {
1832  CUDASequencerKernel->maximumMove(
1833  maxvel2, coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1834  killme, numAtoms, stream);
1835 }
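// --- Illustrative sketch (added commentary, not NAMD source) ---
// maximumMove() compares each atom's squared speed against maxvel2 and counts
// offenders in killme; the per-atom test presumably reduces to:
static inline bool velocityTooLarge_sketch(double vx, double vy, double vz, double maxvel2) {
  return vx * vx + vy * vy + vz * vz > maxvel2;
}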
1836 
1837 void SequencerCUDA::submitHalf(
1838  int numAtoms, int part, const bool doEnergy)
1839 {
1840  //BigReal kineticEnergy;
1841  Tensor reduction_virial;
1842  //cudaTensor h_virial;
1843  //BigReal intKineticEnergy;
1844  Tensor reduction_intVirialNormal;
1845  //cudaTensor h_intVirialNormal;
1846  int hgs;
1847 
1848  if(doEnergy){
1849 #if 0
1850  cudaCheck(cudaEventRecord(eventStart,stream));
1851 #endif
1852  CUDASequencerKernel->submitHalf(
1853  simParams->fixedAtomsOn,
1854  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1855  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
1856  d_kineticEnergy, d_intKineticEnergy,
1857  d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
1858  virial_half, intVirialNormal_half,
1859  d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);
1860 #if 0
1861  cudaCheck(cudaEventRecord(eventStop, stream));
1862  cudaCheck(cudaEventSynchronize(eventStop));
1863  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
1864  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
1865  t_submitReductions2 = 0;
1866 #endif
1867  }
1868 }
1869 
1870 void SequencerCUDA::submitReductions(
1871  BigReal origin_x,
1872  BigReal origin_y,
1873  BigReal origin_z,
1874  int marginViolations,
1875  int doEnergy,
1876  int doMomentum,
1877  int numAtomsReduction,
1878  int maxForceNumber)
1879 {
1880  // reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsReduction; //moved to launch_part2, startRun2
1881  // where do I get the margin violations?
1882  // reduction->item(REDUCTION_MARGIN_VIOLATIONS) += marginViolations;
1883  if(doEnergy){
1884  if(doMomentum){
1885  // JM NOTE: Calculates momenta if copyOut
1886  CUDASequencerKernel->submitReduction1(
1887  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1888  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_mass,
1889  d_kineticEnergy,
1890  d_momentum_x, d_momentum_y, d_momentum_z,
1891  d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1892  origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1893  momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1894  numAtomsReduction, stream);
1895  }
1896  Tensor regintVirialNormal;
1897  Tensor regintVirialNbond;
1898  Tensor regintVirialSlow;
1899 
1900 #if 0
1901  cudaCheck(cudaEventRecord(eventStart,stream));
1902 #endif
1903  CUDASequencerKernel->submitReduction2(
1904  simParams->fixedAtomsOn, d_atomFixed,
1905  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1906  d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1907  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
1908  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
1909  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
1910  d_mass, d_hydrogenGroupSize,
1911  d_kineticEnergy, kineticEnergy,
1912  d_intKineticEnergy, intKineticEnergy,
1913  d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1914  intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1915  d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);
1916  // Haochuan: actually should be doVirial here
1917  if (simParams->fixedAtomsOn) {
1918  CUDASequencerKernel->calcFixVirial(
1919  maxForceNumber, numAtomsReduction, d_atomFixed,
1920  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1921  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
1922  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
1923  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
1924  d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1925  d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1926  }
1927  }
1928 #if 0
1929  cudaCheck(cudaEventRecord(eventStop, stream));
1930  cudaCheck(cudaEventSynchronize(eventStop));
1931  cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1932  fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1933  t_submitReductions2 = 0;
1934 #endif
1935 }
1936 
1937 void SequencerCUDA::copySettleParameter(){
1938  // Searching for a patch that contains initialized settle parameters
1939  cudaCheck(cudaSetDevice(deviceID));
1940  if(simParams->rigidBonds != RIGID_NONE){
1941  HomePatch *patch = NULL;
1942  // PatchList contains all patches in the node, so if there's a single water in the system,
1943  // this is guaranteed to catch it
1944  for(int i = 0; i < patchList.size(); i++){
1945  if(patchList[i]->settle_initialized) {
1946  patch = patchList[i];
1947  break;
1948  }
1949  }
1950  if ( patch ) {
1951  SettleParameters h_sp;
1952  h_sp.mO = patch->settle_mO;
1953  h_sp.mH = patch->settle_mH;
1954  h_sp.mOrmT = patch->settle_mOrmT;
1955  h_sp.mHrmT = patch->settle_mHrmT;
1956  h_sp.rra = patch->settle_rra;
1957  h_sp.ra = patch->settle_ra;
1958  h_sp.rb = patch->settle_rb;
1959  h_sp.rc = patch->settle_rc;
1960  h_sp.r_om = patch->r_om;
1961  h_sp.r_ohc = patch->r_ohc;
1962 
1963  // fprintf(stderr, "SETTLEPARAMETER Found: Values %lf %lf %lf %lf %lf %lf %lf %lf\n",
1964  // h_sp.mO, h_sp.mH, h_sp.mOrmT, h_sp.mHrmT, h_sp.rra, h_sp.ra, h_sp.rb, h_sp.rc );
1965  copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
1966  }
1967  }
1968 }
1969 
1970 // Does rattle1_SOA
1971 void SequencerCUDA::startRun1(
1972  int maxForceNumber,
1973  const Lattice& lattice
1974 ) {
1975  // cudaCheck(cudaSetDevice(deviceID));
1976  myLattice = lattice;
1977 
1978  // JM: Enforcing rigid bonds on first iteration
1979  CUDASequencerKernel->rattle1(
1980  simParams->fixedAtomsOn, 1, 0,
1981  numAtomsHome, 0.f, 0.f,
1982  2.0 * simParams->rigidTol,
1983  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1984  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1985  d_velNew_x, d_velNew_y, d_velNew_z,
1986  d_posNew_x, d_posNew_y, d_posNew_z,
1987  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
1988  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
1989  &settleList, settleListSize, &d_consFailure,
1990  d_consFailureSize, &rattleList, rattleListSize,
1991  &nSettle, &nRattle,
1992  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
1993  buildRigidLists, consFailure, simParams->watmodel, stream);
1994 
1995  this->copyPositionsAndVelocitiesToHost(1, 0);
1996  cudaCheck(cudaDeviceSynchronize());
1997 #if 0
1998  printSOAPositionsAndVelocities();
1999 #endif
2000 }
2001 
2002 void SequencerCUDA::startRun2(
2003  double dt_normal,
2004  double dt_nbond,
2005  double dt_slow,
2006  Vector origin,
2007  int doGlobal,
2008  int maxForceNumber
2009 ){
2010  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2011 
2012  // This is a patch-based kernel, which means each threadblock deals with an entire patch.
2013  // So, in the multi-GPU case, we want to keep non-offset pointers but only
2014  // deal with a handful of patches
2015 
2016  // We don't need to (and should not) set the normal forces to zero here:
2017  // they store the global forces, and the normal forces are overwritten
2018  // every step rather than accumulated into
2019  // cudaCheck(cudaMemset(coll_f_normal_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2020  // cudaCheck(cudaMemset(coll_f_normal_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2021  // cudaCheck(cudaMemset(coll_f_normal_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2022 
2023  cudaCheck(cudaMemset(coll_f_nbond_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2024  cudaCheck(cudaMemset(coll_f_nbond_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2025  cudaCheck(cudaMemset(coll_f_nbond_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2026 
2027  cudaCheck(cudaMemset(coll_f_slow_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2028  cudaCheck(cudaMemset(coll_f_slow_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2029  cudaCheck(cudaMemset(coll_f_slow_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2030 
2031  CUDASequencerKernel->accumulateForceToSOA(
2032  doGlobal,
2033  simParams->useCudaGlobal,
2034  maxForceNumber,
2035  numPatchesHomeAndProxy,
2036  nDevices,
2037  patchData->devData[deviceIndex].d_localPatches,
2038  patchData->devData[deviceIndex].f_bond,
2039  patchData->devData[deviceIndex].f_bond_nbond,
2040  patchData->devData[deviceIndex].f_bond_slow,
2041  patchData->devData[deviceIndex].forceStride,
2042  patchData->devData[deviceIndex].f_nbond,
2043  patchData->devData[deviceIndex].f_nbond_slow,
2044  patchData->devData[deviceIndex].f_slow,
2045  d_f_global_x,
2046  d_f_global_y,
2047  d_f_global_z,
2048  coll_f_normal_x.getDevicePtr(),
2049  coll_f_normal_y.getDevicePtr(),
2050  coll_f_normal_z.getDevicePtr(),
2051  coll_f_nbond_x.getDevicePtr(),
2052  coll_f_nbond_y.getDevicePtr(),
2053  coll_f_nbond_z.getDevicePtr(),
2054  coll_f_slow_x.getDevicePtr(),
2055  coll_f_slow_y.getDevicePtr(),
2056  coll_f_slow_z.getDevicePtr(),
2057  coll_unsortOrder.getDevicePtr(),
2058  myLattice,
2059  patchData->d_queues,
2060  patchData->d_queueCounters,
2061  d_tbcatomic,
2062  stream
2063  );
2064  if(mGpuOn){
2065  if(SMDKernel)
2066  {
2067  // compute distributed center of mass for groups of atoms
2068  SMDKernel->computeCOMMGpu(myLattice, d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2069  d_transform, stream);
2070  }
2071  if(groupRestraintsKernel)
2072  {
2073  groupRestraintsKernel->doCOM_mgpu(myLattice, d_transform,
2074  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2075  stream);
2076  }
2077  // Synchronize device before node barrier
2078  cudaCheck(cudaDeviceSynchronize());
2079  }
2080 #if 0
2081  printSOAPositionsAndVelocities();
2082 #endif
2083 }
2084 
2085 void SequencerCUDA::startRun3(
2086  double dt_normal,
2087  double dt_nbond,
2088  double dt_slow,
2089  Vector origin,
2090  const bool requestGlobalForces,
2091  int doGlobalMasterStaleForces,
2092  const bool requestForcesOutput,
2093  const bool requestGlobalForcesGPU,
2094  int maxForceNumber
2095 ){
2096  const bool doFixed = simParams->fixedAtomsOn;
2097  if(mGpuOn){
2098  // XXX TODO we need to call the force merging kernel here
2099 #if 1
2100  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2101  std::vector<int> atom_counts;
2102  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2103  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2104  }
2105  CUDASequencerKernel->mergeForcesFromPeers(
2106  deviceIndex,
2107  maxForceNumber,
2108  myLattice,
2109  numPatchesHomeAndProxy,
2110  numPatchesHome,
2111  this->coll_f_normal_x.getDevicePeerPtr(),
2112  this->coll_f_normal_y.getDevicePeerPtr(),
2113  this->coll_f_normal_z.getDevicePeerPtr(),
2114  this->coll_f_nbond_x.getDevicePeerPtr(),
2115  this->coll_f_nbond_y.getDevicePeerPtr(),
2116  this->coll_f_nbond_z.getDevicePeerPtr(),
2117  this->coll_f_slow_x.getDevicePeerPtr(),
2118  this->coll_f_slow_y.getDevicePeerPtr(),
2119  this->coll_f_slow_z.getDevicePeerPtr(),
2120  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2121  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2122  patchData->devData[deviceIndex].d_localPatches,
2123  patchData->devData[deviceIndex].d_peerPatches,
2124  atom_counts,
2125  stream
2126  );
2127 #else
2128 
2129  // Before I call nccl, let's see the forces here
2130  // ncclAllReduce(coll_f_normal_x.getDevicePtr(), coll_f_normal_x.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2131 
2132  // ncclAllReduce(coll_f_normal_y.getDevicePtr(), coll_f_normal_y.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2133  // ncclAllReduce(coll_f_normal_z.getDevicePtr(), coll_f_normal_z.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2134  // ncclAllReduce(coll_f_nbond_x.getDevicePtr(), coll_f_nbond_x.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2135  // ncclAllReduce(coll_f_nbond_y.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2136  // ncclAllReduce(coll_f_nbond_z.getDevicePtr(), coll_f_nbond_z.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2137  // ncclAllReduce(coll_f_slow_x.getDevicePtr(), coll_f_slow_x.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2138  // ncclAllReduce(coll_f_slow_y.getDevicePtr(), coll_f_slow_y.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2139  // ncclAllReduce(coll_f_slow_z.getDevicePtr(), coll_f_slow_z.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2140  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2141  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2142 #endif
2143  }
2144  if(doGlobalMasterStaleForces)
2145  {
2146  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
2147  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
2148  computeGlobalMasterVirial(
2149  numPatchesHomeAndProxy,
2150  numAtomsHome,
2151  patchData->devData[deviceIndex].d_localPatches,
2152  coll_pos_x.getDevicePtr(),
2153  coll_pos_y.getDevicePtr(),
2154  coll_pos_z.getDevicePtr(),
2155  d_transform,
2156  d_f_global_x,
2157  d_f_global_y,
2158  d_f_global_z,
2159  &d_extForce[EXT_GLOBALMTS],
2160  &extForce[EXT_GLOBALMTS],
2161  &d_extVirial[EXT_GLOBALMTS],
2162  &extVirial[EXT_GLOBALMTS],
2163  myLattice,
2164  d_tbcatomic,
2165  stream);
2166  }
2167 
2168  // do the external force calculations and store the energy and virial
2169  calculateExternalForces(simParams->firstTimestep, maxForceNumber, 1, 1);
2170 #if 0
2171  cudaCheck(cudaDeviceSynchronize());
2172  if(true || deviceID == 0){
2173  char prefix[10];
2174  snprintf(prefix, 10, "step-%d",0);
2175  this->printSOAForces(prefix);
2176  }
2177 #endif
2178 
2179  CUDASequencerKernel->addForceToMomentum(
2180  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2181  d_recipMass,
2182  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2183  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2184  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2185  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2186  numAtomsHome, maxForceNumber, stream);
2187 
2188  CUDASequencerKernel->rattle1(
2189  simParams->fixedAtomsOn,1, 0,
2190  numAtomsHome, -dt_normal, -1.0/(dt_normal),
2191  2.0 * simParams->rigidTol,
2192  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2193  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2194  d_velNew_x, d_velNew_y, d_velNew_z,
2195  d_posNew_x, d_posNew_y, d_posNew_z,
2196  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2197  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2198  &settleList, settleListSize, &d_consFailure,
2199  d_consFailureSize, &rattleList, rattleListSize,
2200  &nSettle, &nRattle,
2201  d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2202  true, consFailure, simParams->watmodel, stream);
2203 
2204  CUDASequencerKernel->centerOfMass(
2205  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2206  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2207  d_hydrogenGroupSize, numAtomsHome, stream);
2208 
2209 
2210  {
2211  // SubmitHalf and its corresponding reductions
2212  submitHalf(numAtomsHome, 1, 1);
2213  // submitHalf reductions
2214  cudaCheck(cudaStreamSynchronize(stream));
2215  Tensor reduction_virial;
2216  Tensor reduction_intVirialNormal;
2217  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2218  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2219  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2220  // Haochuan: the tensor is not symmetric when there are fixed atoms
2221  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2222  reduction_virial *= 0.5;
2223  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2224  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2225  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2226  += (intKineticEnergy_half[0] * 0.25);
2227  reduction_intVirialNormal *= 0.5;
2228  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2229  reduction_intVirialNormal);
2230  }
2231 
2232  CUDASequencerKernel->addForceToMomentum(
2233  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2234  d_recipMass,
2235  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2236  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2237  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2238  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2239  numAtomsHome, maxForceNumber, stream);
2240 
2241  CUDASequencerKernel->rattle1(
2242  simParams->fixedAtomsOn, 1, 1,
2243  numAtomsHome, dt_normal, 1.0/dt_normal,
2244  2.0 * simParams->rigidTol,
2245  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2246  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2247  d_velNew_x, d_velNew_y, d_velNew_z,
2248  d_posNew_x, d_posNew_y, d_posNew_z,
2249  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2250  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2251  &settleList, settleListSize, &d_consFailure,
2252  d_consFailureSize, &rattleList, rattleListSize,
2253  &nSettle, &nRattle,
2254  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2255  buildRigidLists, consFailure, simParams->watmodel, stream);
2256 
2257  CUDASequencerKernel->centerOfMass(
2258  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2259  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2260  d_hydrogenGroupSize, numAtomsHome, stream);
2261 
2262  {
2263  // JM: SubmitHalf and its corresponding reductions
2264  submitHalf(numAtomsHome, 1, 1);
2265  // submitHalf reductions
2266  cudaCheck(cudaStreamSynchronize(stream));
2267  Tensor reduction_virial;
2268  Tensor reduction_intVirialNormal;
2269  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2270  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2271  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2272  // Haochuan: the tensor is not symmetric when there are fixed atoms
2273  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2274  reduction_virial *= 0.5;
2275  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2276  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2277  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2278  += (intKineticEnergy_half[0] * 0.25);
2279  reduction_intVirialNormal *= 0.5;
2280  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2281  reduction_intVirialNormal);
2282  }
2283 
2284  CUDASequencerKernel->addForceToMomentum(
2285  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2286  d_recipMass,
2287  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2288  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2289  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2290  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2291  numAtomsHome, maxForceNumber, stream);
2292 
2293  if(requestGlobalForces || requestForcesOutput) {
2294  // store the forces for the next step,
2295  // when we need them for colvars and Tcl scripting
2296  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2297  }
2298 
2299  if (requestGlobalForcesGPU) {
2300  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2301  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2302  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2303  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2304  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2305  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2306  CUDASequencerKernel->copyForcesToDevice(
2307  numAtomsHome, coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2308  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2309  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2310  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2311  // cudaCheck(cudaStreamSynchronize(stream));
2312  }
2313 
2314  CUDASequencerKernel->centerOfMass(
2315  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2316  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2317  d_hydrogenGroupSize, numAtomsHome, stream);
2318 
2319  submitReductions(origin.x, origin.y, origin.z,
2320  marginViolations, 1,
2321  1,
2322  numAtomsHome, maxForceNumber);
2323 
2324  copyPositionsAndVelocitiesToHost(1, 0);
2325 
2326  if(consFailure[0]){
2327  // Constraint failure. Abort.
2328  int dieOnError = simParams->rigidDie;
2329  if(dieOnError){
2330  // Bails out
2331  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
2332  NAMD_die("constraint failure during CUDA rattle!\n");
2333  }else{
2334  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2335  }
2336  }else if(1){
2337  cudaCheck(cudaStreamSynchronize(stream));
2338  if (doGlobalMasterStaleForces) {
2339  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
2340  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
2341  // PRINT_CUDATENSOR(extVirial[EXT_GLOBALMTS], iout);
2342  }
2343  if(simParams->rigidBonds != RIGID_NONE){
2344  Tensor reduction_rigidVirial;
2345  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
2346  // Haochuan: the tensor is not symmetric when there are fixed atoms
2347  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2348  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
2349  }
2350 
2351  //submitReductions1
2352  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
2353  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2354  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
2355  Vector angularMomentum(*angularMomentum_x,
2356  *angularMomentum_y,
2357  *angularMomentum_z);
2358  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
2359  //submitReductions2
2360  Tensor regintVirialNormal;
2361  Tensor regintVirialNbond;
2362  Tensor regintVirialSlow;
2363  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
2364  if (maxForceNumber >= 1) {
2365  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
2366  }
2367  if (maxForceNumber >= 2) {
2368  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
2369  }
2370 
2371  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
2372  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
2373  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
2374  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
2375 
2376  if (simParams->fixedAtomsOn) {
2377  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2378  double3 fixForceNormal, fixForceNbond, fixForceSlow;
2379  switch (maxForceNumber) {
2380  case 2: {
2381  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2382  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2383  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
2384  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
2385  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
2386  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2387  } // intentional fallthrough
2388  case 1: {
2389  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2390  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2391  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
2392  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
2393  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
2394  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2395  } // intentional fallthrough
2396  default: {
2397  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2398  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2399  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
2400  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
2401  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
2402  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2403  }
2404  }
2405 #if 0
2406  auto printTensor = [](const cudaTensor& t, const std::string& name){
2407  CkPrintf("%s", name.c_str());
2408  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2409  "%12.5lf %12.5lf %12.5lf\n"
2410  "%12.5lf %12.5lf %12.5lf\n",
2411  t.xx, t.xy, t.xz,
2412  t.yx, t.yy, t.yz,
2413  t.zx, t.zy, t.zz);
2414  };
2415  printTensor(fixVirialNormal, "fixVirialNormal = ");
2416  printTensor(fixVirialNbond, "fixVirialNbond = ");
2417  printTensor(fixVirialSlow, "fixVirialSlow = ");
2418 #endif
2419 
2420  }
2421  }
2422 
2423 #if 0
2424  if(deviceID == 0){
2425  this->printSOAForces(NULL);
2426  }
2427 #endif
2428 
2429 #if 0
2430  printSOAPositionsAndVelocities();
2431 #endif
2432 }
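// --- Illustrative sketch (added commentary, not NAMD source) ---
// Interpretation of the kick sequence in startRun3(): the three addForceToMomentum
// calls use scales -0.5, +1.0, -0.5 so that submitHalf() can reduce the half-step
// kinetic energies at v(t - dt/2) and v(t + dt/2) before v(t) is restored for the
// first real timestep. On a single scalar degree of freedom:
static void startupKick_sketch(double& v, double f_over_m, double dt) {
  v -= 0.5 * dt * f_over_m;   // v(t - dt/2): first submitHalf() reduction
  v += 1.0 * dt * f_over_m;   // v(t + dt/2): second submitHalf() reduction
  v -= 0.5 * dt * f_over_m;   // back to v(t)
}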
2433 
2434 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2435 {
2436  // Restore the myLattice
2437  myLattice = lattice;
2438  double *temp;
2439 
2440  // Restore positions and forces
2441  // copy "MC" arrays into standard arrays
2442  copy_DtoD<double>(d_f_normalMC_x, coll_f_normal_x.getDevicePtr(), numAtomsHome, stream);
2443  copy_DtoD<double>(d_f_normalMC_y, coll_f_normal_y.getDevicePtr(), numAtomsHome, stream);
2444  copy_DtoD<double>(d_f_normalMC_z, coll_f_normal_z.getDevicePtr(), numAtomsHome, stream);
2445  copy_DtoD<double>(d_f_nbondMC_x, coll_f_nbond_x.getDevicePtr(), numAtomsHome, stream);
2446  copy_DtoD<double>(d_f_nbondMC_y, coll_f_nbond_y.getDevicePtr(), numAtomsHome, stream);
2447  copy_DtoD<double>(d_f_nbondMC_z, coll_f_nbond_z.getDevicePtr(), numAtomsHome, stream);
2448  copy_DtoD<double>(d_f_slowMC_x, coll_f_slow_x.getDevicePtr(), numAtomsHome, stream);
2449  copy_DtoD<double>(d_f_slowMC_y, coll_f_slow_y.getDevicePtr(), numAtomsHome, stream);
2450  copy_DtoD<double>(d_f_slowMC_z, coll_f_slow_z.getDevicePtr(), numAtomsHome, stream);
2451 #ifdef NAMD_NCCL_ALLREDUCE
2452  if(mGpuOn) {
2453  copy_DtoD<double>(d_posMC_x, d_posNew_x, numAtomsHome, stream);
2454  copy_DtoD<double>(d_posMC_y, d_posNew_y, numAtomsHome, stream);
2455  copy_DtoD<double>(d_posMC_z, d_posNew_z, numAtomsHome, stream);
2456  } else {
2457  copy_DtoD<double>(d_posMC_x, coll_pos_x.getDevicePtr(), numAtomsHome, stream);
2458  copy_DtoD<double>(d_posMC_y, coll_pos_y.getDevicePtr(), numAtomsHome, stream);
2459  copy_DtoD<double>(d_posMC_z, coll_pos_z.getDevicePtr(), numAtomsHome, stream);
2460  }
2461 #else
2462  copy_DtoD<double>(d_posMC_x, coll_pos_x.getDevicePtr(), numAtomsHome, stream);
2463  copy_DtoD<double>(d_posMC_y, coll_pos_y.getDevicePtr(), numAtomsHome, stream);
2464  copy_DtoD<double>(d_posMC_z, coll_pos_z.getDevicePtr(), numAtomsHome, stream);
2465 #endif
2466 }
2467 
2468 void SequencerCUDA::monteCarloPressure_accept(
2469  const int doMigration)
2470 {
2471  // do we need to update the centers of mass?
2472  CUDASequencerKernel->centerOfMass(
2473  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2474  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2475  d_hydrogenGroupSize, numAtomsHome, stream);
2476 
2477  // Add half step kinetic contribution to energy, intEnergy, virial, intVirial,
2478  // calculated by submitHalf in launch_part11
2479  Tensor reduction_virial;
2480  Tensor reduction_intVirialNormal;
2481  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2482  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2483  // Haochuan: the tensor is not symmetric when there are fixed atoms
2484  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2485  reduction_virial *= 0.5;
2486  reduction_intVirialNormal *= 0.5;
2487  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2488  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2489  reduction_intVirialNormal);
2490  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2491  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY) += (intKineticEnergy_half[0] * 0.25);
2492 
2493  // If this was a migration step and the move was accepted, we need to update
2494  // myLatticeOld in order to use it in SequencerCUDAKernel::pairlistCheck()
2495  if(doMigration) {
2496  myLatticeOld = myLattice;
2497  }
2498 }
2499 
2500 void SequencerCUDA::monteCarloPressure_part1(
2501  Tensor &factor,
2502  Vector &origin,
2503  Lattice &oldLattice)
2504 {
2505  // Backup positions and forces
2506  copy_DtoD<double>(coll_f_normal_x.getDevicePtr(), d_f_normalMC_x, numAtomsHome, stream);
2507  copy_DtoD<double>(coll_f_normal_y.getDevicePtr(), d_f_normalMC_y, numAtomsHome, stream);
2508  copy_DtoD<double>(coll_f_normal_z.getDevicePtr(), d_f_normalMC_z, numAtomsHome, stream);
2509  copy_DtoD<double>(coll_f_nbond_x.getDevicePtr(), d_f_nbondMC_x, numAtomsHome, stream);
2510  copy_DtoD<double>(coll_f_nbond_y.getDevicePtr(), d_f_nbondMC_y, numAtomsHome, stream);
2511  copy_DtoD<double>(coll_f_nbond_z.getDevicePtr(), d_f_nbondMC_z, numAtomsHome, stream);
2512  copy_DtoD<double>(coll_f_slow_x.getDevicePtr(), d_f_slowMC_x, numAtomsHome, stream);
2513  copy_DtoD<double>(coll_f_slow_y.getDevicePtr(), d_f_slowMC_y, numAtomsHome, stream);
2514  copy_DtoD<double>(coll_f_slow_z.getDevicePtr(), d_f_slowMC_z, numAtomsHome, stream);
2515 #ifdef NAMD_NCCL_ALLREDUCE
2516  if(mGpuOn) {
2517  copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2518  copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2519  copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2520  } else {
2521  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posMC_x, numAtomsHome, stream);
2522  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posMC_y, numAtomsHome, stream);
2523  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posMC_z, numAtomsHome, stream);
2524  }
2525 #else
2526  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2527  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posMC_x, numAtomsHome, stream);
2528  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posMC_y, numAtomsHome, stream);
2529  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posMC_z, numAtomsHome, stream);
2530 #endif
2531 
2532  // Scale the old lattice by factor. We need both the old and the new lattice
2533  // to properly unwrap and wrap the atoms' coordinates
2534  Lattice newLattice = oldLattice;
2535  newLattice.rescale(factor);
2536  cudaTensor cuFactor;
2537  cudaVector cuOrigin;
2538  COPY_CUDATENSOR(factor, cuFactor);
2539  COPY_CUDAVECTOR(origin, cuOrigin);
2540 
2541  // Scale the coordinates using each molecule's geometric center
2542  Molecule *molecule = Node::Object()->molecule;
2543  CUDASequencerKernel->scaleCoordinateUsingGC(
2544  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_idOrder, d_moleculeStartIndex,
2545  d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2546  d_transform, molecule->numMolecules, molecule->numLargeMolecules,
2547  stream);
2548 
2549  // Update the cuda lattice with newLattice for force calculation
2550  myLattice = newLattice;
2551 
2552  // Set up compute position before calling bonded and nonbonded kernel
2553  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2554  ComputeNonbondedUtil::dielectric_1);
2555  bool doNbond = patchData->flags.doNonbonded;
2556  bool doSlow = patchData->flags.doFullElectrostatics;
2557  bool doFEP = false;
2558  bool doTI = false;
2559  bool doAlchDecouple = false;
2560  bool doAlchSoftCore = false;
2561  if (simParams->alchOn) {
2562  if (simParams->alchFepOn) doFEP = true;
2563  if (simParams->alchThermIntOn) doTI = true;
2564  if (simParams->alchDecouple) doAlchDecouple = true;
2565  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2566  }
2567 
2568  if (Node::Object()->molecule->is_lonepairs_psf) {
2569  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
2570  }
2571 
2572  bool usePatchPme = false;
2573  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2574  // This checks whether the current lattice is compatible with the patch-level PME kernels.
2575  // The check needs to be redone every time the lattice changes; the results are stored in
2576  // the cudaPme object, and the overall compatibility is reported by the compatible()
2577  // function.
2578  //
2579  // The behavior of the PME position setup differs depending on the kernels being used,
2580  // so that value (usePatchPme) needs to be passed to the kernel object.
2582  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
2584  myLattice,
2585  getNumPatchesHome(),
2586  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
2587  getHostPatchMin(),
2588  getHostPatchMax(),
2589  getHostAwayDists()
2590  );
2591  usePatchPme = cudaPme->patchLevelPmeData.compatible();
2592  }
2593 
2594  std::vector<int> atom_counts;
2595  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2596  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2597  }
2598  CUDASequencerKernel->set_compute_positions(
2599  deviceIndex,
2601  nDevices,
2602  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2603  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2604 #ifdef NAMD_NCCL_ALLREDUCE
2605  (mGpuOn) ? d_posNew_x: coll_pos_x.getDevicePtr(),
2606  (mGpuOn) ? d_posNew_y: coll_pos_y.getDevicePtr(),
2607  (mGpuOn) ? d_posNew_z: coll_pos_z.getDevicePtr(),
2608 #else
2609  coll_pos_x.getDevicePtr(),
2610  coll_pos_y.getDevicePtr(),
2611  coll_pos_z.getDevicePtr(),
2612  coll_pos_x.getDevicePeerPtr(), // passes double-pointer if mgpuOn
2613  coll_pos_y.getDevicePeerPtr(),
2614  coll_pos_z.getDevicePeerPtr(),
2615  coll_charge.getDevicePeerPtr(),
2616  coll_partition.getDevicePeerPtr(),
2617 #endif
2618  coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
2619  coll_patchCenter.getDevicePtr(),
2620  patchData->devData[deviceIndex].slow_patchPositions,
2621  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2622  coll_sortOrder.getDevicePtr(), newLattice,
2623  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2624  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2626  patchData->devData[deviceIndex].d_localPatches,
2627  patchData->devData[deviceIndex].d_peerPatches,
2628  atom_counts,
2629  stream);
2630 
2631  cudaCheck(cudaStreamSynchronize(stream));
2632 }
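// Annotation (not part of SequencerCUDA.C): the scaleCoordinateUsingGC kernel above
// rescales coordinates about each molecule's geometric center rather than per atom,
// so intramolecular geometry is preserved during the MC volume trial move. A minimal,
// host-side sketch of that idea with illustrative names and an isotropic factor:
#include <array>
#include <vector>

using Vec3 = std::array<double, 3>;

// Shift every atom of a molecule rigidly so that the molecule's geometric center
// moves from gc to origin + factor * (gc - origin); bond lengths are unchanged.
static void scaleAboutGeometricCenters(std::vector<Vec3>& pos,
                                       const std::vector<std::vector<int>>& molecules,
                                       double factor, const Vec3& origin) {
  for (const auto& mol : molecules) {
    Vec3 gc{0.0, 0.0, 0.0};
    for (int a : mol)
      for (int d = 0; d < 3; ++d) gc[d] += pos[a][d] / mol.size();
    for (int a : mol)
      for (int d = 0; d < 3; ++d)
        pos[a][d] += (factor - 1.0) * (gc[d] - origin[d]);  // rigid shift of the whole molecule
  }
}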
2633 
2634 void SequencerCUDA::monteCarloPressure_part2(
2635  int step,
2636  int maxForceNumber,
2637  const bool doEnergy,
2638  const bool doGlobal,
2639  const bool doVirial)
2640 {
2641  // We zeroed all reduction values in part1, so the atom checksum needs to be added back here
2642  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2643 
2644  if(mGpuOn){
2645 #ifdef NAMD_NCCL_ALLREDUCE
2646  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2647 #endif
2648  }
2649  //Update SOA buffer
2650  CUDASequencerKernel->accumulateForceToSOA(
2651  doGlobal,
2652  simParams->useCudaGlobal,
2653  maxForceNumber,
2654  numPatchesHomeAndProxy,
2655  nDevices,
2656  patchData->devData[deviceIndex].d_localPatches,
2657  patchData->devData[deviceIndex].f_bond,
2658  patchData->devData[deviceIndex].f_bond_nbond,
2659  patchData->devData[deviceIndex].f_bond_slow,
2660  patchData->devData[deviceIndex].forceStride,
2661  patchData->devData[deviceIndex].f_nbond,
2662  patchData->devData[deviceIndex].f_nbond_slow,
2663  patchData->devData[deviceIndex].f_slow,
2664  d_f_global_x,
2665  d_f_global_y,
2666  d_f_global_z,
2667  coll_f_normal_x.getDevicePtr(),
2668  coll_f_normal_y.getDevicePtr(),
2669  coll_f_normal_z.getDevicePtr(),
2670  coll_f_nbond_x.getDevicePtr(),
2671  coll_f_nbond_y.getDevicePtr(),
2672  coll_f_nbond_z.getDevicePtr(),
2673  coll_f_slow_x.getDevicePtr(),
2674  coll_f_slow_y.getDevicePtr(),
2675  coll_f_slow_z.getDevicePtr(),
2676  coll_unsortOrder.getDevicePtr(),
2677  myLattice,
2678  patchData->d_queues,
2679  patchData->d_queueCounters,
2680  d_tbcatomic,
2681  stream
2682  );
2683  if(mGpuOn){
2684 #ifndef NAMD_NCCL_ALLREDUCE
2685  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2686  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
2687  // 0, deviceCUDA->getNcclComm(), stream);
2688  std::vector<int> atom_counts;
2689  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2690  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2691  }
2692  CUDASequencerKernel->mergeForcesFromPeers(
2693  deviceIndex,
2694  maxForceNumber,
2695  myLattice,
2696  numPatchesHomeAndProxy,
2697  numPatchesHome,
2698  this->coll_f_normal_x.getDevicePeerPtr(),
2699  this->coll_f_normal_y.getDevicePeerPtr(),
2700  this->coll_f_normal_z.getDevicePeerPtr(),
2701  this->coll_f_nbond_x.getDevicePeerPtr(),
2702  this->coll_f_nbond_y.getDevicePeerPtr(),
2703  this->coll_f_nbond_z.getDevicePeerPtr(),
2704  this->coll_f_slow_x.getDevicePeerPtr(),
2705  this->coll_f_slow_y.getDevicePeerPtr(),
2706  this->coll_f_slow_z.getDevicePeerPtr(),
2707  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2708  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2709  patchData->devData[deviceIndex].d_localPatches,
2710  patchData->devData[deviceIndex].d_peerPatches,
2711  atom_counts,
2712  stream
2713  );
2714 #else
2715  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2716  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2717 #endif
2718  }
2719  // Do the external force calculations
2720  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
2721 #if 0
2722  cudaCheck(cudaDeviceSynchronize());
2723  if(true || deviceID == 0){
2724  char prefix[10];
2725  snprintf(prefix, 10, "step-%d",step);
2726  this->printSOAForces(prefix);
2727  }
2728 #endif
2729 
2730 }
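// Annotation (not part of SequencerCUDA.C): under NAMD_NCCL_ALLREDUCE the
// normal/nbond/slow force arrays are reduced as one flat buffer, and the element
// count handed to ncclAllReduce above is 3 components x (maxForceNumber+1) force
// levels x numAtoms. A small sketch of that sizing, with illustrative names:
#include <cstddef>

// maxForceNumber: 0 = normal only, 1 = + nonbonded, 2 = + slow (PME).
static std::size_t reducedForceElements(int maxForceNumber, int numAtoms) {
  const int forceLevels = maxForceNumber + 1;  // how many force arrays are live
  const int components  = 3;                   // x, y, z
  return static_cast<std::size_t>(components) * forceLevels * numAtoms;
}
// e.g. maxForceNumber = 2 with 10000 atoms -> 90000 doubles reduced in place.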
2731 
2732 void SequencerCUDA::setRescalePairlistTolerance(const bool val) {
2733  rescalePairlistTolerance = val;
2734 }
2735 
2736 void SequencerCUDA::launch_part1(
2737  int step,
2738  double dt_normal,
2739  double dt_nbond,
2740  double dt_slow,
2741  double velrescaling,
2742  const double maxvel2,
2743  Tensor &factor,
2744  Vector &origin,
2745  Lattice &lattice,
2746  int reassignVelocitiesStep,
2747  int langevinPistonStep,
2748  int berendsenPressureStep,
2749  int maxForceNumber,
2750  const int copyIn,
2751  const int savePairlists,
2752  const int usePairlists,
2753  const bool doEnergy)
2754 {
2755  PatchMap* patchMap = PatchMap::Object();
2756  // Aggregate data from all patches
2757  cudaCheck(cudaSetDevice(deviceID));
2758  this->maxvel2 = maxvel2;
2759  const bool doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
2760  const bool doFixed = simParams->fixedAtomsOn;
2761  // JM: for launch_part1:
2762  // copyIn: first call
2763  myLattice = lattice;
2764  if(reassignVelocitiesStep)
2765  {
2766  const int reassignFreq = simParams->reassignFreq;
2767  BigReal newTemp = simParams->reassignTemp;
2768  newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2769  if ( simParams->reassignIncr > 0.0 ) {
2770  if ( newTemp > simParams->reassignHold && simParams->reassignHold > 0.0 )
2771  newTemp = simParams->reassignHold;
2772  } else {
2773  if ( newTemp < simParams->reassignHold )
2774  newTemp = simParams->reassignHold;
2775  }
2776  const BigReal kbT = BOLTZMANN * newTemp;
2777 
2778  CUDASequencerKernel->reassignVelocities(
2779  dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2780  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2781  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2782  d_recipMass, kbT,
2783  numAtomsHome, numAtomsHome, 0,
2784  curandGen, stream);
2785  }
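// Annotation (not part of SequencerCUDA.C): the block above ramps the reassignment
// temperature by reassignIncr every reassignFreq steps and clamps it at reassignHold
// (a cap when ramping up, a floor when ramping down); kbT = BOLTZMANN * newTemp is
// what the kernel consumes. A standalone sketch of the ramp, mirroring those fields:
static double reassignmentTemperature(int step, double reassignTemp, double reassignIncr,
                                      int reassignFreq, double reassignHold) {
  double newTemp = reassignTemp + (step / reassignFreq) * reassignIncr;
  if (reassignIncr > 0.0) {
    if (reassignHold > 0.0 && newTemp > reassignHold) newTemp = reassignHold;  // cap the ramp-up
  } else {
    if (newTemp < reassignHold) newTemp = reassignHold;                        // floor the ramp-down
  }
  return newTemp;
}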
2786 
2787  // Scale the positions for the Berendsen pressure controller
2788  if(berendsenPressureStep) {
2789  cudaTensor cuFactor;
2790  cudaVector cuOrigin;
2791  COPY_CUDATENSOR(factor, cuFactor);
2792  COPY_CUDAVECTOR(origin, cuOrigin);
2793  CUDASequencerKernel->scaleCoordinateWithFactor(
2794  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_mass, d_hydrogenGroupSize,
2795  cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2796  }
2797 
2798  if(!langevinPistonStep){
2799  // kernel fusion here
2800  // JM TODO: Fuse kernels for the langevin thermostat
2801  CUDASequencerKernel->velocityVerlet1(
2802  doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2803  dt_slow, velrescaling, d_recipMass,
2804  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), maxvel2, killme, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2805  pos_x, pos_y, pos_z, coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2806  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(), coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2807  d_atomFixed, numAtomsHome, maxForceNumber, stream);
2808  }else{
2809  // Zero-out force buffers here
2810  CUDASequencerKernel->addForceToMomentum(
2811  doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2812  d_recipMass,
2813  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2814  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2815  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2816  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2817  numAtomsHome, maxForceNumber, stream);
2818 
2819  maximumMove(maxvel2, numAtomsHome);
2820  cudaTensor cuFactor;
2821  cudaVector cuOrigin;
2822  COPY_CUDATENSOR(factor, cuFactor);
2823  COPY_CUDAVECTOR(origin, cuOrigin);
2824  double velFactor_x = namd_reciprocal(factor.xx);
2825  double velFactor_y = namd_reciprocal(factor.yy);
2826  double velFactor_z = namd_reciprocal(factor.zz);
2827 
2828  CUDASequencerKernel->addVelocityToPosition(
2829  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2830  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2831  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2832  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2833  CUDASequencerKernel->langevinPiston(
2834  simParams->fixedAtomsOn, d_atomFixed,
2835  d_groupFixed, d_transform, lattice,
2836  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2837  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2838  d_mass, d_hydrogenGroupSize,
2839  cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2840  simParams->useGroupPressure, numAtomsHome, stream);
2841  CUDASequencerKernel->addVelocityToPosition(
2842  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2843  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2844  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2845  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2846  }
2847  if(mGpuOn && SMDKernel)
2848  {
2849  // compute distributed center of mass for groups of atoms
2850  SMDKernel->computeCOMMGpu(lattice, d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2851  d_transform, stream);
2852  }
2853  if(mGpuOn && groupRestraintsKernel)
2854  {
2855  groupRestraintsKernel->doCOM_mgpu(lattice, d_transform,
2856  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2857  stream);
2858  }
2859 
2860  // JM: Recalculate centers of mass if we are computing energies or the virial
2861  if( (doEnergy || doVirial) ) {
2862  CUDASequencerKernel->centerOfMass(
2863  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2864  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2865  d_hydrogenGroupSize, numAtomsHome, stream);
2866  CUDASequencerKernel->centerOfMass(
2867  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2868  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2869  d_hydrogenGroupSize, numAtomsHome, stream);
2870  }
2871 
2872  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2873                                     ComputeNonbondedUtil::dielectric_1);
2874  // We need to find doNbond and doSlow for the upcoming step
2875  bool doNbond = patchData->flags.doNonbonded;
2876  bool doSlow = patchData->flags.doFullElectrostatics;
2877 
2878  bool doFEP = false;
2879  bool doTI = false;
2880  bool doAlchDecouple = false;
2881  bool doAlchSoftCore = false;
2882  if (simParams->alchOn) {
2883  if (simParams->alchFepOn) doFEP = true;
2884  if (simParams->alchThermIntOn) doTI = true;
2885  if (simParams->alchDecouple) doAlchDecouple = true;
2886  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2887  }
2888  if ( ! savePairlists ) {
2889 
2890  double minSize = simParams->patchDimension - simParams->margin;
2891  double sysdima = lattice.a_r().unit() * lattice.a();
2892  double sysdimb = lattice.b_r().unit() * lattice.b();
2893  double sysdimc = lattice.c_r().unit() * lattice.c();
2894  // Let's pass migrationInfo here
2895 
2896  CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2897  patchData->devData[deviceIndex].d_localPatches,
2898  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_posSave_x, d_posSave_y, d_posSave_z,
2899  d_awayDists,
2900  myLattice, myLatticeOld,
2901  d_patchMin, d_patchMax, coll_patchCenter.getDevicePtr(),
2902  d_mInfo,
2903  d_tbcatomic, simParams->pairlistTrigger,
2904  simParams->pairlistGrow, simParams->pairlistShrink,
2905  d_patchMaxAtomMovement, patchMaxAtomMovement,
2906  d_patchNewTolerance, patchNewTolerance,
2907  minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2908  h_marginViolations,
2909  h_periodicCellSmall,
2910  rescalePairlistTolerance,
2911  isPeriodic, stream);
2912  rescalePairlistTolerance = false;
2913  }
2914  else {
2915  rescalePairlistTolerance = true;
2916  }
2917 
2918  if(mGpuOn){
2919  // Synchronize device before node barrier
2920  cudaCheck(cudaStreamSynchronize(stream));
2921  }
2922 }
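// Annotation (not part of SequencerCUDA.C): PairListMarginCheck above compares how far
// atoms have drifted since the last pairlist build (d_posSave_*, myLatticeOld) against a
// per-patch tolerance and records margin violations. A deliberately simplified host-side
// sketch of the displacement test only; the real kernel also handles lattice rescaling,
// patch margins, and the pairlistTrigger/Grow/Shrink tolerance updates:
#include <cmath>
#include <cstddef>
#include <vector>

// Flag a rebuild when the largest displacement exceeds half the tolerance
// (two atoms each moving tolerance/2 toward each other can close the margin).
static bool pairlistMarginViolated(const std::vector<double>& x, const std::vector<double>& y,
                                   const std::vector<double>& z, const std::vector<double>& xSave,
                                   const std::vector<double>& ySave, const std::vector<double>& zSave,
                                   double tolerance, double& maxMovementOut) {
  double maxMove2 = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    const double dx = x[i] - xSave[i], dy = y[i] - ySave[i], dz = z[i] - zSave[i];
    const double d2 = dx * dx + dy * dy + dz * dz;
    if (d2 > maxMove2) maxMove2 = d2;
  }
  maxMovementOut = std::sqrt(maxMove2);
  return maxMovementOut > 0.5 * tolerance;
}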
2923 
2924 void SequencerCUDA::launch_part11(
2925  double dt_normal,
2926  double dt_nbond,
2927  double dt_slow,
2928  double velrescaling,
2929  const double maxvel2,
2930  Tensor &factor,
2931  Vector &origin,
2932  Lattice &lattice,
2933  int langevinPistonStep,
2934  int maxForceNumber,
2935  const int copyIn,
2936  const int savePairlists,
2937  const int usePairlists,
2938  const bool doEnergy)
2939 {
2940  const bool doVirial = simParams->langevinPistonOn;
2941  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2942                                     ComputeNonbondedUtil::dielectric_1);
2943  // We need to find doNbond and doSlow for the upcoming step
2944  bool doNbond = patchData->flags.doNonbonded;
2945  bool doSlow = patchData->flags.doFullElectrostatics;
2946 
2947  bool doFEP = false;
2948  bool doTI = false;
2949  bool doAlchDecouple = false;
2950  bool doAlchSoftCore = false;
2951  if (simParams->alchOn) {
2952  if (simParams->alchFepOn) doFEP = true;
2953  if (simParams->alchThermIntOn) doTI = true;
2954  if (simParams->alchDecouple) doAlchDecouple = true;
2955  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2956  }
2957 
2958  submitHalf(numAtomsHome, 1, doEnergy || doVirial);
2959 
2960  // Update the patch flags
2961  NAMD_EVENT_START(1, NamdProfileEvent::CPY_PATCHFLAGS);
2962  this->update_patch_flags();
2963  NAMD_EVENT_STOP(1, NamdProfileEvent::CPY_PATCHFLAGS);
2964 
2965  finish_part1(copyIn, patchList[0]->flags.savePairlists,
2966  patchList[0]->flags.usePairlists);
2967 }
2968 
2969 
2970 void SequencerCUDA::launch_set_compute_positions() {
2971 
2972  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2973                                     ComputeNonbondedUtil::dielectric_1);
2974  // We need to find doNbond and doSlow for the upcoming step
2975  bool doNbond = patchData->flags.doNonbonded;
2976  bool doSlow = patchData->flags.doFullElectrostatics;
2977 
2978  bool doFEP = false;
2979  bool doTI = false;
2980  bool doAlchDecouple = false;
2981  bool doAlchSoftCore = false;
2982  if (simParams->alchOn) {
2983  if (simParams->alchFepOn) doFEP = true;
2984  if (simParams->alchThermIntOn) doTI = true;
2985  if (simParams->alchDecouple) doAlchDecouple = true;
2986  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2987  }
2988  const bool doIMD = simParams->IMDon && !(simParams->IMDignoreForces || simParams->IMDignore);
2989  bool doGlobal = simParams->tclForcesOn || simParams->colvarsOn || doIMD;
2990  // Set the positions of lone pairs before copying to NB buffers
2991  if (Node::Object()->molecule->is_lonepairs_psf) {
2992  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
2993  }
2994 
2995  bool usePatchPme = false;
2996  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2997  // This checks whether the current lattice is compatible with the patch-level PME kernels.
2998  // The check needs to be redone every time the lattice changes; the results are stored in
2999  // the cudaPme object, and the overall compatibility is reported by the compatible()
3000  // function.
3001  //
3002  // The behavior of the PME position setup differs depending on the kernels being used,
3003  // so that value (usePatchPme) needs to be passed to the kernel object.
3005  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3007  myLattice,
3008  getNumPatchesHome(),
3009  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3010  getHostPatchMin(),
3011  getHostPatchMax(),
3012  getHostAwayDists()
3013  );
3014  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3015  }
3016 
3017  if (1) {
3018  //fprintf(stderr, "calling set_compute_positions() ****************************************\n");
3019  //fprintf(stderr, "calling set_compute_positions\n");
3020  //fprintf(stderr, "doNbond=%d doSlow=%d\n", doNbond, doSlow);
3021  std::vector<int> atom_counts;
3022  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3023  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3024  }
3025  CUDASequencerKernel->set_compute_positions(
3026  deviceIndex,
3028  nDevices,
3029  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3030  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3031 #ifdef NAMD_NCCL_ALLREDUCE
3032  (mGpuOn) ? d_posNew_x: coll_pos_x.getDevicePtr(),
3033  (mGpuOn) ? d_posNew_y: coll_pos_y.getDevicePtr(),
3034  (mGpuOn) ? d_posNew_z: coll_pos_z.getDevicePtr(),
3035 #else
3036  coll_pos_x.getDevicePtr(),
3037  coll_pos_y.getDevicePtr(),
3038  coll_pos_z.getDevicePtr(),
3039  coll_pos_x.getDevicePeerPtr(), // passes double-pointer if mgpuOn
3040  coll_pos_y.getDevicePeerPtr(),
3041  coll_pos_z.getDevicePeerPtr(),
3042  coll_charge.getDevicePeerPtr(),
3043  coll_partition.getDevicePeerPtr(),
3044 #endif
3045  coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
3046  coll_patchCenter.getDevicePtr(),
3047  patchData->devData[deviceIndex].slow_patchPositions,
3048  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3049  coll_sortOrder.getDevicePtr(), myLattice,
3050  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3051  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3053  patchData->devData[deviceIndex].d_localPatches,
3054  patchData->devData[deviceIndex].d_peerPatches,
3055  atom_counts,
3056  stream);
3057  // For global forces, copy the coordinates to the host with kernel overlap
3058  if (doGlobal) {
3059  NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_POSITION);
3060  // CkPrintf("WARNING this probably needs to be changed for multihost\n");
3061  copyPositionsToHost_direct();
3062  //copyPositionsAndVelocitiesToHost(1,0);
3063  NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_POSITION);
3064  }
3065  }
3066 }
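// Annotation (not part of SequencerCUDA.C): the charge_scaling used by the position
// setup folds the Coulomb constant, the electrostatic scaling factor, and the inverse
// dielectric into the charges once, so the nonbonded kernels can use plain q_i*q_j/r
// products directly as energies. A minimal sketch of the idea; the COULOMB value shown
// (~332.0636 kcal*A/(mol*e^2)) is NAMD's constant, the rest is illustrative:
#include <cmath>

static double scaledCharge(double charge, double coulomb, double scaling, double dielectric_1) {
  const double charge_scaling = std::sqrt(coulomb * scaling * dielectric_1);
  return charge_scaling * charge;  // stored in the float4 atom buffers
}
// With scaling = 1 and dielectric_1 = 1, two unit charges 1 A apart give
// scaledCharge(1, 332.0636, 1, 1) squared / 1.0 ~= 332.06 kcal/mol.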
3067 
3068 void SequencerCUDA:: finish_part1( const int copyIn,
3069  const int savePairlists,
3070  const int usePairlists)
3071 {
3072  // JM: If we're not in a migration step, we can overlap flagging the
3073  // positions before we synchronize the stream, to lessen the kernel
3074  // launch overhead.
3075  // Hopefully we will see some overlap in this region.
3076  //
3077  // TODO: We could split this into a different function and start calling positionsReady
3078  // when we're not on a migration step, so that some of this work overlaps with the kernels.
3079  // Before we synchronize, we can clear the device memory;
3080  // this sets the tileListStat for the nbondKernel.
3081  cudaCheck(cudaStreamSynchronize(stream));
3082 
3083  // Checks if periodic cell became too small
3084  if(*h_periodicCellSmall){
3085  NAMD_die("Periodic cell has become too small for original patch grid!\n"
3086  "Possible solutions are to restart from a recent checkpoint,\n"
3087  "increase margin, or disable useFlexibleCell for liquid simulation.");
3088  }
3089 
3090  if (killme[0]) {
3091  const Molecule* mol = Node::Object()->molecule;
3092  // Found at least one atom that is moving too fast.
3093  // Terminating, so loop performance below doesn't matter.
3094  // Loop does not vectorize
3095  double *vel_x, *vel_y, *vel_z;
3096  std::vector<int> id;
3097  std::vector<int> patchIDofAtoms(numAtomsHome);
3098  allocate_host<double>(&vel_x, numAtomsHome);
3099  allocate_host<double>(&vel_y, numAtomsHome);
3100  allocate_host<double>(&vel_z, numAtomsHome);
3101  copy_DtoH_sync<double>(coll_vel_x.getDevicePtr(), vel_x, numAtomsHome);
3102  copy_DtoH_sync<double>(coll_vel_y.getDevicePtr(), vel_y, numAtomsHome);
3103  copy_DtoH_sync<double>(coll_vel_z.getDevicePtr(), vel_z, numAtomsHome);
3104  // Update the id array from patchDataSOA
3105  size_t offset = 0;
3106  // TODO: Does this work for GPU atom migration?
3107  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3108  // std::vector<int> patchIDtoIndex(numPatchesHome);
3109  for (int i = 0; i < numPatchesHome; i++) {
3110  HomePatch *patch = homePatches[i];
3111  PatchDataSOA& current = patchList[i]->patchDataSOA;
3112  const int numPatchAtoms = current.numAtoms;
3113  id.resize(numPatchAtoms + id.size());
3114  for(int j = 0; j < numPatchAtoms; j++){
3115  if (!simParams->useDeviceMigration) {
3116  id[offset + j] = current.id[j];
3117  }
3118  patchIDofAtoms[offset + j] = patch->getPatchID();
3119  }
3120  offset += numPatchAtoms;
3121  }
3122  if (simParams->useDeviceMigration) {
3123  id.resize(numAtomsHome);
3124  copy_DtoH_sync<int>(coll_idMig.getDevicePtr(), id.data(), numAtomsHome);
3125  }
3126  int cnt = 0;
3127  for (int i=0; i < numAtomsHome; i++) {
3128  BigReal vel2 =
3129  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3130  if (vel2 > maxvel2) {
3131  ++cnt;
3132  iout << iERROR << " velocity is "
3133  << PDBVELFACTOR * vel_x[i] << " "
3134  << PDBVELFACTOR * vel_y[i] << " "
3135  << PDBVELFACTOR * vel_z[i]
3136  << " (limit is "
3137  << ( PDBVELFACTOR * sqrt(maxvel2) ) << ", atom ID "
3138  << id[i]+1
3139  // XXX: TODO: mol->get_atomtype only works on a single node
3140  << " of type " << mol->get_atomtype(id[i])
3141  << " in patch " << patchIDofAtoms[i]
3142  << " on PE " << CkMyPe()
3143  << " with " << patchList[globalToLocalID[patchIDofAtoms[i]]]->patchDataSOA.numAtoms
3144  << " atoms)\n" << endi;
3145  }
3146  }
3147  iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3148  "; simulation has become unstable ("
3149  << cnt << " atoms on pe " << CkMyPe() << ", GPU " << deviceID << ").\n" << endi;
3150  if (simParams->crashOutputFlag & NAMD_CRASH_ATOM_TOO_FAST) {
3151  double *pos_x, *pos_y, *pos_z;
3152  allocate_host<double>(&pos_x, numAtomsHome);
3153  allocate_host<double>(&pos_y, numAtomsHome);
3154  allocate_host<double>(&pos_z, numAtomsHome);
3155  copy_DtoH_sync<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsHome);
3156  copy_DtoH_sync<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsHome);
3157  copy_DtoH_sync<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsHome);
3158  // Save the positions and velocities to a file for debugging
3159  const std::string outfilename =
3160  std::string(simParams->crashFilename) + "." +
3161  std::to_string(deviceIndex);
3162  std::ofstream ofs_crash_dump(outfilename.c_str());
3163  ofs_crash_dump << "atom,r_x,r_y,r_z,v_x,v_y,v_z\n";
3164  for (int i=0; i < numAtomsHome; i++) {
3165  ofs_crash_dump << id[i]+1 << ","
3166  << pos_x[i] << ","
3167  << pos_y[i] << ","
3168  << pos_z[i] << ","
3169  << PDBVELFACTOR * vel_x[i] << ","
3170  << PDBVELFACTOR * vel_y[i] << ","
3171  << PDBVELFACTOR * vel_z[i] << "\n";
3172  }
3173  ofs_crash_dump.flush();
3174  ofs_crash_dump.close();
3175  iout << iWARN << "PE " << CkMyPe() << ", GPU " << deviceID
3176  << ": the atom positions and velocities have been written to "
3177  << outfilename << "\n" << endi;
3178  deallocate_host<double>(&pos_x);
3179  deallocate_host<double>(&pos_y);
3180  deallocate_host<double>(&pos_z);
3181  }
3182  deallocate_host<double>(&vel_x);
3183  deallocate_host<double>(&vel_y);
3184  deallocate_host<double>(&vel_z);
3185  NAMD_die("SequencerCUDA: Atoms moving too fast");
3186  }
3187  else{
3188  // submitHalf reductions
3189  Tensor reduction_virial;
3190  Tensor reduction_intVirialNormal;
3191  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3192  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3193  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3194  // Haochuan: the tensor is not symmetric when there are fixed atoms
3195  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3196  reduction_virial *= 0.5;
3197  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3198  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
3199  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3200  += (intKineticEnergy_half[0] * 0.25);
3201  reduction_intVirialNormal *= 0.5;
3202  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3203  reduction_intVirialNormal);
3204  int migration = (h_marginViolations[0] != 0) ? 1 : 0; // flags migration as TRUE if a margin violation occurred
3205  // if(migration != 0 ) fprintf(stderr, "DEV[%d] = MIGRATION[%d]\n", deviceID, migration);
3206  patchData->migrationFlagPerDevice[deviceIndex] = migration; // Saves the updated migration flag
3207  h_marginViolations[0] = 0;
3208  }
3209 }
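// Annotation (not part of SequencerCUDA.C): the "atoms moving too fast" check above
// compares squared velocities in internal units against maxvel2 and only converts to
// A/ps with PDBVELFACTOR (~20.4548) for the error report. A standalone sketch of that
// per-atom test, with illustrative names:
#include <cmath>
#include <cstdio>

static bool velocityTooFast(double vx, double vy, double vz, double maxvel2) {
  const double vel2 = vx * vx + vy * vy + vz * vz;   // internal velocity units
  if (vel2 <= maxvel2) return false;
  const double PDBVELFACTOR = 20.45482706;           // internal units -> A/ps
  std::printf("velocity %g %g %g exceeds limit %g (A/ps)\n",
              PDBVELFACTOR * vx, PDBVELFACTOR * vy, PDBVELFACTOR * vz,
              PDBVELFACTOR * std::sqrt(maxvel2));
  return true;
}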
3210 
3211 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3212  bool copyOut, const int doGlobal){
3213  // CkPrintf("copy positions and velocities to host copyout %d doGlobal %d\n", copyOut, doGlobal);
3214  if(copyOut){
3215  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3216  patchData = cpdata.ckLocalBranch();
3217  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3218  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3219  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3220  const int numAtomsToCopy = numAtomsHome;
3221  copy_DtoH<double>(coll_vel_x.getDevicePtr(), vel_x, numAtomsToCopy, stream);
3222  copy_DtoH<double>(coll_vel_y.getDevicePtr(), vel_y, numAtomsToCopy, stream);
3223  copy_DtoH<double>(coll_vel_z.getDevicePtr(), vel_z, numAtomsToCopy, stream);
3224  if (!doGlobal) {
3225  // We already copied the coordinates if we have global forces
3226  copy_DtoH<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsToCopy, stream);
3227  copy_DtoH<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsToCopy, stream);
3228  copy_DtoH<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsToCopy, stream);
3229  }
3230  cudaCheck(cudaDeviceSynchronize());
3231 
3232  for(int i = 0; i < homePatches.size(); i++){
3233 
3234  // TODO do we need to copy proxy patches as well
3235  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3236  const int numPatchAtoms = localPatches[i].numAtoms;
3237  const int offset = localPatches[i].bufferOffset;
3238  memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3239  memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3240  memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3241  if (!doGlobal) {
3242  // We already copied the coordinates if we have global forces
3243  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3244  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3245  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3246  }
3247  }
3248  }
3249 }
3250 
3251 
3252 void SequencerCUDA::copyPositionsToHost(){
3253  // CkPrintf("copy positions to host \n");
3254  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3255  patchData = cpdata.ckLocalBranch();
3256  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3257  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3258  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3259 
3260  const int numAtomsToCopy = numAtomsHome;
3261  // We already copied the coordinates if we have global forces
3262  copy_DtoH<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsToCopy, stream);
3263  copy_DtoH<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsToCopy, stream);
3264  copy_DtoH<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsToCopy, stream);
3265  cudaCheck(cudaDeviceSynchronize());
3266 
3267  for(int i = 0; i < homePatches.size(); i++){
3268 
3269  // TODO do we need to copy proxy patches as well
3270  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3271  const int numPatchAtoms = localPatches[i].numAtoms;
3272  const int offset = localPatches[i].bufferOffset;
3273  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3274  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3275  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3276  }
3277 }
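// Annotation (not part of SequencerCUDA.C): both host-copy routines above scatter the
// flat, device-ordered arrays back into per-patch SOA storage using each local patch's
// bufferOffset and atom count. A minimal sketch of that scatter with simplified types:
#include <cstddef>
#include <cstring>
#include <vector>

struct PatchSlice { int numAtoms; int bufferOffset; };

// flat: contiguous device-ordered array copied to the host; patchDest[i] points at
// the i-th patch's SOA field (e.g. pos_x) and receives its slice of the flat array.
static void scatterToPatches(const double* flat, const std::vector<PatchSlice>& patches,
                             std::vector<double*>& patchDest) {
  for (std::size_t i = 0; i < patches.size(); ++i) {
    std::memcpy(patchDest[i], flat + patches[i].bufferOffset,
                patches[i].numAtoms * sizeof(double));
  }
}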
3278 
3279 void SequencerCUDA::update_patch_flags()
3280 {
3281  // int pairlists = 1;
3282  int pairlists = (patchData->flags.step < simParams->N);
3283  for (int i=0; i < numPatchesHome; i++) {
3284  HomePatch *patch = patchList[i];
3285  patch->flags.copyIntFlags(patchData->flags); // copy global flags
3286  }
3287 }
3288 
3289 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3290  int pairlists = patchList[0]->flags.step < simParams->N;
3291  for(int i = 0; i < numPatchesHome; i++){
3292  //for(int i = 0; i < numPatches; i++){
3293  HomePatch *patch = patchList[i];
3294  Sequencer *seq = patch->sequencer;
3295  // the following logic is duplicated from Sequencer::runComputeObjects
3296  // Migration always invalidates pairlists
3297  if (doMigration) {
3298  seq->pairlistsAreValid = 0;
3299  }
3300  if (seq->pairlistsAreValid &&
3301  ( patch->flags.doFullElectrostatics || ! simParams->fullElectFrequency )
3302  && (seq->pairlistsAge > seq->pairlistsAgeLimit) ) {
3303  seq->pairlistsAreValid = 0;
3304  }
3305  patch->flags.usePairlists = pairlists || seq->pairlistsAreValid;
3306  patch->flags.savePairlists = pairlists && !seq->pairlistsAreValid;
3307  if(patch->flags.savePairlists){
3308  // We need to rebuild pairlists -> reset tolerance values
3309  patch->flags.pairlistTolerance = patchList[i]->doPairlistCheck_newTolerance; // update pairListTolerance
3310  patch->flags.maxAtomMovement = 0;
3311  patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3312  }else if(patch->flags.usePairlists){
3313  // We can keep going with the existing pairlists -> update tolerances
3314  patch->flags.maxAtomMovement = patchMaxAtomMovement[i];
3315  patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3316  }else{
3317  // End of simulation
3318  patch->flags.maxAtomMovement=99999.;
3319  patch->flags.pairlistTolerance = 0.;
3320  }
3321  }
3322  if(patchList[0]->flags.savePairlists){
3323  // Backs up d_posSave_* for pairlistCheck
3324  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posSave_x, numAtomsHome, stream);
3325  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posSave_y, numAtomsHome, stream);
3326  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posSave_z, numAtomsHome, stream);
3327  myLatticeOld = myLattice;
3328  }
3329 }
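// Annotation (not part of SequencerCUDA.C): updatePairlistFlags mirrors the pairlist
// bookkeeping in Sequencer::runComputeObjects. A condensed sketch of the flag decision
// only, with fields matching the ones used above:
struct PairlistBookkeeping {
  int pairlistsAreValid;
  int pairlistsAge;
  int pairlistsAgeLimit;
};

// stillStepping is (step < N); doFullElectrostatics / fullElectFrequency mirror the flags.
static void pairlistFlags(PairlistBookkeeping& s, bool doMigration, bool stillStepping,
                          bool doFullElectrostatics, int fullElectFrequency,
                          int& usePairlists, int& savePairlists) {
  if (doMigration) s.pairlistsAreValid = 0;            // migration always invalidates lists
  if (s.pairlistsAreValid &&
      (doFullElectrostatics || fullElectFrequency == 0) &&
      (s.pairlistsAge > s.pairlistsAgeLimit)) {
    s.pairlistsAreValid = 0;                           // list has aged out
  }
  usePairlists  = stillStepping || s.pairlistsAreValid;
  savePairlists = stillStepping && !s.pairlistsAreValid;  // request a rebuild this step
}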
3330 
3331 void SequencerCUDA::finish_patch_flags(int migration)
3332 {
3333  for (int i=0; i < numPatchesHome; i++) {
3334  HomePatch *patch = patchList[i];
3335  Sequencer *seq = patch->sequencer;
3336  if (patch->flags.savePairlists && patch->flags.doNonbonded) {
3337  seq->pairlistsAreValid = 1;
3338  seq->pairlistsAge = 0;
3339  }
3340  if (seq->pairlistsAreValid /* && ! pressureStep */) {
3341  ++(seq->pairlistsAge);
3342  }
3343  }
3344 }
3345 
3346 
3347 void SequencerCUDA::launch_part2(
3348  const int doMCPressure,
3349  double dt_normal,
3350  double dt_nbond,
3351  double dt_slow,
3352  Vector &origin,
3353  int step,
3354  int maxForceNumber,
3355  const int langevinPistonStep,
3356  const int copyIn,
3357  const int copyOut,
3358  const int doGlobal,
3359  const bool doEnergy)
3360 {
3361  PatchMap* patchMap = PatchMap::Object();
3362  Tensor localVirial;
3363  //cudaTensor h_rigidVirial;
3364  bool doNbond = false;
3365  bool doSlow = false;
3366  cudaCheck(cudaSetDevice(deviceID));
3367  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3368  const int is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3369  // const int doVirial = langevinPistonStep;
3370  // JM: For launch_part2:
3371  // copyIn = migration steps
3372 
3373  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
3374 
3375  if(mGpuOn){
3376 #ifdef NAMD_NCCL_ALLREDUCE
3377  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3378 #endif
3379  }
3380 
3381  if(!simParams->langevinOn && !simParams->eFieldOn && !simParams->constraintsOn &&
3382  !simParams->SMDOn && !simParams->groupRestraintsOn && !doMCPressure &&
3383  !simParams->mgridforceOn && ! simParams->gridforceOn && !mGpuOn &&
3384  !simParams->consForceOn &&
3385  (simParams->watmodel == WaterModel::TIP3) &&
3386  (!is_lonepairs_psf)){
3387  CUDASequencerKernel->accumulate_force_kick(
3388  simParams->fixedAtomsOn,
3389  doGlobal,
3390  simParams->useCudaGlobal,
3391  maxForceNumber,
3392  numPatchesHomeAndProxy,
3393  patchData->devData[deviceIndex].d_localPatches,
3394  patchData->devData[deviceIndex].f_bond,
3395  patchData->devData[deviceIndex].f_bond_nbond,
3396  patchData->devData[deviceIndex].f_bond_slow,
3397  patchData->devData[deviceIndex].forceStride,
3398  patchData->devData[deviceIndex].f_nbond,
3399  patchData->devData[deviceIndex].f_nbond_slow,
3400  patchData->devData[deviceIndex].f_slow,
3401  d_f_global_x,
3402  d_f_global_y,
3403  d_f_global_z,
3404  coll_f_normal_x.getDevicePtr(),
3405  coll_f_normal_y.getDevicePtr(),
3406  coll_f_normal_z.getDevicePtr(),
3407  coll_f_nbond_x.getDevicePtr(),
3408  coll_f_nbond_y.getDevicePtr(),
3409  coll_f_nbond_z.getDevicePtr(),
3410  coll_f_slow_x.getDevicePtr(),
3411  coll_f_slow_y.getDevicePtr(),
3412  coll_f_slow_z.getDevicePtr(),
3413  coll_vel_x.getDevicePtr(),
3414  coll_vel_y.getDevicePtr(),
3415  coll_vel_z.getDevicePtr(),
3416  d_recipMass,
3417  d_atomFixed,
3418  dt_normal,
3419  dt_nbond,
3420  dt_slow,
3421  1.0,
3422  coll_unsortOrder.getDevicePtr(),
3423  myLattice,
3424  stream
3425  );
3426  }else{
3427  CUDASequencerKernel->accumulateForceToSOA(
3428  doGlobal,
3429  simParams->useCudaGlobal,
3430  maxForceNumber,
3431  numPatchesHomeAndProxy,
3432  nDevices,
3433  patchData->devData[deviceIndex].d_localPatches,
3434  patchData->devData[deviceIndex].f_bond,
3435  patchData->devData[deviceIndex].f_bond_nbond,
3436  patchData->devData[deviceIndex].f_bond_slow,
3437  patchData->devData[deviceIndex].forceStride,
3438  patchData->devData[deviceIndex].f_nbond,
3439  patchData->devData[deviceIndex].f_nbond_slow,
3440  patchData->devData[deviceIndex].f_slow,
3441  d_f_global_x,
3442  d_f_global_y,
3443  d_f_global_z,
3444  coll_f_normal_x.getDevicePtr(),
3445  coll_f_normal_y.getDevicePtr(),
3446  coll_f_normal_z.getDevicePtr(),
3447  coll_f_nbond_x.getDevicePtr(),
3448  coll_f_nbond_y.getDevicePtr(),
3449  coll_f_nbond_z.getDevicePtr(),
3450  coll_f_slow_x.getDevicePtr(),
3451  coll_f_slow_y.getDevicePtr(),
3452  coll_f_slow_z.getDevicePtr(),
3453  coll_unsortOrder.getDevicePtr(),
3454  myLattice,
3455  patchData->d_queues,
3456  patchData->d_queueCounters,
3457  d_tbcatomic,
3458  stream
3459  );
3460 
3461  }
3462 
3463  if (mGpuOn) {
3464  // Synchronize device before node barrier
3465  cudaCheck(cudaDeviceSynchronize());
3466  }
3467 }
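// Annotation (not part of SequencerCUDA.C): launch_part2 takes the fused
// accumulate_force_kick path only when nothing needs the accumulated forces on their
// own before the kick (no Langevin, external or global forces, multi-GPU merge, lone
// pairs, or non-TIP3 water); otherwise it runs accumulateForceToSOA and defers the kick
// to launch_part3. A compact restatement of that gate with flags mirroring the tests above:
static bool canFuseForceAndKick(bool langevinOn, bool eFieldOn, bool constraintsOn,
                                bool SMDOn, bool groupRestraintsOn, bool doMCPressure,
                                bool gridforceOn, bool mgridforceOn, bool consForceOn,
                                bool mGpuOn, bool waterIsTIP3, bool hasLonePairs) {
  return !langevinOn && !eFieldOn && !constraintsOn &&
         !SMDOn && !groupRestraintsOn && !doMCPressure &&
         !gridforceOn && !mgridforceOn && !consForceOn &&
         !mGpuOn && waterIsTIP3 && !hasLonePairs;
}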
3468 
3469 // launch_part2 is broken into two parts to support the MC barostat
3470 void SequencerCUDA::launch_part3(
3471  const int doMCPressure,
3472  double dt_normal,
3473  double dt_nbond,
3474  double dt_slow,
3475  Vector &origin,
3476  int step,
3477  int maxForceNumber,
3478  const bool requestGlobalForces,
3479  const int doGlobalStaleForces,
3480  const bool forceRequestedGPU,
3481  const int copyIn,
3482  const int copyOut,
3483  const bool doEnergy,
3484  const bool requestForcesOutput)
3485 {
3486  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3487  const bool doFixed = simParams->fixedAtomsOn;
3488  const double velrescaling = 1; // no rescaling
3489 
3490  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3491  simParams->SMDOn || simParams->groupRestraintsOn || requestGlobalForces || simParams->gridforceOn|| simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3492  (simParams->watmodel != WaterModel::TIP3) ||
3494  if(mGpuOn){
3495 #ifndef NAMD_NCCL_ALLREDUCE
3496  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
3497  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
3498  // 0, deviceCUDA->getNcclComm(), stream);
3499 
3500  std::vector<int> atom_counts;
3501  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3502  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3503  }
3504  CUDASequencerKernel->mergeForcesFromPeers(
3505  deviceIndex,
3506  maxForceNumber,
3507  myLattice,
3508  numPatchesHomeAndProxy,
3509  numPatchesHome,
3510  this->coll_f_normal_x.getDevicePeerPtr(),
3511  this->coll_f_normal_y.getDevicePeerPtr(),
3512  this->coll_f_normal_z.getDevicePeerPtr(),
3513  this->coll_f_nbond_x.getDevicePeerPtr(),
3514  this->coll_f_nbond_y.getDevicePeerPtr(),
3515  this->coll_f_nbond_z.getDevicePeerPtr(),
3516  this->coll_f_slow_x.getDevicePeerPtr(),
3517  this->coll_f_slow_y.getDevicePeerPtr(),
3518  this->coll_f_slow_z.getDevicePeerPtr(),
3519  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
3520  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
3521  patchData->devData[deviceIndex].d_localPatches,
3522  patchData->devData[deviceIndex].d_peerPatches,
3523  atom_counts,
3524  stream
3525  );
3526 #else
3527  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3528  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
3529 #endif
3530  }
3531  if(doVirial && doGlobalStaleForces)
3532  {
3533  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3534  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3535  computeGlobalMasterVirial(
3536  numPatchesHomeAndProxy,
3537  numAtomsHome,
3538  patchData->devData[deviceIndex].d_localPatches,
3539  coll_pos_x.getDevicePtr(),
3540  coll_pos_y.getDevicePtr(),
3541  coll_pos_z.getDevicePtr(),
3542  d_transform,
3543  d_f_global_x,
3544  d_f_global_y,
3545  d_f_global_z,
3546  &d_extForce[EXT_GLOBALMTS],
3547  &extForce[EXT_GLOBALMTS],
3548  &d_extVirial[EXT_GLOBALMTS],
3549  &extVirial[EXT_GLOBALMTS],
3550  myLattice,
3551  d_tbcatomic,
3552  stream);
3553  }
3554  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
3555 #if 0
3556  cudaCheck(cudaDeviceSynchronize());
3557  if(true || deviceID == 0){
3558  char prefix[10];
3559  snprintf(prefix, 10, "step-%d",step);
3560  this->printSOAForces(prefix);
3561  }
3562 #endif
3563 
3564  }
3565 
3566  if (simParams->langevinOn) {
3567  CUDASequencerKernel->langevinVelocitiesBBK1(
3568  dt_normal, d_langevinParam, coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), numAtomsHome, stream);
3569  }
3570 
3571  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3572  simParams->SMDOn || simParams->groupRestraintsOn || doMCPressure || simParams->gridforceOn || simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3573  (simParams->watmodel != WaterModel::TIP3) ||
3575  CUDASequencerKernel->addForceToMomentum(
3576  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3577  d_recipMass,
3578  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3579  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3580  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3581  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
3582  numAtomsHome, maxForceNumber, stream);
3583  }
3584 
3585  if (simParams->langevinOn) {
3586 
3587  // must enforce rigid bond constraints if langevin gammas differ
3588  if (simParams->rigidBonds != RIGID_NONE &&
3589  simParams->langevinGammasDiffer) {
3590  CUDASequencerKernel->rattle1(
3591  simParams->fixedAtomsOn, doEnergy || doVirial,
3592  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3593  2.0 * simParams->rigidTol,
3594  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3595  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3596  d_velNew_x, d_velNew_y, d_velNew_z,
3597  d_posNew_x, d_posNew_y, d_posNew_z,
3598  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3599  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3600  &settleList, settleListSize, &d_consFailure,
3601  d_consFailureSize, &rattleList, rattleListSize,
3602  &nSettle, &nRattle,
3603  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3604  buildRigidLists, consFailure, simParams->watmodel, stream);
3605  buildRigidLists = false;
3606  }
3607  CUDASequencerKernel->langevinVelocitiesBBK2(
3608  dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3609  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3610  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3611  numAtomsHome, numAtomsHome, 0,
3612  curandGen, stream);
3613  }
3614  if(simParams->rigidBonds != RIGID_NONE){
3615  CUDASequencerKernel->rattle1(
3616  simParams->fixedAtomsOn, doEnergy || doVirial,
3617  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3618  2.0 * simParams->rigidTol,
3619  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3620  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3621  d_velNew_x, d_velNew_y, d_velNew_z,
3622  d_posNew_x, d_posNew_y, d_posNew_z,
3623  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3624  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3625  &settleList, settleListSize, &d_consFailure,
3626  d_consFailureSize, &rattleList, rattleListSize,
3627  &nSettle, &nRattle,
3628  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3629  buildRigidLists, consFailure, simParams->watmodel, stream);
3630  buildRigidLists = false;
3631  }
3632 
3633  // Update velocity center of mass here
3634  if(doEnergy || doVirial){
3635  CUDASequencerKernel->centerOfMass(
3636  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3637  d_vcm_x, d_vcm_y, d_vcm_z,
3638  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3639  }
3640 
3641  submitHalf(numAtomsHome, 2, doEnergy || doVirial);
3642 
3643  CUDASequencerKernel->addForceToMomentum(
3644  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3645  d_recipMass,
3646  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3647  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3648  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3649  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
3650  numAtomsHome, maxForceNumber, stream);
3651 
3652  if(requestGlobalForces || requestForcesOutput) {
3653  // Store the forces for the next step,
3654  // when we need them for Colvars and Tcl scripting
3655  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3656  }
3657 
3658  if (forceRequestedGPU) {
3659  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3660  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3661  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3662  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3663  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3664  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3665  CUDASequencerKernel->copyForcesToDevice(
3666  numAtomsHome, coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3667  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3668  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3669  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3670  // cudaCheck(cudaStreamSynchronize(stream));
3671  }
3672 
3673  //cudaCheck(cudaStreamSynchronize(stream));
3674  submitReductions(origin.x, origin.y, origin.z,
3675  marginViolations, doEnergy || doVirial,
3676  copyOut && simParams->outputMomenta != 0,
3677  numAtomsHome, maxForceNumber);
3678  // This is for collecting coordinates and velocities for output
3679  copyPositionsAndVelocitiesToHost(copyOut, 0);
3680 
3681  if(consFailure[0]){
3682  // Constraint failure. Abort.
3683  int dieOnError = simParams->rigidDie;
3684  if(dieOnError){
3685  // Bails out
3686  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
3687  NAMD_die("constraint failure during CUDA rattle!\n");
3688  }else{
3689  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3690  }
3691  }else if(doEnergy || doVirial){
3692  cudaCheck(cudaStreamSynchronize(stream));
3693  if(doVirial && doGlobalStaleForces) {
3694  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3695  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3696  }
3697  if(simParams->rigidBonds != RIGID_NONE){
3698  Tensor reduction_rigidVirial;
3699  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
3700  // Haochuan: the tensor is not symmetric when there are fixed atoms
3701  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3702  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
3703  }
3704 
3705  // SUBMITHALF reductions
3706  Tensor reduction_virial;
3707  Tensor reduction_intVirialNormal;
3708  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3709  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3710  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3711  // Haochuan: the tensor is not symmetric when there are fixed atoms
3712  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3713  reduction_virial *= 0.5;
3714  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3715 
3716  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3717  += (intKineticEnergy_half[0] * 0.25);
3718  reduction_intVirialNormal *= 0.5;
3719  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3720  reduction_intVirialNormal);
3721 
3722  //submitReductions1
3723  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
3724  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3725  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
3726  Vector angularMomentum(*angularMomentum_x,
3727  *angularMomentum_y,
3728  *angularMomentum_z);
3729  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3730  //submitReductions2
3731  Tensor regintVirialNormal;
3732  Tensor regintVirialNbond;
3733  Tensor regintVirialSlow;
3734  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
3735  if (maxForceNumber >= 1) {
3736  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
3737  }
3738  if (maxForceNumber >= 2) {
3739  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
3740  }
3741 
3742  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
3743  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
3744  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
3745  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
3746 
3747  if (simParams->fixedAtomsOn) {
3748  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3749  double3 fixForceNormal, fixForceNbond, fixForceSlow;
3750  switch (maxForceNumber) {
3751  case 2: {
3752  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3753  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3754  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
3755  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
3756  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
3757  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3758  } // intentionally fallthrough
3759  case 1: {
3760  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3761  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3762  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
3763  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
3764  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
3765  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3766  } // intentionally fallthrough
3767  default: {
3768  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3769  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3770  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
3771  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
3772  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
3773  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3774  }
3775  }
3776 #if 0
3777  auto printTensor = [](const cudaTensor& t, const std::string& name){
3778  CkPrintf("%s", name.c_str());
3779  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3780  "%12.5lf %12.5lf %12.5lf\n"
3781  "%12.5lf %12.5lf %12.5lf\n",
3782  t.xx, t.xy, t.xz,
3783  t.yx, t.yy, t.yz,
3784  t.zx, t.zy, t.zz);
3785  };
3786  printTensor(fixVirialNormal, "fixVirialNormal = ");
3787  printTensor(fixVirialNbond, "fixVirialNbond = ");
3788  printTensor(fixVirialSlow, "fixVirialSlow = ");
3789 #endif
3790  }
3791  }
3792 }
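// Annotation (not part of SequencerCUDA.C): across launch_part1 and launch_part3 the
// integrator calls addForceToMomentum with scale factors 0.5 (half-kick with the previous
// forces before the drift), 1.0 (kick with the freshly computed forces; this is the state
// used for the half-step kinetic energy and the Langevin/rattle updates), and -0.5 (which
// brings the stored velocities back to the on-step time). A one-degree-of-freedom sketch
// of the update those calls perform, with unit bookkeeping omitted:
static double addForceToMomentumScalar(double v, double scale, double dt,
                                       double force, double recipMass) {
  return v + scale * dt * force * recipMass;  // v += (scale*dt) * a
}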
3793 
3794 // Adding this function back temporarily until GPU migration is merged
3795 #if 1
3796 // This function aggregates data within a single GPU, so the PME positions need to have been copied over
3797 void SequencerCUDA::atomUpdatePme()
3798 {
3799  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
3800                                     ComputeNonbondedUtil::dielectric_1);
3801  // We need to find doNbond and doSlow for the upcoming step
3802  bool doNbond = false;
3803  bool doSlow = true;
3804 
3805  bool doFEP = false;
3806  bool doTI = false;
3807  bool doAlchDecouple = false;
3808  bool doAlchSoftCore = false;
3809  if (simParams->alchOn) {
3810  if (simParams->alchFepOn) doFEP = true;
3811  if (simParams->alchThermIntOn) doTI = true;
3812  if (simParams->alchDecouple) doAlchDecouple = true;
3813  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3814  }
3815 
3816  if (Node::Object()->molecule->is_lonepairs_psf) {
3817  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
3818  }
3819 
3820  bool usePatchPme = false;
3821  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3822  // This checks whether the current lattice is compatible with the patch-level PME kernels.
3823  // The check needs to be redone every time the lattice changes; the results are stored in
3824  // the cudaPme object, and the overall compatibility is reported by the compatible()
3825  // function.
3826  //
3827  // The behavior of the PME position setup differs depending on the kernels being used,
3828  // so that value (usePatchPme) needs to be passed to the kernel object.
3830  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3832  myLattice,
3833  getNumPatchesHome(),
3834  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3835  getHostPatchMin(),
3836  getHostPatchMax(),
3837  getHostAwayDists()
3838  );
3839  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3840  }
3841 
3842  std::vector<int> atom_counts;
3843  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3844  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3845  }
3846  CUDASequencerKernel->set_pme_positions(
3847  deviceIndex,
3849  nDevices,
3850  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3851  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3852 #ifdef NAMD_NCCL_ALLREDUCE
3853  (mGpuOn) ? d_posNew_x: coll_pos_x.getDevicePtr(),
3854  (mGpuOn) ? d_posNew_y: coll_pos_y.getDevicePtr(),
3855  (mGpuOn) ? d_posNew_z: coll_pos_z.getDevicePtr(),
3856 #else
3857  coll_pos_x.getDevicePtr(),
3858  coll_pos_y.getDevicePtr(),
3859  coll_pos_z.getDevicePtr(),
3860  coll_pos_x.getDevicePeerPtr(), // passes double-pointer if mgpuOn
3861  coll_pos_y.getDevicePeerPtr(),
3862  coll_pos_z.getDevicePeerPtr(),
3863  coll_charge.getDevicePeerPtr(),
3864  coll_partition.getDevicePeerPtr(),
3865 #endif
3866  coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
3867  coll_patchCenter.getDevicePtr(),
3868  patchData->devData[deviceIndex].slow_patchPositions,
3869  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3870  coll_sortOrder.getDevicePtr(), myLattice,
3871  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3872  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3874  patchData->devData[deviceIndex].d_localPatches,
3875  patchData->devData[deviceIndex].d_peerPatches,
3876  atom_counts,
3877  stream);
3878 
3879  cudaCheck(cudaStreamSynchronize(stream));
3880 }
3881 #endif
3882 
3883 
3884 void SequencerCUDA::sync() {
3885  cudaCheck(cudaStreamSynchronize(stream));
3886 }
3887 
3888 void SequencerCUDA::calculateExternalForces(
3889  const int step,
3890  const int maxForceNumber,
3891  const int doEnergy,
3892  const int doVirial) {
3893 
3894  const bool is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3895  const bool is_tip4_water = simParams->watmodel == WaterModel::TIP4;
3896 
3897  if (is_lonepairs_psf) {
3898  lonepairsKernel->redistributeForce(
3899  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3900  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3901  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3902  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3903  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), maxForceNumber, doEnergy || doVirial, stream);
3904  }
3905  // TODO: Should I use "else if" here to follow the logic of Sequencer.C?
3906  if (is_tip4_water) {
3907  redistributeTip4pForces(maxForceNumber, doEnergy || doVirial);
3908  }
3909 
3910  if(simParams->eFieldOn){
3911  double3 efield;
3912  efield.x = simParams->eField.x;
3913  efield.y = simParams->eField.y;
3914  efield.z = simParams->eField.z;
3915 
3916  double efield_omega = TWOPI * simParams->eFieldFreq / 1000.;
3917  double efield_phi = PI/180. * simParams->eFieldPhase;
3918  double t = step * simParams->dt;
3919 
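      // The kernel applies a time-dependent field built from these parameters; a plausible
      // form (an assumption here, not taken from the kernel source) is a cosine,
      //   E(t) = eField * cos(efield_omega * t - efield_phi),
      // with the phase converted from degrees to radians above.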
3920  CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3921  doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3922  coll_charge.getDevicePtr(), coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3923  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3924  &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3925  &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3926  &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3927  d_tbcatomic, stream);
3928  }
3929 
3930  if(simParams->constraintsOn){
3931  restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3932  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3933  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3934  &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3935  &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3936  &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
3937  }
3938 
3939  if(simParams->SMDOn){
3940  SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
3941  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_transform,
3942  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3943  &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
3944  &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
3945  }
3946 
3947  if(simParams->groupRestraintsOn){
3948  groupRestraintsKernel->doForce(step, doEnergy, doVirial,
3949  myLattice, d_transform,
3950  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3951  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3952  &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
3953  &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
3954  }
3955  if(simParams->mgridforceOn || simParams->gridforceOn){
3956  gridForceKernel->doForce(doEnergy, doVirial,
3957  myLattice, step,
3958  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_transform,
3959  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3960  stream);
3961  }
3962  if(simParams->consForceOn){
3963  consForceKernel->doForce(myLattice, doVirial,
3964  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3965  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3966  d_transform,
3967  &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
3968  &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
3969  }
3970 
3971  if(doEnergy || doVirial) {
3972  // Accumulate the external force, energy, and virial contributions into the reduction
3973  cudaCheck(cudaStreamSynchronize(stream));
3974  if (is_lonepairs_psf || is_tip4_water) {
3975  switch (maxForceNumber) {
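      // The cases below intentionally fall through (no break): maxForceNumber == 2 harvests
      // the slow, nbond, and normal lone-pair virials; 1 harvests nbond and normal; 0 only normal.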
3976  case 2:
3977  copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
3978  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, lpVirialSlow[0]);
3979  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
3980  case 1:
3981  copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
3982  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, lpVirialNbond[0]);
3983  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
3984  case 0:
3985  copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
3986  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, lpVirialNormal[0]);
3987  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
3988  }
3989  }
3990  if(simParams->eFieldOn){
3991  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_ELEC_FIELD];
3992  if (!simParams->eFieldNormalized){
3993  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
3994  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
3995  }
3996  }
3997 
3998  if(simParams->constraintsOn){
3999  reduction->item(REDUCTION_BC_ENERGY) += extEnergy[EXT_CONSTRAINTS];
4000  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
4001  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
4002  }
4003 
4004  if(simParams->SMDOn){
4006  if(cudaMgr->reducerSMDDevice == deviceIndex)
4007  {
4008  // every device's SMD kernel holds the total SMD energy and external force,
4009  // so only the designated reducer device contributes them, avoiding double counting
4010  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_SMD];
4011  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_SMD]);
4012  }
4013  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_SMD]);
4014  }
4015 
4016  if(simParams->groupRestraintsOn){
4017  // only the single-GPU case or the designated reducer device contributes the energy and force
4019  if(!mGpuOn || (deviceCUDA->getDeviceIndex() == cudaMgr->reducerGroupRestraintDevice))
4020  {
4021  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GROUP_RESTRAINTS];
4022  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
4023  }
4024  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
4025  }
4026 
4027  if(simParams->mgridforceOn || simParams->gridforceOn){
4028  // first sum across grids
4029  gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4030  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GRIDFORCE];
4031  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
4032  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
4033  gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4034  }
4035  if(simParams->consForceOn){
4036  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_CONSFORCE];
4037  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
4038  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
4039  }
4040  }
4041 }
4042 
4043 void SequencerCUDA::copyGlobalForcesToDevice(){
4044  // Copy the globalMaster forces from host to device.
4045  // The per-patch f_global arrays are first aggregated into the flat host buffers.
4046  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
4047  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
4048  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4049  // TODO: determine whether this aggregation needs peer buffers and should cover home + proxy patches
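      // Each patch's atoms occupy a contiguous slice of the flat host buffers starting at
      // record.bufferOffset, so a single HtoD copy per force component suffices after the loop.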
4050  for(int i =0 ; i < numPatchesHome; i++){
4051  CudaLocalRecord record = localPatches[i];
4052  const int patchID = record.patchID;
4053  const int stride = record.bufferOffset;
4054  const int numPatchAtoms = record.numAtoms;
4055  PatchDataSOA& current = homePatches[i]->patchDataSOA;
4056  memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4057  memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4058  memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4059  }
4060  // copy aggregated force to device buffer
4061  copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4062  copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4063  copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4064 
4065 }
4066 
4067 void SequencerCUDA::updateHostPatchDataSOA() {
4068  std::vector<PatchDataSOA> host_copy(numPatchesHome);
4069  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4070 
4071  for(int i =0 ; i < numPatchesHome; i++) {
4072  host_copy[i] = homePatches[i]->patchDataSOA;
4073  }
4074  copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4075  cudaCheck(cudaDeviceSynchronize());
4076 }
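      // Note: the PatchDataSOA structs are copied by value, so if a patch later reallocates
      // its host arrays the device-side copy presumably goes stale until this is called again.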
4077 
4078 void SequencerCUDA::saveForceCUDASOA_direct(
4079  const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4080  CUDASequencerKernel->copyForcesToHostSOA(
4081  numPatchesHome,
4082  patchData->devData[deviceIndex].d_localPatches,
4083  maxForceNumber,
4084  coll_f_normal_x.getDevicePtr(),
4085  coll_f_normal_y.getDevicePtr(),
4086  coll_f_normal_z.getDevicePtr(),
4087  coll_f_nbond_x.getDevicePtr(),
4088  coll_f_nbond_y.getDevicePtr(),
4089  coll_f_nbond_z.getDevicePtr(),
4090  coll_f_slow_x.getDevicePtr(),
4091  coll_f_slow_y.getDevicePtr(),
4092  coll_f_slow_z.getDevicePtr(),
4093  d_HostPatchDataSOA,
4094  doGlobal,
4095  doForcesOutput,
4096  stream
4097  );
4098  cudaCheck(cudaStreamSynchronize(stream));
4099 }
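      // The "_direct" path assumes the kernel can scatter forces straight into the per-patch
      // host arrays referenced through d_HostPatchDataSOA (presumably pinned/mapped memory),
      // avoiding a separate aggregated host-side copy.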
4100 
4101 void SequencerCUDA::copyPositionsToHost_direct() {
4102  CUDASequencerKernel->copyPositionsToHostSOA(
4103  numPatchesHome,
4104  patchData->devData[deviceIndex].d_localPatches,
4105  coll_pos_x.getDevicePtr(),
4106  coll_pos_y.getDevicePtr(),
4107  coll_pos_z.getDevicePtr(),
4108  d_HostPatchDataSOA,
4109  stream
4110  );
4111  cudaCheck(cudaStreamSynchronize(stream));
4112 }
4113 
4114 void SequencerCUDA::redistributeTip4pForces(
4115  const int maxForceNumber,
4116  const int doVirial) {
4117  CUDASequencerKernel->redistributeTip4pForces(
4118  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
4119  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
4120  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
4121  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4122  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_mass,
4123  numAtomsHome, doVirial, maxForceNumber, stream
4124  );
4125 }
4126 
4127 void SequencerCUDA::allocateGPUSavedForces() {
4128  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4129  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4130  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4131  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4132  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4133  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
4134 }
4135 
4136 void SequencerCUDA::submitReductionValues() {
4137  reduction->submit();
4138 }
4139 
4140 #endif // NAMD_CUDA