#ifdef NODEGROUP_FORCE_REGISTER

#define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHome; i++) { \
      PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
  } while (0)

#define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
      PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
  } while (0)

#define AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHome; i++) { \
      PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, coll_ ## fieldName .getDevicePtr(), numAtomsHome, stream); \
  } while (0)

#define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
      PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, coll_ ## fieldName .getDevicePtr(), numAtomsHomeAndProxy, stream); \
  } while (0)

#else

#define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
      PatchDataSOA& current = elem->patch->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
  } while (0)

#endif // NODEGROUP_FORCE_REGISTER

void SequencerCUDA::registerSOAPointersToHost() {
  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
}
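// Publishes the per-device peer records and SOA peer pointers collected in patchData to this
// device, so kernels can dereference buffers that live on other GPUs in multi-device runs.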
void SequencerCUDA::copySOAHostRegisterToDevice() {
  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);

  for (int i = 0; i < this->nDevices; i++) {
    patchData->h_soa_sortOrder[i] = coll_sortOrder.getHostPeer()[i];
    patchData->h_soa_vdwType[i] = coll_vdwType.getHostPeer()[i];
    patchData->h_soa_id[i] = coll_idMig.getHostPeer()[i];
    patchData->h_soa_migrationDestination[i] = coll_migrationDestination.getHostPeer()[i];
    patchData->h_soa_partition[i] = coll_partition.getHostPeer()[i];
  }

  for (int i = 0; i < this->nDevices; i++)
    h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
}
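// Debugging helper: copies the device-side position and velocity SOA buffers back to the host
// and prints them per home patch, serialized across PEs through patchData->printlock.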
void SequencerCUDA::printSOAPositionsAndVelocities() {
  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(coll_pos_x.getDevicePtr(), h_pos_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_pos_y.getDevicePtr(), h_pos_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_pos_z.getDevicePtr(), h_pos_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(coll_vel_x.getDevicePtr(), h_vel_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_vel_y.getDevicePtr(), h_vel_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_vel_z.getDevicePtr(), h_vel_z, numAtomsHome);

  CmiLock(this->patchData->printlock);
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  for (int i = 0; i < numPatchesHome; i++) {
    const CudaLocalRecord& record = localPatches[i];
    const PatchDataSOA& current = homePatches[i]->patchDataSOA;
    const int patchID = record.patchID;
    const int numPatchAtoms = record.numAtoms;
    const int stride = record.bufferOffset;
    fprintf(stderr, "Patch [%d]:\n", patchID);
    for (int j = 0; j < numPatchAtoms; j++) {
      fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
          h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
          h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
    }
  }
  CmiUnlock(this->patchData->printlock);
}
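// Debugging helper: dumps the normal, nonbonded, and slow force buffers for every home patch,
// either to stderr or to one file per patch when a filename prefix is supplied.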
void SequencerCUDA::printSOAForces(char *prefix) {
  copy_DtoH_sync<BigReal>(coll_f_normal_x.getDevicePtr(), h_f_normal_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_f_normal_y.getDevicePtr(), h_f_normal_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_f_normal_z.getDevicePtr(), h_f_normal_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(coll_f_nbond_x.getDevicePtr(), h_f_nbond_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_f_nbond_y.getDevicePtr(), h_f_nbond_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_f_nbond_z.getDevicePtr(), h_f_nbond_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(coll_f_slow_x.getDevicePtr(), h_f_slow_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_f_slow_y.getDevicePtr(), h_f_slow_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(coll_f_slow_z.getDevicePtr(), h_f_slow_z, numAtomsHome);

  CmiLock(this->patchData->printlock);
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
  for (int i = 0; i < numPatchesHome; i++) {
    const CudaLocalRecord& record = localPatches[i];
    const int patchID = record.patchID;
    const int numPatchAtoms = record.numAtoms;
    const int stride = record.bufferOffset;
    FILE *outfile = stderr;
    char fname[100];
    if (prefix != NULL) {
      snprintf(fname, 100, "%s-patch-%d", prefix, patchID);
      outfile = fopen(fname, "w");
    }
    fprintf(outfile, "Patch [%d]:\n", patchID);
    for (int j = 0; j < numPatchAtoms; j++) {
      fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
          h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
          h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
          h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j]);
    }
    if (prefix != NULL) fclose(outfile);
  }
  CmiUnlock(this->patchData->printlock);
}
SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID, SimParameters *sim_Params) {
  if (CkpvAccess(SequencerCUDA_instance) == 0) {
    CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
  }
  return CkpvAccess(SequencerCUDA_instance);
}

SequencerCUDA::SequencerCUDA(const int deviceID_ID, SimParameters *sim_Params) :
  deviceID(deviceID_ID),
  simParams(sim_Params)
{
  restraintsKernel = NULL;
  groupRestraintsKernel = NULL;
  gridForceKernel = NULL;
  consForceKernel = NULL;
  lonepairsKernel = nullptr;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDASequencerKernel = new SequencerCUDAKernel();
  CUDAMigrationKernel = new MigrationCUDAKernel();

  used_grids.resize(num_used_grids, 0);
  rescalePairlistTolerance = false;
}
SequencerCUDA::~SequencerCUDA() {
  deallocateStaticArrays();
  deallocate_device<SettleParameters>(&sp);
  deallocate_device<int>(&settleList);
  deallocate_device<CudaRattleElem>(&rattleList);
  deallocate_device<int>(&d_consFailure);
  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
  if (restraintsKernel != NULL) delete restraintsKernel;
  if (SMDKernel != NULL) delete SMDKernel;
  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
  if (gridForceKernel != NULL) delete gridForceKernel;
  if (lonepairsKernel != nullptr) delete lonepairsKernel;
  if (consForceKernel != NULL) delete consForceKernel;
  CmiDestroyLock(printlock);
}

void SequencerCUDA::zeroScalars() {
  numAtomsHomeAndProxyAllocated = 0;
  numAtomsHomeAllocated = 0;
  buildRigidLists = true;
  numPatchesCheckedIn = 0;
}
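// One-time per-device setup: creates the CUDA streams and the cuRAND generator, then allocates
// the per-patch and reduction buffers (host and device) used throughout the integration loop.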
void SequencerCUDA::initialize() {
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
#endif

  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
  unsigned long long seed = simParams->randomSeed + CkMyPe();
  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));

  numAtomsHomeAllocated = 0;
  numAtomsHomeAndProxyAllocated = 0;
  totalMarginViolations = 0;
  buildRigidLists = true;
  numPatchesCheckedIn = 0;

  mGpuOn = nDevices > 1;
  printlock = CmiCreateLock();

  const int numPes = CkNumPes();
  atomMapList.resize(numPes);

  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
  allocate_device<double3>(&d_fixForceNormal, 1);
  allocate_device<double3>(&d_fixForceNbond, 1);
  allocate_device<double3>(&d_fixForceSlow, 1);
  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));

  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);
  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);

  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
  allocate_host<double3>(&patchCenter, numPatchesGlobal);
  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
  allocate_host<int>(&patchToDeviceMap, numPatchesGlobal);
  allocate_host<double3>(&awayDists, numPatchesGlobal);
  allocate_host<double3>(&patchMin, numPatchesGlobal);
  allocate_host<double3>(&patchMax, numPatchesGlobal);
  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal);
  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);

  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);

  allocate_device<int>(&d_killme, 1);
  allocate_device<char>(&d_barrierFlag, 1);
  allocate_device<unsigned int>(&d_tbcatomic, 5);
  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);
  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<SettleParameters>(&sp, 1);
  allocate_host<int>(&killme, 1);
  allocate_host<BigReal>(&kineticEnergy, 1);
  allocate_host<BigReal>(&intKineticEnergy, 1);
  allocate_host<BigReal>(&kineticEnergy_half, 1);
  allocate_host<BigReal>(&intKineticEnergy_half, 1);
  allocate_host<BigReal>(&momentum_x, 1);
  allocate_host<BigReal>(&momentum_y, 1);
  allocate_host<BigReal>(&momentum_z, 1);
  allocate_host<BigReal>(&angularMomentum_x, 1);
  allocate_host<BigReal>(&angularMomentum_y, 1);
  allocate_host<BigReal>(&angularMomentum_z, 1);
  allocate_host<int>(&consFailure, 1);
  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
  allocate_host<unsigned int>(&h_marginViolations, 1);
  allocate_host<unsigned int>(&h_periodicCellSmall, 1);

  allocate_host<cudaTensor>(&virial, 1);
  allocate_host<cudaTensor>(&virial_half, 1);
  allocate_host<cudaTensor>(&intVirialNormal, 1);
  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
  allocate_host<cudaTensor>(&intVirialNbond, 1);
  allocate_host<cudaTensor>(&intVirialSlow, 1);
  allocate_host<cudaTensor>(&rigidVirial, 1);
  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
  allocate_host<cudaTensor>(&lpVirialNormal, 1);
  allocate_host<cudaTensor>(&lpVirialNbond, 1);
  allocate_host<cudaTensor>(&lpVirialSlow, 1);

  d_f_saved_nbond_x = nullptr;
  d_f_saved_nbond_y = nullptr;
  d_f_saved_nbond_z = nullptr;
  d_f_saved_slow_x = nullptr;
  d_f_saved_slow_y = nullptr;
  d_f_saved_slow_z = nullptr;

  *kineticEnergy = 0.0;
  *intKineticEnergy = 0.0;
  *kineticEnergy_half = 0.0;
  *intKineticEnergy_half = 0.0;
  *angularMomentum_x = 0.0;
  *angularMomentum_y = 0.0;
  *angularMomentum_z = 0.0;

  t_setComputePositions = 0;
  t_accumulateForceKick = 0;
  t_submitReductions1 = 0;
  t_submitReductions2 = 0;

  cudaEventCreate(&eventStart);
  cudaEventCreate(&eventStop);
  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
  memset(h_marginViolations, 0, sizeof(unsigned int));
  memset(h_periodicCellSmall, 0, sizeof(unsigned int));
  memset(intVirialNormal, 0, sizeof(cudaTensor));
  memset(intVirialNbond, 0, sizeof(cudaTensor));
  memset(lpVirialNormal, 0, sizeof(cudaTensor));
  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);

  d_consFailure = NULL;
  d_consFailureSize = 0;
  numPatchesHome = numPatchesGlobal;

  for (int i = 0; i < numPes; i++) {
    for (int j = 0; j < npatch; j++) {
      patchList.push_back(patch);
      patchNewTolerance[count++] =
      patchData->devData[deviceIndex].patches.push_back(patch);
      patchListHomeAndProxy.push_back(patch);
    }
  }

  for (int i = 0; i < numPes; ++i) {
    patchData->devData[deviceIndex].patches.push_back(patch);
  }

#ifdef NAMD_NCCL_ALLREDUCE
  deviceCUDA->setNcclUniqueId(patchData->ncclId);
#endif

  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,

  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  SMDKernel->initPeerCOM(cudaMgr->curSMDCOM, stream);

  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
      simParams->groupRestraints, mGpuOn, nDevices, deviceIndex);
  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  groupRestraintsKernel->initPeerCOM(stream);

  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);

  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList, mGpuOn);
}
void SequencerCUDA::updateDeviceKernels()
{
  if (patchData->updateCounter.fetch_sub(1) >= 1)
  {
    if (gridForceKernel != NULL)
    {
      delete gridForceKernel;
      gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
      gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
    }
    if (consForceKernel != NULL)
    {
      delete consForceKernel;
      consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList, mGpuOn);
      consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
    }
  }
}
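// (Re)allocates the per-atom host and device arrays. Buffers are over-allocated by OVERALLOC
// (1.5x) so that small fluctuations in atom counts after migration do not force a new
// allocation every step; the return value reports whether a reallocation was performed.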
bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy)
{
  const float OVERALLOC = 1.5f;

  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated) {
    return false;
  }

  bool realloc_gpu_saved_force = false;
  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
    realloc_gpu_saved_force = true;
  }

  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);

  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);
  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);

  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
  allocate_host<char3>(&transform, numAtomsHomeAllocated);
  allocate_host<float>(&mass, numAtomsHomeAllocated);
  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);
  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);

  int n = (numAtomsHomeAllocated + 1) & (~1);
  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
  allocate_host<int>(&idMig, numAtomsHomeAllocated);
  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);

  if (realloc_gpu_saved_force) {
    allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
  }

  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];

  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);

  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);
  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);

  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
  d_f_saved_nbond_x = nullptr;
  d_f_saved_nbond_y = nullptr;
  d_f_saved_nbond_z = nullptr;
  d_f_saved_slow_x = nullptr;
  d_f_saved_slow_y = nullptr;
  d_f_saved_slow_z = nullptr;
  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  cudaCheck(cudaMemset(coll_pos_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(coll_pos_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(coll_pos_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(coll_vel_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(coll_vel_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(coll_vel_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_z, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_z, 0, sizeof(double)*numAtomsHomeAllocated));

  return true;
}

void SequencerCUDA::reallocateMigrationDestination() {
  coll_migrationDestination.deallocate();
}
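// Frees the per-atom buffers created by reallocateArrays(); guarded by
// numAtomsHomeAndProxyAllocated so it is a no-op before the first allocation.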
void SequencerCUDA::deallocateArrays() {
  if (numAtomsHomeAndProxyAllocated != 0) {
    deallocate_host<double>(&f_normal_x);
    deallocate_host<double>(&f_normal_y);
    deallocate_host<double>(&f_normal_z);
    deallocate_host<double>(&f_global_x);
    deallocate_host<double>(&f_global_y);
    deallocate_host<double>(&f_global_z);
    deallocate_host<double>(&f_nbond_x);
    deallocate_host<double>(&f_nbond_y);
    deallocate_host<double>(&f_nbond_z);
    deallocate_host<double>(&f_slow_x);
    deallocate_host<double>(&f_slow_y);
    deallocate_host<double>(&f_slow_z);
    deallocate_host<double>(&pos_x);
    deallocate_host<double>(&pos_y);
    deallocate_host<double>(&pos_z);
    deallocate_host<float>(&charge);
    deallocate_host<int>(&sortOrder);
    deallocate_host<int>(&unsortOrder);
    deallocate_host<double>(&recipMass);
    deallocate_host<double>(&vel_x);
    deallocate_host<double>(&vel_y);
    deallocate_host<double>(&vel_z);
    deallocate_host<char3>(&transform);
    deallocate_host<float>(&mass);
    deallocate_host<float>(&langevinParam);
    deallocate_host<float>(&langScalVelBBK2);
    deallocate_host<float>(&langScalRandBBK2);
    deallocate_host<int>(&hydrogenGroupSize);
    deallocate_host<int>(&atomFixed);
    deallocate_host<int>(&groupFixed);
    deallocate_host<double>(&fixedPosition_x);
    deallocate_host<double>(&fixedPosition_y);
    deallocate_host<double>(&fixedPosition_z);
    deallocate_host<float>(&rigidBondLength);

    coll_pos_x.deallocate();
    coll_pos_y.deallocate();
    coll_pos_z.deallocate();
    coll_f_normal_x.deallocate();
    coll_f_normal_y.deallocate();
    coll_f_normal_z.deallocate();
    coll_f_nbond_x.deallocate();
    coll_f_nbond_y.deallocate();
    coll_f_nbond_z.deallocate();
    coll_f_slow_x.deallocate();
    coll_f_slow_y.deallocate();
    coll_f_slow_z.deallocate();
    coll_charge.deallocate();
    coll_sortOrder.deallocate();
    coll_unsortOrder.deallocate();
    coll_partition.deallocate();

    deallocate_device<double>(&d_posNew_raw);
    deallocate_device<double>(&d_f_rawMC);
    deallocate_device<double>(&d_pos_rawMC);

    deallocate_host<int>(&id);
    deallocate_device<int>(&d_id);
    deallocate_device<int>(&d_idOrder);
    deallocate_device<int>(&d_moleculeAtom);
    deallocate_device<int>(&d_moleculeStartIndex);

    deallocate_host<int>(&idMig);
    deallocate_host<int>(&vdwType);
    coll_idMig.deallocate();
    coll_vdwType.deallocate();
    deallocate_device<FullAtom>(&d_atomdata_AoS);
    deallocate_device<int>(&d_migrationGroupSize);
    deallocate_device<int>(&d_migrationGroupIndex);
    deallocate_device<int>(&d_sortIndex);

    deallocate_device<double>(&d_f_global_x);
    deallocate_device<double>(&d_f_global_y);
    deallocate_device<double>(&d_f_global_z);

    coll_vel_x.deallocate();
    coll_vel_y.deallocate();
    coll_vel_z.deallocate();
    deallocate_device<double>(&d_recipMass);
    deallocate_device<char3>(&d_transform);
    deallocate_device<double>(&d_velNew_x);
    deallocate_device<double>(&d_velNew_y);
    deallocate_device<double>(&d_velNew_z);
    deallocate_device<double>(&d_posSave_x);
    deallocate_device<double>(&d_posSave_y);
    deallocate_device<double>(&d_posSave_z);
    deallocate_device<double>(&d_rcm_x);
    deallocate_device<double>(&d_rcm_y);
    deallocate_device<double>(&d_rcm_z);
    deallocate_device<double>(&d_vcm_x);
    deallocate_device<double>(&d_vcm_y);
    deallocate_device<double>(&d_vcm_z);
    deallocate_device<float>(&d_mass);
    deallocate_device<float>(&d_langevinParam);
    deallocate_device<float>(&d_langScalVelBBK2);
    deallocate_device<float>(&d_langScalRandBBK2);
    deallocate_device<float>(&d_gaussrand_x);
    deallocate_device<float>(&d_gaussrand_y);
    deallocate_device<float>(&d_gaussrand_z);
    deallocate_device<int>(&d_hydrogenGroupSize);
    deallocate_device<float>(&d_rigidBondLength);
    deallocate_device<int>(&d_atomFixed);
    deallocate_device<int>(&d_groupFixed);
    deallocate_device<double>(&d_fixedPosition_x);
    deallocate_device<double>(&d_fixedPosition_y);
    deallocate_device<double>(&d_fixedPosition_z);
    deallocate_device<double>(&d_f_saved_nbond_x);
    deallocate_device<double>(&d_f_saved_nbond_y);
    deallocate_device<double>(&d_f_saved_nbond_z);
    deallocate_device<double>(&d_f_saved_slow_x);
    deallocate_device<double>(&d_f_saved_slow_y);
    deallocate_device<double>(&d_f_saved_slow_z);
  }
}
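// Frees the buffers whose sizes depend only on the patch and device counts (allocated once in
// initialize()), including the collective buffers used by GPU-resident atom migration.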
void SequencerCUDA::deallocateStaticArrays() {
  deallocate_host<cudaTensor>(&extVirial);
  deallocate_host<double3>(&extForce);
  deallocate_host<double>(&extEnergy);
  deallocate_host<unsigned int>(&h_marginViolations);
  deallocate_host<unsigned int>(&h_periodicCellSmall);
  deallocate_host<double3>(&awayDists);
  deallocate_host<double3>(&patchMin);
  deallocate_host<double3>(&patchMax);
  deallocate_host<CudaAtom*>(&cudaAtomLists);
  deallocate_host<double3>(&patchCenter);
  deallocate_host<int>(&globalToLocalID);
  deallocate_host<int>(&patchToDeviceMap);
  deallocate_host<Lattice>(&pairlist_lattices);
  deallocate_host<double>(&patchMaxAtomMovement);
  deallocate_host<double>(&patchNewTolerance);
  deallocate_host<CudaMInfo>(&mInfo);
  deallocate_host<bool*>(&h_patchRecordHasForces);
  deallocate_host<cudaTensor>(&lpVirialNormal);
  deallocate_host<cudaTensor>(&lpVirialNbond);
  deallocate_host<cudaTensor>(&lpVirialSlow);

  deallocate_device<double3>(&d_awayDists);
  deallocate_device<double3>(&d_patchMin);
  deallocate_device<double3>(&d_patchMax);
  deallocate_device<int>(&d_globalToLocalID);
  deallocate_device<int>(&d_patchToDeviceMap);
  deallocate_device<Lattice>(&d_lattices);
  deallocate_device<Lattice>(&d_pairlist_lattices);
  deallocate_device<double>(&d_patchMaxAtomMovement);
  deallocate_device<double>(&d_patchNewTolerance);
  deallocate_device<CudaMInfo>(&d_mInfo);
  deallocate_device<int>(&d_killme);
  deallocate_device<char>(&d_barrierFlag);
  deallocate_device<unsigned int>(&d_tbcatomic);
  deallocate_device<BigReal>(&d_kineticEnergy);
  deallocate_device<BigReal>(&d_intKineticEnergy);
  deallocate_device<BigReal>(&d_momentum_x);
  deallocate_device<BigReal>(&d_momentum_y);
  deallocate_device<BigReal>(&d_momentum_z);
  deallocate_device<BigReal>(&d_angularMomentum_x);
  deallocate_device<BigReal>(&d_angularMomentum_y);
  deallocate_device<BigReal>(&d_angularMomentum_z);
  deallocate_device<cudaTensor>(&d_virial);
  deallocate_device<cudaTensor>(&d_intVirialNormal);
  deallocate_device<cudaTensor>(&d_intVirialNbond);
  deallocate_device<cudaTensor>(&d_intVirialSlow);
  deallocate_device<cudaTensor>(&d_lpVirialNormal);
  deallocate_device<cudaTensor>(&d_lpVirialNbond);
  deallocate_device<cudaTensor>(&d_lpVirialSlow);
  deallocate_device<cudaTensor>(&d_rigidVirial);
  deallocate_device<cudaTensor>(&d_extVirial);
  deallocate_device<double3>(&d_extForce);
  deallocate_device<double>(&d_extEnergy);
  deallocate_device<SettleParameters>(&sp);
  deallocate_device<unsigned int>(&deviceQueue);
  deallocate_device<CudaLocalRecord*>(&d_peer_record);
  deallocate_device<bool*>(&d_patchRecordHasForces);

  coll_atomdata_AoS_in.deallocate();
  coll_sortSoluteIndex.deallocate();
  coll_migrationDestination.deallocate();
  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
}
void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex) {
  if (!p->patchMapRead) p->readPatchMap();
  for (int x = 0; x < 3; x++) {
    for (int y = 0; y < 3; y++) {
      for (int z = 0; z < 3; z++) {
      }
    }
  }
}

void SequencerCUDA::assembleOrderedPatchList() {
  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
    HomePatch *p = patchData->devData[deviceIndex].patches[i];
    patchList.push_back(p);
  }

  patchListHomeAndProxy.clear();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
    const int patchID = localPatches[i].patchID;
    for (int d = 0; d < CkNumPes(); d++) {
      patchListHomeAndProxy.push_back(patch);
    }
  }
}
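// Packs the SOA positions and velocities back into the AoS FullAtom records on the device and
// copies each home patch's atoms back to its HomePatch on the host.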
void SequencerCUDA::copyAoSDataToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;

  CUDAMigrationKernel->update_AoS(
      patchData->devData[deviceIndex].d_localPatches,
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr());

  for (int i = 0; i < integrationPatches.size(); i++) {
    const int numAtoms = localPatches[i].numAtoms;
    const int offset = localPatches[i].bufferOffset;
    HomePatch *patch = integrationPatches[i];
    copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
  }
  cudaCheck(cudaStreamSynchronize(stream));
}
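// First stage of GPU-resident atom migration: rebuild migration group indices, refresh the AoS
// copy of the atoms, and compute each group's destination patch before any data moves.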
void SequencerCUDA::migrationLocalInit() {
  CUDAMigrationKernel->computeMigrationGroupIndex(
      patchData->devData[deviceIndex].d_localPatches,
      d_migrationGroupSize,
      d_migrationGroupIndex);

  CUDAMigrationKernel->update_AoS(
      patchData->devData[deviceIndex].d_localPatches,
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr());

  CUDAMigrationKernel->computeMigrationDestination(
      patchData->devData[deviceIndex].d_localPatches,
      d_hydrogenGroupSize,
      d_migrationGroupSize,
      d_migrationGroupIndex,
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_migrationDestination.getDevicePtr());

  CUDAMigrationKernel->performLocalMigration(
      patchData->devData[deviceIndex].d_localPatches,
      (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
      coll_migrationDestination.getDevicePtr());

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationPerform() {
  CUDAMigrationKernel->performMigration(
      patchData->devData[deviceIndex].d_localPatches,
      coll_atomdata_AoS_in.getDevicePeerPtr(),
      d_migrationGroupSize,
      d_migrationGroupIndex,
      coll_migrationDestination.getDevicePtr());

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationUpdateAtomCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->updateLocalRecords(
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches);

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateAtomOffsets() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->updateLocalRecordsOffset(
      numPatchesHomeAndProxy,
      patchData->devData[deviceIndex].d_localPatches);

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateRemoteOffsets() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->updatePeerRecords(
      numPatchesHomeAndProxy,
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches);

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateProxyDestination() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->copyMigrationDestinationToProxies(
      numPatchesHomeAndProxy,
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      coll_migrationDestination.getDevicePeerPtr());
}
void SequencerCUDA::copyPatchDataToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
  cudaCheck(cudaStreamSynchronize(stream));

  for (int i = 0; i < numPatchesHome; i++) {
  }
  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::copyAtomDataToDeviceAoS() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;

  for (int i = 0; i < integrationPatches.size(); i++) {
    const int numAtoms = localPatches[i].numAtoms;
    if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
      iout << iERROR << "The number of atoms in patch " << i << " is "
           << numAtoms << ", greater than the limit for GPU atom migration ("
           << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
      NAMD_bug("NAMD has stopped simulating due to the error above, "
               "but you could disable GPUAtomMigration and try again.\n");
    }
    const int offset = localPatches[i].bufferOffset;
    HomePatch *patch = integrationPatches[i];
    copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), coll_atomdata_AoS_in.getDevicePtr() + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
  }
  cudaCheck(cudaStreamSynchronize(stream));
}
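// Aggregates per-patch PatchDataSOA fields into contiguous host arrays and uploads them with the
// AGGREGATE_* macros; the switch on maxForceNumber uploads the slow, nonbonded, and normal force
// buffers that are needed at the requested force level.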
void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);

  switch (maxForceNumber) {
    case 2:
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_x, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_y, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_z, double, stream);
    case 1:
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_x, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_y, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_z, double, stream);
    case 0:
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_x, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_y, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_z, double, stream);
  }

  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_z, double, stream);
  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_z, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(charge, float, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(partition, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(sortOrder, int, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(unsortOrder, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);

  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
}
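// Post-migration fixup on this device: wrap migrated positions into their new patches, sort
// solvent atoms, convert the AoS atom data back into SOA buffers, and refresh derived data
// (atom index order, saved positions, centers of mass).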
void SequencerCUDA::migrationLocalPost(int startup) {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->transformMigratedPositions(
      patchData->devData[deviceIndex].d_localPatches,
      coll_patchCenter.getDevicePtr(),
      (FullAtom*) coll_atomdata_AoS_in.getDevicePtr());

  CUDAMigrationKernel->sortSolventAtoms(
      patchData->devData[deviceIndex].d_localPatches,
      (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
      coll_sortSoluteIndex.getDevicePtr());

  double tempFactor = 1.0;
  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);

  CUDAMigrationKernel->copy_AoS_to_SoA(
      simParams->langevinOn, dt, kbT, tempFactor,
      patchData->devData[deviceIndex].d_localPatches,
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_mass, coll_charge.getDevicePtr(),
      coll_idMig.getDevicePtr(), coll_vdwType.getDevicePtr(),
      d_hydrogenGroupSize, d_migrationGroupSize,
      coll_partition.getDevicePtr());

  CUDASequencerKernel->SetAtomIndexOrder(coll_idMig.getDevicePtr(), d_idOrder, numAtomsHome, stream);

  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posSave_x, numAtomsHome, stream);
  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posSave_y, numAtomsHome, stream);
  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posSave_z, numAtomsHome, stream);

  myLatticeOld = myLattice;

  CUDASequencerKernel->centerOfMass(
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_rcm_x, d_rcm_y, d_rcm_z,
      d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
  CUDASequencerKernel->centerOfMass(
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      d_vcm_x, d_vcm_y, d_vcm_z,
      d_mass, d_hydrogenGroupSize, numAtomsHome, stream);

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
  int offset = 0;
  for (int i = 0; i < numPatchesHome; i++) {
    PatchDataSOA& current = patchList[i]->patchDataSOA;
    const int numPatchAtoms = current.numAtoms;
    for (int j = 0; j < numPatchAtoms; j++) {
    }
    offset += numPatchAtoms;
  }
  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);

  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);
  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
}

void SequencerCUDA::migrationUpdateDestination() {
  CUDAMigrationKernel->updateMigrationDestination(
      coll_migrationDestination.getDevicePtr(),
      coll_sortSoluteIndex.getDevicePeerPtr());
}
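// Rebuilds the per-device patch records after migration: uploads the CudaLocalRecord and
// CudaPeerRecord arrays, refreshes patch geometry and mappings, and resizes the per-atom arrays
// from the new home and proxy atom counts.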
bool SequencerCUDA::copyPatchData()
{
  bool reallocated = false;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;

  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  numPatchesHome = homePatches.size();
  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;

  coll_patchCenter.allocate_no_check(defaultCollectiveBufferType, numPatchesGlobal);

#if defined(NAMD_HIP)
  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
      numPatchesHomeAndProxy * sizeof(CudaLocalRecord),
      hipDeviceMallocFinegrained);
  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
      peerPatches.size() * sizeof(CudaPeerRecord),
      hipDeviceMallocFinegrained);
#else
  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
#endif

  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);

  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
      numPatchesHomeAndProxy, stream);
  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
      peerPatches.size(), stream);

  if (true || mGpuOn) {
    this->assembleOrderedPatchList();
  }
  this->copySettleParameter();

  for (int i = 0; i < numPatchesHome; i++) {
    HomePatch *patch = homePatches[i];
    this->copyMigrationInfo(patch, i);
    patchToDeviceMap[patch->getPatchID()] = deviceIndex;
  }
  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);

  for (int i = 0; i < nDevices; i++) {
    if (i == deviceIndex) continue;
    std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
    for (int j = 0; j < otherPatches.size(); j++) {
    }
  }

  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;

  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);

  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
    HomePatch *patch = patchListHomeAndProxy[i];
    awayDists[i].x = patch->aAwayDist;
    awayDists[i].y = patch->bAwayDist;
    awayDists[i].z = patch->cAwayDist;
  }

  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchCenter, coll_patchCenter.getDevicePtr(), numPatchesHomeAndProxy, stream);

  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
      localPatches[numPatchesHomeAndProxy-1].numAtoms;
  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
      localPatches[numPatchesHome-1].numAtoms;

  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);

  numAtomsHomePrev = numAtomsHome;
  numAtomsHomeAndProxy = totalAtomCount;
  numAtomsHome = homeAtomCount;
  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;

  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
      numPatchesHomeAndProxy, stream);
  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
      peerPatches.size(), stream);

  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);

  return reallocated;
}
void SequencerCUDA::copyDataToPeers(const bool copyIn)
{
  if (!copyIn) return;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->copyDataToProxies(
      numPatchesHomeAndProxy,
      patchData->devData[deviceIndex].d_localPatches,
      coll_idMig.getDevicePeerPtr(),
      coll_vdwType.getDevicePeerPtr(),
      coll_sortOrder.getDevicePeerPtr(),
      coll_unsortOrder.getDevicePeerPtr(),
      coll_charge.getDevicePeerPtr(),
      coll_partition.getDevicePeerPtr(),
      coll_patchCenter.getDevicePeerPtr());

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationSortAtomsNonbonded() {
  CUDAMigrationKernel->sortAtoms(
      numPatchesHome, numAtomsHome,
      patchData->devData[deviceIndex].d_localPatches,
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_sortOrder.getDevicePtr(),
      coll_unsortOrder.getDevicePtr());
}

void SequencerCUDA::maximumMove(
    const double maxvel2,
    const int numAtoms)
{
  CUDASequencerKernel->maximumMove(
      maxvel2, coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      killme, numAtoms, stream);
}
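// Half-step kinetic energy and virial reduction; results land in the pinned host scalars
// (kineticEnergy_half, virial_half, ...) once the kernel's device-wide reduction completes.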
void SequencerCUDA::submitHalf(
    int numAtoms, int part, const bool doEnergy)
{
  Tensor reduction_intVirialNormal;

  cudaCheck(cudaEventRecord(eventStart, stream));

  CUDASequencerKernel->submitHalf(
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
      d_kineticEnergy, d_intKineticEnergy,
      d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
      virial_half, intVirialNormal_half,
      d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);

  cudaCheck(cudaEventRecord(eventStop, stream));
  cudaCheck(cudaEventSynchronize(eventStop));
  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
  t_submitReductions2 = 0;
}
void SequencerCUDA::submitReductions(
    int marginViolations,
    int numAtomsReduction)
{
  CUDASequencerKernel->submitReduction1(
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_mass,
      d_momentum_x, d_momentum_y, d_momentum_z,
      d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
      origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
      momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
      numAtomsReduction, stream);

  Tensor regintVirialNormal;
  Tensor regintVirialNbond;

  cudaCheck(cudaEventRecord(eventStart, stream));

  CUDASequencerKernel->submitReduction2(
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      d_mass, d_hydrogenGroupSize,
      d_kineticEnergy, kineticEnergy,
      d_intKineticEnergy, intKineticEnergy,
      d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
      intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
      d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);

  CUDASequencerKernel->calcFixVirial(
      maxForceNumber, numAtomsReduction, d_atomFixed,
      d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
      d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);

  cudaCheck(cudaEventRecord(eventStop, stream));
  cudaCheck(cudaEventSynchronize(eventStop));
  cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
  fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
  t_submitReductions2 = 0;
}
void SequencerCUDA::copySettleParameter() {
  SettleParameters h_sp;
  HomePatch *patch = NULL;

  for (int i = 0; i < patchList.size(); i++) {
    if (patchList[i]->settle_initialized) {
      patch = patchList[i];
    }
  }

  h_sp.mO = patch->settle_mO;
  h_sp.mH = patch->settle_mH;
  h_sp.mOrmT = patch->settle_mOrmT;
  h_sp.mHrmT = patch->settle_mHrmT;
  h_sp.rra = patch->settle_rra;
  h_sp.ra = patch->settle_ra;
  h_sp.rb = patch->settle_rb;
  h_sp.rc = patch->settle_rc;
  h_sp.r_om = patch->r_om;
  h_sp.r_ohc = patch->r_ohc;

  copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
}
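// startRun1/2/3 implement the first-timestep sequence: rattle the initial coordinates,
// accumulate the freshly computed forces into the SOA buffers, then apply the half-kick and
// constraint correction before the main integration loop takes over.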
void SequencerCUDA::startRun1(const Lattice &lattice)
{
  myLattice = lattice;

  CUDASequencerKernel->rattle1(
      numAtomsHome, 0.f, 0.f,
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_velNew_x, d_velNew_y, d_velNew_z,
      d_posNew_x, d_posNew_y, d_posNew_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
      &settleList, settleListSize, &d_consFailure,
      d_consFailureSize, &rattleList, rattleListSize,
      d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
      buildRigidLists, consFailure, simParams->watmodel, stream);

  this->copyPositionsAndVelocitiesToHost(1, 0);
  printSOAPositionsAndVelocities();
}
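// Zeroes the nonbonded and slow force buffers, gathers the bonded, nonbonded, and slow force
// contributions into the collective SOA force arrays, then updates the SMD and group-restraint
// centers of mass needed for multi-GPU runs.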
void SequencerCUDA::startRun2(/* ... */)
{
  // ...
  cudaCheck(cudaMemset(coll_f_nbond_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
  cudaCheck(cudaMemset(coll_f_nbond_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
  cudaCheck(cudaMemset(coll_f_nbond_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
  // ...
  cudaCheck(cudaMemset(coll_f_slow_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
  cudaCheck(cudaMemset(coll_f_slow_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
  cudaCheck(cudaMemset(coll_f_slow_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
  // ...
  CUDASequencerKernel->accumulateForceToSOA(
      /* ... */
      numPatchesHomeAndProxy,
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].f_bond,
      patchData->devData[deviceIndex].f_bond_nbond,
      patchData->devData[deviceIndex].f_bond_slow,
      patchData->devData[deviceIndex].forceStride,
      patchData->devData[deviceIndex].f_nbond,
      patchData->devData[deviceIndex].f_nbond_slow,
      patchData->devData[deviceIndex].f_slow,
      /* ... */
      coll_f_normal_x.getDevicePtr(),
      coll_f_normal_y.getDevicePtr(),
      coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(),
      coll_f_nbond_y.getDevicePtr(),
      coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(),
      coll_f_slow_y.getDevicePtr(),
      coll_f_slow_z.getDevicePtr(),
      coll_unsortOrder.getDevicePtr(),
      /* ... */
      patchData->d_queues,
      patchData->d_queueCounters,
      /* ... */);
  // ...
  SMDKernel->computeCOMMGpu(myLattice, d_mass,
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_transform, stream);
  // ...
  if (groupRestraintsKernel) {
    // ...
    groupRestraintsKernel->doCOM_mgpu(myLattice, d_transform,
        d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        /* ... */);
  }
  // ...
  printSOAPositionsAndVelocities();
}
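// Third stage of the startup step: merges forces from peer devices, performs
// the initial half-kick / constraint / center-of-mass sequence, submits the
// startup reductions, and optionally saves forces for global-force clients.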
void SequencerCUDA::startRun3(
    /* ... */
    const bool requestGlobalForces,
    int doGlobalMasterStaleForces,
    const bool requestForcesOutput,
    const bool requestGlobalForcesGPU,
    /* ... */)
{
  const bool doFixed = simParams->fixedAtomsOn;
  // ...
  std::vector<int> atom_counts;
  for (/* ... */) {
    atom_counts.push_back(patchData->devData[i].numAtomsHome);
  }
  // ...
  CUDASequencerKernel->mergeForcesFromPeers(
      /* ... */
      numPatchesHomeAndProxy,
      /* ... */
      this->coll_f_normal_x.getDevicePeerPtr(),
      this->coll_f_normal_y.getDevicePeerPtr(),
      this->coll_f_normal_z.getDevicePeerPtr(),
      this->coll_f_nbond_x.getDevicePeerPtr(),
      this->coll_f_nbond_y.getDevicePeerPtr(),
      this->coll_f_nbond_z.getDevicePeerPtr(),
      this->coll_f_slow_x.getDevicePeerPtr(),
      this->coll_f_slow_y.getDevicePeerPtr(),
      this->coll_f_slow_z.getDevicePeerPtr(),
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      /* ... */);
  // ...
  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum,
      deviceCUDA->getNcclComm(), stream);
  // ...
  if (doGlobalMasterStaleForces) {
    // ...
    memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
    memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
    computeGlobalMasterVirial(
        numPatchesHomeAndProxy,
        /* ... */
        patchData->devData[deviceIndex].d_localPatches,
        coll_pos_x.getDevicePtr(),
        coll_pos_y.getDevicePtr(),
        coll_pos_z.getDevicePtr(),
        /* ... */
        &d_extForce[EXT_GLOBALMTS],
        &extForce[EXT_GLOBALMTS],
        &d_extVirial[EXT_GLOBALMTS],
        &extVirial[EXT_GLOBALMTS],
        /* ... */);
  }
  // ...
  calculateExternalForces(simParams->firstTimestep, maxForceNumber, 1, 1);
  // ...
  if (true || deviceID == 0) {
    // ...
    snprintf(prefix, 10, "step-%d", 0);
    this->printSOAForces(prefix);
  }
  // ...
  CUDASequencerKernel->addForceToMomentum(
      doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
      /* ... */
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
      numAtomsHome, maxForceNumber, stream);
  // ...
  CUDASequencerKernel->rattle1(
      /* ... */
      numAtomsHome, -dt_normal, -1.0/(dt_normal),
      /* ... */
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_velNew_x, d_velNew_y, d_velNew_z,
      d_posNew_x, d_posNew_y, d_posNew_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
      &settleList, settleListSize, &d_consFailure,
      d_consFailureSize, &rattleList, rattleListSize,
      /* ... */
      d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
      true, consFailure, simParams->watmodel, stream);
  // ...
  CUDASequencerKernel->centerOfMass(
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
      d_hydrogenGroupSize, numAtomsHome, stream);
  // ...
  submitHalf(numAtomsHome, 1, 1);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  // ...
  Tensor reduction_intVirialNormal;
  // ...
  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
  reduction_virial *= 0.5;
  // ...
  /* ... */ += (intKineticEnergy_half[0] * 0.25);
  reduction_intVirialNormal *= 0.5;
  /* ... */
      reduction_intVirialNormal);
  // ...
  CUDASequencerKernel->addForceToMomentum(
      doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
      /* ... */
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
      numAtomsHome, maxForceNumber, stream);
  // ...
  CUDASequencerKernel->rattle1(
      /* ... */
      numAtomsHome, dt_normal, 1.0/dt_normal,
      /* ... */
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_velNew_x, d_velNew_y, d_velNew_z,
      d_posNew_x, d_posNew_y, d_posNew_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
      &settleList, settleListSize, &d_consFailure,
      d_consFailureSize, &rattleList, rattleListSize,
      /* ... */
      d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
      buildRigidLists, consFailure, simParams->watmodel, stream);
  // ...
  CUDASequencerKernel->centerOfMass(
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
      d_hydrogenGroupSize, numAtomsHome, stream);
  // ...
  submitHalf(numAtomsHome, 1, 1);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  // ...
  Tensor reduction_intVirialNormal;
  // ...
  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
  reduction_virial *= 0.5;
  // ...
  /* ... */ += (intKineticEnergy_half[0] * 0.25);
  reduction_intVirialNormal *= 0.5;
  /* ... */
      reduction_intVirialNormal);
  // ...
  CUDASequencerKernel->addForceToMomentum(
      doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
      /* ... */
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
      numAtomsHome, maxForceNumber, stream);
  if (requestGlobalForces || requestForcesOutput) {
    // ...
    saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
  }
  // ...
  if (requestGlobalForcesGPU) {
    if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
    CUDASequencerKernel->copyForcesToDevice(
        numAtomsHome, coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
        d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
        d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
  }
  // ...
  CUDASequencerKernel->centerOfMass(
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
      d_hydrogenGroupSize, numAtomsHome, stream);
  // ...
  submitReductions(origin.x, origin.y, origin.z,
      marginViolations, 1,
      /* ... */
      numAtomsHome, maxForceNumber);
  // ...
  copyPositionsAndVelocitiesToHost(1, 0);
  // ...
  NAMD_die("constraint failure during CUDA rattle!\n");
  // ...
  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  if (doGlobalMasterStaleForces) {
    ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
    ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
  }
  // ...
  Tensor reduction_rigidVirial;
  // ...
  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
  // ...
  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
  // ...
  Vector angularMomentum(*angularMomentum_x,
      /* ... */
      *angularMomentum_z);
  // ...
  Tensor regintVirialNormal;
  Tensor regintVirialNbond;
  // ...
  if (maxForceNumber >= 1) {
    // ...
  }
  if (maxForceNumber >= 2) {
    // ...
  }
  // ...
  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
  double3 fixForceNormal, fixForceNbond, fixForceSlow;
  switch (maxForceNumber) {
    // ...
      copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
      copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
      // ...
      cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
    // ...
      copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
      copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
      // ...
      cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
    // ...
      copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
      copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
      // ...
      cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
    // ...
  }
  // ...
  auto printTensor = [](const cudaTensor& t, const std::string& name) {
    CkPrintf("%s", name.c_str());
    CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
             "%12.5lf %12.5lf %12.5lf\n"
             "%12.5lf %12.5lf %12.5lf\n",
             /* ... */);
  };
  // ...
  printTensor(fixVirialNormal, "fixVirialNormal = ");
  printTensor(fixVirialNbond,  "fixVirialNbond = ");
  printTensor(fixVirialSlow,   "fixVirialSlow = ");
  // ...
  this->printSOAForces(NULL);
  // ...
  printSOAPositionsAndVelocities();
}
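// Monte Carlo barostat: a rejected volume move restores the forces and
// positions that were saved before the trial scaling in monteCarloPressure_part1.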
void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
{
  // ...
  myLattice = lattice;
  // ...
  copy_DtoD<double>(d_f_normalMC_x, coll_f_normal_x.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_normalMC_y, coll_f_normal_y.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_normalMC_z, coll_f_normal_z.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_nbondMC_x, coll_f_nbond_x.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_nbondMC_y, coll_f_nbond_y.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_nbondMC_z, coll_f_nbond_z.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_slowMC_x, coll_f_slow_x.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_slowMC_y, coll_f_slow_y.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_f_slowMC_z, coll_f_slow_z.getDevicePtr(), numAtomsHome, stream);
#ifdef NAMD_NCCL_ALLREDUCE
  // ...
  copy_DtoD<double>(d_posMC_x, d_posNew_x, numAtomsHome, stream);
  copy_DtoD<double>(d_posMC_y, d_posNew_y, numAtomsHome, stream);
  copy_DtoD<double>(d_posMC_z, d_posNew_z, numAtomsHome, stream);
  // ...
  copy_DtoD<double>(d_posMC_x, coll_pos_x.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_posMC_y, coll_pos_y.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_posMC_z, coll_pos_z.getDevicePtr(), numAtomsHome, stream);
  // ...
  copy_DtoD<double>(d_posMC_x, coll_pos_x.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_posMC_y, coll_pos_y.getDevicePtr(), numAtomsHome, stream);
  copy_DtoD<double>(d_posMC_z, coll_pos_z.getDevicePtr(), numAtomsHome, stream);
  // ...
}
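// Monte Carlo barostat: an accepted volume move recomputes the group centers of
// mass for the new coordinates, resubmits the virial terms, and updates
// myLatticeOld to the accepted lattice.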
void SequencerCUDA::monteCarloPressure_accept(const int doMigration)
{
  // ...
  CUDASequencerKernel->centerOfMass(
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
      d_hydrogenGroupSize, numAtomsHome, stream);
  // ...
  Tensor reduction_intVirialNormal;
  // ...
  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
  reduction_virial *= 0.5;
  reduction_intVirialNormal *= 0.5;
  /* ... */
      reduction_intVirialNormal);
  // ...
  myLatticeOld = myLattice;
}
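// Monte Carlo barostat, part 1: saves the current forces and positions, scales
// the coordinates to the trial lattice (using molecular geometric centers), and
// scatters the scaled positions into the compute staging buffers.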
void SequencerCUDA::monteCarloPressure_part1(/* ... */)
{
  // ...
  copy_DtoD<double>(coll_f_normal_x.getDevicePtr(), d_f_normalMC_x, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_normal_y.getDevicePtr(), d_f_normalMC_y, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_normal_z.getDevicePtr(), d_f_normalMC_z, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_nbond_x.getDevicePtr(), d_f_nbondMC_x, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_nbond_y.getDevicePtr(), d_f_nbondMC_y, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_nbond_z.getDevicePtr(), d_f_nbondMC_z, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_slow_x.getDevicePtr(), d_f_slowMC_x, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_slow_y.getDevicePtr(), d_f_slowMC_y, numAtomsHome, stream);
  copy_DtoD<double>(coll_f_slow_z.getDevicePtr(), d_f_slowMC_z, numAtomsHome, stream);
#ifdef NAMD_NCCL_ALLREDUCE
  // ...
  copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
  copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
  copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
  // ...
  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posMC_x, numAtomsHome, stream);
  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posMC_y, numAtomsHome, stream);
  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posMC_z, numAtomsHome, stream);
  // ...
  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posMC_x, numAtomsHome, stream);
  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posMC_y, numAtomsHome, stream);
  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posMC_z, numAtomsHome, stream);
  // ...
  Lattice newLattice = oldLattice;
  // ...
  CUDASequencerKernel->scaleCoordinateUsingGC(
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_idOrder, d_moleculeStartIndex,
      d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
      /* ... */);
  // ...
  myLattice = newLattice;
  // ...
  bool doNbond = patchData->flags.doNonbonded;
  bool doSlow = patchData->flags.doFullElectrostatics;
  // ...
  bool doAlchDecouple = false;
  bool doAlchSoftCore = false;
  // ...
  if (simParams->alchThermIntOn) doTI = true;
  if (simParams->alchDecouple) doAlchDecouple = true;
  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
  // ...
  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
  // ...
  bool usePatchPme = false;
  // ...
      getNumPatchesHome(),
  // ...
  std::vector<int> atom_counts;
  for (/* ... */) {
    atom_counts.push_back(patchData->devData[i].numAtomsHome);
  }
  // ...
  CUDASequencerKernel->set_compute_positions(
      /* ... */
      numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
      doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
#ifdef NAMD_NCCL_ALLREDUCE
      (mGpuOn) ? d_posNew_x : coll_pos_x.getDevicePtr(),
      (mGpuOn) ? d_posNew_y : coll_pos_y.getDevicePtr(),
      (mGpuOn) ? d_posNew_z : coll_pos_z.getDevicePtr(),
      // ...
      coll_pos_x.getDevicePtr(),
      coll_pos_y.getDevicePtr(),
      coll_pos_z.getDevicePtr(),
      coll_pos_x.getDevicePeerPtr(),
      coll_pos_y.getDevicePeerPtr(),
      coll_pos_z.getDevicePeerPtr(),
      coll_charge.getDevicePeerPtr(),
      coll_partition.getDevicePeerPtr(),
      // ...
      coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
      coll_patchCenter.getDevicePtr(),
      patchData->devData[deviceIndex].slow_patchPositions,
      patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
      coll_sortOrder.getDevicePtr(), newLattice,
      (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
      (float4*) patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      /* ... */);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
}
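// Monte Carlo barostat, part 2: gathers the forces recomputed for the trial
// volume into the SOA buffers, merges them across devices, and evaluates the
// external forces for the trial configuration.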
void SequencerCUDA::monteCarloPressure_part2(
    /* ... */
    const bool doEnergy,
    const bool doGlobal,
    const bool doVirial)
{
  // ...
#ifdef NAMD_NCCL_ALLREDUCE
  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
  // ...
  CUDASequencerKernel->accumulateForceToSOA(
      /* ... */
      numPatchesHomeAndProxy,
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].f_bond,
      patchData->devData[deviceIndex].f_bond_nbond,
      patchData->devData[deviceIndex].f_bond_slow,
      patchData->devData[deviceIndex].forceStride,
      patchData->devData[deviceIndex].f_nbond,
      patchData->devData[deviceIndex].f_nbond_slow,
      patchData->devData[deviceIndex].f_slow,
      /* ... */
      coll_f_normal_x.getDevicePtr(),
      coll_f_normal_y.getDevicePtr(),
      coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(),
      coll_f_nbond_y.getDevicePtr(),
      coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(),
      coll_f_slow_y.getDevicePtr(),
      coll_f_slow_z.getDevicePtr(),
      coll_unsortOrder.getDevicePtr(),
      /* ... */
      patchData->d_queues,
      patchData->d_queueCounters,
      /* ... */);
  // ...
#ifndef NAMD_NCCL_ALLREDUCE
  // ...
  std::vector<int> atom_counts;
  for (/* ... */) {
    atom_counts.push_back(patchData->devData[i].numAtomsHome);
  }
  // ...
  CUDASequencerKernel->mergeForcesFromPeers(
      /* ... */
      numPatchesHomeAndProxy,
      /* ... */
      this->coll_f_normal_x.getDevicePeerPtr(),
      this->coll_f_normal_y.getDevicePeerPtr(),
      this->coll_f_normal_z.getDevicePeerPtr(),
      this->coll_f_nbond_x.getDevicePeerPtr(),
      this->coll_f_nbond_y.getDevicePeerPtr(),
      this->coll_f_nbond_z.getDevicePeerPtr(),
      this->coll_f_slow_x.getDevicePeerPtr(),
      this->coll_f_slow_y.getDevicePeerPtr(),
      this->coll_f_slow_z.getDevicePeerPtr(),
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      /* ... */);
  // ...
  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum,
      deviceCUDA->getNcclComm(), stream);
  // ...
  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
  // ...
  if (true || deviceID == 0) {
    // ...
    snprintf(prefix, 10, "step-%d", step);
    this->printSOAForces(prefix);
  }
}
void SequencerCUDA::setRescalePairlistTolerance(const bool val) {
  rescalePairlistTolerance = val;
}
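// First phase of a regular timestep: optional velocity reassignment and
// Berendsen coordinate scaling, the first half-kick and drift (split around the
// Langevin piston update when that barostat is active), group center-of-mass
// updates, and the pairlist margin check.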
void SequencerCUDA::launch_part1(
    /* ... */
    double velrescaling,
    const double maxvel2,
    /* ... */
    int reassignVelocitiesStep,
    int langevinPistonStep,
    int berendsenPressureStep,
    /* ... */
    const int savePairlists,
    const int usePairlists,
    const bool doEnergy)
{
  // ...
  this->maxvel2 = maxvel2;
  // ...
  const bool doFixed = simParams->fixedAtomsOn;
  // ...
  myLattice = lattice;
  if (reassignVelocitiesStep) {
    // ...
    const int reassignFreq = simParams->reassignFreq;
    // ...
    newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
    // ...
    if ( newTemp < simParams->reassignHold ) {
      // ...
    }
    CUDASequencerKernel->reassignVelocities(
        dt_normal, simParams->fixedAtomsOn, d_atomFixed,
        d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
        /* ... */
        numAtomsHome, numAtomsHome, 0,
        /* ... */);
  }
  // ...
  if (berendsenPressureStep) {
    // ...
    CUDASequencerKernel->scaleCoordinateWithFactor(
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_mass, d_hydrogenGroupSize,
        cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
  }
  // ...
  if (!langevinPistonStep) {
    // ...
    CUDASequencerKernel->velocityVerlet1(
        doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
        dt_slow, velrescaling, d_recipMass,
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), maxvel2, killme,
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        pos_x, pos_y, pos_z,
        coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
        coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
        d_atomFixed, numAtomsHome, maxForceNumber, stream);
  } else {
    // ...
    CUDASequencerKernel->addForceToMomentum(
        doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
        /* ... */
        coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
        coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
        numAtomsHome, maxForceNumber, stream);
    // ...
    maximumMove(maxvel2, numAtomsHome);
    // ...
    CUDASequencerKernel->addVelocityToPosition(
        simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        pos_x, pos_y, pos_z, numAtomsHome, false, stream);
    CUDASequencerKernel->langevinPiston(
        /* ... */
        d_groupFixed, d_transform, lattice,
        d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
        d_mass, d_hydrogenGroupSize,
        cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
        simParams->useGroupPressure, numAtomsHome, stream);
    CUDASequencerKernel->addVelocityToPosition(
        simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        pos_x, pos_y, pos_z, numAtomsHome, false, stream);
  }
  // ...
  if (mGpuOn && SMDKernel) {
    // ...
    SMDKernel->computeCOMMGpu(lattice, d_mass,
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        d_transform, stream);
  }
  // ...
  if (mGpuOn && groupRestraintsKernel) {
    // ...
    groupRestraintsKernel->doCOM_mgpu(lattice, d_transform,
        d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        /* ... */);
  }
  // ...
  if ( (doEnergy || doVirial) ) {
    CUDASequencerKernel->centerOfMass(
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
        d_hydrogenGroupSize, numAtomsHome, stream);
    CUDASequencerKernel->centerOfMass(
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
        d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
        d_hydrogenGroupSize, numAtomsHome, stream);
  }
  // ...
  bool doNbond = patchData->flags.doNonbonded;
  bool doSlow = patchData->flags.doFullElectrostatics;
  // ...
  bool doAlchDecouple = false;
  bool doAlchSoftCore = false;
  // ...
  if (simParams->alchThermIntOn) doTI = true;
  if (simParams->alchDecouple) doAlchDecouple = true;
  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
  // ...
  if ( ! savePairlists ) {
    // ...
    double sysdima = lattice.a_r().unit() * lattice.a();
    double sysdimb = lattice.b_r().unit() * lattice.b();
    double sysdimc = lattice.c_r().unit() * lattice.c();
    // ...
    CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
        patchData->devData[deviceIndex].d_localPatches,
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        d_posSave_x, d_posSave_y, d_posSave_z,
        /* ... */
        myLattice, myLatticeOld,
        d_patchMin, d_patchMax, coll_patchCenter.getDevicePtr(),
        /* ... */
        d_tbcatomic, simParams->pairlistTrigger,
        /* ... */
        d_patchMaxAtomMovement, patchMaxAtomMovement,
        d_patchNewTolerance, patchNewTolerance,
        minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
        /* ... */
        h_periodicCellSmall,
        rescalePairlistTolerance,
        isPeriodic, stream);
    rescalePairlistTolerance = false;
  } else {
    // ...
    rescalePairlistTolerance = true;
  }
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
}
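// Continuation of the first phase: submits the half-step kinetic and virial
// reductions, updates the patch flags, and runs the host-side finish_part1.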
void SequencerCUDA::launch_part11(
    /* ... */
    double velrescaling,
    const double maxvel2,
    /* ... */
    int langevinPistonStep,
    /* ... */
    const int savePairlists,
    const int usePairlists,
    const bool doEnergy)
{
  // ...
  const bool doVirial = simParams->langevinPistonOn;
  // ...
  bool doNbond = patchData->flags.doNonbonded;
  bool doSlow = patchData->flags.doFullElectrostatics;
  // ...
  bool doAlchDecouple = false;
  bool doAlchSoftCore = false;
  // ...
  if (simParams->alchThermIntOn) doTI = true;
  if (simParams->alchDecouple) doAlchDecouple = true;
  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
  // ...
  submitHalf(numAtomsHome, 1, doEnergy || doVirial);
  // ...
  this->update_patch_flags();
  // ...
  finish_part1(copyIn, patchList[0]->flags.savePairlists,
      patchList[0]->flags.usePairlists);
}
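// Repositions lone pairs and scatters the current positions, charges, and
// partitions into the staging buffers used by the nonbonded, bonded, and PME
// force computes.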
void SequencerCUDA::launch_set_compute_positions() {
  // ...
  bool doNbond = patchData->flags.doNonbonded;
  bool doSlow = patchData->flags.doFullElectrostatics;
  // ...
  bool doAlchDecouple = false;
  bool doAlchSoftCore = false;
  // ...
  if (simParams->alchThermIntOn) doTI = true;
  if (simParams->alchDecouple) doAlchDecouple = true;
  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
  // ...
  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
  // ...
  bool usePatchPme = false;
  // ...
      getNumPatchesHome(),
  // ...
  std::vector<int> atom_counts;
  for (/* ... */) {
    atom_counts.push_back(patchData->devData[i].numAtomsHome);
  }
  // ...
  CUDASequencerKernel->set_compute_positions(
      /* ... */
      numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
      doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
#ifdef NAMD_NCCL_ALLREDUCE
      (mGpuOn) ? d_posNew_x : coll_pos_x.getDevicePtr(),
      (mGpuOn) ? d_posNew_y : coll_pos_y.getDevicePtr(),
      (mGpuOn) ? d_posNew_z : coll_pos_z.getDevicePtr(),
      // ...
      coll_pos_x.getDevicePtr(),
      coll_pos_y.getDevicePtr(),
      coll_pos_z.getDevicePtr(),
      coll_pos_x.getDevicePeerPtr(),
      coll_pos_y.getDevicePeerPtr(),
      coll_pos_z.getDevicePeerPtr(),
      coll_charge.getDevicePeerPtr(),
      coll_partition.getDevicePeerPtr(),
      // ...
      coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
      coll_patchCenter.getDevicePtr(),
      patchData->devData[deviceIndex].slow_patchPositions,
      patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
      coll_sortOrder.getDevicePtr(), myLattice,
      (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
      (float4*) patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      /* ... */);
  // ...
  copyPositionsToHost_direct();
}
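// Host-side wrap-up of the first phase: aborts if the periodic cell has become
// too small, diagnoses atoms that are moving too fast (optionally dumping their
// positions and velocities to a crash file), folds the half-step kinetic and
// virial terms into the reduction, and sets the per-device migration flag.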
void SequencerCUDA::finish_part1(
    const int copyIn,
    const int savePairlists,
    const int usePairlists)
{
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  // ...
  if (*h_periodicCellSmall) {
    NAMD_die("Periodic cell has become too small for original patch grid!\n"
             "Possible solutions are to restart from a recent checkpoint,\n"
             "increase margin, or disable useFlexibleCell for liquid simulation.");
  }
  // ...
  double *vel_x, *vel_y, *vel_z;
  std::vector<int> id;
  std::vector<int> patchIDofAtoms(numAtomsHome);
  allocate_host<double>(&vel_x, numAtomsHome);
  allocate_host<double>(&vel_y, numAtomsHome);
  allocate_host<double>(&vel_z, numAtomsHome);
  copy_DtoH_sync<double>(coll_vel_x.getDevicePtr(), vel_x, numAtomsHome);
  copy_DtoH_sync<double>(coll_vel_y.getDevicePtr(), vel_y, numAtomsHome);
  copy_DtoH_sync<double>(coll_vel_z.getDevicePtr(), vel_z, numAtomsHome);
  // ...
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  // ...
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    const int numPatchAtoms = current.numAtoms;
    id.resize(numPatchAtoms + id.size());
    for (int j = 0; j < numPatchAtoms; j++) {
      // ...
      id[offset + j] = current.id[j];
      // ...
      patchIDofAtoms[offset + j] = patch->getPatchID();
    }
    // ...
    offset += numPatchAtoms;
  }
  // ...
  id.resize(numAtomsHome);
  copy_DtoH_sync<int>(coll_idMig.getDevicePtr(), id.data(), numAtomsHome);
  // ...
  for (int i = 0; i < numAtomsHome; i++) {
    // ...
    const double vel2 =
        vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
    if (vel2 > maxvel2) {
      // ...
          << " in patch " << patchIDofAtoms[i]
          << " on PE " << CkMyPe()
          << " with " << patchList[globalToLocalID[patchIDofAtoms[i]]]->patchDataSOA.numAtoms
          << " atoms)\n" << endi;
    }
  }
  // ...
  iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step
       << "; simulation has become unstable ("
       << cnt << " atoms on pe " << CkMyPe() << ", GPU " << deviceID << ").\n" << endi;
  // ...
  double *pos_x, *pos_y, *pos_z;
  allocate_host<double>(&pos_x, numAtomsHome);
  allocate_host<double>(&pos_y, numAtomsHome);
  allocate_host<double>(&pos_z, numAtomsHome);
  copy_DtoH_sync<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsHome);
  copy_DtoH_sync<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsHome);
  copy_DtoH_sync<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsHome);
  // ...
  const std::string outfilename =
      std::string(simParams->crashFilename) + "." +
      std::to_string(deviceIndex);
  std::ofstream ofs_crash_dump(outfilename.c_str());
  ofs_crash_dump << "atom,r_x,r_y,r_z,v_x,v_y,v_z\n";
  for (int i = 0; i < numAtomsHome; i++) {
    ofs_crash_dump << id[i]+1 << "," /* ... */;
  }
  // ...
  ofs_crash_dump.flush();
  ofs_crash_dump.close();
  iout << iWARN << "PE " << CkMyPe() << ", GPU " << deviceID
       << ": the atom positions and velocities have been written to "
       << outfilename << "\n" << endi;
  deallocate_host<double>(&pos_x);
  deallocate_host<double>(&pos_y);
  deallocate_host<double>(&pos_z);
  // ...
  deallocate_host<double>(&vel_x);
  deallocate_host<double>(&vel_y);
  deallocate_host<double>(&vel_z);
  // ...
  NAMD_die("SequencerCUDA: Atoms moving too fast");
  // ...
  Tensor reduction_intVirialNormal;
  // ...
  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
  reduction_virial *= 0.5;
  // ...
  /* ... */ += (intKineticEnergy_half[0] * 0.25);
  reduction_intVirialNormal *= 0.5;
  /* ... */
      reduction_intVirialNormal);
  // ...
  int migration = (h_marginViolations[0] != 0) ? 1 : 0;
  // ...
  patchData->migrationFlagPerDevice[deviceIndex] = migration;
  h_marginViolations[0] = 0;
}
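// Copies velocities and positions from the device SOA buffers back into each
// home patch's PatchDataSOA arrays, using the per-patch buffer offsets.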
void SequencerCUDA::copyPositionsAndVelocitiesToHost(bool copyOut, const int doGlobal) {
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  const int numAtomsToCopy = numAtomsHome;
  copy_DtoH<double>(coll_vel_x.getDevicePtr(), vel_x, numAtomsToCopy, stream);
  copy_DtoH<double>(coll_vel_y.getDevicePtr(), vel_y, numAtomsToCopy, stream);
  copy_DtoH<double>(coll_vel_z.getDevicePtr(), vel_z, numAtomsToCopy, stream);
  // ...
  copy_DtoH<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsToCopy, stream);
  copy_DtoH<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsToCopy, stream);
  copy_DtoH<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsToCopy, stream);
  // ...
  for (int i = 0; i < homePatches.size(); i++) {
    // ...
    const int numPatchAtoms = localPatches[i].numAtoms;
    const int offset = localPatches[i].bufferOffset;
    memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
    memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
    memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
    // ...
    memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
    memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
    memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
  }
}
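// Position-only variant of the copy above.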
void SequencerCUDA::copyPositionsToHost() {
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  // ...
  const int numAtomsToCopy = numAtomsHome;
  // ...
  copy_DtoH<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsToCopy, stream);
  copy_DtoH<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsToCopy, stream);
  copy_DtoH<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsToCopy, stream);
  // ...
  for (int i = 0; i < homePatches.size(); i++) {
    // ...
    const int numPatchAtoms = localPatches[i].numAtoms;
    const int offset = localPatches[i].bufferOffset;
    memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
    memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
    memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
  }
}
void SequencerCUDA::update_patch_flags()
{
  // ...
  int pairlists = (patchData->flags.step < simParams->N);
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
  }
}
void SequencerCUDA::updatePairlistFlags(const int doMigration) {
  int pairlists = patchList[0]->flags.step < simParams->N;
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
    // ...
    patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
    // ...
  }
  // ...
  if (patchList[0]->flags.savePairlists) {
    // ...
    copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posSave_x, numAtomsHome, stream);
    copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posSave_y, numAtomsHome, stream);
    copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posSave_z, numAtomsHome, stream);
    myLatticeOld = myLattice;
  }
}
void SequencerCUDA::finish_patch_flags(int migration)
{
  // ...
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
  }
}
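// Second phase of a timestep: accumulates the bonded, nonbonded, and slow
// forces from the compute buffers into the collective SOA arrays, using the
// fused accumulate-and-kick kernel when that fast path applies.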
void SequencerCUDA::launch_part2(
    const int doMCPressure,
    /* ... */
    const int langevinPistonStep,
    /* ... */
    const bool doEnergy)
{
  // ...
  bool doNbond = false;
  bool doSlow = false;
  // ...
#ifdef NAMD_NCCL_ALLREDUCE
  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
  // ...
  if (/* ... */
      (!is_lonepairs_psf)) {
    CUDASequencerKernel->accumulate_force_kick(
        /* ... */
        numPatchesHomeAndProxy,
        patchData->devData[deviceIndex].d_localPatches,
        patchData->devData[deviceIndex].f_bond,
        patchData->devData[deviceIndex].f_bond_nbond,
        patchData->devData[deviceIndex].f_bond_slow,
        patchData->devData[deviceIndex].forceStride,
        patchData->devData[deviceIndex].f_nbond,
        patchData->devData[deviceIndex].f_nbond_slow,
        patchData->devData[deviceIndex].f_slow,
        /* ... */
        coll_f_normal_x.getDevicePtr(),
        coll_f_normal_y.getDevicePtr(),
        coll_f_normal_z.getDevicePtr(),
        coll_f_nbond_x.getDevicePtr(),
        coll_f_nbond_y.getDevicePtr(),
        coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(),
        coll_f_slow_y.getDevicePtr(),
        coll_f_slow_z.getDevicePtr(),
        coll_vel_x.getDevicePtr(),
        coll_vel_y.getDevicePtr(),
        coll_vel_z.getDevicePtr(),
        /* ... */
        coll_unsortOrder.getDevicePtr(),
        /* ... */);
  } else {
    CUDASequencerKernel->accumulateForceToSOA(
        /* ... */
        numPatchesHomeAndProxy,
        /* ... */
        patchData->devData[deviceIndex].d_localPatches,
        patchData->devData[deviceIndex].f_bond,
        patchData->devData[deviceIndex].f_bond_nbond,
        patchData->devData[deviceIndex].f_bond_slow,
        patchData->devData[deviceIndex].forceStride,
        patchData->devData[deviceIndex].f_nbond,
        patchData->devData[deviceIndex].f_nbond_slow,
        patchData->devData[deviceIndex].f_slow,
        /* ... */
        coll_f_normal_x.getDevicePtr(),
        coll_f_normal_y.getDevicePtr(),
        coll_f_normal_z.getDevicePtr(),
        coll_f_nbond_x.getDevicePtr(),
        coll_f_nbond_y.getDevicePtr(),
        coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(),
        coll_f_slow_y.getDevicePtr(),
        coll_f_slow_z.getDevicePtr(),
        coll_unsortOrder.getDevicePtr(),
        /* ... */
        patchData->d_queues,
        patchData->d_queueCounters,
        /* ... */);
  }
}
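// Third phase of a timestep: merges forces across peer devices, applies the
// external forces, runs the Langevin BBK kicks and rattle constraints, submits
// the energy/virial reductions, and copies the updated positions and
// velocities back to the host.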
void SequencerCUDA::launch_part3(
    const int doMCPressure,
    /* ... */
    const bool requestGlobalForces,
    const int doGlobalStaleForces,
    const bool forceRequestedGPU,
    /* ... */
    const bool doEnergy,
    const bool requestForcesOutput)
{
  // ...
  const bool doFixed = simParams->fixedAtomsOn;
  const double velrescaling = 1;
  // ...
#ifndef NAMD_NCCL_ALLREDUCE
  // ...
  std::vector<int> atom_counts;
  for (/* ... */) {
    atom_counts.push_back(patchData->devData[i].numAtomsHome);
  }
  // ...
  CUDASequencerKernel->mergeForcesFromPeers(
      /* ... */
      numPatchesHomeAndProxy,
      /* ... */
      this->coll_f_normal_x.getDevicePeerPtr(),
      this->coll_f_normal_y.getDevicePeerPtr(),
      this->coll_f_normal_z.getDevicePeerPtr(),
      this->coll_f_nbond_x.getDevicePeerPtr(),
      this->coll_f_nbond_y.getDevicePeerPtr(),
      this->coll_f_nbond_z.getDevicePeerPtr(),
      this->coll_f_slow_x.getDevicePeerPtr(),
      this->coll_f_slow_y.getDevicePeerPtr(),
      this->coll_f_slow_z.getDevicePeerPtr(),
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      /* ... */);
  // ...
  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum,
      deviceCUDA->getNcclComm(), stream);
  // ...
  if (doVirial && doGlobalStaleForces) {
    // ...
    memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
    memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
    computeGlobalMasterVirial(
        numPatchesHomeAndProxy,
        /* ... */
        patchData->devData[deviceIndex].d_localPatches,
        coll_pos_x.getDevicePtr(),
        coll_pos_y.getDevicePtr(),
        coll_pos_z.getDevicePtr(),
        /* ... */
        &d_extForce[EXT_GLOBALMTS],
        &extForce[EXT_GLOBALMTS],
        &d_extVirial[EXT_GLOBALMTS],
        &extVirial[EXT_GLOBALMTS],
        /* ... */);
  }
  // ...
  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
  // ...
  if (true || deviceID == 0) {
    // ...
    snprintf(prefix, 10, "step-%d", step);
    this->printSOAForces(prefix);
  }
  // ...
  CUDASequencerKernel->langevinVelocitiesBBK1(
      dt_normal, d_langevinParam,
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      numAtomsHome, stream);
  // ...
  CUDASequencerKernel->addForceToMomentum(
      doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
      /* ... */
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
      numAtomsHome, maxForceNumber, stream);
  // ...
  CUDASequencerKernel->rattle1(
      simParams->fixedAtomsOn, doEnergy || doVirial,
      1, numAtomsHome, dt_normal, 1.0/dt_normal,
      /* ... */
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_velNew_x, d_velNew_y, d_velNew_z,
      d_posNew_x, d_posNew_y, d_posNew_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
      &settleList, settleListSize, &d_consFailure,
      d_consFailureSize, &rattleList, rattleListSize,
      /* ... */
      d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
      buildRigidLists, consFailure, simParams->watmodel, stream);
  buildRigidLists = false;
  // ...
  CUDASequencerKernel->langevinVelocitiesBBK2(
      dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
      d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      numAtomsHome, numAtomsHome, 0,
      /* ... */);
  // ...
  CUDASequencerKernel->rattle1(
      simParams->fixedAtomsOn, doEnergy || doVirial,
      1, numAtomsHome, dt_normal, 1.0/dt_normal,
      /* ... */
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      d_velNew_x, d_velNew_y, d_velNew_z,
      d_posNew_x, d_posNew_y, d_posNew_z,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
      &settleList, settleListSize, &d_consFailure,
      d_consFailureSize, &rattleList, rattleListSize,
      /* ... */
      d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
      buildRigidLists, consFailure, simParams->watmodel, stream);
  buildRigidLists = false;
  // ...
  if (doEnergy || doVirial) {
    CUDASequencerKernel->centerOfMass(
        coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
        d_vcm_x, d_vcm_y, d_vcm_z,
        d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
  }
  // ...
  submitHalf(numAtomsHome, 2, doEnergy || doVirial);
  // ...
  CUDASequencerKernel->addForceToMomentum(
      doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
      /* ... */
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
      numAtomsHome, maxForceNumber, stream);
  // ...
  if (requestGlobalForces || requestForcesOutput) {
    // ...
    saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
  }
  // ...
  if (forceRequestedGPU) {
    if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
    if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
    CUDASequencerKernel->copyForcesToDevice(
        numAtomsHome, coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
        d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
        d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
  }
  // ...
  submitReductions(origin.x, origin.y, origin.z,
      marginViolations, doEnergy || doVirial,
      copyOut && simParams->outputMomenta != 0,
      numAtomsHome, maxForceNumber);
  // ...
  copyPositionsAndVelocitiesToHost(copyOut, 0);
  // ...
  NAMD_die("constraint failure during CUDA rattle!\n");
  // ...
  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
  } else if (doEnergy || doVirial) {
    cudaCheck(cudaStreamSynchronize(stream));
    if (doVirial && doGlobalStaleForces) {
      ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
      ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
    }
    // ...
    Tensor reduction_rigidVirial;
    // ...
    if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
    // ...
    Tensor reduction_intVirialNormal;
    // ...
    if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
    reduction_virial *= 0.5;
    // ...
    /* ... */ += (intKineticEnergy_half[0] * 0.25);
    reduction_intVirialNormal *= 0.5;
    /* ... */
        reduction_intVirialNormal);
    // ...
    Vector momentum(*momentum_x, *momentum_y, *momentum_z);
    // ...
    Vector angularMomentum(*angularMomentum_x,
        /* ... */
        *angularMomentum_z);
    // ...
    Tensor regintVirialNormal;
    Tensor regintVirialNbond;
    // ...
    if (maxForceNumber >= 1) {
      // ...
    }
    if (maxForceNumber >= 2) {
      // ...
    }
    // ...
    cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
    double3 fixForceNormal, fixForceNbond, fixForceSlow;
    switch (maxForceNumber) {
      // ...
        copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
        copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
        // ...
        cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
      // ...
        copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
        copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
        // ...
        cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
      // ...
        copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
        copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
        // ...
        cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
      // ...
    }
    // ...
    auto printTensor = [](const cudaTensor& t, const std::string& name) {
      CkPrintf("%s", name.c_str());
      CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
               "%12.5lf %12.5lf %12.5lf\n"
               "%12.5lf %12.5lf %12.5lf\n",
               /* ... */);
    };
    // ...
    printTensor(fixVirialNormal, "fixVirialNormal = ");
    printTensor(fixVirialNbond, "fixVirialNbond = ");
    printTensor(fixVirialSlow, "fixVirialSlow = ");
  }
}
void SequencerCUDA::atomUpdatePme()
{
  // ...
  bool doNbond = false;
  // ...
  bool doAlchDecouple = false;
  bool doAlchSoftCore = false;
  // ...
  if (simParams->alchThermIntOn) doTI = true;
  if (simParams->alchDecouple) doAlchDecouple = true;
  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
  // ...
  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
  // ...
  bool usePatchPme = false;
  // ...
      getNumPatchesHome(),
  // ...
  std::vector<int> atom_counts;
  for (/* ... */) {
    atom_counts.push_back(patchData->devData[i].numAtomsHome);
  }
  // ...
  CUDASequencerKernel->set_pme_positions(
      /* ... */
      numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
      doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
#ifdef NAMD_NCCL_ALLREDUCE
      (mGpuOn) ? d_posNew_x : coll_pos_x.getDevicePtr(),
      (mGpuOn) ? d_posNew_y : coll_pos_y.getDevicePtr(),
      (mGpuOn) ? d_posNew_z : coll_pos_z.getDevicePtr(),
      // ...
      coll_pos_x.getDevicePtr(),
      coll_pos_y.getDevicePtr(),
      coll_pos_z.getDevicePtr(),
      coll_pos_x.getDevicePeerPtr(),
      coll_pos_y.getDevicePeerPtr(),
      coll_pos_z.getDevicePeerPtr(),
      coll_charge.getDevicePeerPtr(),
      coll_partition.getDevicePeerPtr(),
      // ...
      coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
      coll_patchCenter.getDevicePtr(),
      patchData->devData[deviceIndex].slow_patchPositions,
      patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
      coll_sortOrder.getDevicePtr(), myLattice,
      (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
      (float4*) patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      patchData->devData[deviceIndex].d_peerPatches,
      /* ... */);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::sync() {
  cudaCheck(cudaStreamSynchronize(stream));
}
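// Applies the external force terms handled on the GPU (lone-pair / TIP4P force
// redistribution, electric field, harmonic and group restraints, SMD, grid
// forces, constant forces) and, when energies or virials are requested, folds
// their contributions into the reduction.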
void SequencerCUDA::calculateExternalForces(
    const int step,
    const int maxForceNumber,
    const int doEnergy,
    const int doVirial) {
  // ...
  if (is_lonepairs_psf) {
    lonepairsKernel->redistributeForce(
        coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
        coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
        coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
        d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
        coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
        maxForceNumber, doEnergy || doVirial, stream);
  }
  // ...
  if (is_tip4_water) {
    redistributeTip4pForces(maxForceNumber, doEnergy || doVirial);
  }
  // ...
  double efield_phi = PI/180. * simParams->eFieldPhase;
  // ...
  CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
      doEnergy || doVirial, efield, efield_omega, efield_phi, t, myLattice, d_transform,
      coll_charge.getDevicePtr(), coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
      &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
      &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
      d_tbcatomic, stream);
  // ...
  restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
      &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
      &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
  // ...
  SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
      d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_transform,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
      &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
  // ...
  groupRestraintsKernel->doForce(step, doEnergy, doVirial,
      myLattice, d_transform,
      d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
      &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
  // ...
  gridForceKernel->doForce(doEnergy, doVirial,
      /* ... */
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_transform,
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      /* ... */);
  // ...
  consForceKernel->doForce(myLattice, doVirial,
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      /* ... */
      &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
      &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
  // ...
  if (doEnergy || doVirial) {
    // ...
    cudaCheck(cudaStreamSynchronize(stream));
    if (is_lonepairs_psf || is_tip4_water) {
      switch (maxForceNumber) {
        // ...
          copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
        // ...
          copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
        // ...
          copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
        // ...
      }
    }
    // ...
    ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
    ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
    // ...
    ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
    ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
    // ...
    ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
    // ...
    ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
    // ...
    gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
    // ...
    ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
    ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
    gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
    // ...
    ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
    ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
    // ...
  }
}
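// Packs the per-patch forces supplied by global-force clients into contiguous
// host arrays and uploads them to the device.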
void SequencerCUDA::copyGlobalForcesToDevice() {
  // ...
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  // ...
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  // ...
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    const int patchID = record.patchID;
    // ...
    const int numPatchAtoms = record.numAtoms;
    // ...
    memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
    memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
    memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
  }
  // ...
  copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
  copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
  copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
}
void SequencerCUDA::updateHostPatchDataSOA() {
  std::vector<PatchDataSOA> host_copy(numPatchesHome);
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  // ...
  for (int i = 0; i < numPatchesHome; i++) {
    host_copy[i] = homePatches[i]->patchDataSOA;
  }
  // ...
  copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
}
void SequencerCUDA::saveForceCUDASOA_direct(
    const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
  CUDASequencerKernel->copyForcesToHostSOA(
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      /* ... */
      coll_f_normal_x.getDevicePtr(),
      coll_f_normal_y.getDevicePtr(),
      coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(),
      coll_f_nbond_y.getDevicePtr(),
      coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(),
      coll_f_slow_y.getDevicePtr(),
      coll_f_slow_z.getDevicePtr(),
      /* ... */);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::copyPositionsToHost_direct() {
  CUDASequencerKernel->copyPositionsToHostSOA(
      /* ... */
      patchData->devData[deviceIndex].d_localPatches,
      coll_pos_x.getDevicePtr(),
      coll_pos_y.getDevicePtr(),
      coll_pos_z.getDevicePtr(),
      /* ... */);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::redistributeTip4pForces(
    const int maxForceNumber,
    const int doVirial) {
  CUDASequencerKernel->redistributeTip4pForces(
      coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
      coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
      coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
      d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
      coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_mass,
      numAtomsHome, doVirial, maxForceNumber, stream);
}
void SequencerCUDA::allocateGPUSavedForces() {
  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
}
void SequencerCUDA::submitReductionValues() {
  reduction->submit();
}