NAMD
SequencerCUDA.C
1 #include "ComputeLonepairsCUDA.h"
2 #include "CudaUtils.h"
3 #include "Molecule.h"
4 #include "ReductionMgr.h"
5 #include "SequencerCUDA.h"
6 #include "ComputeNonbondedUtil.h"
7 #include "DeviceCUDA.h"
8 #include "SimParameters.h"
9 #include "TestArray.h"
10 #include "ComputeRestraintsCUDA.h"
11 #include "ComputeGridForceCUDA.h"
12 #include "ComputeConsForceCUDA.h"
13 #include "NamdEventsProfiling.h"
14 #include "AtomMap.h"
15 #include "common.h"
16 #include <algorithm> // std::fill()
18 //#define DEBUGM
19 //#define MIN_DEBUG_LEVEL 3
20 #ifdef NODEGROUP_FORCE_REGISTER
21 #if !defined(WIN64)
22 extern __thread DeviceCUDA *deviceCUDA;
23 #else
24 extern __declspec(thread) DeviceCUDA *deviceCUDA;
25 #endif
26 
27 #if 1
28 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
29  size_t offset = 0; \
30  for (int i = 0; i < numPatchesHome; i++) { \
31  PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
32  const int numPatchAtoms = current.numAtoms; \
33  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
34  offset += numPatchAtoms; \
35  } \
36  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
37  } while(0);
38 
39 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
40  size_t offset = 0; \
41  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
42  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
43  const int numPatchAtoms = current.numAtoms; \
44  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
45  offset += numPatchAtoms; \
46  } \
47  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
48  } while(0);
49 
50 #define AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(fieldName, type, stream) do { \
51  size_t offset = 0; \
52  for (int i = 0; i < numPatchesHome; i++) { \
53  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
54  const int numPatchAtoms = current.numAtoms; \
55  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
56  offset += numPatchAtoms; \
57  } \
58  copy_HtoD<type>(fieldName, coll_ ## fieldName .getDevicePtr(), numAtomsHome, stream); \
59  } while(0);
60 
61 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(fieldName, type, stream) do { \
62  size_t offset = 0; \
63  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
64  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
65  const int numPatchAtoms = current.numAtoms; \
66  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
67  offset += numPatchAtoms; \
68  } \
69  copy_HtoD<type>(fieldName, coll_ ## fieldName .getDevicePtr(), numAtomsHomeAndProxy, stream); \
70  } while(0);
71 
72 #else
73 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
74  size_t offset = 0; \
75  for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
76  PatchDataSOA& current = elem->patch->patchDataSOA; \
77  const int numPatchAtoms = current.numAtoms; \
78  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
79  offset += numPatchAtoms; \
80  } \
81  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
82  } while(0);
83 #endif
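 // Illustrative use of the aggregation macros above (not part of the original source):
 //   AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
 // gathers each home patch's SOA recipMass field into the flat host buffer recipMass
 // and copies it to the matching device buffer d_recipMass.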
84 
85 // Publishes this device's d_localPatches pointer into the shared host-side peer record
86 void SequencerCUDA::registerSOAPointersToHost(){
87  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
88 }
89 
90 void SequencerCUDA::copySOAHostRegisterToDevice(){
91  // This function gets the host-registered SOA device pointers and copies the register itself to the device
92  // NOTE: This needs to be called only when ALL masterPEs have safely called ::registerSOAPointersToHost()
93  cudaCheck(cudaSetDevice(deviceID));
94  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);
95 
96  // Workaround until CUDA compute objects have better access to these buffers
97  for(int i = 0; i < this->nDevices; i++) {
98  patchData->h_soa_sortOrder[i] = coll_sortOrder.getHostPeer()[i];
99  if (simParams->useDeviceMigration) {
100  patchData->h_soa_vdwType[i] = coll_vdwType.getHostPeer()[i];
101  patchData->h_soa_id[i] = coll_idMig.getHostPeer()[i];
102  patchData->h_soa_migrationDestination[i] = coll_migrationDestination.getHostPeer()[i];
103  }
104 
105  if (simParams->alchOn) {
106  patchData->h_soa_partition[i] = coll_partition.getHostPeer()[i];
107  }
108  }
109 
110  // aggregate device pointers
111  for(int i = 0; i < this->nDevices; i++)
112  h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
113  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
114 }
115 
116 void SequencerCUDA::printSOAPositionsAndVelocities() {
117  BigReal* h_pos_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
118  BigReal* h_pos_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
119  BigReal* h_pos_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
120 
121  BigReal* h_vel_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
122  BigReal* h_vel_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
123  BigReal* h_vel_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
124 
125  // DMC: this condition appears to be a holdover from the NCCL code path
126  if(false && mGpuOn){
127  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
128  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
129  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);
130  }else{
131  copy_DtoH_sync<BigReal>(coll_pos_x.getDevicePtr(), h_pos_x, numAtomsHome);
132  copy_DtoH_sync<BigReal>(coll_pos_y.getDevicePtr(), h_pos_y, numAtomsHome);
133  copy_DtoH_sync<BigReal>(coll_pos_z.getDevicePtr(), h_pos_z, numAtomsHome);
134  }
135 
136  copy_DtoH_sync<BigReal>(coll_vel_x.getDevicePtr(), h_vel_x, numAtomsHome);
137  copy_DtoH_sync<BigReal>(coll_vel_y.getDevicePtr(), h_vel_y, numAtomsHome);
138  copy_DtoH_sync<BigReal>(coll_vel_z.getDevicePtr(), h_vel_z, numAtomsHome);
139 
140  CmiLock(this->patchData->printlock);
141  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
142  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
143  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
144  for(int i =0 ; i < numPatchesHome; i++){
145  CudaLocalRecord record = localPatches[i];
146  const int patchID = record.patchID;
147  const int stride = record.bufferOffset;
148  const int numPatchAtoms = record.numAtoms;
149  PatchDataSOA& current = homePatches[i]->patchDataSOA;
150 
151  fprintf(stderr, "Patch [%d]:\n", patchID);
152  for(int j = 0; j < numPatchAtoms; j++){
153  fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
154  h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
155  h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
156  }
157 
158  }
159  CmiUnlock(this->patchData->printlock);
160 
161  free(h_pos_x);
162  free(h_pos_y);
163  free(h_pos_z);
164  free(h_vel_x);
165  free(h_vel_y);
166  free(h_vel_z);
167 }
168 
169 void SequencerCUDA::printSOAForces(char *prefix) {
170  BigReal* h_f_normal_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
171  BigReal* h_f_normal_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
172  BigReal* h_f_normal_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
173 
174  BigReal* h_f_nbond_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
175  BigReal* h_f_nbond_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
176  BigReal* h_f_nbond_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
177 
178  BigReal* h_f_slow_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
179  BigReal* h_f_slow_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
180  BigReal* h_f_slow_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
181 
182  copy_DtoH_sync<BigReal>(coll_f_normal_x.getDevicePtr(), h_f_normal_x, numAtomsHome);
183  copy_DtoH_sync<BigReal>(coll_f_normal_y.getDevicePtr(), h_f_normal_y, numAtomsHome);
184  copy_DtoH_sync<BigReal>(coll_f_normal_z.getDevicePtr(), h_f_normal_z, numAtomsHome);
185 
186  copy_DtoH_sync<BigReal>(coll_f_nbond_x.getDevicePtr(), h_f_nbond_x, numAtomsHome);
187  copy_DtoH_sync<BigReal>(coll_f_nbond_y.getDevicePtr(), h_f_nbond_y, numAtomsHome);
188  copy_DtoH_sync<BigReal>(coll_f_nbond_z.getDevicePtr(), h_f_nbond_z, numAtomsHome);
189 
190  copy_DtoH_sync<BigReal>(coll_f_slow_x.getDevicePtr(), h_f_slow_x, numAtomsHome);
191  copy_DtoH_sync<BigReal>(coll_f_slow_y.getDevicePtr(), h_f_slow_y, numAtomsHome);
192  copy_DtoH_sync<BigReal>(coll_f_slow_z.getDevicePtr(), h_f_slow_z, numAtomsHome);
193 
194  // Print forces patch by patch; if a prefix is given, each patch is written to its own file
195  CmiLock(this->patchData->printlock);
196  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
197  char fname[100];
198  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
199  for(int i =0 ; i < numPatchesHome; i++){
200  CudaLocalRecord record = localPatches[i];
201  const int patchID = record.patchID;
202  const int stride = record.bufferOffset;
203  const int numPatchAtoms = record.numAtoms;
204  FILE *outfile=stderr;
205  if(prefix!=NULL)
206  {
207  snprintf(fname,100, "%s-patch-%d", prefix, patchID);
208  outfile = fopen(fname, "w");
209  }
210  fprintf(outfile, "Patch [%d]:\n", patchID);
211  for(int j = 0; j < numPatchAtoms; j++){
212  fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
213  h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
214  h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
215  h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j] );
216  }
217  if(prefix!=NULL) fclose(outfile);
218  }
219 
220  CmiUnlock(this->patchData->printlock);
221 
222  free(h_f_normal_x);
223  free(h_f_normal_y);
224  free(h_f_normal_z);
225 
226  free(h_f_nbond_x);
227  free(h_f_nbond_y);
228  free(h_f_nbond_z);
229 
230  free(h_f_slow_x);
231  free(h_f_slow_y);
232  free(h_f_slow_z);
233 
234 }
235 
236 SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID,
237  SimParameters *const sim_Params) {
238  if (CkpvAccess(SequencerCUDA_instance) == 0) {
239  CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
240  }
241  return CkpvAccess(SequencerCUDA_instance);
242 }
243 
244 SequencerCUDA::SequencerCUDA(const int deviceID_ID,
245  SimParameters *const sim_Params):
246  deviceID(deviceID_ID), simParams(sim_Params)
247 {
248  restraintsKernel = NULL;
249  SMDKernel = NULL;
250  groupRestraintsKernel = NULL;
251  gridForceKernel = NULL;
252  consForceKernel = NULL;
253  lonepairsKernel = nullptr;
254 #if 1
255  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
256  patchData = cpdata.ckLocalBranch();
257 #endif
258  initialize();
259  CUDASequencerKernel = new SequencerCUDAKernel();
260  CUDAMigrationKernel = new MigrationCUDAKernel();
261  num_used_grids = simParams->alchGetNumOfPMEGrids();
262  used_grids.resize(num_used_grids, 0);
263  if (simParams->alchFepOn) {
264  // at least two grids are used
265  used_grids[0] = 0;
266  used_grids[1] = 1;
267  // if alchDecouple then two more grids are used
268  if (simParams->alchDecouple) {
269  used_grids[2] = 2;
270  used_grids[3] = 3;
271  // an extra for soft-core potential
272  if (simParams->alchElecLambdaStart > 0) {
273  used_grids[4] = 4;
274  }
275  } else {
276  // in this case alchDecouple is false
277  // but if there is still soft-core potential
278  // then a total of 3 grids are used
279  // mark the last grid for soft-core potential
280  if (simParams->alchElecLambdaStart > 0) {
281  used_grids[2] = 4;
282  }
283  }
284  }
285  if (simParams->alchThermIntOn) {
286  used_grids[0] = 0;
287  used_grids[1] = 1;
288  // in TI, no matter whether soft-core potential is used
289  // it has at least three grids
290  if (simParams->alchDecouple) {
291  used_grids[2] = 2;
292  used_grids[3] = 3;
293  used_grids[4] = 4;
294  } else {
295  used_grids[2] = 4;
296  }
297  }
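 // For example, with alchFepOn, alchDecouple off, and alchElecLambdaStart > 0,
 // the used grids map to PME grid indices {0, 1, 4}.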
298  rescalePairlistTolerance = false;
299 }
300 
301 SequencerCUDA::~SequencerCUDA(){
302  cudaCheck(cudaSetDevice(deviceID));
303  deallocateArrays();
304  deallocateStaticArrays();
305  deallocate_device<SettleParameters>(&sp);
306  deallocate_device<int>(&settleList);
307  deallocate_device<CudaRattleElem>(&rattleList);
308  deallocate_device<int>(&d_consFailure);
309  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
310  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
311  if (restraintsKernel != NULL) delete restraintsKernel;
312  if(SMDKernel != NULL) delete SMDKernel;
313  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
314  if (gridForceKernel != NULL) delete gridForceKernel;
315  if (lonepairsKernel != nullptr) delete lonepairsKernel;
316  if (consForceKernel != NULL) delete consForceKernel;
317 #if 0
318  cudaCheck(cudaStreamDestroy(stream));
319 #endif
320  cudaCheck(cudaStreamDestroy(stream2));
321  curandCheck(curandDestroyGenerator(curandGen));
322  CmiDestroyLock(printlock);
323 
324 }
325 
326 void SequencerCUDA::zeroScalars(){
327  numAtomsHomeAndProxyAllocated = 0;
328  numAtomsHomeAllocated = 0;
329  buildRigidLists = true;
330  numPatchesCheckedIn = 0;
331  numPatchesReady= 0;
332 }
333 
334 void SequencerCUDA::initialize(){
335  cudaCheck(cudaSetDevice(deviceID));
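 // One extra device is counted when a GPU is reserved exclusively for PME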
336  nDevices = deviceCUDA->getNumDevice() + 1 * deviceCUDA->isGpuReservedPme();
337  deviceIndex = deviceCUDA->getDeviceIndex();
338 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
339  int leastPriority, greatestPriority;
340  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
341 #if 0
342  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
343 #else
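 // On the master PE, reuse the nonbonded stream; other PEs fall back to the default stream (0)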
344  stream = (CkMyPe() == deviceCUDA->getMasterPe()) ? patchData->devData[deviceIndex].nbond_stream : 0;
345 #endif
346  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
347 #else
348  cudaCheck(cudaStreamCreate(&stream));
349  cudaCheck(cudaStreamCreate(&stream2));
350 #endif
351  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
352  // each PE's seed needs to be different
353  unsigned long long seed = simParams->randomSeed + CkMyPe();
354  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));
355 
356  numAtomsHomeAllocated = 0;
357  numAtomsHomeAndProxyAllocated = 0;
358 
359  totalMarginViolations = 0;
360  buildRigidLists = true;
361  numPatchesCheckedIn = 0;
362  numPatchesReady= 0;
363  PatchMap* patchMap = PatchMap::Object();
364 #if 1
365  numPatchesGlobal = patchMap->numPatches();
366 #else
367  numPatchesGlobal = patchMap->homePatchList()->size();
368 #endif
369  mGpuOn = nDevices > 1;
370  // Per-device queue allocation (currently disabled)
371  // allocate_device<unsigned int>(&deviceQueue, nDevices);
372  // cudaCheck(cudaMemset(deviceQueue, 999, sizeof(unsigned int)* nDevices));
373  // Allocates and registers the local queue in patchData
374  // patchData->d_queues[deviceIndex] = deviceQueue;
375 
376  printlock = CmiCreateLock();
377 
378  const int numPes = CkNumPes();
379  if (!mGpuOn) {
380  atomMapList.resize(numPes);
381  }
382 
383  if (simParams->fixedAtomsOn) {
384  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
385  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
386  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
387  allocate_device<double3>(&d_fixForceNormal, 1);
388  allocate_device<double3>(&d_fixForceNbond, 1);
389  allocate_device<double3>(&d_fixForceSlow, 1);
390  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
391  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
392  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
393  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
394  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
395  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
396  }
397 
398  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);
399 
400  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
401 
402  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
403  // Patch-related host datastructures
404  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
405  allocate_host<double3>(&patchCenter, numPatchesGlobal);
406  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
407  allocate_host<int>(&patchToDeviceMap,numPatchesGlobal);
408  allocate_host<double3>(&awayDists, numPatchesGlobal);
409  allocate_host<double3>(&patchMin, numPatchesGlobal);
410  allocate_host<double3>(&patchMax, numPatchesGlobal);
411  //allocate_host<Lattice>(&lattices, numPatchesGlobal);
412  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal); // only needed for langevin
413  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
414  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
415  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);
416 
417  //Patch-related device datastructures
418  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
419  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
420  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
421  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
422  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
423  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
424  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
425  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
426  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
427  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);
428 
429  // Allocate host memory for scalar variables
430  allocate_device<int>(&d_killme, 1);
431  allocate_device<char>(&d_barrierFlag, 1);
432  allocate_device<unsigned int>(&d_tbcatomic, 5);
433  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
434  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
435  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
436  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
437  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
438  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
439  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
440  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
441  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
442  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
443  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
444  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
445  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);
446  // for lone pairs
447  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
448  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
449  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);
450  //space for globalmaster forces on device
451  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
452  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
453  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);
454 
455  allocate_device<SettleParameters>(&sp, 1);
456 
457  //allocates host scalars in host-mapped, pinned memory
458  allocate_host<int>(&killme, 1);
459  allocate_host<BigReal>(&kineticEnergy, 1);
460  allocate_host<BigReal>(&intKineticEnergy, 1);
461  allocate_host<BigReal>(&kineticEnergy_half, 1);
462  allocate_host<BigReal>(&intKineticEnergy_half, 1);
463  allocate_host<BigReal>(&momentum_x, 1);
464  allocate_host<BigReal>(&momentum_y, 1);
465  allocate_host<BigReal>(&momentum_z, 1);
466  allocate_host<BigReal>(&angularMomentum_x, 1);
467  allocate_host<BigReal>(&angularMomentum_y, 1);
468  allocate_host<BigReal>(&angularMomentum_z, 1);
469  allocate_host<int>(&consFailure, 1);
470  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
471  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
472  allocate_host<unsigned int>(&h_marginViolations, 1);
473  allocate_host<unsigned int>(&h_periodicCellSmall, 1);
474 
475  // allocates host cudaTensors in host-mapped, pinned memory
476  allocate_host<cudaTensor>(&virial, 1);
477  allocate_host<cudaTensor>(&virial_half, 1);
478  allocate_host<cudaTensor>(&intVirialNormal, 1);
479  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
480  allocate_host<cudaTensor>(&intVirialNbond, 1);
481  allocate_host<cudaTensor>(&intVirialSlow, 1);
482  allocate_host<cudaTensor>(&rigidVirial, 1);
483  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
484  allocate_host<cudaTensor>(&lpVirialNormal, 1);
485  allocate_host<cudaTensor>(&lpVirialNbond, 1);
486  allocate_host<cudaTensor>(&lpVirialSlow, 1);
487 
488  // These arrays will get allocated when total forces of the GPU global calculations are requested
489  d_f_saved_nbond_x = nullptr;
490  d_f_saved_nbond_y = nullptr;
491  d_f_saved_nbond_z = nullptr;
492  d_f_saved_slow_x = nullptr;
493  d_f_saved_slow_y = nullptr;
494  d_f_saved_slow_z = nullptr;
495 
496  //Sets values for scalars
497  *kineticEnergy = 0.0;
498  *intKineticEnergy = 0.0;
499  *kineticEnergy_half = 0.0;
500  *intKineticEnergy_half = 0.0;
501  *momentum_x = 0.0;
502  *momentum_y = 0.0;
503  *momentum_z = 0.0;
504  *angularMomentum_x = 0.0;
505  *angularMomentum_y = 0.0;
506  *angularMomentum_z = 0.0;
507  *consFailure = 0;
508  *killme = 0;
509 
510  // JM: Basic infrastructure to time kernels
511  // XXX TODO: Add timing functions to be switched on/off for each of these
512  t_total = 0;
513  t_vverlet = 0;
514  t_pairlistCheck = 0;
515  t_setComputePositions = 0;
516  t_accumulateForceKick = 0;
517  t_rattle = 0;
518  t_submitHalf = 0;
519  t_submitReductions1 = 0;
520  t_submitReductions2 = 0;
521 
522  cudaEventCreate(&eventStart);
523  cudaEventCreate(&eventStop);
524  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
525  cudaCheck(cudaMemset(d_kineticEnergy, 0, sizeof(BigReal)));
526  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
527  cudaCheck(cudaMemset(d_momentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
528  cudaCheck(cudaMemset(d_momentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
529  cudaCheck(cudaMemset(d_momentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
530  cudaCheck(cudaMemset(d_angularMomentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
531  cudaCheck(cudaMemset(d_angularMomentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
532  cudaCheck(cudaMemset(d_angularMomentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
533  cudaCheck(cudaMemset(d_intKineticEnergy, 0, ATOMIC_BINS * sizeof(BigReal)));
534  cudaCheck(cudaMemset(d_virial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
535  cudaCheck(cudaMemset(d_rigidVirial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
536  cudaCheck(cudaMemset(d_intVirialNormal, 0, ATOMIC_BINS * sizeof(cudaTensor)));
537  cudaCheck(cudaMemset(d_intVirialNbond, 0, ATOMIC_BINS * sizeof(cudaTensor)));
538  cudaCheck(cudaMemset(d_intVirialSlow, 0, ATOMIC_BINS * sizeof(cudaTensor)));
539  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
540  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
541  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
542  cudaCheck(cudaMemset(d_extVirial, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(cudaTensor)));
543  cudaCheck(cudaMemset(d_extForce, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double3)));
544  cudaCheck(cudaMemset(d_extEnergy, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double)));
545 
546  memset(h_marginViolations, 0, sizeof(unsigned int));
547  memset(h_periodicCellSmall, 0, sizeof(unsigned int));
548  memset(virial, 0, sizeof(cudaTensor));
549  memset(rigidVirial, 0, sizeof(cudaTensor));
550  memset(intVirialNormal, 0, sizeof(cudaTensor));
551  memset(intVirialNbond, 0, sizeof(cudaTensor));
552  memset(intVirialSlow, 0, sizeof(cudaTensor));
553  memset(lpVirialNormal, 0, sizeof(cudaTensor));
554  memset(lpVirialNbond, 0, sizeof(cudaTensor));
555  memset(lpVirialSlow, 0, sizeof(cudaTensor));
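 // globalToLocalID entries default to -1, meaning no local patch index has been assigned yet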
556  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);
557 
558  settleList = NULL;
559  settleListSize = 0;
560  rattleList = NULL;
561  rattleListSize = 0;
562  d_consFailure = NULL;
563  d_consFailureSize = 0;
564 
566 
567  // JM: I can bundle the CompAtom and CudaAtomList pointers from the
568  // patches here and fill the Host-arrays after integration. :]
569  // if single-gpu simulation, numPatchesHome is the same as numPatchesGlobal
570  numPatchesHome = numPatchesGlobal;
571 
572 
573  int count = 0;
574  // Single-GPU case
575  if(!mGpuOn){
576  // JM NOTE: This works for a single device only, but it's fast if we want multiple PEs sharing it
577  if(deviceCUDA->getMasterPe() == CkMyPe()){
578  for(int i = 0; i < numPes; i++) {
579  atomMapList[i] = AtomMap::ObjectOnPe(i);
580  PatchMap* patchMap = PatchMap::ObjectOnPe(i);
581  int npatch = patchMap->numPatches();
582  for (int j = 0; j < npatch; j++) {
583  HomePatch *patch = patchMap->homePatch(j);
584  // JM NOTE: This data structure can be preserved through migration steps
585  if(patch != NULL) {
586  patchList.push_back(patch);
587  patchNewTolerance[count++] =
588  0.5 * ( simParams->pairlistDist - simParams->cutoff );
589  numAtomsHomeAndProxy += patchMap->patch(j)->getNumAtoms();
590 
591  patchData->devData[deviceIndex].patches.push_back(patch);
592  patchListHomeAndProxy.push_back(patch);
593  }
594  }
595  }
596  }
597  }else{
598  // Multi-device case
599  /* The logic here is trickier than in the single-GPU case.
600  Each GPU allocates space only for its home patches and any required proxy patches.
601  This rules out using an NCCL-based all-reduce for communication,
602  but DMC thinks this is okay.
603  */
604  if(deviceCUDA->getMasterPe() == CkMyPe()){
605  // Haochuan: For multi-GPU cases, we need to find the atom maps from
606  // all "non-master PEs" that use the same device as the master PE.
607  // 1. Get the current device ID.
608  const int currentDevice = deviceCUDA->getDeviceIDforPe(CkMyPe());
609  // 2. Iterate over all PEs and find those that have the same device ID as this master PE
610  for (int i = 0; i < numPes; ++i) {
611  if (deviceCUDA->getDeviceIDforPe(i) == currentDevice) {
612  atomMapList.push_back(AtomMap::ObjectOnPe(i));
613  }
614  }
615  for(int i = 0; i < deviceCUDA->getNumPesSharingDevice(); i++){
616  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(i)); // restored missing line (assumed): per-PE PatchMap for each PE sharing this device
617  for (int j = 0; j < pm->numPatches(); j++) {
618  // Aggregates the patches in a separate data structure for now
619  HomePatch *patch = pm->homePatch(j);
620  if(patch != NULL) {
621  patchData->devData[deviceIndex].patches.push_back(patch);
622  }
623  }
624  }
625 
626  // if MGPU is On, we also try to set up peer access
628 #ifdef NAMD_NCCL_ALLREDUCE
629  deviceCUDA->setNcclUniqueId(patchData->ncclId);
630  deviceCUDA->setupNcclComm();
631 #endif
632 
633  }
634  }
635  PatchMap *pm = PatchMap::Object();
636  isPeriodic = (pm->periodic_a() && pm->periodic_b() && pm->periodic_c());
637 
638  // XXX TODO decide how the biasing methods will work in the future -
639  // for multiple devices this will be a problem, how to solve it?
640  if (simParams->constraintsOn) {
641  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,
642  stream, mGpuOn);
643  }
644  if (simParams->SMDOn) {
645  if(deviceCUDA->getMasterPe() == CkMyPe()){
646  SMDKernel = new ComputeSMDCUDA(patchList, simParams->SMDk, simParams->SMDk2,
647  simParams->SMDVel, make_double3(simParams->SMDDir.x, simParams->SMDDir.y, simParams->SMDDir.z),
648  simParams->SMDOutputFreq, simParams->firstTimestep, simParams->SMDFile,
650  Node::Object()->molecule->numAtoms, nDevices, deviceIndex, mGpuOn);
651  // need to set SMD atom lists in the multi-GPU case
652  if(mGpuOn)
653  {
654  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
656  SMDKernel->initPeerCOM(cudaMgr->curSMDCOM, stream);
657  }
658  }
659  }
660  if (simParams->groupRestraintsOn) {
661  if(deviceCUDA->getMasterPe() == CkMyPe()){
662  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
663  simParams->groupRestraints, mGpuOn, nDevices, deviceIndex);
664  if(mGpuOn)
665  {
666  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
667  groupRestraintsKernel->initPeerCOM(stream);
668  }
669  }
670  }
671  if (simParams->mgridforceOn || simParams->gridforceOn ){
672  if(deviceCUDA->getMasterPe() == CkMyPe()){
673  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
674  }
675  }
676  if (Node::Object()->molecule->is_lonepairs_psf) {
677  lonepairsKernel = new ComputeLonepairsCUDA();
678  }
679  if (simParams->consForceOn){
680  if(deviceCUDA->getMasterPe() == CkMyPe()){
681  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,mGpuOn);
682  }
683  }
684 
685 }
686 
687 /* Events like ScriptTcl changes to global data require us to rebuild
688  some of the data structures used by some of the kernels; e.g., the
689  gridForceKernel needs to be rebuilt when the grid, or grid-related
690  data, is changed.
691 
692  Technically these may all be in use and updated at different
693  intervals, but that is a weird edge case that doesn't justify
694  adding a whole other sync path. If that happens, we'll be doing
695  this slightly too often. Such a hypothetical user has already
696  bought in to periodically sacrificing performance to radically
697  change the system being simulated at run time in several
698  different ways on different triggers, so as long as this overhead
699  is substantially better than a full restart, it probably doesn't
700  matter.
701  */
702 
703 void SequencerCUDA::updateDeviceKernels()
704 {
705  if (simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn){
706  // clean out the old and bring in the new if we're a master PE
707  if(deviceCUDA->getMasterPe() == CkMyPe()){
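 // fetch_sub returns the previous counter value, so the kernels are rebuilt only
 // while the update counter set by the request is still positive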
708  if(patchData->updateCounter.fetch_sub(1)>=1)
709  {
710  if(gridForceKernel!=NULL)
711  {
712  delete gridForceKernel;
713  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
714  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
715  }
716  if(consForceKernel!=NULL)
717  {
718  delete consForceKernel;
719  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,
720  mGpuOn);
721  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
722  }
723  }
724  }
725  }
726 }
727 
728 bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy)
729 {
730  cudaCheck(cudaSetDevice(deviceID));
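 // Buffers are grown only when the requested atom counts exceed current capacity;
 // capacity is over-allocated by 50% (OVERALLOC) to limit reallocation frequency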
731  const float OVERALLOC = 1.5f;
732 
733  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated ) {
734  return false;
735  }
736 
737  // Before deallocating, record whether the f_saved_nbond_* and f_saved_slow_*
738  // buffers were already allocated, so they can be reallocated at the new size below
739  bool realloc_gpu_saved_force = false;
740  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
741  realloc_gpu_saved_force = true;
742  }
743 
744 
745  deallocateArrays();
746 
747  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
748  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
749 
750  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
751  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
752  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
753  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
754  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
755  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
756  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
757  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
758  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
759  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
760  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
761  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);
762  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
763  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
764  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
765  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);
766  }
767  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
768  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
769  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);
770 
771  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
772  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
773  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
774  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
775  allocate_host<char3>(&transform, numAtomsHomeAllocated);
776  allocate_host<float>(&mass, numAtomsHomeAllocated);
777  if (simParams->alchOn) {
778  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);
779  }
780  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
781  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
782  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);
783 
784  // array buffers for pseudorandom normal distribution must be even length
785  // choose n to be the smallest even number >= numAtomsHomeAllocated,
786  // which guarantees that n is even and never smaller than the atom count
787  int n = (numAtomsHomeAllocated + 1) & (~1);
788  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
789  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
790  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
791  if (simParams->fixedAtomsOn) {
792  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
793  allocate_host(&fixedPosition_x, numAtomsHomeAllocated);
794  allocate_host(&fixedPosition_y, numAtomsHomeAllocated);
795  allocate_host(&fixedPosition_z, numAtomsHomeAllocated);
796  }
797  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
798 
799 
800  if (simParams->useDeviceMigration) {
801  allocate_host<int>(&idMig, numAtomsHomeAllocated);
802  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
803  }
804 
805  // Allocate bonded / normal forces
806  coll_f_normal_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
807  coll_f_normal_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
808  coll_f_normal_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
809  // Allocate nonbonded forces
810  coll_f_nbond_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
811  coll_f_nbond_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
812  coll_f_nbond_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
813  // Allocate slow forces
814  coll_f_slow_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
815  coll_f_slow_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
816  coll_f_slow_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
817  // Allocate positions
818  coll_pos_x.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
819  coll_pos_y.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
820  coll_pos_z.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
821  // Allocate velocities
822  coll_vel_x.allocate(defaultCollectiveBufferType, numAtomsHomeAllocated, SynchronousCollectiveScope::master);
823  coll_vel_y.allocate(defaultCollectiveBufferType, numAtomsHomeAllocated, SynchronousCollectiveScope::master);
824  coll_vel_z.allocate(defaultCollectiveBufferType, numAtomsHomeAllocated, SynchronousCollectiveScope::master);
825  // Allocate charges
826  coll_charge.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
827 
828  coll_sortOrder.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
829  coll_unsortOrder.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
830 
831  if (simParams->alchOn) {
832  coll_partition.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
833  }
834 
835 
836  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
837  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
838  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
839  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);
840  }
841  if (realloc_gpu_saved_force) {
842  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
843  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
844  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
845  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
846  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
847  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
848  }
849 
850  // allocate memory for backup forces in MC barostat
851  if (simParams->monteCarloPressureOn) {
852  // Total number of backup force and positions buffers
853  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
854  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
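 // Carve the nine backup force components (normal/nbond/slow x,y,z) and the three
 // backup position components out of the two raw allocations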
855  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
856  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
857  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
858  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
859  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
860  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
861  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
862  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
863  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
864  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
865  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
866  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];
867 
868  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
869  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
870  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
871  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
872  // this could be sized as the number of molecules + 1 rather than the atom count
873  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);
874  }
875 
876  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
877  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
878  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
879  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
880  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
881  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
882  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
883  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
884  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
885  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
886  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
887  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
888  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
889  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
890  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
891  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
892  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
893  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);
894 
895  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
896  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
897  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
898  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
899  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
900  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
901  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
902  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
903  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
904  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
905  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
906  if (simParams->fixedAtomsOn) {
907  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
908  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
909  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
910  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);
911  }
912 
913  if (simParams->useDeviceMigration) {
914  coll_idMig.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
915  coll_vdwType.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
916  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
917  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
918  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
919  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
920  }
921 
922  // These arrays will get allocated when total forces of the GPU global calculations are requested
923  d_f_saved_nbond_x = nullptr;
924  d_f_saved_nbond_y = nullptr;
925  d_f_saved_nbond_z = nullptr;
926  d_f_saved_slow_x = nullptr;
927  d_f_saved_slow_y = nullptr;
928  d_f_saved_slow_z = nullptr;
929 
930  // Zero the position and velocity buffers so they can be reduced into afterwards
931  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
932  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
933  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
934  cudaCheck(cudaMemset(coll_pos_x.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
935  cudaCheck(cudaMemset(coll_pos_y.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
936  cudaCheck(cudaMemset(coll_pos_z.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
937  cudaCheck(cudaMemset(coll_vel_x.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAllocated));
938  cudaCheck(cudaMemset(coll_vel_y.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAllocated));
939  cudaCheck(cudaMemset(coll_vel_z.getDevicePtr(), 0 , sizeof(double)*numAtomsHomeAllocated));
940 
941  cudaCheck(cudaMemset(d_posNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
942  cudaCheck(cudaMemset(d_posNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
943  cudaCheck(cudaMemset(d_posNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
944  cudaCheck(cudaMemset(d_velNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
945  cudaCheck(cudaMemset(d_velNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
946  cudaCheck(cudaMemset(d_velNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
947 
948  return true;
949 }
950 
951 void SequencerCUDA::reallocateMigrationDestination() {
952  coll_migrationDestination.deallocate();
953  coll_migrationDestination.allocate(defaultCollectiveBufferType, numAtomsHomeAndProxyAllocated, SynchronousCollectiveScope::master);
954 }
955 
956 void SequencerCUDA::deallocateArrays() {
957  if (numAtomsHomeAndProxyAllocated != 0) {
958  cudaCheck(cudaSetDevice(deviceID));
959 
960  deallocate_host<double>(&f_normal_x);
961  deallocate_host<double>(&f_normal_y);
962  deallocate_host<double>(&f_normal_z);
963  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
964  deallocate_host<double>(&f_global_x);
965  deallocate_host<double>(&f_global_y);
966  deallocate_host<double>(&f_global_z);
967  }
968  deallocate_host<double>(&f_nbond_x);
969  deallocate_host<double>(&f_nbond_y);
970  deallocate_host<double>(&f_nbond_z);
971  deallocate_host<double>(&f_slow_x);
972  deallocate_host<double>(&f_slow_y);
973  deallocate_host<double>(&f_slow_z);
974  deallocate_host<double>(&pos_x);
975  deallocate_host<double>(&pos_y);
976  deallocate_host<double>(&pos_z);
977  deallocate_host<float>(&charge);
978  deallocate_host<int>(&sortOrder);
979  deallocate_host<int>(&unsortOrder);
980  deallocate_host<double>(&recipMass);
981  deallocate_host<double>(&vel_x);
982  deallocate_host<double>(&vel_y);
983  deallocate_host<double>(&vel_z);
984  deallocate_host<char3>(&transform);
985  deallocate_host<float>(&mass);
986  if (simParams->alchOn) {
987  deallocate_host<int>(&partition);
988  }
989  deallocate_host<float>(&langevinParam);
990  deallocate_host<float>(&langScalVelBBK2);
991  deallocate_host<float>(&langScalRandBBK2);
992 
993  deallocate_host<int>(&hydrogenGroupSize);
994  deallocate_host<int>(&atomFixed);
995  if (simParams->fixedAtomsOn) {
996  deallocate_host<int>(&groupFixed);
997  deallocate_host<double>(&fixedPosition_x);
998  deallocate_host<double>(&fixedPosition_y);
999  deallocate_host<double>(&fixedPosition_z);
1000  }
1001 
1002  deallocate_host<float>(&rigidBondLength);
1003 
1004  coll_pos_x.deallocate();
1005  coll_pos_y.deallocate();
1006  coll_pos_z.deallocate();
1007  coll_f_normal_x.deallocate();
1008  coll_f_normal_y.deallocate();
1009  coll_f_normal_z.deallocate();
1010  coll_f_nbond_x.deallocate();
1011  coll_f_nbond_y.deallocate();
1012  coll_f_nbond_z.deallocate();
1013  coll_f_slow_x.deallocate();
1014  coll_f_slow_y.deallocate();
1015  coll_f_slow_z.deallocate();
1016 
1017  coll_charge.deallocate();
1018 
1019  coll_sortOrder.deallocate();
1020  coll_unsortOrder.deallocate();
1021  if (simParams->alchOn) {
1022  coll_partition.deallocate();
1023  }
1024 
1025  deallocate_device<double>(&d_posNew_raw);
1026 
1027  if (simParams->monteCarloPressureOn) {
1028  deallocate_device<double>(&d_f_rawMC);
1029  deallocate_device<double>(&d_pos_rawMC);
1030 
1031  deallocate_host<int>(&id);
1032  deallocate_device<int>(&d_id);
1033  deallocate_device<int>(&d_idOrder);
1034  deallocate_device<int>(&d_moleculeAtom);
1035  deallocate_device<int>(&d_moleculeStartIndex);
1036  }
1037 
1038  if (simParams->useDeviceMigration) {
1039  deallocate_host<int>(&idMig);
1040  deallocate_host<int>(&vdwType);
1041  coll_idMig.deallocate();
1042  coll_vdwType.deallocate();
1043  deallocate_device<FullAtom>(&d_atomdata_AoS);
1044  deallocate_device<int>(&d_migrationGroupSize);
1045  deallocate_device<int>(&d_migrationGroupIndex);
1046  deallocate_device<int>(&d_sortIndex);
1047  }
1048  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
1049  deallocate_device<double>(&d_f_global_x);
1050  deallocate_device<double>(&d_f_global_y);
1051  deallocate_device<double>(&d_f_global_z);
1052  }
1053  coll_vel_x.deallocate();
1054  coll_vel_y.deallocate();
1055  coll_vel_z.deallocate();
1056  deallocate_device<double>(&d_recipMass);
1057  deallocate_device<char3>(&d_transform);
1058  deallocate_device<double>(&d_velNew_x);
1059  deallocate_device<double>(&d_velNew_y);
1060  deallocate_device<double>(&d_velNew_z);
1061  deallocate_device<double>(&d_posSave_x);
1062  deallocate_device<double>(&d_posSave_y);
1063  deallocate_device<double>(&d_posSave_z);
1064  deallocate_device<double>(&d_rcm_x);
1065  deallocate_device<double>(&d_rcm_y);
1066  deallocate_device<double>(&d_rcm_z);
1067  deallocate_device<double>(&d_vcm_x);
1068  deallocate_device<double>(&d_vcm_y);
1069  deallocate_device<double>(&d_vcm_z);
1070  deallocate_device<float>(&d_mass);
1071  deallocate_device<float>(&d_langevinParam);
1072  deallocate_device<float>(&d_langScalVelBBK2);
1073  deallocate_device<float>(&d_langScalRandBBK2);
1074  deallocate_device<float>(&d_gaussrand_x);
1075  deallocate_device<float>(&d_gaussrand_y);
1076  deallocate_device<float>(&d_gaussrand_z);
1077  deallocate_device<int>(&d_hydrogenGroupSize);
1078  deallocate_device<float>(&d_rigidBondLength);
1079  deallocate_device<int>(&d_atomFixed);
1080  if (simParams->fixedAtomsOn) {
1081  deallocate_device<int>(&d_groupFixed);
1082  deallocate_device<double>(&d_fixedPosition_x);
1083  deallocate_device<double>(&d_fixedPosition_y);
1084  deallocate_device<double>(&d_fixedPosition_z);
1085  }
1086  deallocate_device<double>(&d_f_saved_nbond_x);
1087  deallocate_device<double>(&d_f_saved_nbond_y);
1088  deallocate_device<double>(&d_f_saved_nbond_z);
1089  deallocate_device<double>(&d_f_saved_slow_x);
1090  deallocate_device<double>(&d_f_saved_slow_y);
1091  deallocate_device<double>(&d_f_saved_slow_z);
1092  }
1093 }
1094 
1095 void SequencerCUDA::deallocateStaticArrays() {
1096  cudaCheck(cudaSetDevice(deviceID));
1097 
1098  deallocate_host<cudaTensor>(&extVirial);
1099  deallocate_host<double3>(&extForce);
1100  deallocate_host<double>(&extEnergy);
1101  deallocate_host<unsigned int>(&h_marginViolations);
1102  deallocate_host<unsigned int>(&h_periodicCellSmall);
1103 
1104  // XXX TODO: Deallocate the additional arrays we added for shmem version
1105  deallocate_host<double3>(&awayDists);
1106  deallocate_host<double3>(&patchMin);
1107  deallocate_host<double3>(&patchMax);
1108  deallocate_host<CudaAtom*>(&cudaAtomLists);
1109  deallocate_host<double3>(&patchCenter);
1110  deallocate_host<int>(&globalToLocalID);
1111  deallocate_host<int>(&patchToDeviceMap);
1112  deallocate_host<Lattice>(&pairlist_lattices);
1113  deallocate_host<double>(&patchMaxAtomMovement);
1114  deallocate_host<double>(&patchNewTolerance);
1115  deallocate_host<CudaMInfo>(&mInfo);
1116  deallocate_host<bool*>(&h_patchRecordHasForces);
1117 
1118  deallocate_host<cudaTensor>(&lpVirialNormal);
1119  deallocate_host<cudaTensor>(&lpVirialNbond);
1120  deallocate_host<cudaTensor>(&lpVirialSlow);
1121 
1122  deallocate_device<double3>(&d_awayDists);
1123  deallocate_device<double3>(&d_patchMin);
1124  deallocate_device<double3>(&d_patchMax);
1125  deallocate_device<int>(&d_globalToLocalID);
1126  deallocate_device<int>(&d_patchToDeviceMap);
1127  deallocate_device<Lattice>(&d_lattices);
1128  deallocate_device<Lattice>(&d_pairlist_lattices);
1129  deallocate_device<double>(&d_patchMaxAtomMovement);
1130  deallocate_device<double>(&d_patchNewTolerance);
1131  deallocate_device<CudaMInfo>(&d_mInfo);
1132 
1133  deallocate_device<int>(&d_killme);
1134  deallocate_device<char>(&d_barrierFlag);
1135  deallocate_device<unsigned int>(&d_tbcatomic);
1136  deallocate_device<BigReal>(&d_kineticEnergy);
1137  deallocate_device<BigReal>(&d_intKineticEnergy);
1138  deallocate_device<BigReal>(&d_momentum_x);
1139  deallocate_device<BigReal>(&d_momentum_y);
1140  deallocate_device<BigReal>(&d_momentum_z);
1141  deallocate_device<BigReal>(&d_angularMomentum_x);
1142  deallocate_device<BigReal>(&d_angularMomentum_y);
1143  deallocate_device<BigReal>(&d_angularMomentum_z);
1144  deallocate_device<cudaTensor>(&d_virial);
1145  deallocate_device<cudaTensor>(&d_intVirialNormal);
1146  deallocate_device<cudaTensor>(&d_intVirialNbond);
1147  deallocate_device<cudaTensor>(&d_intVirialSlow);
1148  deallocate_device<cudaTensor>(&d_lpVirialNormal);
1149  deallocate_device<cudaTensor>(&d_lpVirialNbond);
1150  deallocate_device<cudaTensor>(&d_lpVirialSlow);
1151  deallocate_device<cudaTensor>(&d_rigidVirial);
1152  deallocate_device<cudaTensor>(&d_extVirial);
1153  deallocate_device<double3>(&d_extForce);
1154  deallocate_device<double>(&d_extEnergy);
1155  deallocate_device<SettleParameters>(&sp);
1156  deallocate_device<unsigned int>(&deviceQueue);
1157 
1158  deallocate_device<CudaLocalRecord*>(&d_peer_record);
1159  deallocate_device<bool*>(&d_patchRecordHasForces);
1160 
1161  deallocate_device(&d_fixVirialNormal);
1162  deallocate_device(&d_fixVirialNbond);
1163  deallocate_device(&d_fixVirialSlow);
1164  deallocate_device(&d_fixForceNormal);
1165  deallocate_device(&d_fixForceNbond);
1166  deallocate_device(&d_fixForceSlow);
1167 
1168  if (simParams->useDeviceMigration) {
1169  coll_atomdata_AoS_in.deallocate();
1170  coll_sortSoluteIndex.deallocate();
1171  coll_migrationDestination.deallocate();
1172  }
1173 
1174  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
1175 }
1176 
1177 void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex){
1178  CudaMInfo &m = mInfo[patchIndex];
1179  if (!p->patchMapRead) p->readPatchMap();
1180  for(int x = 0; x < 3; x++){
1181  for(int y = 0; y < 3; y++){
1182  for(int z = 0; z < 3; z++){
1183  // copies migration info over
1184  MigrationInfo *hm = p->mInfo[x][y][z];
1185  if(hm != NULL) m.destPatchID[x][y][z] = hm->destPatchID;
1186  else m.destPatchID[x][y][z] = -1; // let's flag this as -1 for now
1187  }
1188  }
1189  }
1190  if (simParams->useDeviceMigration) {
1191  m.destPatchID[1][1][1] = p->getPatchID();
1192  }
1193 }
1194 
1195 void SequencerCUDA::assembleOrderedPatchList(){
1196  // Assembles the patches on each Pe sharing a device into a device-ordered list
1197  patchList.clear();
1198 
1199  // Handle our own patches
1200  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
1201  HomePatch *p = patchData->devData[deviceIndex].patches[i];
1202  patchList.push_back(p);
1203  }
1204 
1205 
1206  // Do we really need this?
1207 #if 1
1208  patchListHomeAndProxy.clear();
1209  // Set up list of device patches. We need the home patches for everyone
1210  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1211  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1212  const int patchID = localPatches[i].patchID;
1213 
1214 
1215  for(int d = 0; d < CkNumPes(); d++){
1216  PatchMap* pm = PatchMap::ObjectOnPe(d);
1217  HomePatch *patch = pm->homePatch(patchID);
1218  if(patch != NULL) {
1219  patchListHomeAndProxy.push_back(patch);
1220  }
1221  }
1222  }
1223 
1224 #endif
1225 }
1226 
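 // Packs current positions and velocities into the device AoS buffer, then copies
 // each home patch's atoms back to its host FullAtomList.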
1234 void SequencerCUDA::copyAoSDataToHost() {
1235  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1236  patchData = cpdata.ckLocalBranch();
1237 
1238  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1239  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1240 
1241  CUDAMigrationKernel->update_AoS(
1242  numPatchesHome,
1243  patchData->devData[deviceIndex].d_localPatches,
1244  (FullAtom*) d_atomdata_AoS,
1245  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1246  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1247  stream
1248  );
1249 
1250  for (int i = 0; i < integrationPatches.size(); i++) {
1251  const int numAtoms = localPatches[i].numAtoms;
1252  const int offset = localPatches[i].bufferOffset;
1253  HomePatch *patch = integrationPatches[i];
1254  patch->updateAtomCount(numAtoms, false);
1255  patch->updateAtomBuffers();
1256  FullAtomList& h_atomdata = patch->getAtomList();
1257  copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
1258  }
1259  cudaCheck(cudaStreamSynchronize(stream));
1260 }
1261 
1262 
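 // Local stage of GPU-resident migration: compute migration group indices, pack
 // positions/velocities into the AoS buffer, compute each group's destination patch,
 // and perform the device-local part of the atom movement.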
1269 void SequencerCUDA::migrationLocalInit() {
1270  CUDAMigrationKernel->computeMigrationGroupIndex(
1271  numPatchesHome,
1272  patchData->devData[deviceIndex].d_localPatches,
1273  d_migrationGroupSize,
1274  d_migrationGroupIndex,
1275  stream
1276  );
1277 
1278  CUDAMigrationKernel->update_AoS(
1279  numPatchesHome,
1280  patchData->devData[deviceIndex].d_localPatches,
1281  (FullAtom*) d_atomdata_AoS,
1282  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1283  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1284  stream
1285  );
1286 
1287  CUDAMigrationKernel->computeMigrationDestination(
1288  numPatchesHome,
1289  patchData->devData[deviceIndex].d_localPatches,
1290  myLattice,
1291  d_mInfo,
1292  d_patchToDeviceMap,
1293  d_globalToLocalID,
1294  d_patchMin,
1295  d_patchMax,
1296  d_hydrogenGroupSize,
1297  d_migrationGroupSize,
1298  d_migrationGroupIndex,
1299  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1300  coll_migrationDestination.getDevicePtr(),
1301  stream
1302  );
1303 
1304  CUDAMigrationKernel->performLocalMigration(
1305  numPatchesHome,
1306  patchData->devData[deviceIndex].d_localPatches,
1307  (FullAtom*) d_atomdata_AoS,
1308  (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
1309  coll_migrationDestination.getDevicePtr(),
1310  stream
1311  );
1312 
1313  cudaCheck(cudaStreamSynchronize(stream));
1314 }
1315 
1320 void SequencerCUDA::migrationPerform() {
1321  CUDAMigrationKernel->performMigration(
1322  numPatchesHome,
1323  patchData->devData[deviceIndex].d_localPatches,
1324  d_peer_record,
1325  (FullAtom*) d_atomdata_AoS,
1326  coll_atomdata_AoS_in.getDevicePeerPtr(),
1327  d_migrationGroupSize,
1328  d_migrationGroupIndex,
1329  coll_migrationDestination.getDevicePtr(),
1330  stream
1331  );
1332  cudaCheck(cudaStreamSynchronize(stream));
1333 }
1334 
1335 
1336 void SequencerCUDA::migrationUpdateAtomCounts() {
1337  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1338  patchData = cpdata.ckLocalBranch();
1339 
1340  CUDAMigrationKernel->updateLocalRecords(
1341  numPatchesHome,
1342  patchData->devData[deviceIndex].d_localPatches,
1343  d_peer_record,
1344  patchData->devData[deviceIndex].d_peerPatches,
1345  stream
1346  );
1347 
1348  cudaCheck(cudaStreamSynchronize(stream));
1349 }
1350 
1351 void SequencerCUDA::migrationUpdateAtomOffsets() {
1352  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1353  patchData = cpdata.ckLocalBranch();
1354 
1355  CUDAMigrationKernel->updateLocalRecordsOffset(
1356  numPatchesHomeAndProxy,
1357  patchData->devData[deviceIndex].d_localPatches,
1358  stream
1359  );
1360 
1361  cudaCheck(cudaStreamSynchronize(stream));
1362 }
1363 
1364 void SequencerCUDA::migrationUpdateRemoteOffsets() {
1365  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1366  patchData = cpdata.ckLocalBranch();
1367 
1368  CUDAMigrationKernel->updatePeerRecords(
1369  numPatchesHomeAndProxy,
1370  patchData->devData[deviceIndex].d_localPatches,
1371  d_peer_record,
1372  patchData->devData[deviceIndex].d_peerPatches,
1373  stream
1374  );
1375 
1376  cudaCheck(cudaStreamSynchronize(stream));
1377 }
1378 
1379 void SequencerCUDA::migrationUpdateProxyDestination() {
1380  if (mGpuOn) {
1381  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1382  patchData = cpdata.ckLocalBranch();
1383 
1384  // This is implemented as a put instead of a get to avoid the need
1385  // for a node synchronization.
1386  CUDAMigrationKernel->copyMigrationDestinationToProxies(
1387  deviceIndex,
1388  numPatchesHome,
1389  numPatchesHomeAndProxy,
1390  patchData->devData[deviceIndex].d_localPatches,
1391  patchData->devData[deviceIndex].d_peerPatches,
1392  coll_migrationDestination.getDevicePeerPtr(),
1393  stream
1394  );
1395  }
1396 }
1397 
1398 void SequencerCUDA::copyPatchDataToHost() {
1399  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1400  patchData = cpdata.ckLocalBranch();
1401  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1402 
1403  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1404  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1405 
1406  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
1407  cudaCheck(cudaStreamSynchronize(stream));
1408 
1409 
1410  // Update Atom Counts
1411  for (int i = 0; i < numPatchesHome; i++) {
1412  HomePatch* hp = integrationPatches[i];
1413  hp->updateAtomCount(localPatches[i].numAtoms, false);
1414  }
1415  cudaCheck(cudaStreamSynchronize(stream));
1416 }
1417 
1418 // This code path is used only by device migration
1419 void SequencerCUDA::copyAtomDataToDeviceAoS() {
1420  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1421  patchData = cpdata.ckLocalBranch();
1422 
1423  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1424  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1425  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1426 
1427 
1428  for (int i = 0; i < integrationPatches.size(); i++) {
1429  const int numAtoms = localPatches[i].numAtoms;
1430  if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
1431  iout << iERROR << "The number of atoms in patch " << i << " is "
1432  << numAtoms << ", greater than the limit for GPU atom migration ("
1433  << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
1434  NAMD_bug("NAMD has stopped simulating due to the error above, "
1435  "but you could disable GPUAtomMigration and try again.\n");
1436  }
1437  const int offset = localPatches[i].bufferOffset;
1438  HomePatch *patch = integrationPatches[i];
1439  FullAtomList& h_atomdata = patch->getAtomList();
1440  copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), coll_atomdata_AoS_in.getDevicePtr() + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
1441  }
1442  cudaCheck(cudaStreamSynchronize(stream));
1443 }
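// --- Illustrative sketch (added commentary, not NAMD source) ---
// The staging buffer filled above is padded: every home patch owns a fixed slot
// of MigrationCUDAKernel::kMaxAtomsPerPatch entries, so atom j of local patch i
// lands at a simple strided index:
static inline int64_t paddedAoSIndex_sketch(int patchIndex, int atomIndex) {
  return ((int64_t) patchIndex) * MigrationCUDAKernel::kMaxAtomsPerPatch + atomIndex;
}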
1444 
1445 /*
1446  * Aggregates and copies various data from the host to the device
1447  *
1448  * Some data is only copied for home patches, while other data is copied for
1449  * home and proxy patches
1450  *
1451  */
1452 void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
1453 
1454  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
1455  if(copyForces){
1456  switch (maxForceNumber) {
1457  case 2:
1458  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_x, double, stream);
1459  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_y, double, stream);
1460  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_slow_z, double, stream);
1461  case 1:
1462  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_x, double, stream);
1463  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_y, double, stream);
1464  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_nbond_z, double, stream);
1465  case 0:
1466  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_x, double, stream);
1467  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_y, double, stream);
1468  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(f_normal_z, double, stream);
1469  }
1470  }
1471 
1472  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_x, double, stream);
1473  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_y, double, stream);
1474  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(vel_z, double, stream);
1475  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_x, double, stream);
1476  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_y, double, stream);
1477  AGGREGATE_HOME_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(pos_z, double, stream);
1478  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
1479  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(charge, float, stream);
1480  if (simParams->alchOn) {
1481  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(partition, int, stream);
1482  }
1483 
1484  if (simParams->langevinOn) {
1485  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
1486  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
1487  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);
1488  }
1489 
1490  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
1491  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
1492  if (simParams->fixedAtomsOn) {
1493  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
1494  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
1495  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
1496  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);
1497  }
1498  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(sortOrder, int, stream);
1499  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_COLLECTIVE_DEVICE_BUFFER(unsortOrder, int, stream);
1500  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
1501 
1502  if (simParams->monteCarloPressureOn) {
1503  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);
1504  //set up initial mapping for global index to local array index
1505  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
1506  }
1507 }
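// --- Illustrative sketch (added commentary, not NAMD source) ---
// The switch over maxForceNumber in copyAtomDataToDevice() relies on intentional
// fallthrough: requesting level 2 (slow) also aggregates levels 1 (nonbonded)
// and 0 (normal). The same cumulative pattern, reduced to a standalone example:
static void copyForceLevels_sketch(int maxForceNumber) {
  switch (maxForceNumber) {
    case 2: /* aggregate and copy slow forces */      // fallthrough
    case 1: /* aggregate and copy nonbonded forces */ // fallthrough
    case 0: /* aggregate and copy normal forces */    break;
  }
}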
1508 
1509 
1510 void SequencerCUDA::migrationLocalPost(int startup) {
1511  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1512  patchData = cpdata.ckLocalBranch();
1513 
1514  if (simParams->useDeviceMigration) {
1515  if (!startup) {
1516  CUDAMigrationKernel->transformMigratedPositions(
1517  numPatchesHome,
1518  patchData->devData[deviceIndex].d_localPatches,
1519  coll_patchCenter.getDevicePtr(),
1520  (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
1521  myLattice,
1522  stream
1523  );
1524  }
1525 
1526  // sort solvent/solute data
1527  CUDAMigrationKernel->sortSolventAtoms(
1528  numPatchesHome,
1529  patchData->devData[deviceIndex].d_localPatches,
1530  (FullAtom*) coll_atomdata_AoS_in.getDevicePtr(),
1531  (FullAtom*) d_atomdata_AoS,
1532  coll_sortSoluteIndex.getDevicePtr(),
1533  stream
1534  );
1535 
1536  double dt = 1.0;
1537  double kbT = 1.0;
1538  double tempFactor = 1.0;
1539  if (simParams->langevinOn) {
1540  dt = simParams->dt * 0.001; // convert timestep to ps
1541  kbT = BOLTZMANN * simParams->langevinTemp;
1542  int lesReduceTemp = (simParams->lesOn && simParams->lesReduceTemp);
1543  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);
1544  }
1545  CUDAMigrationKernel->copy_AoS_to_SoA(
1546  numPatchesHome, simParams->alchOn,
1547  simParams->langevinOn, dt, kbT, tempFactor,
1548  patchData->devData[deviceIndex].d_localPatches,
1549  (FullAtom*) d_atomdata_AoS,
1550  d_recipMass,
1551  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1552  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1553  d_mass, coll_charge.getDevicePtr(),
1554  coll_idMig.getDevicePtr(), coll_vdwType.getDevicePtr(),
1555  d_hydrogenGroupSize, d_migrationGroupSize,
1556  d_atomFixed,
1557  d_rigidBondLength,
1558  d_transform,
1559  coll_partition.getDevicePtr(),
1560  d_langevinParam,
1561  d_langScalVelBBK2,
1562  d_langScalRandBBK2,
1563  stream
1564  );
1565  }
1566 
1567  // Other migration post processing steps
1568  if (simParams->useDeviceMigration) {
1569  if (simParams->monteCarloPressureOn) {
1570  CUDASequencerKernel->SetAtomIndexOrder(coll_idMig.getDevicePtr(), d_idOrder, numAtomsHome, stream);
1571  }
1572  }
1573 
1574  // JM: Saving positions to these double buffers for pairlistCheck
1575  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posSave_x, numAtomsHome, stream);
1576  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posSave_y, numAtomsHome, stream);
1577  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posSave_z, numAtomsHome, stream);
1578 
1579  // JM NOTE: We need to save the lattice at the beginning of the cycle
1580  // in order to use it in SequencerCUDAKernel::pairlistCheck();
1581  myLatticeOld = myLattice;
1582 
1583  // JM NOTE: Recalculates the centers of mass since we have new positions
1584  CUDASequencerKernel->centerOfMass(
1585  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1586  d_rcm_x, d_rcm_y, d_rcm_z,
1587  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1588  CUDASequencerKernel->centerOfMass(
1589  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1590  d_vcm_x, d_vcm_y, d_vcm_z,
1591  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1592 
1593  cudaCheck(cudaStreamSynchronize(stream));
1594 }
1595 
1596 void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
1597  if(simParams->eFieldOn || simParams->SMDOn || simParams->groupRestraintsOn ||
1598  simParams->monteCarloPressureOn || simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn ||
1599  simParams->colvarsOn ||
1600  simParams->useCudaGlobal ||
1601  simParams->tclForcesOn){
1602 
1603  // Hand-copies the transform field into a char3 SOA buffer
1604  // JM NOTE: We're copying ints into chars because "transform" is an int
1605  // field in PatchDataSOA but a signed char in FullAtom
1606  size_t offset = 0;
1607  for (int i = 0; i < numPatchesHome; i++) {
1608  PatchDataSOA& current = patchList[i]->patchDataSOA;
1609  const int numPatchAtoms = current.numAtoms;
1610  // memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type));
1611  for(int j = 0; j < numPatchAtoms; j++){
1612  transform[offset + j].x = current.transform_i[j];
1613  transform[offset + j].y = current.transform_j[j];
1614  transform[offset + j].z = current.transform_k[j];
1615  }
1616  offset += numPatchAtoms;
1617  }
1618  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);
1619  }
1620 
1621  if (!startup) {
1623  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);
1624  }
1625  if(simParams->constraintsOn) {
1626  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1627  }
1628  if(simParams->SMDOn) {
1629  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1630  }
1631  if(simParams->groupRestraintsOn) {
1632  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1633  }
1634  if(simParams->mgridforceOn || simParams->gridforceOn){
1635 
1636  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
1637  }
1638  if(simParams->consForceOn) {
1639  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1640  }
1641 
1642  }
1643 }
1644 
1645 void SequencerCUDA::migrationUpdateDestination() {
1646  CUDAMigrationKernel->updateMigrationDestination(
1647  numAtomsHomePrev,
1648  coll_migrationDestination.getDevicePtr(),
1649  coll_sortSoluteIndex.getDevicePeerPtr(),
1650  stream
1651  );
1652 }
1653 
1654 bool SequencerCUDA::copyPatchData(
1655  const bool copyIn,
1656  const bool startup
1657 ) {
1658  bool reallocated = false;
1659  if (copyIn) {
1660  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1661  patchData = cpdata.ckLocalBranch();
1662 
1663  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1664 
1665  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1666  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1667 
1668  if (startup) {
1669  // numPatchesHomeAndProxy is set when the patch data is constructed
1670  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1671  numPatchesHome = homePatches.size();
1672  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;
1673 
1674  coll_patchCenter.allocate_no_check(defaultCollectiveBufferType, numPatchesGlobal);
1675 
1676  if (simParams->useDeviceMigration) {
1677  // Padded data structures which will not be reallocated
1678  coll_sortSoluteIndex.allocate(defaultCollectiveBufferType, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch, SynchronousCollectiveScope::master);
1679  coll_atomdata_AoS_in.allocate(defaultCollectiveBufferType, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch, SynchronousCollectiveScope::master);
1680  }
1681 #if defined(NAMD_HIP)
1682  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
1683  sizeof(CudaLocalRecord)*numPatchesHomeAndProxy,
1684  hipDeviceMallocFinegrained);
1685  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
1686  sizeof(CudaPeerRecord)*peerPatches.size(),
1687  hipDeviceMallocFinegrained);
1688 #else
1689  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
1690  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
1691 #endif
1692  if (simParams->useDeviceMigration) {
1693  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);
1694  }
1695 
1696  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1697  numPatchesHomeAndProxy, stream);
1698  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1699  peerPatches.size(), stream);
1700  if(true || mGpuOn) {
1701  this->assembleOrderedPatchList();
1702  }
1703  this->copySettleParameter();
1704 
1705  for (int i = 0; i < numPatchesHome; i++) {
1706  HomePatch *patch = homePatches[i];
1707  this->copyMigrationInfo(patch, i);
1708  patchNewTolerance[i] = 0.5 * ( simParams->pairlistDist - simParams->cutoff);
1709 
1710  globalToLocalID[patch->getPatchID()] = i;
1711  patchToDeviceMap[patch->getPatchID()] = deviceIndex;
1712  }
1713  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
1714  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);
1715 
1716  // The globalToLocalID and patchToDeviceMap data structures need to be system-wide for migration.
1717  // They are also used in tuple migration, so we add them to patchData, where they can be easily
1718  // accessed elsewhere
1719  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
1720  if (i == deviceIndex) continue;
1721  std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
1722  for (int j = 0; j < otherPatches.size(); j++) {
1723  HomePatch *patch = otherPatches[j];
1724  globalToLocalID[patch->getPatchID()] = j;
1725  patchToDeviceMap[patch->getPatchID()] = i;
1726  }
1727  }
1728  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
1729  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
1730  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
1731  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;
1732 
1733  // Allocate more data
1734  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);
1735  }
1736 
1737  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1738  HomePatch *patch = patchListHomeAndProxy[i];
1739  awayDists[i].x = patch->aAwayDist;
1740  awayDists[i].y = patch->bAwayDist;
1741  awayDists[i].z = patch->cAwayDist;
1742  COPY_CUDAVECTOR(patch->center, patchCenter[i]);
1743  COPY_CUDAVECTOR(patch->getMin(), patchMin[i]);
1744  COPY_CUDAVECTOR(patch->getMax(), patchMax[i]);
1745  }
1746 
1747  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
1748  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
1749  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
1750  copy_HtoD<double3>(patchCenter, coll_patchCenter.getDevicePtr(), numPatchesHomeAndProxy, stream);
1751 
1752  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
1753  localPatches[numPatchesHomeAndProxy-1].numAtoms;
1754 
1755  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
1756  localPatches[numPatchesHome-1].numAtoms;
1757 
1758  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);
1759 
1760 
1761  numAtomsHomePrev = numAtomsHome;
1762  numAtomsHomeAndProxy = totalAtomCount;
1763  numAtomsHome = homeAtomCount;
1764 
1765  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;
1766 
1767  if (!startup) {
1768  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1769  numPatchesHomeAndProxy, stream);
1770  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1771  peerPatches.size(), stream);
1772  }
1773  if (startup) {
1774  if (simParams->monteCarloPressureOn) {
1775  //Only at startup, copy the molecule's index info to GPU
1776  Molecule *molecule = Node::Object()->molecule;
1777  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
1778  copy_HtoD<int>(molecule->moleculeStartIndex, d_moleculeStartIndex, molecule->numMolecules + 1, stream);
1779  }
1780  }
1781  }
1782  return reallocated;
1783 }
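// --- Illustrative sketch (added commentary, not NAMD source) ---
// Because each CudaLocalRecord carries a running bufferOffset, the atom count
// covered by records [0, n) is just the last record's offset plus its own atom
// count, which is how copyPatchData() derives homeAtomCount and totalAtomCount.
static inline int atomCountOverRecords_sketch(const std::vector<CudaLocalRecord>& recs) {
  return recs.empty() ? 0 : recs.back().bufferOffset + recs.back().numAtoms;
}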
1784 
1785 void SequencerCUDA::copyDataToPeers(
1786  const bool copyIn
1787 ) {
1788  if (!copyIn) return;
1789  // Positions will be copied by the kernel
1790  // Forces don't need to be copied
1791  // Atom data that needs to be copied: sortOrder, unsortOrder
1792 
1793  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1794  patchData = cpdata.ckLocalBranch();
1795  if (mGpuOn) {
1796  CUDAMigrationKernel->copyDataToProxies(
1797  deviceIndex,
1798  numPatchesHome,
1799  numPatchesHomeAndProxy,
1800  patchData->devData[deviceIndex].d_localPatches,
1801  coll_idMig.getDevicePeerPtr(),
1802  coll_vdwType.getDevicePeerPtr(),
1803  coll_sortOrder.getDevicePeerPtr(),
1804  coll_unsortOrder.getDevicePeerPtr(),
1805  coll_charge.getDevicePeerPtr(),
1806  coll_partition.getDevicePeerPtr(),
1807  coll_patchCenter.getDevicePeerPtr(),
1808  simParams->alchOn,
1809  stream
1810  );
1811  }
1812  cudaCheck(cudaStreamSynchronize(stream));
1813 }
1814 
1815 void SequencerCUDA::migrationSortAtomsNonbonded() {
1816  CUDAMigrationKernel->sortAtoms(
1817  numPatchesHome, numAtomsHome,
1818  patchData->devData[deviceIndex].d_localPatches,
1819  patchMin, patchMax,
1820  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1821  coll_sortOrder.getDevicePtr(),
1822  coll_unsortOrder.getDevicePtr(),
1823  d_sortIndex,
1824  stream
1825  );
1826 }
1827 
1828 void SequencerCUDA::maximumMove(
1829  const double maxvel2,
1830  const int numAtoms)
1831 {
1832  CUDASequencerKernel->maximumMove(
1833  maxvel2, coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1834  killme, numAtoms, stream);
1835 }
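// --- Illustrative sketch (added commentary, not NAMD source) ---
// maximumMove() compares each atom's squared speed against maxvel2 and counts
// offenders in killme; the per-atom test presumably reduces to:
static inline bool velocityTooLarge_sketch(double vx, double vy, double vz, double maxvel2) {
  return vx * vx + vy * vy + vz * vz > maxvel2;
}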
1836 
1837 void SequencerCUDA::submitHalf(
1838  int numAtoms, int part, const bool doEnergy)
1839 {
1840  //BigReal kineticEnergy;
1841  Tensor reduction_virial;
1842  //cudaTensor h_virial;
1843  //BigReal intKineticEnergy;
1844  Tensor reduction_intVirialNormal;
1845  //cudaTensor h_intVirialNormal;
1846  int hgs;
1847 
1848  if(doEnergy){
1849 #if 0
1850  cudaCheck(cudaEventRecord(eventStart,stream));
1851 #endif
1852  CUDASequencerKernel->submitHalf(
1853  simParams->fixedAtomsOn,
1854  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1855  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
1856  d_kineticEnergy, d_intKineticEnergy,
1857  d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
1858  virial_half, intVirialNormal_half,
1859  d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);
1860 #if 0
1861  cudaCheck(cudaEventRecord(eventStop, stream));
1862  cudaCheck(cudaEventSynchronize(eventStop));
1863  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
1864  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
1865  t_submitReductions2 = 0;
1866 #endif
1867  }
1868 }
1869 
1870 void SequencerCUDA::submitReductions(
1871  BigReal origin_x,
1872  BigReal origin_y,
1873  BigReal origin_z,
1874  int marginViolations,
1875  int doEnergy,
1876  int doMomentum,
1877  int numAtomsReduction,
1878  int maxForceNumber)
1879 {
1880  // reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsReduction; //moved to launch_part2, startRun2
1881  // where do I get the margin violations?
1882  // reduction->item(REDUCTION_MARGIN_VIOLATIONS) += marginViolations;
1883  if(doEnergy){
1884  if(doMomentum){
1885  // JM NOTE: Calculates momenta if copyOut
1886  CUDASequencerKernel->submitReduction1(
1887  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1888  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_mass,
1889  d_kineticEnergy,
1890  d_momentum_x, d_momentum_y, d_momentum_z,
1891  d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1892  origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1893  momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1894  numAtomsReduction, stream);
1895  }
1896  Tensor regintVirialNormal;
1897  Tensor regintVirialNbond;
1898  Tensor regintVirialSlow;
1899 
1900 #if 0
1901  cudaCheck(cudaEventRecord(eventStart,stream));
1902 #endif
1903  CUDASequencerKernel->submitReduction2(
1904  simParams->fixedAtomsOn, d_atomFixed,
1905  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1906  d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1907  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
1908  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
1909  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
1910  d_mass, d_hydrogenGroupSize,
1911  d_kineticEnergy, kineticEnergy,
1912  d_intKineticEnergy, intKineticEnergy,
1913  d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1914  intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1915  d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);
1916  // Haochuan: actually should be doVirial here
1917  if (simParams->fixedAtomsOn) {
1918  CUDASequencerKernel->calcFixVirial(
1919  maxForceNumber, numAtomsReduction, d_atomFixed,
1920  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1921  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
1922  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
1923  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
1924  d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1925  d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1926  }
1927  }
1928 #if 0
1929  cudaCheck(cudaEventRecord(eventStop, stream));
1930  cudaCheck(cudaEventSynchronize(eventStop));
1931  cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1932  fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1933  t_submitReductions2 = 0;
1934 #endif
1935 }
1936 
1937 void SequencerCUDA::copySettleParameter(){
1938  // Searching for a patch that contains initialized settle parameters
1939  cudaCheck(cudaSetDevice(deviceID));
1940  if(simParams->rigidBonds != RIGID_NONE){
1941  HomePatch *patch = NULL;
1942  // PatchList contains all patches in the node, so if there's a single water in the system,
1943  // this is guaranteed to catch it
1944  for(int i = 0; i < patchList.size(); i++){
1945  if(patchList[i]->settle_initialized) {
1946  patch = patchList[i];
1947  break;
1948  }
1949  }
1950  if ( patch ) {
1951  SettleParameters h_sp;
1952  h_sp.mO = patch->settle_mO;
1953  h_sp.mH = patch->settle_mH;
1954  h_sp.mOrmT = patch->settle_mOrmT;
1955  h_sp.mHrmT = patch->settle_mHrmT;
1956  h_sp.rra = patch->settle_rra;
1957  h_sp.ra = patch->settle_ra;
1958  h_sp.rb = patch->settle_rb;
1959  h_sp.rc = patch->settle_rc;
1960  h_sp.r_om = patch->r_om;
1961  h_sp.r_ohc = patch->r_ohc;
1962 
1963  // fprintf(stderr, "SETTLEPARAMETER Found: Values %lf %lf %lf %lf %lf %lf %lf %lf\n",
1964  // h_sp.mO, h_sp.mH, h_sp.mOrmT, h_sp.mHrmT, h_sp.rra, h_sp.ra, h_sp.rb, h_sp.rc );
1965  copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
1966  }
1967  }
1968 }
1969 
1970 // Does rattle1_SOA
1971 void SequencerCUDA::startRun1(
1972  int maxForceNumber,
1973  const Lattice& lattice
1974 ) {
1975  // cudaCheck(cudaSetDevice(deviceID));
1976  myLattice = lattice;
1977 
1978  // JM: Enforcing rigid bonds on first iteration
1979  CUDASequencerKernel->rattle1(
1980  simParams->fixedAtomsOn, 1, 0,
1981  numAtomsHome, 0.f, 0.f,
1982  2.0 * simParams->rigidTol,
1983  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
1984  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
1985  d_velNew_x, d_velNew_y, d_velNew_z,
1986  d_posNew_x, d_posNew_y, d_posNew_z,
1987  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
1988  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
1989  &settleList, settleListSize, &d_consFailure,
1990  d_consFailureSize, &rattleList, rattleListSize,
1991  &nSettle, &nRattle,
1992  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
1993  buildRigidLists, consFailure, simParams->watmodel, stream);
1994 
1995  this->copyPositionsAndVelocitiesToHost(1, 0);
1996  cudaCheck(cudaDeviceSynchronize());
1997 #if 0
1998  printSOAPositionsAndVelocities();
1999 #endif
2000 }
2001 
2002 void SequencerCUDA::startRun2(
2003  double dt_normal,
2004  double dt_nbond,
2005  double dt_slow,
2006  Vector origin,
2007  int doGlobal,
2008  int maxForceNumber
2009 ){
2010  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2011 
2012  // This is a patch-based kernel, which means each threadblock deals with an entire patch.
2013  // So, in the multi-GPU case, we want to keep non-offset pointers but only
2014  // deal with a handful of patches
2015 
2016  // We don't need to (and should not) set the normal forces to zero here:
2017  // they store the global forces, and the normal forces are overwritten
2018  // every step rather than accumulated into
2019  // cudaCheck(cudaMemset(coll_f_normal_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2020  // cudaCheck(cudaMemset(coll_f_normal_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2021  // cudaCheck(cudaMemset(coll_f_normal_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2022 
2023  cudaCheck(cudaMemset(coll_f_nbond_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2024  cudaCheck(cudaMemset(coll_f_nbond_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2025  cudaCheck(cudaMemset(coll_f_nbond_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2026 
2027  cudaCheck(cudaMemset(coll_f_slow_x.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2028  cudaCheck(cudaMemset(coll_f_slow_y.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2029  cudaCheck(cudaMemset(coll_f_slow_z.getDevicePtr(), 0, sizeof(double)*numAtomsHomeAndProxy));
2030 
2031  CUDASequencerKernel->accumulateForceToSOA(
2032  doGlobal,
2033  simParams->useCudaGlobal,
2034  maxForceNumber,
2035  numPatchesHomeAndProxy,
2036  nDevices,
2037  patchData->devData[deviceIndex].d_localPatches,
2038  patchData->devData[deviceIndex].f_bond,
2039  patchData->devData[deviceIndex].f_bond_nbond,
2040  patchData->devData[deviceIndex].f_bond_slow,
2041  patchData->devData[deviceIndex].forceStride,
2042  patchData->devData[deviceIndex].f_nbond,
2043  patchData->devData[deviceIndex].f_nbond_slow,
2044  patchData->devData[deviceIndex].f_slow,
2045  d_f_global_x,
2046  d_f_global_y,
2047  d_f_global_z,
2048  coll_f_normal_x.getDevicePtr(),
2049  coll_f_normal_y.getDevicePtr(),
2050  coll_f_normal_z.getDevicePtr(),
2051  coll_f_nbond_x.getDevicePtr(),
2052  coll_f_nbond_y.getDevicePtr(),
2053  coll_f_nbond_z.getDevicePtr(),
2054  coll_f_slow_x.getDevicePtr(),
2055  coll_f_slow_y.getDevicePtr(),
2056  coll_f_slow_z.getDevicePtr(),
2057  coll_unsortOrder.getDevicePtr(),
2058  myLattice,
2059  patchData->d_queues,
2060  patchData->d_queueCounters,
2061  d_tbcatomic,
2062  stream
2063  );
2064  if(mGpuOn){
2065  if(SMDKernel)
2066  {
2067  // compute distributed center of mass for groups of atoms
2068  SMDKernel->computeCOMMGpu(myLattice, d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2069  d_transform, stream);
2070  }
2071  if(groupRestraintsKernel)
2072  {
2073  groupRestraintsKernel->doCOM_mgpu(myLattice, d_transform,
2074  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2075  stream);
2076  }
2077  // Synchronize device before node barrier
2078  cudaCheck(cudaDeviceSynchronize());
2079  }
2080 #if 0
2081  printSOAPositionsAndVelocities();
2082 #endif
2083 }
2084 
2085 void SequencerCUDA::startRun3(
2086  double dt_normal,
2087  double dt_nbond,
2088  double dt_slow,
2089  Vector origin,
2090  const bool requestGlobalForces,
2091  int doGlobalMasterStaleForces,
2092  const bool requestForcesOutput,
2093  const bool requestGlobalForcesGPU,
2094  int maxForceNumber
2095 ){
2096  const bool doFixed = simParams->fixedAtomsOn;
2097  if(mGpuOn){
2098  // XXX TODO we need to call the force merging kernel here
2099 #if 1
2100  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2101  std::vector<int> atom_counts;
2102  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2103  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2104  }
2105  CUDASequencerKernel->mergeForcesFromPeers(
2106  deviceIndex,
2107  maxForceNumber,
2108  myLattice,
2109  numPatchesHomeAndProxy,
2110  numPatchesHome,
2111  this->coll_f_normal_x.getDevicePeerPtr(),
2112  this->coll_f_normal_y.getDevicePeerPtr(),
2113  this->coll_f_normal_z.getDevicePeerPtr(),
2114  this->coll_f_nbond_x.getDevicePeerPtr(),
2115  this->coll_f_nbond_y.getDevicePeerPtr(),
2116  this->coll_f_nbond_z.getDevicePeerPtr(),
2117  this->coll_f_slow_x.getDevicePeerPtr(),
2118  this->coll_f_slow_y.getDevicePeerPtr(),
2119  this->coll_f_slow_z.getDevicePeerPtr(),
2120  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2121  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2122  patchData->devData[deviceIndex].d_localPatches,
2123  patchData->devData[deviceIndex].d_peerPatches,
2124  atom_counts,
2125  stream
2126  );
2127 #else
2128 
2129  // Before I call nccl, let's see the forces here
2130  // ncclAllReduce(coll_f_normal_x.getDevicePtr(), coll_f_normal_x.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2131 
2132  // ncclAllReduce(coll_f_normal_y.getDevicePtr(), coll_f_normal_y.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2133  // ncclAllReduce(coll_f_normal_z.getDevicePtr(), coll_f_normal_z.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2134  // ncclAllReduce(coll_f_nbond_x.getDevicePtr(), coll_f_nbond_x.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2135  // ncclAllReduce(coll_f_nbond_y.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2136  // ncclAllReduce(coll_f_nbond_z.getDevicePtr(), coll_f_nbond_z.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2137  // ncclAllReduce(coll_f_slow_x.getDevicePtr(), coll_f_slow_x.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2138  // ncclAllReduce(coll_f_slow_y.getDevicePtr(), coll_f_slow_y.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2139  // ncclAllReduce(coll_f_slow_z.getDevicePtr(), coll_f_slow_z.getDevicePtr(), numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2140  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2141  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2142 #endif
2143  }
2144  if(doGlobalMasterStaleForces)
2145  {
2146  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
2147  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
2148  computeGlobalMasterVirial(
2149  numPatchesHomeAndProxy,
2150  numAtomsHome,
2151  patchData->devData[deviceIndex].d_localPatches,
2152  coll_pos_x.getDevicePtr(),
2153  coll_pos_y.getDevicePtr(),
2154  coll_pos_z.getDevicePtr(),
2155  d_transform,
2156  d_f_global_x,
2157  d_f_global_y,
2158  d_f_global_z,
2159  &d_extForce[EXT_GLOBALMTS],
2160  &extForce[EXT_GLOBALMTS],
2161  &d_extVirial[EXT_GLOBALMTS],
2162  &extVirial[EXT_GLOBALMTS],
2163  myLattice,
2164  d_tbcatomic,
2165  stream);
2166  }
2167 
2168  // do the external force calculations and store the energy and virial
2169  calculateExternalForces(simParams->firstTimestep, maxForceNumber, 1, 1);
2170 #if 0
2171  cudaCheck(cudaDeviceSynchronize());
2172  if(true || deviceID == 0){
2173  char prefix[10];
2174  snprintf(prefix, 10, "step-%d",0);
2175  this->printSOAForces(prefix);
2176  }
2177 #endif
2178 
2179  CUDASequencerKernel->addForceToMomentum(
2180  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2181  d_recipMass,
2182  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2183  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2184  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2185  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2186  numAtomsHome, maxForceNumber, stream);
2187 
2188  CUDASequencerKernel->rattle1(
2189  simParams->fixedAtomsOn,1, 0,
2190  numAtomsHome, -dt_normal, -1.0/(dt_normal),
2191  2.0 * simParams->rigidTol,
2192  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2193  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2194  d_velNew_x, d_velNew_y, d_velNew_z,
2195  d_posNew_x, d_posNew_y, d_posNew_z,
2196  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2197  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2198  &settleList, settleListSize, &d_consFailure,
2199  d_consFailureSize, &rattleList, rattleListSize,
2200  &nSettle, &nRattle,
2201  d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2202  true, consFailure, simParams->watmodel, stream);
2203 
2204  CUDASequencerKernel->centerOfMass(
2205  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2206  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2207  d_hydrogenGroupSize, numAtomsHome, stream);
2208 
2209 
2210  {
2211  // SubmitHalf and its corresponding reductions
2212  submitHalf(numAtomsHome, 1, 1);
2213  // submitHalf reductions
2214  cudaCheck(cudaStreamSynchronize(stream));
2215  Tensor reduction_virial;
2216  Tensor reduction_intVirialNormal;
2217  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2218  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2219  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2220  // Haochuan: the tensor is not symmetric when there are fixed atoms
2221  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2222  reduction_virial *= 0.5;
2223  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2224  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2225  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2226  += (intKineticEnergy_half[0] * 0.25);
2227  reduction_intVirialNormal *= 0.5;
2228  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2229  reduction_intVirialNormal);
2230  }
2231 
2232  CUDASequencerKernel->addForceToMomentum(
2233  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2234  d_recipMass,
2235  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2236  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2237  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2238  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2239  numAtomsHome, maxForceNumber, stream);
2240 
2241  CUDASequencerKernel->rattle1(
2242  simParams->fixedAtomsOn, 1, 1,
2243  numAtomsHome, dt_normal, 1.0/dt_normal,
2244  2.0 * simParams->rigidTol,
2245  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2246  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2247  d_velNew_x, d_velNew_y, d_velNew_z,
2248  d_posNew_x, d_posNew_y, d_posNew_z,
2249  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2250  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2251  &settleList, settleListSize, &d_consFailure,
2252  d_consFailureSize, &rattleList, rattleListSize,
2253  &nSettle, &nRattle,
2254  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2255  buildRigidLists, consFailure, simParams->watmodel, stream);
2256 
2257  CUDASequencerKernel->centerOfMass(
2258  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2259  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2260  d_hydrogenGroupSize, numAtomsHome, stream);
2261 
2262  {
2263  // JM: SubmitHalf and its corresponding reductions
2264  submitHalf(numAtomsHome, 1, 1);
2265  // submitHalf reductions
2266  cudaCheck(cudaStreamSynchronize(stream));
2267  Tensor reduction_virial;
2268  Tensor reduction_intVirialNormal;
2269  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2270  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2271  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2272  // Haochuan: the tensor is not symmetric when there are fixed atoms
2273  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2274  reduction_virial *= 0.5;
2275  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2276  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2277  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2278  += (intKineticEnergy_half[0] * 0.25);
2279  reduction_intVirialNormal *= 0.5;
2280  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2281  reduction_intVirialNormal);
2282  }
2283 
2284  CUDASequencerKernel->addForceToMomentum(
2285  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2286  d_recipMass,
2287  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2288  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2289  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2290  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2291  numAtomsHome, maxForceNumber, stream);
2292 
2293  if(requestGlobalForces || requestForcesOutput) {
2294  // store the forces for the next step,
2295  // when we need them for colvars and Tcl scripting
2296  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2297  }
2298 
2299  if (requestGlobalForcesGPU) {
2300  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2301  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2302  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2303  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2304  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2305  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2306  CUDASequencerKernel->copyForcesToDevice(
2307  numAtomsHome, coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2308  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2309  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2310  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2311  // cudaCheck(cudaStreamSynchronize(stream));
2312  }
2313 
2314  CUDASequencerKernel->centerOfMass(
2315  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2316  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2317  d_hydrogenGroupSize, numAtomsHome, stream);
2318 
2319  submitReductions(origin.x, origin.y, origin.z,
2320  marginViolations, 1,
2321  1,
2322  numAtomsHome, maxForceNumber);
2323 
2324  copyPositionsAndVelocitiesToHost(1, 0);
2325 
2326  if(consFailure[0]){
2327  // Constraint failure. Abort.
2328  int dieOnError = simParams->rigidDie;
2329  if(dieOnError){
2330  // Bails out
2331  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
2332  NAMD_die("constraint failure during CUDA rattle!\n");
2333  }else{
2334  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2335  }
2336  }else if(1){
2337  cudaCheck(cudaStreamSynchronize(stream));
2338  if (doGlobalMasterStaleForces) {
2339  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
2340  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
2341  // PRINT_CUDATENSOR(extVirial[EXT_GLOBALMTS], iout);
2342  }
2343  if(simParams->rigidBonds != RIGID_NONE){
2344  Tensor reduction_rigidVirial;
2345  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
2346  // Haochuan: the tensor is not symmetric when there are fixed atoms
2347  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2348  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
2349  }
2350 
2351  //submitReductions1
2352  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
2353  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2354  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
2355  Vector angularMomentum(*angularMomentum_x,
2356  *angularMomentum_y,
2357  *angularMomentum_z);
2358  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
2359  //submitReductions2
2360  Tensor regintVirialNormal;
2361  Tensor regintVirialNbond;
2362  Tensor regintVirialSlow;
2363  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
2364  if (maxForceNumber >= 1) {
2365  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
2366  }
2367  if (maxForceNumber >= 2) {
2368  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
2369  }
2370 
2371  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
2372  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
2373  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
2374  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
2375 
2376  if (simParams->fixedAtomsOn) {
2377  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2378  double3 fixForceNormal, fixForceNbond, fixForceSlow;
2379  switch (maxForceNumber) {
2380  case 2: {
2381  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2382  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2383  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
2384  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
2385  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
2386  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2387  } // intentional fallthrough
2388  case 1: {
2389  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2390  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2391  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
2392  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
2393  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
2394  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2395  } // intentional fallthrough
2396  default: {
2397  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2398  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2399  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
2400  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
2401  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
2402  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2403  }
2404  }
2405 #if 0
2406  auto printTensor = [](const cudaTensor& t, const std::string& name){
2407  CkPrintf("%s", name.c_str());
2408  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2409  "%12.5lf %12.5lf %12.5lf\n"
2410  "%12.5lf %12.5lf %12.5lf\n",
2411  t.xx, t.xy, t.xz,
2412  t.yx, t.yy, t.yz,
2413  t.zx, t.zy, t.zz);
2414  };
2415  printTensor(fixVirialNormal, "fixVirialNormal = ");
2416  printTensor(fixVirialNbond, "fixVirialNbond = ");
2417  printTensor(fixVirialSlow, "fixVirialSlow = ");
2418 #endif
2419 
2420  }
2421  }
2422 
2423 #if 0
2424  if(deviceID == 0){
2425  this->printSOAForces(NULL);
2426  }
2427 #endif
2428 
2429 #if 0
2430  printSOAPositionsAndVelocities();
2431 #endif
2432 }
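// --- Illustrative sketch (added commentary, not NAMD source) ---
// Interpretation of the kick sequence in startRun3(): the three addForceToMomentum
// calls use scales -0.5, +1.0, -0.5 so that submitHalf() can reduce the half-step
// kinetic energies at v(t - dt/2) and v(t + dt/2) before v(t) is restored for the
// first real timestep. On a single scalar degree of freedom:
static void startupKick_sketch(double& v, double f_over_m, double dt) {
  v -= 0.5 * dt * f_over_m;   // v(t - dt/2): first submitHalf() reduction
  v += 1.0 * dt * f_over_m;   // v(t + dt/2): second submitHalf() reduction
  v -= 0.5 * dt * f_over_m;   // back to v(t)
}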
2433 
2434 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2435 {
2436  // Restore the myLattice
2437  myLattice = lattice;
2438  double *temp;
2439 
2440  // Restore positions and forces
2441  // copy "MC" arrays into standard arrays
2442  copy_DtoD<double>(d_f_normalMC_x, coll_f_normal_x.getDevicePtr(), numAtomsHome, stream);
2443  copy_DtoD<double>(d_f_normalMC_y, coll_f_normal_y.getDevicePtr(), numAtomsHome, stream);
2444  copy_DtoD<double>(d_f_normalMC_z, coll_f_normal_z.getDevicePtr(), numAtomsHome, stream);
2445  copy_DtoD<double>(d_f_nbondMC_x, coll_f_nbond_x.getDevicePtr(), numAtomsHome, stream);
2446  copy_DtoD<double>(d_f_nbondMC_y, coll_f_nbond_y.getDevicePtr(), numAtomsHome, stream);
2447  copy_DtoD<double>(d_f_nbondMC_z, coll_f_nbond_z.getDevicePtr(), numAtomsHome, stream);
2448  copy_DtoD<double>(d_f_slowMC_x, coll_f_slow_x.getDevicePtr(), numAtomsHome, stream);
2449  copy_DtoD<double>(d_f_slowMC_y, coll_f_slow_y.getDevicePtr(), numAtomsHome, stream);
2450  copy_DtoD<double>(d_f_slowMC_z, coll_f_slow_z.getDevicePtr(), numAtomsHome, stream);
2451 #ifdef NAMD_NCCL_ALLREDUCE
2452  if(mGpuOn) {
2453  copy_DtoD<double>(d_posMC_x, d_posNew_x, numAtomsHome, stream);
2454  copy_DtoD<double>(d_posMC_y, d_posNew_y, numAtomsHome, stream);
2455  copy_DtoD<double>(d_posMC_z, d_posNew_z, numAtomsHome, stream);
2456  } else {
2457  copy_DtoD<double>(d_posMC_x, coll_pos_x.getDevicePtr(), numAtomsHome, stream);
2458  copy_DtoD<double>(d_posMC_y, coll_pos_y.getDevicePtr(), numAtomsHome, stream);
2459  copy_DtoD<double>(d_posMC_z, coll_pos_z.getDevicePtr(), numAtomsHome, stream);
2460  }
2461 #else
2462  copy_DtoD<double>(d_posMC_x, coll_pos_x.getDevicePtr(), numAtomsHome, stream);
2463  copy_DtoD<double>(d_posMC_y, coll_pos_y.getDevicePtr(), numAtomsHome, stream);
2464  copy_DtoD<double>(d_posMC_z, coll_pos_z.getDevicePtr(), numAtomsHome, stream);
2465 #endif
2466 }
2467 
2468 void SequencerCUDA::monteCarloPressure_accept(
2469  const int doMigration)
2470 {
2471  // do we need to update the centers of mass?
2472  CUDASequencerKernel->centerOfMass(
2473  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2474  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2475  d_hydrogenGroupSize, numAtomsHome, stream);
2476 
2477  // Add half step kinetic contribution to energy, intEnergy, virial, intVirial,
2478  // calculated by submitHalf in launch_part11
2479  Tensor reduction_virial;
2480  Tensor reduction_intVirialNormal;
2481  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2482  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2483  // Haochuan: the tensor is not symmetric when there are fixed atoms
2484  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2485  reduction_virial *= 0.5;
2486  reduction_intVirialNormal *= 0.5;
2487  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2488  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2489  reduction_intVirialNormal);
2490  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2491  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY) += (intKineticEnergy_half[0] * 0.25);
2492 
2493  // If this was a migration step and the move was accepted, we need to update
2494  // myLatticeOld in order to use it in SequencerCUDAKernel::pairlistCheck()
2495  if(doMigration) {
2496  myLatticeOld = myLattice;
2497  }
2498 }
2499 
2500 void SequencerCUDA::monteCarloPressure_part1(
2501  Tensor &factor,
2502  Vector &origin,
2503  Lattice &oldLattice)
2504 {
2505  // Backup positions and forces
2506  copy_DtoD<double>(coll_f_normal_x.getDevicePtr(), d_f_normalMC_x, numAtomsHome, stream);
2507  copy_DtoD<double>(coll_f_normal_y.getDevicePtr(), d_f_normalMC_y, numAtomsHome, stream);
2508  copy_DtoD<double>(coll_f_normal_z.getDevicePtr(), d_f_normalMC_z, numAtomsHome, stream);
2509  copy_DtoD<double>(coll_f_nbond_x.getDevicePtr(), d_f_nbondMC_x, numAtomsHome, stream);
2510  copy_DtoD<double>(coll_f_nbond_y.getDevicePtr(), d_f_nbondMC_y, numAtomsHome, stream);
2511  copy_DtoD<double>(coll_f_nbond_z.getDevicePtr(), d_f_nbondMC_z, numAtomsHome, stream);
2512  copy_DtoD<double>(coll_f_slow_x.getDevicePtr(), d_f_slowMC_x, numAtomsHome, stream);
2513  copy_DtoD<double>(coll_f_slow_y.getDevicePtr(), d_f_slowMC_y, numAtomsHome, stream);
2514  copy_DtoD<double>(coll_f_slow_z.getDevicePtr(), d_f_slowMC_z, numAtomsHome, stream);
2515 #ifdef NAMD_NCCL_ALLREDUCE
2516  if(mGpuOn) {
2517  copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2518  copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2519  copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2520  } else {
2521  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posMC_x, numAtomsHome, stream);
2522  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posMC_y, numAtomsHome, stream);
2523  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posMC_z, numAtomsHome, stream);
2524  }
2525 #else
2526  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2527  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posMC_x, numAtomsHome, stream);
2528  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posMC_y, numAtomsHome, stream);
2529  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posMC_z, numAtomsHome, stream);
2530 #endif
2531 
2532  // Scale the old lattice by factor. We need both the old and the new lattice
2533  // to properly unwrap and wrap the atoms' coordinates
2534  Lattice newLattice = oldLattice;
2535  newLattice.rescale(factor);
2536  cudaTensor cuFactor;
2537  cudaVector cuOrigin;
2538  COPY_CUDATENSOR(factor, cuFactor);
2539  COPY_CUDAVECTOR(origin, cuOrigin);
2540 
2541  // Scale the coordinates using each molecule's geometric center
2542  Molecule *molecule = Node::Object()->molecule;
2543  CUDASequencerKernel->scaleCoordinateUsingGC(
2544  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_idOrder, d_moleculeStartIndex,
2545  d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2546  d_transform, molecule->numMolecules, molecule->numLargeMolecules,
2547  stream);
2548 
2549  // Update the cuda lattice with newLattice for force calculation
2550  myLattice = newLattice;
2551 
2552  // Set up compute position before calling bonded and nonbonded kernel
2553  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2554  ComputeNonbondedUtil::dielectric_1);
2555  bool doNbond = patchData->flags.doNonbonded;
2556  bool doSlow = patchData->flags.doFullElectrostatics;
2557  bool doFEP = false;
2558  bool doTI = false;
2559  bool doAlchDecouple = false;
2560  bool doAlchSoftCore = false;
2561  if (simParams->alchOn) {
2562  if (simParams->alchFepOn) doFEP = true;
2563  if (simParams->alchThermIntOn) doTI = true;
2564  if (simParams->alchDecouple) doAlchDecouple = true;
2565  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2566  }
2567 
2568  if (Node::Object()->molecule->is_lonepairs_psf) {
2569  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
2570  }
2571 
2572  bool usePatchPme = false;
2573  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2574  // This checks whether the current lattice is compatible with the patch-level PME kernels.
2575  // The check needs to be redone every time the lattice changes; the results are stored in
2576  // the cudaPme object, and the overall compatibility is reported by the compatible()
2577  // function.
2578  //
2579  // The behavior of the PME position setup differs depending on the kernels being used,
2580  // so that value (usePatchPme) needs to be passed to the kernel object.
2582  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
2584  myLattice,
2585  getNumPatchesHome(),
2586  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
2587  getHostPatchMin(),
2588  getHostPatchMax(),
2589  getHostAwayDists()
2590  );
2591  usePatchPme = cudaPme->patchLevelPmeData.compatible();
2592  }
2593 
2594  std::vector<int> atom_counts;
2595  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2596  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2597  }
2598  CUDASequencerKernel->set_compute_positions(
2599  deviceIndex,
2601  nDevices,
2602  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2603  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2604 #ifdef NAMD_NCCL_ALLREDUCE
2605  (mGpuOn) ? d_posNew_x: coll_pos_x.getDevicePtr(),
2606  (mGpuOn) ? d_posNew_y: coll_pos_y.getDevicePtr(),
2607  (mGpuOn) ? d_posNew_z: coll_pos_z.getDevicePtr(),
2608 #else
2609  coll_pos_x.getDevicePtr(),
2610  coll_pos_y.getDevicePtr(),
2611  coll_pos_z.getDevicePtr(),
2612  coll_pos_x.getDevicePeerPtr(), // passes double-pointer if mgpuOn
2613  coll_pos_y.getDevicePeerPtr(),
2614  coll_pos_z.getDevicePeerPtr(),
2615  coll_charge.getDevicePeerPtr(),
2616  coll_partition.getDevicePeerPtr(),
2617 #endif
2618  coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
2619  coll_patchCenter.getDevicePtr(),
2620  patchData->devData[deviceIndex].slow_patchPositions,
2621  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2622  coll_sortOrder.getDevicePtr(), newLattice,
2623  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2624  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2626  patchData->devData[deviceIndex].d_localPatches,
2627  patchData->devData[deviceIndex].d_peerPatches,
2628  atom_counts,
2629  stream);
2630 
2631  cudaCheck(cudaStreamSynchronize(stream));
2632 }
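// Annotation (not part of SequencerCUDA.C): the scaleCoordinateUsingGC kernel above
// rescales coordinates about each molecule's geometric center rather than per atom,
// so intramolecular geometry is preserved during the MC volume trial move. A minimal,
// host-side sketch of that idea with illustrative names and an isotropic factor:
#include <array>
#include <vector>

using Vec3 = std::array<double, 3>;

// Shift every atom of a molecule rigidly so that the molecule's geometric center
// moves from gc to origin + factor * (gc - origin); bond lengths are unchanged.
static void scaleAboutGeometricCenters(std::vector<Vec3>& pos,
                                       const std::vector<std::vector<int>>& molecules,
                                       double factor, const Vec3& origin) {
  for (const auto& mol : molecules) {
    Vec3 gc{0.0, 0.0, 0.0};
    for (int a : mol)
      for (int d = 0; d < 3; ++d) gc[d] += pos[a][d] / mol.size();
    for (int a : mol)
      for (int d = 0; d < 3; ++d)
        pos[a][d] += (factor - 1.0) * (gc[d] - origin[d]);  // rigid shift of the whole molecule
  }
}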
2633 
2634 void SequencerCUDA::monteCarloPressure_part2(
2635  int step,
2636  int maxForceNumber,
2637  const bool doEnergy,
2638  const bool doGlobal,
2639  const bool doVirial)
2640 {
2641  // We zeroed all reduction values in part1, so the atom checksum needs to be added back here
2642  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2643 
2644  if(mGpuOn){
2645 #ifdef NAMD_NCCL_ALLREDUCE
2646  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2647 #endif
2648  }
2649  //Update SOA buffer
2650  CUDASequencerKernel->accumulateForceToSOA(
2651  doGlobal,
2652  simParams->useCudaGlobal,
2653  maxForceNumber,
2654  numPatchesHomeAndProxy,
2655  nDevices,
2656  patchData->devData[deviceIndex].d_localPatches,
2657  patchData->devData[deviceIndex].f_bond,
2658  patchData->devData[deviceIndex].f_bond_nbond,
2659  patchData->devData[deviceIndex].f_bond_slow,
2660  patchData->devData[deviceIndex].forceStride,
2661  patchData->devData[deviceIndex].f_nbond,
2662  patchData->devData[deviceIndex].f_nbond_slow,
2663  patchData->devData[deviceIndex].f_slow,
2664  d_f_global_x,
2665  d_f_global_y,
2666  d_f_global_z,
2667  coll_f_normal_x.getDevicePtr(),
2668  coll_f_normal_y.getDevicePtr(),
2669  coll_f_normal_z.getDevicePtr(),
2670  coll_f_nbond_x.getDevicePtr(),
2671  coll_f_nbond_y.getDevicePtr(),
2672  coll_f_nbond_z.getDevicePtr(),
2673  coll_f_slow_x.getDevicePtr(),
2674  coll_f_slow_y.getDevicePtr(),
2675  coll_f_slow_z.getDevicePtr(),
2676  coll_unsortOrder.getDevicePtr(),
2677  myLattice,
2678  patchData->d_queues,
2679  patchData->d_queueCounters,
2680  d_tbcatomic,
2681  stream
2682  );
2683  if(mGpuOn){
2684 #ifndef NAMD_NCCL_ALLREDUCE
2685  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2686  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
2687  // 0, deviceCUDA->getNcclComm(), stream);
2688  std::vector<int> atom_counts;
2689  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2690  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2691  }
2692  CUDASequencerKernel->mergeForcesFromPeers(
2693  deviceIndex,
2694  maxForceNumber,
2695  myLattice,
2696  numPatchesHomeAndProxy,
2697  numPatchesHome,
2698  this->coll_f_normal_x.getDevicePeerPtr(),
2699  this->coll_f_normal_y.getDevicePeerPtr(),
2700  this->coll_f_normal_z.getDevicePeerPtr(),
2701  this->coll_f_nbond_x.getDevicePeerPtr(),
2702  this->coll_f_nbond_y.getDevicePeerPtr(),
2703  this->coll_f_nbond_z.getDevicePeerPtr(),
2704  this->coll_f_slow_x.getDevicePeerPtr(),
2705  this->coll_f_slow_y.getDevicePeerPtr(),
2706  this->coll_f_slow_z.getDevicePeerPtr(),
2707  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2708  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2709  patchData->devData[deviceIndex].d_localPatches,
2710  patchData->devData[deviceIndex].d_peerPatches,
2711  atom_counts,
2712  stream
2713  );
2714 #else
2715  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2716  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2717 #endif
2718  }
2719  // Do the external force calculations
2720  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
2721 #if 0
2722  cudaCheck(cudaDeviceSynchronize());
2723  if(true || deviceID == 0){
2724  char prefix[10];
2725  snprintf(prefix, 10, "step-%d",step);
2726  this->printSOAForces(prefix);
2727  }
2728 #endif
2729 
2730 }
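// Annotation (not part of SequencerCUDA.C): under NAMD_NCCL_ALLREDUCE the
// normal/nbond/slow force arrays are reduced as one flat buffer, and the element
// count handed to ncclAllReduce above is 3 components x (maxForceNumber+1) force
// levels x numAtoms. A small sketch of that sizing, with illustrative names:
#include <cstddef>

// maxForceNumber: 0 = normal only, 1 = + nonbonded, 2 = + slow (PME).
static std::size_t reducedForceElements(int maxForceNumber, int numAtoms) {
  const int forceLevels = maxForceNumber + 1;  // how many force arrays are live
  const int components  = 3;                   // x, y, z
  return static_cast<std::size_t>(components) * forceLevels * numAtoms;
}
// e.g. maxForceNumber = 2 with 10000 atoms -> 90000 doubles reduced in place.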
2731 
2732 void SequencerCUDA::setRescalePairlistTolerance(const bool val) {
2733  rescalePairlistTolerance = val;
2734 }
2735 
2736 void SequencerCUDA::launch_part1(
2737  int step,
2738  double dt_normal,
2739  double dt_nbond,
2740  double dt_slow,
2741  double velrescaling,
2742  const double maxvel2,
2743  Tensor &factor,
2744  Vector &origin,
2745  Lattice &lattice,
2746  int reassignVelocitiesStep,
2747  int langevinPistonStep,
2748  int berendsenPressureStep,
2749  int maxForceNumber,
2750  const int copyIn,
2751  const int savePairlists,
2752  const int usePairlists,
2753  const bool doEnergy)
2754 {
2755  PatchMap* patchMap = PatchMap::Object();
2756  // Aggregate data from all patches
2757  cudaCheck(cudaSetDevice(deviceID));
2758  this->maxvel2 = maxvel2;
2759  const bool doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
2760  const bool doFixed = simParams->fixedAtomsOn;
2761  // JM: for launch_part1:
2762  // copyIn: first call
2763  myLattice = lattice;
2764  if(reassignVelocitiesStep)
2765  {
2766  const int reassignFreq = simParams->reassignFreq;
2767  BigReal newTemp = simParams->reassignTemp;
2768  newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2769  if ( simParams->reassignIncr > 0.0 ) {
2770  if ( newTemp > simParams->reassignHold && simParams->reassignHold > 0.0 )
2771  newTemp = simParams->reassignHold;
2772  } else {
2773  if ( newTemp < simParams->reassignHold )
2774  newTemp = simParams->reassignHold;
2775  }
2776  const BigReal kbT = BOLTZMANN * newTemp;
2777 
2778  CUDASequencerKernel->reassignVelocities(
2779  dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2780  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2781  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2782  d_recipMass, kbT,
2783  numAtomsHome, numAtomsHome, 0,
2784  curandGen, stream);
2785  }
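// Annotation (not part of SequencerCUDA.C): the block above ramps the reassignment
// temperature by reassignIncr every reassignFreq steps and clamps it at reassignHold
// (a cap when ramping up, a floor when ramping down); kbT = BOLTZMANN * newTemp is
// what the kernel consumes. A standalone sketch of the ramp, mirroring those fields:
static double reassignmentTemperature(int step, double reassignTemp, double reassignIncr,
                                      int reassignFreq, double reassignHold) {
  double newTemp = reassignTemp + (step / reassignFreq) * reassignIncr;
  if (reassignIncr > 0.0) {
    if (reassignHold > 0.0 && newTemp > reassignHold) newTemp = reassignHold;  // cap the ramp-up
  } else {
    if (newTemp < reassignHold) newTemp = reassignHold;                        // floor the ramp-down
  }
  return newTemp;
}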
2786 
2787  // Scale the positions for the Berendsen pressure controller
2788  if(berendsenPressureStep) {
2789  cudaTensor cuFactor;
2790  cudaVector cuOrigin;
2791  COPY_CUDATENSOR(factor, cuFactor);
2792  COPY_CUDAVECTOR(origin, cuOrigin);
2793  CUDASequencerKernel->scaleCoordinateWithFactor(
2794  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_mass, d_hydrogenGroupSize,
2795  cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2796  }
2797 
2798  if(!langevinPistonStep){
2799  // kernel fusion here
2800  // JM TODO: Fuse kernels for the langevin thermostat
2801  CUDASequencerKernel->velocityVerlet1(
2802  doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2803  dt_slow, velrescaling, d_recipMass,
2804  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), maxvel2, killme, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2805  pos_x, pos_y, pos_z, coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2806  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(), coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2807  d_atomFixed, numAtomsHome, maxForceNumber, stream);
2808  }else{
2809  // Zero-out force buffers here
2810  CUDASequencerKernel->addForceToMomentum(
2811  doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2812  d_recipMass,
2813  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
2814  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
2815  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
2816  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
2817  numAtomsHome, maxForceNumber, stream);
2818 
2819  maximumMove(maxvel2, numAtomsHome);
2820  cudaTensor cuFactor;
2821  cudaVector cuOrigin;
2822  COPY_CUDATENSOR(factor, cuFactor);
2823  COPY_CUDAVECTOR(origin, cuOrigin);
2824  double velFactor_x = namd_reciprocal(factor.xx);
2825  double velFactor_y = namd_reciprocal(factor.yy);
2826  double velFactor_z = namd_reciprocal(factor.zz);
2827 
2828  CUDASequencerKernel->addVelocityToPosition(
2829  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2830  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2831  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2832  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2833  CUDASequencerKernel->langevinPiston(
2834  simParams->fixedAtomsOn, d_atomFixed,
2835  d_groupFixed, d_transform, lattice,
2836  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2837  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2838  d_mass, d_hydrogenGroupSize,
2839  cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2840  simParams->useGroupPressure, numAtomsHome, stream);
2841  CUDASequencerKernel->addVelocityToPosition(
2842  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2843  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2844  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2845  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2846  }
2847  if(mGpuOn && SMDKernel)
2848  {
2849  // compute distributed center of mass for groups of atoms
2850  SMDKernel->computeCOMMGpu(lattice, d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2851  d_transform, stream);
2852  }
2853  if(mGpuOn && groupRestraintsKernel)
2854  {
2855  groupRestraintsKernel->doCOM_mgpu(lattice, d_transform,
2856  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2857  stream);
2858  }
2859 
2860  // JM: Recalculate centers of mass if we are computing energies or the virial
2861  if( (doEnergy || doVirial) ) {
2862  CUDASequencerKernel->centerOfMass(
2863  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
2864  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2865  d_hydrogenGroupSize, numAtomsHome, stream);
2866  CUDASequencerKernel->centerOfMass(
2867  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
2868  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2869  d_hydrogenGroupSize, numAtomsHome, stream);
2870  }
2871 
2872  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2873                                     ComputeNonbondedUtil::dielectric_1);
2874  // We need to find doNbond and doSlow for the upcoming step
2875  bool doNbond = patchData->flags.doNonbonded;
2876  bool doSlow = patchData->flags.doFullElectrostatics;
2877 
2878  bool doFEP = false;
2879  bool doTI = false;
2880  bool doAlchDecouple = false;
2881  bool doAlchSoftCore = false;
2882  if (simParams->alchOn) {
2883  if (simParams->alchFepOn) doFEP = true;
2884  if (simParams->alchThermIntOn) doTI = true;
2885  if (simParams->alchDecouple) doAlchDecouple = true;
2886  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2887  }
2888  if ( ! savePairlists ) {
2889 
2890  double minSize = simParams->patchDimension - simParams->margin;
2891  double sysdima = lattice.a_r().unit() * lattice.a();
2892  double sysdimb = lattice.b_r().unit() * lattice.b();
2893  double sysdimc = lattice.c_r().unit() * lattice.c();
2894  // Let's pass migrationInfo here
2895 
2896  CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2897  patchData->devData[deviceIndex].d_localPatches,
2898  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_posSave_x, d_posSave_y, d_posSave_z,
2899  d_awayDists,
2900  myLattice, myLatticeOld,
2901  d_patchMin, d_patchMax, coll_patchCenter.getDevicePtr(),
2902  d_mInfo,
2903  d_tbcatomic, simParams->pairlistTrigger,
2904  simParams->pairlistGrow, simParams->pairlistShrink,
2905  d_patchMaxAtomMovement, patchMaxAtomMovement,
2906  d_patchNewTolerance, patchNewTolerance,
2907  minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2908  h_marginViolations,
2909  h_periodicCellSmall,
2910  rescalePairlistTolerance,
2911  isPeriodic, stream);
2912  rescalePairlistTolerance = false;
2913  }
2914  else {
2915  rescalePairlistTolerance = true;
2916  }
2917 
2918  if(mGpuOn){
2919  // Synchronize device before node barrier
2920  cudaCheck(cudaStreamSynchronize(stream));
2921  }
2922 }
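// Annotation (not part of SequencerCUDA.C): PairListMarginCheck above compares how far
// atoms have drifted since the last pairlist build (d_posSave_*, myLatticeOld) against a
// per-patch tolerance and records margin violations. A deliberately simplified host-side
// sketch of the displacement test only; the real kernel also handles lattice rescaling,
// patch margins, and the pairlistTrigger/Grow/Shrink tolerance updates:
#include <cmath>
#include <cstddef>
#include <vector>

// Flag a rebuild when the largest displacement exceeds half the tolerance
// (two atoms each moving tolerance/2 toward each other can close the margin).
static bool pairlistMarginViolated(const std::vector<double>& x, const std::vector<double>& y,
                                   const std::vector<double>& z, const std::vector<double>& xSave,
                                   const std::vector<double>& ySave, const std::vector<double>& zSave,
                                   double tolerance, double& maxMovementOut) {
  double maxMove2 = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    const double dx = x[i] - xSave[i], dy = y[i] - ySave[i], dz = z[i] - zSave[i];
    const double d2 = dx * dx + dy * dy + dz * dz;
    if (d2 > maxMove2) maxMove2 = d2;
  }
  maxMovementOut = std::sqrt(maxMove2);
  return maxMovementOut > 0.5 * tolerance;
}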
2923 
2924 void SequencerCUDA::launch_part11(
2925  double dt_normal,
2926  double dt_nbond,
2927  double dt_slow,
2928  double velrescaling,
2929  const double maxvel2,
2930  Tensor &factor,
2931  Vector &origin,
2932  Lattice &lattice,
2933  int langevinPistonStep,
2934  int maxForceNumber,
2935  const int copyIn,
2936  const int savePairlists,
2937  const int usePairlists,
2938  const bool doEnergy)
2939 {
2940  const bool doVirial = simParams->langevinPistonOn;
2941  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2942                                     ComputeNonbondedUtil::dielectric_1);
2943  // We need to find doNbond and doSlow for the upcoming step
2944  bool doNbond = patchData->flags.doNonbonded;
2945  bool doSlow = patchData->flags.doFullElectrostatics;
2946 
2947  bool doFEP = false;
2948  bool doTI = false;
2949  bool doAlchDecouple = false;
2950  bool doAlchSoftCore = false;
2951  if (simParams->alchOn) {
2952  if (simParams->alchFepOn) doFEP = true;
2953  if (simParams->alchThermIntOn) doTI = true;
2954  if (simParams->alchDecouple) doAlchDecouple = true;
2955  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2956  }
2957 
2958  submitHalf(numAtomsHome, 1, doEnergy || doVirial);
2959 
2960  // Update the patch flags
2961  NAMD_EVENT_START(1, NamdProfileEvent::CPY_PATCHFLAGS);
2962  this->update_patch_flags();
2963  NAMD_EVENT_STOP(1, NamdProfileEvent::CPY_PATCHFLAGS);
2964 
2965  finish_part1(copyIn, patchList[0]->flags.savePairlists,
2966  patchList[0]->flags.usePairlists);
2967 }
2968 
2969 
2970 void SequencerCUDA::launch_set_compute_positions() {
2971 
2972  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2973                                     ComputeNonbondedUtil::dielectric_1);
2974  // We need to find doNbond and doSlow for the upcoming step
2975  bool doNbond = patchData->flags.doNonbonded;
2976  bool doSlow = patchData->flags.doFullElectrostatics;
2977 
2978  bool doFEP = false;
2979  bool doTI = false;
2980  bool doAlchDecouple = false;
2981  bool doAlchSoftCore = false;
2982  if (simParams->alchOn) {
2983  if (simParams->alchFepOn) doFEP = true;
2984  if (simParams->alchThermIntOn) doTI = true;
2985  if (simParams->alchDecouple) doAlchDecouple = true;
2986  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2987  }
2988  const bool doIMD = simParams->IMDon && !(simParams->IMDignoreForces || simParams->IMDignore);
2989  bool doGlobal = simParams->tclForcesOn || simParams->colvarsOn || doIMD;
2990  // Set the positions of lone pairs before copying to NB buffers
2991  if (Node::Object()->molecule->is_lonepairs_psf) {
2992  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
2993  }
2994 
2995  bool usePatchPme = false;
2996  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2997  // This checks whether the current lattice is compatible with the patch-level PME kernels.
2998  // The check needs to be redone every time the lattice changes; the results are stored in
2999  // the cudaPme object, and the overall compatibility is reported by the compatible()
3000  // function.
3001  //
3002  // The behavior of the PME position setup differs depending on the kernels being used,
3003  // so that value (usePatchPme) needs to be passed to the kernel object.
3005  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3007  myLattice,
3008  getNumPatchesHome(),
3009  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3010  getHostPatchMin(),
3011  getHostPatchMax(),
3012  getHostAwayDists()
3013  );
3014  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3015  }
3016 
3017  if (1) {
3018  //fprintf(stderr, "calling set_compute_positions() ****************************************\n");
3019  //fprintf(stderr, "calling set_compute_positions\n");
3020  //fprintf(stderr, "doNbond=%d doSlow=%d\n", doNbond, doSlow);
3021  std::vector<int> atom_counts;
3022  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3023  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3024  }
3025  CUDASequencerKernel->set_compute_positions(
3026  deviceIndex,
3028  nDevices,
3029  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3030  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3031 #ifdef NAMD_NCCL_ALLREDUCE
3032  (mGpuOn) ? d_posNew_x: coll_pos_x.getDevicePtr(),
3033  (mGpuOn) ? d_posNew_y: coll_pos_y.getDevicePtr(),
3034  (mGpuOn) ? d_posNew_z: coll_pos_z.getDevicePtr(),
3035 #else
3036  coll_pos_x.getDevicePtr(),
3037  coll_pos_y.getDevicePtr(),
3038  coll_pos_z.getDevicePtr(),
3039  coll_pos_x.getDevicePeerPtr(), // passes double-pointer if mgpuOn
3040  coll_pos_y.getDevicePeerPtr(),
3041  coll_pos_z.getDevicePeerPtr(),
3042  coll_charge.getDevicePeerPtr(),
3043  coll_partition.getDevicePeerPtr(),
3044 #endif
3045  coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
3046  coll_patchCenter.getDevicePtr(),
3047  patchData->devData[deviceIndex].slow_patchPositions,
3048  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3049  coll_sortOrder.getDevicePtr(), myLattice,
3050  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3051  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3053  patchData->devData[deviceIndex].d_localPatches,
3054  patchData->devData[deviceIndex].d_peerPatches,
3055  atom_counts,
3056  stream);
3057  // For global forces, copy the coordinates to the host with kernel overlap
3058  if (doGlobal) {
3059  NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_POSITION);
3060  // CkPrintf("WARNING this probably needs to be changed for multihost\n");
3061  copyPositionsToHost_direct();
3062  //copyPositionsAndVelocitiesToHost(1,0);
3063  NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_POSITION);
3064  }
3065  }
3066 }
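// Annotation (not part of SequencerCUDA.C): the charge_scaling used by the position
// setup folds the Coulomb constant, the electrostatic scaling factor, and the inverse
// dielectric into the charges once, so the nonbonded kernels can use plain q_i*q_j/r
// products directly as energies. A minimal sketch of the idea; the COULOMB value shown
// (~332.0636 kcal*A/(mol*e^2)) is NAMD's constant, the rest is illustrative:
#include <cmath>

static double scaledCharge(double charge, double coulomb, double scaling, double dielectric_1) {
  const double charge_scaling = std::sqrt(coulomb * scaling * dielectric_1);
  return charge_scaling * charge;  // stored in the float4 atom buffers
}
// With scaling = 1 and dielectric_1 = 1, two unit charges 1 A apart give
// scaledCharge(1, 332.0636, 1, 1) squared / 1.0 ~= 332.06 kcal/mol.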
3067 
3068 void SequencerCUDA:: finish_part1( const int copyIn,
3069  const int savePairlists,
3070  const int usePairlists)
3071 {
3072  // JM: If we're not in a migration step, we can overlap flagging the
3073  // positions before we synchronize the stream, to lessen the kernel
3074  // launch overhead.
3075  // Hopefully we will see some overlap in this region.
3076  //
3077  // TODO: We could split this into a different function and start calling positionsReady
3078  // when we're not on a migration step, so that some of this work overlaps with the kernels.
3079  // Before we synchronize, we can clear the device memory;
3080  // this sets the tileListStat for the nbondKernel.
3081  cudaCheck(cudaStreamSynchronize(stream));
3082 
3083  // Checks if periodic cell became too small
3084  if(*h_periodicCellSmall){
3085  NAMD_die("Periodic cell has become too small for original patch grid!\n"
3086  "Possible solutions are to restart from a recent checkpoint,\n"
3087  "increase margin, or disable useFlexibleCell for liquid simulation.");
3088  }
3089 
3090  if (killme[0]) {
3091  const Molecule* mol = Node::Object()->molecule;
3092  // Found at least one atom that is moving too fast.
3093  // Terminating, so loop performance below doesn't matter.
3094  // Loop does not vectorize
3095  double *vel_x, *vel_y, *vel_z;
3096  std::vector<int> id;
3097  std::vector<int> patchIDofAtoms(numAtomsHome);
3098  allocate_host<double>(&vel_x, numAtomsHome);
3099  allocate_host<double>(&vel_y, numAtomsHome);
3100  allocate_host<double>(&vel_z, numAtomsHome);
3101  copy_DtoH_sync<double>(coll_vel_x.getDevicePtr(), vel_x, numAtomsHome);
3102  copy_DtoH_sync<double>(coll_vel_y.getDevicePtr(), vel_y, numAtomsHome);
3103  copy_DtoH_sync<double>(coll_vel_z.getDevicePtr(), vel_z, numAtomsHome);
3104  // Update the id array from patchDataSOA
3105  size_t offset = 0;
3106  // TODO: Does this work for GPU atom migration?
3107  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3108  // std::vector<int> patchIDtoIndex(numPatchesHome);
3109  for (int i = 0; i < numPatchesHome; i++) {
3110  HomePatch *patch = homePatches[i];
3111  PatchDataSOA& current = patchList[i]->patchDataSOA;
3112  const int numPatchAtoms = current.numAtoms;
3113  id.resize(numPatchAtoms + id.size());
3114  for(int j = 0; j < numPatchAtoms; j++){
3115  if (!simParams->useDeviceMigration) {
3116  id[offset + j] = current.id[j];
3117  }
3118  patchIDofAtoms[offset + j] = patch->getPatchID();
3119  }
3120  offset += numPatchAtoms;
3121  }
3122  if (simParams->useDeviceMigration) {
3123  id.resize(numAtomsHome);
3124  copy_DtoH_sync<int>(coll_idMig.getDevicePtr(), id.data(), numAtomsHome);
3125  }
3126  int cnt = 0;
3127  for (int i=0; i < numAtomsHome; i++) {
3128  BigReal vel2 =
3129  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3130  if (vel2 > maxvel2) {
3131  ++cnt;
3132  iout << iERROR << " velocity is "
3133  << PDBVELFACTOR * vel_x[i] << " "
3134  << PDBVELFACTOR * vel_y[i] << " "
3135  << PDBVELFACTOR * vel_z[i]
3136  << " (limit is "
3137  << ( PDBVELFACTOR * sqrt(maxvel2) ) << ", atom ID "
3138  << id[i]+1
3139  // XXX: TODO: mol->get_atomtype only works on a single node
3140  << " of type " << mol->get_atomtype(id[i])
3141  << " in patch " << patchIDofAtoms[i]
3142  << " on PE " << CkMyPe()
3143  << " with " << patchList[globalToLocalID[patchIDofAtoms[i]]]->patchDataSOA.numAtoms
3144  << " atoms)\n" << endi;
3145  }
3146  }
3147  iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3148  "; simulation has become unstable ("
3149  << cnt << " atoms on pe " << CkMyPe() << ", GPU " << deviceID << ").\n" << endi;
3150  if (simParams->crashOutputFlag & NAMD_CRASH_ATOM_TOO_FAST) {
3151  double *pos_x, *pos_y, *pos_z;
3152  allocate_host<double>(&pos_x, numAtomsHome);
3153  allocate_host<double>(&pos_y, numAtomsHome);
3154  allocate_host<double>(&pos_z, numAtomsHome);
3155  copy_DtoH_sync<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsHome);
3156  copy_DtoH_sync<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsHome);
3157  copy_DtoH_sync<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsHome);
3158  // Save the positions and velocities to a file for debugging
3159  const std::string outfilename =
3160  std::string(simParams->crashFilename) + "." +
3161  std::to_string(deviceIndex);
3162  std::ofstream ofs_crash_dump(outfilename.c_str());
3163  ofs_crash_dump << "atom,r_x,r_y,r_z,v_x,v_y,v_z\n";
3164  for (int i=0; i < numAtomsHome; i++) {
3165  ofs_crash_dump << id[i]+1 << ","
3166  << pos_x[i] << ","
3167  << pos_y[i] << ","
3168  << pos_z[i] << ","
3169  << PDBVELFACTOR * vel_x[i] << ","
3170  << PDBVELFACTOR * vel_y[i] << ","
3171  << PDBVELFACTOR * vel_z[i] << "\n";
3172  }
3173  ofs_crash_dump.flush();
3174  ofs_crash_dump.close();
3175  iout << iWARN << "PE " << CkMyPe() << ", GPU " << deviceID
3176  << ": the atom positions and velocities have been written to "
3177  << outfilename << "\n" << endi;
3178  deallocate_host<double>(&pos_x);
3179  deallocate_host<double>(&pos_y);
3180  deallocate_host<double>(&pos_z);
3181  }
3182  deallocate_host<double>(&vel_x);
3183  deallocate_host<double>(&vel_y);
3184  deallocate_host<double>(&vel_z);
3185  NAMD_die("SequencerCUDA: Atoms moving too fast");
3186  }
3187  else{
3188  // submitHalf reductions
3189  Tensor reduction_virial;
3190  Tensor reduction_intVirialNormal;
3191  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3192  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3193  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3194  // Haochuan: the tensor is not symmetric when there are fixed atoms
3195  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3196  reduction_virial *= 0.5;
3197  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3198  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
3199  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3200  += (intKineticEnergy_half[0] * 0.25);
3201  reduction_intVirialNormal *= 0.5;
3202  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3203  reduction_intVirialNormal);
3204  int migration = (h_marginViolations[0] != 0) ? 1 : 0; // flags migration as TRUE if a margin violation occurred
3205  // if(migration != 0 ) fprintf(stderr, "DEV[%d] = MIGRATION[%d]\n", deviceID, migration);
3206  patchData->migrationFlagPerDevice[deviceIndex] = migration; // Saves the updated migration flag
3207  h_marginViolations[0] = 0;
3208  }
3209 }
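// Annotation (not part of SequencerCUDA.C): the "atoms moving too fast" check above
// compares squared velocities in internal units against maxvel2 and only converts to
// A/ps with PDBVELFACTOR (~20.4548) for the error report. A standalone sketch of that
// per-atom test, with illustrative names:
#include <cmath>
#include <cstdio>

static bool velocityTooFast(double vx, double vy, double vz, double maxvel2) {
  const double vel2 = vx * vx + vy * vy + vz * vz;   // internal velocity units
  if (vel2 <= maxvel2) return false;
  const double PDBVELFACTOR = 20.45482706;           // internal units -> A/ps
  std::printf("velocity %g %g %g exceeds limit %g (A/ps)\n",
              PDBVELFACTOR * vx, PDBVELFACTOR * vy, PDBVELFACTOR * vz,
              PDBVELFACTOR * std::sqrt(maxvel2));
  return true;
}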
3210 
3211 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3212  bool copyOut, const int doGlobal){
3213  // CkPrintf("copy positions and velocities to host copyout %d doGlobal %d\n", copyOut, doGlobal);
3214  if(copyOut){
3215  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3216  patchData = cpdata.ckLocalBranch();
3217  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3218  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3219  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3220  const int numAtomsToCopy = numAtomsHome;
3221  copy_DtoH<double>(coll_vel_x.getDevicePtr(), vel_x, numAtomsToCopy, stream);
3222  copy_DtoH<double>(coll_vel_y.getDevicePtr(), vel_y, numAtomsToCopy, stream);
3223  copy_DtoH<double>(coll_vel_z.getDevicePtr(), vel_z, numAtomsToCopy, stream);
3224  if (!doGlobal) {
3225  // We already copied the coordinates if we have global forces
3226  copy_DtoH<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsToCopy, stream);
3227  copy_DtoH<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsToCopy, stream);
3228  copy_DtoH<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsToCopy, stream);
3229  }
3230  cudaCheck(cudaDeviceSynchronize());
3231 
3232  for(int i = 0; i < homePatches.size(); i++){
3233 
3234  // TODO do we need to copy proxy patches as well
3235  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3236  const int numPatchAtoms = localPatches[i].numAtoms;
3237  const int offset = localPatches[i].bufferOffset;
3238  memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3239  memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3240  memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3241  if (!doGlobal) {
3242  // We already copied the coordinates if we have global forces
3243  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3244  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3245  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3246  }
3247  }
3248  }
3249 }
3250 
3251 
3252 void SequencerCUDA::copyPositionsToHost(){
3253  // CkPrintf("copy positions to host \n");
3254  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3255  patchData = cpdata.ckLocalBranch();
3256  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3257  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3258  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3259 
3260  const int numAtomsToCopy = numAtomsHome;
3261  // We already copied the coordinates if we have global forces
3262  copy_DtoH<double>(coll_pos_x.getDevicePtr(), pos_x, numAtomsToCopy, stream);
3263  copy_DtoH<double>(coll_pos_y.getDevicePtr(), pos_y, numAtomsToCopy, stream);
3264  copy_DtoH<double>(coll_pos_z.getDevicePtr(), pos_z, numAtomsToCopy, stream);
3265  cudaCheck(cudaDeviceSynchronize());
3266 
3267  for(int i = 0; i < homePatches.size(); i++){
3268 
3269  // TODO do we need to copy proxy patches as well
3270  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3271  const int numPatchAtoms = localPatches[i].numAtoms;
3272  const int offset = localPatches[i].bufferOffset;
3273  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3274  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3275  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3276  }
3277 }
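// Annotation (not part of SequencerCUDA.C): both host-copy routines above scatter the
// flat, device-ordered arrays back into per-patch SOA storage using each local patch's
// bufferOffset and atom count. A minimal sketch of that scatter with simplified types:
#include <cstddef>
#include <cstring>
#include <vector>

struct PatchSlice { int numAtoms; int bufferOffset; };

// flat: contiguous device-ordered array copied to the host; patchDest[i] points at
// the i-th patch's SOA field (e.g. pos_x) and receives its slice of the flat array.
static void scatterToPatches(const double* flat, const std::vector<PatchSlice>& patches,
                             std::vector<double*>& patchDest) {
  for (std::size_t i = 0; i < patches.size(); ++i) {
    std::memcpy(patchDest[i], flat + patches[i].bufferOffset,
                patches[i].numAtoms * sizeof(double));
  }
}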
3278 
3279 void SequencerCUDA::update_patch_flags()
3280 {
3281  // int pairlists = 1;
3282  int pairlists = (patchData->flags.step < simParams->N);
3283  for (int i=0; i < numPatchesHome; i++) {
3284  HomePatch *patch = patchList[i];
3285  patch->flags.copyIntFlags(patchData->flags); // copy global flags
3286  }
3287 }
3288 
3289 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3290  int pairlists = patchList[0]->flags.step < simParams->N;
3291  for(int i = 0; i < numPatchesHome; i++){
3292  //for(int i = 0; i < numPatches; i++){
3293  HomePatch *patch = patchList[i];
3294  Sequencer *seq = patch->sequencer;
3295  // the following logic is duplicated from Sequencer::runComputeObjects
3296  // Migration always invalidates pairlists
3297  if (doMigration) {
3298  seq->pairlistsAreValid = 0;
3299  }
3300  if (seq->pairlistsAreValid &&
3301  ( patch->flags.doFullElectrostatics || ! simParams->fullElectFrequency )
3302  && (seq->pairlistsAge > seq->pairlistsAgeLimit) ) {
3303  seq->pairlistsAreValid = 0;
3304  }
3305  patch->flags.usePairlists = pairlists || seq->pairlistsAreValid;
3306  patch->flags.savePairlists = pairlists && !seq->pairlistsAreValid;
3307  if(patch->flags.savePairlists){
3308  // We need to rebuild pairlists -> reset tolerance values
3309  patch->flags.pairlistTolerance = patchList[i]->doPairlistCheck_newTolerance; // update pairListTolerance
3310  patch->flags.maxAtomMovement = 0;
3311  patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3312  }else if(patch->flags.usePairlists){
3313  // We can keep going with the existing pairlists -> update tolerances
3314  patch->flags.maxAtomMovement = patchMaxAtomMovement[i];
3315  patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3316  }else{
3317  // End of simulation
3318  patch->flags.maxAtomMovement=99999.;
3319  patch->flags.pairlistTolerance = 0.;
3320  }
3321  }
3322  if(patchList[0]->flags.savePairlists){
3323  // Backs up d_posSave_* for pairlistCheck
3324  copy_DtoD<double>(coll_pos_x.getDevicePtr(), d_posSave_x, numAtomsHome, stream);
3325  copy_DtoD<double>(coll_pos_y.getDevicePtr(), d_posSave_y, numAtomsHome, stream);
3326  copy_DtoD<double>(coll_pos_z.getDevicePtr(), d_posSave_z, numAtomsHome, stream);
3327  myLatticeOld = myLattice;
3328  }
3329 }
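// Annotation (not part of SequencerCUDA.C): updatePairlistFlags mirrors the pairlist
// bookkeeping in Sequencer::runComputeObjects. A condensed sketch of the flag decision
// only, with fields matching the ones used above:
struct PairlistBookkeeping {
  int pairlistsAreValid;
  int pairlistsAge;
  int pairlistsAgeLimit;
};

// stillStepping is (step < N); doFullElectrostatics / fullElectFrequency mirror the flags.
static void pairlistFlags(PairlistBookkeeping& s, bool doMigration, bool stillStepping,
                          bool doFullElectrostatics, int fullElectFrequency,
                          int& usePairlists, int& savePairlists) {
  if (doMigration) s.pairlistsAreValid = 0;            // migration always invalidates lists
  if (s.pairlistsAreValid &&
      (doFullElectrostatics || fullElectFrequency == 0) &&
      (s.pairlistsAge > s.pairlistsAgeLimit)) {
    s.pairlistsAreValid = 0;                           // list has aged out
  }
  usePairlists  = stillStepping || s.pairlistsAreValid;
  savePairlists = stillStepping && !s.pairlistsAreValid;  // request a rebuild this step
}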
3330 
3331 void SequencerCUDA::finish_patch_flags(int migration)
3332 {
3333  for (int i=0; i < numPatchesHome; i++) {
3334  HomePatch *patch = patchList[i];
3335  Sequencer *seq = patch->sequencer;
3336  if (patch->flags.savePairlists && patch->flags.doNonbonded) {
3337  seq->pairlistsAreValid = 1;
3338  seq->pairlistsAge = 0;
3339  }
3340  if (seq->pairlistsAreValid /* && ! pressureStep */) {
3341  ++(seq->pairlistsAge);
3342  }
3343  }
3344 }
3345 
3346 
3347 void SequencerCUDA::launch_part2(
3348  const int doMCPressure,
3349  double dt_normal,
3350  double dt_nbond,
3351  double dt_slow,
3352  Vector &origin,
3353  int step,
3354  int maxForceNumber,
3355  const int langevinPistonStep,
3356  const int copyIn,
3357  const int copyOut,
3358  const int doGlobal,
3359  const bool doEnergy)
3360 {
3361  PatchMap* patchMap = PatchMap::Object();
3362  Tensor localVirial;
3363  //cudaTensor h_rigidVirial;
3364  bool doNbond = false;
3365  bool doSlow = false;
3366  cudaCheck(cudaSetDevice(deviceID));
3367  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3368  const int is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3369  // const int doVirial = langevinPistonStep;
3370  // JM: For launch_part2:
3371  // copyIn = migration steps
3372 
3373  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
3374 
3375  if(mGpuOn){
3376 #ifdef NAMD_NCCL_ALLREDUCE
3377  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3378 #endif
3379  }
3380 
3381  if(!simParams->langevinOn && !simParams->eFieldOn && !simParams->constraintsOn &&
3382  !simParams->SMDOn && !simParams->groupRestraintsOn && !doMCPressure &&
3383  !simParams->mgridforceOn && ! simParams->gridforceOn && !mGpuOn &&
3384  !simParams->consForceOn &&
3385  (simParams->watmodel == WaterModel::TIP3) &&
3386  (!is_lonepairs_psf)){
3387  CUDASequencerKernel->accumulate_force_kick(
3388  simParams->fixedAtomsOn,
3389  doGlobal,
3390  simParams->useCudaGlobal,
3391  maxForceNumber,
3392  numPatchesHomeAndProxy,
3393  patchData->devData[deviceIndex].d_localPatches,
3394  patchData->devData[deviceIndex].f_bond,
3395  patchData->devData[deviceIndex].f_bond_nbond,
3396  patchData->devData[deviceIndex].f_bond_slow,
3397  patchData->devData[deviceIndex].forceStride,
3398  patchData->devData[deviceIndex].f_nbond,
3399  patchData->devData[deviceIndex].f_nbond_slow,
3400  patchData->devData[deviceIndex].f_slow,
3401  d_f_global_x,
3402  d_f_global_y,
3403  d_f_global_z,
3404  coll_f_normal_x.getDevicePtr(),
3405  coll_f_normal_y.getDevicePtr(),
3406  coll_f_normal_z.getDevicePtr(),
3407  coll_f_nbond_x.getDevicePtr(),
3408  coll_f_nbond_y.getDevicePtr(),
3409  coll_f_nbond_z.getDevicePtr(),
3410  coll_f_slow_x.getDevicePtr(),
3411  coll_f_slow_y.getDevicePtr(),
3412  coll_f_slow_z.getDevicePtr(),
3413  coll_vel_x.getDevicePtr(),
3414  coll_vel_y.getDevicePtr(),
3415  coll_vel_z.getDevicePtr(),
3416  d_recipMass,
3417  d_atomFixed,
3418  dt_normal,
3419  dt_nbond,
3420  dt_slow,
3421  1.0,
3422  coll_unsortOrder.getDevicePtr(),
3423  myLattice,
3424  stream
3425  );
3426  }else{
3427  CUDASequencerKernel->accumulateForceToSOA(
3428  doGlobal,
3429  simParams->useCudaGlobal,
3430  maxForceNumber,
3431  numPatchesHomeAndProxy,
3432  nDevices,
3433  patchData->devData[deviceIndex].d_localPatches,
3434  patchData->devData[deviceIndex].f_bond,
3435  patchData->devData[deviceIndex].f_bond_nbond,
3436  patchData->devData[deviceIndex].f_bond_slow,
3437  patchData->devData[deviceIndex].forceStride,
3438  patchData->devData[deviceIndex].f_nbond,
3439  patchData->devData[deviceIndex].f_nbond_slow,
3440  patchData->devData[deviceIndex].f_slow,
3441  d_f_global_x,
3442  d_f_global_y,
3443  d_f_global_z,
3444  coll_f_normal_x.getDevicePtr(),
3445  coll_f_normal_y.getDevicePtr(),
3446  coll_f_normal_z.getDevicePtr(),
3447  coll_f_nbond_x.getDevicePtr(),
3448  coll_f_nbond_y.getDevicePtr(),
3449  coll_f_nbond_z.getDevicePtr(),
3450  coll_f_slow_x.getDevicePtr(),
3451  coll_f_slow_y.getDevicePtr(),
3452  coll_f_slow_z.getDevicePtr(),
3453  coll_unsortOrder.getDevicePtr(),
3454  myLattice,
3455  patchData->d_queues,
3456  patchData->d_queueCounters,
3457  d_tbcatomic,
3458  stream
3459  );
3460 
3461  }
3462 
3463  if (mGpuOn) {
3464  // Synchronize device before node barrier
3465  cudaCheck(cudaDeviceSynchronize());
3466  }
3467 }
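// Annotation (not part of SequencerCUDA.C): launch_part2 takes the fused
// accumulate_force_kick path only when nothing needs the accumulated forces on their
// own before the kick (no Langevin, external or global forces, multi-GPU merge, lone
// pairs, or non-TIP3 water); otherwise it runs accumulateForceToSOA and defers the kick
// to launch_part3. A compact restatement of that gate with flags mirroring the tests above:
static bool canFuseForceAndKick(bool langevinOn, bool eFieldOn, bool constraintsOn,
                                bool SMDOn, bool groupRestraintsOn, bool doMCPressure,
                                bool gridforceOn, bool mgridforceOn, bool consForceOn,
                                bool mGpuOn, bool waterIsTIP3, bool hasLonePairs) {
  return !langevinOn && !eFieldOn && !constraintsOn &&
         !SMDOn && !groupRestraintsOn && !doMCPressure &&
         !gridforceOn && !mgridforceOn && !consForceOn &&
         !mGpuOn && waterIsTIP3 && !hasLonePairs;
}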
3468 
3469 // launch_part2 is broken into two parts to support the MC barostat
3470 void SequencerCUDA::launch_part3(
3471  const int doMCPressure,
3472  double dt_normal,
3473  double dt_nbond,
3474  double dt_slow,
3475  Vector &origin,
3476  int step,
3477  int maxForceNumber,
3478  const bool requestGlobalForces,
3479  const int doGlobalStaleForces,
3480  const bool forceRequestedGPU,
3481  const int copyIn,
3482  const int copyOut,
3483  const bool doEnergy,
3484  const bool requestForcesOutput)
3485 {
3486  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3487  const bool doFixed = simParams->fixedAtomsOn;
3488  const double velrescaling = 1; // no rescaling
3489 
3490  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3491  simParams->SMDOn || simParams->groupRestraintsOn || requestGlobalForces || simParams->gridforceOn|| simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3492  (simParams->watmodel != WaterModel::TIP3) ||
3494  if(mGpuOn){
3495 #ifndef NAMD_NCCL_ALLREDUCE
3496  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
3497  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
3498  // 0, deviceCUDA->getNcclComm(), stream);
3499 
3500  std::vector<int> atom_counts;
3501  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3502  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3503  }
3504  CUDASequencerKernel->mergeForcesFromPeers(
3505  deviceIndex,
3506  maxForceNumber,
3507  myLattice,
3508  numPatchesHomeAndProxy,
3509  numPatchesHome,
3510  this->coll_f_normal_x.getDevicePeerPtr(),
3511  this->coll_f_normal_y.getDevicePeerPtr(),
3512  this->coll_f_normal_z.getDevicePeerPtr(),
3513  this->coll_f_nbond_x.getDevicePeerPtr(),
3514  this->coll_f_nbond_y.getDevicePeerPtr(),
3515  this->coll_f_nbond_z.getDevicePeerPtr(),
3516  this->coll_f_slow_x.getDevicePeerPtr(),
3517  this->coll_f_slow_y.getDevicePeerPtr(),
3518  this->coll_f_slow_z.getDevicePeerPtr(),
3519  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
3520  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
3521  patchData->devData[deviceIndex].d_localPatches,
3522  patchData->devData[deviceIndex].d_peerPatches,
3523  atom_counts,
3524  stream
3525  );
3526 #else
3527  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3528  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
3529 #endif
3530  }
3531  if(doVirial && doGlobalStaleForces)
3532  {
3533  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3534  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3535  computeGlobalMasterVirial(
3536  numPatchesHomeAndProxy,
3537  numAtomsHome,
3538  patchData->devData[deviceIndex].d_localPatches,
3539  coll_pos_x.getDevicePtr(),
3540  coll_pos_y.getDevicePtr(),
3541  coll_pos_z.getDevicePtr(),
3542  d_transform,
3543  d_f_global_x,
3544  d_f_global_y,
3545  d_f_global_z,
3546  &d_extForce[EXT_GLOBALMTS],
3547  &extForce[EXT_GLOBALMTS],
3548  &d_extVirial[EXT_GLOBALMTS],
3549  &extVirial[EXT_GLOBALMTS],
3550  myLattice,
3551  d_tbcatomic,
3552  stream);
3553  }
3554  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
3555 #if 0
3556  cudaCheck(cudaDeviceSynchronize());
3557  if(true || deviceID == 0){
3558  char prefix[10];
3559  snprintf(prefix, 10, "step-%d",step);
3560  this->printSOAForces(prefix);
3561  }
3562 #endif
3563 
3564  }
3565 
3566  if (simParams->langevinOn) {
3567  CUDASequencerKernel->langevinVelocitiesBBK1(
3568  dt_normal, d_langevinParam, coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), numAtomsHome, stream);
3569  }
3570 
3571  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3572  simParams->SMDOn || simParams->groupRestraintsOn || doMCPressure || simParams->gridforceOn || simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3573  (simParams->watmodel != WaterModel::TIP3) ||
3575  CUDASequencerKernel->addForceToMomentum(
3576  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3577  d_recipMass,
3578  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3579  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3580  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3581  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
3582  numAtomsHome, maxForceNumber, stream);
3583  }
3584 
3585  if (simParams->langevinOn) {
3586 
3587  // must enforce rigid bond constraints if langevin gammas differ
3588  if (simParams->rigidBonds != RIGID_NONE &&
3589  simParams->langevinGammasDiffer) {
3590  CUDASequencerKernel->rattle1(
3591  simParams->fixedAtomsOn, doEnergy || doVirial,
3592  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3593  2.0 * simParams->rigidTol,
3594  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3595  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3596  d_velNew_x, d_velNew_y, d_velNew_z,
3597  d_posNew_x, d_posNew_y, d_posNew_z,
3598  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3599  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3600  &settleList, settleListSize, &d_consFailure,
3601  d_consFailureSize, &rattleList, rattleListSize,
3602  &nSettle, &nRattle,
3603  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3604  buildRigidLists, consFailure, simParams->watmodel, stream);
3605  buildRigidLists = false;
3606  }
3607  CUDASequencerKernel->langevinVelocitiesBBK2(
3608  dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3609  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3610  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3611  numAtomsHome, numAtomsHome, 0,
3612  curandGen, stream);
3613  }
3614  if(simParams->rigidBonds != RIGID_NONE){
3615  CUDASequencerKernel->rattle1(
3616  simParams->fixedAtomsOn, doEnergy || doVirial,
3617  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3618  2.0 * simParams->rigidTol,
3619  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3620  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3621  d_velNew_x, d_velNew_y, d_velNew_z,
3622  d_posNew_x, d_posNew_y, d_posNew_z,
3623  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3624  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3625  &settleList, settleListSize, &d_consFailure,
3626  d_consFailureSize, &rattleList, rattleListSize,
3627  &nSettle, &nRattle,
3628  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3629  buildRigidLists, consFailure, simParams->watmodel, stream);
3630  buildRigidLists = false;
3631  }
3632 
3633  // Update velocity center of mass here
3634  if(doEnergy || doVirial){
3635  CUDASequencerKernel->centerOfMass(
3636  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(),
3637  d_vcm_x, d_vcm_y, d_vcm_z,
3638  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3639  }
3640 
3641  submitHalf(numAtomsHome, 2, doEnergy || doVirial);
3642 
3643  CUDASequencerKernel->addForceToMomentum(
3644  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3645  d_recipMass,
3646  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3647  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3648  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3649  coll_vel_x.getDevicePtr(), coll_vel_y.getDevicePtr(), coll_vel_z.getDevicePtr(), d_atomFixed,
3650  numAtomsHome, maxForceNumber, stream);
3651 
3652  if(requestGlobalForces || requestForcesOutput) {
3653  // Store the forces for the next step,
3654  // when we need them for Colvars and Tcl scripting
3655  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3656  }
3657 
3658  if (forceRequestedGPU) {
3659  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3660  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3661  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3662  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3663  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3664  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3665  CUDASequencerKernel->copyForcesToDevice(
3666  numAtomsHome, coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3667  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3668  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3669  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3670  // cudaCheck(cudaStreamSynchronize(stream));
3671  }
3672 
3673  //cudaCheck(cudaStreamSynchronize(stream));
3674  submitReductions(origin.x, origin.y, origin.z,
3675  marginViolations, doEnergy || doVirial,
3676  copyOut && simParams->outputMomenta != 0,
3677  numAtomsHome, maxForceNumber);
3678  // This is for collecting coordinates and velocities for output
3679  copyPositionsAndVelocitiesToHost(copyOut, 0);
3680 
3681  if(consFailure[0]){
3682  // Constraint failure. Abort.
3683  int dieOnError = simParams->rigidDie;
3684  if(dieOnError){
3685  // Bails out
3686  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
3687  NAMD_die("constraint failure during CUDA rattle!\n");
3688  }else{
3689  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3690  }
3691  }else if(doEnergy || doVirial){
3692  cudaCheck(cudaStreamSynchronize(stream));
3693  if(doVirial && doGlobalStaleForces) {
3694  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3695  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3696  }
3697  if(simParams->rigidBonds != RIGID_NONE){
3698  Tensor reduction_rigidVirial;
3699  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
3700  // Haochuan: the tensor is not symmetric when there are fixed atoms
3701  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3702  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
3703  }
3704 
3705  // SUBMITHALF reductions
3706  Tensor reduction_virial;
3707  Tensor reduction_intVirialNormal;
3708  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3709  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3710  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3711  // Haochuan: the tensor is not symmetric when there are fixed atoms
3712  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3713  reduction_virial *= 0.5;
3714  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3715 
3716  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3717  += (intKineticEnergy_half[0] * 0.25);
3718  reduction_intVirialNormal *= 0.5;
3719  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3720  reduction_intVirialNormal);
3721 
3722  //submitReductions1
3723  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
3724  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3725  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
3726  Vector angularMomentum(*angularMomentum_x,
3727  *angularMomentum_y,
3728  *angularMomentum_z);
3729  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3730  //submitReductions2
3731  Tensor regintVirialNormal;
3732  Tensor regintVirialNbond;
3733  Tensor regintVirialSlow;
3734  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
3735  if (maxForceNumber >= 1) {
3736  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
3737  }
3738  if (maxForceNumber >= 2) {
3739  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
3740  }
3741 
3742  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
3743  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
3744  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
3745  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
3746 
3747  if (simParams->fixedAtomsOn) {
3748  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3749  double3 fixForceNormal, fixForceNbond, fixForceSlow;
3750  switch (maxForceNumber) {
3751  case 2: {
3752  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3753  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3754  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
3755  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
3756  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
3757  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3758  } // intentionally fallthrough
3759  case 1: {
3760  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3761  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3762  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
3763  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
3764  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
3765  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3766  } // intentionally fallthrough
3767  default: {
3768  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3769  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3770  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
3771  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
3772  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
3773  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3774  }
3775  }
3776 #if 0
3777  auto printTensor = [](const cudaTensor& t, const std::string& name){
3778  CkPrintf("%s", name.c_str());
3779  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3780  "%12.5lf %12.5lf %12.5lf\n"
3781  "%12.5lf %12.5lf %12.5lf\n",
3782  t.xx, t.xy, t.xz,
3783  t.yx, t.yy, t.yz,
3784  t.zx, t.zy, t.zz);
3785  };
3786  printTensor(fixVirialNormal, "fixVirialNormal = ");
3787  printTensor(fixVirialNbond, "fixVirialNbond = ");
3788  printTensor(fixVirialSlow, "fixVirialSlow = ");
3789 #endif
3790  }
3791  }
3792 }
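// Annotation (not part of SequencerCUDA.C): across launch_part1 and launch_part3 the
// integrator calls addForceToMomentum with scale factors 0.5 (half-kick with the previous
// forces before the drift), 1.0 (kick with the freshly computed forces; this is the state
// used for the half-step kinetic energy and the Langevin/rattle updates), and -0.5 (which
// brings the stored velocities back to the on-step time). A one-degree-of-freedom sketch
// of the update those calls perform, with unit bookkeeping omitted:
static double addForceToMomentumScalar(double v, double scale, double dt,
                                       double force, double recipMass) {
  return v + scale * dt * force * recipMass;  // v += (scale*dt) * a
}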
3793 
3794 // Adding this function back temporarily until GPU migration is merged
3795 #if 1
3796 // This function aggregates data within a single GPU, so the PME positions need to have been copied over
3797 void SequencerCUDA::atomUpdatePme()
3798 {
3799  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
3800                                     ComputeNonbondedUtil::dielectric_1);
3801  // We need to find doNbond and doSlow for the upcoming step
3802  bool doNbond = false;
3803  bool doSlow = true;
3804 
3805  bool doFEP = false;
3806  bool doTI = false;
3807  bool doAlchDecouple = false;
3808  bool doAlchSoftCore = false;
3809  if (simParams->alchOn) {
3810  if (simParams->alchFepOn) doFEP = true;
3811  if (simParams->alchThermIntOn) doTI = true;
3812  if (simParams->alchDecouple) doAlchDecouple = true;
3813  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3814  }
3815 
3816  if (Node::Object()->molecule->is_lonepairs_psf) {
3817  lonepairsKernel->reposition(coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), stream);
3818  }
3819 
3820  bool usePatchPme = false;
3821  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3822  // This checks whether the current lattice is compatible with the patch-level PME kernels.
3823  // The check needs to be redone every time the lattice changes; the results are stored in
3824  // the cudaPme object, and the overall compatibility is reported by the compatible()
3825  // function.
3826  //
3827  // The behavior of the PME position setup differs depending on the kernels being used,
3828  // so that value (usePatchPme) needs to be passed to the kernel object.
3830  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3832  myLattice,
3833  getNumPatchesHome(),
3834  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3835  getHostPatchMin(),
3836  getHostPatchMax(),
3837  getHostAwayDists()
3838  );
3839  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3840  }
3841 
3842  std::vector<int> atom_counts;
3843  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3844  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3845  }
3846  CUDASequencerKernel->set_pme_positions(
3847  deviceIndex,
3849  nDevices,
3850  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3851  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3852 #ifdef NAMD_NCCL_ALLREDUCE
3853  (mGpuOn) ? d_posNew_x: coll_pos_x.getDevicePtr(),
3854  (mGpuOn) ? d_posNew_y: coll_pos_y.getDevicePtr(),
3855  (mGpuOn) ? d_posNew_z: coll_pos_z.getDevicePtr(),
3856 #else
3857  coll_pos_x.getDevicePtr(),
3858  coll_pos_y.getDevicePtr(),
3859  coll_pos_z.getDevicePtr(),
3860  coll_pos_x.getDevicePeerPtr(), // passes double-pointer if mgpuOn
3861  coll_pos_y.getDevicePeerPtr(),
3862  coll_pos_z.getDevicePeerPtr(),
3863  coll_charge.getDevicePeerPtr(),
3864  coll_partition.getDevicePeerPtr(),
3865 #endif
3866  coll_charge.getDevicePtr(), coll_partition.getDevicePtr(), charge_scaling,
3867  coll_patchCenter.getDevicePtr(),
3868  patchData->devData[deviceIndex].slow_patchPositions,
3869  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3870  coll_sortOrder.getDevicePtr(), myLattice,
3871  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3872  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3874  patchData->devData[deviceIndex].d_localPatches,
3875  patchData->devData[deviceIndex].d_peerPatches,
3876  atom_counts,
3877  stream);
3878 
3879  cudaCheck(cudaStreamSynchronize(stream));
3880 }
3881 #endif
3882 
3883 
3884 void SequencerCUDA::sync() {
3885  cudaCheck(cudaStreamSynchronize(stream));
3886 }
3887 
3888 void SequencerCUDA::calculateExternalForces(
3889  const int step,
3890  const int maxForceNumber,
3891  const int doEnergy,
3892  const int doVirial) {
3893 
3894  const bool is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3895  const bool is_tip4_water = simParams->watmodel == WaterModel::TIP4;
3896 
3897  if (is_lonepairs_psf) {
3898  lonepairsKernel->redistributeForce(
3899  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3900  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
3901  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
3902  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3903  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), maxForceNumber, doEnergy || doVirial, stream);
3904  }
3905  // TODO: Should I use "else if" here to follow the logic of Sequencer.C?
3906  if (is_tip4_water) {
3907  redistributeTip4pForces(maxForceNumber, doEnergy || doVirial);
3908  }
3909 
3910  if(simParams->eFieldOn){
3911  double3 efield;
3912  efield.x = simParams->eField.x;
3913  efield.y = simParams->eField.y;
3914  efield.z = simParams->eField.z;
3915 
3916  double efield_omega = TWOPI * simParams->eFieldFreq / 1000.;
3917  double efield_phi = PI/180. * simParams->eFieldPhase;
3918  double t = step * simParams->dt;
3919 
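      // The kernel applies a time-dependent field built from these parameters; a plausible
      // form (an assumption here, not taken from the kernel source) is a cosine,
      //   E(t) = eField * cos(efield_omega * t - efield_phi),
      // with the phase converted from degrees to radians above.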
3920  CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3921  doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3922  coll_charge.getDevicePtr(), coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3923  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3924  &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3925  &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3926  &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3927  d_tbcatomic, stream);
3928  }
3929 
3930  if(simParams->constraintsOn){
3931  restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3932  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3933  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3934  &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3935  &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3936  &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
3937  }
3938 
3939  if(simParams->SMDOn){
3940  SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
3941  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_transform,
3942  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3943  &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
3944  &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
3945  }
3946 
3947  if(simParams->groupRestraintsOn){
3948  groupRestraintsKernel->doForce(step, doEnergy, doVirial,
3949  myLattice, d_transform,
3950  d_mass, coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3951  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3952  &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
3953  &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
3954  }
3955  if(simParams->mgridforceOn || simParams->gridforceOn){
3956  gridForceKernel->doForce(doEnergy, doVirial,
3957  myLattice, step,
3958  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_transform,
3959  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3960  stream);
3961  }
3962  if(simParams->consForceOn){
3963  consForceKernel->doForce(myLattice, doVirial,
3964  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(),
3965  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
3966  d_transform,
3967  &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
3968  &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
3969  }
3970 
3971  if(doEnergy || doVirial) {
3972  // Accumulate the external force, energy, and virial contributions into the reduction
3973  cudaCheck(cudaStreamSynchronize(stream));
3974  if (is_lonepairs_psf || is_tip4_water) {
3975  switch (maxForceNumber) {
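      // The cases below intentionally fall through (no break): maxForceNumber == 2 harvests
      // the slow, nbond, and normal lone-pair virials; 1 harvests nbond and normal; 0 only normal.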
3976  case 2:
3977  copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
3978  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, lpVirialSlow[0]);
3979  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
3980  case 1:
3981  copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
3982  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, lpVirialNbond[0]);
3983  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
3984  case 0:
3985  copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
3986  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, lpVirialNormal[0]);
3987  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
3988  }
3989  }
3990  if(simParams->eFieldOn){
3991  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_ELEC_FIELD];
3992  if (!simParams->eFieldNormalized){
3993  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
3994  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
3995  }
3996  }
3997 
3998  if(simParams->constraintsOn){
3999  reduction->item(REDUCTION_BC_ENERGY) += extEnergy[EXT_CONSTRAINTS];
4000  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
4001  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
4002  }
4003 
4004  if(simParams->SMDOn){
4006  if(cudaMgr->reducerSMDDevice == deviceIndex)
4007  {
4008  // every device's SMD kernel holds the total SMD energy and external force,
4009  // so only the designated reducer device contributes them, avoiding double counting
4010  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_SMD];
4011  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_SMD]);
4012  }
4013  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_SMD]);
4014  }
4015 
4016  if(simParams->groupRestraintsOn){
4017  // only the single-GPU case or the designated reducer device contributes the energy and force
4019  if(!mGpuOn || (deviceCUDA->getDeviceIndex() == cudaMgr->reducerGroupRestraintDevice))
4020  {
4021  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GROUP_RESTRAINTS];
4022  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
4023  }
4024  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
4025  }
4026 
4027  if(simParams->mgridforceOn || simParams->gridforceOn){
4028  // first sum across grids
4029  gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4030  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GRIDFORCE];
4031  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
4032  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
4033  gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4034  }
4035  if(simParams->consForceOn){
4036  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_CONSFORCE];
4037  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
4038  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
4039  }
4040  }
4041 }
4042 
4043 void SequencerCUDA::copyGlobalForcesToDevice(){
4044  // Copy the globalMaster forces from host to device.
4045  // The per-patch f_global arrays are first aggregated into the flat host buffers.
4046  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
4047  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
4048  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4049  // TODO: determine whether this aggregation needs peer buffers and should cover home + proxy patches
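      // Each patch's atoms occupy a contiguous slice of the flat host buffers starting at
      // record.bufferOffset, so a single HtoD copy per force component suffices after the loop.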
4050  for(int i =0 ; i < numPatchesHome; i++){
4051  CudaLocalRecord record = localPatches[i];
4052  const int patchID = record.patchID;
4053  const int stride = record.bufferOffset;
4054  const int numPatchAtoms = record.numAtoms;
4055  PatchDataSOA& current = homePatches[i]->patchDataSOA;
4056  memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4057  memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4058  memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4059  }
4060  // copy aggregated force to device buffer
4061  copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4062  copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4063  copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4064 
4065 }
4066 
4067 void SequencerCUDA::updateHostPatchDataSOA() {
4068  std::vector<PatchDataSOA> host_copy(numPatchesHome);
4069  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4070 
4071  for(int i =0 ; i < numPatchesHome; i++) {
4072  host_copy[i] = homePatches[i]->patchDataSOA;
4073  }
4074  copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4075  cudaCheck(cudaDeviceSynchronize());
4076 }
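      // Note: the PatchDataSOA structs are copied by value, so if a patch later reallocates
      // its host arrays the device-side copy presumably goes stale until this is called again.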
4077 
4078 void SequencerCUDA::saveForceCUDASOA_direct(
4079  const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4080  CUDASequencerKernel->copyForcesToHostSOA(
4081  numPatchesHome,
4082  patchData->devData[deviceIndex].d_localPatches,
4083  maxForceNumber,
4084  coll_f_normal_x.getDevicePtr(),
4085  coll_f_normal_y.getDevicePtr(),
4086  coll_f_normal_z.getDevicePtr(),
4087  coll_f_nbond_x.getDevicePtr(),
4088  coll_f_nbond_y.getDevicePtr(),
4089  coll_f_nbond_z.getDevicePtr(),
4090  coll_f_slow_x.getDevicePtr(),
4091  coll_f_slow_y.getDevicePtr(),
4092  coll_f_slow_z.getDevicePtr(),
4093  d_HostPatchDataSOA,
4094  doGlobal,
4095  doForcesOutput,
4096  stream
4097  );
4098  cudaCheck(cudaStreamSynchronize(stream));
4099 }
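      // The "_direct" path assumes the kernel can scatter forces straight into the per-patch
      // host arrays referenced through d_HostPatchDataSOA (presumably pinned/mapped memory),
      // avoiding a separate aggregated host-side copy.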
4100 
4101 void SequencerCUDA::copyPositionsToHost_direct() {
4102  CUDASequencerKernel->copyPositionsToHostSOA(
4103  numPatchesHome,
4104  patchData->devData[deviceIndex].d_localPatches,
4105  coll_pos_x.getDevicePtr(),
4106  coll_pos_y.getDevicePtr(),
4107  coll_pos_z.getDevicePtr(),
4108  d_HostPatchDataSOA,
4109  stream
4110  );
4111  cudaCheck(cudaStreamSynchronize(stream));
4112 }
4113 
4114 void SequencerCUDA::redistributeTip4pForces(
4115  const int maxForceNumber,
4116  const int doVirial) {
4117  CUDASequencerKernel->redistributeTip4pForces(
4118  coll_f_normal_x.getDevicePtr(), coll_f_normal_y.getDevicePtr(), coll_f_normal_z.getDevicePtr(),
4119  coll_f_nbond_x.getDevicePtr(), coll_f_nbond_y.getDevicePtr(), coll_f_nbond_z.getDevicePtr(),
4120  coll_f_slow_x.getDevicePtr(), coll_f_slow_y.getDevicePtr(), coll_f_slow_z.getDevicePtr(),
4121  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4122  coll_pos_x.getDevicePtr(), coll_pos_y.getDevicePtr(), coll_pos_z.getDevicePtr(), d_mass,
4123  numAtomsHome, doVirial, maxForceNumber, stream
4124  );
4125 }
4126 
4127 void SequencerCUDA::allocateGPUSavedForces() {
4128  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4129  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4130  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4131  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4132  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4133  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
4134 }
4135 
4136 void SequencerCUDA::submitReductionValues() {
4137  reduction->submit();
4138 }
4139 
4140 #endif // NAMD_CUDA