#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define __thread __declspec(thread)

const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {

const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {

  Compute(c), deviceID(deviceID), masterPe(CkMyPe()),
  bondedKernel(deviceID, cudaNonbondedTables), computeMgr(computeMgr)

  computes.resize(CkMyNodeSize());
  patchIDsPerRank.resize(CkMyNodeSize());
  numExclPerRank.resize(CkMyNodeSize());
  for (int i=0;i < numExclPerRank.size();i++) {
    numExclPerRank[i].numModifiedExclusions = 0;
    numExclPerRank[i].numExclusions = 0;

  energies_virials = NULL;

  initializeCalled = false;

  accelMDdoDihe = false;
  if (params->accelMDOn) {
     if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;

  pswitchTable[0] = 0; pswitchTable[1] = 1;  pswitchTable[2] = 2;
  pswitchTable[3] = 1; pswitchTable[4] = 1;  pswitchTable[5] = 99;
  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;
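  // NOTE: pswitchTable is consumed below as pswitchTable[p1 + 3*p2], where p1
  // and p2 appear to be the FEP partition codes of the two atoms of an
  // exclusion (in NAMD's usual convention: 0 = unperturbed, 1 = appearing
  // group, 2 = disappearing group). The 3x3 table collapses the partition pair
  // into a single pswitch value for the GPU kernels; 99 presumably flags the
  // invalid case of one atom from each alchemical group sharing an exclusion.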
  h_patchRecord = NULL;
  d_patchRecord = NULL;

  h_patchMapCenter = NULL;
  d_patchMapCenter = NULL;

ComputeBondedCUDA::~ComputeBondedCUDA() {

  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
  if (tupleData != NULL) deallocate_host<char>(&tupleData);

  if (initializeCalled) {

    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
    CmiDestroyLock(printLock);
    if (reductionGpuOffload) {
      delete reductionGpuOffload;
    if (reductionGpuResident) {
      delete reductionGpuResident;

  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);

  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);

void ComputeBondedCUDA::unregisterBoxesOnPe() {
  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];

    if (tpe == NULL || tpe->p == NULL) {
      NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");

void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");

  int rank = CkRankOf(pe);

  HomeCompute& homeCompute = computes[rank].homeCompute;
  if (homeCompute.patchIDs.size() == 0) {

    homeCompute.patchIDs.resize(pids.size());
    for (int i=0;i < pids.size();i++) {
      homeCompute.patchIDs[i] = pids[i];
      homeCompute.isBasePatch[pids[i]] = 1;

    if (homeCompute.patchIDs.size() != pids.size()) {
      NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");

    for (int i=0;i < pids.size();i++) {
      if (homeCompute.patchIDs[i] != pids[i]) {
        NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");

    homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));

    homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));

    homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));

    homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));

    homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));

    homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));

    homeCompute.tuples.push_back(new HomeTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE));

    homeCompute.tuples.push_back(new HomeTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO));

    homeCompute.tuples.push_back(new HomeTuples<OneFourNbTholeElem, OneFourNbThole, OneFourNbTholeValue>(Tuples::ONEFOURNBTHOLE));

    NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");

void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");

  int rank = CkRankOf(pe);

  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
  if (it == selfComputes.end()) {

    selfComputes.push_back(SelfCompute(type));
    it = selfComputes.begin() + (selfComputes.size() - 1);

      it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);

      it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);

      it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);

      it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);

      it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);

      it->tuples = new SelfTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE);

      it->tuples = new SelfTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO);

      it->tuples = new SelfTuples<OneFourNbTholeElem, OneFourNbThole, OneFourNbTholeValue>(Tuples::ONEFOURNBTHOLE);

      NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");

  it->patchIDs.push_back(pid);

void ComputeBondedCUDA::assignPatchesOnPe() {

  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];

      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
    if (flags == NULL) flags = &patchMap->patch(patchID)->flags;

      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");

    if (tpe->p != NULL) {
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");

void ComputeBondedCUDA::atomUpdate() {
  atomsChangedIn = true;

int ComputeBondedCUDA::noWork() {

void ComputeBondedCUDA::messageEnqueueWork() {
  if (masterPe != CkMyPe())
    NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");

void ComputeBondedCUDA::doWork() {
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");

  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;

  if (getNumPatches() == 0) {

    NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");

  lattice  = flags->lattice;
  doEnergy = flags->doEnergy;
  doVirial = flags->doVirial;
  doSlow   = flags->doFullElectrostatics;
  doMolly  = flags->doMolly;

  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchLambdas();
    updateKernelCudaAlchLambdas();

    if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {

  if(params->CUDASOAintegrate) {
    if (!atomsChanged) this->openBoxesOnPe();

void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {

#ifdef NODEGROUP_FORCE_REGISTER
    patches[patchIndex[pid]].patchID = pid;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {

void ComputeBondedCUDA::updatePatches() {

    for (int i=0;i < patches.size();i++) {
      patches[i].atomStart = atomStart;
      atomStart += patches[i].numAtoms;

    atomStorageSize = atomStart;

    reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

#ifdef NODEGROUP_FORCE_REGISTER
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();

    std::vector<CudaLocalRecord>& localPatches =
      patchData->devData[deviceIndex].h_localPatches;
    const int numPatchesHomeAndProxy =
      patchData->devData[deviceIndex].numPatchesHomeAndProxy;

    for (int i=0;i < numPatchesHomeAndProxy; i++) {
      patches[i].numAtoms = localPatches[i].numAtoms;
      patches[i].atomStart = localPatches[i].bufferOffset;
      atomStart += patches[i].numAtoms;

    atomStorageSize = atomStart;
    reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

    if (params->CUDASOAintegrate && params->useDeviceMigration) {
      bondedKernel.updateAtomBuffer(atomStorageSize, stream);
      updatePatchRecords();

#endif  // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::mapAtoms() {
  for (int i=0;i < getNumPatches();i++) {

void ComputeBondedCUDA::unmapAtoms() {
  for (int i=0;i < getNumPatches();i++) {

void ComputeBondedCUDA::openBoxesOnPe(int startup) {

  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];

  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);

#ifdef NODEGROUP_FORCE_REGISTER
  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
      for (auto it=patchIDs.begin();it != patchIDs.end();it++) {

  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {

    if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
      int pi = patchIndex[patchID];
      int atomStart = patches[pi].atomStart;
      int numAtoms = patches[pi].numAtoms;

      for (int i=0;i < numAtoms;i++) {

        atoms[atomStart + j] = src[i];

  patchesCounter -= patchIDs.size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();

      if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {

      if(!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);

      if(params->CUDASOAintegrate){
         if(!atomsChanged) this->launchWork();

#ifdef NODEGROUP_FORCE_REGISTER
  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
  for(int i = 0 ; i < tuplePatchList.size(); i++){

    if(tpe == NULL) break;
    fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n", CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);

  CmiUnlock(printLock);

void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
  numModifiedExclusions = 0;
  int ntuples = tuples->getNumTuples();

  for (int ituple=0;ituple < ntuples;ituple++) {
    if (src[ituple].modified) numModifiedExclusions++;

  numExclusions = ntuples - numModifiedExclusions;
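// NOTE: countNumExclusions() separates a tuple list's exclusions into
// "modified" exclusions (typically the 1-4 pairs that keep scaled nonbonded
// interactions) and plain exclusions, so loadTuplesOnPe() can size the two
// exclusion segments that the GPU kernels treat differently.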
void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {

  int numModifiedExclusions = 0;
  int numExclusions = 0;
  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
    std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;

    for (auto it=selfComputes.begin();it != selfComputes.end();it++) {

      it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);

        countNumExclusions(it->tuples, tmp1, tmp2);
        numModifiedExclusions += tmp1;
        numExclusions += tmp2;

    HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
    for (int i=0;i < homeCompute.tuples.size();i++) {
      homeCompute.tuples[i]->loadTuples(tuplePatchList,
        homeCompute.isBasePatch.data(), &atomMap,
        homeCompute.patchIDs);

        countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
        numModifiedExclusions += tmp1;
        numExclusions += tmp2;

    numModifiedExclusions = modifiedExclusionTupleData.size();
    numExclusions = exclusionTupleData.size();

  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
  numExclPerRank[CkMyRank()].numExclusions         = numExclusions;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {

    patchesCounter -= patchIDsPerRank[CkMyRank()].size();
    if (patchesCounter == 0) {
      patchesCounter = getNumPatches();

      if(!params->CUDASOAintegrate) computeMgr->sendLaunchWork(masterPe, this);

void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,

  for (int ituple=0;ituple < ntuples;ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);

    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
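// NOTE: the copy routines below all follow the pattern of copyBondData():
// per-patch local indices are flattened into device atom indices
// (localIndex + patches[patchIndex].atomStart), the parameter index is taken
// as a pointer difference into the global parameter array, and the periodic
// image offset is obtained in scaled coordinates from lattice.wrap_delta_scaled(),
// corrected by the difference of the two patch centers whenever the atoms
// belong to different patches (positions are presumably stored relative to
// each patch's own center).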
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const BondElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];

  dstval.index[0] = l0;
  dstval.index[1] = l1;

  if (hostAlchFlags.alchOn) {

#endif  // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,

  float3 b1f, b2f, b3f;
  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0;ituple < ntuples;ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);

    Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
    if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 diff = position1 - position2;
    float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
    float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
    float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;

void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,

  for (int ituple=0;ituple < ntuples;ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.itype = (src[ituple].value - angle_array);

    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
    if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if(pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;

#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;

  if (hostAlchFlags.alchOn) {

#endif  // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  for (int ituple=0;ituple < ntuples;ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;

      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];

      dstval.itype = improperMultMap[(src[ituple].value - p_array)];

    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
    if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;

#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];

  dstval.itype = dihedralMultMap[(src.value - p_array)];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (hostAlchFlags.alchOn) {

void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];

  dstval.itype = improperMultMap[(src.value - p_array)];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (hostAlchFlags.alchOn) {

template <typename T, typename P, typename D>
void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, std::vector<D>& dst) {

  for (int ituple=0;ituple < ntuples;ituple++) {

    copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
    dst.push_back(dstval);

#endif  // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0;ituple < ntuples;ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;

      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];

      dstval.itype = improperMultMap[(src[ituple].value - p_array)];

    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
    if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);

    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 position3 = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
    float3 position4 = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);

    float3 diff12, diff23, diff43;
    diff12 = position1 - position2;
    diff23 = position2 - position3;
    diff43 = position4 - position3;

    float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
    float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
    float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);

    float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
    float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
    float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);

    float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
    float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
    float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
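// NOTE: in the fp32 paths above, b1f/b2f/b3f hold the reciprocal lattice
// vectors (lattice.a_r(), b_r(), c_r()) in single precision, so each
// d = -floorf(dot(b, diff) + 0.5f) rounds the fractional separation to the
// nearest integer image count along that lattice direction; this is the
// single-precision analogue of the shift that wrap_delta_scaled() returns in
// the double-precision copy routines.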
void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,

  for (int ituple=0;ituple < ntuples;ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];

    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    ce.i            = l0 + patches[pi0].atomStart;
    ce.j            = l1 + patches[pi1].atomStart;

    if (hostAlchFlags.alchOn) {

      ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];

    if (src[ituple].modified) {

#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int l0 = src.localIndex[0];
  int l1 = src.localIndex[1];

  dstval.index[0] = l0;
  dstval.index[1] = l1;

  if (hostAlchFlags.alchOn) {

    dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];

void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {

  for (int ituple=0;ituple < ntuples;ituple++) {

    copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
    if (src[ituple].modified) {

#endif  // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,

  for (int ituple=0;ituple < ntuples;ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    auto p4 = src[ituple].p[4];
    auto p5 = src[ituple].p[5];
    auto p6 = src[ituple].p[6];
    auto p7 = src[ituple].p[7];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int pi4 = patchIndex[p4->patchID];
    int pi5 = patchIndex[p5->patchID];
    int pi6 = patchIndex[p6->patchID];
    int pi7 = patchIndex[p7->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    int l4 = src[ituple].localIndex[4];
    int l5 = src[ituple].localIndex[5];
    int l6 = src[ituple].localIndex[6];
    int l7 = src[ituple].localIndex[7];
    dst[ituple].i1 = l0 + patches[pi0].atomStart;
    dst[ituple].i2 = l1 + patches[pi1].atomStart;
    dst[ituple].i3 = l2 + patches[pi2].atomStart;
    dst[ituple].i4 = l3 + patches[pi3].atomStart;
    dst[ituple].i5 = l4 + patches[pi4].atomStart;
    dst[ituple].i6 = l5 + patches[pi5].atomStart;
    dst[ituple].i7 = l6 + patches[pi6].atomStart;
    dst[ituple].i8 = l7 + patches[pi7].atomStart;
    dst[ituple].itype = (src[ituple].value - crossterm_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Position position5 = p4->x[l4].position;
    Position position6 = p5->x[l5].position;
    Position position7 = p6->x[l6].position;
    Position position8 = p7->x[l7].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
    Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
    Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
    Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
    if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if(pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    if(pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
    if(pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
    if(pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
    dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
    dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
    dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
    dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
    dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

      int typeSum1 = 0, typeSum2 = 0;
      for (size_t i = 0; i < 4; ++i) {
        typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
        typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));

      int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
      if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
        dst[ituple].fepBondedType = 1;
      } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
        dst[ituple].fepBondedType = 2;

      dst[ituple].fepBondedType = 0;

    dst[ituple].scale = src[ituple].scale;
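    // NOTE: in the alchemical branch above, the eight crossterm atoms are
    // treated as two four-atom halves. Each half sums its atoms' FEP partition
    // codes with partition 2 counted as -1, so a partial positive sum marks
    // the term as belonging to the appearing group (fepBondedType 1) and a
    // partial negative sum to the disappearing group (fepBondedType 2);
    // 'order' (4, or 5 when alchBondDecouple is set) is the cutoff for a half
    // that lies entirely in one partition.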
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];
  int pi4 = patchIndex[p4->patchID];
  int pi5 = patchIndex[p5->patchID];
  int pi6 = patchIndex[p6->patchID];
  int pi7 = patchIndex[p7->patchID];

  dstval.itype = (src.value - crossterm_array);

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  dstval.index[4] = l4;
  dstval.index[5] = l5;
  dstval.index[6] = l6;
  dstval.index[7] = l7;

  if (hostAlchFlags.alchOn) {

    int typeSum1 = 0, typeSum2 = 0;
    for (size_t i = 0; i < 4; ++i) {

    int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
    if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {

    } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {

#endif  // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTholeData(
  const int ntuples, const TholeElem* __restrict__ src,

  for (int ituple=0;ituple < ntuples;ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;
    dst[ituple].j = l1 + patches[pi1].atomStart;
    dst[ituple].k = l2 + patches[pi2].atomStart;
    dst[ituple].l = l3 + patches[pi3].atomStart;

    const TholeValue * const(&value)(src[ituple].value);
    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");

    dst[ituple].aa = value->aa;
    dst[ituple].qq = value->qq;

    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
    Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
    Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
    if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
    if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
    if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
    dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
    dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
    dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
    if (hostAlchFlags.alchOn) {

      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

      typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
      typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
      const int order = (!params->alchDecouple ? 3 : 2);
      if (typeSum == 0 || abs(typeSum) == order) dst[ituple].fepBondedType = 0;
      else if (0 < typeSum && typeSum < order) dst[ituple].fepBondedType = 1;
      else if (-order < typeSum && typeSum < 0) dst[ituple].fepBondedType = 2;
      else dst[ituple].fepBondedType = 0;

      dst[ituple].fepBondedType = 0;

#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const TholeElem& src,

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");

  dstval.aa = value->aa;
  dstval.qq = value->qq;
  if (hostAlchFlags.alchOn) {

    const int order = (!params->alchDecouple ? 3 : 2);

void ComputeBondedCUDA::copyAnisoData(
  const int ntuples, const AnisoElem* __restrict src,

  for (int ituple=0;ituple < ntuples;ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;

    dst[ituple].j = l0 + 1 + patches[pi0].atomStart;
    dst[ituple].l = l1 + patches[pi1].atomStart;
    dst[ituple].m = l2 + patches[pi2].atomStart;
    dst[ituple].n = l3 + patches[pi3].atomStart;
    const AnisoValue * const(&value)(src[ituple].value);
    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyAnisoData.\n");

    dst[ituple].kpar0  = 2.0 * value->k11;
    dst[ituple].kperp0 = 2.0 * value->k22;
    dst[ituple].kiso0  = 2.0 * value->k33;
    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_il = lattice.wrap_delta_scaled(position0, position1);
    Vector shiftVec_mn = lattice.wrap_delta_scaled(position2, position3);
    if (pi0 != pi1) shiftVec_il += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi2 != pi3) shiftVec_mn += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_il = make_float3((float)shiftVec_il.x, (float)shiftVec_il.y, (float)shiftVec_il.z);
    dst[ituple].offset_mn = make_float3((float)shiftVec_mn.x, (float)shiftVec_mn.y, (float)shiftVec_mn.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

      dst[ituple].fepBondedType = 0;

#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AnisoElem& src,

  dstval.index[0] = l0;
  dstval.index[1] = l0 + 1;
  dstval.index[2] = l1;
  dstval.index[3] = l2;
  dstval.index[4] = l3;

  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTupleToStage(AnisoElem).\n");

  dstval.kpar0  = 2.0 * value->k11;
  dstval.kperp0 = 2.0 * value->k22;
  dstval.kiso0  = 2.0 * value->k33;
  if (hostAlchFlags.alchOn) {

void ComputeBondedCUDA::copyOneFourNbTholeData(

  for (int ituple=0;ituple < ntuples;ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;
    dst[ituple].j = l1 + patches[pi1].atomStart;
    dst[ituple].k = l2 + patches[pi2].atomStart;
    dst[ituple].l = l3 + patches[pi3].atomStart;

    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyOneFourNbTholeData.\n");

    dst[ituple].aa = value->aa;
    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
    Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
    Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
    if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
    if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
    if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
    dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
    dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
    dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);

#ifdef NODEGROUP_FORCE_REGISTER

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTupleToStage(OneFourNbTholeElem).\n");

  dstval.aa = value->aa;

#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorker(first, last);

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorkerExcl(first, last);

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {

  int64_t numModExclusionsBefore=0;
  int64_t numExclusionsBefore=0;

    int ntuples = (*it)->getNumTuples();
    auto thisExcl = (ExclElem *)(*it)->getTupleList();
    for (int ituple=0;ituple < ntuples;ituple++) {
      if(thisExcl[ituple].modified)
        numModExclusionsBefore++;
      else
        numExclusionsBefore++;

  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int numExclusionsWork=last-first;
  auto itEnd=std::next(itStart,numExclusionsWork+1);
  for ( auto it=itStart;it != itEnd;it++) {
    int ntuples = (*it)->getNumTuples();
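// The exclusion worker above finds its write position without any locking: it
// counts how many modified / unmodified exclusions live in the tuple lists
// before its [first, last] range and converts those counts into byte offsets
// from exclusionStartPos / exclusionStartPos2. A minimal, self-contained
// sketch of that offset computation (illustrative only, not NAMD code;
// ExclCount and workerStartOffsets are made-up names), guarded out of
// compilation:
#if 0
#include <cstdint>
#include <utility>
#include <vector>

struct ExclCount { int modified; int normal; };

static std::pair<int64_t, int64_t> workerStartOffsets(
    const std::vector<ExclCount>& perList,  // counts per tuple list, in list order
    int first,                              // index of this worker's first list
    int64_t startModified, int64_t startNormal, int elemBytes) {
  int64_t nModBefore = 0, nNormBefore = 0;
  for (int i = 0; i < first; i++) {
    nModBefore  += perList[i].modified;
    nNormBefore += perList[i].normal;
  }
  return { startModified + nModBefore * elemBytes,
           startNormal   + nNormBefore * elemBytes };
}
#endif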
void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {

    int64_t pos = exclusionStartPos;
    int64_t pos2 = exclusionStartPos2;

      int ntuples = (*it)->getNumTuples();

  for (int i=first;i <= last;i++) {
    switch (tupleCopyWorkList[i].tupletype) {

        copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,

        copyTholeData(tupleCopyWorkList[i].ntuples, (TholeElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaThole *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

        copyAnisoData(tupleCopyWorkList[i].ntuples, (AnisoElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaAniso*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      case Tuples::ONEFOURNBTHOLE:

        copyOneFourNbTholeData(tupleCopyWorkList[i].ntuples, (OneFourNbTholeElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaOneFourNbThole*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");

#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  int numBondsTest = patchData->maxNumBonds.load();
  while (numBondsTest < counts.bond &&
         !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));

  int numAnglesTest = patchData->maxNumAngles.load();
  while (numAnglesTest < counts.angle &&
         !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));

  int numDihedralsTest = patchData->maxNumDihedrals.load();
  while (numDihedralsTest < counts.dihedral &&
         !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));

  int numImpropersTest = patchData->maxNumImpropers.load();
  while (numImpropersTest < counts.improper &&
         !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));

  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
         !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));

  int numExclusionsTest = patchData->maxNumExclusions.load();
  while (numExclusionsTest < counts.exclusion &&
         !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));

  int numCrosstermsTest = patchData->maxNumCrossterms.load();
  while (numCrosstermsTest < counts.crossterm &&
         !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
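// The seven loops above implement a lock-free "atomic max": keep retrying
// compare_exchange_strong() until either the stored maximum is already at
// least the candidate or the exchange succeeds. A minimal, self-contained
// sketch of the idiom (illustrative only, not NAMD code; atomicMaxInt is a
// made-up name), guarded out of compilation:
#if 0
#include <atomic>

static void atomicMaxInt(std::atomic<int>& target, int candidate) {
  int observed = target.load();
  // On failure, compare_exchange_strong() reloads 'observed' with the current
  // value, so the loop re-checks the condition against the freshest maximum.
  while (observed < candidate &&
         !target.compare_exchange_strong(observed, candidate)) { }
}
#endif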
TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  counts.bond = patchData->maxNumBonds.load();
  counts.angle = patchData->maxNumAngles.load();
  counts.dihedral = patchData->maxNumDihedrals.load();
  counts.improper = patchData->maxNumImpropers.load();

  counts.exclusion = patchData->maxNumExclusions.load();
  counts.crossterm = patchData->maxNumCrossterms.load();

  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",

void ComputeBondedCUDA::migrateTuples(bool startup) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  bool amMaster = (masterPe == CkMyPe());

    cudaStreamSynchronize(stream);

    cudaStreamSynchronize(stream);

      const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];

      TupleCounts counts = bondedKernel.getTupleCounts();
      migrationKernel.computeTupleDestination(
        patchData->devData[devInd].numPatchesHome,
        migrationDestination,
        patchData->devData[devInd].d_patchToDeviceMap,
        patchData->devData[devInd].d_globalToLocalID,

      cudaCheck(cudaStreamSynchronize(stream));

      migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
      cudaCheck(cudaStreamSynchronize(stream));

  bool realloc = false;

  if (amMaster && !startup) {
    migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);

    local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
    updateMaxTupleCounts(local);

    CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",

  if (amMaster && !startup) {

      newMax = getMaxTupleCounts();
      realloc = migrationKernel.reallocateBufferDst(newMax);
      patchData->tupleReallocationFlagPerDevice[devInd] = realloc;

    realloc = patchData->tupleReallocationFlagPerDevice[0];

      registerPointersToHost();

      copyHostRegisterToDevice();
      cudaCheck(cudaStreamSynchronize(stream));

  if (amMaster && !startup) {

    migrationKernel.performTupleMigration(
      bondedKernel.getTupleCounts(),

    cudaCheck(cudaStreamSynchronize(stream));

  if (amMaster && !startup) {

      bondedKernel.reallocateTupleBuffer(newMax, stream);
      migrationKernel.reallocateBufferSrc(newMax);
      cudaCheck(cudaStreamSynchronize(stream));

      bondedKernel.setTupleCounts(local);

    const int* ids = patchData->h_soa_id[devInd];
    migrationKernel.updateTuples(
      bondedKernel.getTupleCounts(),
      bondedKernel.getData(),

      bondedKernel.getAtomBuffer(),

template<typename T>
void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {

  std::vector<std::pair<int,int>> downstreamPatches;
  for (int i = 0; i < tuples.size(); i++) {
    int downstream = tuples[i].patchIDs[0];
    for (int j = 1; j < T::size; j++) {
      downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);

    downstreamPatches.push_back(std::make_pair(i, downstream));
    tupleCounts[patchIndex[downstream]]++;

  tupleOffsets[0] = 0;
  for (int i = 0; i < tupleCounts.size(); i++) {
    tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];

  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
    [](std::pair<int, int> a, std::pair<int, int> b) {
    return a.second < b.second;

  std::vector<T> copy = tuples;
  for (int i = 0; i < tuples.size(); i++) {
    tuples[i] = copy[downstreamPatches[i].first];
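// sortTupleList() above is a stable sort-by-key plus gather: each tuple is
// keyed by the downstream patch of its atoms, per-patch counts become
// exclusive-prefix offsets, and the tuples are reordered into downstream-patch
// order. A minimal, self-contained sketch of the same structure (illustrative
// only, not NAMD code; sortByKey is a made-up name), guarded out of compilation:
#if 0
#include <algorithm>
#include <utility>
#include <vector>

template <typename T, typename KeyFn>
static void sortByKey(std::vector<T>& items, int numKeys, KeyFn key,
                      std::vector<int>& counts, std::vector<int>& offsets) {
  counts.assign(numKeys, 0);
  offsets.assign(numKeys + 1, 0);
  std::vector<std::pair<int,int>> order;   // (original index, key)
  for (int i = 0; i < (int)items.size(); i++) {
    order.push_back({i, key(items[i])});
    counts[key(items[i])]++;
  }
  for (int k = 0; k < numKeys; k++) offsets[k+1] = offsets[k] + counts[k];
  std::stable_sort(order.begin(), order.end(),
                   [](const std::pair<int,int>& a, const std::pair<int,int>& b) {
                     return a.second < b.second;
                   });
  std::vector<T> copy = items;             // gather into key order
  for (int i = 0; i < (int)items.size(); i++) items[i] = copy[order[i].first];
}
#endif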
void ComputeBondedCUDA::sortAndCopyToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;

  std::vector<int> bondPatchCounts(numPatchesHome, 0);
  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> anglePatchCounts(numPatchesHome, 0);
  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);

  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> improperPatchCounts(numPatchesHome, 0);
  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);

  #define CALL(fieldName) do { \
    sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
    h_dataStage.fieldName = fieldName##TupleData.data(); \
    h_counts.fieldName = fieldName##PatchCounts.data(); \
    h_offsets.fieldName = fieldName##PatchOffsets.data(); \

  CALL(modifiedExclusion);

  migrationKernel.copyTupleToDevice(
    bondedKernel.getTupleCounts(),

  cudaCheck(cudaStreamSynchronize(stream));
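// NOTE: the CALL(fieldName) macro above token-pastes a tuple category name
// into the same three steps: sort that category's staged tuples by downstream
// patch with sortTupleList(), then publish the host pointers for the staged
// data, the per-patch counts, and the per-patch offsets.
// CALL(modifiedExclusion) is one such invocation; the macro is presumably
// expanded once per tuple category before migrationKernel.copyTupleToDevice()
// uploads everything in a single pass.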
void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {
  // ...
  switch (tupletype) {
    // ... (exclusions are split into modified and regular exclusions)
      modifiedExclusionTupleData.clear();
      exclusionTupleData.clear();
      int64_t pos = exclusionStartPos;
      int64_t pos2 = exclusionStartPos2;
      // ...
        int ntuples = (*it)->getNumTuples();
        // ...
          modifiedExclusionTupleData, exclusionTupleData, pos, pos2);
      // ...

      bondTupleData.clear();
      // ...
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<BondElem, BondValue, CudaBondStage>(/* ... */);
      // ...

      angleTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        int ntuples = (*it)->getNumTuples();
        // ...

      dihedralTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList, /* ... */);
      // ...

      improperTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList, /* ... */);
      // ...

      crosstermTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList, /* ... */);
      // ...

      tholeTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        const int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<TholeElem, TholeValue, CudaTholeStage>(ntuples, elemList, nullptr, tholeTupleData);
      // ...

      anisoTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        const int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<AnisoElem, AnisoValue, CudaAnisoStage>(ntuples, elemList, nullptr, anisoTupleData);
      // ...

    case Tuples::ONEFOURNBTHOLE:
      // ...
      oneFourNbTholeTupleData.clear();
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        const int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<OneFourNbTholeElem, OneFourNbTholeValue, CudaOneFourNbTholeStage>(ntuples, elemList, nullptr, oneFourNbTholeTupleData);
      // ...

      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
  // ...
#endif  // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleData() {
  // ...
  int numModifiedExclusions = 0;
  int numExclusions = 0;
  for (int i=0;i < numExclPerRank.size();i++) {
    numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
    numExclusions         += numExclPerRank[i].numExclusions;
  }
  // ...
  exclusionStartPos = 0;
  exclusionStartPos2 = 0;
  tupleCopyWorkList.clear();
  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
    // ...
    int64_t pos = posWA;
    // ...
      exclusionStartPos = pos;
      exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
    // ...
    for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
      int ntuples = (*it)->getNumTuples();
      // ...
        TupleCopyWork tupleCopyWork;
        tupleCopyWork.tupletype     = tupletype;
        tupleCopyWork.ntuples       = ntuples;
        tupleCopyWork.tupleElemList = (*it)->getTupleList();
        tupleCopyWork.tupleDataPos  = pos;
        tupleCopyWorkList.push_back(tupleCopyWork);
        pos += ntuples*CudaTupleTypeSize[tupletype];
      // ...
    }
    numTuplesPerType[tupletype] = num;
    // ...
      posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
    // ...
  }
  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
    NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
  }
  // ...
  hasExclusions = (numExclusions > 0);
  hasModifiedExclusions = (numModifiedExclusions > 0);
  // ...
  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

#if CMK_SMP && USE_CKLOOP
  // ...
  if (useCkLoop >= 1) {
#define CKLOOP_EXCLUSIONS 1
#if CKLOOP_EXCLUSIONS
    CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);
    // ...
    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);
#else
    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
#endif
  } else
#endif
  {
    tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
  }
  // ...
    numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
    numTuplesPerType[Tuples::ONEFOURNBTHOLE], tupleData, stream);
  // ...
  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
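// Illustrative sketch (not from the NAMD sources): copyTupleData() above lays
// every tuple type out back-to-back in one host byte buffer, recording a work
// item with the byte position of each source list and advancing the cursor by
// ntuples times the per-type element size. The standalone example below shows
// that bookkeeping shape; all names are hypothetical.
#if 0
#include <cstdint>
#include <cstring>
#include <vector>

struct WorkItem { int type; int count; int64_t bytePos; };

int main() {
  const int elemSize[2]  = { 8, 24 };   // bytes per record, per type
  const int listCount[2] = { 10, 4 };   // records available per type

  std::vector<WorkItem> work;
  int64_t pos = 0;
  for (int type = 0; type < 2; type++) {
    work.push_back({ type, listCount[type], pos });
    pos += int64_t(listCount[type]) * elemSize[type];
  }

  std::vector<char> buffer(pos);        // one flat allocation for all types
  for (const WorkItem& w : work) {
    // a real copy would serialize records of type w.type at this offset
    std::memset(buffer.data() + w.bytePos, 0, size_t(w.count) * elemSize[w.type]);
  }
  return 0;
}
#endif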
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleDataSN() {
  // ...
  size_t numExclusions, numModifiedExclusions, copyIndex;
  // ...
  if(masterPe == CkMyPe()){
    numModifiedExclusions = 0;
    // ...
    for (int i=0;i < numExclPerRank.size();i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions         += numExclPerRank[i].numExclusions;
    }
    // ...
    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
      // ...
      int64_t pos = posWA;
      // ...
        exclusionStartPos = pos;
        exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
      // ...
      for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
          TupleCopyWork tupleCopyWork;
          tupleCopyWork.tupletype     = tupletype;
          tupleCopyWork.ntuples       = ntuples;
          tupleCopyWork.tupleElemList = (*it)->getTupleList();
          tupleCopyWork.tupleDataPos  = pos;
          tupleCopyWorkList.push_back(tupleCopyWork);
          pos += ntuples*CudaTupleTypeSize[tupletype];
        // ...
      }
      numTuplesPerType[tupletype] = num;
      // ...
        posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
      // ...
    }
    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
    }
    // ...
    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);
    // ...
    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
  }
  // ...
  this->tupleCopyWorker(first, last);
  // ...
  while( (copyIndex = tupleWorkIndex.fetch_add(1) ) <=  tupleCopyWorkList.size()){
     this->tupleCopyWorker(copyIndex -1, copyIndex -1);
  }
  // ...
  if(masterPe == CkMyPe()){
    tupleWorkIndex.store(0);
    // ...
      numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
      numTuplesPerType[Tuples::ONEFOURNBTHOLE],
    // ...
    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
  }
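// Illustrative sketch (not from the NAMD sources): in copyTupleDataSN() every
// PE claims work items by atomically incrementing tupleWorkIndex. The example
// below shows the same self-scheduling loop with std::thread standing in for
// Charm++ PEs and a simpler zero-based claim convention; names are hypothetical.
#if 0
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  std::vector<int> workList(16, 0);
  std::atomic<size_t> workIndex{0};

  auto worker = [&](int id) {
    size_t claimed;
    // fetch_add returns the pre-increment value, so 0..size()-1 are valid claims
    while ((claimed = workIndex.fetch_add(1)) < workList.size()) {
      workList[claimed] = id;           // "process" item `claimed`
    }
  };

  std::thread t0(worker, 1), t1(worker, 2);
  t0.join(); t1.join();

  for (size_t i = 0; i < workList.size(); i++)
    std::printf("item %zu done by worker %d\n", i, workList[i]);
  return 0;
}
#endif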
void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {
  // ...
  size_t numExclusions, numModifiedExclusions, copyIndex;
  // ...
    if(masterPe == CkMyPe()){
      numModifiedExclusions = 0;
      // ...
      for (int i=0;i < numExclPerRank.size();i++) {
        numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
        numExclusions         += numExclPerRank[i].numExclusions;
      }
      // ...
      exclusionStartPos = 0;
      exclusionStartPos2 = 0;
      tupleCopyWorkList.clear();
      for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
        // ...
          exclusionStartPos = pos;
          exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];
        // ...
        for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
          int ntuples = (*it)->getNumTuples();
          // ...
            TupleCopyWork tupleCopyWork;
            tupleCopyWork.tupletype     = tupletype;
            tupleCopyWork.ntuples       = ntuples;
            tupleCopyWork.tupleElemList = (*it)->getTupleList();
            tupleCopyWork.tupleDataPos  = pos;
            tupleCopyWorkList.push_back(tupleCopyWork);
            pos += ntuples*CudaTupleTypeSizeStage[tupletype];
          // ...
        }
        numTuplesPerType[tupletype] = num;
        // ...
          posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];
        // ...
      }
      if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
        NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
      }
      // ...
      hasExclusions = (numExclusions > 0);
      hasModifiedExclusions = (numModifiedExclusions > 0);
      // ...
      reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
      // ...
      local.modifiedExclusion = numModifiedExclusions;
      local.exclusion = numExclusions;
      // ...
      local.thole = numTuplesPerType[Tuples::THOLE];
      local.aniso = numTuplesPerType[Tuples::ANISO];
      local.oneFourNbThole = numTuplesPerType[Tuples::ONEFOURNBTHOLE];
      // ...
      updateMaxTupleCounts(local);
      bondedKernel.setTupleCounts(local);
    }
    // ...
    if(masterPe == CkMyPe()){
      // ...
      migrationKernel.reallocateBufferSrc(newMax);
      migrationKernel.reallocateBufferDst(newMax);
      bondedKernel.reallocateTupleBuffer(newMax, stream);
      registerPointersToHost();
    }
    // ...
    if(masterPe == CkMyPe()){
      copyHostRegisterToDevice();
      for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
        this->tupleCopyWorkerType(tupletype);
      }
      sortAndCopyToDevice();
    }
  // ...
  migrateTuples(startup);
  // ...
  if(masterPe == CkMyPe()){
    TupleCounts count = bondedKernel.getTupleCounts();
    // ...
    numTuplesPerType[Tuples::THOLE] = count.thole;
    numTuplesPerType[Tuples::ANISO] = count.aniso;
    // ...
    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);
    // ...
    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
  }
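// Illustrative sketch (not from the NAMD sources): reallocate_host /
// reallocate_device above grow a buffer only when the request exceeds the
// current capacity, over-allocating by a factor (1.2f, 1.4f) to amortize
// repeated growth. The standalone helper below mimics that policy with plain
// new/delete instead of pinned or device memory; names are hypothetical.
#if 0
#include <algorithm>
#include <cstddef>

template <typename T>
void reallocateWithFactor(T** buf, size_t* capacity, size_t requested, float factor) {
  if (requested <= *capacity) return;   // still fits, keep the buffer
  size_t newCapacity = std::max(requested, size_t(requested * factor));
  delete[] *buf;                        // old contents are not preserved here
  *buf = new T[newCapacity];
  *capacity = newCapacity;
}

int main() {
  double* forces = nullptr;
  size_t forcesCapacity = 0;
  reallocateWithFactor(&forces, &forcesCapacity, 1000, 1.4f);  // grows to 1400
  reallocateWithFactor(&forces, &forcesCapacity, 1200, 1.4f);  // no-op, fits
  delete[] forces;
  return 0;
}
#endif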
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchRecords() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  PatchRecord **d_pr   = &(patchData->devData[devInd].bond_pr);
  int **d_pid          = &(patchData->devData[devInd].bond_pi);
  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
  size_t st_d_pr_size  = (size_t)(patchData->devData[devInd].bond_pr_size);
  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size());
  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size);
  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size);
  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
  }
  // ...
}
#endif  // NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::launchWork() {
  // ...
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
  // ...
#ifdef NODEGROUP_FORCE_REGISTER
      // ...
      updatePatchRecords();
      // ...
#endif
  // ...
  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
  // ...
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();
    // ...
    float4 *d_atoms = bondedKernel.getAtomBuffer();
    copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);
    // ...
    fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);
    // ...
    for(int i = 0 ; i < atomStorageSize; i++){
      fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
        i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);
    }
  // ...
  bondedKernel.bondedForce(
    /* ... */
    doEnergy, doVirial, doSlow, doTable,
    /* ... */
    (const float4*)atoms, forces,
    /* ... */);
  // ...
#ifdef NODEGROUP_FORCE_REGISTER
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();
    // ...
      patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
    // ...
    patchData->devData[devInd].f_bond = bondedKernel.getForces();
    patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
    patchData->devData[devInd].f_bond_slow  = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
    patchData->devData[devInd].f_bond_size = bondedKernel.getForceSize(atomStorageSize);
  // ...
  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
  // ...
  copy_DtoH_sync(bondedKernel.getForces(), forces,  forceStorageSize);
  fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
  for(int i = 0; i < forceStorageSize; i++){
    // ...
    fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);
  }
  // ...
  forceDoneSetCallback();
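// Illustrative sketch (not from the NAMD sources): the nodegroup registration
// above exposes one contiguous force allocation and derives the nonbonded and
// slow sections by offsetting the base pointer with the per-section force
// size. The example below shows that partitioning; names are hypothetical.
#if 0
#include <vector>

struct ForceViews {
  double* normal;
  double* nbond;
  double* slow;
};

int main() {
  const int forceSize = 3 * 1024;                    // doubles per section
  std::vector<double> allForces(3 * forceSize, 0.0); // single allocation

  ForceViews views;
  views.normal = allForces.data();                   // section 0
  views.nbond  = allForces.data() + forceSize;       // section 1
  views.slow   = allForces.data() + 2 * forceSize;   // section 2

  views.nbond[0] += 1.0;                             // lands in the middle section
  return (views.slow == views.normal + 2 * forceSize) ? 0 : 1;
}
#endif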
void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
  // ...
  if (CkMyPe() != c->masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
  // ...
  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {
    // ...
  } else if (err != cudaErrorNotReady) {
    // ...
    sprintf(errmsg, "in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
            c->checkCount, walltime - c->beforeForceCompute);
    // ...
  }
  // ...
  if (c->checkCount >= 1000000) {
    // ...
    sprintf(errmsg, "ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
            c->checkCount, walltime - c->beforeForceCompute);
    // ...
  }
  // ...
  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
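// Illustrative sketch (not from the NAMD sources): forceDoneCheck() above
// polls the recorded CUDA event with cudaEventQuery() and re-schedules itself
// instead of blocking the PE. The standalone example below uses the same CUDA
// calls but replaces the Charm++ CcdCallFnAfter re-arm with a sleep loop.
#if 0
#include <chrono>
#include <cstdio>
#include <thread>
#include <cuda_runtime.h>

int main() {
  cudaStream_t stream;
  cudaEvent_t done;
  cudaStreamCreate(&stream);
  cudaEventCreate(&done);

  // ... enqueue asynchronous kernels or copies on `stream` here ...
  cudaEventRecord(done, stream);

  int polls = 0;
  for (;;) {
    cudaError_t err = cudaEventQuery(done);
    if (err == cudaSuccess) break;                  // GPU work finished
    if (err != cudaErrorNotReady) {                 // a real error occurred
      std::fprintf(stderr, "CUDA error after %d polls: %s\n",
                   polls, cudaGetErrorString(err));
      return 1;
    }
    polls++;
    std::this_thread::sleep_for(std::chrono::microseconds(100));
  }
  std::printf("completed after %d polls\n", polls);

  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  return 0;
}
#endif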
void ComputeBondedCUDA::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
  // ...
  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
  // ...
  beforeForceCompute = CkWallTimer();
template <bool sumNbond, bool sumSlow>
void finishForceLoop(const int numAtoms, const int forceStride,
  const double* __restrict__ af,
  const double* __restrict__ af_nbond,
  const double* __restrict__ af_slow,
  Force* __restrict__ f,
  Force* __restrict__ f_nbond,
  Force* __restrict__ f_slow) {
  // ...
  for (int j=0;j < numAtoms;j++) {
    // ...
      f[j].y += af[j + forceStride];
      f[j].z += af[j + forceStride*2];
    // ...
      f_nbond[j].x += af_nbond[j];
      f_nbond[j].y += af_nbond[j + forceStride];
      f_nbond[j].z += af_nbond[j + forceStride*2];
    // ...
      f_slow[j].x += af_slow[j];
      f_slow[j].y += af_slow[j + forceStride];
      f_slow[j].z += af_slow[j + forceStride*2];
    // ...
  }
  // ...
  for(int j=0; j < numAtoms; j++){
    f[j].x += af[j];
    f[j].y += af[j+ forceStride];
    f[j].z += af[j+ 2*forceStride];
  }
  // ...
    for(int j=0; j < numAtoms; j++){
      f_nbond[j].x += af_nbond[j];
      f_nbond[j].y += af_nbond[j + forceStride];
      f_nbond[j].z += af_nbond[j + 2*forceStride];
    }
  // ...
    for(int j=0; j < numAtoms; j++){
      f_slow[j].x += af_slow[j];
      f_slow[j].y += af_slow[j + forceStride];
      f_slow[j].z += af_slow[j + 2*forceStride];
    }
  // ...
  for(int j=0; j < numAtoms; j++) f[j].x += af[j];
  for(int j=0; j < numAtoms; j++) f[j].y += af[j+ forceStride];
  for(int j=0; j < numAtoms; j++) f[j].z += af[j+ 2*forceStride];
  // ...
#ifdef DEBUG_MINIMIZE
    printf("%s, line %d\n", __FILE__, __LINE__);
    printf("  before:  f_nbond[%d] = %f %f %f\n",
        k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
#endif
    // ...
    for(int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
    for(int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
    for(int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
#ifdef DEBUG_MINIMIZE
    printf("  after:   f_nbond[%d] = %f %f %f\n",
        k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
#endif
  // ...
    for(int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
    for(int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
    for(int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];
  // ...
  for(int j = 0; j < numAtoms; j++){
    fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
      af[j], af[j + forceStride], af[j + 2*forceStride],
      af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
      af_slow[j],  af_slow[j + forceStride],  af_slow[j + 2*forceStride]);
  }
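// Illustrative sketch (not from the NAMD sources): the device force buffer is
// structure-of-arrays with a stride (x at [j], y at [j+stride], z at
// [j+2*stride]); finishForceLoop above accumulates it into array-of-structures
// records. The example below shows that access pattern; Vec3 is hypothetical.
#if 0
#include <vector>

struct Vec3 { double x, y, z; };

void accumulateSoA(int numAtoms, int stride, const double* af, Vec3* f) {
  for (int j = 0; j < numAtoms; j++) {
    f[j].x += af[j];
    f[j].y += af[j + stride];
    f[j].z += af[j + 2 * stride];
  }
}

int main() {
  const int numAtoms = 4, stride = 4;          // stride may be padded >= numAtoms
  std::vector<double> af(3 * stride, 1.0);     // device-style SoA buffer
  std::vector<Vec3> f(numAtoms, {0, 0, 0});
  accumulateSoA(numAtoms, stride, af.data(), f.data());
  return (f[0].x == 1.0 && f[0].z == 1.0) ? 0 : 1;
}
#endif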
void ComputeBondedCUDA::finishPatchesOnPe() {
  // ...
  int myRank = CkMyRank();
  // ...
  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
  const bool sumNbond = hasModifiedExclusions;
  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
  // ...
  for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
    PatchID patchID = patchIDsPerRank[myRank][i];
    // ...
      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
    // ...
    int pi = patchIndex[patchID];
    int numAtoms = patches[pi].numAtoms;
    int atomStart = patches[pi].atomStart;
    // ...
#ifdef NODEGROUP_FORCE_REGISTER
    double *af       = forces + atomStart;
    double *af_nbond = forces + forceSize + atomStart;
    double *af_slow  = forces + 2*forceSize + atomStart;
    // ...
      if (!sumNbond && !sumSlow) {
        finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (sumNbond && !sumSlow) {
        finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (!sumNbond && sumSlow) {
        finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (sumNbond && sumSlow) {
        finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else {
        NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
      }
    // ...
    double *af       = forces + atomStart;
    double *af_nbond = forces + forceSize + atomStart;
    double *af_slow  = forces + 2*forceSize + atomStart;
    // ...
    if (!sumNbond && !sumSlow) {
      finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && !sumSlow) {
      finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (!sumNbond && sumSlow) {
      finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && sumSlow) {
      finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else {
      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
    }
    // ...
  }
  // ...
  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
  // ...
  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
  // ...
  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
  // ...
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();
    // ...
      if(!atomsChanged) this->finishReductions();
    // ...
  }
  // ...
  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
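// Illustrative sketch (not from the NAMD sources): the if/else ladder above
// turns the runtime flags sumNbond and sumSlow into compile-time template
// parameters of finishForceLoop, so each instantiation drops the branches that
// are off for that combination. A minimal version of the idiom:
#if 0
#include <cstdio>

template <bool doA, bool doB>
int work(int x) {
  int r = x;
  if (doA) r += 1;     // constant-folded away when doA == false
  if (doB) r += 10;
  return r;
}

int dispatch(bool a, bool b, int x) {
  if      (!a && !b) return work<false, false>(x);
  else if ( a && !b) return work<true,  false>(x);
  else if (!a &&  b) return work<false, true >(x);
  else               return work<true,  true >(x);
}

int main() {
  std::printf("%d\n", dispatch(true, false, 5));   // prints 6
  return 0;
}
#endif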
void ComputeBondedCUDA::finishPatches() {
  // ...
}
void ComputeBondedCUDA::finishReductions() {
  // ...
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
  // ...
    cudaCheck(cudaStreamSynchronize(stream));
  // ...
  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
    if (numTuplesPerType[tupletype] > 0) {
      // ...
        switch (tupletype) {
          // ... (each tuple-type case folds its energy terms into the
          //      reduction; the FEP and TI accumulators are submitted per case)
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
        case Tuples::THOLE: {
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
        }
        case Tuples::ANISO: {
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
        }
        case Tuples::ONEFOURNBTHOLE: {
          // ...
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          // ...
        }
        default:
          NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
        }
      // ...
      auto it = tupleList[tupletype].begin();
      // ...
        (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
      // ...
    }
  }
  // ...
#ifndef WRITE_FULL_VIRIALS
#error "non-WRITE_FULL_VIRIALS not implemented"
#endif
  // ...
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND,    energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW,     energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
    // ...
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
  // ...
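// Illustrative sketch (not from the NAMD sources): the exact expansion of the
// ADD_TENSOR macro is not reproduced here. The general operation it performs
// is folding a 3x3 virial tensor, stored row-major in the flat
// energies/virials array at a given base index, into a reduction accumulator;
// everything below (Tensor3x3, addTensorFromFlat, the index value) is a
// hypothetical stand-in for that idea.
#if 0
#include <cstdio>

struct Tensor3x3 { double v[3][3] = {}; };

void addTensorFromFlat(Tensor3x3& acc, const double* flat, int baseIndex) {
  for (int r = 0; r < 3; r++)
    for (int c = 0; c < 3; c++)
      acc.v[r][c] += flat[baseIndex + 3 * r + c];
}

int main() {
  double energies_virials[32] = {};
  const int normalVirialIndex = 9;               // hypothetical slot
  energies_virials[normalVirialIndex + 0] = 1.5; // xx component
  Tensor3x3 virial;
  addTensorFromFlat(virial, energies_virials, normalVirialIndex);
  std::printf("virial xx = %f\n", virial.v[0][0]);
  return 0;
}
#endif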
void ComputeBondedCUDA::initialize() {
#ifdef NODEGROUP_FORCE_REGISTER
  tupleWorkIndex.store(0);
#endif
  // ...
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
  // ...
  for (int rank=0;rank < computes.size();rank++) {
    if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
      pes.push_back(CkNodeFirst(CkMyNode()) + rank);
    }
  }
  // ...
  if (pes.size() == 0) return;

  initializeCalled = false;
  // ...
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
  // ...
#endif
  cudaCheck(cudaEventCreate(&forceDoneEvent));
  lock = CmiCreateLock();
  printLock = CmiCreateLock();
  if(Node::Object()->simParameters->CUDASOAintegrateMode) {
    // ...
  }
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  for (int rank=0;rank < computes.size();rank++) {
    std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
    for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
      for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
        // ...
          patchIDsPerRank[rank].push_back(*jt);
          allPatchIDs.push_back(*jt);
        // ...
      }
    }
  }
  // ...
  for (int rank=0;rank < computes.size();rank++) {
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    for (int i=0;i < patchIDs.size();i++) {
      int patchID = patchIDs[i];
      // ...
        patchIDsPerRank[rank].push_back(patchID);
        allPatchIDs.push_back(patchID);
      // ...
    }
  }
  // ...
  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
  // ...
  std::vector<int> neighborPids;
  for (int rank=0;rank < computes.size();rank++) {
    // ...
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    for (int i=0;i < patchIDs.size();i++) {
      int patchID = patchIDs[i];
      // ...
      for (int j=0;j < numNeighbors;j++) {
        // ...
          neighborPids.push_back(neighbors[j]);
        // ...
      }
    }
  }
  // ...
  std::sort(neighborPids.begin(), neighborPids.end());
  auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
  neighborPids.resize(std::distance(neighborPids.begin(), it_end));
  // ...
  for (int i=0;i < neighborPids.size();i++) {
    for (int rank=0;rank < computes.size();rank++) {
      int pid = neighborPids[i];
      int pe = rank + CkNodeFirst(CkMyNode());
      if (patchMap->node(pid) == pe) {
        // ...
        patchIDsPerRank[rank].push_back(pid);
        allPatchIDs.push_back(pid);
        // ...
        patchIDsToAppend[rank].push_back(pid);
        // ...
        pes.push_back(CkNodeFirst(CkMyNode()) + rank);
      }
    }
  }
  // ...
  std::sort(pes.begin(), pes.end());
  auto it_end = std::unique(pes.begin(), pes.end());
  pes.resize(std::distance(pes.begin(), it_end));
  // ...
  for (int rank=0;rank < computes.size();rank++) {
    // ...
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    std::vector<int> neighborPatchIDs;
    for (int i=0;i < patchIDs.size();i++) {
      int patchID = patchIDs[i];
      // ...
      for (int j=0;j < numNeighbors;j++) {
        // ...
          patchIDsPerRank[rank].push_back(neighbors[j]);
          allPatchIDs.push_back(neighbors[j]);
        // ...
        if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
          && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
          neighborPatchIDs.push_back(neighbors[j]);
        }
        // ...
      }
    }
    // ...
    for (int i=0;i < neighborPatchIDs.size();i++) {
      patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
    }
  }
  // ...
  for (int rank=0;rank < patchIDsToAppend.size();rank++) {
    for (int i=0;i < patchIDsToAppend[rank].size();i++) {
      computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
    }
  }
  // ...
  std::sort(allPatchIDs.begin(), allPatchIDs.end());
  auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
  allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
  // ...
  setNumPatches(allPatchIDs.size());
  // ...
  patchesCounter = getNumPatches();
  // ...
  patches.resize(getNumPatches());
  for (int rank=0;rank < computes.size();rank++) {
    std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
    for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
      tupleList[it->tuples->getType()].push_back(it->tuples);
    }
    // ...
    HomeCompute& homeCompute = computes[rank].homeCompute;
    for (int i=0;i < homeCompute.tuples.size();i++) {
      tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
    }
  }
  // ...
  std::vector<char> patchIDset(patchMap->numPatches(), 0);
  int numPatchIDset = 0;
  int numPatchIDs = 0;
  for (int rank=0;rank < computes.size();rank++) {
    numPatchIDs += patchIDsPerRank[rank].size();
    for (int i=0;i < patchIDsPerRank[rank].size();i++) {
      PatchID patchID = patchIDsPerRank[rank][i];
      if (patchIDset[patchID] == 0) numPatchIDset++;
      patchIDset[patchID] = 1;
      if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
        NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
      }
    }
  }
  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
    NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
  }
  // ...
  atomMappers.resize(getNumPatches());
  for (int i=0;i < getNumPatches();i++) {
    atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
    patchIndex[allPatchIDs[i]] = i;
  }
  // ...
  updateHostCudaAlchFlags();
  updateKernelCudaAlchFlags();
  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchParameters();
    bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
    updateHostCudaAlchLambdas();
    bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
    if (hostAlchFlags.alchDecouple) {
      pswitchTable[1+3*1] = 0;
      pswitchTable[2+3*2] = 0;
    }
  }
  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
    if (tupleList[tupletype].size() > 0) {
      // ...
          std::vector<CudaBondValue> bondValues(NumBondParams);
          for (int i=0;i < NumBondParams;i++) {
            bondValues[i].k  = bond_array[i].k;
            bondValues[i].x0 = bond_array[i].x0;
            bondValues[i].x1 = bond_array[i].x1;
          }
          bondedKernel.setupBondValues(NumBondParams, bondValues.data());
        // ...
          std::vector<CudaAngleValue> angleValues(NumAngleParams);
          bool normal_ub_error = false;
          for (int i=0;i < NumAngleParams;i++) {
            angleValues[i].k      = angle_array[i].k;
            if (angle_array[i].normal == 1) {
              angleValues[i].theta0 = angle_array[i].theta0;
            } else {
              angleValues[i].theta0 = cos(angle_array[i].theta0);
            }
            normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
            angleValues[i].k_ub   = angle_array[i].k_ub;
            angleValues[i].r_ub   = angle_array[i].r_ub;
            angleValues[i].normal = angle_array[i].normal;
          }
          if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
          bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
        // ...
          int NumDihedralParamsMult = 0;
          for (int i=0;i < NumDihedralParams;i++) {
            NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
          }
          std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
          dihedralMultMap.resize(NumDihedralParams);
          // ...
          for (int i=0;i < NumDihedralParams;i++) {
            // ...
            dihedralMultMap[i] = k;
            for (int j=0;j < multiplicity;j++) {
              dihedralValues[k].k     = dihedral_array[i].values[j].k;
              dihedralValues[k].n     = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
              dihedralValues[k].delta = dihedral_array[i].values[j].delta;
              k++;
            }
          }
          bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
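// Illustrative sketch (not from the NAMD sources): each dihedral parameter
// entry owns `multiplicity` periodic terms; the loop above flattens them into
// one contiguous array and keeps dihedralMultMap so a parameter index can be
// translated to the first flattened slot, packing a "more terms follow" flag
// into the low bit of n. The standalone example below mirrors that
// bookkeeping; all types and names are hypothetical.
#if 0
#include <vector>

struct SrcTerm  { float k; int n; float delta; };
struct SrcParam { int multiplicity; SrcTerm terms[4]; };
struct FlatTerm { float k; int n; float delta; };

int main() {
  std::vector<SrcParam> params = {
    { 2, { {1.0f, 1, 0.0f}, {0.5f, 3, 3.14f} } },
    { 1, { {2.0f, 2, 0.0f} } },
  };

  std::vector<FlatTerm> flat;
  std::vector<int> multMap(params.size());
  for (size_t i = 0, k = 0; i < params.size(); i++) {
    multMap[i] = int(k);                         // first slot of entry i
    for (int j = 0; j < params[i].multiplicity; j++, k++) {
      const SrcTerm& t = params[i].terms[j];
      // shift n left and use bit 0 as "another term follows this one"
      flat.push_back({ t.k, (t.n << 1) | int(j < params[i].multiplicity - 1), t.delta });
    }
  }
  // flat.size() == 3, multMap == {0, 2}
  return (flat.size() == 3 && multMap[1] == 2) ? 0 : 1;
}
#endif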
          int NumImproperParamsMult = 0;
          for (int i=0;i < NumImproperParams;i++) {
            NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
          }
          std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
          improperMultMap.resize(NumImproperParams);
          // ...
          for (int i=0;i < NumImproperParams;i++) {
            // ...
            improperMultMap[i] = k;
            for (int j=0;j < multiplicity;j++) {
              improperValues[k].k     = improper_array[i].values[j].k;
              improperValues[k].n     = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
              improperValues[k].delta = improper_array[i].values[j].delta;
              k++;
            }
          }
          bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
        // ...
          std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
          // ...
          for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
            for (int i=0;i < N;i++) {
              for (int j=0;j < N;j++) {
                // ...
                #define INDEX(ncols,i,j)  ((i)*ncols + (j))
                // ...
                // Bicubic interpolation coefficient matrix
                const double Ainv[16][16] = {
                  { 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  {-3,  3,  0,  0, -2, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 2, -2,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0, -3,  3,  0,  0, -2, -1,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  2, -2,  0,  0,  1,  1,  0,  0},
                  {-3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0, -3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0},
                  { 9, -9, -9,  9,  6,  3, -6, -3,  6, -6,  3, -3,  4,  2,  2,  1},
                  {-6,  6,  6, -6, -3, -3,  3,  3, -4,  4, -2,  2, -2, -2, -1, -1},
                  { 2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0},
                  {-6,  6,  6, -6, -4, -2,  4,  2, -3,  3, -3,  3, -2, -1, -2, -1},
                  { 4, -4, -4,  4,  2,  2, -2, -2,  2, -2,  2, -2,  1,  1,  1,  1}
                };
                // ...
#define M_PI 3.14159265358979323846
                // ...
                const double h = M_PI/12.0;
                // ...
                const double x[16] = {
                  table[INDEX(D,i,j)].d00,     table[INDEX(D,i+1,j)].d00,     table[INDEX(D,i,j+1)].d00,     table[INDEX(D,i+1,j+1)].d00,
                  table[INDEX(D,i,j)].d10*h,   table[INDEX(D,i+1,j)].d10*h,   table[INDEX(D,i,j+1)].d10*h,   table[INDEX(D,i+1,j+1)].d10*h,
                  table[INDEX(D,i,j)].d01*h,   table[INDEX(D,i+1,j)].d01*h,   table[INDEX(D,i,j+1)].d01*h,   table[INDEX(D,i+1,j+1)].d01*h,
                  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
                };
                // ...
                float* a = (float *)&crosstermValues[ipar].c[i][j][0];
                for (int k=0;k < 16;k++) {
                  double a_val = 0.0;
                  for (int l=0;l < 16;l++) {
                    a_val += Ainv[k][l]*x[l];
                  }
                  a[k] = (float)a_val;
                }
              }
            }
          }
          bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
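// Illustrative sketch (not from the NAMD sources): for each grid cell the loop
// above gathers the corner values and scaled derivatives into a 16-vector x
// and multiplies by the constant 16x16 matrix Ainv to obtain the bicubic
// polynomial coefficients, a = Ainv * x. The standalone example below shows
// just that matrix-vector step with a placeholder Ainv and x.
#if 0
#include <array>

std::array<double, 16> bicubicCoefficients(const double Ainv[16][16],
                                           const std::array<double, 16>& x) {
  std::array<double, 16> a{};
  for (int k = 0; k < 16; k++) {
    double a_val = 0.0;
    for (int l = 0; l < 16; l++) a_val += Ainv[k][l] * x[l];
    a[k] = a_val;
  }
  return a;
}

int main() {
  double Ainv[16][16] = {};                     // identity as a placeholder
  for (int k = 0; k < 16; k++) Ainv[k][k] = 1.0;
  std::array<double, 16> x{};
  x[0] = 2.5;                                   // corner value d00 at (i, j)
  std::array<double, 16> a = bicubicCoefficients(Ainv, x);
  return (a[0] == 2.5) ? 0 : 1;
}
#endif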
        // ...
        case Tuples::THOLE: {
          // ...
        }
        case Tuples::ANISO: {
          // ...
        }
        case Tuples::ONEFOURNBTHOLE: {
          // ...
        }
        default:
          NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
        // ...
      }
    }
  }
  // ...
#ifdef NODEGROUP_FORCE_REGISTER
  // ...
    migrationKernel.setup(nDevices, patchMap->numPatches());
    // ...
    allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
    allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());
    // ...
    allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
    allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
    for (int i = 0; i < patchMap->numPatches(); i++) {
      // ...
    }
    copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
  // ...
#endif  // NODEGROUP_FORCE_REGISTER
}

#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
  // ...
  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
  for (int i=0;i < data.size();i++) {
    patchIndex[data[i].patchID] = i;
  }
  // ...
  for (int i=0;i < data.size();i++) {
    // ...
    patches.push_back(p);
  }
  // ...
}
#endif  // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::updateHostCudaAlchFlags() {
  // ...
  hostAlchFlags.alchFepOn             = sim_params.alchFepOn;
  // ...
  hostAlchFlags.alchWCAOn             = sim_params.alchWCAOn;
  hostAlchFlags.alchLJcorrection      = sim_params.LJcorrection;
  // ...
}

void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
  bondedKernel.updateCudaAlchFlags(hostAlchFlags);
}

void ComputeBondedCUDA::updateHostCudaAlchParameters() {
  // ...
}

void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
}

void ComputeBondedCUDA::updateHostCudaAlchLambdas() {
  // ...
  hostAlchLambdas.bondLambda1         = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda));
  hostAlchLambdas.bondLambda2         = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda));
  hostAlchLambdas.bondLambda12        = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda2));
  hostAlchLambdas.bondLambda22        = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
  hostAlchLambdas.elecLambdaUp        = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda));
  hostAlchLambdas.elecLambda2Up       = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda2));
  hostAlchLambdas.elecLambdaDown      = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda));
  hostAlchLambdas.elecLambda2Down     = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
  hostAlchLambdas.vdwLambdaUp         = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda));
  hostAlchLambdas.vdwLambda2Up        = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda2));
  hostAlchLambdas.vdwLambdaDown       = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
  hostAlchLambdas.vdwLambda2Down      = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
  // ...
}

void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
}
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::registerPointersToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  patchData->h_tupleDataStage.bond[deviceIndex] = dst.bond;
  patchData->h_tupleDataStage.angle[deviceIndex] = dst.angle;
  patchData->h_tupleDataStage.dihedral[deviceIndex] = dst.dihedral;
  patchData->h_tupleDataStage.improper[deviceIndex] = dst.improper;
  patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
  patchData->h_tupleDataStage.exclusion[deviceIndex] = dst.exclusion;
  patchData->h_tupleDataStage.crossterm[deviceIndex] = dst.crossterm;
  // ...
  patchData->h_tupleCount.bond[deviceIndex] = count.bond();
  patchData->h_tupleCount.angle[deviceIndex] = count.angle();
  patchData->h_tupleCount.dihedral[deviceIndex] = count.dihedral();
  patchData->h_tupleCount.improper[deviceIndex] = count.improper();
  patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
  patchData->h_tupleCount.exclusion[deviceIndex] = count.exclusion();
  patchData->h_tupleCount.crossterm[deviceIndex] = count.crossterm();
  // ...
  patchData->h_tupleOffset.bond[deviceIndex] = offset.bond();
  patchData->h_tupleOffset.angle[deviceIndex] = offset.angle();
  patchData->h_tupleOffset.dihedral[deviceIndex] = offset.dihedral();
  patchData->h_tupleOffset.improper[deviceIndex] = offset.improper();
  patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
  patchData->h_tupleOffset.exclusion[deviceIndex] = offset.exclusion();
  patchData->h_tupleOffset.crossterm[deviceIndex] = offset.crossterm();
}

void ComputeBondedCUDA::copyHostRegisterToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  migrationKernel.copyPeerDataToDevice(
    patchData->h_tupleDataStage,
    patchData->h_tupleCount,
    patchData->h_tupleOffset,
    /* ... */);
  // ...
}

void ComputeBondedCUDA::copyPatchData() {
  // ...
  for (int i = 0; i < numPatches; i++) {
    h_patchRecord[i] = patches[patchIndex[i]];
  }
  // ...
  copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
}
#endif  // NODEGROUP_FORCE_REGISTER

// ...
  return (simParams->CUDASOAintegrate) ? reductionGpuResident :
                                         reductionGpuOffload;
// ...

#endif // BONDED_CUDA
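// Illustrative sketch (not from the NAMD sources): in the nodegroup path each
// device publishes its staging pointers, counts and offsets into host arrays
// indexed by device (registerPointersToHost above), and every device then
// uploads the whole table so peers can address each other's buffers
// (copyHostRegisterToDevice). The example below shows that publish-then-
// broadcast pattern; the types, sizes and single-device setup are hypothetical.
#if 0
#include <cuda_runtime.h>

struct PeerTable { float* forces[8]; int count[8]; };

int main() {
  const int nDevices = 1;                       // keep the sketch single-device
  PeerTable h_table = {};                       // host-side registry

  for (int dev = 0; dev < nDevices; dev++) {
    cudaSetDevice(dev);
    float* d_forces = nullptr;
    cudaMalloc(&d_forces, 1024 * sizeof(float));
    h_table.forces[dev] = d_forces;             // publish to the host table
    h_table.count[dev]  = 1024;
  }

  for (int dev = 0; dev < nDevices; dev++) {    // broadcast table to each device
    cudaSetDevice(dev);
    PeerTable* d_table = nullptr;
    cudaMalloc(&d_table, sizeof(PeerTable));
    cudaMemcpy(d_table, &h_table, sizeof(PeerTable), cudaMemcpyHostToDevice);
    cudaFree(d_table);
  }

  for (int dev = 0; dev < nDevices; dev++) cudaFree(h_table.forces[dev]);
  return 0;
}
#endif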