namd/doxygen/ComputeSMDCUDA_8C_source.html

 #include "ComputeSMDCUDA.h"
 #include "ComputeSMDCUDAKernel.h"
 #include "SimParameters.h"
 #include "PDB.h"
 #include "PDBData.h"
 #include "Node.h"
 #include "Molecule.h"
 #include "InfoStream.h"
 #include "ComputeCUDAMgr.h"
 #include "DeviceCUDA.h"

 #define MIN_DEBUG_LEVEL 3
 //#define DEBUGM
 #include "Debug.h"
 #ifdef DEBUGM
 // need these to use deviceCUDA in debugging output
 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
 #ifdef WIN32
 #define __thread __declspec(thread)
 #endif // WIN32
 extern __thread DeviceCUDA *deviceCUDA;
 #endif // defined(NAMD_CUDA) || defined(NAMD_HIP)
 #endif // DEBUGM

 #ifdef NODEGROUP_FORCE_REGISTER

 // Constructs the SMD compute object for one device: stores the restraint
 // parameters and device layout, reads the SMD atom selection through
 // parseAtoms(), and allocates/initializes the host and device buffers used
 // by the other methods.
 //
 //   patchList                  stored by address, not copied
 //   springConstant, transverseSpringConstant, velocity, direction
 //                              stored; forwarded to computeSMDForce() by doForce()
 //   outputFrequency            output interval used by doForce()/outputStep()
 //   firstTimeStep              stored only (not read in the code visible here)
 //   filename                   PDB file opened by parseAtoms(); only the pointer is kept
 //   isMasterDevice             stored only (not read in the code visible here)
 //   numAtoms                   must match the PDB atom count (checked in parseAtoms())
 //   numDevices, deviceIndex    stored; forwarded to the kernel wrappers
 //   _mGpuOn                    when true, a peer COM buffer is allocated as well
 ComputeSMDCUDA::ComputeSMDCUDA(
   std::vector<HomePatch*> &patchList,
   double springConstant,
   double transverseSpringConstant,
   double velocity,
   double3 direction,
   int outputFrequency,
   int firstTimeStep,
   const char* filename,
   bool isMasterDevice,
   int numAtoms,
   int numDevices,
   int deviceIndex,
   bool _mGpuOn ){
   DebugM(3, "ComputeSMDCUDA\n" << endi);

   // Plain member assignment is used instead of an initializer list. Most
   // parameters shadow the member of the same name, hence the explicit this->.

   // -- restraint definition
   this->springConstant = springConstant;
   this->transverseSpringConstant = transverseSpringConstant;
   this->velocity = velocity;
   this->direction = direction;

   // -- output control and atom selection file
   this->outputFrequency = outputFrequency;
   this->firstTimeStep = firstTimeStep;
   this->filename = filename;

   // -- system size and device layout
   this->patchList = &patchList;
   this->numAtoms = numAtoms;
   this->isMasterDevice = isMasterDevice;
   this->mGpuOn = _mGpuOn;
   this->numDevices = numDevices;
   this->deviceIndex = deviceIndex;

   // The returned manager is not used further in this constructor.
   ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();

   // Host copy of the current COM, zeroed so the upload below sends {0, 0, 0}.
   allocate_host<double3>(&curCOM, 1);
   curCOM->x = curCOM->y = curCOM->z = 0.0;

   // The global indices of the SMD atoms are kept so that the SOA index
   // vector can be rebuilt quickly in updateAtoms().
   smdAtomsGlobalIndex.clear();
   parseAtoms();
   smdAtomsSOAIndex.resize(this->numSMDAtoms);

   // Device buffers: a single counter, the current COM, and one SOA index per SMD atom.
   allocate_device<unsigned int>(&d_tbcatomic, 1);
   allocate_device<double3>(&d_curCOM, 1);
   allocate_device<int>(&d_smdAtomsSOAIndex, this->numSMDAtoms);

   if(this->mGpuOn)
     {
       // Each SMD object needs its own peer array.
       // NOTE(review): the count passed here is sizeof(double3), while every
       // other allocation in this constructor passes a number of elements.
       // Confirm whether a count of 1 was intended.
       allocate_device<double3>(&d_peerCOM, sizeof(double3));
 #ifdef DEBUGM
       allocate_host<double3>(&h_peerCOM, sizeof(double3));
 #endif
     }

   // Publish the zeroed COM to the device and reset the device counter.
   copy_HtoD<double3>(curCOM, d_curCOM, 1);
   cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int)));

 }

 // Releases every host and device buffer that the constructor allocates.
 //
 // The constructor always allocates curCOM, d_tbcatomic, d_curCOM and
 // d_smdAtomsSOAIndex. The peer COM buffers exist only when mGpuOn is set
 // (and the host-side one only in DEBUGM builds), so they are released under
 // the same conditions that guard their allocation.
 ComputeSMDCUDA::~ComputeSMDCUDA(){
   DebugM(3, "~ComputeSMDCUDA\n" << endi);
   deallocate_host<double3>(&curCOM);
   deallocate_device<unsigned int>(&d_tbcatomic);
   deallocate_device<double3>(&d_curCOM);
   deallocate_device<int>(&d_smdAtomsSOAIndex);
   if(mGpuOn)
     {
       deallocate_device<double3>(&d_peerCOM);
 #ifdef DEBUGM
       deallocate_host<double3>(&h_peerCOM);
 #endif
     }
 }

 // Builds the global SMD atom index vector (adapted from GlobalMasterSMD.C).
 //
 // Reads the PDB named by `filename`, treats every atom with a nonzero
 // occupancy as an SMD atom, and records:
 //   smdAtomsGlobalIndex  - PDB indices of the SMD atoms (appended; the caller clears it)
 //   origCOM              - mass-weighted center of the SMD atoms' PDB coordinates
 //   inv_group_mass       - reciprocal of the summed SMD atom mass
 //   numSMDAtoms          - number of SMD atoms found
 //
 // Terminates through NAMD_die() when the PDB has no atoms, when its atom
 // count differs from numAtoms, or when the selected atoms carry no mass.
 void ComputeSMDCUDA::parseAtoms(){
   DebugM(3, "parseAtoms\n" << endi);
   PDB smdpdb(filename);
   origCOM.x = origCOM.y = origCOM.z = 0;
   Molecule *mol = Node::Object()->molecule; // to get masses
   const int numPDBAtoms = smdpdb.num_atoms();
   if(numPDBAtoms < 1 ) NAMD_die("No Atoms found in SMDFile\n");

   if (numPDBAtoms != this->numAtoms){
     fprintf(stderr, "Error, wrong numPDB (%d vs %d)\n",numPDBAtoms, this->numAtoms);
     NAMD_die("The number of atoms in SMDFile must be equal to the total number of atoms in the structure!\n");
   }

   // Accumulate mass-weighted coordinates and the total mass of the selection.
   // Would this work on PDB atoms? Is the data replicated for everyone?
   BigReal imass = 0;
   for(int i = 0; i < numPDBAtoms; i++){
     // MEMOPT obviously doesn't work with CUDASOA, so we can just use this
     PDBAtom *atom = smdpdb.atom(i);
     if(atom->occupancy()){ // It's a SMD atom! Add it to the list
       smdAtomsGlobalIndex.push_back(i);

       // compute the center of mass
       BigReal mass = mol->atommass(i);
       origCOM.x += atom->xcoor()*mass;
       origCOM.y += atom->ycoor()*mass;
       origCOM.z += atom->zcoor()*mass;
       imass += mass;
     }
   }

   // Validate before normalizing so that an empty selection is reported
   // without first dividing by zero.
   if (imass == 0) // we didn't find any!
     NAMD_die("SMDFile contained no SMD atoms (atoms w/ nonzero occupancy)\n");

   inv_group_mass = 1.0 / imass;
   origCOM.x *= inv_group_mass;
   origCOM.y *= inv_group_mass;
   origCOM.z *= inv_group_mass;

   this->numSMDAtoms = smdAtomsGlobalIndex.size();
 }

 // Rebuilds the SOA index list of the SMD atoms and uploads it to the device.
 //
 //   atomMapsList       maps searched in order for the local ID of each SMD atom
 //   localRecords       per-patch records; bufferOffset gives the patch's start
 //                      within the SOA buffers
 //   h_globalToLocalID  converts a global patch ID to its position in localRecords
 //
 // An atom that no map knows about is a bug unless mGpuOn is set, in which
 // case the atom is simply skipped. The resulting list is sorted before the
 // upload. When at least one SMD atom was found, this device registers itself
 // as the SMD reducer device in ComputeCUDAMgr.
 void ComputeSMDCUDA::updateAtoms(
   std::vector<AtomMap*> &atomMapsList,
   std::vector<CudaLocalRecord> &localRecords,
   const int* h_globalToLocalID) {
   DebugM(3, "[" << CkMyPe() << "]" <<" updateAtoms full "<< smdAtomsGlobalIndex.size()<< " SOA "<< smdAtomsSOAIndex.size()<< "\n" << endi);
   smdAtomsSOAIndex.clear();
 #ifdef DEBUGM
   smdAtomsSOAtoGlobalLocalMap.clear();
 #endif
   for(int i = 0 ; i < this->numSMDAtoms; i++){
     int gid = smdAtomsGlobalIndex[i];
     // Start from "not found" so the check below is well defined even when
     // atomMapsList is empty and the search loop never assigns lid.
     LocalID lid;
     lid.pid = -1;
     lid.index = -1;
     // Search for a valid localID in all atom maps
     for(size_t j = 0 ; j < atomMapsList.size(); j++){
       lid = atomMapsList[j]->localID(gid);
       if( lid.pid != -1) break;
     }
     //JM NOTE: Fields of lid need to be != -1, bc the atom needs to be somewhere
     //          otherwise we have a bug
     if(lid.pid == -1){
       if(!mGpuOn)
         {
           NAMD_bug(" LocalAtomID not found in patchMap");
         }
     }
     else{

       int soaPid = h_globalToLocalID[lid.pid]; // Converts global patch ID to its local position in our SOA data structures
       int soaIndex = localRecords[soaPid].bufferOffset + lid.index;
       smdAtomsSOAIndex.push_back(soaIndex);
 #ifdef DEBUGM
       // so we can reverse map from the sorted SOA list to decomp independent
       smdAtomsSOAtoGlobalLocalMap[soaIndex]=gid;
 #endif
     }
   }
   int numLocalSMDAtoms=smdAtomsSOAIndex.size();
   DebugM(3, "[" << CkMyPe() << "]" << " updateAtoms " << numLocalSMDAtoms << "\n" << endi);
   if(numLocalSMDAtoms>0)
     { // only a device involved in the computation will have all the results
       ComputeCUDAMgr*       cudaMgr    = ComputeCUDAMgr::getComputeCUDAMgr();         // last one wins
       cudaMgr->reducerSMDDevice.store(deviceIndex);
     }
   // Sort vector for better coalesce memory access
   std::sort(smdAtomsSOAIndex.begin(), smdAtomsSOAIndex.end());
   copy_HtoD<int>(smdAtomsSOAIndex.data(), d_smdAtomsSOAIndex, numLocalSMDAtoms);
 }

 // Launches the multi-GPU center-of-mass computation for this device's SMD
 // atoms on `stream` by forwarding to computeCOMSMDMgpu().
 //
 //   lat                      lattice, forwarded unchanged
 //   d_mass, d_pos_x/y/z,
 //   d_transform              per-atom device arrays, forwarded unchanged
 //   stream                   stream passed to the kernel wrapper
 //
 // In DEBUGM builds the stream is synchronized afterwards and this device's
 // peer COM, scaled by inv_group_mass, is printed.
 void ComputeSMDCUDA::computeCOMMGpu(
   const Lattice lat,
   const float * d_mass,
   const double* d_pos_x,
   const double* d_pos_y,
   const double* d_pos_z,
   const char3*  d_transform,
   cudaStream_t stream)
 {
   // A device may hold no SMD atoms (updateAtoms() permits that when mGpuOn
   // is set), so the first index is only read when the list is non-empty;
   // -1 is printed otherwise.
   DebugM(3, "[" << CkMyPe() << "]" << " computeCOMMGpu " << this->smdAtomsSOAIndex.size() << " 1st "<< (this->smdAtomsSOAIndex.empty() ? -1 : this->smdAtomsSOAIndex[0])<< "\n" << endi);
   ComputeCUDAMgr*       cudaMgr    = ComputeCUDAMgr::getComputeCUDAMgr();
   computeCOMSMDMgpu(this->smdAtomsSOAIndex.size(),lat,
                     d_mass, d_pos_x, d_pos_y, d_pos_z,
                     d_transform, this->d_smdAtomsSOAIndex,
                     d_peerCOM, cudaMgr->curSMDCOM,
                     this->d_tbcatomic, numDevices, deviceIndex, stream);
 #ifdef DEBUGM
   cudaCheck(cudaStreamSynchronize(stream));
   copy_DtoH_sync<double3>(d_peerCOM, h_peerCOM, 1);
   DebugM(3, "deviceIndex "<< deviceIndex << " COM " <<h_peerCOM[0].x*inv_group_mass<<", "
          <<h_peerCOM[0].y*inv_group_mass<<", "
          <<h_peerCOM[0].z*inv_group_mass<<"\n"<<endi);

 #endif
 }

 // Applies the SMD restraint for one step by forwarding to computeSMDForce()
 // on `stream`, and prints an SMD line on output steps.
 //
 //   timeStep                 current step; an output step is a multiple of outputFrequency
 //   lat                      lattice, forwarded unchanged
 //   doEnergy                 caller's energy request; OR-ed with the output decision
 //   d_mass, d_pos_x/y/z,
 //   d_transform              per-atom device inputs, forwarded unchanged
 //   d_f_normal_x/y/z         device force arrays, forwarded unchanged
 //   d_extVirial              device virial buffer, forwarded unchanged
 //   h_extEnergy, h_extForce,
 //   h_extVirial              host result buffers, forwarded unchanged; h_extForce
 //                            is also read here when printing
 //   stream                   stream for the kernel; synchronized before printing
 //
 // With mGpuOn set, only the device recorded as reducerSMDDevice in
 // ComputeCUDAMgr produces output.
 void ComputeSMDCUDA::doForce(
       const int timeStep,
       const Lattice &lat,
       bool        doEnergy,
       const float*      d_mass,
       const double*     d_pos_x,
       const double*     d_pos_y,
       const double*     d_pos_z,
       const char3*      d_transform,
       double*           d_f_normal_x,
       double*           d_f_normal_y,
       double*           d_f_normal_z,
       cudaTensor*       d_extVirial,
       double*           h_extEnergy,
       double3*          h_extForce,
       cudaTensor*       h_extVirial,
       cudaStream_t      stream
     )
 {
   DebugM(3, "[" << CkMyPe() << "]" << " doForce " << this->smdAtomsSOAIndex.size() <<"\n" << endi);
   ComputeCUDAMgr*       cudaMgr    = ComputeCUDAMgr::getComputeCUDAMgr();

   // An output frequency of zero means "never output"; testing it first keeps
   // the modulo from dividing by zero.
   bool doOutput = (this->outputFrequency != 0) &&
                   ((timeStep % this->outputFrequency) == 0);
   if(mGpuOn)
     {  // only the reducerDevice does output and energy
            doOutput&=cudaMgr->reducerSMDDevice == deviceIndex;
     }

 #ifdef DEBUGM
   cudaCheck(cudaStreamSynchronize(stream));
   copy_DtoH_sync<double3>(d_peerCOM, h_peerCOM, 1);
   DebugM(3, " deviceIndex "<< deviceIndex << " com " <<h_peerCOM[0].x*inv_group_mass<<", "
          <<h_peerCOM[0].y*inv_group_mass<<", "
          <<h_peerCOM[0].z*inv_group_mass<<"\n"<<endi);

 #endif

   computeSMDForce(
                       lat,
                       this->inv_group_mass,
                       this->springConstant,
                       this->transverseSpringConstant,
                       this->velocity,
                       this->direction,
                       doEnergy || doOutput, // output steps request the same results as energy steps
                       timeStep,
                       mGpuOn,
                       this->origCOM,
                       d_mass,
                       d_pos_x,
                       d_pos_y,
                       d_pos_z,
                       d_transform,
                       d_f_normal_x,
                       d_f_normal_y,
                       d_f_normal_z,
                       this->smdAtomsSOAIndex.size(),
                       this->d_smdAtomsSOAIndex,
                       this->d_curCOM,
                       this->curCOM,
                       cudaMgr->curSMDCOM,
                       d_extVirial,
                       h_extEnergy,
                       h_extForce,
                       h_extVirial,
                       this->d_tbcatomic,
                       numDevices,
                       deviceIndex,
                       stream
                       );
   if(doOutput){
     // Wait for the stream before reading curCOM / h_extForce on the host.
     cudaCheck(cudaStreamSynchronize(stream));
 #ifdef DEBUGM
     Vector f(h_extForce->x, h_extForce->y, h_extForce->z);
 #endif
     DebugM(3, "[" << CkMyPe() << "]" << " co force " << f*PNPERKCALMOL <<"\n" << endi);
     if(cudaMgr->reducerSMDDevice == deviceIndex)
       {
         // only one device does the output
         outputStep(timeStep, curCOM, h_extForce);
       }
   }
 }

 void ComputeSMDCUDA::outputStep(const int timeStep, double3* curCOM, double3* extForce)
 {
   Vector p(curCOM->x, curCOM->y, curCOM->z);
   Vector f(extForce->x, extForce->y, extForce->z);
   if(timeStep % (100*this->outputFrequency) == 0) {
     iout << "SMDTITLE: TS   CURRENT_POSITION         FORCE\n" << endi;
   }
   iout << "SMD  " << timeStep << ' ' << p << ' ' << f*PNPERKCALMOL << '\n' << endi;

 }
 // Debug hook for reporting the center of mass; `comment` is a caller-supplied
 // label. Without DEBUGM the body is empty. With DEBUGM it only fetches the
 // CUDA manager and builds a Vector from curCOM, because the statements that
 // would print them are commented out.
 void ComputeSMDCUDA::outputCOM(std::string comment)
 {
 #ifdef DEBUGM
   ComputeCUDAMgr* mgr = ComputeCUDAMgr::getComputeCUDAMgr();
   Vector current(curCOM->x, curCOM->y, curCOM->z);
   //  Vector global(mgr->curSMDCOM[0].x*inv_group_mass, mgr->curSMDCOM[0].y*inv_group_mass, mgr->curSMDCOM[0].z*inv_group_mass);
   //  DebugM(3, comment << " origCOM : "<< this->origCOM <<" cur : "<<current<< " G : "<< global << "\n");
 #endif
 }


 // Forwards this object's peer COM buffer, together with the device count and
 // this device's index, to initPeerCOMmgpu() on `stream`.
 //
 //   d_peerPoolCOM   pool of peer COM pointers, passed through unchanged
 //   stream          stream handed to initPeerCOMmgpu()
 void ComputeSMDCUDA::initPeerCOM(double3** d_peerPoolCOM, cudaStream_t stream){
   DebugM(3, "[" << CkMyPe() << "]" << " initPeerCOM\n" << endi);
   initPeerCOMmgpu(this->numDevices,
                   this->deviceIndex,
                   d_peerPoolCOM,
                   this->d_peerCOM,
                   stream);
 }


 // Debug-only dump of the SMD atoms' positions and masses. The body is empty
 // unless DEBUGM is defined.
 //
 // With DEBUGM, the first `numAtoms` entries of the device arrays are copied
 // to temporary host buffers, and one line per local SMD atom is appended to
 // "smd-<pe>.dat" in the form "<step>- <gid>:<x>,<y>,<z>,<mass>".
 //
 //   comment           label used only in the debug trace
 //   step              step number written at the start of each line
 //   numAtoms          number of elements copied from each device array
 //                     (this parameter shadows the member of the same name)
 //   d_pos_x/y/z       device position arrays
 //   d_mass            device mass array
 void ComputeSMDCUDA::dump(std::string comment,
                           const int step,
                           const int numAtoms,
                           const double* d_pos_x,
                           const double* d_pos_y,
                           const double* d_pos_z,
                           const float* d_mass
                           )
 {

 #ifdef DEBUGM
   DebugM(3, "[" << CkMyPe() << "]" << " dump " << comment<<" "<< smdAtomsSOAIndex.size() <<"\n" << endi);
   ComputeCUDAMgr*       cudaMgr    = ComputeCUDAMgr::getComputeCUDAMgr();
   // copy data from GPU to local buffers for output

   // output GID X Y Z MASS as these can be collected for decomp
   // independent comparisons
   float* mass = nullptr;
   double *pos_x = nullptr;
   double *pos_y = nullptr;
   double *pos_z = nullptr;
   allocate_host<double>(&pos_x,  numAtoms);
   allocate_host<double>(&pos_y,  numAtoms);
   allocate_host<double>(&pos_z,  numAtoms);
   allocate_host<float>(&mass,  numAtoms);
   copy_DtoH_sync<double>(d_pos_x, pos_x, numAtoms);
   copy_DtoH_sync<double>(d_pos_y, pos_y, numAtoms);
   copy_DtoH_sync<double>(d_pos_z, pos_z, numAtoms);
   copy_DtoH_sync<float>(d_mass, mass, numAtoms);
   std::ofstream ofs;
   std::string name("smd-");
   name+=std::to_string(CkMyPe());
   name+=".dat";
   ofs.open(name, std::ofstream::out  | std::ofstream::app);
   for(size_t i=0; i<smdAtomsSOAIndex.size(); i++)
     {
       int soaid=smdAtomsSOAIndex[i];
       ofs <<step<<"- "<< smdAtomsSOAtoGlobalLocalMap[soaid]<<":"<<pos_x[soaid] <<","<<pos_y[soaid]<<","<<pos_z[soaid]<<","<<mass[soaid]<< "\n";
     }
   ofs.close();
   // Release the temporary host buffers; they are allocated anew on every call.
   deallocate_host<double>(&pos_x);
   deallocate_host<double>(&pos_y);
   deallocate_host<double>(&pos_z);
   deallocate_host<float>(&mass);
 #endif
 }

 #endif // NODEGROUP_FORCE_REGISTER
Node::Object
static Node * Object()
Definition: Node.h:86

PDB
Definition: PDB.h:36

PDBData.h

ComputeCUDAMgr.h

PDBAtom
Definition: PDBData.h:134

Debug.h

Vector
Definition: Vector.h:72

Node.h

DebugM
#define DebugM(x, y)
Definition: Debug.h:75

InfoStream.h

endi
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54

PDB.h

PDBAtom::zcoor
BigReal zcoor(void)
Definition: PDBData.C:433

iout
#define iout
Definition: InfoStream.h:51

LocalID
Definition: NamdTypes.h:297

Molecule
Molecule stores the structural information for the system.
Definition: Molecule.h:174

ComputeCUDAMgr::reducerSMDDevice
std::atomic< int > reducerSMDDevice
Definition: ComputeCUDAMgr.h:39

LocalID::index
int32 index
Definition: NamdTypes.h:300

deviceCUDA
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23

Molecule.h

NAMD_bug
void NAMD_bug(const char *err_msg)
Definition: common.C:196

ComputeCUDAMgr::getComputeCUDAMgr
static ComputeCUDAMgr * getComputeCUDAMgr()
Definition: ComputeCUDAMgr.C:191

PDBAtom::ycoor
BigReal ycoor(void)
Definition: PDBData.C:429

NAMD_die
void NAMD_die(const char *err_msg)
Definition: common.C:148

LocalID::pid
PatchID pid
Definition: NamdTypes.h:299

Molecule::atommass
Real atommass(int anum) const
Definition: Molecule.h:1114

PDBAtom::xcoor
BigReal xcoor(void)
Definition: PDBData.C:425

PNPERKCALMOL
#define PNPERKCALMOL
Definition: common.h:59

DeviceCUDA.h

ComputeSMDCUDAKernel.h

ComputeSMDCUDA.h

cudaTensor
Definition: CudaUtils.h:84

cudaCheck
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

PDBAtom::occupancy
BigReal occupancy(void)
Definition: PDBData.C:444

Lattice
Definition: Lattice.h:17

Node::molecule
Molecule * molecule
Definition: Node.h:179

DeviceCUDA
Definition: DeviceCUDA.h:54

BigReal
double BigReal
Definition: common.h:123

SimParameters.h

ComputeCUDAMgr
Definition: ComputeCUDAMgr.h:16