NAMD
ComputeSMDCUDA.C
Go to the documentation of this file.
1 #include "ComputeSMDCUDA.h"
2 #include "ComputeSMDCUDAKernel.h"
3 #include "SimParameters.h"
4 #include "PDB.h"
5 #include "PDBData.h"
6 #include "Node.h"
7 #include "Molecule.h"
8 #include "InfoStream.h"
9 #include "ComputeCUDAMgr.h"
10 #include "DeviceCUDA.h"
11 
12 #define MIN_DEBUG_LEVEL 3
13 //#define DEBUGM
14 #include "Debug.h"
15 #ifdef DEBUGM
16 // need these to use deviceCUDA in debugging output
17 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
18 #ifdef WIN32
19 #define __thread __declspec(thread)
20 #endif // WIN32
21 extern __thread DeviceCUDA *deviceCUDA;
22 #endif // defined(NAMD_CUDA) || defined(NAMD_HIP)
23 #endif // DEBUGM
24 
25 #ifdef NODEGROUP_FORCE_REGISTER
26 
27 ComputeSMDCUDA::ComputeSMDCUDA(
28  std::vector<HomePatch*> &patchList,
29  double springConstant,
30  double transverseSpringConstant,
31  double velocity,
32  double3 direction,
33  int outputFrequency,
34  int firstTimeStep,
35  const char* filename,
36  bool isMasterDevice,
37  int numAtoms,
38  int numDevices,
39  int deviceIndex,
40  bool _mGpuOn ){
41  DebugM(3, "ComputeSMDCUDA\n" << endi);
42  // I could use an initializer list, but I don't like them
43  this->patchList = &patchList;
44  this->springConstant = springConstant;
45  this->transverseSpringConstant = transverseSpringConstant;
46  this->velocity = velocity;
47  this->direction = direction;
48  this->outputFrequency = outputFrequency;
49  this->firstTimeStep = firstTimeStep;
50  this->filename = filename;
51  this->isMasterDevice = isMasterDevice;
52  this->numAtoms = numAtoms;
53  this->mGpuOn =_mGpuOn;
54  this->numDevices = numDevices;
55  this->deviceIndex = deviceIndex;
57 
58  smdAtomsGlobalIndex.clear();
59  // I need to save the global index of atoms. That way I can quickly rebuild the SMD index vector
60  allocate_host<double3>(&curCOM, 1);
61  parseAtoms();
62 
63  smdAtomsSOAIndex.resize(this->numSMDAtoms);
64  allocate_device<unsigned int>(&d_tbcatomic, 1);
65  allocate_device<double3>(&d_curCOM, 1);
66  allocate_device<int>(&d_smdAtomsSOAIndex, this->numSMDAtoms);
67 
68  if(mGpuOn)
69  {// each SMD needs its own peer array
70  allocate_device<double3>(&d_peerCOM, sizeof(double3));
71 #ifdef DEBUGM
72  allocate_host<double3>(&h_peerCOM, sizeof(double3));
73 #endif
74  }
75 
76  // set the current COM value to {0, 0, 0}
77  curCOM->x = 0.0;
78  curCOM->y = 0.0;
79  curCOM->z = 0.0;
80 
81  copy_HtoD<double3>(curCOM, d_curCOM, 1);
82  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int)));
83 
84 }
85 
86 ComputeSMDCUDA::~ComputeSMDCUDA(){
87  DebugM(3, "~ComputeSMDCUDA\n" << endi);
88  deallocate_host<double3>(&curCOM);
89  deallocate_device<unsigned int>(&d_tbcatomic);
90  deallocate_device<int>(&d_smdAtomsSOAIndex);
91 }
92 
93 // This builds the global vector index - swiped from GlobalMasterSMD.C
94 void ComputeSMDCUDA::parseAtoms(){
95  DebugM(3, "parseAtoms\n" << endi);
96  PDB smdpdb(filename);
97  origCOM.x = origCOM.y = origCOM.z = 0;
98  Molecule *mol = Node::Object()->molecule; // to get masses
99  int numPDBAtoms = smdpdb.num_atoms();
100  if(numPDBAtoms < 1 ) NAMD_die("No Atoms found in SMDFile\n");
101 
102  BigReal imass = 0;
103 
104  if (numPDBAtoms != this->numAtoms){
105  fprintf(stderr, "Error, wrong numPDB (%d vs %d)\n",numPDBAtoms, this->numAtoms);
106  NAMD_die("The number of atoms in SMDFile must be equal to the total number of atoms in the structure!\n");
107  }
108 
109  // Would this work on PDB atoms? Is the data replicated for everyone?
110  for(int i = 0; i < numPDBAtoms; i++){
111  // MEMOPT obviously doesn't work with CUDASOA, so we can just use this
112  PDBAtom *atom = smdpdb.atom(i);
113  if(atom->occupancy()){ // It's a SMD atom! Add it to the list
114  smdAtomsGlobalIndex.push_back(i);
115 
116  // compute the center of mass
117  BigReal mass = mol->atommass(i);
118  origCOM.x += atom->xcoor()*mass;
119  origCOM.y += atom->ycoor()*mass;
120  origCOM.z += atom->zcoor()*mass;
121  imass += mass;
122  }
123  }
124 
125  inv_group_mass = 1.0 / imass;
126  origCOM.x *= inv_group_mass;
127  origCOM.y *= inv_group_mass;
128  origCOM.z *= inv_group_mass;
129 
130  if (imass == 0) // we didn't find any!
131  NAMD_die("SMDFile contained no SMD atoms (atoms w/ nonzero occupancy)\n");
132 
133  this->numSMDAtoms = smdAtomsGlobalIndex.size();
134 }
135 
136 void ComputeSMDCUDA::updateAtoms(
137  std::vector<AtomMap*> &atomMapsList,
138  std::vector<CudaLocalRecord> &localRecords,
139  const int* h_globalToLocalID) {
140  DebugM(3, "[" << CkMyPe() << "]" <<" updateAtoms full "<< smdAtomsGlobalIndex.size()<< " SOA "<< smdAtomsSOAIndex.size()<< "\n" << endi);
141  smdAtomsSOAIndex.clear();
142 #ifdef DEBUGM
143  smdAtomsSOAtoGlobalLocalMap.clear();
144 #endif
145  for(int i = 0 ; i < this->numSMDAtoms; i++){
146  int gid = smdAtomsGlobalIndex[i];
147  LocalID lid;
148  // Search for a valid localID in all atoms
149  for(int j = 0 ; j < atomMapsList.size(); j++){
150  lid = atomMapsList[j]->localID(gid);
151  if( lid.pid != -1) break;
152  }
153  //JM NOTE: Fields of lid need to be != -1, bc the atom needs to be somewhere
154  // otherwise we have a bug
155  if(lid.pid == -1){
156  if(!mGpuOn)
157  {
158  NAMD_bug(" LocalAtomID not found in patchMap");
159  }
160  }
161  else{
162 
163  int soaPid = h_globalToLocalID[lid.pid]; // Converts global patch ID to its local position in our SOA data structures
164  int soaIndex = localRecords[soaPid].bufferOffset + lid.index;
165  smdAtomsSOAIndex.push_back(soaIndex);
166 #ifdef DEBUGM
167  // so we can reverse map from the sorted SOA list to decomp independent
168  smdAtomsSOAtoGlobalLocalMap[soaIndex]=gid;
169 #endif
170  }
171  }
172  int numLocalSMDAtoms=smdAtomsSOAIndex.size();
173  DebugM(3, "[" << CkMyPe() << "]" << " updateAtoms " << numLocalSMDAtoms << "\n" << endi);
174  if(numLocalSMDAtoms>0)
175  { // only a device involved in the computation will have all the results
176  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr(); // last one wins
177  cudaMgr->reducerSMDDevice.store(deviceIndex);
178  }
179  // Sort vector for better coalesce memory access
180  std::sort(smdAtomsSOAIndex.begin(), smdAtomsSOAIndex.end());
181  copy_HtoD<int>(smdAtomsSOAIndex.data(), d_smdAtomsSOAIndex, numLocalSMDAtoms);
182 }
183 
184 void ComputeSMDCUDA::computeCOMMGpu(
185  const Lattice lat,
186  const float * d_mass,
187  const double* d_pos_x,
188  const double* d_pos_y,
189  const double* d_pos_z,
190  const char3* d_transform,
191  cudaStream_t stream)
192 {
193  DebugM(3, "[" << CkMyPe() << "]" << " computeCOMMGpu " << this->smdAtomsSOAIndex.size() << " 1st "<< this->smdAtomsSOAIndex[0]<< "\n" << endi);
195  computeCOMSMDMgpu(this->smdAtomsSOAIndex.size(),lat,
196  d_mass, d_pos_x, d_pos_y, d_pos_z,
197  d_transform, this->d_smdAtomsSOAIndex,
198  d_peerCOM, cudaMgr->curSMDCOM,
199  this->d_tbcatomic, numDevices, deviceIndex, stream);
200 #ifdef DEBUGM
201  cudaCheck(cudaStreamSynchronize(stream));
202  copy_DtoH_sync<double3>(d_peerCOM, h_peerCOM, 1);
203  DebugM(3, "deviceIndex "<< deviceIndex << " COM " <<h_peerCOM[0].x*inv_group_mass<<", "
204  <<h_peerCOM[0].y*inv_group_mass<<", "
205  <<h_peerCOM[0].z*inv_group_mass<<"\n"<<endi);
206 
207 #endif
208 }
209 
210 void ComputeSMDCUDA::doForce(
211  const int timeStep,
212  const Lattice &lat,
213  bool doEnergy,
214  const float* d_mass,
215  const double* d_pos_x,
216  const double* d_pos_y,
217  const double* d_pos_z,
218  const char3* d_transform,
219  double* d_f_normal_x,
220  double* d_f_normal_y,
221  double* d_f_normal_z,
222  cudaTensor* d_extVirial,
223  double* h_extEnergy,
224  double3* h_extForce,
225  cudaTensor* h_extVirial,
226  cudaStream_t stream
227  )
228 {
229  DebugM(3, "[" << CkMyPe() << "]" << " doForce " << this->smdAtomsSOAIndex.size() <<"\n" << endi);
231 
232  bool doOutput = ((timeStep % this->outputFrequency) == 0);
233  if(mGpuOn)
234  { // only the reducerDevice does output and energy
235  doOutput&=cudaMgr->reducerSMDDevice == deviceIndex;
236  }
237 
238 #ifdef DEBUGM
239  cudaCheck(cudaStreamSynchronize(stream));
240  copy_DtoH_sync<double3>(d_peerCOM, h_peerCOM, 1);
241  DebugM(3, " deviceIndex "<< deviceIndex << " com " <<h_peerCOM[0].x*inv_group_mass<<", "
242  <<h_peerCOM[0].y*inv_group_mass<<", "
243  <<h_peerCOM[0].z*inv_group_mass<<"\n"<<endi);
244 
245 #endif
246 
247 
248  int bvalue;
249 
250  computeSMDForce(
251  lat,
252  this->inv_group_mass,
253  this->springConstant,
254  this->transverseSpringConstant,
255  this->velocity,
256  this->direction,
257  doEnergy || doOutput,
258  timeStep,
259  mGpuOn,
260  this->origCOM,
261  d_mass,
262  d_pos_x,
263  d_pos_y,
264  d_pos_z,
265  d_transform,
266  d_f_normal_x,
267  d_f_normal_y,
268  d_f_normal_z,
269  this->smdAtomsSOAIndex.size(),
270  this->d_smdAtomsSOAIndex,
271  this->d_curCOM,
272  this->curCOM,
273  cudaMgr->curSMDCOM,
274  d_extVirial,
275  h_extEnergy,
276  h_extForce,
277  h_extVirial,
278  this->d_tbcatomic,
279  numDevices,
280  deviceIndex,
281  stream
282  );
283  if(doOutput){
284  cudaCheck(cudaStreamSynchronize(stream));
285 #ifdef DEBUGM
286  Vector f(h_extForce->x, h_extForce->y, h_extForce->z);
287 #endif
288  DebugM(3, "[" << CkMyPe() << "]" << " co force " << f*PNPERKCALMOL <<"\n" << endi);
289  if(cudaMgr->reducerSMDDevice == deviceIndex)
290  {
291  // only one device does the output
292  outputStep(timeStep, curCOM, h_extForce);
293  }
294  }
295 }
296 
297 void ComputeSMDCUDA::outputStep(const int timeStep, double3* curCOM, double3* extForce)
298 {
299  Vector p(curCOM->x, curCOM->y, curCOM->z);
300  Vector f(extForce->x, extForce->y, extForce->z);
301  if(timeStep % (100*this->outputFrequency) == 0) {
302  iout << "SMDTITLE: TS CURRENT_POSITION FORCE\n" << endi;
303  }
304  iout << "SMD " << timeStep << ' ' << p << ' ' << f*PNPERKCALMOL << '\n' << endi;
305 
306 }
307 void ComputeSMDCUDA::outputCOM(std::string comment)
308 {
309 #ifdef DEBUGM
311  Vector p(curCOM->x, curCOM->y, curCOM->z);
312  // Vector g(cudaMgr->curSMDCOM[0].x*inv_group_mass, cudaMgr->curSMDCOM[0].y*inv_group_mass, cudaMgr->curSMDCOM[0].z*inv_group_mass);
313  // DebugM(3, comment << " origCOM : "<< this->origCOM <<" cur : "<<p<< " G : "<< g << "\n");
314 #endif
315 }
316 
317 
318 void ComputeSMDCUDA::initPeerCOM(double3** d_peerPoolCOM, cudaStream_t stream){
319  DebugM(3, "[" << CkMyPe() << "]" << " initPeerCOM\n" << endi);
320  initPeerCOMmgpu(numDevices, deviceIndex, d_peerPoolCOM, d_peerCOM, stream);
321 }
322 
323 
324 void ComputeSMDCUDA::dump(std::string comment,
325  const int step,
326  const int numAtoms,
327  const double* d_pos_x,
328  const double* d_pos_y,
329  const double* d_pos_z,
330  const float* d_mass
331  )
332 {
333 
334 #ifdef DEBUGM
335  DebugM(3, "[" << CkMyPe() << "]" << " dump " << comment<<" "<< smdAtomsSOAIndex.size() <<"\n" << endi);
337  // copy data from GPU to local buffers for output
338 
339  // output GID X Y Z MASS as these can be collected for decomp
340  // independent comparisons
341  float* mass;
342  double *pos_x;
343  double *pos_y;
344  double *pos_z;
345  allocate_host<double>(&pos_x, numAtoms);
346  allocate_host<double>(&pos_y, numAtoms);
347  allocate_host<double>(&pos_z, numAtoms);
348  allocate_host<float>(&mass, numAtoms);
349  copy_DtoH_sync<double>(d_pos_x, pos_x, numAtoms);
350  copy_DtoH_sync<double>(d_pos_y, pos_y, numAtoms);
351  copy_DtoH_sync<double>(d_pos_z, pos_z, numAtoms);
352  copy_DtoH_sync<float>(d_mass, mass, numAtoms);
353  std::ofstream ofs;
354  std::string name("smd-");
355  name+=std::to_string(CkMyPe());
356  name+=".dat";
357  ofs.open(name, std::ofstream::out | std::ofstream::app);
358  for(int i=0; i<smdAtomsSOAIndex.size(); i++)
359  {
360  int soaid=smdAtomsSOAIndex[i];
361  ofs <<step<<"- "<< smdAtomsSOAtoGlobalLocalMap[soaid]<<":"<<pos_x[soaid] <<","<<pos_y[soaid]<<","<<pos_z[soaid]<<","<<mass[soaid]<< "\n";
362  }
363  ofs.close();
364 #endif
365 }
366 
367 #endif // NODEGROUP_FORCE_REGISTER
static Node * Object()
Definition: Node.h:86
Definition: PDB.h:36
Definition: Vector.h:72
#define DebugM(x, y)
Definition: Debug.h:75
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
BigReal zcoor(void)
Definition: PDBData.C:433
#define iout
Definition: InfoStream.h:51
Molecule stores the structural information for the system.
Definition: Molecule.h:174
std::atomic< int > reducerSMDDevice
int32 index
Definition: NamdTypes.h:300
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
void NAMD_bug(const char *err_msg)
Definition: common.C:196
static ComputeCUDAMgr * getComputeCUDAMgr()
BigReal ycoor(void)
Definition: PDBData.C:429
void NAMD_die(const char *err_msg)
Definition: common.C:148
PatchID pid
Definition: NamdTypes.h:299
Real atommass(int anum) const
Definition: Molecule.h:1114
BigReal xcoor(void)
Definition: PDBData.C:425
#define PNPERKCALMOL
Definition: common.h:59
#define cudaCheck(stmt)
Definition: CudaUtils.h:242
BigReal occupancy(void)
Definition: PDBData.C:444
Molecule * molecule
Definition: Node.h:179
double BigReal
Definition: common.h:123