NAMD
SequencerCUDA.h
Go to the documentation of this file.
1 #ifndef SEQUENCERCUDA_H
2 #define SEQUENCERCUDA_H
3 
4 #ifdef NAMD_CUDA
5 #include <curand.h>
6 //**< Field to be used for external force, energy, and virial */
7 #endif
8 
9 #ifdef NAMD_HIP
10 #include <hiprand/hiprand.h>
11 #endif
12 
13 #include "NamdTypes.h"
14 #include "HomePatch.h"
15 #include "PatchTypes.h"
16 #include "ProcessorPrivate.h"
17 #include "ReductionMgr.h"
18 #include "SimParameters.h"
19 #include "SequencerCUDAKernel.h"
20 #include "MShakeKernel.h"
21 #include "ComputeRestraintsCUDA.h"
22 #include "ComputeSMDCUDA.h"
24 #include "ComputeGridForceCUDA.h"
25 #include "ComputeLonepairsCUDA.h"
26 #include "ComputeConsForceCUDA.h"
27 #include "PatchData.h"
28 #include "Lattice.h"
29 #include "MigrationCUDAKernel.h"
30 #include "CollectiveDeviceBuffer.h"
31 #include "HipDefines.h"
32 
33 #ifdef NODEGROUP_FORCE_REGISTER
34 
// Tags indexing the external-force contributions computed on the GPU.
// Most tags correspond to a Compute*CUDA object included above; the final
// enumerator EXT_FORCE_TOTAL sits after all real tags and presumably serves
// as the tag count / array-size sentinel (TODO confirm against users).
35 enum ExtForceTag {
36  EXT_CONSTRAINTS = 0, // positional restraints (see ComputeRestraintsCUDA.h)
37  EXT_ELEC_FIELD, // external electric field -- presumably; no compute visible in this header
38  EXT_SMD, // steered MD (see ComputeSMDCUDA.h)
39  EXT_GROUP_RESTRAINTS, // group restraints (ComputeGroupRestraintsCUDA member below)
40  EXT_GRIDFORCE, // grid forces (see ComputeGridForceCUDA.h)
41  EXT_CONSFORCE, // constant forces (see ComputeConsForceCUDA.h)
42  EXT_GLOBALMTS, // global-master multiple-timestep forces -- TODO confirm
43  EXT_FORCE_TOTAL // sentinel: number of external force tags
44 };
45 
// Per-device GPU sequencer state for NAMD integration.  This header only
// declares the interface: apart from the small inline helpers below, no
// method bodies are visible here.  The class aggregates device- and
// host-side buffers (positions, velocities, normal/nonbond/slow forces,
// virials, reduction scalars), atom-migration scratch structures, and
// pointers to the CUDA kernel/compute objects that act on them.  A per-PE
// instance is kept in Charm++ processor-private storage (see CkpvAccess
// below).
46 class SequencerCUDA{
47  friend class HomePatch;
48  friend class Sequencer;
49 public:
 // Singleton-style access: InstanceInit creates, Object/ObjectOnPe fetch
 // the processor-private instance (ObjectOnPe reads another rank's copy).
50  static SequencerCUDA *InstanceInit(const int, SimParameters*);
51  inline static SequencerCUDA *Object() { return CkpvAccess(SequencerCUDA_instance); }
52  inline static SequencerCUDA *ObjectOnPe(int pe) { return CkpvAccessOther(SequencerCUDA_instance, CmiRankOf(pe)); }
53
 // Patch check-in bookkeeping and the Charm++ threads suspended on this device.
54  int numPatchesCheckedIn;
55  int numPatchesReady;
56  std::vector<CthThread> waitingThreads;
57  CthThread masterThread;
58  bool masterThreadSleeping = false;
59  bool breakSuspends = false;
 // Lifecycle and buffer (re)allocation / host<->device copy entry points.
60  SequencerCUDA(const int, SimParameters*);
61  ~SequencerCUDA();
62  void initialize();
63  void zeroScalars();
64  bool reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy);
65  void reallocateMigrationDestination();
66  void deallocateArrays();
67  void deallocateStaticArrays();
68  void copyAoSDataToHost();
69  void copyPatchDataToHost();
70  void copyAtomDataToDeviceAoS();
71
72  void copyAtomDataToDevice(bool copyForces, int maxForceNumber);
73
74  bool copyPatchData(const bool copyIn, const bool startup);
75  void copyDataToPeers(const bool copyIn);
 // --- Atom-migration pipeline (declaration order suggests call order;
 // bodies not visible here) ---
76  void migrationLocalInit();
77  void migrationPerform();
78  void migrationLocalPost(int startup);
79  void migrationUpdateAdvancedFeatures(const int startup);
80  void migrationUpdateAtomCounts();
81  void migrationUpdateAtomOffsets();
82  void migrationUpdateRemoteOffsets();
83  void migrationUpdateProxyDestination();
84  void migrationUpdateDestination();
85  void migrationSortAtomsNonbonded();
86  void sync();
87
88  void copyMigrationInfo(HomePatch *p, int patchIndex);
89
90  void assembleOrderedPatchList();
 // --- Monte Carlo pressure control (propose in part1/part2, then accept
 // or reject the volume move) ---
93  void monteCarloPressure_part1(Tensor &factor, Vector &origin, Lattice &oldLattice);
 // NOTE(review): parameter name "doGloblal" below is a typo for "doGlobal";
 // harmless in a declaration but worth fixing in the real header.
95  void monteCarloPressure_part2(int step, int maxForceNumber,
96  const bool doEnergy, const bool doGloblal, const bool doVirial);
97
99  void monteCarloPressure_reject(Lattice &lattice);
101  void monteCarloPressure_accept(const int doMigration);
102
 // Make t exactly symmetric by copying its lower-triangle components
 // (yx, zx, zy) onto the corresponding upper-triangle slots.
105  inline static void tensor_enforce_symmetry(Tensor& t) {
106  t.xy = t.yx;
107  t.xz = t.zx;
108  t.yz = t.zy;
109  }
110
 // --- Timestep launch phases (part1 / part11 / part2 / part3) ---
111  void launch_part1(
112  int step,
113  double dt_normal,
114  double dt_nbond,
115  double dt_slow,
116  double velrescaling,
117  const double maxvel2,
118  Tensor &factor,
119  Vector &origin,
120  Lattice &lattice,
121  int reassignVelocitiesStep,
122  int langevinPistonStep,
123  int berendsenPressureStep,
124  int maxForceNumber,
125  const int copyIn,
126  const int savePairlists,
127  const int usePairlists,
128  const bool doEnergy);
129
130  void launch_part11(
131  double dt_normal,
132  double dt_nbond,
133  double dt_slow,
134  double velrescaling,
135  const double maxvel2,
136  Tensor &factor,
137  Vector &origin,
138  Lattice &lattice,
139  int langevinPistonStep,
140  int maxForceNumber,
141  const int copyIn,
142  const int savePairlists,
143  const int usePairlists,
144  const bool doEnergy);
145
146  void launch_set_compute_positions();
147
148  void launch_part2(
149  const int doMCPressure,
150  double dt_normal,
151  double dt_nbond,
152  double dt_slow,
153  Vector &origin,
154  int step,
155  int maxForceNumber,
156  const int langevinPistonStep,
157  const int copyIn,
158  const int copyOut,
159  const int doGlobal,
160  const bool doEnergy);
161
162  void launch_part3(
163  const int doMCPressure,
164  double dt_normal,
165  double dt_nbond,
166  double dt_slow,
167  Vector &origin,
168  int step,
169  int maxForceNumber,
170  const bool requestGlobalForces,
171  const int doGlobalStaleForces,
172  const bool forceRequestedGPU,
173  const int copyIn,
174  const int copyOut,
175  const bool doEnergy,
176  const bool requestForcesOutput);
177
179  void copyGlobalForcesToDevice();
180
181  void copySettleParameter();
182  void finish_part1(const int copyIn,
183  const int savePairlists,
184  const int usePairlists);
185
186  void update_patch_flags();
187  void finish_patch_flags(int isMigration);
188  void updatePairlistFlags(const int doMigration);
189  void updateDeviceKernels();
190  void setRescalePairlistTolerance(const bool val);
191
192  // For total forces on CUDA global master
193  void allocateGPUSavedForces();
 // Two CUDA streams; stream2 has its own copy-completion events (below).
194  cudaStream_t stream, stream2;
195  PatchData *patchData;
196  friend class CudaGlobalMasterServer;
197 private:
198  const int deviceID;
199  bool mGpuOn; /*<! True if nDevices > 1*/
200  int nDevices; /*<! Total number of GPUs in the simulation -- number of MasterPEs */
201
202
203  int deviceIndex;
204  cudaEvent_t stream2CopyDone, stream2CopyAfter;
205  SimParameters *const simParams;
206
207  SubmitReduction *reduction;
208
209  std::vector<AtomMap*> atomMapList;
210
211  //
212  // Collective Device Buffers
213  //
214
215  // Positions
 // NOTE(review): the CollectiveDeviceBuffer declarations that belong under
 // the "Positions" and "Velocities" headings are missing from this listing
 // (apparently lost in extraction) -- restore from the original header.
219
220  // Velocities
224
225  // Bonded Forces
226  CollectiveDeviceBuffer<double> coll_f_normal_x;
227  CollectiveDeviceBuffer<double> coll_f_normal_y;
228  CollectiveDeviceBuffer<double> coll_f_normal_z;
229
230  // Nonbonded Forces
231  CollectiveDeviceBuffer<double> coll_f_nbond_x;
232  CollectiveDeviceBuffer<double> coll_f_nbond_y;
233  CollectiveDeviceBuffer<double> coll_f_nbond_z;
234
235  // Slow Forces
236  CollectiveDeviceBuffer<double> coll_f_slow_x;
237  CollectiveDeviceBuffer<double> coll_f_slow_y;
238  CollectiveDeviceBuffer<double> coll_f_slow_z;
239
240  // Misc. atom buffers buffers
241  CollectiveDeviceBuffer<float> coll_charge;
242  CollectiveDeviceBuffer<int> coll_partition;
243
244  // Migration related buffers
245  CollectiveDeviceBuffer<int4> coll_migrationDestination;
246  CollectiveDeviceBuffer<int> coll_sortSoluteIndex;
247  CollectiveDeviceBuffer<int> coll_idMig;
248  CollectiveDeviceBuffer<int> coll_vdwType;
249  CollectiveDeviceBuffer<int> coll_sortOrder;
250  CollectiveDeviceBuffer<int> coll_unsortOrder;
251  CollectiveDeviceBuffer<double3> coll_patchCenter;
252
253  CollectiveDeviceBuffer<FullAtom> coll_atomdata_AoS_in;
254
255  // Migration Structures
256  FullAtom * d_atomdata_AoS;
257
258  int *d_migrationGroupSize;
259  int *d_migrationGroupIndex;
260  int *d_sortIndex;
261
262  int *idMig;
263  int *vdwType;
264
 // Device arrays ("d_" prefix = device pointer by this file's convention).
265  // Device arrays
266 #if 1
267  double *d_recipMass;
268 #else
269  float *d_recipMass; // mass should be float
270 #endif
271  // for fixed atoms
272  double *d_fixedPosition_x, *d_fixedPosition_y, *d_fixedPosition_z;
273
274  double *d_f_saved_nbond_x, *d_f_saved_nbond_y, *d_f_saved_nbond_z;
275  double *d_f_saved_slow_x, *d_f_saved_slow_y, *d_f_saved_slow_z;
276
277  double *d_posNew_raw;
278  double *d_posNew_x, *d_posNew_y, *d_posNew_z;
279
280  double *d_f_global_x, *d_f_global_y, *d_f_global_z;
281
282  double *d_rcm_x, *d_rcm_y, *d_rcm_z;
283  double *d_vcm_x, *d_vcm_y, *d_vcm_z;
284
285  // backup forces for monte carlo barostat
286  double *d_f_rawMC; // raw buffer for all backup force array in MC barostat
287  double *d_pos_rawMC; // raw buffer for all backup position array in MC barostat
288  double *d_f_normalMC_x, *d_f_normalMC_y, *d_f_normalMC_z;
289  double *d_f_nbondMC_x, *d_f_nbondMC_y, *d_f_nbondMC_z;
290  double *d_f_slowMC_x, *d_f_slowMC_y, *d_f_slowMC_z;
291  // backup positions for monte carlo barostat
292  double *d_posMC_x, *d_posMC_y, *d_posMC_z;
293
294  int *d_id; /*<! global atom index */
295  int *d_idOrder; /*<! Maps global atom index to local*/
296  int *d_moleculeStartIndex; /*<! starting index of each molecule */
297  int *d_moleculeAtom;/*<! Atom index sorted for all molecules */
298
299  // XXX TODO: Maybe we can get away with not allocating these arrays
300  double *d_velNew_x, *d_velNew_y, *d_velNew_z;
301  double *d_posSave_x, *d_posSave_y, *d_posSave_z;
302  char3 *d_transform;
303
304  int *d_patchOffsetTemp;
305
306  float *d_rigidBondLength;
307  int *d_atomFixed;
308  int *d_groupFixed;
309  float *d_mass;
310  float *d_langevinParam;
311  float *d_langScalVelBBK2;
312  float *d_langScalRandBBK2;
313  float *d_gaussrand_x, *d_gaussrand_y, *d_gaussrand_z;
314  int *d_hydrogenGroupSize;
315  int *d_consFailure;
316  size_t d_consFailureSize;
317  int *settleList; /*<! Indexes of HGs to be calculated through SETTLE*/
318  size_t settleListSize;
319  CudaRattleElem* rattleList; /*<! Indexes of HGs with rigid bonds to be calculated through RATTLE*/
320  size_t rattleListSize;
321  int* d_globalToLocalID; /*<! maps PatchID to localID inside SOA structures */
322  int* d_patchToDeviceMap; /*<! Maps a patch to a device */
323  double3* d_awayDists; /*<! 'away' field from marginCheck */
324  double3* d_patchMin; /*<! 'min' field from HomePatch */
325  double3* d_patchMax; /*<! 'max' field from HomePatch */
326  double* d_patchMaxAtomMovement; /*<! maxMovement of atoms in the patch*/
327  double* d_patchNewTolerance; /*<! maxMovement of atoms in the patch*/
328  unsigned int* d_tbcatomic;
329
330  PatchDataSOA* d_HostPatchDataSOA;
331
 // Host-side mirrors of the SOA atom arrays above.
332  // Host arrays
333  double *recipMass;
334  double *f_global_x, *f_global_y, *f_global_z;
335  double *f_normal_x, *f_normal_y, *f_normal_z;
336  double *f_nbond_x, *f_nbond_y, *f_nbond_z;
337  double *f_slow_x, *f_slow_y, *f_slow_z;
338  double *vel_x, *vel_y, *vel_z;
339  double *pos_x, *pos_y, *pos_z;
340  char3 *transform;
341
342  float *mass;
343  float *charge;
344  int *partition;
345  float *langevinParam;
346  float *langScalVelBBK2;
347  float *langScalRandBBK2;
348  //float *gaussrand_x, *gaussrand_y, *gaussrand_z;
349  int *hydrogenGroupSize; /*<! hydrogen size for each heavy atom. Hydrogens have 0*/
350  float *rigidBondLength;
351  // for fixed atoms
352  int* atomFixed;
353  int* groupFixed; // fixed atom + langevin piston
354  double* fixedPosition_x;
355  double* fixedPosition_y;
356  double* fixedPosition_z;
357  int* globalToLocalID; /*<! maps PatchID to localID inside SOA structure */
358  int* patchToDeviceMap; /*<! maps PatchID to localID inside SOA structure */
359  int* id; /*<! Global atom index */
360  double3* patchCenter;
361  double3* awayDists;
362  double3* patchMin;
363  double3* patchMax;
364  double* patchMaxAtomMovement;
365  double* patchNewTolerance;
366  int* computeNbondPosition;
367  int* sortOrder;
368  int* unsortOrder;
369  Lattice* pairlist_lattices;
370  double pairlist_newTolerance;
371  Lattice myLattice;
372  Lattice myLatticeOld;
373
374  CudaMInfo *mInfo;
375
 // Host-mapped scalars: single values the GPU writes and the host reads
 // -- presumably allocated as mapped/pinned memory (TODO confirm in .C file).
376  // Host-Mapped scalars
377  int* killme;
378  BigReal* kineticEnergy_half;
379  BigReal* intKineticEnergy_half;
380  BigReal* kineticEnergy;
381  BigReal* intKineticEnergy;
382  BigReal* momentum_x;
383  BigReal* momentum_y;
384  BigReal* momentum_z;
385  BigReal* angularMomentum_x;
386  BigReal* angularMomentum_y;
387  BigReal* angularMomentum_z;
388  cudaTensor *virial;
389  cudaTensor *virial_half;
390  cudaTensor *intVirialNormal;
391  cudaTensor *intVirialNormal_half;
392  cudaTensor *rigidVirial;
393  cudaTensor *intVirialNbond;
394  cudaTensor *intVirialSlow;
395  cudaTensor *lpVirialNormal;
396  cudaTensor *lpVirialNbond;
397  cudaTensor *lpVirialSlow;
398
399  double3 *extForce;
400  double *extEnergy;
401  cudaTensor *extVirial;
402
403  unsigned int *h_marginViolations;
404  unsigned int *h_periodicCellSmall;
405
406  unsigned int totalMarginViolations;
407
408  // Device scalars
409  bool buildRigidLists;
410  int *d_killme;
411  BigReal *d_kineticEnergy;
412  BigReal *d_intKineticEnergy;
413  cudaTensor *d_virial;
414  cudaTensor *d_intVirialNormal;
415  cudaTensor *d_intVirialNbond;
416  cudaTensor *d_intVirialSlow;
417  cudaTensor *d_rigidVirial;
418  cudaTensor *d_lpVirialNormal;
419  cudaTensor *d_lpVirialNbond;
420  cudaTensor *d_lpVirialSlow;
421  // cudaTensor *d_extTensor;
422  BigReal *d_momentum_x;
423  BigReal *d_momentum_y;
424  BigReal *d_momentum_z;
425  BigReal *d_angularMomentum_x;
426  BigReal *d_angularMomentum_y;
427  BigReal *d_angularMomentum_z;
428  Lattice *d_lattices;
429  Lattice *d_pairlist_lattices;
430
431  double3 *d_extForce;
432  double *d_extEnergy;
433  cudaTensor *d_extVirial;
434  CudaMInfo *d_mInfo;
435
436  // Haochuan: for fixed atoms
437  cudaTensor *d_fixVirialNormal;
438  cudaTensor *d_fixVirialNbond;
439  cudaTensor *d_fixVirialSlow;
440  double3 *d_fixForceNormal;
441  double3 *d_fixForceNbond;
442  double3 *d_fixForceSlow;
443
 // Atom/patch counts for home vs. home+proxy sets and their allocations.
444  int numAtomsHome;
445  int numAtomsHomePrev;
446  int numAtomsHomeAllocated;
447  int numAtomsHomeAndProxy;
448  int numAtomsHomeAndProxyAllocated;
449
450  int numPatchesGlobal;
451  int numPatchesHomeAndProxy;
452  int numPatchesHome;
453
454  int marginViolations;
455  bool rescalePairlistTolerance;
456  int nSettle, nRattle;
457  BigReal maxvel2;
458  SettleParameters *sp;
459  CudaAtom** cudaAtomLists;
460
461  CmiNodeLock printlock;
462
463  cudaEvent_t eventStart, eventStop; // events for kernel timing
464  float t_total;
465
466  // Launch_pt1 timers
467  float t_vverlet;
468  float t_pairlistCheck;
469  float t_setComputePositions;
470
471  // Launch_pt2 timers
472  float t_accumulateForceKick;
473  float t_rattle;
474  float t_submitHalf;
475  float t_submitReductions1;
476  float t_submitReductions2;
477
478  // True if system is periodic in all directions
479  bool isPeriodic;
480
481  std::vector<HomePatch*> patchList;
482
483 #if 1
484  // TODO remove once GPU migration is merged
485  std::vector<HomePatch*> patchListHomeAndProxy;
486 #endif
487
488  int* consFailure;
489  unsigned long long int d_ullmaxtol;
 // Kernel and per-feature compute objects (raw pointers; presumably owned
 // by this class and released in the destructor -- confirm in the .C file).
490  SequencerCUDAKernel *CUDASequencerKernel;
491  MigrationCUDAKernel *CUDAMigrationKernel;
492  ComputeRestraintsCUDA *restraintsKernel;
493  ComputeSMDCUDA *SMDKernel;
494  ComputeGroupRestraintsCUDA *groupRestraintsKernel;
495  ComputeGridForceCUDA *gridForceKernel;
496  ComputeLonepairsCUDA *lonepairsKernel;
497  curandGenerator_t curandGen;
498  ComputeConsForceCUDA *consForceKernel;
499  // alchemical used PME grids
500  size_t num_used_grids;
501  std::vector<int> used_grids;
502
503 #if 1
504  unsigned int* deviceQueue; // pointer to work queue this device holds
505
506  // if we have more than one device running this, we keep a record of all devices' data here
507  bool** h_patchRecordHasForces;
508  bool** d_patchRecordHasForces;
509  char* d_barrierFlag;
510
511  CudaLocalRecord** d_peer_record;
512
513 #endif
514
 // Private per-step helpers (reductions, constraint failure checks, startup
 // paths); bodies not visible in this header.
515  void maximumMove(
516  const double maxvel2,
517  const int numAtoms);
518  void submitHalf(
519  int numAtoms, int part,
520  const bool doEnergy);
521  void submitReductions(
522  BigReal origin_x,
523  BigReal origin_y,
524  BigReal origin_z,
525  int marginViolations,
526  int doEnergy, // doEnergy
527  int doMomentum,
528  int numAtoms,
529  int maxForceNumber);
530
531  void submitReductionValues();
532  void copyPositionsAndVelocitiesToHost(bool copyOut, const int doGlobal);
533  void copyPositionsToHost();
534  void startRun1(int maxForceNumber, const Lattice& lat);
535  void startRun2(
536  double dt_normal,
537  double dt_nbond,
538  double dt_slow,
539  Vector origin,
540  int doGlobal,
541  int maxForceNumber);
542  void startRun3(
543  double dt_normal,
544  double dt_nbond,
545  double dt_slow,
546  Vector origin,
547  const bool requestGlobalForces,
548  int doGlobalMasterStateForces,
549  const bool requestForcesOutput,
550  const bool requestGlobalForcesGPU,
551  int maxForceNumber);
552
553  // TIP4 water model
554  void redistributeTip4pForces(
555  const int maxForceNumber,
556  const int doVirial);
557
558  void printSOAForces(char *);
559  void printSOAPositionsAndVelocities();
560  void registerSOAPointersToHost();
561  void copySOAHostRegisterToDevice();
564  void calculateExternalForces(
565  const int step,
566  const int maxForceNumber,
567  const int doEnergy,
568  const int doVirial);
569
570  void atomUpdatePme();
571
572  void updateHostPatchDataSOA();
573  void saveForceCUDASOA_direct(
574  const bool doGlobal,
575  const bool doForcesOutput,
576  const int maxForceNumber);
577  void copyPositionsToHost_direct();
578
 // Trivial accessors for home-patch counts and geometry arrays.
579  int getNumPatchesHome() { return numPatchesHome; }
580
581  double3* getHostPatchMin() { return patchMin; }
582  double3* getHostPatchMax() { return patchMax; }
583  double3* getHostAwayDists() { return awayDists; }
584
585  CollectiveBufferType defaultCollectiveBufferType = CollectiveBufferType::SingleProcess;
586 };
587 
588 #endif // NODEGROUP_FORCE_REGISTER
589 #endif // SEQUENCERCUDA_H
BigReal zy
Definition: Tensor.h:19
friend class SequencerCUDA
Definition: Sequencer.h:49
BigReal xz
Definition: Tensor.h:17
static void partition(int *order, const FullAtom *atoms, int begin, int end)
Definition: SortAtoms.C:45
Definition: Vector.h:72
bool masterThread
Definition: Sequencer.h:331
BigReal yz
Definition: Tensor.h:18
BigReal yx
Definition: Tensor.h:18
#define simParams
Definition: Output.C:131
Definition: Tensor.h:15
BigReal xy
Definition: Tensor.h:17
BigReal zx
Definition: Tensor.h:19
CollectiveBufferType
double BigReal
Definition: common.h:123
A class for copying atom information from SequencerCUDA to CudaGlobalMasterClient.