NAMD
ComputeCUDAMgr.C
#include "NamdTypes.h"
#include "common.h"
#include "Node.h"
#include "ComputeCUDAMgr.h"
#include "PatchData.h"
#include "DeviceCUDA.h"
#include "CudaUtils.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;

//
// Class constructor
//
ComputeCUDAMgr::ComputeCUDAMgr() {
  // __sdag_init();
  numDevices = 0;
  // numNodesContributed = 0;
  // numDevicesMax = 0;
  cudaPmeOneDevice = NULL;  // XXX is this needed?
  cudaGlobalMasterObject = nullptr;
}

//
// Migration constructor (migration is not supported)
//
ComputeCUDAMgr::ComputeCUDAMgr(CkMigrateMessage *) {
  // __sdag_init();
  NAMD_bug("ComputeCUDAMgr cannot be migrated");
  numDevices = 0;
  // numNodesContributed = 0;
  // numDevicesMax = 0;
  cudaPmeOneDevice = NULL;  // XXX is this needed?
  cudaGlobalMasterObject = nullptr;
}

//
// Class destructor
//
ComputeCUDAMgr::~ComputeCUDAMgr() {
  for (int i=0; i < numDevices; i++) {
    if (cudaNonbondedTablesList[i] != NULL) delete cudaNonbondedTablesList[i];
    if (cudaComputeNonbondedList[i] != NULL) delete cudaComputeNonbondedList[i];
#ifdef BONDED_CUDA
    if (computeBondedCUDAList[i] != NULL) delete computeBondedCUDAList[i];
#endif
    if (curSMDCOM != NULL && curSMDCOM[i] != NULL) cudaFree(curSMDCOM[i]);
    if (curGrp1COM != NULL && curGrp1COM[i] != NULL) cudaFree(curGrp1COM[i]);
    if (curGrp2COM != NULL && curGrp2COM[i] != NULL) cudaFree(curGrp2COM[i]);
  }
  if (curSMDCOM != NULL) cudaFree(curSMDCOM);
  if (curGrp1COM != NULL) cudaFree(curGrp1COM);
  if (curGrp2COM != NULL) cudaFree(curGrp2COM);
}

//
// Initialize manager
// This gets called on rank 0 of each node
//
void ComputeCUDAMgr::initialize(CkQdMsg *msg) {
  if (msg != NULL) delete msg;

  numDevices = deviceCUDA->getDeviceCount();
#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *pdata = cpdata.ckLocalBranch();
  int ndevs = deviceCUDA->getNumDevice() + 1*deviceCUDA->isGpuReservedPme();
  pdata->devData.resize(numDevices);

  {
    // Pointers to SOA integration data
    allocate_host<bool*>(&(pdata->h_devHasForces), ndevs);

    allocate_host<int*>(&(pdata->h_soa_id), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_soa_vdwType), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_soa_sortOrder), deviceCUDA->getNumDevice());
    allocate_host<int4*>(&(pdata->h_soa_migrationDestination), deviceCUDA->getNumDevice());

    allocate_host<int*>(&(pdata->h_soa_partition), deviceCUDA->getNumDevice());

    allocate_host<CudaLocalRecord*>(&(pdata->h_peer_record), deviceCUDA->getNumDevice());

    allocate_host<int*>(&(pdata->h_tupleCount.bond), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.angle), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.dihedral), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.improper), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.modifiedExclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.exclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.crossterm), deviceCUDA->getNumDevice());

    allocate_host<int*>(&(pdata->h_tupleOffset.bond), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.angle), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.dihedral), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.improper), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.modifiedExclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.exclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.crossterm), deviceCUDA->getNumDevice());

    allocate_host<CudaBondStage*>(&(pdata->h_tupleDataStage.bond), deviceCUDA->getNumDevice());
    allocate_host<CudaAngleStage*>(&(pdata->h_tupleDataStage.angle), deviceCUDA->getNumDevice());
    allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.dihedral), deviceCUDA->getNumDevice());
    allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.improper), deviceCUDA->getNumDevice());
    allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.modifiedExclusion), deviceCUDA->getNumDevice());
    allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.exclusion), deviceCUDA->getNumDevice());
    allocate_host<CudaCrosstermStage*>(&(pdata->h_tupleDataStage.crossterm), deviceCUDA->getNumDevice());
  }

  // Allocate the work queues
  allocate_host<unsigned int*>(&(pdata->d_queues), ndevs);
  allocate_host<unsigned int>(&(pdata->d_queueCounters), ndevs);

  cudaCheck(cudaMemset(pdata->d_queueCounters, 0, sizeof(unsigned int)*ndevs));

  pdata->migrationFlagPerDevice.resize(deviceCUDA->getNumDevice());

  pdata->tupleReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
  pdata->atomReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
  pdata->maxNumBonds.store(0);
  pdata->maxNumAngles.store(0);
  pdata->maxNumDihedrals.store(0);
  pdata->maxNumImpropers.store(0);
  pdata->maxNumModifiedExclusions.store(0);
  pdata->maxNumExclusions.store(0);
  pdata->maxNumCrossterms.store(0);
  pdata->devicePatchMapFlag.resize(CkNumPes(), 0);
#ifdef NAMD_NCCL_ALLREDUCE
  // Allocate NCCL-related stuff
  deviceCUDA->setupNcclUniqueId();
  // After I do this, I can go ahead and register it in patchData
  pdata->ncclId = deviceCUDA->getNcclUniqueId(); // registered in ngroup
#endif
  // Allocate global data for mGpuOn shared-memory accumulation:
  // one per device, each referencing a numDevices-element buffer
  SimParameters *simParams = Node::Object()->simParameters;
  if (simParams->SMDOn && numDevices > 1) {
    allocate_device<double3*>(&curSMDCOM, sizeof(double3*)*numDevices);
  }
  else
  {
    curSMDCOM = NULL;
  }
  if (simParams->groupRestraintsOn) {
    // as for SMD, but we need numGroups buffers for type1 and type2
    allocate_host<double3**>(&curGrp1COM, sizeof(double3**)*simParams->groupRestraintsCount);
    allocate_host<double3**>(&curGrp2COM, sizeof(double3**)*simParams->groupRestraintsCount);
    for (int i = 0; i < simParams->groupRestraintsCount; i++)
    {
      allocate_device<double3*>(&curGrp1COM[i], sizeof(double3*)*numDevices);
      allocate_device<double3*>(&curGrp2COM[i], sizeof(double3*)*numDevices);
    }
  }
  else
  {
    curGrp1COM = NULL;
    curGrp2COM = NULL;
  }
#endif

  // Create pointers to devices
  cudaNonbondedTablesList.resize(numDevices, NULL);
  cudaComputeNonbondedList.resize(numDevices, NULL);
#ifdef BONDED_CUDA
  computeBondedCUDAList.resize(numDevices, NULL);
#endif
  if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
  cudaPmeOneDevice = NULL;

  // Create CUDA non-bonded tables for all devices that are used for computation
  for (int i=0; i < deviceCUDA->getNumDevice(); i++) {
    int deviceID = deviceCUDA->getDeviceIDbyRank(i);
    cudaNonbondedTablesList[deviceID] = new CudaNonbondedTables(deviceID);
  }
}
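
// Illustrative sketch (not part of the NAMD sources): the allocate_host<T*>
// calls in initialize() reserve one host-visible pointer slot per on-node
// device. Assuming a hypothetical per-device setup routine that knows its
// device rank `i` and has already allocated a device buffer `d_ids`, the
// intended publish/consume pattern would look like:
//
//   pdata->h_soa_id[i] = d_ids;          // device i publishes its pointer
//   int* peerIds = pdata->h_soa_id[j];   // later, read peer j's buffer
//                                        // address for peer-to-peer access
//
// PatchData is a node-level (nodegroup) object, so every PE on the node sees
// the same slot arrays.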

//
// Update nonbonded tables
// Should be called only on rank 0 of each node
//
void ComputeCUDAMgr::update() {
  if ( CkMyRank() ) NAMD_bug("ComputeCUDAMgr::update() should be called only by rank 0");
  for (int i=0; i < deviceCUDA->getNumDevice(); i++) {
    int deviceID = deviceCUDA->getDeviceIDbyRank(i);
    // calls update function from CudaNonbondedTables
    cudaNonbondedTablesList[deviceID]->updateTables();
  }
}

ComputeCUDAMgr* ComputeCUDAMgr::getComputeCUDAMgr() {
  // Get pointer to ComputeCUDAMgr on this node
  CProxy_ComputeCUDAMgr computeCUDAMgrProxy = CkpvAccess(BOCclass_group).computeCUDAMgr;
  ComputeCUDAMgr* computeCUDAMgr = computeCUDAMgrProxy.ckLocalBranch();
  if (computeCUDAMgr == NULL)
    NAMD_bug("getComputeCUDAMgr, unable to locate local branch of BOC entry ComputeCUDAMgr");
  return computeCUDAMgr;
}
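
// Usage sketch (illustrative, not part of the NAMD sources): any PE on the
// node can reach the node-local manager through the static accessor above and
// then request the per-device compute objects for its own device, e.g.:
//
//   ComputeCUDAMgr* mgr = ComputeCUDAMgr::getComputeCUDAMgr();
//   CudaComputeNonbonded* nb = mgr->getCudaComputeNonbonded();
//
// The getters below abort via NAMD_bug() if the corresponding object has not
// been created yet, so the matching create*() call must run first.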

CudaPmeOneDevice* ComputeCUDAMgr::createCudaPmeOneDevice() {
  // initialize pmeGrid from simParams
  SimParameters *simParams = Node::Object()->simParameters;
  PmeGrid pmeGrid;
  pmeGrid.K1 = simParams->PMEGridSizeX;
  pmeGrid.K2 = simParams->PMEGridSizeY;
  pmeGrid.K3 = simParams->PMEGridSizeZ;
  pmeGrid.order = simParams->PMEInterpOrder;
  pmeGrid.dim2 = pmeGrid.K2;
  pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
  // override settings for PME pencils
  pmeGrid.xBlocks = 1;
  pmeGrid.yBlocks = 1;
  pmeGrid.zBlocks = 1;
  pmeGrid.block1 = pmeGrid.K1;
  pmeGrid.block2 = pmeGrid.K2;
  pmeGrid.block3 = pmeGrid.K3;
  // use shared deviceID class
  int deviceID = 0;
  int deviceIndex = 0;
#ifdef NODEGROUP_FORCE_REGISTER
  deviceID = deviceCUDA->getPmeDevice();
  deviceIndex = deviceCUDA->getPmeDeviceIndex();
#endif
  if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
  cudaPmeOneDevice = new CudaPmeOneDevice(pmeGrid, deviceID, deviceIndex);
  return cudaPmeOneDevice;
}
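
// Note (illustrative, not part of the NAMD sources): dim3 = 2*(K3/2 + 1) is
// the usual padding for a real-to-complex FFT along the z axis, which stores
// K3/2 + 1 complex coefficients per grid line. For example, with
// PMEGridSizeZ = 100 the allocated line length is 2*(100/2 + 1) = 102 reals,
// two more than the 100 physical grid points.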

CudaPmeOneDevice* ComputeCUDAMgr::getCudaPmeOneDevice() {
  return cudaPmeOneDevice;
}

//
// Creates CudaComputeNonbonded object
//
CudaComputeNonbonded* ComputeCUDAMgr::createCudaComputeNonbonded(ComputeID c) {
  int deviceID = deviceCUDA->getDeviceID();
  if (cudaComputeNonbondedList.at(deviceID) != NULL)
    NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded called twice");
  if (cudaNonbondedTablesList.at(deviceID) == NULL)
    NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded, non-bonded CUDA tables not created");
  //bool doStreaming = !deviceCUDA->getNoStreaming() && !Node::Object()->simParameters->GBISOn && !Node::Object()->simParameters->CUDASOAintegrate;
  bool doStreaming = !deviceCUDA->getNoStreaming() && !Node::Object()->simParameters->GBISOn;
  cudaComputeNonbondedList[deviceID] = new CudaComputeNonbonded(c, deviceID, *cudaNonbondedTablesList[deviceID], doStreaming);
  return cudaComputeNonbondedList[deviceID];
}

//
// Returns CudaComputeNonbonded for this Pe
//
CudaComputeNonbonded* ComputeCUDAMgr::getCudaComputeNonbonded() {
  // Get device ID for this Pe
  int deviceID = deviceCUDA->getDeviceID();
  CudaComputeNonbonded* p = cudaComputeNonbondedList[deviceID];
  if (p == NULL)
    NAMD_bug("ComputeCUDAMgr::getCudaComputeNonbonded(), device not created yet");
  return p;
}

#ifdef BONDED_CUDA
//
// Creates ComputeBondedCUDA object
//
ComputeBondedCUDA* ComputeCUDAMgr::createComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr) {
  int deviceID = deviceCUDA->getDeviceID();
  if (computeBondedCUDAList.at(deviceID) != NULL)
    NAMD_bug("ComputeCUDAMgr::createComputeBondedCUDA called twice");
  if (cudaNonbondedTablesList.at(deviceID) == NULL)
    NAMD_bug("ComputeCUDAMgr::createComputeBondedCUDA, non-bonded CUDA tables not created");
  computeBondedCUDAList[deviceID] = new ComputeBondedCUDA(c, computeMgr, deviceID, *cudaNonbondedTablesList[deviceID]);
  return computeBondedCUDAList[deviceID];
}

//
// Returns ComputeBondedCUDA for this Pe
//
ComputeBondedCUDA* ComputeCUDAMgr::getComputeBondedCUDA() {
  // Get device ID for this Pe
  int deviceID = deviceCUDA->getDeviceID();
  ComputeBondedCUDA* p = computeBondedCUDAList[deviceID];
  if (p == NULL)
    NAMD_bug("ComputeCUDAMgr::getComputeBondedCUDA(), device not created yet");
  return p;
}
#endif // BONDED_CUDA

std::shared_ptr<CudaGlobalMasterServer> ComputeCUDAMgr::getCudaGlobalMaster() {
  return cudaGlobalMasterObject;
}

std::shared_ptr<CudaGlobalMasterServer> ComputeCUDAMgr::createCudaGlobalMaster() {
  iout << iINFO << "Creating CUDAGlobalMaster on PE " << CkMyPe() << '\n' << endi;
  if (cudaGlobalMasterObject) {
    return cudaGlobalMasterObject;
  }
  const int deviceID = deviceCUDA->getGlobalDevice();
  SimParameters *simParams = Node::Object()->simParameters;
  cudaGlobalMasterObject = std::make_shared<CudaGlobalMasterServer>(deviceID, simParams->cudaGlobalProfilingFreq);
  return cudaGlobalMasterObject;
}
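
// Usage sketch (illustrative, not part of the NAMD sources), assuming `mgr`
// is the node-local manager obtained from ComputeCUDAMgr::getComputeCUDAMgr():
//
//   std::shared_ptr<CudaGlobalMasterServer> server = mgr->createCudaGlobalMaster();
//   std::shared_ptr<CudaGlobalMasterServer> same   = mgr->getCudaGlobalMaster();
//
// createCudaGlobalMaster() constructs the server on the first call and simply
// returns the existing shared instance on later calls, while
// getCudaGlobalMaster() may return an empty shared_ptr if nothing has been
// created yet.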

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)

#include "ComputeCUDAMgr.def.h"