#include "ComputePmeCUDAMgr.decl.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;

void CudaPmePencilXYZ::initializeDevice(InitDeviceMsg *msg) {
  // ...
  int deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
  // Set up the per-grid FFT and k-space computes
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid)) {
      // ... (FFT and k-space compute objects created here)
      energyReady[iGrid] = 0;
    } else {
      fftComputes[iGrid] = NULL;
      pmeKSpaceComputes[iGrid] = NULL;
      energyReady[iGrid] = -1;
    }
  }
}
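//
// All pencil classes below keep per-grid arrays sized NUM_GRID_MAX (one slot
// per PME grid; multiple grids are used e.g. by alchemical runs). A disabled
// grid is marked by NULL compute pointers and energyReady[iGrid] = -1, which
// is why every loop in this file guards on the pointer before touching a grid.
//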
void CudaPmePencilXYZ::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeKSpaceComputes[iGrid] != NULL) {
      // ... (per-grid energy/virial callback elided)
    }
  }
}

void CudaPmePencilXYZ::energyAndVirialDone(unsigned int iGrid) {
  submitReductions(iGrid);
  // ...
}

CudaPmePencilXY::~CudaPmePencilXY() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}
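// Each pencil type creates a cudaEvent (cudaEventDisableTiming) on its stream
// and records it after initiating device-to-device copies; eventCreated guards
// the destroy so a pencil torn down before initializeDevice() never destroys
// an uncreated event. The same pattern repeats in every destructor below.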
void CudaPmePencilXY::initializeDevice(InitDeviceMsg *msg) {
  // ...
  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
  // Set up the per-grid FFT and transpose computes
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid)) {
      // ... (FFT and transpose compute objects created here)
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
    }
  }
  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false));
  numDeviceBuffers = 0;
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  for (int x = 0; x < pmeGrid.xBlocks; x++) {
    int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
    if (CkNodeOf(pe) == CkMyNode()) {
      // ...
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(x, 0);
      // Check for peer-to-peer access between the two devices
      int canAccessPeer = 0;
      if (deviceID != deviceID0) {
        // ...
        cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
        // ...
        if (canAccessPeer) {
          unsigned int flags = 0;
          cudaCheck(cudaDeviceEnablePeerAccess(deviceID0, flags));
        }
      }
      numDeviceBuffers++;
      deviceBuffers[x] = DeviceBuffer(deviceID0, canAccessPeer);
      pmePencilZ(x,0,0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
    }
  }
}
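// The loop above registers, for every Z pencil co-resident on this node, the
// device that owns it. When the devices differ but cudaDeviceCanAccessPeer
// reports access, peer access is enabled once so the transpose can later write
// blocks directly into the destination device's buffer; the bool passed to
// getDeviceBuffer() tells the Z pencil whether such direct writes are possible
// (same device or peer) or whether the staging path is needed.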
void CudaPmePencilXY::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
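// start() only queues recvDeviceBuffers(cb); the SDAG code in the .ci file
// presumably holds the callback until every pending getDeviceBuffer() reply
// has arrived, so the pencil does not run before its buffers are wired up.
// The X, Y, and Z pencils use the identical pattern.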
void CudaPmePencilXY::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.xBlocks, (float2*)0);
    for (int x = 0; x < pmeGrid.xBlocks; x++) {
      if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
        if (deviceBuffers[x].deviceID == deviceID || deviceBuffers[x].isPeerDevice) {
          // Same device or peer-accessible: the transpose can write directly
          dataPtrsGrid[iGrid][x] = deviceBuffers[x].dataGrid[iGrid];
        }
      }
    }
    // ...
    if (pmeTransposes[iGrid] != NULL) {
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
    }
  }
}
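// Only blocks whose destination buffer is directly writable get a non-NULL
// pointer here; for the rest the transpose falls back to its own staging
// buffers and the data travels by message instead (see forwardDone() below).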
std::array<float2*, NUM_GRID_MAX> CudaPmePencilXY::getData(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
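// With P2P_ENABLE_3D unset, a consumer on the same device reads straight out
// of the FFT buffer at the block offset i0 computed by getBlockDim(), while a
// consumer on another device is handed a per-block staging buffer. When 3D
// peer copies are enabled, the direct pointer is returned in both cases.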
void CudaPmePencilXY::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
}
void CudaPmePencilXY::forwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose on-device into ZXY order for the Z pencils
      pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataDst());
      // ...
      if (numDeviceBuffers > 0) {
        // Blocks on another, non-peer device go through a staged device copy
        for (int x = 0; x < pmeGrid.xBlocks; x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            if (deviceBuffers[x].deviceID != deviceID && !deviceBuffers[x].isPeerDevice) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(
                x, deviceBuffers[x].deviceID, /* ... */);
            }
          }
        }
        // ...
        cudaCheck(cudaEventRecord(event, stream));
        // Notify the Z pencils whose blocks were delivered through device memory
        for (int x = 0; x < pmeGrid.xBlocks; x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            // ...
            msg->y = thisIndex.y;
            msg->z = thisIndex.z;
            // ...
            pmePencilZ(x,0,0).recvBlock(msg);
          }
        }
      }
      // Remaining blocks travel with their data inside the message
      for (int x = 0; x < pmeGrid.xBlocks; x++) {
        if (deviceBuffers[x].dataGrid[iGrid] == NULL) {
          // ...
          msg->y = thisIndex.y;
          msg->z = thisIndex.z;
          // ...
          pmePencilZ(x,0,0).recvBlock(msg);
        }
      }
    }
  }
}
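// Two delivery paths: blocks with a registered device buffer are copied
// device-to-device (or written directly by the transpose), the event records
// completion, and the PmeBlockMsg is sent as a pure notification with
// dataSize == 0 (see recvDataFromZ() below); blocks without one carry their
// data in the message itself.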
void CudaPmePencilXY::recvDataFromZ(PmeBlockMsg *msg) {
  // ...
  if (msg->dataSize == 0) {
    // Data arrived via device memory; wait on the sender's recorded event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffers[msg->x].dataGrid[msg->grid] != NULL && deviceBuffers[msg->x].deviceID != deviceID && !deviceBuffers[msg->x].isPeerDevice) {
      // Data sits in a local staging buffer; copy it into the FFT buffer
      ((CudaPmeTranspose *)(pmeTransposes[msg->grid]))->copyDataDeviceToDevice(msg->x, (float2 *)fftComputes[msg->grid]->getDataDst());
    }
#endif
  }
  // ...
}
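// cudaStreamWaitEvent() makes this pencil's stream wait on the event the
// sending pencil recorded on its own device, so the cross-device transfer is
// ordered without any host-side synchronization.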
CudaPmePencilX::~CudaPmePencilX() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}

void CudaPmePencilX::initializeDevice(InitDeviceMsg *msg) {
  // ...
  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
  // Set up the per-grid FFT and transpose computes
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid)) {
      // ... (FFT and transpose compute objects created here)
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
    }
  }
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false));
  numDeviceBuffers = 0;

  for (int x = 0; x < pmeGrid.xBlocks; x++) {
    int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,thisIndex.z));
    if (CkNodeOf(pe) == CkMyNode()) {
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(x, thisIndex.z);
      // ...
      pmePencilY(x,0,thisIndex.z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
    }
  }
}
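// Unlike the XY slab above, the X pencil does not probe peer access: a Y
// pencil's buffer is used directly only when it lives on the same device
// (deviceID0 == deviceID); everything else goes through staging and messages.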
void CudaPmePencilX::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
void CudaPmePencilX::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.xBlocks, (float2*)0);
    for (int x = 0; x < pmeGrid.xBlocks; x++) {
      if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
        if (deviceBuffers[x].deviceID == deviceID) {
          dataPtrsGrid[iGrid][x] = deviceBuffers[x].dataGrid[iGrid];
        }
      }
    }
    if (pmeTransposes[iGrid] != NULL) {
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
    }
  }
}
std::array<float2*, NUM_GRID_MAX> CudaPmePencilX::getData(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
void CudaPmePencilX::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
}
void CudaPmePencilX::forwardDone() {
  if (pmeTransposes[0] == NULL)
    NAMD_bug("CudaPmePencilX::forwardDone, pmeTranspose not initialized");
  if (blockSizes.size() == 0)
    NAMD_bug("CudaPmePencilX::forwardDone, blockSizes not initialized");
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose on-device into YZX order for the Y pencils
      pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataDst());
      // ...
      if (numDeviceBuffers > 0) {
        // Blocks whose Y pencil lives on another device: staged device copy
        for (int x = 0; x < pmeGrid.xBlocks; x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            if (deviceBuffers[x].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(
                x, deviceBuffers[x].deviceID, /* ... */);
            }
          }
        }
        // ...
        cudaCheck(cudaEventRecord(event, stream));
        // Notify the Y pencils whose blocks were delivered through device memory
        for (int x = 0; x < pmeGrid.xBlocks; x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            // ...
            msg->y = thisIndex.y;
            msg->z = thisIndex.z;
            // ...
            pmePencilY(x,0,thisIndex.z).recvBlock(msg);
          }
        }
      }
      // Remaining blocks travel with their data inside the message
      for (int x = 0; x < pmeGrid.xBlocks; x++) {
        if (deviceBuffers[x].dataGrid[iGrid] == NULL) {
          // ...
          msg->y = thisIndex.y;
          msg->z = thisIndex.z;
          // ...
          pmePencilY(x,0,thisIndex.z).recvBlock(msg);
        }
      }
    }
  }
}
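// The NAMD_bug() checks above are sanity guards: forwardDone() must not run
// before initialize()/initializeDevice() have set up the transposes and the
// per-block sizes, since both are needed to ship blocks to the Y pencils.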
void CudaPmePencilX::recvDataFromY(PmeBlockMsg *msg) {
  // ...
  if (msg->dataSize == 0) {
    // Data arrived via device memory; wait on the sender's recorded event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffers[msg->x].dataGrid[msg->grid] != NULL && deviceBuffers[msg->x].deviceID != deviceID) {
      // Data sits in a local staging buffer; copy it into the FFT buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->x, (float2 *)fftComputes[msg->grid]->getDataDst());
    }
#endif
  }
  // ...
}
CudaPmePencilY::~CudaPmePencilY() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}

void CudaPmePencilY::initializeDevice(InitDeviceMsg2 *msg) {
  CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid)) {
      // ... (FFT and transpose compute objects created here)
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
    }
  }
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  deviceBuffersZ.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false));
  deviceBuffersX.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false));
  numDeviceBuffersZ = 0;
  numDeviceBuffersX = 0;

  for (int y = 0; y < pmeGrid.yBlocks; y++) {
    int pe;
    pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x, y, 0));
    if (CkNodeOf(pe) == CkMyNode()) {
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(thisIndex.x, y);
      // ...
      pmePencilZ(thisIndex.x, y, 0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID), thisProxy);
    }
    pe = xMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0, y, thisIndex.z));
    if (CkNodeOf(pe) == CkMyNode()) {
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(y, thisIndex.z);
      // ...
      pmePencilX(0, y, thisIndex.z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID), thisProxy);
    }
  }
}
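// The Y pencil sits in the middle of the transpose chain, so it tracks two
// buffer sets: deviceBuffersZ for the forward direction (toward the Z pencils)
// and deviceBuffersX for the backward direction (toward the X pencils), each
// with its own counter.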
void CudaPmePencilY::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
void CudaPmePencilY::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsYZXGrid;
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsZXYGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsYZXGrid[iGrid] = std::vector<float2*>(pmeGrid.yBlocks, (float2*)0);
    dataPtrsZXYGrid[iGrid] = std::vector<float2*>(pmeGrid.yBlocks, (float2*)0);
    for (int y = 0; y < pmeGrid.yBlocks; y++) {
      if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
        if (deviceBuffersZ[y].deviceID == deviceID) {
          dataPtrsYZXGrid[iGrid][y] = deviceBuffersZ[y].dataGrid[iGrid];
        }
      }
      if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
        if (deviceBuffersX[y].deviceID == deviceID) {
          dataPtrsZXYGrid[iGrid][y] = deviceBuffersX[y].dataGrid[iGrid];
        }
      }
    }
    if (pmeTransposes[iGrid] != NULL) {
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsYZXGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsZXYGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
    }
  }
}
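// Forward (YZX) pointers target the FFT destination buffer, which holds the
// data after the forward transform; backward (ZXY) pointers target the source
// buffer, which the backward path writes.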
std::array<float2*, NUM_GRID_MAX> CudaPmePencilY::getDataForX(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
std::array<float2*, NUM_GRID_MAX> CudaPmePencilY::getDataForZ(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
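// The asymmetry between getDataForX() and getDataForZ() mirrors
// setDeviceBuffers(): blocks headed back to X are read from getDataSrc(),
// blocks headed forward to Z from getDataDst().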
void CudaPmePencilY::backwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose on-device into ZXY order for the X pencils
      pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataSrc());
      // ...
      if (numDeviceBuffersX > 0) {
        for (int y = 0; y < pmeGrid.yBlocks; y++) {
          if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
            if (deviceBuffersX[y].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(
                y, deviceBuffersX[y].deviceID, /* ... */);
            }
          }
        }
        // ...
        cudaCheck(cudaEventRecord(event, stream));
        // Notify the X pencils whose blocks were delivered through device memory
        for (int y = 0; y < pmeGrid.yBlocks; y++) {
          if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
            // ...
            msg->x = thisIndex.x;
            // ...
            msg->z = thisIndex.z;
            // ...
            pmePencilX(0,y,thisIndex.z).recvBlock(msg);
          }
        }
      }
      // Remaining blocks travel with their data inside the message
      for (int y = 0; y < pmeGrid.yBlocks; y++) {
        if (deviceBuffersX[y].dataGrid[iGrid] == NULL) {
          // ...
          msg->x = thisIndex.x;
          // ...
          msg->z = thisIndex.z;
          // ...
          pmePencilX(0,y,thisIndex.z).recvBlock(msg);
        }
      }
    }
  }
}
void CudaPmePencilY::forwardDone() {
  if (pmeTransposes[0] == NULL)
    NAMD_bug("CudaPmePencilY::forwardDone, pmeTranspose not initialized");
  if (blockSizes.size() == 0)
    NAMD_bug("CudaPmePencilY::forwardDone, blockSizes not initialized");
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose on-device into YZX order for the Z pencils
      pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataDst());
      // ...
      if (numDeviceBuffersZ > 0) {
        for (int y = 0; y < pmeGrid.yBlocks; y++) {
          if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
            if (deviceBuffersZ[y].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(
                y, deviceBuffersZ[y].deviceID, /* ... */);
            }
          }
        }
        // ...
        cudaCheck(cudaEventRecord(event, stream));
        // Notify the Z pencils whose blocks were delivered through device memory
        for (int y = 0; y < pmeGrid.yBlocks; y++) {
          if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
            // ...
            msg->x = thisIndex.x;
            // ...
            msg->z = thisIndex.z;
            // ...
            pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
          }
        }
      }
      // Remaining blocks travel with their data inside the message
      for (int y = 0; y < pmeGrid.yBlocks; y++) {
        if (deviceBuffersZ[y].dataGrid[iGrid] == NULL) {
          // ...
          msg->x = thisIndex.x;
          // ...
          msg->z = thisIndex.z;
          // ...
          pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
        }
      }
    }
  }
}
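// forwardDone()/backwardDone() are mirror images: the same three-step pattern
// (peer copies, event record, notification messages, then host-staged messages
// for the remaining blocks), pointed at the Z pencils on the way forward and
// at the X pencils on the way back.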
void CudaPmePencilY::recvDataFromX(PmeBlockMsg *msg) {
  // ...
  if (msg->dataSize == 0) {
    // Data arrived via device memory; wait on the sender's recorded event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersX[msg->y].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffersX[msg->y].dataGrid[msg->grid] != NULL && deviceBuffersX[msg->y].deviceID != deviceID) {
      // Copy from the local staging buffer into the FFT source buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->y, (float2 *)fftComputes[msg->grid]->getDataSrc());
    }
#endif
  }
  // ...
}
void CudaPmePencilY::recvDataFromZ(PmeBlockMsg *msg) {
  // ...
  if (msg->dataSize == 0) {
    // Data arrived via device memory; wait on the sender's recorded event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersZ[msg->y].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffersZ[msg->y].dataGrid[msg->grid] != NULL && deviceBuffersZ[msg->y].deviceID != deviceID) {
      // Copy from the local staging buffer into the FFT destination buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->y, (float2 *)fftComputes[msg->grid]->getDataDst());
    }
#endif
  }
  // ...
}
CudaPmePencilZ::~CudaPmePencilZ() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}

void CudaPmePencilZ::initializeDevice(InitDeviceMsg2 *msg) {
  CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
  // ...
  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid)) {
      // ... (FFT, transpose, and k-space compute objects created here)
      energyReady[iGrid] = 0;
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
      pmeKSpaceComputes[iGrid] = NULL;
      energyReady[iGrid] = -1;
    }
  }
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  deviceBuffers.resize(pmeGrid.zBlocks, DeviceBuffer(-1, false));
  numDeviceBuffers = 0;

  if (/* XY-slab decomposition; condition elided */) {
    for (int z = 0; z < pmeGrid.zBlocks; z++) {
      int pe = xyMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,0,z));
      if (CkNodeOf(pe) == CkMyNode()) {
        int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(0, z);
        // Check for peer-to-peer access between the two devices
        int canAccessPeer = 0;
        if (deviceID != deviceID0) {
          // ...
          cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
          // ...
        }
        numDeviceBuffers++;
        deviceBuffers[z] = DeviceBuffer(deviceID0, canAccessPeer);
        pmePencilXY(0,0,z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
      }
    }
  } else {
    for (int z = 0; z < pmeGrid.zBlocks; z++) {
      int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,0,z));
      if (CkNodeOf(pe) == CkMyNode()) {
        int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(thisIndex.x, z);
        // ...
        pmePencilY(thisIndex.x,0,z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
      }
    }
  }
}
void CudaPmePencilZ::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
void CudaPmePencilZ::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.zBlocks, (float2*)0);
    for (int z = 0; z < pmeGrid.zBlocks; z++) {
      if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
        if (deviceBuffers[z].deviceID == deviceID || deviceBuffers[z].isPeerDevice) {
          dataPtrsGrid[iGrid][z] = deviceBuffers[z].dataGrid[iGrid];
        }
      }
    }
    if (/* XY-slab decomposition; condition elided */) {
      if (pmeTransposes[iGrid] != NULL) {
        ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
      }
    } else {
      if (pmeTransposes[iGrid] != NULL) {
        ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
      }
    }
  }
}
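// Both pointer orders aim at getDataSrc(); backwardDone() below likewise
// transposes out of the source buffer, which appears to be where the backward
// FFT leaves its result. Which order is active (YZX toward the XY slabs, ZXY
// toward the Y pencils) follows the decomposition mode chosen at startup.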
std::array<float2*, NUM_GRID_MAX> CudaPmePencilZ::getData(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
void CudaPmePencilZ::backwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      if (/* XY-slab decomposition; condition elided */) {
        pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataSrc());
      } else {
        pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataSrc());
      }
      if (/* XY-slab decomposition; condition elided */) {
        if (numDeviceBuffers > 0) {
          for (int z = 0; z < pmeGrid.zBlocks; z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              if (deviceBuffers[z].deviceID != deviceID && !deviceBuffers[z].isPeerDevice) {
                ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(
                  z, deviceBuffers[z].deviceID, /* ... */);
              }
            }
          }
          // ...
          cudaCheck(cudaEventRecord(event, stream));
          // Notify the XY slabs whose blocks were delivered through device memory
          for (int z = 0; z < pmeGrid.zBlocks; z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              // ...
              msg->x = thisIndex.x;
              msg->y = thisIndex.y;
              // ...
              pmePencilXY(0,0,z).recvBlock(msg);
            }
          }
        }
        // Remaining blocks travel with their data inside the message
        for (int z = 0; z < pmeGrid.zBlocks; z++) {
          if (deviceBuffers[z].dataGrid[iGrid] == NULL) {
            // ...
            msg->x = thisIndex.x;
            msg->y = thisIndex.y;
            // ...
            pmePencilXY(0,0,z).recvBlock(msg);
          }
        }
      } else {
        if (numDeviceBuffers > 0) {
          for (int z = 0; z < pmeGrid.zBlocks; z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              if (deviceBuffers[z].deviceID != deviceID) {
                ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(
                  z, deviceBuffers[z].deviceID, /* ... */);
              }
            }
          }
          // ...
          cudaCheck(cudaEventRecord(event, stream));
          // Notify the Y pencils whose blocks were delivered through device memory
          for (int z = 0; z < pmeGrid.zBlocks; z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              // ...
              msg->x = thisIndex.x;
              msg->y = thisIndex.y;
              // ...
              pmePencilY(thisIndex.x,0,z).recvBlock(msg);
            }
          }
        }
        // Remaining blocks travel with their data inside the message
        for (int z = 0; z < pmeGrid.zBlocks; z++) {
          if (deviceBuffers[z].dataGrid[iGrid] == NULL) {
            // ...
            msg->x = thisIndex.x;
            msg->y = thisIndex.y;
            // ...
            pmePencilY(thisIndex.x,0,z).recvBlock(msg);
          }
        }
      }
    }
  }
}
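// The two branches select the return route for the backward pass: YZX-ordered
// blocks go back to the XY slabs (with the peer-device exemption on the copy),
// ZXY-ordered blocks to the Y pencils.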
void CudaPmePencilZ::energyAndVirialDone(unsigned int iGrid) {
  submitReductions(iGrid);
  // ...
}
void CudaPmePencilZ::recvDataFromY(PmeBlockMsg *msg) {
  // ...
  if (msg->dataSize == 0) {
    // Data arrived via device memory; wait on the sender's recorded event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->z].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffers[msg->z].dataGrid[0] != NULL && deviceBuffers[msg->z].deviceID != deviceID && !deviceBuffers[msg->z].isPeerDevice) {
      // Copy from the local staging buffer into the FFT source buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->z, (float2 *)fftComputes[msg->grid]->getDataSrc());
    }
#endif
  }
  // ...
}
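// NOTE: the presence test above uses dataGrid[0] (does this pencil have a
// device buffer at all), while the copy itself is per-grid via msg->grid.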
#endif // NAMD_CUDA || NAMD_HIP

#include "CudaPmeSolver.def.h"