NAMD
Public Member Functions | List of all members
CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:
PmeTranspose

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 
 ~CudaPmeTranspose ()
 
void setDataPtrsYZX (std::vector< float2 *> &dataPtrsNew, float2 *data)
 
void setDataPtrsZXY (std::vector< float2 *> &dataPtrsNew, float2 *data)
 
void transposeXYZtoYZX (const float2 *data)
 
void transposeXYZtoZXY (const float2 *data)
 
void waitStreamSynchronize ()
 
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
 
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
 
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
 
float2 * getBuffer (const int iblock)
 
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
- Public Member Functions inherited from PmeTranspose
 PmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
 
virtual ~PmeTranspose ()
 

Additional Inherited Members

- Protected Attributes inherited from PmeTranspose
PmeGrid pmeGrid
 
const int permutation
 
const int jblock
 
const int kblock
 
int isize
 
int jsize
 
int ksize
 
int dataSize
 
int nblock
 
std::vector< int > pos
 

Detailed Description

Definition at line 158 of file CudaPmeSolverUtil.h.

Constructor & Destructor Documentation

◆ CudaPmeTranspose()

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 845 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

846  :
847  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
848  cudaCheck(cudaSetDevice(deviceID));
849 
850  allocate_device<float2>(&d_data, dataSize);
851 #ifndef P2P_ENABLE_3D
852  allocate_device<float2>(&d_buffer, dataSize);
853 #endif
854 
855  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
856  dataPtrsYZX.resize(nblock, NULL);
857  dataPtrsZXY.resize(nblock, NULL);
858 
859  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
860  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
861 }
const int permutation
PmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
PmeGrid pmeGrid
const int jblock
const int kblock
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ ~CudaPmeTranspose()

CudaPmeTranspose::~CudaPmeTranspose ( )

Definition at line 863 of file CudaPmeSolverUtil.C.

References cudaCheck.

863  {
864  cudaCheck(cudaSetDevice(deviceID));
865  deallocate_device<float2>(&d_data);
866 #ifndef P2P_ENABLE_3D
867  deallocate_device<float2>(&d_buffer);
868 #endif
869  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
870  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
871 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

Member Function Documentation

◆ copyDataDeviceToDevice()

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 *  data_out 
)

Definition at line 1207 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1207  {
1208  cudaCheck(cudaSetDevice(deviceID));
1209 
1210  if (iblock >= nblock)
1211  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1212 
1213  // Determine block size = how much we're copying
1214  int i0, i1, j0, j1, k0, k1;
1215  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1216  int ni = i1-i0+1;
1217  int nj = j1-j0+1;
1218  int nk = k1-k0+1;
1219 
1220  float2* data_in = d_buffer + i0*nj*nk;
1221 
1222  copy3D_DtoD<float2>(data_in, data_out,
1223  0, 0, 0,
1224  ni, nj,
1225  i0, 0, 0,
1226  isize, jsize,
1227  ni, nj, nk, stream);
1228 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:196
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ copyDataDeviceToHost()

void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 *  h_data,
const int  h_dataSize 
)

Definition at line 1161 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

1161  {
1162  cudaCheck(cudaSetDevice(deviceID));
1163 
1164  if (iblock >= nblock)
1165  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1166 
1167  int x0 = pos[iblock];
1168  int nx = pos[iblock+1] - x0;
1169 
1170  int copySize = jsize*ksize*nx;
1171  int copyStart = jsize*ksize*x0;
1172 
1173  if (copyStart + copySize > dataSize)
1174  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1175 
1176  if (copySize > h_dataSize)
1177  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1178 
1179  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1180 }
std::vector< int > pos
void NAMD_bug(const char *err_msg)
Definition: common.C:196
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ copyDataHostToDevice()

void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 *  data_in,
float2 *  data_out 
)

Definition at line 1182 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1182  {
1183  cudaCheck(cudaSetDevice(deviceID));
1184 
1185  if (iblock >= nblock)
1186  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1187 
1188  // Determine block size = how much we're copying
1189  int i0, i1, j0, j1, k0, k1;
1190  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1191  int ni = i1-i0+1;
1192  int nj = j1-j0+1;
1193  int nk = k1-k0+1;
1194 
1195  copy3D_HtoD<float2>(data_in, data_out,
1196  0, 0, 0,
1197  ni, nj,
1198  i0, 0, 0,
1199  isize, jsize,
1200  ni, nj, nk, stream);
1201 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:196
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ copyDataToPeerDeviceYZX()

void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 *  data_out 
)

Definition at line 1248 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1249  {
1250 
1251  int iblock_out = jblock;
1252  int jblock_out = kblock;
1253  int kblock_out = iblock;
1254 
1255  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1256 }
const int jblock
const int kblock

◆ copyDataToPeerDeviceZXY()

void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 *  data_out 
)

Definition at line 1258 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1259  {
1260 
1261  int iblock_out = kblock;
1262  int jblock_out = iblock;
1263  int kblock_out = jblock;
1264 
1265  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1266 }
const int jblock
const int kblock

◆ getBuffer()

float2 * CudaPmeTranspose::getBuffer ( const int  iblock)

Definition at line 1233 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1233  {
1234  if (iblock >= nblock)
1235  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1236 
1237  // Determine block size = how much we're copying
1238  int i0, i1, j0, j1, k0, k1;
1239  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1240  int ni = i1-i0+1;
1241  int nj = j1-j0+1;
1242  int nk = k1-k0+1;
1243 
1244  return d_buffer + i0*nj*nk;
1245 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:196
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89

◆ setDataPtrsYZX()

void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 *> &  dataPtrsNew,
float2 *  data 
)

Definition at line 876 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

876  {
877  if (dataPtrsYZX.size() != dataPtrsNew.size())
878  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
879  for (int iblock=0;iblock < nblock;iblock++) {
880  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
881  }
882  // Build batched data structures
883  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
884 
885  for (int iperm=0;iperm < 3;iperm++) {
886  int isize_out;
887  if (iperm == 0) {
888  // Perm_Z_cX_Y:
889  // ZXY -> XYZ
890  isize_out = pmeGrid.K1/2+1;
891  } else if (iperm == 1) {
892  // Perm_cX_Y_Z:
893  // XYZ -> YZX
894  isize_out = pmeGrid.K2;
895  } else {
896  // Perm_Y_Z_cX:
897  // YZX -> ZXY
898  isize_out = pmeGrid.K3;
899  }
900 
901  int max_nx = 0;
902  for (int iblock=0;iblock < nblock;iblock++) {
903 
904  int x0 = pos[iblock];
905  int nx = pos[iblock+1] - x0;
906  max_nx = std::max(max_nx, nx);
907 
908  int width_out;
909  float2* data_out;
910  if (dataPtrsYZX[iblock] == NULL) {
911  // Local transpose, use internal buffer
912  data_out = d_data + jsize*ksize*x0;
913  width_out = jsize;
914  } else {
915  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
916  data_out = dataPtrsYZX[iblock];
917  width_out = isize_out;
918  }
919 
920  TransposeBatch<float2> batch;
921  batch.nx = nx;
922  batch.ysize_out = width_out;
923  batch.zsize_out = ksize;
924  batch.data_in = data+x0;
925  batch.data_out = data_out;
926 
927  h_batchesYZX[iperm*nblock + iblock] = batch;
928 
929  // transpose_xyz_yzx(
930  // nx, jsize, ksize,
931  // isize, jsize,
932  // width_out, ksize,
933  // data+x0, data_out, stream);
934  }
935 
936  max_nx_YZX[iperm] = max_nx;
937  }
938 
939  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
940  cudaCheck(cudaStreamSynchronize(stream));
941  delete [] h_batchesYZX;
942 }
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
std::vector< int > pos
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:196
int K3
Definition: PmeBase.h:21
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ setDataPtrsZXY()

void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 *> &  dataPtrsNew,
float2 *  data 
)

Definition at line 947 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

947  {
948  if (dataPtrsZXY.size() != dataPtrsNew.size())
949  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
950  for (int iblock=0;iblock < nblock;iblock++) {
951  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
952  }
953 
954  // Build batched data structures
955  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
956 
957  for (int iperm=0;iperm < 3;iperm++) {
958  int isize_out;
959  if (iperm == 0) {
960  // Perm_cX_Y_Z:
961  // XYZ -> ZXY
962  isize_out = pmeGrid.K3;
963  } else if (iperm == 1) {
964  // Perm_Z_cX_Y:
965  // ZXY -> YZX
966  isize_out = pmeGrid.K2;
967  } else {
968  // Perm_Y_Z_cX:
969  // YZX -> XYZ
970  isize_out = pmeGrid.K1/2+1;
971  }
972 
973  int max_nx = 0;
974  for (int iblock=0;iblock < nblock;iblock++) {
975 
976  int x0 = pos[iblock];
977  int nx = pos[iblock+1] - x0;
978  max_nx = std::max(max_nx, nx);
979 
980  int width_out;
981  float2* data_out;
982  if (dataPtrsZXY[iblock] == NULL) {
983  // Local transpose, use internal buffer
984  data_out = d_data + jsize*ksize*x0;
985  width_out = ksize;
986  } else {
987  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
988  data_out = dataPtrsZXY[iblock];
989  width_out = isize_out;
990  }
991 
992  TransposeBatch<float2> batch;
993  batch.nx = nx;
994  batch.zsize_out = width_out;
995  batch.xsize_out = nx;
996  batch.data_in = data+x0;
997  batch.data_out = data_out;
998  h_batchesZXY[iperm*nblock + iblock] = batch;
999  }
1000 
1001  max_nx_ZXY[iperm] = max_nx;
1002  }
1003 
1004  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
1005  cudaCheck(cudaStreamSynchronize(stream));
1006  delete [] h_batchesZXY;
1007 }
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
std::vector< int > pos
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:196
int K3
Definition: PmeBase.h:21
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ transposeXYZtoYZX()

void CudaPmeTranspose::transposeXYZtoYZX ( const float2 *  data)
virtual

Implements PmeTranspose.

Definition at line 1009 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

1009  {
1010  cudaCheck(cudaSetDevice(deviceID));
1011 
1012  int iperm;
1013  switch(permutation) {
1014  case Perm_Z_cX_Y:
1015  // ZXY -> XYZ
1016  iperm = 0;
1017  break;
1018  case Perm_cX_Y_Z:
1019  // XYZ -> YZX
1020  iperm = 1;
1021  break;
1022  case Perm_Y_Z_cX:
1023  // YZX -> ZXY
1024  iperm = 2;
1025  break;
1026  default:
1027  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1028  break;
1029  }
1030 
1031  batchTranspose_xyz_yzx(
1032  nblock, batchesYZX + iperm*nblock,
1033  max_nx_YZX[iperm], jsize, ksize,
1034  isize, jsize, stream);
1035 
1036 
1037 /*
1038  int isize_out;
1039  switch(permutation) {
1040  case Perm_Z_cX_Y:
1041  // ZXY -> XYZ
1042  isize_out = pmeGrid.K1/2+1;
1043  break;
1044  case Perm_cX_Y_Z:
1045  // XYZ -> YZX
1046  isize_out = pmeGrid.K2;
1047  break;
1048  case Perm_Y_Z_cX:
1049  // YZX -> ZXY
1050  isize_out = pmeGrid.K3;
1051  break;
1052  default:
1053  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1054  break;
1055  }
1056 
1057  for (int iblock=0;iblock < nblock;iblock++) {
1058 
1059  int x0 = pos[iblock];
1060  int nx = pos[iblock+1] - x0;
1061 
1062  int width_out;
1063  float2* data_out;
1064  if (dataPtrsYZX[iblock] == NULL) {
1065  // Local transpose, use internal buffer
1066  data_out = d_data + jsize*ksize*x0;
1067  width_out = jsize;
1068  } else {
1069  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1070  data_out = dataPtrsYZX[iblock];
1071  width_out = isize_out;
1072  }
1073 
1074  transpose_xyz_yzx(
1075  nx, jsize, ksize,
1076  isize, jsize,
1077  width_out, ksize,
1078  data+x0, data_out, stream);
1079  }
1080 */
1081 }
const int permutation
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void NAMD_bug(const char *err_msg)
Definition: common.C:196
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ transposeXYZtoZXY()

void CudaPmeTranspose::transposeXYZtoZXY ( const float2 *  data)
virtual

Implements PmeTranspose.

Definition at line 1083 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

1083  {
1084  cudaCheck(cudaSetDevice(deviceID));
1085 
1086  int iperm;
1087  switch(permutation) {
1088  case Perm_cX_Y_Z:
1089  // XYZ -> ZXY
1090  iperm = 0;
1091  break;
1092  case Perm_Z_cX_Y:
1093  // ZXY -> YZX
1094  iperm = 1;
1095  break;
1096  case Perm_Y_Z_cX:
1097  // YZX -> XYZ
1098  iperm = 2;
1099  break;
1100  default:
1101  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1102  break;
1103  }
1104 
1105  batchTranspose_xyz_zxy(
1106  nblock, batchesZXY + iperm*nblock,
1107  max_nx_ZXY[iperm], jsize, ksize,
1108  isize, jsize, stream);
1109 
1110 /*
1111  int isize_out;
1112  switch(permutation) {
1113  case Perm_cX_Y_Z:
1114  // XYZ -> ZXY
1115  isize_out = pmeGrid.K3;
1116  break;
1117  case Perm_Z_cX_Y:
1118  // ZXY -> YZX
1119  isize_out = pmeGrid.K2;
1120  break;
1121  case Perm_Y_Z_cX:
1122  // YZX -> XYZ
1123  isize_out = pmeGrid.K1/2+1;
1124  break;
1125  default:
1126  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1127  break;
1128  }
1129 
1130  for (int iblock=0;iblock < nblock;iblock++) {
1131 
1132  int x0 = pos[iblock];
1133  int nx = pos[iblock+1] - x0;
1134 
1135  int width_out;
1136  float2* data_out;
1137  if (dataPtrsZXY[iblock] == NULL) {
1138  // Local transpose, use internal buffer
1139  data_out = d_data + jsize*ksize*x0;
1140  width_out = ksize;
1141  } else {
1143  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1143  data_out = dataPtrsZXY[iblock];
1144  width_out = isize_out;
1145  }
1146 
1147  transpose_xyz_zxy(
1148  nx, jsize, ksize,
1149  isize, jsize,
1150  width_out, nx,
1151  data+x0, data_out, stream);
1152  }
1153 */
1154 }
const int permutation
void NAMD_bug(const char *err_msg)
Definition: common.C:196
#define cudaCheck(stmt)
Definition: CudaUtils.h:242
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)

◆ waitStreamSynchronize()

void CudaPmeTranspose::waitStreamSynchronize ( )

Definition at line 1156 of file CudaPmeSolverUtil.C.

References cudaCheck.

1156  {
1157  cudaCheck(cudaSetDevice(deviceID));
1158  cudaCheck(cudaStreamSynchronize(stream));
1159 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

The documentation for this class was generated from the following files: