4 #if __CUDACC_VER_MAJOR__ >= 11
     7 #include <namd_cub/cub.cuh>
    12 #ifdef NAMD_HIP //NAMD_HIP
    13 #include <hip/hip_runtime.h>
    14 #include <hipcub/hipcub.hpp>
    18 #include "HipDefines.h"
    19 #include "SequencerCUDAKernel.h"
    20 #include "MShakeKernel.h"
    22 #ifdef NODEGROUP_FORCE_REGISTER
    24 #define OVERALLOC 2.0f
    26 void NAMD_die(const char *);
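// Pairlist tuning parameters in constant memory (presumably device copies of the
// pairlistTrigger/pairlistGrow/pairlistShrink simulation parameters, uploaded by the
// host); pairListMarginCheckKernel uses them to adapt each patch's pairlist tolerance.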
    29 __constant__ double c_pairlistTrigger;
    30 __constant__ double c_pairlistGrow;
    31 __constant__ double c_pairlistShrink;
    33 __device__ void reset_atomic_counter(unsigned int *counter) {
    39 template <int T_MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
    40 __global__ void accumulateForceToSOAKernelMGPU(
    42     CudaLocalRecord*                   localRecords,
    43     const double * __restrict          f_bond,
    44     const double * __restrict          f_bond_nbond,
    45     const double * __restrict          f_bond_slow,
    47     const float4 * __restrict          f_nbond,
    48     const float4 * __restrict          f_nbond_slow,
    49     const CudaForce* __restrict        f_slow,
    50     double * __restrict                d_f_global_x,
    51     double * __restrict                d_f_global_y,
    52     double * __restrict                d_f_global_z,
    53     double * __restrict                d_f_normal_x,
    54     double * __restrict                d_f_normal_y,
    55     double * __restrict                d_f_normal_z,
    56     double * __restrict                d_f_nbond_x,
    57     double * __restrict                d_f_nbond_y,
    58     double * __restrict                d_f_nbond_z,
    59     double * __restrict                d_f_slow_x,
    60     double * __restrict                d_f_slow_y,
    61     double * __restrict                d_f_slow_z,
    62     const int * __restrict             patchUnsortOrder,
    63     const Lattice                      lattice,
    64     unsigned int** __restrict          deviceQueues,
    65     unsigned int*  __restrict          queueCounters, 
    66     unsigned int*  __restrict          tbcatomic
    68   __shared__ CudaLocalRecord s_record;
    69   using AccessType = int32_t;
    70   AccessType* s_record_buffer = (AccessType*)  &s_record;
    72   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
    73     // Read in the CudaLocalRecord using multiple threads. This should 
    75     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
    76       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
    80     const int numAtoms = s_record.numAtoms;
    81     const int offset = s_record.bufferOffset;
    82     const int offsetNB = s_record.bufferOffsetNBPad;
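        // offset locates this patch in the SOA force/atom buffers; offsetNB locates it
        // in the padded nonbonded force buffers (f_nbond, f_nbond_slow), which use
        // their own padded layout.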
    84     double f_slow_x, f_slow_y, f_slow_z;
    85     double f_nbond_x, f_nbond_y, f_nbond_z;
    86     double f_normal_x, f_normal_y, f_normal_z;
    88     for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
    89       if (T_MAX_FORCE_NUMBER >= 1) {
    90         int unorder = patchUnsortOrder[offset + i];
    91         // gather from sorted nonbonded force array
    92         float4 fnbond = f_nbond[offsetNB + unorder];
    93         // set (medium) nonbonded force accumulators
    94         f_nbond_x = (double)fnbond.x;
    95         f_nbond_y = (double)fnbond.y;
    96         f_nbond_z = (double)fnbond.z;
    97         if (T_MAX_FORCE_NUMBER == 2) {
    98           // gather from sorted nonbonded slow force array
    99           float4 fnbslow = f_nbond_slow[offsetNB + unorder];
   100           // accumulate slow force contributions from nonbonded calculation
   101           f_slow_x = (double)fnbslow.x;
   102           f_slow_y = (double)fnbslow.y;
   103           f_slow_z = (double)fnbslow.z;
   106       // gather from strided bond force array
   107       // set (fast) normal force accumulators
   108       if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
    109         // Global forces are stored in d_f_global; add them to the bonded force
    110         f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
    111         f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
    112         f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
   113         if (T_DOCUDAGLOBAL) {
   114           d_f_global_x[offset + i] = 0;
   115           d_f_global_y[offset + i] = 0;
   116           d_f_global_z[offset + i] = 0;
    120         f_normal_x = f_bond[offset + i];
    121         f_normal_y = f_bond[offset + i +   forceStride];
    122         f_normal_z = f_bond[offset + i + 2*forceStride];
   124       if (T_MAX_FORCE_NUMBER >= 1) {
   125         // gather from strided bond nonbonded force array
   126         // accumulate (medium) nonbonded force accumulators
   127         f_nbond_x += f_bond_nbond[offset + i];
   128         f_nbond_y += f_bond_nbond[offset + i +   forceStride];
   129         f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
   130         if (T_MAX_FORCE_NUMBER == 2) {
   131           // gather from strided bond slow force array
   132           // accumulate slow force accumulators
   133           f_slow_x += f_bond_slow[offset + i];
   134           f_slow_y += f_bond_slow[offset + i +   forceStride];
   135           f_slow_z += f_bond_slow[offset + i + 2*forceStride];
   138       // set normal, nonbonded, and slow SOA force buffers
   139       d_f_normal_x[offset + i] = f_normal_x;
   140       d_f_normal_y[offset + i] = f_normal_y;
   141       d_f_normal_z[offset + i] = f_normal_z;
   142       if (T_MAX_FORCE_NUMBER >= 1) {
   143         d_f_nbond_x[offset + i] = f_nbond_x;
   144         d_f_nbond_y[offset + i] = f_nbond_y;
   145         d_f_nbond_z[offset + i] = f_nbond_z;
   146         if (T_MAX_FORCE_NUMBER == 2){
   147           d_f_slow_x[offset + i] = f_slow_x;
   148           d_f_slow_y[offset + i] = f_slow_y;
   149           d_f_slow_z[offset + i] = f_slow_z;
   157 // DMC This was in the accumulateForceToSOAKernelMGPU (commented out)
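// The block below implements a last-block-done handshake for multi-GPU runs:
// atomicInc on tbcatomic[4] identifies the last thread block to finish, and only that
// block increments each peer device's queue counter and publishes this device's ID
// into the peer's queue, with __threadfence_system() making the writes visible to the
// other devices.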
   160   if(threadIdx.x == 0){
   161     // Need another value here
   162     unsigned int value = atomicInc(&tbcatomic[4], gridDim.x);
   163     isLastBlockDone = (value == (gridDim.x - 1));
   167     // thread0 flags everything
   168     if(threadIdx.x == 0){
   169       for(int i = 0; i < nDev; i++ ){
   170         if (i == devID) continue; 
   171         unsigned int value = atomicInc(& (queueCounters[i]), nDev);
   172         printf(" Device[%d] queue[%d] pos[%d]\n", devID, i, value);
   174         deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
   175         __threadfence_system();
    177       // sets the tbcatomic counter back to zero
   179       printf(" GPU[%d] finished accumulating SOA buffers\n", devID);
   184 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
   185 __global__ void accumulateForceToSOAKernel(
   186     const int                          numPatches,
   187     CudaLocalRecord*                   localRecords,
   188     const double * __restrict          f_bond,
   189     const double * __restrict          f_bond_nbond,
   190     const double * __restrict          f_bond_slow,
   192     const float4 * __restrict          f_nbond,
   193     const float4 * __restrict          f_nbond_slow,
   194     const CudaForce* __restrict        f_slow,
   195     double * __restrict                d_f_global_x,
   196     double * __restrict                d_f_global_y,
   197     double * __restrict                d_f_global_z,
   198     double * __restrict                d_f_normal_x,
   199     double * __restrict                d_f_normal_y,
   200     double * __restrict                d_f_normal_z,
   201     double * __restrict                d_f_nbond_x,
   202     double * __restrict                d_f_nbond_y,
   203     double * __restrict                d_f_nbond_z,
   204     double * __restrict                d_f_slow_x,
   205     double * __restrict                d_f_slow_y,
   206     double * __restrict                d_f_slow_z,
   207     const int * __restrict             patchUnsortOrder,
   208     const Lattice                      lattice
   210   __shared__ CudaLocalRecord s_record;
   211   using AccessType = int32_t;
   212   AccessType* s_record_buffer = (AccessType*)  &s_record;
   214   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
   215     // Read in the CudaLocalRecord using multiple threads. This should 
   217     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
   218       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
   222     const int numAtoms = s_record.numAtoms;
   223     const int offset = s_record.bufferOffset;
   224     const int offsetNB = s_record.bufferOffsetNBPad;
   226     double f_slow_x, f_slow_y, f_slow_z;
   227     double f_nbond_x, f_nbond_y, f_nbond_z;
   228     double f_normal_x, f_normal_y, f_normal_z;
   230     for (int i = threadIdx.x;  i < numAtoms;  i += blockDim.x) {
   231       if (MAX_FORCE_NUMBER == 2) {
   232         CudaForce f  = f_slow[offset + i];
   233         double3 f_scaled = lattice.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
   234         // set slow force accumulators
   235         f_slow_x = f_scaled.x;
   236         f_slow_y = f_scaled.y;
   237         f_slow_z = f_scaled.z;
   239       if (MAX_FORCE_NUMBER >= 1) {
   240         int unorder = patchUnsortOrder[offset + i];
   241         // gather from sorted nonbonded force array
   242         float4 fnbond = f_nbond[offsetNB + unorder];
   243         // set (medium) nonbonded force accumulators
   244         f_nbond_x = (double)fnbond.x;
   245         f_nbond_y = (double)fnbond.y;
   246         f_nbond_z = (double)fnbond.z;
   247         if (MAX_FORCE_NUMBER == 2) {
   248           // gather from sorted nonbonded slow force array
   249           float4 fnbslow = f_nbond_slow[offsetNB + unorder];
   250           // accumulate slow force contributions from nonbonded calculation
   251           f_slow_x += (double)fnbslow.x;
   252           f_slow_y += (double)fnbslow.y;
   253           f_slow_z += (double)fnbslow.z;
   256       // gather from strided bond force array
   257       // set (fast) normal force accumulators
   258       if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
    259         // Global forces are stored in d_f_global; add them to the bonded force
    260         f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
   261         f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
   262         f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
   263         if (T_DOCUDAGLOBAL) {
   264           d_f_global_x[offset + i] = 0;
   265           d_f_global_y[offset + i] = 0;
   266           d_f_global_z[offset + i] = 0;
    271         f_normal_x = f_bond[offset + i];
    272         f_normal_y = f_bond[offset + i +   forceStride];
    273         f_normal_z = f_bond[offset + i + 2*forceStride];
   276       if (MAX_FORCE_NUMBER >= 1) {
   277         // gather from strided bond nonbonded force array
   278         // accumulate (medium) nonbonded force accumulators
   279         f_nbond_x += f_bond_nbond[offset + i];
   280         f_nbond_y += f_bond_nbond[offset + i +   forceStride];
   281         f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
   282         if (MAX_FORCE_NUMBER == 2) {
   283           // gather from strided bond slow force array
   284           // accumulate slow force accumulators
   285           f_slow_x += f_bond_slow[offset + i];
   286           f_slow_y += f_bond_slow[offset + i +   forceStride];
   287           f_slow_z += f_bond_slow[offset + i + 2*forceStride];
   290       // set normal, nonbonded, and slow SOA force buffers
   291       d_f_normal_x[offset + i] = f_normal_x;
   292       d_f_normal_y[offset + i] = f_normal_y;
   293       d_f_normal_z[offset + i] = f_normal_z;
   294       if (MAX_FORCE_NUMBER >= 1) {
   295         d_f_nbond_x[offset + i] = f_nbond_x;
   296         d_f_nbond_y[offset + i] = f_nbond_y;
   297         d_f_nbond_z[offset + i] = f_nbond_z;
   298         if (MAX_FORCE_NUMBER == 2) {
   299           d_f_slow_x[offset + i] = f_slow_x;
   300           d_f_slow_y[offset + i] = f_slow_y;
   301           d_f_slow_z[offset + i] = f_slow_z;
   309 __global__ void accumulatePMEForces(
   311   const CudaForce* f_slow, 
   315   const int* patchOffsets, 
   318   int tid = threadIdx.x + (blockDim.x * blockIdx.x);
   319   double f_slow_x, f_slow_y, f_slow_z;
   322     CudaForce f  = f_slow[tid];
   324     double3 f_scaled = lat.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
   325     f_slow_x = f_scaled.x;
   326     f_slow_y = f_scaled.y;
   327     f_slow_z = f_scaled.z;
   329     d_f_slow_x[tid] += f_slow_x;
   330     d_f_slow_y[tid] += f_slow_y;
   331     d_f_slow_z[tid] += f_slow_z;
   335 void SequencerCUDAKernel::accumulateForceToSOA(
   337   const int               doCudaGlobal,
   338   const int               maxForceNumber,
   339   const int               numPatches,
   341   CudaLocalRecord*        localRecords,
   342   const double*           f_bond,
   343   const double*           f_bond_nbond,
   344   const double*           f_bond_slow,
   346   const float4*           f_nbond,
   347   const float4*           f_nbond_slow,
   348   const CudaForce*        f_slow,
   349   double*                 d_f_global_x,
   350   double*                 d_f_global_y,
   351   double*                 d_f_global_z,
   352   double*                 d_f_normal_x,
   353   double*                 d_f_normal_y,
   354   double*                 d_f_normal_z,
   361   const int*              patchUnsortOrder,
   362   const Lattice           lattice,
   363   unsigned int**          deviceQueues,
   364   unsigned int*           queueCounters,
   365   unsigned int*           tbcatomic, 
   368   // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
   369   const int options = (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal;
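  // Pack the runtime flags into a single integer (bit 0 = doCudaGlobal, bit 1 =
  // doGlobal, bits 2 and up = maxForceNumber) so the switch below can dispatch to the
  // matching compile-time template instantiation of the kernel.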
   371 #define CALL_MGPU(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
   372     accumulateForceToSOAKernelMGPU<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
   373     <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
   374     ( numPatches, localRecords, \
   375       f_bond, f_bond_nbond, f_bond_slow, forceStride, \
   376       f_nbond, f_nbond_slow, f_slow, \
   377       d_f_global_x, d_f_global_y, d_f_global_z, \
   378       d_f_normal_x, d_f_normal_y, d_f_normal_z, \
   379       d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
   380       d_f_slow_x, d_f_slow_y, d_f_slow_z, \
   381       patchUnsortOrder, lattice, \
   382       deviceQueues, queueCounters, tbcatomic \
   385       case ((0<<2) + (0<<1) + (0<<0)): CALL_MGPU(0, 0, 0); break;
   386       case ((0<<2) + (0<<1) + (1<<0)): CALL_MGPU(0, 0, 1); break;
   387       case ((0<<2) + (1<<1) + (1<<0)): CALL_MGPU(0, 1, 1); break;
   388       case ((0<<2) + (1<<1) + (0<<0)): CALL_MGPU(0, 1, 0); break;
   389       case ((1<<2) + (0<<1) + (0<<0)): CALL_MGPU(1, 0, 0); break;
   390       case ((1<<2) + (0<<1) + (1<<0)): CALL_MGPU(1, 0, 1); break;
   391       case ((1<<2) + (1<<1) + (1<<0)): CALL_MGPU(1, 1, 1); break;
   392       case ((1<<2) + (1<<1) + (0<<0)): CALL_MGPU(1, 1, 0); break;
   393       case ((2<<2) + (0<<1) + (0<<0)): CALL_MGPU(2, 0, 0); break;
   394       case ((2<<2) + (0<<1) + (1<<0)): CALL_MGPU(2, 0, 1); break;
   395       case ((2<<2) + (1<<1) + (1<<0)): CALL_MGPU(2, 1, 1); break;
   396       case ((2<<2) + (1<<1) + (0<<0)): CALL_MGPU(2, 1, 0); break;
   398         const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
   399         NAMD_bug(error.c_str());
   404 #define CALL(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
   405     accumulateForceToSOAKernel<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
   406     <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
   407     ( numPatches, localRecords, \
   408       f_bond, f_bond_nbond, f_bond_slow, forceStride, \
   409       f_nbond, f_nbond_slow, f_slow, \
   410       d_f_global_x, d_f_global_y, d_f_global_z, \
   411       d_f_normal_x, d_f_normal_y, d_f_normal_z, \
   412       d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
   413       d_f_slow_x, d_f_slow_y, d_f_slow_z, \
   414       patchUnsortOrder, lattice \
   417       case ((0<<2) + (0<<1) + (0<<0)): CALL(0, 0, 0); break;
   418       case ((0<<2) + (0<<1) + (1<<0)): CALL(0, 0, 1); break;
   419       case ((0<<2) + (1<<1) + (1<<0)): CALL(0, 1, 1); break;
   420       case ((0<<2) + (1<<1) + (0<<0)): CALL(0, 1, 0); break;
   421       case ((1<<2) + (0<<1) + (0<<0)): CALL(1, 0, 0); break;
   422       case ((1<<2) + (0<<1) + (1<<0)): CALL(1, 0, 1); break;
   423       case ((1<<2) + (1<<1) + (1<<0)): CALL(1, 1, 1); break;
   424       case ((1<<2) + (1<<1) + (0<<0)): CALL(1, 1, 0); break;
   425       case ((2<<2) + (0<<1) + (0<<0)): CALL(2, 0, 0); break;
   426       case ((2<<2) + (0<<1) + (1<<0)): CALL(2, 0, 1); break;
   427       case ((2<<2) + (1<<1) + (1<<0)): CALL(2, 1, 1); break;
   428       case ((2<<2) + (1<<1) + (0<<0)): CALL(2, 1, 0); break;
   430         const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
   431         NAMD_bug(error.c_str());
   438 template <int T_MAX_FORCE_NUMBER>
   439 __global__ void mergeForcesFromPeersKernel(
   440   const int                   numPatches,
   441   const int                   devID,        // GPU ID 
   442   CudaLocalRecord*            localRecords,
   443   CudaPeerRecord*             peerRecords,
   445   double** __restrict         f_normal_x,      
   446   double** __restrict         f_normal_y, 
   447   double** __restrict         f_normal_z, 
   448   double** __restrict         f_nbond_x, 
   449   double** __restrict         f_nbond_y, 
   450   double** __restrict         f_nbond_z, 
   451   double** __restrict         f_slow_x, 
   452   double** __restrict         f_slow_y, 
   453   double** __restrict         f_slow_z,
   454   const CudaForce* __restrict pmeForces
   456   __shared__ CudaLocalRecord s_record;
   457   using AccessType = int32_t;
   458   AccessType* s_record_buffer = (AccessType*)  &s_record;
   460   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
   461     // Read in the CudaLocalRecord using multiple threads. This should 
   463     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
   464       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
   468     const int numAtoms = s_record.numAtoms;
   469     const int dstOffset = s_record.bufferOffset;
   470     const int numPeerRecords = s_record.numPeerRecord;
   471     const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
   473     const int inlinePeers = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
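    // The first CudaLocalRecord::num_inline_peer peer records are stored inline in the
    // record itself (already resident in shared memory), avoiding extra global loads;
    // any remaining peers are read from the peerRecords array in the second loop below.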
   475     for (int peer = 0; peer < inlinePeers; peer++) {
   476       const int srcDevice = s_record.inline_peers[peer].deviceIndex;
   477       const int srcOffset = s_record.inline_peers[peer].bufferOffset;
   479       for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
   480         double  f_x = f_normal_x[srcDevice][srcOffset + i];
   481         double  f_y = f_normal_y[srcDevice][srcOffset + i];
   482         double  f_z = f_normal_z[srcDevice][srcOffset + i];
   484         f_normal_x[devID][dstOffset + i] += f_x;
   485         f_normal_y[devID][dstOffset + i] += f_y;
   486         f_normal_z[devID][dstOffset + i] += f_z;
   488         if(T_MAX_FORCE_NUMBER >= 1){ 
   489           // We don't need the nonbonded offset here since
   490           // this isn't the actual nonbonded buffer
   491           f_x = f_nbond_x[srcDevice][srcOffset + i];
   492           f_y = f_nbond_y[srcDevice][srcOffset + i];
   493           f_z = f_nbond_z[srcDevice][srcOffset + i]; 
   495           f_nbond_x[devID][dstOffset + i] += f_x;
   496           f_nbond_y[devID][dstOffset + i] += f_y;
   497           f_nbond_z[devID][dstOffset + i] += f_z;
   499           if(T_MAX_FORCE_NUMBER == 2){
   500             f_x = f_slow_x[srcDevice][srcOffset + i];
   501             f_y = f_slow_y[srcDevice][srcOffset + i];
   502             f_z = f_slow_z[srcDevice][srcOffset + i];
   504             f_slow_x[devID][dstOffset + i] += f_x;
   505             f_slow_y[devID][dstOffset + i] += f_y;
   506             f_slow_z[devID][dstOffset + i] += f_z;
   512     for (int peer = inlinePeers; peer < numPeerRecords; peer++) {
   513       const int srcDevice = peerRecords[peerRecordStartIndexs + peer].deviceIndex;
   514       const int srcOffset = peerRecords[peerRecordStartIndexs + peer].bufferOffset;
   516       for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
   517         double  f_x = f_normal_x[srcDevice][srcOffset + i];
   518         double  f_y = f_normal_y[srcDevice][srcOffset + i];
   519         double  f_z = f_normal_z[srcDevice][srcOffset + i];
   521         f_normal_x[devID][dstOffset + i] += f_x;
   522         f_normal_y[devID][dstOffset + i] += f_y;
   523         f_normal_z[devID][dstOffset + i] += f_z;
   525         if(T_MAX_FORCE_NUMBER >= 1){ 
   526           // We don't need the nonbonded offset here since
   527           // this isn't the actual nonbonded buffer
   528           f_x = f_nbond_x[srcDevice][srcOffset + i];
   529           f_y = f_nbond_y[srcDevice][srcOffset + i];
   530           f_z = f_nbond_z[srcDevice][srcOffset + i]; 
   532           f_nbond_x[devID][dstOffset + i] += f_x;
   533           f_nbond_y[devID][dstOffset + i] += f_y;
   534           f_nbond_z[devID][dstOffset + i] += f_z;
   536           if(T_MAX_FORCE_NUMBER == 2){
   537             f_x = f_slow_x[srcDevice][srcOffset + i];
   538             f_y = f_slow_y[srcDevice][srcOffset + i];
   539             f_z = f_slow_z[srcDevice][srcOffset + i];
   541             f_slow_x[devID][dstOffset + i] += f_x;
   542             f_slow_y[devID][dstOffset + i] += f_y;
   543             f_slow_z[devID][dstOffset + i] += f_z;
    549     // Merge PME forces here instead of in the force-fetching kernel
   550     if(T_MAX_FORCE_NUMBER == 2){
   551       for(int i = threadIdx.x; i < numAtoms; i += blockDim.x){
   552         CudaForce f = pmeForces[dstOffset + i];
   554         double3 f_scaled = lat.scale_force(
   555           Vector((double)f.x, (double)f.y, (double)f.z));
   556         f_slow_x[devID][dstOffset + i] += f_scaled.x;
   557         f_slow_y[devID][dstOffset + i] += f_scaled.y;
   558         f_slow_z[devID][dstOffset + i] += f_scaled.z;
   565 void SequencerCUDAKernel::mergeForcesFromPeers(
   567   const int              maxForceNumber, 
   569   const int              numPatchesHomeAndProxy, 
   570   const int              numPatchesHome, 
   580   const CudaForce*       pmeForces, 
   581   CudaLocalRecord*       localRecords,
   582   CudaPeerRecord*        peerRecords,
   583   std::vector<int>&      atomCounts,
   586   // atom-based kernel here
   587   //int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
   588   int grid = (numPatchesHome);
   589   int blocks = PATCH_BLOCKS;
   590   // launch a single-threaded grid for debugging
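  // pmeForces holds the PME forces of all devices back to back in device order, so
  // this device's slice starts after the atoms of all lower-numbered devices
  // (accumulated from atomCounts below).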
   593   for (int i = 0; i < devID; i++) {
   594     offset += atomCounts[i];
   597   switch (maxForceNumber) {
   599       mergeForcesFromPeersKernel<0><<<grid, blocks, 0, stream>>>(
   600         numPatchesHome, devID, localRecords, peerRecords,
   602         f_normal_x, f_normal_y, f_normal_z,
   603         f_nbond_x, f_nbond_y, f_nbond_z, 
   604         f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
   608       mergeForcesFromPeersKernel<1><<<grid, blocks, 0, stream>>>(
   609         numPatchesHome, devID, localRecords, peerRecords,
   611         f_normal_x, f_normal_y, f_normal_z,
   612         f_nbond_x, f_nbond_y, f_nbond_z, 
   613         f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
   617       mergeForcesFromPeersKernel<2><<<grid, blocks, 0, stream>>>(
   618         numPatchesHome, devID, localRecords, peerRecords,
   620         f_normal_x, f_normal_y, f_normal_z,
   621         f_nbond_x, f_nbond_y, f_nbond_z, 
   622         f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
    628 // JM NOTE: This is a fused version of accumulateForceToSOA + addForceToMomentum.
    629 //          addForceToMomentum does barely any math and is memory-bandwidth bound, so
    630 //          fusing these kernels lets us keep the forces in registers and
    631 //          reuse their values for the velocity update.
   632 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL, int DO_FIXED>
   633 __global__ void accumulateForceKick(
   634     const int                          numPatches,
   635     CudaLocalRecord*                   localRecords,
   636     const double * __restrict          f_bond,
   637     const double * __restrict          f_bond_nbond,
   638     const double * __restrict          f_bond_slow,
   640     const float4 * __restrict          f_nbond,
   641     const float4 * __restrict          f_nbond_slow,
   642     const CudaForce* __restrict        f_slow,
   643     double * __restrict                d_f_global_x,
   644     double * __restrict                d_f_global_y,
   645     double * __restrict                d_f_global_z,
   646     double * __restrict                d_f_normal_x,
   647     double * __restrict                d_f_normal_y,
   648     double * __restrict                d_f_normal_z,
   649     double * __restrict                d_f_nbond_x,
   650     double * __restrict                d_f_nbond_y,
   651     double * __restrict                d_f_nbond_z,
   652     double * __restrict                d_f_slow_x,
   653     double * __restrict                d_f_slow_y,
   654     double * __restrict                d_f_slow_z,
   655     const int * __restrict             patchUnsortOrder,
   656     const Lattice                      lattice,
   660     const double * __restrict recipMass, 
   661     const int* __restrict atomFixed,
   662     const double dt_normal,
   663     const double dt_nbond, 
   664     const double dt_slow,
   667   __shared__ CudaLocalRecord s_record;
   668   using AccessType = int32_t;
   669   AccessType* s_record_buffer = (AccessType*)  &s_record;
   671   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
   672     // Read in the CudaLocalRecord using multiple threads. This should 
   674     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
   675       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
   679     const int numAtoms = s_record.numAtoms;
   680     const int offset = s_record.bufferOffset;
   681     const int offsetNB = s_record.bufferOffsetNBPad;
   683     double f_slow_x, f_slow_y, f_slow_z;
   684     double f_nbond_x, f_nbond_y, f_nbond_z;
   685     double f_normal_x, f_normal_y, f_normal_z;
   687     for (int i = threadIdx.x;  i < numAtoms;  i += blockDim.x) {
   698       if (MAX_FORCE_NUMBER == 2) {
   699         CudaForce f  = f_slow[offset + i];
   701         double3 f_scaled = lattice.scale_force(
   702           Vector((double)f.x, (double)f.y, (double)f.z));
   704         // set slow force accumulators
   705         f_slow_x = f_scaled.x;
   706         f_slow_y = f_scaled.y;
   707         f_slow_z = f_scaled.z;
   709       if (MAX_FORCE_NUMBER >= 1) {
   710         int unorder = patchUnsortOrder[offset + i];
   711         // gather from sorted nonbonded force array
   712         float4 fnbond = f_nbond[offsetNB + unorder];
   713         // set (medium) nonbonded force accumulators
   714         f_nbond_x = (double)fnbond.x;
   715         f_nbond_y = (double)fnbond.y;
   716         f_nbond_z = (double)fnbond.z;
   717         if (MAX_FORCE_NUMBER == 2) {
   718           // gather from sorted nonbonded slow force array
   719           float4 fnbslow = f_nbond_slow[offsetNB + unorder];
   720           // accumulate slow force contributions from nonbonded calculation
   721           f_slow_x += (double)fnbslow.x;
   722           f_slow_y += (double)fnbslow.y;
   723           f_slow_z += (double)fnbslow.z;
   726       // gather from strided bond force array
   727       // set (fast) normal force accumulators
   728       if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
    729         // Global forces are stored in d_f_global; add them to the bonded force
    730         f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
   731         f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
   732         f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
   733         if (T_DOCUDAGLOBAL) {
   734           d_f_global_x[offset + i] = 0;
   735           d_f_global_y[offset + i] = 0;
   736           d_f_global_z[offset + i] = 0;
    742         f_normal_x = f_bond[offset + i];
    743         f_normal_y = f_bond[offset + i +   forceStride];
    744         f_normal_z = f_bond[offset + i + 2*forceStride];
   747       if (MAX_FORCE_NUMBER >= 1) {
   748         // gather from strided bond nonbonded force array
   749         // accumulate (medium) nonbonded force accumulators
   750         f_nbond_x += f_bond_nbond[offset + i];
   751         f_nbond_y += f_bond_nbond[offset + i +   forceStride];
   752         f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
   753         if (MAX_FORCE_NUMBER == 2) {
   754           // gather from strided bond slow force array
   755           // accumulate slow force accumulators
   756           f_slow_x += f_bond_slow[offset + i];
   757           f_slow_y += f_bond_slow[offset + i +   forceStride];
   758           f_slow_z += f_bond_slow[offset + i + 2*forceStride];
   761       // set normal, nonbonded, and slow SOA force buffers
   762       d_f_normal_x[offset + i] = f_normal_x;
   763       d_f_normal_y[offset + i] = f_normal_y;
   764       d_f_normal_z[offset + i] = f_normal_z;
   765       if (MAX_FORCE_NUMBER >= 1) {
   766         d_f_nbond_x[offset + i] = f_nbond_x;
   767         d_f_nbond_y[offset + i] = f_nbond_y;
   768         d_f_nbond_z[offset + i] = f_nbond_z;
   769         if (MAX_FORCE_NUMBER == 2) {
   770           d_f_slow_x[offset + i] = f_slow_x;
   771           d_f_slow_y[offset + i] = f_slow_y;
   772           d_f_slow_z[offset + i] = f_slow_z;
   776       /* Velocity updates */
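      // Multiple time-stepping kick: each force class is advanced with its own
      // effective timestep (dt_normal, dt_nbond, dt_slow), scaled by `scaling` and the
      // atom's reciprocal mass. With DO_FIXED enabled, only mobile atoms receive the
      // kick and the velocities of fixed atoms are zeroed.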
   777       double rp = recipMass[offset + i];
   779       vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
   780       vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
   781       vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
   783         d_vel_x[offset + i] += vx;
   784         d_vel_y[offset + i] += vy;
   785         d_vel_z[offset + i] += vz;
   787         if (!atomFixed[offset + i]) {
   788           d_vel_x[offset + i] += vx;
   789           d_vel_y[offset + i] += vy;
   790           d_vel_z[offset + i] += vz;
   792           d_vel_x[offset + i] = 0;
   793           d_vel_y[offset + i] = 0;
   794           d_vel_z[offset + i] = 0;
   804 template <int MAX_FORCE_NUMBER>
   805 __global__ void accumulateForceKick(
   806     const double  * __restrict f_bond,
   807     const double  * __restrict f_bond_nbond,
   808     const double  * __restrict f_bond_slow,
   810     const PatchRecord * __restrict bond_pr,
   811     const float4 * __restrict f_nbond,
   812     const float4 * __restrict f_nbond_slow,
   813     const CudaPatchRecord * __restrict nbond_pr,
   814     const CudaForce * __restrict f_slow,
   815     double *d_f_normal_x,
   816     double *d_f_normal_y,
   817     double *d_f_normal_z,
   827     const double * __restrict recipMass,
   828     const double dt_normal,
   829     const double dt_nbond,
   830     const double dt_slow,
   831     const double scaling,
   832     const int * __restrict patchIDs,
   833     const int * __restrict patchOffsets,
   834     const int * __restrict patchUnsortOrder,
   835     const CudaLattice lat)
   837   int tid = threadIdx.x;
   838   int stride = blockDim.x;
   840   __shared__ int sh_patchID;
   841   __shared__ int sh_patchOffset;
   842   // number of atoms per patch should be same no matter
   843   // which data structure we access it through
   844   __shared__ int sh_natoms;
   845   __shared__ int sh_bondForceOffset;
   846   __shared__ int sh_nbondForceOffset;
   850   // number of atoms per patch should be same no matter
   851   // which data structure we access it through
   854   int nbondForceOffset;
   856   if(threadIdx.x == 0){
   857     sh_patchID = patchIDs[blockIdx.x];
   858     sh_patchOffset = patchOffsets[blockIdx.x];
   859     // number of atoms per patch should be same no matter
   860     // which data structure we access it through
   861     sh_natoms = bond_pr[sh_patchID].numAtoms;
   862     sh_bondForceOffset = bond_pr[sh_patchID].atomStart;
   863     if(MAX_FORCE_NUMBER >= 1){
   864       sh_nbondForceOffset = nbond_pr[sh_patchID].atomStart;
   870   // pull stuff to registers
   871   // patchID = sh_patchID;
   872   patchOffset = sh_patchOffset;
   874   bondForceOffset = sh_bondForceOffset;
   875   nbondForceOffset = (MAX_FORCE_NUMBER >= 1) ? sh_nbondForceOffset : 0;
   877   double r1x, r1y, r1z, r2x, r2y, r2z, r3x, r3y, r3z;
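  // r1, r2, r3 hold the lattice transform applied to the raw slow (PME) force below
  // (f_slow = fx*r1 + fy*r2 + fz*r3); presumably initialized from lat in the
  // MAX_FORCE_NUMBER == 2 branch, playing the same role as Lattice::scale_force in
  // the kernels above.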
   879   if (MAX_FORCE_NUMBER == 2) {
   891   double f_slow_x, f_slow_y, f_slow_z;
   892   double f_nbond_x, f_nbond_y, f_nbond_z;
   893   double f_normal_x, f_normal_y, f_normal_z;
   896   for (int i = tid;  i < natoms;  i += stride) {
    908       // Global forces are stored in d_f_normal; just add the bonded force
   909       f_normal_x = d_f_normal_x[patchOffset + i];
   910       f_normal_y = d_f_normal_y[patchOffset + i];
   911       f_normal_z = d_f_normal_z[patchOffset + i];
   914     if (MAX_FORCE_NUMBER == 2) {
   915       CudaForce f  = f_slow[patchOffset + i];
   916       double fx = (double) f.x;
   917       double fy = (double) f.y;
   918       double fz = (double) f.z;
   919       // set slow force accumulators
   920       f_slow_x = (fx*r1x + fy*r2x + fz*r3x);
   921       f_slow_y = (fx*r1y + fy*r2y + fz*r3y);
   922       f_slow_z = (fx*r1z + fy*r2z + fz*r3z);
   924     if (MAX_FORCE_NUMBER >= 1) {
   925       int unorder = patchUnsortOrder[patchOffset + i];
   926       // gather from sorted nonbonded force array
   927       float4 fnbond = f_nbond[nbondForceOffset + unorder];
   928       // set (medium) nonbonded force accumulators
   929       f_nbond_x = (double)fnbond.x;
   930       f_nbond_y = (double)fnbond.y;
   931       f_nbond_z = (double)fnbond.z;
   932       if (MAX_FORCE_NUMBER == 2) {
   933         // gather from sorted nonbonded slow force array
   934         float4 fnbslow = f_nbond_slow[nbondForceOffset + unorder];
   935         // accumulate slow force contributions from nonbonded calculation
   936         f_slow_x += (double)fnbslow.x;
   937         f_slow_y += (double)fnbslow.y;
   938         f_slow_z += (double)fnbslow.z;
   941     // gather from strided bond force array
   942     // set (fast) normal force accumulators
   943     f_normal_x += f_bond[bondForceOffset + i];
   944     f_normal_y += f_bond[bondForceOffset + i +   forceStride];
   945     f_normal_z += f_bond[bondForceOffset + i + 2*forceStride];
   946     if (MAX_FORCE_NUMBER >= 1) {
   947       // gather from strided bond nonbonded force array
   948       // accumulate (medium) nonbonded force accumulators
   949       f_nbond_x += f_bond_nbond[bondForceOffset + i];
   950       f_nbond_y += f_bond_nbond[bondForceOffset + i +   forceStride];
   951       f_nbond_z += f_bond_nbond[bondForceOffset + i + 2*forceStride];
   952       if (MAX_FORCE_NUMBER == 2) {
   953         // gather from strided bond slow force array
   954         // accumulate slow force accumulators
   955         f_slow_x += f_bond_slow[bondForceOffset + i];
   956         f_slow_y += f_bond_slow[bondForceOffset + i +   forceStride];
   957         f_slow_z += f_bond_slow[bondForceOffset + i + 2*forceStride];
   960     // set normal, nonbonded, and slow SOA force buffers
   961     d_f_normal_x[patchOffset + i] = f_normal_x;
   962     d_f_normal_y[patchOffset + i] = f_normal_y;
   963     d_f_normal_z[patchOffset + i] = f_normal_z;
   965     if (MAX_FORCE_NUMBER >= 1) {
   966       d_f_nbond_x[patchOffset + i] = f_nbond_x;
   967       d_f_nbond_y[patchOffset + i] = f_nbond_y;
   968       d_f_nbond_z[patchOffset + i] = f_nbond_z;
   969       if (MAX_FORCE_NUMBER == 2) {
   970         d_f_slow_x[patchOffset + i] = f_slow_x;
   971         d_f_slow_y[patchOffset + i] = f_slow_y;
   972         d_f_slow_z[patchOffset + i] = f_slow_z;
   976     /* Velocity updates */
   977     double rp = recipMass[patchOffset + i];
   978     vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
   979     vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
   980     vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
   982     d_vel_x[patchOffset + i] += vx;
   983     d_vel_y[patchOffset + i] += vy;
   984     d_vel_z[patchOffset + i] += vz;
   990 void SequencerCUDAKernel::accumulate_force_kick(
   991   const bool              doFixedAtoms,
   993   const int               doCudaGlobal,
   994   const int               maxForceNumber,
   995   const int               numPatches,
   996   CudaLocalRecord*        localRecords,
   997   const double*           f_bond,
   998   const double*           f_bond_nbond,
   999   const double*           f_bond_slow,
  1001   const float4*           f_nbond,
  1002   const float4*           f_nbond_slow,
  1003   const CudaForce*        f_slow,
  1004   double*                 d_f_global_x,
  1005   double*                 d_f_global_y,
  1006   double*                 d_f_global_z,
  1007   double*                 d_f_normal_x,
  1008   double*                 d_f_normal_y,
  1009   double*                 d_f_normal_z,
  1010   double*                 d_f_nbond_x,
  1011   double*                 d_f_nbond_y,
  1012   double*                 d_f_nbond_z,
  1019   const double*           recipMass,
  1020   const int*              d_atomFixed,
  1021   const double            dt_normal, 
  1022   const double            dt_nbond,
  1023   const double            dt_slow,
  1024   const double            scaling,
  1025   const int*              patchUnsortOrder,
  1026   const Lattice           lattice,
  1029   // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
  1030   const int options = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms);
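  // options packs maxForceNumber (bits 3 and up), doGlobal (bit 2), doCudaGlobal
  // (bit 1), and doFixedAtoms (bit 0); the case labels below are generated by the
  // Python script quoted in the comment that follows the macro.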
  1031 #define CALL(MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED) \
  1032   accumulateForceKick<MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED> \
  1033   <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
  1034   ( numPatches, localRecords, \
  1035     f_bond, f_bond_nbond, f_bond_slow, forceStride, \
  1036     f_nbond, f_nbond_slow, f_slow, \
  1037     d_f_global_x, d_f_global_y, d_f_global_z, \
  1038     d_f_normal_x, d_f_normal_y, d_f_normal_z, \
  1039     d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
  1040     d_f_slow_x, d_f_slow_y, d_f_slow_z, \
  1041     patchUnsortOrder, lattice, \
  1042     d_vel_x, d_vel_y, d_vel_z, recipMass, \
  1043     d_atomFixed, dt_normal, dt_nbond, dt_slow, scaling)
  1045    * Generated by the following python script:
  1046    * #!/usr/bin/env python3
  1047    * for maxForceNumber in range(3):
  1048    *     for doGlobal in range(2):
  1049    *         for doCudaGlobal in range(2):
  1050    *             for doFixedAtoms in range(2):
  1051    *                 option = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms)
  1052    *                 print(f'    case {option}: CALL ({maxForceNumber}, {doGlobal}, {doCudaGlobal}, {doFixedAtoms}); break;')
  1055     case 0: CALL (0, 0, 0, 0); break;
  1056     case 1: CALL (0, 0, 0, 1); break;
  1057     case 2: CALL (0, 0, 1, 0); break;
  1058     case 3: CALL (0, 0, 1, 1); break;
  1059     case 4: CALL (0, 1, 0, 0); break;
  1060     case 5: CALL (0, 1, 0, 1); break;
  1061     case 6: CALL (0, 1, 1, 0); break;
  1062     case 7: CALL (0, 1, 1, 1); break;
  1063     case 8: CALL (1, 0, 0, 0); break;
  1064     case 9: CALL (1, 0, 0, 1); break;
  1065     case 10: CALL (1, 0, 1, 0); break;
  1066     case 11: CALL (1, 0, 1, 1); break;
  1067     case 12: CALL (1, 1, 0, 0); break;
  1068     case 13: CALL (1, 1, 0, 1); break;
  1069     case 14: CALL (1, 1, 1, 0); break;
  1070     case 15: CALL (1, 1, 1, 1); break;
  1071     case 16: CALL (2, 0, 0, 0); break;
  1072     case 17: CALL (2, 0, 0, 1); break;
  1073     case 18: CALL (2, 0, 1, 0); break;
  1074     case 19: CALL (2, 0, 1, 1); break;
  1075     case 20: CALL (2, 1, 0, 0); break;
  1076     case 21: CALL (2, 1, 0, 1); break;
  1077     case 22: CALL (2, 1, 1, 0); break;
  1078     case 23: CALL (2, 1, 1, 1); break;
  1080       const std::string error =
   1081         "Unsupported options in SequencerCUDAKernel::accumulate_force_kick, no kernel is called. Options are maxForceNumber = " +
  1082         std::to_string(maxForceNumber) +
  1083         ", doGlobal = " + std::to_string(doGlobal) +
  1084         ", doFixedAtoms = " + std::to_string(doFixedAtoms) +
  1085         ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
  1086       NAMD_bug(error.c_str());
  1092 template <bool doSlow>
  1093 __global__  void copyFNBondToSOA(float4* __restrict__ f_nbond,
  1094   float4* __restrict__                f_nbond_slow,
  1101   const int* __restrict__              patchIDs,
  1102   const int* __restrict__              patchOffsets,
  1103   const int* __restrict__              patchUnsortOrder,
  1104   const CudaPatchRecord*  __restrict__ nbondIndexPerPatch)
  1107     int stride = blockDim.x;
  1108     int pid = patchIDs[blockIdx.x];
   1110     // nbondIndexPerPatch is indexed by the compute's local patch ID and is
   1111     // device-wide, so it can hold more patches than the PatchMap and uses a
   1112     // different ordering. The global PatchID therefore cannot be used directly to
   1113     // index nbondIndexPerPatch; each PatchID from the PatchMap needs to map to the
   1114     // corresponding local compute index. nbondIndexPerPatch is the same for both
   1115     // multi- and single-PE runs; only the force storage differs. Patches never
   1116     // migrate, so this data structure should not change during a run, and an index
   1117     // with the computes' ordering can be built once in the PatchMap.
  1124     int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
  1125     int patchOffset = patchOffsets[blockIdx.x];
  1126     int natom = nbondIndexPerPatch[pid].numAtoms;
  1127     for(int i = threadIdx.x; i < natom; i+= stride){
  1128       unorder = patchUnsortOrder[patchOffset + i];
  1129       float4 force = f_nbond[globalForceOffset + unorder];
  1131       f_nbond_x[patchOffset + i] = (double)force.x;
  1132       f_nbond_y[patchOffset + i] = (double)force.y;
  1133       f_nbond_z[patchOffset + i] = (double)force.z;
  1136         // XXX Accumulation here works because each f_slow_* buffer
  1137         // is first initialized in copyFSlowToSOA.
  1138         float4  f_nb_slow = f_nbond_slow[globalForceOffset + unorder];
  1139         f_slow_x[patchOffset + i] += (double)f_nb_slow.x;
  1140         f_slow_y[patchOffset + i] += (double)f_nb_slow.y;
  1141         f_slow_z[patchOffset + i] += (double)f_nb_slow.z;
  1146 void SequencerCUDAKernel::copy_nbond_forces(int numPatches,
  1148   float4*                f_nbond_slow,
  1155   const int*             patchIDs,
  1156   const int*             patchOffsets,
  1157   const int*             patchUnsortOrder,
  1158   const CudaPatchRecord* pr,
  1160   cudaStream_t           stream){
  1163       copyFNBondToSOA<1><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
  1164         f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
  1165         patchIDs, patchOffsets, patchUnsortOrder, pr);
  1167       copyFNBondToSOA<0><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
  1168         f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
  1169         patchIDs, patchOffsets, patchUnsortOrder, pr);
   1174 // This function fuses copyFNBondToSOA, copyFbondToSOA, addForceToMomentum, and updateVelocities
   1175 // XXX TODO: missing slow force support
  1176 template<bool T_DONBOND, bool T_DOSLOW>
  1177 __global__ void fetchForcesAndUpdateVelocitiesKernel(
  1178         const double scaling,
  1182         /* copyFbondToSOA parameters*/
  1183         const double* __restrict__ f_bond, // forces from bond compute
  1184         const double* __restrict__ f_bond_nbond, // forces from bond compute
  1185         double* __restrict__ f_bond_x,
  1186         double* __restrict__ f_bond_y,
  1187         double* __restrict__ f_bond_z,
  1188         double* __restrict__ f_nbond_x,
  1189         double* __restrict__ f_nbond_y,
  1190         double* __restrict__ f_nbond_z,
  1191         double* __restrict__ f_slow_x,
  1192         double* __restrict__ f_slow_y,
  1193         double* __restrict__ f_slow_z,
  1195         const PatchRecord* __restrict__ b_pr,
  1196         const int* __restrict__ patchIDs,
  1197         const int* __restrict__ patchOffsets,
  1198         /* copyFNbondToSOA parameters*/
   1199         const float4* __restrict__ f_nbond, // forces from nonbonded compute
   1200         const float4* __restrict__ f_nbond_slow, // slow forces from nonbonded compute
  1201         const int* __restrict__ patchSortOrder,
  1202         const CudaPatchRecord* __restrict__ nbondIndexPerPatch,
  1203         /* copyFSlowToSOA */
  1204         const CudaForce* __restrict__ f_slow,
  1205         const int* __restrict__         patchPositions,
  1206         const int* __restrict__         pencilPatchIndex,
  1207         const int* __restrict__         patchOffsets,
  1208         const int* __restrict__         patchIDs,
  1209         const int* __restrict__         slow_patchIDs,
  1210         const CudaLattice* __restrict__ lattices,
  1211         /* addForceToMomentum */
  1212         const double * __restrict recipMass,
  1213         double       * __restrict vel_x,
  1214         double       * __restrict vel_y,
  1215         double       * __restrict vel_z,
  1216         /* updateRigidArrays */
  1217         const double * __restrict pos_x,
  1218         const double * __restrict pos_y,
  1219         const double * __restrict pos_z,
  1220         double *       __restrict velNew_x,
  1221         double *       __restrict velNew_y,
  1222         double *       __restrict velNew_z,
  1223         double *       __restrict posNew_x,
  1224         double *       __restrict posNew_y,
  1225         double *       __restrict posNew_z
  1228   int stride = blockDim.x;
  1229   int pid = patchIDs[blockIdx.x];
  1231   int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
   1232   int b_forceOffset  = b_pr[pid].atomStart;
  1234   int patchOffset = patchOffsets[blockIdx.x];
   1235   int natom = nbondIndexPerPatch[pid].numAtoms;
   1240   for(int i = threadIdx.x; i < natom; i += stride){
  1242     if(T_DONBOND) f_nb = {0};
  1243     if(T_DOSLOW)  f_s  = {0};
  1244     // fetch the bonded forces first
  1245     f_b.x  = f_bond[b_forceOffset + i];
  1246     f_b.y  = f_bond[b_forceOffset + i + forceStride];
  1247     f_b.z  = f_bond[b_forceOffset + i + 2*forceStride];
  1250        f_nb = f_nbond[globalForceOffset + i];
  1251        f_nb.x += f_bond_nbond[b_forceOffset + i];
  1252        f_nb.y += f_bond_nbond[b_forceOffset + i + forceStride];
  1253        f_nb.z += f_bond_nbond[b_forceOffset + i + 2*forceStride];
  1255     // addForceToMomentum now
  1256     // this striding is not good
   1257     double vx = vel_x[patchOffset + i];
   1258     double vy = vel_y[patchOffset + i];
   1259     double vz = vel_z[patchOffset + i];
   1260     double rmass = recipMass[patchOffset + i];
  1261     switch(maxForceNumber){
  1262       // XXX TODO: Case 2 for slow forces
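      // This switch appears to fall through (case labels not shown here), mirroring
      // addForceToMomentum: case 1 adds the nonbonded kick, then case 0 adds the
      // bonded kick, each with its own scaled timestep.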
  1264         dt_nbond *= scaling;
  1265         vx += f_nb.x * rmass * dt_nbond;
  1266         vy += f_nb.y * rmass * dt_nbond;
  1267         vz += f_nb.z * rmass * dt_nbond;
  1269         dt_normal *= scaling;
   1270         vx += f_b.x * rmass * dt_normal;
   1271         vy += f_b.y * rmass * dt_normal;
   1272         vz += f_b.z * rmass * dt_normal;
  1275     // updateRigidArrays
   1277     posNew_x[patchOffset + i] = pos_x[patchOffset + i] + (vx * dt);
   1278     posNew_y[patchOffset + i] = pos_y[patchOffset + i] + (vy * dt);
   1279     posNew_z[patchOffset + i] = pos_z[patchOffset + i] + (vz * dt);
  1280     vel_x[patchOffset + i]    = vx;
  1281     vel_y[patchOffset + i]    = vy;
  1282     vel_z[patchOffset + i]    = vz;
  1283     velNew_x[patchOffset + i] = vx;
  1284     velNew_y[patchOffset + i] = vy;
  1285     velNew_z[patchOffset + i] = vz;
  1286     f_normal_x[patchOffset + i] = f_b.x;
  1287     f_normal_y[patchOffset + i] = f_b.y;
  1288     f_normal_z[patchOffset + i] = f_b.z;
  1291       order = patchSortOrder[patchOffset + i];
  1292       f_nbond_x[patchOffset + order] = f_nb.x;
  1293       f_nbond_y[patchOffset + order] = f_nb.y;
  1294       f_nbond_z[patchOffset + order] = f_nb.z;
  1299 void SequencerCUDAKernel::fetchForcesAndUpdateVelocities(int numPatches,
  1302         const double scaling,
  1306         const int    maxForceNumber,
  1307         /* copyFbondToSOA parameters*/
  1308         const double* __restrict__ f_bond, // forces from bond compute
  1309         const double* __restrict__ f_bond_nbond, // forces from bond compute
  1320         const PatchRecord* b_pr,
  1321         const int* patchIDs,
  1322         const int* patchOffsets,
  1323         /* copyFNbondToSOA parameters*/
   1324         const float4* f_nbond, // forces from nonbonded compute
   1325         const float4* f_nbond_slow, // slow forces from nonbonded compute
  1326         const int* patchSortOrder,
  1327         const CudaPatchRecord* nbondIndexPerPatch,
  1328         /* copyFSlowToSOA */
  1329         const CudaForce* f_slow,
  1330         const int* patchPositions,
  1331         const int* pencilPatchIndex,
  1332         const int* patchOffsets,
  1333         const int* patchIDs,
  1334         const int* slow_patchIDs,
  1335         const CudaLattice* lattices,
  1336         /* addForceToMomentum */
  1337         const double *  recipMass,
  1341         /* updateRigidArrays */
  1342         const double *  pos_x,
  1343         const double *  pos_y,
  1344         const double *  pos_z,
  1351         cudaStream_t    stream){
   1353         // TODO: reduce the number of arguments in this function
  1355          int grid = numPatches;
  1357          // XXX TODO finish this
  1360              fetchForcesAndUpdateVelocities<true, true><<<numPatches, 128, 0, stream>>>();
  1362              fetchForcesAndUpdateVelocities<true, false><<<numPatches, 128, 0, stream>>>();
  1365            fetchForcesAndUpdateVelocities<false, false><<<numPatches, 128, 0, stream>>>();
  1370 // JM: I need a function to do the pairlistCheck
  1371 template<bool T_ISPERIODIC>
  1372 __global__ void pairListMarginCheckKernel(
  1373   const int                    numPatches,
  1374   CudaLocalRecord*             localRecords,
  1375   const double* __restrict     pos_x, 
  1376   const double*     __restrict pos_y, 
  1377   const double*     __restrict pos_z,
  1378   const double*     __restrict pos_old_x,
  1379   const double*     __restrict pos_old_y,
  1380   const double*     __restrict pos_old_z,
  1381   const double3*    __restrict awayDists, // for margin check
  1382   const Lattice                lattice,
  1383   const Lattice                latticeOld,
  1384   const double3*    __restrict patchMins,
  1385   const double3*    __restrict patchMaxes,
  1386   const double3*    __restrict patchCenter,
  1387   const CudaMInfo*  __restrict mInfo,
  1388   unsigned int*     __restrict tbcatomic,
  1389   double*                      patchMaxAtomMovement,
  1390   double*                      h_patchMaxAtomMovement,
  1391   double*                      patchNewTolerance,
  1392   double*                      h_patchNewTolerance,
  1393   const double                 minSize,
  1394   const double                 cutoff,
  1395   const double                 sysdima,
  1396   const double                 sysdimb,
  1397   const double                 sysdimc,
  1398   unsigned int*     __restrict h_marginViolations,
  1399   unsigned int*     __restrict h_periodicCellSmall, 
  1400   const bool                   rescalePairlistTolerance
  1402   __shared__ CudaLocalRecord s_record;
  1403   using AccessType = int32_t;
  1404   AccessType* s_record_buffer = (AccessType*)  &s_record;
  1406   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
  1407     // Read in the CudaLocalRecord using multiple threads. This should 
  1409     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
  1410       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
  1414     const int pid = s_record.patchID;
   1417     // Each patch has its own max_cd. Since there is one block per patch, compute
   1418     // it here by checking the eight corners of the patch box, once per patch.
  1421     double cd, max_cd, max_pd;
  1422     __shared__ double minx, miny, minz, maxx, maxy, maxz;
  1423     double3 corner, corner_unscaled, corner2_unscaled;
  1427     const int stride = blockDim.x;
  1429     const int ind = patchIndex;
  1430     double3 min = patchMins[ind];
  1431     double3 max = patchMaxes[ind];
  1432     double3 aDist = awayDists[ind];
  1434     const int start    = s_record.bufferOffset;
  1435     const int numAtoms = s_record.numAtoms;
  1436     double3 center = patchCenter[ind];
  1438     double3 center_cur = lattice.unscale(center);
  1439     double3 center_old = latticeOld.unscale(center);
  1441     center_cur.x -= center_old.x;
  1442     center_cur.y -= center_old.y;
  1443     center_cur.z -= center_old.z;
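    // center_cur now holds the displacement of the patch center caused by the lattice
    // change since the pairlist was built; it is subtracted below so corner and atom
    // movements are measured relative to the patch frame.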
  1447     if(threadIdx.x == 0){
  1448       if (rescalePairlistTolerance) patchNewTolerance[ind] *= (1.0f - c_pairlistShrink);
   1450       // doMargincheck -> checks whether the periodic cell has become too small:
   1452       // Those values are in registers to begin with, so the overhead here is not that large.
   1453       // If this condition is true, the simulation will stop.
  1454       mValue.x = (0.5) * (aDist.x  - cutoff / sysdima);
  1455       mValue.y = (0.5) * (aDist.y  - cutoff / sysdimb);
  1456       mValue.z = (0.5) * (aDist.z  - cutoff / sysdimc);
  1457       minx = min.x - mValue.x;
  1458       miny = min.y - mValue.y;
  1459       minz = min.z - mValue.z;
  1460       maxx = max.x + mValue.x;
  1461       maxy = max.y + mValue.y;
  1462       maxz = max.z + mValue.z;
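      // The 2x2x2 loop below visits the eight corners of the patch box and records
      // max_cd, the largest squared corner displacement due to the lattice change;
      // it is later combined with the per-atom maximum as sqrt(max_cd) + sqrt(max_pd).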
  1464       for(int i = 0; i < 2; i++){
   1465         for(int j = 0; j < 2; j++){
  1466           for(int k = 0; k < 2; k++){
  1467               corner.x = (i ? min.x : max.x);
  1468               corner.y = (j ? min.y : max.y);
  1469               corner.z = (k ? min.z : max.z);
  1470               corner_unscaled = lattice.unscale(corner);
  1471               corner2_unscaled = latticeOld.unscale(corner);
  1472               corner.x = (corner_unscaled.x - corner2_unscaled.x) - center_cur.x;
  1473               corner.y = (corner_unscaled.y - corner2_unscaled.y) - center_cur.y;
  1474               corner.z = (corner_unscaled.z - corner2_unscaled.z) - center_cur.z;
  1475               cd = corner.x * corner.x + 
  1476                    corner.y * corner.y + corner.z * corner.z;
  1477               if (cd > max_cd) max_cd = cd;
  1484     // Alright, we have max_cd for each patch; now compute max_pd.
  1485     // XXX TODO: Get the number of atoms per patch;
  1486     // we need nAtomsPerPatch
  1487     // and an atomStart here.
  1488     unsigned int mc = 0;
  1489     for(int i = threadIdx.x; i < numAtoms; i += stride){
  1490       // Here we can also check for margin violations - we need the awayDists
  1492       pos.x = pos_x[start + i];
  1493       pos.y = pos_y[start + i];
  1494       pos.z = pos_z[start + i];
  1495       s = lattice.scale(pos);
  1497         // This branch applies if the system is periodic in A, B, and C:
  1498         // if any of these clauses is true, the atom needs to migrate.
  1499         mc += ((s.x < minx || s.x >= maxx) || (s.y < miny || s.y >= maxy) || (s.z < minz || s.z >= maxz)) ? 1 : 0;
  1501         int xdev, ydev, zdev;
  1502         // Ok, so if the system is not periodic, we need to access the mInfo data structure to determine migrations
  1503         if (s.x < minx) xdev = 0;
  1504         else if (maxx <= s.x) xdev = 2;
  1507         if (s.y < miny) ydev = 0;
  1508         else if (maxy <= s.y) ydev = 2;
  1511         if (s.z < minz) zdev = 0;
  1512         else if (maxz <= s.z) zdev = 2;
  1515         if(xdev != 1 || ydev != 1 || zdev != 1){
  1516           // only do the expensive global-memory mInfo lookup when some *dev differs from 1
  1517           int destPatch = mInfo[ind].destPatchID[xdev][ydev][zdev];
  1518           if(destPatch != -1 && destPatch != pid) mc += 1; // atom needs to migrate
  1521       corner.x = (pos.x - pos_old_x[start + i]) - center_cur.x;
  1522       corner.y = (pos.y - pos_old_y[start + i]) - center_cur.y;
  1523       corner.z = (pos.z - pos_old_z[start + i]) - center_cur.z;
  1524       cd = corner.x * corner.x + 
  1525            corner.y * corner.y + corner.z * corner.z;
  1526       if(cd > max_pd) max_pd = cd;
  1528     // JM NOTE: The atomicAdd to host memory is slow, but if there are margin violations the simulation is already going badly
  1531       atomicAdd(h_marginViolations, mc);
  1534     typedef cub::BlockReduce<BigReal, PATCH_BLOCKS> BlockReduce;
  1535     __shared__ typename BlockReduce::TempStorage temp_storage;
  1536 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
  1537     max_pd = BlockReduce(temp_storage).Reduce(max_pd, cub::Max());
  1539     max_pd = BlockReduce(temp_storage).Reduce(max_pd, cuda::maximum());
  1541     if(threadIdx.x == 0){
  1542       max_pd = sqrt(max_cd) + sqrt(max_pd);
  1543       // this might not be needed since the host-mapped values are updated anyway
  1544       patchMaxAtomMovement[ind] = max_pd;
  1545       if(max_pd > (1.f - c_pairlistTrigger) * patchNewTolerance[ind]){
  1546         patchNewTolerance[ind] *= (1.f + c_pairlistGrow);
  1548       if(max_pd > patchNewTolerance[ind]){
  1549         patchNewTolerance[ind] = max_pd / (1.f - c_pairlistTrigger);
  1551       // printf("patchNewTolerance[%d] =  %lf %lf\n", ind,
  1552       //  patchNewTolerance[ind], patchMaxAtomMovement[ind]);
  1553       h_patchMaxAtomMovement[ind] = max_pd; // Host-mapped update
  1554       h_patchNewTolerance[ind] = patchNewTolerance[ind];
  1557     // Checks if periodic cell has become too small and flags it
  1558     if(threadIdx.x == 0){
  1559       if ( ( aDist.x*sysdima < minSize*0.9999 ) ||
  1560          ( aDist.y*sysdimb < minSize*0.9999 ) ||
  1561          ( aDist.z*sysdimc < minSize*0.9999 ) ||
  1562          ( mValue.x < -0.0001 )               ||
  1563          ( mValue.y < -0.0001 )               ||
  1564          ( mValue.z < -0.0001)) {
  1565         *h_periodicCellSmall = 1;
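// [Editorial sketch, not part of the original source] Host-side analogue of the
// per-patch pairlist tolerance bookkeeping done by pairListMarginCheckKernel,
// assuming the same trigger/grow semantics as c_pairlistTrigger/c_pairlistGrow.
// maxMovement corresponds to sqrt(max_cd) + sqrt(max_pd) computed above.
static inline double updatePairlistTolerance(double maxMovement, double tolerance,
                                             double trigger, double grow) {
  // Grow the tolerance once the observed movement approaches the trigger fraction.
  if (maxMovement > (1.0 - trigger) * tolerance) tolerance *= (1.0 + grow);
  // If movement already exceeds the tolerance, jump it back above the trigger threshold.
  if (maxMovement > tolerance) tolerance = maxMovement / (1.0 - trigger);
  return tolerance;
}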
  1571 void SequencerCUDAKernel::PairListMarginCheck(const int numPatches, 
  1572   CudaLocalRecord*                  localRecords,
  1573   const double*  pos_x,
  1574   const double*  pos_y,
  1575   const double*  pos_z,
  1576   const double*  pos_old_x,
  1577   const double*  pos_old_y,
  1578   const double*  pos_old_z,
  1579   const double3* awayDists, // for margin check
  1580   const Lattice lattices,
  1581   const Lattice lattices_old,
  1582   const double3* __restrict patchMins,
  1583   const double3* __restrict patchMaxes,
  1584   const double3* __restrict patchCenter,
  1585   const CudaMInfo* __restrict mInfo,
  1586   unsigned int* __restrict tbcatomic,
  1587   const double pairlistTrigger,
  1588   const double pairlistGrow,
  1589   const double pairlistShrink,
  1590   double* __restrict patchMaxAtomMovement,
  1591   double* __restrict h_patchMaxAtomMovement,
  1592   double* __restrict patchNewTolerance,
  1593   double* __restrict h_patchNewTolerance,
  1594   const double                 minSize,
  1595   const double                 cutoff,
  1596   const double                 sysdima,
  1597   const double                 sysdimb,
  1598   const double                 sysdimc,
  1599   unsigned int*                h_marginViolations,
  1600   unsigned int*               h_periodicCellSmall,
  1601   const bool rescalePairlistTolerance,
  1602   const bool isPeriodic,
  1603   cudaStream_t stream){
  1604   int grid = numPatches;
  1606   if(!this->intConstInit){
  1607     this->intConstInit = true;
  1608     cudaCheck(cudaMemcpyToSymbol(c_pairlistTrigger, &pairlistTrigger, sizeof(double)));
  1609     cudaCheck(cudaMemcpyToSymbol(c_pairlistGrow, &pairlistGrow, sizeof(double)));
  1610     cudaCheck(cudaMemcpyToSymbol(c_pairlistShrink, &pairlistShrink, sizeof(double)));
  1614     pairListMarginCheckKernel<true><<<grid, PATCH_BLOCKS, 0, stream>>>(
  1615       numPatches, localRecords,
  1616       pos_x, pos_y, pos_z, pos_old_x,
  1617       pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
  1618       patchMins, patchMaxes, patchCenter, mInfo, tbcatomic, 
  1619       patchMaxAtomMovement, h_patchMaxAtomMovement, 
  1620       patchNewTolerance, h_patchNewTolerance, 
  1621       minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
  1622       rescalePairlistTolerance);
  1625     pairListMarginCheckKernel<false><<<grid, PATCH_BLOCKS, 0, stream>>>(
  1626       numPatches, localRecords, 
  1627       pos_x, pos_y, pos_z, pos_old_x,
  1628       pos_old_y, pos_old_z, awayDists, lattices, lattices_old, 
  1629       patchMins, patchMaxes, patchCenter, mInfo, tbcatomic, 
  1630       patchMaxAtomMovement, h_patchMaxAtomMovement, 
  1631       patchNewTolerance, h_patchNewTolerance, 
  1632       minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
  1633       rescalePairlistTolerance);
  1637 template <bool doNbond, bool doSlow>
  1638 __global__ void copyFBondToSOA(double *f_bond,
  1639   double *f_bond_nbond,
  1640   double *f_bond_slow,
  1650   const int forceStride,
  1651   const PatchRecord *pr,
  1652   const int *patchIDs,
  1653   const int *patchOffsets)
  1655   // This should work when operating on the entire PatchMap.
  1656   // Same as before: each block gets a patch.
  1657   // What do we need -> forces + atomStart.
  1658   // The bonded forces are wrong here.
  1659   // What are forceOffset and patchOffset?
  1660   int stride       = blockDim.x;
  1661   int patchID      = patchIDs[blockIdx.x];
  1662   // We need to check this data structure first to make sure it is safe to index it by patchID.
  1663   // patchID appears to be the correct index for this data structure. Does this change with +p?
  1664   int natoms       = pr[patchID].numAtoms;
  1665   int forceOffset  = pr[patchID].atomStart;
  1666   int patchOffset  = patchOffsets[blockIdx.x];
  1668   for(int i = threadIdx.x; i < natoms; i+=stride){
  1670     f_bond_x[patchOffset + i] = f_bond[forceOffset + i];
  1671     f_bond_y[patchOffset + i] = f_bond[forceOffset + i +   forceStride];
  1672     f_bond_z[patchOffset + i] = f_bond[forceOffset + i + 2*forceStride];
  1675       // XXX Accumulation here works because each f_nbond_* buffer
  1676       // is first initialized in copyFNBondToSOA.
  1677       f_nbond_x[patchOffset + i] += f_bond_nbond[forceOffset + i];
  1678       f_nbond_y[patchOffset + i] += f_bond_nbond[forceOffset + i + forceStride];
  1679       f_nbond_z[patchOffset + i] += f_bond_nbond[forceOffset + i + 2*forceStride];
  1683       // XXX Accumulation here works because each f_slow_* buffer
  1684       // is first initialized in copyFSlowToSOA.
  1685       f_slow_x[patchOffset + i] += f_bond_slow[forceOffset + i];
  1686       f_slow_y[patchOffset + i] += f_bond_slow[forceOffset + i +   forceStride];
  1687       f_slow_z[patchOffset + i] += f_bond_slow[forceOffset + i + 2*forceStride];
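// [Editorial sketch, not part of the original source] The bonded-force buffers
// read above are stored as one strided array: x components in [0, forceStride),
// y components in [forceStride, 2*forceStride), z in [2*forceStride, 3*forceStride).
// A minimal host-side accessor for reference (the struct name is illustrative):
struct StridedForceView {
  const double* base;  // start of the strided force buffer
  int stride;          // forceStride used by the kernel above
  double x(int i) const { return base[i]; }
  double y(int i) const { return base[i + stride]; }
  double z(int i) const { return base[i + 2 * stride]; }
};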
  1692 void SequencerCUDAKernel::copy_bond_forces(int           numPatches,
  1694                                            double*       f_bond_nbond,
  1695                                            double*       f_bond_slow,
  1705                                            int           forceStride, //if stridedForces
  1707                                            const int*    patchIDs,
  1708                                            const int*    patchOffsets,
  1711                                            cudaStream_t  stream)
  1714     copyFBondToSOA<true, true><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
  1715       f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
  1716       f_nbond_x, f_nbond_y, f_nbond_z,
  1717       f_slow_x, f_slow_y, f_slow_z, forceStride,
  1718       pr, patchIDs, patchOffsets);
  1720      copyFBondToSOA<true, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
  1721        f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
  1722        f_nbond_x, f_nbond_y, f_nbond_z,
  1723        f_slow_x, f_slow_y, f_slow_z, forceStride,
  1724        pr, patchIDs, patchOffsets);
  1726      copyFBondToSOA<false, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
  1727        f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
  1728        f_nbond_x, f_nbond_y, f_nbond_z,
  1729        f_slow_x, f_slow_y, f_slow_z, forceStride,
  1730        pr, patchIDs, patchOffsets);
  1734 __global__ void copyFSlowToSOA(const CudaForce* __restrict__   f_slow,
  1738                                const int* __restrict__ patchOffsets,
  1739                                const Lattice* __restrict__ lattices)
  1741   int tid            = threadIdx.x;
  1742   int stride         = blockDim.x;
  1743   int patchOffset    = patchOffsets[blockIdx.x];
  1744   int numAtoms       = patchOffsets[blockIdx.x + 1] - patchOffset;
  1746   Lattice lat = lattices[0];
  1748   for(int i = tid; i < numAtoms; i += stride){
  1749     CudaForce f  = f_slow[patchOffset + i];
  1751     double3 f_scaled = lat.scale_force(
  1752       Vector((double)f.x, (double)f.y, (double)f.z));
  1754     // XXX Instead of accumulating slow force (+=), set them here (=)!
  1755     f_slow_x[patchOffset + i] = f_scaled.x;
  1756     f_slow_y[patchOffset + i] = f_scaled.y;
  1757     f_slow_z[patchOffset + i] = f_scaled.z;
  1761 void SequencerCUDAKernel::copy_slow_forces(int numPatches,
  1762   const CudaForce* f_slow,
  1766   const int* d_patchOffsets,
  1767   const Lattice *lattices,
  1768   cudaStream_t     stream){
  1770   copyFSlowToSOA<<<numPatches, PATCH_BLOCKS, 0, stream>>>(f_slow,
  1771     f_slow_x, f_slow_y, f_slow_z, d_patchOffsets, lattices);
  1774 __forceinline__ __device__ void zero_cudaTensor(cudaTensor &v)
  1787 template <int MAX_FORCE_NUMBER, bool DO_VEL_RESCALING, bool DO_FIXED>
  1788 __global__ void addForceToMomentumKernel(const double scaling,
  1792                                          double       velrescaling, // for stochastic velocity rescaling
  1793                                          const double  * __restrict recipMass,
  1794                                          const double * __restrict f_normal_x,
  1795                                          const double * __restrict f_normal_y,
  1796                                          const double * __restrict f_normal_z,
  1797                                          const double * __restrict f_nbond_x,
  1798                                          const double * __restrict f_nbond_y,
  1799                                          const double * __restrict f_nbond_z,
  1800                                          const double * __restrict f_slow_x,
  1801                                          const double * __restrict f_slow_y,
  1802                                          const double * __restrict f_slow_z,
  1803                                          double       * __restrict vel_x,
  1804                                          double       * __restrict vel_y,
  1805                                          double       * __restrict vel_z,
  1806                                          const int    * __restrict atomFixed,
  1809   int i = threadIdx.x + blockIdx.x*blockDim.x;
  1812     switch (maxForceNumber) {
  1815       vel_x[i] += f_slow_x[i] * recipMass[i] * dt_slow;
  1816       vel_y[i] += f_slow_y[i] * recipMass[i] * dt_slow;
  1817       vel_z[i] += f_slow_z[i] * recipMass[i] * dt_slow;
  1818       // fall through because we will always have the "faster" forces
  1820       dt_nbond *= scaling;
  1821       vel_x[i] += f_nbond_x[i] * recipMass[i] * dt_nbond;
  1822       vel_y[i] += f_nbond_y[i] * recipMass[i] * dt_nbond;
  1823       vel_z[i] += f_nbond_z[i] * recipMass[i] * dt_nbond;
  1824       // fall through because we will always have the "faster" forces
  1826       dt_normal *= scaling;
  1827       vel_x[i] += f_normal_x[i] * recipMass[i] * dt_normal;
  1828       vel_y[i] += f_normal_y[i] * recipMass[i] * dt_normal;
  1829       vel_z[i] += f_normal_z[i] * recipMass[i] * dt_normal;
  1835     switch (MAX_FORCE_NUMBER) {
  1837       vx += f_slow_x[i] * dt_slow;
  1838       vy += f_slow_y[i] * dt_slow;
  1839       vz += f_slow_z[i] * dt_slow;
  1840       // fall through because we will always have the "faster" forces
  1842       vx += f_nbond_x[i] * dt_nbond;
  1843       vy += f_nbond_y[i] * dt_nbond;
  1844       vz += f_nbond_z[i] * dt_nbond;
  1845       // fall through because we will always have the "faster" forces
  1847       vx += f_normal_x[i] * dt_normal;
  1848       vy += f_normal_y[i] * dt_normal;
  1849       vz += f_normal_z[i] * dt_normal;
  1851     vx *= scaling * recipMass[i];
  1852     vy *= scaling * recipMass[i];
  1853     vz *= scaling * recipMass[i];
  1855       if (DO_VEL_RESCALING) {
  1856         vel_x[i] = velrescaling*vel_x[i] + vx;
  1857         vel_y[i] = velrescaling*vel_y[i] + vy;
  1858         vel_z[i] = velrescaling*vel_z[i] + vz;
  1865     } else { // fixed atoms
  1866       if (!atomFixed[i]) {
  1867         if (DO_VEL_RESCALING) {
  1868           vel_x[i] = velrescaling*vel_x[i] + vx;
  1869           vel_y[i] = velrescaling*vel_y[i] + vy;
  1870           vel_z[i] = velrescaling*vel_z[i] + vz;
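// [Editorial sketch, not part of the original source] Scalar form of the per-component
// update performed by addForceToMomentumKernel: maxForceNumber selects how many force
// levels contribute (0 = normal only, 1 = +nonbonded, 2 = +slow), and velrescaling
// multiplies the old velocity only when stochastic velocity rescaling is active.
static inline double addForceToMomentumScalar(
    double vOld, double velrescaling, double scaling, double recipMass,
    double fNormal, double dtNormal, double fNbond, double dtNbond,
    double fSlow, double dtSlow, int maxForceNumber) {
  double dv = fNormal * dtNormal;
  if (maxForceNumber >= 1) dv += fNbond * dtNbond;
  if (maxForceNumber >= 2) dv += fSlow * dtSlow;
  return velrescaling * vOld + scaling * recipMass * dv;
}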
  1888 // JM: This sets the cudaAtom vector from the nonbonded compute
  1889 template <bool t_doNbond, bool t_doHomePatches>
  1890 __global__ void setComputePositionsKernel(
  1891   CudaLocalRecord*                  localRecords,
  1892   CudaPeerRecord*                   peerRecords,
  1893   const double3* __restrict         patchCenter, 
  1894   const int* __restrict             patchSortOrder, 
  1895   const int                         numPatches,
  1896   const unsigned int                devID, 
  1898   const double                      charge_scaling, 
  1899   const double* __restrict          d_pos_x, 
  1900   const double* __restrict          d_pos_y, 
  1901   const double* __restrict          d_pos_z,
  1902   const float* __restrict           charges,
  1903   double** __restrict               d_peer_pos_x, 
  1904   double** __restrict               d_peer_pos_y, 
  1905   double** __restrict               d_peer_pos_z, 
  1906   float4* __restrict                nb_atoms,
  1907   float4* __restrict                b_atoms,
  1908   float4* __restrict                s_atoms
  1911   __shared__ CudaLocalRecord s_record;
  1912   using AccessType = int32_t;
  1913   AccessType* s_record_buffer = (AccessType*)  &s_record;
  1915   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
  1916     // Read in the CudaLocalRecord using multiple threads
  1918     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
  1919       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
  1924     //int soapid = globalToLocalID[record.patchID];
  1925     center = patchCenter[patchIndex];
  1926     double3 ucenter = lat.unscale(center);
  1928     const int numAtoms = s_record.numAtoms;
  1929     const int dstOffset = s_record.bufferOffset;
  1930     const int dstOffsetNB = s_record.bufferOffsetNBPad;
  1932     int srcDevice, srcOffset;
  1933     const double *srcPosX, *srcPosY, *srcPosZ;
  1934     if (t_doHomePatches) {
  1936       srcOffset = dstOffset;
  1941       srcDevice = s_record.inline_peers[0].deviceIndex;
  1942       srcOffset = s_record.inline_peers[0].bufferOffset;
  1943       srcPosX = d_peer_pos_x[srcDevice];
  1944       srcPosY = d_peer_pos_y[srcDevice];
  1945       srcPosZ = d_peer_pos_z[srcDevice];
  1949     for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
  1950       const int order = patchSortOrder[dstOffset + i]; // Should this be order or unorder?
  1951       atom.x = srcPosX[srcOffset + order] - ucenter.x;
  1952       atom.y = srcPosY[srcOffset + order] - ucenter.y;
  1953       atom.z = srcPosZ[srcOffset + order] - ucenter.z;
  1954       atom.w = charges[dstOffset + order] * charge_scaling;
  1956       b_atoms[dstOffset + order] = atom;
  1957       if (t_doNbond) nb_atoms[dstOffsetNB + i] = atom;
  1961       if (threadIdx.x / WARPSIZE == 0) {
  1962         const int to_write = (((numAtoms+32-1)/32)*32) - numAtoms; // WARPSIZE
  1965         const int order = patchSortOrder[dstOffset + numAtoms - 1]; // Should this be order or unorder?
  1966         lastAtom.x = srcPosX[srcOffset + order] - ucenter.x;
  1967         lastAtom.y = srcPosY[srcOffset + order] - ucenter.y;
  1968         lastAtom.z = srcPosZ[srcOffset + order] - ucenter.z;
  1969         lastAtom.w = charges[dstOffset + order] * charge_scaling;
  1971         if (threadIdx.x < to_write) {
  1972           nb_atoms[dstOffsetNB+numAtoms+threadIdx.x] = lastAtom;
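// [Editorial sketch, not part of the original source] The padding above rounds the
// nonbonded atom count up to a multiple of the warp size and fills the tail with
// copies of the last atom, so downstream nonbonded kernels always read full warps.
// Round-up helper for reference (the kernel hard-codes 32; WARPSIZE may differ on HIP):
static constexpr int roundUpToWarp(int n, int warpSize = 32) {
  return ((n + warpSize - 1) / warpSize) * warpSize;
}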
  1980 template<bool t_doHomePatches, bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
  1981 __global__ void setComputePositionsKernel_PME (
  1982   const double* __restrict          d_pos_x, 
  1983   const double* __restrict          d_pos_y, 
  1984   const double* __restrict          d_pos_z,
  1985   const float* __restrict           charges,
  1986   double** __restrict               d_peer_pos_x, 
  1987   double** __restrict               d_peer_pos_y, 
  1988   double** __restrict               d_peer_pos_z, 
  1989   float** __restrict                d_peer_charge,
  1990   int** __restrict                  d_peer_partition,
  1991   const int* __restrict             partition,
  1992   const double                      charge_scaling, 
  1993   const int* __restrict             s_patchPositions,
  1994   const int* __restrict             s_pencilPatchIndex,
  1995   const int* __restrict             s_patchIDs,
  1997   float4* __restrict                s_atoms,
  1999   int* __restrict                   s_partition,
  2001   const int peerDevice,
  2004   const bool handleBoundary
  2009   const int pmeBufferOffset = offset;
  2010   const int srcBufferOffset = 0; 
  2011   const int srcDevice = peerDevice;
  2013   // double precision calculation
  2014   double px, py, pz, wx, wy, wz, q;
  2016   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numAtoms; i += blockDim.x * gridDim.x) {
  2017     // this gets atoms in the same order as the PmeCuda code path
  2018     if (t_doHomePatches) {
  2019       q = (double)(charges[srcBufferOffset + i]);
  2020       px = d_pos_x[srcBufferOffset + i];
  2021       py = d_pos_y[srcBufferOffset + i];
  2022       pz = d_pos_z[srcBufferOffset + i];
  2023       if (t_doFEP || t_doTI) {
  2024         part = partition[srcBufferOffset + i];
  2027       q = (double)(d_peer_charge[srcDevice][srcBufferOffset + i]);
  2028       px = d_peer_pos_x[srcDevice][srcBufferOffset + i];
  2029       py = d_peer_pos_y[srcDevice][srcBufferOffset + i];
  2030       pz = d_peer_pos_z[srcDevice][srcBufferOffset + i];
  2031       if (t_doFEP || t_doTI) {
  2032         part = d_peer_partition[srcDevice][srcBufferOffset + i];
  2036     double3 w_vec = lat.scale(Vector(px, py, pz));
  2041     if (handleBoundary) {
  2042       wx = (wx - (floor(wx + 0.5) - 0.5));
  2043       wy = (wy - (floor(wy + 0.5) - 0.5));
  2044       wz = (wz - (floor(wz + 0.5) - 0.5));
  2051     if (handleBoundary) {
  2052       foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
  2053       foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
  2054       foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
  2057     foo.w = (float) (charge_scaling * q);
  2058     if (!t_doFEP && !t_doTI) {
  2059       s_atoms[pmeBufferOffset + i] = foo;
  2061     else { // alchemical multiple grids
  2062       float4 foo_zero_charge = foo;
  2063       foo_zero_charge.w = 0.0f;
  2064       s_partition[pmeBufferOffset + i] = part;
  2065       /*                        grid 0      grid 1      grid 2      grid 3      grid 4
  2066       * non-alch     (p = 0)     Y           Y           N           N           Y
  2067       * appearing    (p = 1)     Y           N           Y           N           N
  2068       * disappearing (p = 2)     N           Y           N           Y           N
  2069       * Requirements of grids:
  2070       * 1. t_doFEP || t_doTI : grid 0, grid 1
  2071       * 2. t_doAlchDecouple : grid 2, grid 3
  2072       * 3. t_doAlchSoftCore || t_doTI: grid 4
  2073       * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
  2074       * Although the zero-charge atoms in the extra grids do not change on non-migration steps,
  2075       * there is no way to avoid this copying, because the atom positions can still change.
  2076       * The non-zero charges may also change if they come from a QM engine or a new kind of force field,
  2077       * so these branches on non-migration steps seem unavoidable.
  2082           s_atoms[pmeBufferOffset + i] = foo;
  2083           s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
  2084           if (t_doAlchDecouple) {
  2085             s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
  2086             s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
  2087             if (t_doAlchSoftCore || t_doTI) {
  2088               s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo;
  2091             if (t_doAlchSoftCore || t_doTI) {
  2092               s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
  2099           s_atoms[pmeBufferOffset + i] = foo;
  2100           s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo_zero_charge;
  2101           if (t_doAlchDecouple) {
  2102             s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
  2103             s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
  2104             if (t_doAlchSoftCore || t_doTI) {
  2105               s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
  2108             if (t_doAlchSoftCore || t_doTI) {
  2109               s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
  2114         // disappearing atoms
  2116           s_atoms[pmeBufferOffset + i] = foo_zero_charge;
  2117           s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
  2118           if (t_doAlchDecouple) {
  2119             s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
  2120             s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo;
  2121             if (t_doAlchSoftCore || t_doTI) {
  2122               s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
  2125             if (t_doAlchSoftCore || t_doTI) {
  2126               s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
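// [Editorial sketch, not part of the original source] The handleBoundary wrapping
// earlier in this kernel is equivalent to taking the fractional part of (w + 0.5),
// which places each scaled coordinate in [0, 1); the later
// "foo.x -= 1.0f*(foo.x >= 1.0f)" guards against the float conversion rounding a
// value up to exactly 1.0f. Host-side form, assuming floor from <cmath>/CUDA math:
static inline double wrapScaledCoordinate(double w) {
  return w - (floor(w + 0.5) - 0.5);  // == frac(w + 0.5), result in [0, 1)
}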
  2136 void SequencerCUDAKernel::set_compute_positions(
  2138   const bool isPmeDevice, 
  2140   const int  numPatchesHomeAndProxy,
  2141   const int  numPatchesHome,
  2146   const bool doAlchDecouple,
  2147   const bool doAlchSoftCore,
  2148   const bool handleBoundary,
  2149   const double* d_pos_x,
  2150   const double* d_pos_y,
  2151   const double* d_pos_z,
  2152 #ifndef NAMD_NCCL_ALLREDUCE
  2153   double**      d_peer_pos_x, 
  2154   double**      d_peer_pos_y, 
  2155   double**      d_peer_pos_z, 
  2156   float**       d_peer_charge, 
  2157   int**         d_peer_partition,
  2159   const float* charges,
  2160   const int* partition,
  2161   const double charge_scaling, 
  2162   const double3* patchCenter, 
  2163   const int* s_patchPositions, 
  2164   const int* s_pencilPatchIndex, 
  2165   const int* s_patchIDs, 
  2166   const int* patchSortOrder, 
  2167   const Lattice lattice,
  2173   CudaLocalRecord*                  localRecords,
  2174   CudaPeerRecord*                   peerRecords,
  2175   std::vector<int>& atomCounts,
  2176   cudaStream_t stream)
  2178   // Launch Local Set Compute Positions
  2180     setComputePositionsKernel<true, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
  2181       localRecords, peerRecords, patchCenter, patchSortOrder,
  2182       numPatchesHome, devID, lattice, charge_scaling,
  2183       d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, 
  2184       nb_atoms, b_atoms, s_atoms 
  2187     setComputePositionsKernel<false, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
  2188       localRecords, peerRecords, patchCenter, patchSortOrder,
  2189       numPatchesHome, devID, lattice, charge_scaling,
  2190       d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, 
  2191       nb_atoms, b_atoms, s_atoms 
  2195   // Launch Peer Set Computes
  2197     const int numProxyPatches = numPatchesHomeAndProxy - numPatchesHome;
  2199       setComputePositionsKernel<true, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
  2200         localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
  2201         numProxyPatches, devID, lattice, charge_scaling,
  2202         d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, 
  2203         nb_atoms, b_atoms, s_atoms 
  2206       setComputePositionsKernel<true, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
  2207         localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
  2208         numProxyPatches, devID, lattice, charge_scaling,
  2209         d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, 
  2210         nb_atoms, b_atoms, s_atoms
  2215   // Launch PME setComputes
  2216   #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
  2217     setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
  2218     <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
  2219           d_pos_x, d_pos_y, d_pos_z, charges, \
  2220           d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
  2221           d_peer_partition, partition, charge_scaling,  \
  2222           s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
  2223           lattice, s_atoms, numTotalAtoms, s_partition, \
  2224           i, numAtoms, offset, handleBoundary \
  2226   // Only runs when PME long-range electrostatics is enabled (doSlow is true).
  2227   // The partition array is needed for alchemical calculations.
  2228   if(doSlow && isPmeDevice) {
  2230     for (int i = 0; i < nDev; i++) {
  2231       const bool home = (i == devID);
  2232       const int numAtoms = atomCounts[i];
  2233       const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
  2234       const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
  2237         case  0: CALL(0, 0, 0, 0, 0); break;
  2238         case  1: CALL(0, 0, 0, 0, 1); break;
  2239         case  2: CALL(0, 0, 0, 1, 0); break;
  2240         case  3: CALL(0, 0, 0, 1, 1); break;
  2241         case  4: CALL(0, 0, 1, 0, 0); break;
  2242         case  5: CALL(0, 0, 1, 0, 1); break;
  2243         case  6: CALL(0, 0, 1, 1, 0); break;
  2244         case  7: CALL(0, 0, 1, 1, 1); break;
  2245         case  8: CALL(0, 1, 0, 0, 0); break;
  2246         case  9: CALL(0, 1, 0, 0, 1); break;
  2247         case 10: CALL(0, 1, 0, 1, 0); break;
  2248         case 11: CALL(0, 1, 0, 1, 1); break;
  2249         case 12: CALL(0, 1, 1, 0, 0); break;
  2250         case 13: CALL(0, 1, 1, 0, 1); break;
  2251         case 14: CALL(0, 1, 1, 1, 0); break;
  2252         case 15: CALL(0, 1, 1, 1, 1); break;
  2253         case 16: CALL(1, 0, 0, 0, 0); break;
  2254         case 17: CALL(1, 0, 0, 0, 1); break;
  2255         case 18: CALL(1, 0, 0, 1, 0); break;
  2256         case 19: CALL(1, 0, 0, 1, 1); break;
  2257         case 20: CALL(1, 0, 1, 0, 0); break;
  2258         case 21: CALL(1, 0, 1, 0, 1); break;
  2259         case 22: CALL(1, 0, 1, 1, 0); break;
  2260         case 23: CALL(1, 0, 1, 1, 1); break;
  2261         case 24: CALL(1, 1, 0, 0, 0); break;
  2262         case 25: CALL(1, 1, 0, 0, 1); break;
  2263         case 26: CALL(1, 1, 0, 1, 0); break;
  2264         case 27: CALL(1, 1, 0, 1, 1); break;
  2265         case 28: CALL(1, 1, 1, 0, 0); break;
  2266         case 29: CALL(1, 1, 1, 0, 1); break;
  2267         case 30: CALL(1, 1, 1, 1, 0); break;
  2268         case 31: CALL(1, 1, 1, 1, 1); break;
  2270           NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
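// [Editorial sketch, not part of the original source] The switch above dispatches on a
// 5-bit index packed as: bit 4 = home, bit 3 = doFEP, bit 2 = doTI,
// bit 1 = doAlchDecouple, bit 0 = doAlchSoftCore. A decoder for reference
// (struct and function names are illustrative only):
struct PmeDispatchFlags { bool home, fep, ti, alchDecouple, alchSoftCore; };
static inline PmeDispatchFlags decodePmeDispatch(int options) {
  PmeDispatchFlags f;
  f.home         = (options >> 4) & 1;
  f.fep          = (options >> 3) & 1;
  f.ti           = (options >> 2) & 1;
  f.alchDecouple = (options >> 1) & 1;
  f.alchSoftCore =  options       & 1;
  return f;
}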
  2279 template <bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
  2280 __global__ void setPmePositionsKernel(
  2281     double charge_scaling,
  2282     const Lattice lattice,
  2283     const double *pos_x,
  2284     const double *pos_y,
  2285     const double *pos_z,
  2286     const float *charges,
  2287     const int* partition,
  2289     int* s_atoms_partition,
  2292   int i = threadIdx.x + blockIdx.x*blockDim.x;
  2294     Lattice lat = lattice;
  2296     double wx, wy, wz, q;
  2297     q = (double)(charges[i]);
  2299     double3 w_vec = lat.scale(Vector(pos_x[i], pos_y[i], pos_z[i]));
  2304     wx = (wx - (floor(wx + 0.5) - 0.5));
  2305     wy = (wy - (floor(wy + 0.5) - 0.5));
  2306     wz = (wz - (floor(wz + 0.5) - 0.5));
  2310     foo.w = (float) (charge_scaling * q);
  2311     foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
  2312     foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
  2313     foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
  2314     if (!t_doFEP && !t_doTI) {
  2317     else { // alchemical multiple grids
  2318       float4 foo_zero_charge = foo;
  2319       foo_zero_charge.w = 0.0f;
  2320       s_atoms_partition[i] = partition[i];
  2321       /*                        grid 0      grid 1      grid 2      grid 3      grid 4
  2322        * non-alch     (p = 0)     Y           Y           N           N           Y
  2323        * appearing    (p = 1)     Y           N           Y           N           N
  2324        * disappearing (p = 2)     N           Y           N           Y           N
  2325        * Requirements of grids:
  2326        * 1. t_doFEP || t_doTI : grid 0, grid 1
  2327        * 2. t_doAlchDecouple : grid 2, grid 3
  2328        * 3. t_doAlchSoftCore || t_doTI: grid 4
  2329        * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
  2331       switch (partition[i]) {
  2335           s_atoms[i + numAtoms] = foo;
  2336           if (t_doAlchDecouple) {
  2337             s_atoms[i + 2 * numAtoms] = foo_zero_charge;
  2338             s_atoms[i + 3 * numAtoms] = foo_zero_charge;
  2339             if (t_doAlchSoftCore || t_doTI) {
  2340               s_atoms[i + 4 * numAtoms] = foo;
  2343             if (t_doAlchSoftCore || t_doTI) {
  2344               s_atoms[i + 2 * numAtoms] = foo;
  2352           s_atoms[i + numAtoms] = foo_zero_charge;
  2353           if (t_doAlchDecouple) {
  2354             s_atoms[i + 2 * numAtoms] = foo;
  2355             s_atoms[i + 3 * numAtoms] = foo_zero_charge;
  2356             if (t_doAlchSoftCore || t_doTI) {
  2357               s_atoms[i + 4 * numAtoms] = foo_zero_charge;
  2360             if (t_doAlchSoftCore || t_doTI) {
  2361               s_atoms[i + 2 * numAtoms] = foo_zero_charge;
  2366         // disappearing atoms
  2368           s_atoms[i] = foo_zero_charge;
  2369           s_atoms[i + numAtoms] = foo;
  2370           if (t_doAlchDecouple) {
  2371             s_atoms[i + 2 * numAtoms] = foo_zero_charge;
  2372             s_atoms[i + 3 * numAtoms] = foo;
  2373             if (t_doAlchSoftCore || t_doTI) {
  2374               s_atoms[i + 4 * numAtoms] = foo_zero_charge;
  2377             if (t_doAlchSoftCore || t_doTI) {
  2378               s_atoms[i + 2 * numAtoms] = foo_zero_charge;
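// [Editorial sketch, not part of the original source] Compact form of the grid table
// in the comment above: for partition p (0 = non-alch, 1 = appearing, 2 = disappearing)
// and grid g, the atom keeps its real charge when chargedInGrid(p, g) is true and a
// zero charge otherwise (helper name is illustrative; grid indices follow the table).
static inline bool chargedInGrid(int p, int g) {
  switch (g) {
    case 0: return p == 0 || p == 1;  // non-alch + appearing
    case 1: return p == 0 || p == 2;  // non-alch + disappearing
    case 2: return p == 1;            // appearing only (alchDecouple)
    case 3: return p == 2;            // disappearing only (alchDecouple)
    case 4: return p == 0;            // non-alch only (softcore/TI grid)
    default: return false;
  }
}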
  2388 __global__ void maximumMoveKernel(const double maxvel2,
  2389                                   const double * __restrict vel_x,
  2390                                   const double * __restrict vel_y,
  2391                                   const double * __restrict vel_z,
  2395   int i = threadIdx.x + blockIdx.x*blockDim.x;
  2397     double vel2 = vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i];
  2398     if (vel2 > maxvel2) {
  2399       // If this ever happens the simulation is already in trouble, so performance does not matter
  2400       atomicAdd(killme, 1);
  2405 void SequencerCUDAKernel::maximumMove(const double maxvel2,
  2406                                       const double *vel_x,
  2407                                       const double *vel_y,
  2408                                       const double *vel_z,
  2411                                       cudaStream_t stream)
  2413   //cudaCheck(cudaMemset(killme, 0, sizeof(int)));
  2414   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  2415   maximumMoveKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  2416     maxvel2, vel_x, vel_y, vel_z, killme, numAtoms);
  2419 void SequencerCUDAKernel::addForceToMomentum(
  2421   const double scaling,
  2425   double       velrescaling,  // for stochastic velocity rescaling
  2426   const double *recipMass,
  2427   const double *f_normal_x,
  2428   const double *f_normal_y,
  2429   const double *f_normal_z,
  2430   const double *f_nbond_x,
  2431   const double *f_nbond_y,
  2432   const double *f_nbond_z,
  2433   const double *f_slow_x,
  2434   const double *f_slow_y,
  2435   const double *f_slow_z,
  2439   const int    *atomFixed,
  2442   cudaStream_t stream)
  2444   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  2445   const bool doVelRescaling = (velrescaling != 1.0);
  2446   int kernelCalled = 0;
  2447 #define CALL(MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED) \
  2449   addForceToMomentumKernel<MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED> \
  2450   <<<grid, ATOM_BLOCKS, 0, stream>>>( \
  2451     scaling, dt_normal, dt_nbond, dt_slow, velrescaling, \
  2452     recipMass, f_normal_x, f_normal_y, f_normal_z, \
  2453     f_nbond_x, f_nbond_y, f_nbond_z, \
  2454     f_slow_x, f_slow_y, f_slow_z, \
  2455     vel_x, vel_y, vel_z, atomFixed, \
  2459   switch (maxForceNumber) {
  2461       if (doVelRescaling && doFixedAtoms)   CALL(2, true, true);
  2462       if (doVelRescaling && !doFixedAtoms)  CALL(2, true, false);
  2463       if (!doVelRescaling && !doFixedAtoms) CALL(2, false, false);
  2464       if (!doVelRescaling && doFixedAtoms) CALL(2, false, true);
  2468       if (doVelRescaling && doFixedAtoms)   CALL(1, true, true);
  2469       if (doVelRescaling && !doFixedAtoms)  CALL(1, true, false);
  2470       if (!doVelRescaling && !doFixedAtoms) CALL(1, false, false);
  2471       if (!doVelRescaling && doFixedAtoms) CALL(1, false, true);
  2475       if (doVelRescaling && doFixedAtoms)   CALL(0, true, true);
  2476       if (doVelRescaling && !doFixedAtoms)  CALL(0, true, false);
  2477       if (!doVelRescaling && !doFixedAtoms) CALL(0, false, false);
  2478       if (!doVelRescaling && doFixedAtoms) CALL(0, false, true);
  2483   if (kernelCalled != 1) NAMD_bug("Incorrect number of calls to kernel in SequencerCUDAKernel::addForceToMomentum!\n");
  2484   // cudaCheck(cudaGetLastError());
  2487 template <bool copyPos, bool DO_FIXED>
  2488 __global__ void addVelocityToPositionKernel(
  2490   const int*     __restrict atomFixed,
  2491   const double * __restrict vel_x,
  2492   const double * __restrict vel_y,
  2493   const double * __restrict vel_z,
  2494   double *       __restrict pos_x,
  2495   double *       __restrict pos_y,
  2496   double *       __restrict pos_z,
  2497   double *       __restrict h_pos_x, // host-mapped vectors
  2498   double *       __restrict h_pos_y,
  2499   double *       __restrict h_pos_z,
  2502   int i = threadIdx.x + blockIdx.x*blockDim.x;
  2509       if (!atomFixed[i]) {
  2530 void SequencerCUDAKernel::addVelocityToPosition(
  2533   const int *atomFixed,
  2534   const double *vel_x,
  2535   const double *vel_y,
  2536   const double *vel_z,
  2545   cudaStream_t   stream)
  2547   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  2548 #define CALL(copyPos, DO_FIXED) \
  2549   addVelocityToPositionKernel<copyPos, DO_FIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
  2550     dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z, \
  2551     h_pos_x, h_pos_y, h_pos_z, numAtoms)
  2552   if ( copyPositions &&  doFixedAtoms) CALL(true, true);
  2553   else if (!copyPositions &&  doFixedAtoms) CALL(false, true);
  2554   else if ( copyPositions && !doFixedAtoms) CALL(true, false);
  2555   else if (!copyPositions && !doFixedAtoms) CALL(false, false);
  2556   else NAMD_bug("No kernel was called in SequencerCUDAKernel::addVelocityToPosition!\n");
  2558   //cudaCheck(cudaGetLastError());
  2561 template <bool doFixed>
  2562 __global__ void updateRigidArraysKernel(
  2564   const int *    __restrict atomFixed,
  2565   const double * __restrict vel_x,
  2566   const double * __restrict vel_y,
  2567   const double * __restrict vel_z,
  2568   const double * __restrict pos_x,
  2569   const double * __restrict pos_y,
  2570   const double * __restrict pos_z,
  2571   double *       __restrict velNew_x,
  2572   double *       __restrict velNew_y,
  2573   double *       __restrict velNew_z,
  2574   double *       __restrict posNew_x,
  2575   double *       __restrict posNew_y,
  2576   double *       __restrict posNew_z,
  2579   int i = threadIdx.x + blockIdx.x*blockDim.x;
  2581     double vx = vel_x[i];
  2582     double vy = vel_y[i];
  2583     double vz = vel_z[i];
  2585       posNew_x[i] = pos_x[i] + (vx * dt);
  2586       posNew_y[i] = pos_y[i] + (vy * dt);
  2587       posNew_z[i] = pos_z[i] + (vz * dt);
  2589       if (!atomFixed[i]) {
  2590         posNew_x[i] = pos_x[i] + (vx * dt);
  2591         posNew_y[i] = pos_y[i] + (vy * dt);
  2592         posNew_z[i] = pos_z[i] + (vz * dt);
  2594         posNew_x[i] = pos_x[i];
  2595         posNew_y[i] = pos_y[i];
  2596         posNew_z[i] = pos_z[i];
  2606 // JM NOTE: Optimize this kernel further to improve global memory access pattern
  2607 __global__ void centerOfMassKernel(
  2608   const double * __restrict coor_x,
  2609   const double * __restrict coor_y,
  2610   const double * __restrict coor_z,
  2611   double * __restrict cm_x, // center of mass is atom-sized
  2612   double * __restrict cm_y,
  2613   double * __restrict cm_z,
  2614   const float * __restrict mass,
  2615   const int * __restrict hydrogenGroupSize,
  2618   int i = threadIdx.x + blockIdx.x*blockDim.x;
  2620     int hgs = hydrogenGroupSize[i];
  2623       // missing fixed atoms
  2625       BigReal reg_cm_x = 0;
  2626       BigReal reg_cm_y = 0;
  2627       BigReal reg_cm_z = 0;
  2628       for ( j = i; j < (i+hgs); ++j ) {
  2630         reg_cm_x += mass[j] * coor_x[j];
  2631         reg_cm_y += mass[j] * coor_y[j];
  2632         reg_cm_z += mass[j] * coor_z[j];
  2634       BigReal inv_m_cm = 1.0/m_cm;
  2635       reg_cm_x *= inv_m_cm;
  2636       reg_cm_y *= inv_m_cm;
  2637       reg_cm_z *= inv_m_cm;
  2639       for(j = i ; j < (i + hgs); j++){
  2648 void SequencerCUDAKernel::centerOfMass(
  2649   const double *coor_x,
  2650   const double *coor_y,
  2651   const double *coor_z,
  2656   const int* hydrogenGroupSize,
  2660   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  2662   centerOfMassKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(coor_x, coor_y, coor_z,
  2663       cm_x, cm_y, cm_z, mass, hydrogenGroupSize, numAtoms);
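// [Editorial sketch, not part of the original source] CPU reference for the per-group
// center of mass computed by centerOfMassKernel, assuming hydrogenGroupSize is nonzero
// at the first atom of each hydrogen group and zero elsewhere, as the kernel implies.
static void centerOfMassHostReference(const double* x, const double* y, const double* z,
                                      double* cmx, double* cmy, double* cmz,
                                      const float* mass, const int* hydrogenGroupSize,
                                      int numAtoms) {
  for (int i = 0; i < numAtoms; ) {
    const int hgs = hydrogenGroupSize[i];
    if (hgs <= 0) { ++i; continue; }  // skip non-parent atoms defensively
    double m = 0, rx = 0, ry = 0, rz = 0;
    for (int j = i; j < i + hgs; ++j) {
      m  += mass[j];
      rx += mass[j] * x[j];
      ry += mass[j] * y[j];
      rz += mass[j] * z[j];
    }
    const double inv = 1.0 / m;
    // every atom of the group stores the same group center of mass
    for (int j = i; j < i + hgs; ++j) {
      cmx[j] = rx * inv; cmy[j] = ry * inv; cmz[j] = rz * inv;
    }
    i += hgs;
  }
}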
  2667 void SequencerCUDAKernel::updateRigidArrays(
  2670   const int    *atomFixed,
  2671   const double *vel_x,
  2672   const double *vel_y,
  2673   const double *vel_z,
  2674   const double *pos_x,
  2675   const double *pos_y,
  2676   const double *pos_z,
  2684   cudaStream_t   stream)
  2686   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  2688     updateRigidArraysKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
  2689       dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
  2690       velNew_x, velNew_y, velNew_z,
  2691       posNew_x, posNew_y, posNew_z, numAtoms);
  2693     updateRigidArraysKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
  2694       dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
  2695       velNew_x, velNew_y, velNew_z,
  2696       posNew_x, posNew_y, posNew_z, numAtoms);
  2697   // cudaCheck(cudaGetLastError());
  2700 template<int BLOCK_SIZE, bool DO_FIXED>
  2701 __global__ void submitHalfKernel(
  2702   // const int* __restrict atomFixed,
  2703   const double * __restrict vel_x,
  2704   const double * __restrict vel_y,
  2705   const double * __restrict vel_z,
  2706   const double * __restrict vcm_x,
  2707   const double * __restrict vcm_y,
  2708   const double * __restrict vcm_z,
  2709   const float  * __restrict mass,
  2710   BigReal *kineticEnergy,
  2711   BigReal *intKineticEnergy,
  2713   cudaTensor *intVirialNormal,
  2714   BigReal *h_kineticEnergy,
  2715   BigReal *h_intKineticEnergy,
  2716   cudaTensor *h_virial,
  2717   cudaTensor *h_intVirialNormal,
  2718   int *hydrogenGroupSize,
  2720   unsigned int* tbcatomic)
  2722   BigReal k = 0, intK = 0;
  2725   zero_cudaTensor(intV);
  2726   int i = threadIdx.x + blockIdx.x*blockDim.x;
  2727   int totaltb = gridDim.x;
  2728   __shared__ bool isLastBlockDone;
  2731   if(threadIdx.x == 0){
  2732     isLastBlockDone = 0;
  2739       (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
  2740     v.xx = mass[i] * vel_x[i] * vel_x[i];
  2742       v.xy = mass[i] * vel_x[i] * vel_y[i];
  2743       v.xz = mass[i] * vel_x[i] * vel_z[i];
  2745     v.yx = mass[i] * vel_y[i] * vel_x[i];
  2746     v.yy = mass[i] * vel_y[i] * vel_y[i];
  2748       v.yz = mass[i] * vel_y[i] * vel_z[i];
  2750     v.zx = mass[i] * vel_z[i] * vel_x[i];
  2751     v.zy = mass[i] * vel_z[i] * vel_y[i];
  2752     v.zz = mass[i] * vel_z[i] * vel_z[i];
  2755     int hgs = hydrogenGroupSize[i];
  2761       for (int j = i;  j < (i+hgs);  j++) {
  2763         v_cm_x += mass[j] * vel_x[j];
  2764         v_cm_y += mass[j] * vel_y[j];
  2765         v_cm_z += mass[j] * vel_z[j];
  2767       BigReal recip_m_cm = 1.0 / m_cm;
  2768       v_cm_x *= recip_m_cm;
  2769       v_cm_y *= recip_m_cm;
  2770       v_cm_z *= recip_m_cm;
  2772       for (int j = i;  j < (i+hgs);  j++) {
  2773         BigReal dv_x = vel_x[j] - v_cm_x;
  2774         BigReal dv_y = vel_y[j] - v_cm_y;
  2775         BigReal dv_z = vel_z[j] - v_cm_z;
  2777           (vel_x[j] * dv_x + vel_y[j] * dv_y + vel_z[j] * dv_z);
  2778         intV.xx += mass[j] * vel_x[j] * dv_x;
  2779         intV.xy += mass[j] * vel_x[j] * dv_y;
  2780         intV.xz += mass[j] * vel_x[j] * dv_z;
  2781         intV.yx += mass[j] * vel_y[j] * dv_x;
  2782         intV.yy += mass[j] * vel_y[j] * dv_y;
  2783         intV.yz += mass[j] * vel_y[j] * dv_z;
  2784         intV.zx += mass[j] * vel_z[j] * dv_x;
  2785         intV.zy += mass[j] * vel_z[j] * dv_y;
  2786         intV.zz += mass[j] * vel_z[j] * dv_z;
  2790     // JM: New version with centers of mass calculated by a separate kernel
  2791     BigReal v_cm_x = vcm_x[i];
  2792     BigReal v_cm_y = vcm_y[i];
  2793     BigReal v_cm_z = vcm_z[i];
  2794     BigReal dv_x = vel_x[i] - v_cm_x;
  2795     BigReal dv_y = vel_y[i] - v_cm_y;
  2796     BigReal dv_z = vel_z[i] - v_cm_z;
  2798         (vel_x[i] * dv_x + vel_y[i] * dv_y + vel_z[i] * dv_z);
  2799     intV.xx += mass[i] * vel_x[i] * dv_x;
  2800     intV.xy += mass[i] * vel_x[i] * dv_y;
  2801     intV.xz += mass[i] * vel_x[i] * dv_z;
  2802     intV.yx += mass[i] * vel_y[i] * dv_x;
  2803     intV.yy += mass[i] * vel_y[i] * dv_y;
  2804     intV.yz += mass[i] * vel_y[i] * dv_z;
  2805     intV.zx += mass[i] * vel_z[i] * dv_x;
  2806     intV.zy += mass[i] * vel_z[i] * dv_y;
  2807     intV.zz += mass[i] * vel_z[i] * dv_z;
  2811   typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
  2812   __shared__ typename BlockReduce::TempStorage temp_storage;
  2813   k    = BlockReduce(temp_storage).Sum(k);
  2815   v.xx = BlockReduce(temp_storage).Sum(v.xx);
  2818     v.xy = BlockReduce(temp_storage).Sum(v.xy);
  2820     v.xz = BlockReduce(temp_storage).Sum(v.xz);
  2823   v.yx = BlockReduce(temp_storage).Sum(v.yx);
  2825   v.yy = BlockReduce(temp_storage).Sum(v.yy);
  2828     v.yz = BlockReduce(temp_storage).Sum(v.yz);
  2831   v.zx = BlockReduce(temp_storage).Sum(v.zx);
  2833   v.zy = BlockReduce(temp_storage).Sum(v.zy);
  2835   v.zz = BlockReduce(temp_storage).Sum(v.zz);
  2837   intK = BlockReduce(temp_storage).Sum(intK);
  2839   intV.xx = BlockReduce(temp_storage).Sum(intV.xx);
  2841   intV.xy = BlockReduce(temp_storage).Sum(intV.xy);
  2843   intV.xz = BlockReduce(temp_storage).Sum(intV.xz);
  2845   intV.yx = BlockReduce(temp_storage).Sum(intV.yx);
  2847   intV.yy = BlockReduce(temp_storage).Sum(intV.yy);
  2849   intV.yz = BlockReduce(temp_storage).Sum(intV.yz);
  2851   intV.zx = BlockReduce(temp_storage).Sum(intV.zx);
  2853   intV.zy = BlockReduce(temp_storage).Sum(intV.zy);
  2855   intV.zz = BlockReduce(temp_storage).Sum(intV.zz);
  2858   if (threadIdx.x == 0) {
  2859     const int bin = blockIdx.x % ATOMIC_BINS;
  2861     atomicAdd(&kineticEnergy[bin], k);
  2862     atomicAdd(&virial[bin].xx, v.xx);
  2864       atomicAdd(&virial[bin].xy, v.xy);
  2865       atomicAdd(&virial[bin].xz, v.xz);
  2867     atomicAdd(&virial[bin].yx, v.yx);
  2868     atomicAdd(&virial[bin].yy, v.yy);
  2870       atomicAdd(&virial[bin].yz, v.yz);
  2872     atomicAdd(&virial[bin].zx, v.zx);
  2873     atomicAdd(&virial[bin].zy, v.zy);
  2874     atomicAdd(&virial[bin].zz, v.zz);
  2875     atomicAdd(&intKineticEnergy[bin], intK);
  2876     atomicAdd(&intVirialNormal[bin].xx, intV.xx);
  2877     atomicAdd(&intVirialNormal[bin].xy, intV.xy);
  2878     atomicAdd(&intVirialNormal[bin].xz, intV.xz);
  2879     atomicAdd(&intVirialNormal[bin].yx, intV.yx);
  2880     atomicAdd(&intVirialNormal[bin].yy, intV.yy);
  2881     atomicAdd(&intVirialNormal[bin].yz, intV.yz);
  2882     atomicAdd(&intVirialNormal[bin].zx, intV.zx);
  2883     atomicAdd(&intVirialNormal[bin].zy, intV.zy);
  2884     atomicAdd(&intVirialNormal[bin].zz, intV.zz);
  2887     unsigned int value = atomicInc(&tbcatomic[1], totaltb);
  2888     isLastBlockDone = (value == (totaltb -1));
  2893   if(isLastBlockDone){
  2894     if(threadIdx.x < ATOMIC_BINS){
  2895       const int bin = threadIdx.x;
  2897       double k = kineticEnergy[bin];
  2898       double intK = intKineticEnergy[bin];
  2899       cudaTensor v = virial[bin];
  2900       cudaTensor intV = intVirialNormal[bin];
  2902       // sets device scalars back to zero
  2903       kineticEnergy[bin] = 0.0;
  2904       intKineticEnergy[bin] = 0.0;
  2905       zero_cudaTensor(virial[bin]);
  2906       zero_cudaTensor(intVirialNormal[bin]);
  2908       if(ATOMIC_BINS > 1){
  2909         typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
  2910         typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
  2911         __shared__ typename WarpReduce::TempStorage tempStorage;
  2912         __shared__ typename WarpReduceT::TempStorage tempStorageT;
  2914         k = WarpReduce(tempStorage).Sum(k);
  2915         intK = WarpReduce(tempStorage).Sum(intK);
  2916         v = WarpReduceT(tempStorageT).Sum(v);
  2917         intV = WarpReduceT(tempStorageT).Sum(intV);
  2920       if(threadIdx.x == 0){
  2921         h_kineticEnergy[0] = k;
  2922         h_intKineticEnergy[0] = intK;
  2924         h_intVirialNormal[0] = intV;
  2926         //resets atomic counter
  2927         reset_atomic_counter(&tbcatomic[1]);
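// [Editorial sketch, not part of the original source] Minimal form of the
// "last block done" pattern used by submitHalfKernel: every block bumps a global
// counter with atomicInc after a __threadfence(), and the block that observes
// (gridDim.x - 1) knows all other blocks have committed their partial results.
__global__ void lastBlockDoneSketch(unsigned int* counter, int* flag) {
  __shared__ bool isLastBlockDone;
  // ... per-block work and atomicAdd of partial sums would go here ...
  __threadfence();
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(counter, gridDim.x);
    isLastBlockDone = (ticket == gridDim.x - 1);
  }
  __syncthreads();
  if (isLastBlockDone && threadIdx.x == 0) {
    *flag = 1;      // final reduction / host notification happens here
    *counter = 0;   // reset for the next launch
  }
}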
  2934 void SequencerCUDAKernel::submitHalf(
  2935   const bool doFixedAtoms,
  2936   const double *vel_x,
  2937   const double *vel_y,
  2938   const double *vel_z,
  2939   const double *vcm_x,
  2940   const double *vcm_y,
  2941   const double *vcm_z,
  2943   BigReal *kineticEnergy,
  2944   BigReal *intKineticEnergy,
  2946   cudaTensor *intVirialNormal,
  2947   BigReal *h_kineticEnergy,
  2948   BigReal *h_intKineticEnergy,
  2949   cudaTensor *h_virial,
  2950   cudaTensor *h_intVirialNormal,
  2951   int *hydrogenGroupSize,
  2953   unsigned int* tbcatomic,
  2954   cudaStream_t stream)
  2956   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  2958     submitHalfKernel<ATOM_BLOCKS, true><<<grid, ATOM_BLOCKS, 0, stream>>>(
  2959       vel_x, vel_y, vel_z,
  2960       vcm_x, vcm_y, vcm_z, mass,
  2961       kineticEnergy, intKineticEnergy,
  2962       virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
  2963       h_virial, h_intVirialNormal,
  2964       hydrogenGroupSize, numAtoms, tbcatomic);
  2966     submitHalfKernel<ATOM_BLOCKS, false><<<grid, ATOM_BLOCKS, 0, stream>>>(
  2967       vel_x, vel_y, vel_z,
  2968       vcm_x, vcm_y, vcm_z, mass,
  2969       kineticEnergy, intKineticEnergy,
  2970       virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
  2971       h_virial, h_intVirialNormal,
  2972       hydrogenGroupSize, numAtoms, tbcatomic);
  2973   //cudaCheck(cudaGetLastError());
  2976 __global__ void scaleCoordinateUseGroupPressureKernel(
  2977   double * __restrict pos_x,
  2978   double * __restrict pos_y,
  2979   double * __restrict pos_z,
  2981   int *hydrogenGroupSize,
  2986   int i = threadIdx.x + blockIdx.x * blockDim.x;
  2988     int hgs = hydrogenGroupSize[i];
  2991       // missing fixed atoms implementation
  2996       // calculate the center of mass
  2997       for ( j = i; j < (i+hgs); ++j ) {
  2999         r_cm_x += mass[j] * pos_x[j];
  3000         r_cm_y += mass[j] * pos_y[j];
  3001         r_cm_z += mass[j] * pos_z[j];
  3003       BigReal inv_m_cm = 1.0/m_cm;
  3007       // scale the center of mass with factor
  3009       double tx = r_cm_x - origin.x;
  3010       double ty = r_cm_y - origin.y;
  3011       double tz = r_cm_z - origin.z;
  3012       // apply transformation 
  3013       double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
  3014       double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
  3015       double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
  3017       new_r_cm_x += origin.x;
  3018       new_r_cm_y += origin.y;
  3019       new_r_cm_z += origin.z;
  3020       // translation vector from the old COM to the new COM
  3021       double delta_r_cm_x = new_r_cm_x - r_cm_x;
  3022       double delta_r_cm_y = new_r_cm_y - r_cm_y;
  3023       double delta_r_cm_z = new_r_cm_z - r_cm_z;
  3024       // shift the hydrogen group with translation vector
  3025       for (j = i;  j < (i+hgs);  j++) {
  3026         pos_x[j] += delta_r_cm_x;
  3027         pos_y[j] += delta_r_cm_y;
  3028         pos_z[j] += delta_r_cm_z;
  3034 __global__ void scaleCoordinateNoGroupPressureKernel(
  3035   double * __restrict pos_x,
  3036   double * __restrict pos_y,
  3037   double * __restrict pos_z,
  3042   int i = threadIdx.x + blockIdx.x * blockDim.x;
  3044     // missing fixed atoms implementation
  3046     double tx = pos_x[i] - origin.x;
  3047     double ty = pos_y[i] - origin.y;
  3048     double tz = pos_z[i] - origin.z;
  3049     // apply transformation 
  3050     double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
  3051     double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
  3052     double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
  3054     pos_x[i] = ftx + origin.x;
  3055     pos_y[i] = fty + origin.y;
  3056     pos_z[i] = ftz + origin.z;
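// [Editorial sketch, not part of the original source] Both scaling kernels above apply
// the same affine map to a point r: r' = factor * (r - origin) + origin, with factor
// the 3x3 pressure-scaling tensor. Plain-double version for reference (row-major m[9],
// not the cudaTensor/cudaVector types used by the kernels):
static inline void scaleAboutOrigin(double& x, double& y, double& z,
                                    const double m[9],
                                    double ox, double oy, double oz) {
  const double tx = x - ox, ty = y - oy, tz = z - oz;
  x = m[0]*tx + m[1]*ty + m[2]*tz + ox;
  y = m[3]*tx + m[4]*ty + m[5]*tz + oy;
  z = m[6]*tx + m[7]*ty + m[8]*tz + oz;
}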
  3060 __global__ void SetAtomIndexOrderKernel(
  3065   int i = threadIdx.x + blockIdx.x * blockDim.x;
  3067     int atomIndex = id[i];
  3068     idOrder[atomIndex] = i;
  3072 __global__ void scaleCoordinateUsingGCKernel(
  3073   double* __restrict pos_x, 
  3074   double* __restrict pos_y, 
  3075   double* __restrict pos_z, 
  3076   const int* __restrict idOrder, 
  3077   const int* __restrict moleculeStartIndex,
  3078   const int* __restrict moleculeAtom, 
  3079   const cudaTensor factor, 
  3080   const cudaVector origin, 
  3081   const Lattice oldLattice,
  3082   const Lattice newLattice,
  3083   const char3* __restrict transform,
  3084   const int numMolecules,
  3085   const int numLargeMolecules)
  3087   // missing fixed atoms implementation
  3088   int startIdx, endIdx, i, j, jmapped, atomIndex;
  3089   double3 position, r_gc, new_r_gc, delta_r_gc;
  3091   r_gc.x = 0; r_gc.y = 0; r_gc.z = 0;
  3092   if (blockIdx.x < numLargeMolecules){ //large molecule case
  3094     startIdx = moleculeStartIndex[i];
  3095     endIdx = moleculeStartIndex[i + 1];
  3096     __shared__ double3 sh_gc;
  3097     double inv_length = 1.0/(double)(endIdx - startIdx);
  3098     // calculate the geometric center
  3099     for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
  3100       atomIndex = moleculeAtom[j];
  3101       jmapped = idOrder[atomIndex];
  3102       tr = transform[jmapped];
  3103       position.x = pos_x[jmapped];
  3104       position.y = pos_y[jmapped];
  3105       position.z = pos_z[jmapped];
  3106       //Unwrap the coordinate with oldLattice
  3107       position = oldLattice.reverse_transform(position ,tr);
  3108       r_gc.x += position.x;
  3109       r_gc.y += position.y;
  3110       r_gc.z += position.z;
  3113     typedef cub::BlockReduce<double, 64> BlockReduce;
  3114     __shared__ typename BlockReduce::TempStorage temp_storage;
  3116     r_gc.x = BlockReduce(temp_storage).Sum(r_gc.x);
  3118     r_gc.y = BlockReduce(temp_storage).Sum(r_gc.y);
  3120     r_gc.z = BlockReduce(temp_storage).Sum(r_gc.z);
  3123     if (threadIdx.x == 0) {   
  3124       sh_gc.x = r_gc.x * inv_length;
  3125       sh_gc.y = r_gc.y * inv_length;
  3126       sh_gc.z = r_gc.z * inv_length;
  3130     // scale the geometric center with factor 
  3132     double tx = sh_gc.x - origin.x;
  3133     double ty = sh_gc.y - origin.y;
  3134     double tz = sh_gc.z - origin.z;
  3135     // apply transformation 
  3136     new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
  3137     new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
  3138     new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
  3140     new_r_gc.x += origin.x;
  3141     new_r_gc.y += origin.y;
  3142     new_r_gc.z += origin.z;
  3143     // translation vector from old GC to new GC
  3144     delta_r_gc.x = new_r_gc.x - sh_gc.x;
  3145     delta_r_gc.y = new_r_gc.y - sh_gc.y;
  3146     delta_r_gc.z = new_r_gc.z - sh_gc.z;
  3148     // shift the atoms in molecule with translation vector
  3149     for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
  3150       atomIndex = moleculeAtom[j];
  3151       jmapped = idOrder[atomIndex];
  3152       tr = transform[jmapped];
  3153       position.x = pos_x[jmapped];
  3154       position.y = pos_y[jmapped];
  3155       position.z = pos_z[jmapped];
  3156       //Unwrap the coordinate with oldLattice
  3157       position = oldLattice.reverse_transform(position, tr);
  3158       position.x += delta_r_gc.x;
  3159       position.y += delta_r_gc.y;
  3160       position.z += delta_r_gc.z;
  3161       // wrap the coordinate with new lattice parameter
  3162       position = newLattice.apply_transform(position, tr);
  3163       pos_x[jmapped] = position.x;
  3164       pos_y[jmapped] = position.y;
  3165       pos_z[jmapped] = position.z;
  3167   } else { //Small molecule
  3168     // the first numLargeMolecules blocks have been assigned to large molecules
  3169     i = numLargeMolecules + threadIdx.x + 
  3170         (blockIdx.x - numLargeMolecules) * blockDim.x;
  3172     if (i < numMolecules) {
  3173       startIdx = moleculeStartIndex[i];
  3174       endIdx = moleculeStartIndex[i+1];
  3175       double inv_length = 1.0/(double)(endIdx - startIdx);
  3177       // calculate the geometric center
  3178       for ( j = startIdx; j < endIdx; j++ ) {
  3179         atomIndex = moleculeAtom[j];
  3180         jmapped = idOrder[atomIndex];
  3181         tr = transform[jmapped];
  3182         position.x = pos_x[jmapped];
  3183         position.y = pos_y[jmapped];
  3184         position.z = pos_z[jmapped];
  3185         //Unwrap the coordinate with oldLattice
  3186         position = oldLattice.reverse_transform(position, tr);
  3187         r_gc.x += position.x;
  3188         r_gc.y += position.y;
  3189         r_gc.z += position.z;
  3192       r_gc.x *= inv_length;
  3193       r_gc.y *= inv_length;
  3194       r_gc.z *= inv_length;
  3196       // scale the geometric center with factor 
  3198       double tx = r_gc.x - origin.x;
  3199       double ty = r_gc.y - origin.y;
  3200       double tz = r_gc.z - origin.z;
  3201       // apply transformation 
  3202       new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
  3203       new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
  3204       new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
  3206       new_r_gc.x += origin.x;
  3207       new_r_gc.y += origin.y;
  3208       new_r_gc.z += origin.z;
  3209       // translation vector from old GC to new GC
  3210       delta_r_gc.x = new_r_gc.x - r_gc.x;
  3211       delta_r_gc.y = new_r_gc.y - r_gc.y;
  3212       delta_r_gc.z = new_r_gc.z - r_gc.z;
  3214       // shift the atoms in molecule with translation vector
  3215       for (j = startIdx; j < endIdx; j++) {
  3216         atomIndex = moleculeAtom[j];
  3217         jmapped = idOrder[atomIndex];
  3219         tr = transform[jmapped];
  3220         position.x = pos_x[jmapped];
  3221         position.y = pos_y[jmapped];
  3222         position.z = pos_z[jmapped];
  3223         // unwrap the coordinate with oldLattice
  3224         position = oldLattice.reverse_transform(position, tr);
  3225         position.x += delta_r_gc.x;
  3226         position.y += delta_r_gc.y;
  3227         position.z += delta_r_gc.z;
  3228         // wrap the coordinate with new lattice parameter
  3229         position = newLattice.apply_transform(position, tr);
  3230         pos_x[jmapped] = position.x;
  3231         pos_y[jmapped] = position.y;
  3232         pos_z[jmapped] = position.z;
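       // Langevin-piston cell rescaling with group pressure: each hydrogen group's
       // center of mass is scaled by `factor` about `origin`, every atom in the
       // group is shifted by the resulting displacement, and velocities receive the
       // (velFactor - 1) * v_cm correction. With doFixed, fixed groups and fixed
       // atoms are restored to their reference (lattice-transformed) positions.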
  3238 template <bool doFixed>
  3239 __global__ void langevinPistonUseGroupPressureKernel(
  3240   const int* __restrict atomFixed,
  3241   const int* __restrict groupFixed,
  3242   const Lattice lattice,
  3243   const char3* transform,
  3244   const double* __restrict fixedPosition_x,
  3245   const double* __restrict fixedPosition_y,
  3246   const double* __restrict fixedPosition_z,
  3247   double * __restrict pos_x,
  3248   double * __restrict pos_y,
  3249   double * __restrict pos_z,
  3250   double * __restrict vel_x,
  3251   double * __restrict vel_y,
  3252   double * __restrict vel_z,
  3254   int *hydrogenGroupSize,
  3262   int i = threadIdx.x + blockIdx.x*blockDim.x;
  3264     int hgs = hydrogenGroupSize[i];
  3268         if (groupFixed[i]) {
  3269           for ( j = i; j < (i+hgs); ++j ) {
  3270             const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
  3271             const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
  3272             pos_x[j] = pos_trans.x;
  3273             pos_y[j] = pos_trans.y;
  3274             pos_z[j] = pos_trans.z;
  3286       for ( j = i; j < (i+hgs); ++j ) {
  3288           if (atomFixed[j]) continue;
  3291         r_cm_x += mass[j] * pos_x[j];
  3292         r_cm_y += mass[j] * pos_y[j];
  3293         r_cm_z += mass[j] * pos_z[j];
  3294         v_cm_x += mass[j] * vel_x[j];
  3295         v_cm_y += mass[j] * vel_y[j];
  3296         v_cm_z += mass[j] * vel_z[j];
  3298       BigReal inv_m_cm = 1.0/m_cm;
  3303       double tx = r_cm_x - origin.x;
  3304       double ty = r_cm_y - origin.y;
  3305       double tz = r_cm_z - origin.z;
  3306       double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
  3307       double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
  3308       double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
  3309       new_r_cm_x += origin.x;
  3310       new_r_cm_y += origin.y;
  3311       new_r_cm_z += origin.z;
  3313       double delta_r_cm_x = new_r_cm_x - r_cm_x;
  3314       double delta_r_cm_y = new_r_cm_y - r_cm_y;
  3315       double delta_r_cm_z = new_r_cm_z - r_cm_z;
  3319       double delta_v_cm_x = ( velFactor_x - 1 ) * v_cm_x;
  3320       double delta_v_cm_y = ( velFactor_y - 1 ) * v_cm_y;
  3321       double delta_v_cm_z = ( velFactor_z - 1 ) * v_cm_z;
  3322       for (j = i;  j < (i+hgs);  j++) {
  3325             const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
  3326             const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
  3327             pos_x[j] = pos_trans.x;
  3328             pos_y[j] = pos_trans.y;
  3329             pos_z[j] = pos_trans.z;
  3333         pos_x[j] += delta_r_cm_x;
  3334         pos_y[j] += delta_r_cm_y;
  3335         pos_z[j] += delta_r_cm_z;
  3336         vel_x[j] += delta_v_cm_x;
  3337         vel_y[j] += delta_v_cm_y;
  3338         vel_z[j] += delta_v_cm_z;
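       // Per-atom variant used without group pressure: each atom's position is
       // scaled by `factor` about `origin` and its velocity components are scaled
       // by velFactor_{x,y,z}; with doFixed, fixed atoms are first restored to
       // their reference (lattice-transformed) positions.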
  3345 template <bool doFixed>
  3346 __global__ void langevinPistonNoGroupPressureKernel(
  3347   const int* __restrict atomFixed,
  3348   const Lattice lattice,
  3349   const char3* transform,
  3350   const double* __restrict fixedPosition_x,
  3351   const double* __restrict fixedPosition_y,
  3352   const double* __restrict fixedPosition_z,
  3353   double * __restrict pos_x,
  3354   double * __restrict pos_y,
  3355   double * __restrict pos_z,
  3356   double * __restrict vel_x,
  3357   double * __restrict vel_y,
  3358   double * __restrict vel_z,
  3366   int i = threadIdx.x + blockIdx.x*blockDim.x;
  3370         const double3 pos_origin{fixedPosition_x[i], fixedPosition_y[i], fixedPosition_z[i]};
  3371         const auto pos_trans = lattice.apply_transform(pos_origin, transform[i]);
  3372         pos_x[i] = pos_trans.x;
  3373         pos_y[i] = pos_trans.y;
  3374         pos_z[i] = pos_trans.z;
  3378     double tx = pos_x[i] - origin.x;
  3379     double ty = pos_y[i] - origin.y;
  3380     double tz = pos_z[i] - origin.z;
  3381     double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
  3382     double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
  3383     double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
  3384     pos_x[i] = ftx + origin.x;
  3385     pos_y[i] = fty + origin.y;
  3386     pos_z[i] = ftz + origin.z;
  3387     vel_x[i] *= velFactor_x;
  3388     vel_y[i] *= velFactor_y;
  3389     vel_z[i] *= velFactor_z;
  3393 void SequencerCUDAKernel::scaleCoordinateWithFactor(
  3398   int *hydrogenGroupSize,
  3401   int useGroupPressure,
  3403   cudaStream_t stream)
  3405   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;  
  3406   if (useGroupPressure) {
  3407     scaleCoordinateUseGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  3408       pos_x, pos_y, pos_z, mass, hydrogenGroupSize, factor, origin, numAtoms);
  3410     scaleCoordinateNoGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  3411       pos_x, pos_y, pos_z, factor, origin, numAtoms);
  3416 void SequencerCUDAKernel::SetAtomIndexOrder(
  3420   cudaStream_t stream)
  3422   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  3423   SetAtomIndexOrderKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  3424     id, idOrder, numAtoms);
  3427 void SequencerCUDAKernel::scaleCoordinateUsingGC(
  3432   const int *moleculeStartIndex,
  3433   const int *moleculeAtom,
  3434   const cudaTensor factor,
  3435   const cudaVector origin,
  3436   const Lattice oldLattice,
  3437   const Lattice newLattice,
  3438   const char3 *transform,
  3439   const int numMolecules,
  3440   const int numLargeMolecules,
  3441   cudaStream_t stream)
  3443  // We want one thread to compute the geometric center of each molecule
  3444  // with fewer than 128 atoms, and one thread block for each molecule
  3445  // with 128 atoms or more.
  3446   int numThreadsPerBlock = 64;
  3447   int grid = ((numMolecules - numLargeMolecules + numThreadsPerBlock - 1) / 
  3448     numThreadsPerBlock) + numLargeMolecules; 
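         // Illustrative sizing (hypothetical numbers): with numMolecules = 10005
         // and numLargeMolecules = 5, the 10000 small molecules need
         // ceil(10000/64) = 157 blocks, so grid = 157 + 5 = 162; blocks 0-4 each
         // reduce one large molecule, blocks 5-161 take one small molecule per thread.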
  3449   scaleCoordinateUsingGCKernel<<<grid, numThreadsPerBlock, 0, stream>>>(
  3450     pos_x, pos_y, pos_z, idOrder, moleculeStartIndex,
  3451     moleculeAtom, factor, origin, oldLattice, newLattice,
  3452     transform, numMolecules, numLargeMolecules);
  3456 void SequencerCUDAKernel::langevinPiston(
  3457   const bool doFixedAtoms,
  3458   const int* atomFixed,
  3459   const int* groupFixed,
  3460   const char3* transform,
  3461   const Lattice lattice,
  3462   const double* fixedPosition_x,
  3463   const double* fixedPosition_y,
  3464   const double* fixedPosition_z,
  3472   int *hydrogenGroupSize,
  3478   int useGroupPressure,
  3480   cudaStream_t stream)
  3482   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  3484     double *h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
  3485     double *h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
  3486     double *h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
  3488     double *h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
  3489     double *h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
  3490     double *h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
  3492     copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
  3493     copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
  3494     copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
  3496     copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
  3497     copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
  3498     copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
  3500     fprintf(stderr, "velFactors = %lf %lf %lf\n",
  3501       velFactor_x, velFactor_y, velFactor_z);
  3502     for(int i = 0; i < numAtoms; i++){
  3503       fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
  3504         h_pos_x[i], h_pos_y[i], h_pos_z[i],
  3505         h_vel_x[i], h_vel_y[i], h_vel_z[i]);
  3515   if (useGroupPressure) {
  3517       langevinPistonUseGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
  3518         atomFixed, groupFixed, lattice, transform,
  3519         fixedPosition_x, fixedPosition_y, fixedPosition_z,
  3520         pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
  3521         factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
  3523       langevinPistonUseGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
  3524         atomFixed, groupFixed, lattice, transform,
  3525         fixedPosition_x, fixedPosition_y, fixedPosition_z,
  3526         pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
  3527         factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
  3531       langevinPistonNoGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
  3532         atomFixed, lattice, transform,
  3533         fixedPosition_x, fixedPosition_y, fixedPosition_z,
  3534         pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
  3535         factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
  3537       langevinPistonNoGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
  3538         atomFixed, lattice, transform,
  3539         fixedPosition_x, fixedPosition_y, fixedPosition_z,
  3540         pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
  3541         factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
  3544   //cudaCheck(cudaGetLastError());
  3546   h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
  3547   h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
  3548   h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
  3550   h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
  3551   h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
  3552   h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
  3554   copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
  3555   copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
  3556   copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
  3558   copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
  3559   copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
  3560   copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
  3562   for(int i = 0; i < numAtoms; i++){
  3563     fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
  3564       h_pos_x[i], h_pos_y[i], h_pos_z[i],
  3565       h_vel_x[i], h_vel_y[i], h_vel_z[i]);
  3570 template <int BLOCK_SIZE>
  3571 __global__ void submitReduction1Kernel(
  3572   double * __restrict pos_x,
  3573   double * __restrict pos_y,
  3574   double * __restrict pos_z,
  3575   double * __restrict vel_x,
  3576   double * __restrict vel_y,
  3577   double * __restrict vel_z,
  3578   float  * __restrict mass,
  3579   // TODO: wrap these scalars in a struct
  3580   BigReal *kineticEnergy,
  3581   BigReal *momentum_x,
  3582   BigReal *momentum_y,
  3583   BigReal *momentum_z,
  3584   BigReal *angularMomentum_x,
  3585   BigReal *angularMomentum_y,
  3586   BigReal *angularMomentum_z,
  3590   BigReal *h_kineticEnergy,
  3591   BigReal *h_momentum_x,
  3592   BigReal *h_momentum_y,
  3593   BigReal *h_momentum_z,
  3594   BigReal *h_angularMomentum_x,
  3595   BigReal *h_angularMomentum_y,
  3596   BigReal *h_angularMomentum_z,
  3597   unsigned int* tbcatomic,
  3600   BigReal m_x = 0, m_y = 0, m_z = 0,
  3601     a_x = 0, a_y = 0, a_z = 0;
  3602   int i = threadIdx.x + blockIdx.x*blockDim.x;
  3603   int totaltb = gridDim.x;
  3604   __shared__ bool isLastBlockDone;
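         // Each thread accumulates momentum and angular momentum about the patch
         // origin; the partial sums are block-reduced with CUB, folded into
         // ATOMIC_BINS device bins, and the last block to finish (tracked through
         // tbcatomic[0]) writes the totals to the host-mapped h_* buffers.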
  3606  if(threadIdx.x == 0){
  3607    isLastBlockDone = false;
  3612     // scalar kineticEnergy += mass[i] * dot_product(vel[i], vel[i])
  3614     //  (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
  3616     // vector momentum += mass[i] * vel[i]
  3617     m_x += mass[i] * vel_x[i];
  3618     m_y += mass[i] * vel_y[i];
  3619     m_z += mass[i] * vel_z[i];
  3621     // vector dpos = pos[i] - origin
  3622     BigReal dpos_x = pos_x[i] - origin_x;
  3623     BigReal dpos_y = pos_y[i] - origin_y;
  3624     BigReal dpos_z = pos_z[i] - origin_z;
  3626     // vector angularMomentum += mass[i] * cross_product(dpos, vel[i])
  3627     a_x += mass[i] * (dpos_y*vel_z[i] - dpos_z*vel_y[i]);
  3628     a_y += mass[i] * (dpos_z*vel_x[i] - dpos_x*vel_z[i]);
  3629     a_z += mass[i] * (dpos_x*vel_y[i] - dpos_y*vel_x[i]);
  3631   typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
  3632   __shared__ typename BlockReduce::TempStorage temp_storage;
  3633   // k   = BlockReduce(temp_storage).Sum(k);
  3636   m_x = BlockReduce(temp_storage).Sum(m_x);
  3638   m_y = BlockReduce(temp_storage).Sum(m_y);
  3640   m_z = BlockReduce(temp_storage).Sum(m_z);
  3642   a_x = BlockReduce(temp_storage).Sum(a_x);
  3644   a_y = BlockReduce(temp_storage).Sum(a_y);
  3646   a_z = BlockReduce(temp_storage).Sum(a_z);
  3649   if (threadIdx.x == 0) {
  3650     const int bin = blockIdx.x % ATOMIC_BINS;
  3652     // atomicAdd(&kineticEnergy[bin], k);
  3653     atomicAdd(&momentum_x[bin], m_x);
  3654     atomicAdd(&momentum_y[bin], m_y);
  3655     atomicAdd(&momentum_z[bin], m_z);
  3656     atomicAdd(&angularMomentum_x[bin], a_x);
  3657     atomicAdd(&angularMomentum_y[bin], a_y);
  3658     atomicAdd(&angularMomentum_z[bin], a_z);
  3660     unsigned int value = atomicInc(&tbcatomic[0], totaltb);
  3661     isLastBlockDone = (value == (totaltb-1));
  3667   if(isLastBlockDone){
  3668     if(threadIdx.x < ATOMIC_BINS){
  3669       const int bin = threadIdx.x;
  3671       double m_x = momentum_x[bin];
  3672       double m_y = momentum_y[bin];
  3673       double m_z = momentum_z[bin];
  3674       double a_x = angularMomentum_x[bin];
  3675       double a_y = angularMomentum_y[bin];
  3676       double a_z = angularMomentum_z[bin];
  3678       // sets device scalars back to zero
  3679       kineticEnergy[0] = 0.0;
  3680       momentum_x[bin] = 0.0;
  3681       momentum_y[bin] = 0.0;
  3682       momentum_z[bin] = 0.0;
  3683       angularMomentum_x[bin] = 0.0;
  3684       angularMomentum_y[bin] = 0.0;
  3685       angularMomentum_z[bin] = 0.0;
  3687       if(ATOMIC_BINS > 1){
  3688         typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
  3689         __shared__ typename WarpReduce::TempStorage tempStorage;
  3691         m_x = WarpReduce(tempStorage).Sum(m_x);
  3692         m_y = WarpReduce(tempStorage).Sum(m_y);
  3693         m_z = WarpReduce(tempStorage).Sum(m_z);
  3694         a_x = WarpReduce(tempStorage).Sum(a_x);
  3695         a_y = WarpReduce(tempStorage).Sum(a_y);
  3696         a_z = WarpReduce(tempStorage).Sum(a_z);
  3699       if(threadIdx.x == 0){
  3700         h_momentum_x[0] = m_x;
  3701         h_momentum_y[0] = m_y;
  3702         h_momentum_z[0] = m_z;
  3703         h_angularMomentum_x[0] = a_x;
  3704         h_angularMomentum_y[0] = a_y;
  3705         h_angularMomentum_z[0] = a_z;
  3707         //resets atomic counter
  3708         reset_atomic_counter(&tbcatomic[0]);
  3716 // JM: does addForceToMomentum, maximumMove, and addVelocityToPosition
  3717 template <bool DO_VEL_RESCALING, bool DO_FIXED>
  3718 __global__ void velocityVerlet1Kernel(
  3720    const double scaling,
  3721    const double dt_normal,
  3722    const double dt_nbond,
  3723    const double dt_slow,
  3724    const double velrescaling,   // for stochastic velocity rescaling
  3725    const double* __restrict recipMass,
  3726    double* __restrict  vel_x,
  3727    double* __restrict  vel_y,
  3728    double* __restrict  vel_z,
  3729    const double maxvel2,
  3731    double* __restrict pos_x,
  3732    double* __restrict pos_y,
  3733    double* __restrict pos_z,
  3737    double* __restrict f_normal_x,
  3738    double* __restrict f_normal_y,
  3739    double* __restrict f_normal_z,
  3740    double* __restrict f_nbond_x,
  3741    double* __restrict f_nbond_y,
  3742    double* __restrict f_nbond_z,
  3743    double* __restrict f_slow_x,
  3744    double* __restrict f_slow_y,
  3745    double* __restrict f_slow_z,
  3746    const int* atomFixed,
  3748    const int maxForceNumber
  3750    // fusion of addForceToMomentum, maximumMove, addVelocityToPosition
  3751   double dt, dt_b, dt_nb, dt_s;
  3752   int i = threadIdx.x + blockIdx.x*blockDim.x;
  3756     double velx = vel_x[i];
  3757     double vely = vel_y[i];
  3758     double velz = vel_z[i];
  3759     if (DO_VEL_RESCALING) {
  3760       velx *= velrescaling;
  3761       vely *= velrescaling;
  3762       velz *= velrescaling;
  3764     // JM NOTE: these need to be patch-centered
  3765     double posx = pos_x[i];
  3766     double posy = pos_y[i];
  3767     double posz = pos_z[i];
  3768     double rmass = recipMass[i];
  3769     /* addForceToMomentum*/
  3770     // keep velocities in registers so I can access them faster when calculating positions!
  3771     switch(maxForceNumber){
  3773         dt_s = dt_slow * scaling;
  3774         velx += f_slow_x[i] * rmass * dt_s;
  3775         vely += f_slow_y[i] * rmass * dt_s;
  3776         velz += f_slow_z[i] * rmass * dt_s;
  3777         // f_slow_x[i] = 0.0;
  3778         // f_slow_y[i] = 0.0;
  3779         // f_slow_z[i] = 0.0;
  3781         dt_nb = dt_nbond * scaling;
  3782         velx += f_nbond_x[i] * rmass * dt_nb;
  3783         vely += f_nbond_y[i] * rmass * dt_nb;
  3784         velz += f_nbond_z[i] * rmass * dt_nb;
  3785         // f_nbond_x[i] = 0.0;
  3786         // f_nbond_y[i] = 0.0;
  3787         // f_nbond_z[i] = 0.0;
  3789         dt_b = dt_normal * scaling;
  3790         velx += f_normal_x[i] * rmass * dt_b;
  3791         vely += f_normal_y[i] * rmass * dt_b;
  3792         velz += f_normal_z[i] * rmass * dt_b;
  3793         // f_normal_x[i] = 0.0;
  3794         // f_normal_y[i] = 0.0;
  3795         // f_normal_z[i] = 0.0;
  3798     // --------------------------------------------------------
  3800     // -- MaximumMove --
  3801     double vel2 = velx * velx + vely * vely + velz * velz;
  3802     if(vel2 > maxvel2) atomicAdd(h_killme, 1); // stops dynamics if true, perf does not matter
  3803     // --------------------------------------------------------
  3805     // -- AddVelocityToPosition --
  3806     // ---------------------------------------------------------
  3818       // Haochuan: for fixed atoms, we need to clear their velocities (see HomePatch::addForceToMomentum)
  3819       if (!atomFixed[i]) {
  3839 void SequencerCUDAKernel::velocityVerlet1(
  3840     const bool doFixedAtoms,
  3842     const double scaling,
  3843     const double dt_normal,
  3844     const double dt_nbond,
  3845     const double dt_slow,
  3846     const double velrescaling,  // for stochastic velocity rescaling
  3847     const double *recipMass,
  3868     const int* atomFixed,
  3870     const int maxForceNumber,
  3871     cudaStream_t stream)
  3873   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  3874   const bool doVelRescaling = (velrescaling != 1.0);
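         // Compile-time dispatch: the template flags let the compiler drop the
         // velocity-rescaling and fixed-atom code paths entirely when they are not needed.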
  3875 #define CALL(DO_VEL_RESCALING, DO_FIXED) \
  3876   velocityVerlet1Kernel<DO_VEL_RESCALING, DO_FIXED> \
  3877   <<<grid, ATOM_BLOCKS, 0, stream>>>( \
  3878     step, scaling, dt_normal, dt_nbond, dt_slow, velrescaling, recipMass, \
  3879     vel_x, vel_y, vel_z, maxvel2, h_killme, \
  3880     pos_x, pos_y, pos_z, h_pos_x, h_pos_y, h_pos_z, \
  3881     f_normal_x, f_normal_y, f_normal_z, \
  3882     f_nbond_x, f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z, \
  3883     atomFixed, numAtoms, maxForceNumber \
  3885   if ( doVelRescaling &&  doFixedAtoms) CALL(true, true);
  3886   else if ( doVelRescaling && !doFixedAtoms) CALL(true, false);
  3887   else if (!doVelRescaling &&  doFixedAtoms) CALL(false, true);
  3888   else if (!doVelRescaling && !doFixedAtoms) CALL(false, false);
  3890     NAMD_bug("No kernel was called in SequencerCUDAKernel::velocityVerlet1!\n");
  3895 void SequencerCUDAKernel::submitReduction1(
  3903   BigReal *kineticEnergy,
  3904   BigReal *momentum_x,
  3905   BigReal *momentum_y,
  3906   BigReal *momentum_z,
  3907   BigReal *angularMomentum_x,
  3908   BigReal *angularMomentum_y,
  3909   BigReal *angularMomentum_z,
  3913   BigReal *h_kineticEnergy,
  3914   BigReal *h_momentum_x,
  3915   BigReal *h_momentum_y,
  3916   BigReal *h_momentum_z,
  3917   BigReal *h_angularMomentum_x,
  3918   BigReal *h_angularMomentum_y,
  3919   BigReal *h_angularMomentum_z,
  3920   unsigned int* tbcatomic,
  3922   cudaStream_t stream)
  3924   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  3925   submitReduction1Kernel<ATOM_BLOCKS><<<grid, ATOM_BLOCKS, 0, stream>>>(
  3926     pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass,
  3927     kineticEnergy, momentum_x, momentum_y, momentum_z,
  3928     angularMomentum_x, angularMomentum_y, angularMomentum_z,
  3929     origin_x, origin_y, origin_z, h_kineticEnergy, h_momentum_x, h_momentum_y,
  3930     h_momentum_z, h_angularMomentum_x, h_angularMomentum_y, h_angularMomentum_z,
  3931     tbcatomic, numAtoms);
  3932   //cudaCheck(cudaGetLastError());
  3935 template <int BLOCK_SIZE, bool DO_FIXED, bool DO_MTS>
  3936 __global__ void submitReduction2Kernel(
  3937   const int* __restrict atomFixed,
  3938   const double * __restrict pos_x,
  3939   const double * __restrict pos_y,
  3940   const double * __restrict pos_z,
  3941   const double * __restrict vel_x,
  3942   const double * __restrict vel_y,
  3943   const double * __restrict vel_z,
  3944   const double * __restrict rcm_x,
  3945   const double * __restrict rcm_y,
  3946   const double * __restrict rcm_z,
  3947   const double * __restrict vcm_x,
  3948   const double * __restrict vcm_y,
  3949   const double * __restrict vcm_z,
  3950   const double * __restrict f_normal_x,
  3951   const double * __restrict f_normal_y,
  3952   const double * __restrict f_normal_z,
  3953   const double * __restrict f_nbond_x,
  3954   const double * __restrict f_nbond_y,
  3955   const double * __restrict f_nbond_z,
  3956   const double * __restrict f_slow_x,
  3957   const double * __restrict f_slow_y,
  3958   const double * __restrict f_slow_z,
  3959   const float  * __restrict mass,
  3960   const int * __restrict hydrogenGroupSize,
  3961   BigReal    *kineticEnergy,
  3962   BigReal    *h_kineticEnergy,
  3963   BigReal    *intKineticEnergy,
  3964   BigReal    *h_intKineticEnergy,
  3965   cudaTensor *intVirialNormal,
  3966   cudaTensor *intVirialNbond,
  3967   cudaTensor *intVirialSlow,
  3968   cudaTensor *h_intVirialNormal,
  3969   cudaTensor *h_intVirialNbond,
  3970   cudaTensor *h_intVirialSlow,
  3971   cudaTensor* rigidVirial,
  3972   cudaTensor* h_rigidVirial,
  3973   unsigned int *tbcatomic,
  3975   const int maxForceNumber)
  3979   cudaTensor intVNormal, intVNbond, intVSlow;
  3980   zero_cudaTensor(intVNormal);
  3982     zero_cudaTensor(intVNbond);
  3983     zero_cudaTensor(intVSlow);
  3985   int i = threadIdx.x + blockIdx.x*blockDim.x;
  3986   __shared__ bool isLastBlockDone;
  3987   int totaltb = gridDim.x;
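         // Per-thread accumulation of kinetic energy, internal kinetic energy, and
         // internal virial tensors (normal/nbond/slow are kept separate only for MTS);
         // the partials are block-reduced with CUB, atomically added into ATOMIC_BINS
         // bins, and the last finished block (tbcatomic[3]) copies the totals to
         // host-mapped memory.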
  3989   if(threadIdx.x == 0){
  3990     isLastBlockDone = false;
  3996     int hgs = hydrogenGroupSize[i];
  3997     // let's see, I have the hydrogenGroupSize, but that's not what I need;
  4007       for ( j = i; j < (i+hgs); ++j ) {
  4008         r_mass[j - i] = mass[j];
  4009         r_pos[j - i].x  = pos_x[j];
  4010         r_pos[j - i].y  = pos_y[j];
  4011         r_pos[j - i].z  = pos_z[j];
  4012         r_vel[j - i].x  = vel_x[j];
  4013         r_vel[j - i].y  = vel_y[j];
  4014         r_vel[j - i].z  = vel_z[j];
  4016         // r_cm_x += mass[j] * pos_x[j];
  4017         // r_cm_y += mass[j] * pos_y[j];
  4018         // r_cm_z += mass[j] * pos_z[j];
  4019         // v_cm_x += mass[j] * vel_x[j];
  4020         // v_cm_y += mass[j] * vel_y[j];
  4021         // v_cm_z += mass[j] * vel_z[j];
  4022         m_cm += r_mass[j - i];
  4023         r_cm_x += r_mass[j - i] * r_pos[j-i].x;
  4024         r_cm_y += r_mass[j - i] * r_pos[j-i].y;
  4025         r_cm_z += r_mass[j - i] * r_pos[j-i].z;
  4026         v_cm_x += r_mass[j - i] * r_vel[j-i].x;
  4027         v_cm_y += r_mass[j - i] * r_vel[j-i].y;
  4028         v_cm_z += r_mass[j - i] * r_vel[j-i].z;
  4030       BigReal inv_m_cm = 1.0/m_cm;
  4038       // XXX removed pairInteraction
  4039       for ( j = i; j < (i+hgs); ++j ) {
  4040         // XXX removed fixed atoms
  4042         // vector vel[j] used twice below
  4043         BigReal v_x = r_vel[j-i].x;
  4044         BigReal v_y = r_vel[j-i].y;
  4045         BigReal v_z = r_vel[j-i].z;
  4047         // vector dv = vel[j] - v_cm
  4048         BigReal dv_x = v_x - v_cm_x;
  4049         BigReal dv_y = v_y - v_cm_y;
  4050         BigReal dv_z = v_z - v_cm_z;
  4052         // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
  4054         //   (v_x * dv_x + v_y * dv_y + v_z * dv_z);
  4055         intK += r_mass[j-i] *
  4056              (v_x * dv_x + v_y * dv_y + v_z * dv_z);
  4058         // vector dr = pos[j] - r_cm
  4059         // BigReal dr_x = pos_x[j] - r_cm_x;
  4060         // BigReal dr_y = pos_y[j] - r_cm_y;
  4061         // BigReal dr_z = pos_z[j] - r_cm_z;
  4063         BigReal dr_x = r_pos[j - i].x - r_cm_x;
  4064         BigReal dr_y = r_pos[j - i].y - r_cm_y;
  4065         BigReal dr_z = r_pos[j - i].z - r_cm_z;
  4067         // tensor intVirialNormal += outer_product(f_normal[j], dr)
  4069         // we're not going to make this function any faster if we don't fix
  4070         // the global memory access pattern
  4071         intVNormal.xx += f_normal_x[j] * dr_x;
  4072         intVNormal.xy += f_normal_x[j] * dr_y;
  4073         intVNormal.xz += f_normal_x[j] * dr_z;
  4074         intVNormal.yx += f_normal_y[j] * dr_x;
  4075         intVNormal.yy += f_normal_y[j] * dr_y;
  4076         intVNormal.yz += f_normal_y[j] * dr_z;
  4077         intVNormal.zx += f_normal_z[j] * dr_x;
  4078         intVNormal.zy += f_normal_z[j] * dr_y;
  4079         intVNormal.zz += f_normal_z[j] * dr_z;
  4081         if (maxForceNumber >= 1) {
  4082           // tensor intVirialNbond += outer_product(f_nbond[j], dr)
  4083           intVNbond.xx += f_nbond_x[j] * dr_x;
  4084           intVNbond.xy += f_nbond_x[j] * dr_y;
  4085           intVNbond.xz += f_nbond_x[j] * dr_z;
  4086           intVNbond.yx += f_nbond_y[j] * dr_x;
  4087           intVNbond.yy += f_nbond_y[j] * dr_y;
  4088           intVNbond.yz += f_nbond_y[j] * dr_z;
  4089           intVNbond.zx += f_nbond_z[j] * dr_x;
  4090           intVNbond.zy += f_nbond_z[j] * dr_y;
  4091           intVNbond.zz += f_nbond_z[j] * dr_z;
  4094         if (maxForceNumber >= 2) {
  4095           // tensor intVirialSlow += outer_product(f_slow[j], dr)
  4096           intVSlow.xx += f_slow_x[j] * dr_x;
  4097           intVSlow.xy += f_slow_x[j] * dr_y;
  4098           intVSlow.xz += f_slow_x[j] * dr_z;
  4099           intVSlow.yx += f_slow_y[j] * dr_x;
  4100           intVSlow.yy += f_slow_y[j] * dr_y;
  4101           intVSlow.yz += f_slow_y[j] * dr_z;
  4102           intVSlow.zx += f_slow_z[j] * dr_x;
  4103           intVSlow.zy += f_slow_z[j] * dr_y;
  4104           intVSlow.zz += f_slow_z[j] * dr_z;
  4109     // Haochuan: I need to skip the fixed atoms
  4110     if (!DO_FIXED || (DO_FIXED && !atomFixed[i])) {
  4111       BigReal r_cm_x = rcm_x[i];
  4112       BigReal r_cm_y = rcm_y[i];
  4113       BigReal r_cm_z = rcm_z[i];
  4114       BigReal v_cm_x = vcm_x[i];
  4115       BigReal v_cm_y = vcm_y[i];
  4116       BigReal v_cm_z = vcm_z[i];
  4118       BigReal v_x = vel_x[i];
  4119       BigReal v_y = vel_y[i];
  4120       BigReal v_z = vel_z[i];
  4122       // vector dv = vel[i] - v_cm
  4123       BigReal dv_x = v_x - v_cm_x;
  4124       BigReal dv_y = v_y - v_cm_y;
  4125       BigReal dv_z = v_z - v_cm_z;
  4127       // scalar intKineticEnergy += mass[i] * dot_product(v, dv)
  4129       //   (v_x * dv_x + v_y * dv_y + v_z * dv_z);
  4131         (v_x * v_x + v_y*v_y + v_z*v_z);
  4133         (v_x * dv_x + v_y * dv_y + v_z * dv_z);
  4135       // vector dr = pos[i] - r_cm
  4136       // BigReal dr_x = pos_x[j] - r_cm_x;
  4137       // BigReal dr_y = pos_y[j] - r_cm_y;
  4138       // BigReal dr_z = pos_z[j] - r_cm_z;
  4140       BigReal dr_x = pos_x[i] - r_cm_x;
  4141       BigReal dr_y = pos_y[i] - r_cm_y;
  4142       BigReal dr_z = pos_z[i] - r_cm_z;
  4144       // tensor intVirialNormal += outer_product(f_normal[j], dr)
  4146       // we're not going to make this function any faster if we don't fix
  4147       // the global memory access pattern
  4148       intVNormal.xx += f_normal_x[i] * dr_x;
  4149       intVNormal.xy += f_normal_x[i] * dr_y;
  4150       intVNormal.xz += f_normal_x[i] * dr_z;
  4151       intVNormal.yx += f_normal_y[i] * dr_x;
  4152       intVNormal.yy += f_normal_y[i] * dr_y;
  4153       intVNormal.yz += f_normal_y[i] * dr_z;
  4154       intVNormal.zx += f_normal_z[i] * dr_x;
  4155       intVNormal.zy += f_normal_z[i] * dr_y;
  4156       intVNormal.zz += f_normal_z[i] * dr_z;
  4158       if (maxForceNumber >= 1) {
  4159         // tensor intVirialNbond += outer_product(f_nbond[j], dr)
  4160         // Haochuan: For a MTS simulation we have to calculate intVNbond separately
  4162           intVNbond.xx += f_nbond_x[i] * dr_x;
  4163           intVNbond.xy += f_nbond_x[i] * dr_y;
  4164           intVNbond.xz += f_nbond_x[i] * dr_z;
  4165           intVNbond.yx += f_nbond_y[i] * dr_x;
  4166           intVNbond.yy += f_nbond_y[i] * dr_y;
  4167           intVNbond.yz += f_nbond_y[i] * dr_z;
  4168           intVNbond.zx += f_nbond_z[i] * dr_x;
  4169           intVNbond.zy += f_nbond_z[i] * dr_y;
  4170           intVNbond.zz += f_nbond_z[i] * dr_z;
  4172           // If this is not a MTS simulation, then we can sum the internal virials into a single one (intVNormal)
  4173           intVNormal.xx += f_nbond_x[i] * dr_x;
  4174           intVNormal.xy += f_nbond_x[i] * dr_y;
  4175           intVNormal.xz += f_nbond_x[i] * dr_z;
  4176           intVNormal.yx += f_nbond_y[i] * dr_x;
  4177           intVNormal.yy += f_nbond_y[i] * dr_y;
  4178           intVNormal.yz += f_nbond_y[i] * dr_z;
  4179           intVNormal.zx += f_nbond_z[i] * dr_x;
  4180           intVNormal.zy += f_nbond_z[i] * dr_y;
  4181           intVNormal.zz += f_nbond_z[i] * dr_z;
  4185       if (maxForceNumber >= 2) {
  4186         // tensor intVirialSlow += outer_product(f_slow[j], dr)
  4188           intVSlow.xx += f_slow_x[i] * dr_x;
  4189           intVSlow.xy += f_slow_x[i] * dr_y;
  4190           intVSlow.xz += f_slow_x[i] * dr_z;
  4191           intVSlow.yx += f_slow_y[i] * dr_x;
  4192           intVSlow.yy += f_slow_y[i] * dr_y;
  4193           intVSlow.yz += f_slow_y[i] * dr_z;
  4194           intVSlow.zx += f_slow_z[i] * dr_x;
  4195           intVSlow.zy += f_slow_z[i] * dr_y;
  4196           intVSlow.zz += f_slow_z[i] * dr_z;
  4198           intVNormal.xx += f_slow_x[i] * dr_x;
  4199           intVNormal.xy += f_slow_x[i] * dr_y;
  4200           intVNormal.xz += f_slow_x[i] * dr_z;
  4201           intVNormal.yx += f_slow_y[i] * dr_x;
  4202           intVNormal.yy += f_slow_y[i] * dr_y;
  4203           intVNormal.yz += f_slow_y[i] * dr_z;
  4204           intVNormal.zx += f_slow_z[i] * dr_x;
  4205           intVNormal.zy += f_slow_z[i] * dr_y;
  4206           intVNormal.zz += f_slow_z[i] * dr_z;
  4213   typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
  4214   __shared__ typename BlockReduce::TempStorage temp_storage;
  4215   // XXX TODO: If we handwrite these reductions, it might avoid a
  4216   // lot of overhead from launching CUB
  4218   K = BlockReduce(temp_storage).Sum(K);
  4220   intK = BlockReduce(temp_storage).Sum(intK);
  4222   intVNormal.xx = BlockReduce(temp_storage).Sum(intVNormal.xx);
  4224   intVNormal.xy = BlockReduce(temp_storage).Sum(intVNormal.xy);
  4226   intVNormal.xz = BlockReduce(temp_storage).Sum(intVNormal.xz);
  4228   intVNormal.yx = BlockReduce(temp_storage).Sum(intVNormal.yx);
  4230   intVNormal.yy = BlockReduce(temp_storage).Sum(intVNormal.yy);
  4232   intVNormal.yz = BlockReduce(temp_storage).Sum(intVNormal.yz);
  4234   intVNormal.zx = BlockReduce(temp_storage).Sum(intVNormal.zx);
  4236   intVNormal.zy = BlockReduce(temp_storage).Sum(intVNormal.zy);
  4238   intVNormal.zz = BlockReduce(temp_storage).Sum(intVNormal.zz);
  4241     if (maxForceNumber >= 1) {
  4242       intVNbond.xx = BlockReduce(temp_storage).Sum(intVNbond.xx);
  4244       intVNbond.xy = BlockReduce(temp_storage).Sum(intVNbond.xy);
  4246       intVNbond.xz = BlockReduce(temp_storage).Sum(intVNbond.xz);
  4248       intVNbond.yx = BlockReduce(temp_storage).Sum(intVNbond.yx);
  4250       intVNbond.yy = BlockReduce(temp_storage).Sum(intVNbond.yy);
  4252       intVNbond.yz = BlockReduce(temp_storage).Sum(intVNbond.yz);
  4254       intVNbond.zx = BlockReduce(temp_storage).Sum(intVNbond.zx);
  4256       intVNbond.zy = BlockReduce(temp_storage).Sum(intVNbond.zy);
  4258       intVNbond.zz = BlockReduce(temp_storage).Sum(intVNbond.zz);
  4261     if (maxForceNumber >= 2) {
  4262       intVSlow.xx = BlockReduce(temp_storage).Sum(intVSlow.xx);
  4264       intVSlow.xy = BlockReduce(temp_storage).Sum(intVSlow.xy);
  4266       intVSlow.xz = BlockReduce(temp_storage).Sum(intVSlow.xz);
  4268       intVSlow.yx = BlockReduce(temp_storage).Sum(intVSlow.yx);
  4270       intVSlow.yy = BlockReduce(temp_storage).Sum(intVSlow.yy);
  4272       intVSlow.yz = BlockReduce(temp_storage).Sum(intVSlow.yz);
  4274       intVSlow.zx = BlockReduce(temp_storage).Sum(intVSlow.zx);
  4276       intVSlow.zy = BlockReduce(temp_storage).Sum(intVSlow.zy);
  4278       intVSlow.zz = BlockReduce(temp_storage).Sum(intVSlow.zz);
  4284   if (threadIdx.x == 0) {
  4285     const int bin = blockIdx.x % ATOMIC_BINS;
  4287     atomicAdd(&kineticEnergy[bin], K);
  4288     atomicAdd(&intKineticEnergy[bin], intK);
  4289     atomicAdd(&intVirialNormal[bin].xx, intVNormal.xx);
  4290     atomicAdd(&intVirialNormal[bin].xy, intVNormal.xy);
  4291     atomicAdd(&intVirialNormal[bin].xz, intVNormal.xz);
  4292     atomicAdd(&intVirialNormal[bin].yx, intVNormal.yx);
  4293     atomicAdd(&intVirialNormal[bin].yy, intVNormal.yy);
  4294     atomicAdd(&intVirialNormal[bin].yz, intVNormal.yz);
  4295     atomicAdd(&intVirialNormal[bin].zx, intVNormal.zx);
  4296     atomicAdd(&intVirialNormal[bin].zy, intVNormal.zy);
  4297     atomicAdd(&intVirialNormal[bin].zz, intVNormal.zz);
  4299       if (maxForceNumber >= 1) {
  4300         atomicAdd(&intVirialNbond[bin].xx, intVNbond.xx);
  4301         atomicAdd(&intVirialNbond[bin].xy, intVNbond.xy);
  4302         atomicAdd(&intVirialNbond[bin].xz, intVNbond.xz);
  4303         atomicAdd(&intVirialNbond[bin].yx, intVNbond.yx);
  4304         atomicAdd(&intVirialNbond[bin].yy, intVNbond.yy);
  4305         atomicAdd(&intVirialNbond[bin].yz, intVNbond.yz);
  4306         atomicAdd(&intVirialNbond[bin].zx, intVNbond.zx);
  4307         atomicAdd(&intVirialNbond[bin].zy, intVNbond.zy);
  4308         atomicAdd(&intVirialNbond[bin].zz, intVNbond.zz);
  4310       if (maxForceNumber >= 2) {
  4311         atomicAdd(&intVirialSlow[bin].xx, intVSlow.xx);
  4312         atomicAdd(&intVirialSlow[bin].xy, intVSlow.xy);
  4313         atomicAdd(&intVirialSlow[bin].xz, intVSlow.xz);
  4314         atomicAdd(&intVirialSlow[bin].yx, intVSlow.yx);
  4315         atomicAdd(&intVirialSlow[bin].yy, intVSlow.yy);
  4316         atomicAdd(&intVirialSlow[bin].yz, intVSlow.yz);
  4317         atomicAdd(&intVirialSlow[bin].zx, intVSlow.zx);
  4318         atomicAdd(&intVirialSlow[bin].zy, intVSlow.zy);
  4319         atomicAdd(&intVirialSlow[bin].zz, intVSlow.zz);
  4323     unsigned int value = atomicInc(&tbcatomic[3], totaltb);
  4324     isLastBlockDone = (value == (totaltb -1 ));
  4327   // this function calculates the internal pressures and is a huge bottleneck if we
  4328   // do this host-mapped update
  4329   // How do I know if this is really the bottleneck?
  4330   if(isLastBlockDone){
  4331     if(threadIdx.x < ATOMIC_BINS){
  4332       const int bin = threadIdx.x;
  4334       double k = kineticEnergy[bin];
  4335       double intK = intKineticEnergy[bin];
  4336       cudaTensor intVNormal = intVirialNormal[bin];
  4338         if (maxForceNumber >= 1) {
  4339           intVNbond = intVirialNbond[bin];
  4341         if (maxForceNumber >= 2) {
  4342           intVSlow = intVirialSlow[bin];
  4345       cudaTensor rigidV = rigidVirial[bin];
  4347       // sets device scalars back to zero
  4348       kineticEnergy[bin] = 0.0;
  4349       intKineticEnergy[bin] = 0.0;
  4350       zero_cudaTensor(intVirialNormal[bin]);
  4351       zero_cudaTensor(intVirialNbond[bin]);
  4352       zero_cudaTensor(intVirialSlow[bin]);
  4353       zero_cudaTensor(rigidVirial[bin]);
  4355       if(ATOMIC_BINS > 1){
  4356         typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
  4357         typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
  4358         __shared__ typename WarpReduce::TempStorage tempStorage;
  4359         __shared__ typename WarpReduceT::TempStorage tempStorageT;
  4361         // According to https://nvidia.github.io/cccl/cub/developer_overview.html#temporary-storage-usage,
  4362         // I should use __syncwarp() here.
  4363         k = WarpReduce(tempStorage).Sum(k);                       NAMD_WARP_SYNC(WARP_FULL_MASK);
  4364         intK = WarpReduce(tempStorage).Sum(intK);                 NAMD_WARP_SYNC(WARP_FULL_MASK);
  4365         intVNormal = WarpReduceT(tempStorageT).Sum(intVNormal);   NAMD_WARP_SYNC(WARP_FULL_MASK);
  4366         rigidV = WarpReduceT(tempStorageT).Sum(rigidV);           NAMD_WARP_SYNC(WARP_FULL_MASK);
  4368           if (maxForceNumber >= 1) {
  4369             intVNbond = WarpReduceT(tempStorageT).Sum(intVNbond); NAMD_WARP_SYNC(WARP_FULL_MASK);
  4371           if (maxForceNumber >= 2) {
  4372             intVSlow = WarpReduceT(tempStorageT).Sum(intVSlow);   NAMD_WARP_SYNC(WARP_FULL_MASK);
  4377       if(threadIdx.x == 0){
  4378         h_kineticEnergy[0] = k;
  4379         h_intKineticEnergy[0] = intK;
  4380         h_intVirialNormal[0] = intVNormal;
  4382           if (maxForceNumber >= 1) {
  4383             h_intVirialNbond[0] = intVNbond;
  4385           if (maxForceNumber >= 2) {
  4386             h_intVirialSlow[0] = intVSlow;
  4389         h_rigidVirial[0] = rigidV;
  4391         //resets atomic counter
  4392         reset_atomic_counter(&tbcatomic[3]);
  4398 void SequencerCUDAKernel::submitReduction2(
  4399   const bool doFixedAtoms,
  4400   const int* atomFixed,
  4401   const double *pos_x,
  4402   const double *pos_y,
  4403   const double *pos_z,
  4404   const double *vel_x,
  4405   const double *vel_y,
  4406   const double *vel_z,
  4407   const double *rcm_x,
  4408   const double *rcm_y,
  4409   const double *rcm_z,
  4410   const double *vcm_x,
  4411   const double *vcm_y,
  4412   const double *vcm_z,
  4413   const double *f_normal_x,
  4414   const double *f_normal_y,
  4415   const double *f_normal_z,
  4416   const double *f_nbond_x,
  4417   const double *f_nbond_y,
  4418   const double *f_nbond_z,
  4419   const double *f_slow_x,
  4420   const double *f_slow_y,
  4421   const double *f_slow_z,
  4423   int *hydrogenGroupSize,
  4424   BigReal *kineticEnergy,
  4425   BigReal *h_kineticEnergy,
  4426   BigReal *intKineticEnergy,
  4427   BigReal *h_intKineticEnergy,
  4428   cudaTensor *intVirialNormal,
  4429   cudaTensor *intVirialNbond,
  4430   cudaTensor *intVirialSlow,
  4431   cudaTensor *h_intVirialNormal,
  4432   cudaTensor *h_intVirialNbond,
  4433   cudaTensor *h_intVirialSlow,
  4434   cudaTensor* rigidVirial,
  4435   cudaTensor* h_rigidVirial,
  4436   unsigned int *tbcatomic,
  4440   cudaStream_t stream)
  4443   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  4444 #define CALL(DO_FIXED, DO_MTS) \
  4445   submitReduction2Kernel<ATOM_BLOCKS, DO_FIXED, DO_MTS><<<grid, ATOM_BLOCKS, 0, stream>>>( \
  4446     atomFixed, pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, \
  4447     rcm_x, rcm_y, rcm_z, vcm_x, vcm_y, vcm_z, \
  4448     f_normal_x, f_normal_y, f_normal_z, \
  4449     f_nbond_x, f_nbond_y, f_nbond_z, \
  4450     f_slow_x, f_slow_y, f_slow_z, \
  4451     mass, hydrogenGroupSize, \
  4452     kineticEnergy, h_kineticEnergy, \
  4453     intKineticEnergy, h_intKineticEnergy, \
  4454     intVirialNormal, intVirialNbond, intVirialSlow, \
  4455     h_intVirialNormal, h_intVirialNbond, h_intVirialSlow, rigidVirial, \
  4456     h_rigidVirial, tbcatomic, numAtoms, maxForceNumber)
  4458     if (doMTS) CALL(true, true);
  4459     else CALL(true, false);
  4461   else if (!doFixedAtoms) {
  4462     if (doMTS) CALL(false, true);
  4463     else CALL(false, false);
  4465   else NAMD_bug("No kernel was called in SequencerCUDAKernel::submitReduction2!\n");
  4467   // cudaCheck(cudaGetLastError());
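       // BBK integrator, first half: scales velocities by (1 - 0.5 * dt * gamma),
       // where gamma is the per-atom langevinParam and dt is the timestep converted
       // to internal units via TIMEFACTOR.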
  4470 __global__ void langevinVelocitiesBBK1Kernel(
  4472   const float * __restrict langevinParam,
  4473   double      * __restrict vel_x,
  4474   double      * __restrict vel_y,
  4475   double      * __restrict vel_z,
  4478   BigReal dt = timestep * (0.001 * TIMEFACTOR);
  4479   int i = threadIdx.x + blockIdx.x*blockDim.x;
  4481     BigReal dt_gamma = dt * langevinParam[i];
  4482     BigReal scaling = 1. - 0.5 * dt_gamma;
  4483     vel_x[i] *= scaling;
  4484     vel_y[i] *= scaling;
  4485     vel_z[i] *= scaling;
  4489 void SequencerCUDAKernel::langevinVelocitiesBBK1(
  4491   const float *langevinParam,
  4496   cudaStream_t stream)
  4498   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  4499   langevinVelocitiesBBK1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  4500     timestep, langevinParam, vel_x, vel_y, vel_z, numAtoms);
  4501   // cudaCheck(cudaGetLastError());
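       // BBK integrator, second half: adds the Gaussian random kick
       // (gaussrand * langScalRandBBK2) and rescales each velocity component by
       // langScalVelBBK2.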
  4504 __global__ void langevinVelocitiesBBK2Kernel(
  4506   const float * __restrict langScalVelBBK2,
  4507   const float * __restrict langScalRandBBK2,
  4508   const float * __restrict gaussrand_x,
  4509   const float * __restrict gaussrand_y,
  4510   const float * __restrict gaussrand_z,
  4511   double * __restrict vel_x,
  4512   double * __restrict vel_y,
  4513   double * __restrict vel_z,
  4516   int i = threadIdx.x + blockIdx.x*blockDim.x;
  4518     vel_x[i] += gaussrand_x[i] * langScalRandBBK2[i];
  4519     vel_y[i] += gaussrand_y[i] * langScalRandBBK2[i];
  4520     vel_z[i] += gaussrand_z[i] * langScalRandBBK2[i];
  4521     vel_x[i] *= langScalVelBBK2[i];
  4522     vel_y[i] *= langScalVelBBK2[i];
  4523     vel_z[i] *= langScalVelBBK2[i];
  4529 void SequencerCUDAKernel::langevinVelocitiesBBK2(
  4531   const float *langScalVelBBK2,
  4532   const float *langScalRandBBK2,
  4540   const int numAtomsGlobal,
  4542   curandGenerator_t gen,
  4543   cudaStream_t stream)
  4546   // For some reason, if n == nAtomsDevice + 1, this gives me a misaligned address
  4547   // Generating the full array on gaussrand without striding for multiple GPU simulations for now
  4549   // Add the missing call to hiprandSetStream to set the current stream for HIPRAND kernel launches
  4550   curandCheck(curandSetStream(gen, stream));
  4551   // array buffers have to be even length for pseudorandom normal distribution
  4552   // choose n to be the smallest even number >= numAtoms
  4553   // we can choose 1 larger than numAtoms since allocation is > numAtoms
  4554   int n = (numAtomsGlobal + 1) & (~1);
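         // e.g. numAtomsGlobal = 7 -> n = 8, numAtomsGlobal = 8 -> n = 8; the extra
         // slot is safe because the buffers are allocated larger than numAtoms.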
  4556   curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
  4557   curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
  4558   curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
  4559   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  4560   langevinVelocitiesBBK2Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  4561     timestep, langScalVelBBK2, langScalRandBBK2,
  4562     gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride,
  4563     vel_x, vel_y, vel_z, numAtoms);
  4564   // cudaCheck(cudaGetLastError());
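       // Reassigns velocities from a Maxwell-Boltzmann distribution:
       // v = gaussrand * sqrt(kbT / m), using the per-atom reciprocal mass.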
  4567 template <bool doFixed>
  4568 __global__ void reassignVelocitiesKernel(
  4569   const BigReal timestep,
  4570   const int* __restrict atomFixed,
  4571   const float * __restrict gaussrand_x,
  4572   const float * __restrict gaussrand_y,
  4573   const float * __restrict gaussrand_z,
  4574   double * __restrict vel_x,
  4575   double * __restrict vel_y,
  4576   double * __restrict vel_z,
  4577   const double* __restrict d_recipMass,
  4581   //! note we are NOT supporting LES (locally enhanced sampling) here
  4582   //! hence no switch on the partition to apply tempfactor
  4585   int i = threadIdx.x + blockIdx.x*blockDim.x;
  4593         vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
  4594         vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
  4595         vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
  4598       vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
  4599       vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
  4600       vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
  4606 void SequencerCUDAKernel::reassignVelocities(
  4607   const BigReal timestep,
  4608   const bool doFixedAtoms,
  4609   const int* atomFixed,
  4616   const double* d_recipMass,
  4619   const int numAtomsGlobal,
  4621   curandGenerator_t gen,
  4622   cudaStream_t stream)
  4625   curandCheck(curandSetStream(gen, stream));
  4626   // array buffers have to be even length for pseudorandom normal distribution
  4627   // choose n to be the smallest even number >= numAtoms
  4628   // we can choose 1 larger than numAtoms since allocation is > numAtoms
  4629   int n = (numAtomsGlobal + 1) & (~1);
  4631   curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
  4632   curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
  4633   curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
  4634   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  4635 #define CALL(DOFIXED) \
  4636   reassignVelocitiesKernel<DOFIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
  4637     timestep, atomFixed, \
  4638     gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride, \
  4639     vel_x, vel_y, vel_z,  d_recipMass, kbT, numAtoms)
  4640   if (doFixedAtoms) CALL(true);
  4643   cudaCheck(cudaStreamSynchronize(stream));
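       // Recovers the constrained velocities after SETTLE/RATTLE position
       // corrections: velNew = (posNew - pos) / dt for every atom belonging to a
       // settle group or a rattle constraint pair.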
  4649 __global__ void updateVelocities(const int nRattles,
  4652   const int * __restrict settleList,
  4653   const CudaRattleElem * __restrict rattleList,
  4654   const double * __restrict pos_x,
  4655   const double * __restrict pos_y,
  4656   const double * __restrict pos_z,
  4657   const double * __restrict posNew_x,
  4658   const double * __restrict posNew_y,
  4659   const double * __restrict posNew_z,
  4664   int tid = threadIdx.x + blockIdx.x*blockDim.x;
  4665   int stride = gridDim.x*blockDim.x;
  4668   for(int i = tid; i <  nSettles; i += stride){
  4669     int ig = settleList[i];
  4670     // Now update the velocities from the constrained position change
  4671     // for the three atoms of this SETTLE group
  4672     velNew_x[ig] = (posNew_x[ig] - pos_x[ig]) * invdt;
  4673     velNew_y[ig] = (posNew_y[ig] - pos_y[ig]) * invdt;
  4674     velNew_z[ig] = (posNew_z[ig] - pos_z[ig]) * invdt;
  4676     velNew_x[ig+1] = (posNew_x[ig+1] - pos_x[ig+1]) * invdt;
  4677     velNew_y[ig+1] = (posNew_y[ig+1] - pos_y[ig+1]) * invdt;
  4678     velNew_z[ig+1] = (posNew_z[ig+1] - pos_z[ig+1]) * invdt;
  4680     velNew_x[ig+2] = (posNew_x[ig+2] - pos_x[ig+2]) * invdt;
  4681     velNew_y[ig+2] = (posNew_y[ig+2] - pos_y[ig+2]) * invdt;
  4682     velNew_z[ig+2] = (posNew_z[ig+2] - pos_z[ig+2]) * invdt;
  4686   for(int i = tid; i <  nRattles; i += stride){
  4687     int ig = rattleList[i].ig;
  4688     int icnt = rattleList[i].icnt;
  4690     fixed[0] = false; fixed[1] = false;
  4691     fixed[2] = false; fixed[3] = false;
  4692     for(int j = 0; j < icnt; j++){
  4693       //Gets two positions
  4694       int ia = rattleList[i].params[j].ia;
  4695       int ib = rattleList[i].params[j].ib;
  4696       //checks if any of these positions have been updated yet
  4698         velNew_x[ig+ia] = (posNew_x[ig+ia] - pos_x[ig+ia]) * invdt;
  4699         velNew_y[ig+ia] = (posNew_y[ig+ia] - pos_y[ig+ia]) * invdt;
  4700         velNew_z[ig+ia] = (posNew_z[ig+ia] - pos_z[ig+ia]) * invdt;
  4704         velNew_x[ig+ib] = (posNew_x[ig+ib] - pos_x[ig+ib]) * invdt;
  4705         velNew_y[ig+ib] = (posNew_y[ig+ib] - pos_y[ig+ib]) * invdt;
  4706         velNew_z[ig+ib] = (posNew_z[ig+ib] - pos_z[ig+ib]) * invdt;
  4713 //JM: Function that adds a correction to the bonded forces
  4714 template<bool doEnergy, bool doFixed>
  4715 __global__ void addRattleForce(int numAtoms,
  4717       const int* __restrict atomFixed,
  4718       const float  * __restrict mass,
  4719       const double * __restrict pos_x,
  4720       const double * __restrict pos_y,
  4721       const double * __restrict pos_z,
  4722       double * __restrict vel_x,
  4723       double * __restrict vel_y,
  4724       double * __restrict vel_z,
  4725       double * __restrict velNew_x,
  4726       double * __restrict velNew_y,
  4727       double * __restrict velNew_z,
  4728       double * __restrict f_normal_x,
  4729       double * __restrict f_normal_y,
  4730       double * __restrict f_normal_z,
  4731       cudaTensor* __restrict virial,
  4732       cudaTensor* __restrict h_virial,
  4733       unsigned int* tbcatomic){
  4736   // zero_cudaTensor(lVirial);
  4737   int i = threadIdx.x + blockIdx.x*blockDim.x;
  4741   lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
  4742   lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
  4743   lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
  4744   //for(int i = tid; i < numAtoms; i += stride){
  4746     df[0] = (velNew_x[i] - vel_x[i]) * ((double)mass[i] * invdt);
  4747     df[1] = (velNew_y[i] - vel_y[i]) * ((double)mass[i] * invdt);
  4748     df[2] = (velNew_z[i] - vel_z[i]) * ((double)mass[i] * invdt);
  4750       pos[0] = pos_x[i]; pos[1] = pos_y[i]; pos[2] = pos_z[i];
  4751       lVirial.xx += df[0] * pos[0];
  4752       lVirial.xy += df[0] * pos[1];
  4753       lVirial.xz += df[0] * pos[2];
  4754       lVirial.yx += df[1] * pos[0];
  4755       lVirial.yy += df[1] * pos[1];
  4756       lVirial.yz += df[1] * pos[2];
  4757       lVirial.zx += df[2] * pos[0];
  4758       lVirial.zy += df[2] * pos[1];
  4759       lVirial.zz += df[2] * pos[2];
  4762     //Updates force and velocities
  4763     f_normal_x[i] += df[0];
  4764     f_normal_y[i] += df[1];
  4765     f_normal_z[i] += df[2];
  4767     // instead of copying I can swap the pointers
  4768     vel_x[i] = velNew_x[i];
  4769     vel_y[i] = velNew_y[i];
  4770     vel_z[i] = velNew_z[i];
  4773   // Make sure that every thread in the block has its lVirial set,
  4774   // then do a block reduction on each component of the virial tensor
  4775   // to get consistent values
  4776   // The virial tensor is supposed to be symmetric
  4777   // XXX TODO: Handwrite the reductions to avoid syncthreads
  4779     typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
  4780     __shared__ typename BlockReduce::TempStorage temp_storage;
  4781     lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
  4783       lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
  4784       lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
  4786     lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
  4787     lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
  4790       lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
  4792     lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
  4793     lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
  4794     lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
  4796     // Every block now holds a locally reduced virial
  4797     // Thread 0 of each block does an atomicAdd into its bin to build the global virial
  4798     if(threadIdx.x == 0){
  4799       const int bin = blockIdx.x % ATOMIC_BINS;
  4801       atomicAdd(&virial[bin].xx, lVirial.xx);
  4803         atomicAdd(&virial[bin].xy, lVirial.xy);
  4804         atomicAdd(&virial[bin].xz, lVirial.xz);
  4806       atomicAdd(&virial[bin].yx, lVirial.yx);
  4807       atomicAdd(&virial[bin].yy, lVirial.yy);
  4809         atomicAdd(&virial[bin].yz, lVirial.yz);
  4811       atomicAdd(&virial[bin].zx, lVirial.zx);
  4812       atomicAdd(&virial[bin].zy, lVirial.zy);
  4813       atomicAdd(&virial[bin].zz, lVirial.zz);
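       // Builds the per-hydrogen-group RATTLE parameters: constrained atom pairs
       // (ia, ib), squared bond lengths, and reciprocal masses. The fillIndexes
       // template flag appears to select between gathering the indexes of groups
       // that need RATTLE and filling their parameter entries.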
  4819 template <bool fillIndexes>
  4820 __global__ void buildRattleParams(const int size,
  4824   //Do I need new vectors for holding the rattle forces? Let's do this for now
  4825   const float  * __restrict mass,
  4826   const int    * __restrict hydrogenGroupSize,
  4827   const float  * __restrict rigidBondLength,
  4828   const int   * __restrict atomFixed,
  4829   CudaRattleElem*  rattleList,
  4832   int tid = threadIdx.x + blockIdx.x*blockDim.x;
  4833   int stride = gridDim.x * blockDim.x;
  4842   for(int k = tid; k < size; k += stride){
  4844         i = rattleIndexes[k];
  4848     hgs = hydrogenGroupSize[ig];
  4849     if(hgs == 1) continue; // early exit: a single-atom group has no constraints
  4852     for(int j = 0; j < hgs; j++){
  4853       rmass[j] = (mass[ig + j] > 0.f ? __frcp_rn(mass[ig+j]) : 0.f);
  4854       fixed[j] = atomFixed[ig+j];
  4860     float tmp = rigidBondLength[ig];
  4861     // Bail out here if none of the atoms in the group are fixed
  4863       if(!anyfixed) continue;
  4864       if( !(fixed[1] && fixed[2]) ){
  4865         dsq[icnt]   = tmp*tmp;
  4871         rattleIndexes[i] = i;
  4874         for(int j = 1; j < hgs; j++){
  4875             if((tmp = rigidBondLength[ig+j]) > 0){
  4876                 if( !(fixed[0] && fixed[j])){
  4877                     dsq[icnt]   = tmp * tmp;
  4883         // XXX: this is inefficient, doing an atomicAdd for each rattle group
  4884         // Improve this later
  4885         rattleList[k].ig   = ig;
  4886         rattleList[k].icnt = icnt;
  4887         for(int j = 0; j < icnt; j++){
  4890             rattleList[k].params[j].ia  = ia;
  4891             rattleList[k].params[j].ib  = ib;
  4892             rattleList[k].params[j].dsq = dsq[j];
  4893             rattleList[k].params[j].rma = rmass[ia];
  4894             rattleList[k].params[j].rmb = rmass[ib];
  4900 void buildRattleLists(
  4901     const bool doFixedAtoms,
  4906     const int    *hydrogenGroupSize,
  4907     const float *rigidBondLength,
  4908     const int *atomFixed,
  4910     size_t& settleListSize,
  4912     size_t& consFailureSize,
  4913     CudaRattleElem **rattleList,
  4914     size_t& rattleListSize,
  4922     char*& d_rattleList_temp_storage,
  4923     size_t& temp_storage_bytes,
  4924     int*& rattleIndexes,
  4925     size_t& rattleIndexes_size,
  4927     cudaStream_t stream){
  4930     //thrust::device_ptr<const int> h_dev = thrust::device_pointer_cast(hydrogenGroupSize)
  4931     size_t st_hgi_size = (size_t)(hgi_size);
  4932     reallocate_device<int>(&hgi, &st_hgi_size, natoms, OVERALLOC);
  4933     hgi_size = (size_t)(st_hgi_size);
  4934     size_t temp_length = 1;
  4935     reallocate_device<int>(&d_nHG, &temp_length, 1);
  4936     reallocate_device<int>(&d_nSettles, &temp_length, 1);
  4937     reallocate_device<int>(&d_nRattles, &temp_length, 1);
  4939     copy_DtoD<int>(hydrogenGroupSize, hgi, natoms, stream);
  4940     size_t new_storage_bytes = 0; 
  4941     // Remove zeroes from the HGI array.
  4942     // Pass NULL to the first CUB call to query the required temp buffer size.
  4943     cudaCheck(cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, hgi, 
  4944         d_nHG, natoms, notZero(), stream));
  4945     reallocate_device<char>(&d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC); 
  4946     // DC: There were issues where reallocate wanted int types and CUB wanted unsigned ints
  4947     new_storage_bytes = temp_storage_bytes;
  4948     cudaCheck( cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, hgi, 
  4949         d_nHG, natoms, notZero(), stream));
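    // The pair of calls above is the standard CUB two-pass pattern: the first call,
    // with a NULL temp-storage pointer, only reports the required temp-storage size;
    // the buffer is then (re)allocated and the second call does the actual selection.
    // Generic sketch of the pattern (illustrative only, hypothetical buffer names):
    //   size_t bytes = 0;
    //   cub::DeviceSelect::If(NULL, bytes, d_in, d_out, d_num_out, n, pred, stream);
    //   cudaMalloc(&d_temp, bytes);
    //   cub::DeviceSelect::If(d_temp, bytes, d_in, d_out, d_num_out, n, pred, stream);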
  4952     copy_DtoH<int>(d_nHG, &temp, 1, stream );
  4955     cudaCheck(cudaStreamSynchronize(stream));
  4957     // Exclusive scan to build an array of indices
  4958     new_storage_bytes = 0; 
  4959     cub::DeviceScan::ExclusiveSum(NULL, 
  4960       new_storage_bytes, hgi, hgi, nHG, stream);
  4961     reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC); 
  4962     new_storage_bytes = temp_storage_bytes;
  4963     cub::DeviceScan::ExclusiveSum(d_rattleList_temp_storage, new_storage_bytes, 
  4964        hgi, hgi, nHG, stream);
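    // After the selection above, hgi holds one entry per hydrogen group (the nonzero
    // group sizes), and this exclusive prefix sum turns those sizes into the starting
    // atom index of each group. Illustrative example: group sizes {3, 3, 4} become
    // offsets {0, 3, 6}, which buildRattleParams below uses to locate each group.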
  4966     size_t st_tempListSize = (size_t)(settleListSize);
  4967     reallocate_device<int>(settleList, &st_tempListSize, nHG, OVERALLOC);
  4968     settleListSize = (size_t)(st_tempListSize);
  4970     st_tempListSize = (size_t)(rattleListSize);
  4971     reallocate_device<CudaRattleElem>(rattleList, &st_tempListSize, nHG, OVERALLOC);
  4972     rattleListSize = (size_t)(st_tempListSize);
  4974     st_tempListSize = (size_t)(consFailureSize);
  4975     reallocate_device<int>(consFailure, &st_tempListSize, nHG, OVERALLOC);
  4976     consFailureSize = (size_t)(st_tempListSize);
  4978     st_tempListSize = (size_t)(rattleIndexes_size);
  4979     reallocate_device<int>(&rattleIndexes, &st_tempListSize, nHG, OVERALLOC);
  4980     rattleIndexes_size = (size_t)(st_tempListSize);
  4983     cudaCheck(cudaMemsetAsync(*rattleList,  0, sizeof(CudaRattleElem)*nHG, stream));
  4984     cudaCheck(cudaMemsetAsync(rattleIndexes,-1, sizeof(int)*nHG, stream));
  4985     cudaCheck(cudaMemsetAsync(*settleList, 0, sizeof(int)*nHG, stream));
  4987     ///thrust::device_vector<int> hgi(natoms);
  4988     new_storage_bytes = 0;
  4990       isWater<true> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
  4991       // builds SettleList with isWater functor
  4992       cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
  4993           d_nSettles, nHG, comp, stream);
  4994       reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
  4995       new_storage_bytes = temp_storage_bytes;
  4996       cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
  4997           d_nSettles, nHG, comp, stream);
  4999       isWater<false> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
  5000       // builds SettleList with isWater functor
  5001       cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
  5002           d_nSettles, nHG, comp, stream);
  5003       reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
  5004       new_storage_bytes = temp_storage_bytes;
  5005       cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
  5006           d_nSettles, nHG, comp, stream);
  5009     // At this point we have the number of SETTLE groups
  5010     copy_DtoH<int>(d_nSettles, &nSettles, 1, stream);
  5011     // fprintf(stderr, "nSettles = %d", nSettles);
  5013     // First pass of buildRattleParams (fillIndexes = true): mark groups that need RATTLE in rattleIndexes
  5014     buildRattleParams<true><<<128, 128, 0, stream>>>(nHG, dt, invdt,
  5015         hgi, mass, hydrogenGroupSize,
  5016         rigidBondLength, atomFixed, *rattleList, rattleIndexes);
  5017     cudaCheck(cudaGetLastError());
  5019     // Compact rattleIndexes by removing the -1 entries
  5020     new_storage_bytes = 0; 
  5021     cub::DeviceSelect::If(NULL, new_storage_bytes, 
  5022         rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
  5023     reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC); 
  5024     new_storage_bytes = temp_storage_bytes;
  5025     cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, 
  5026         rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
  5027     copy_DtoH<int>(d_nRattles, &nRattles, 1, stream);
  5029     // Second pass of buildRattleParams (fillIndexes = false): compute RATTLE parameters for the compacted indexes
  5030     buildRattleParams<false><<<128, 128, 0, stream>>>(nRattles, dt, invdt,
  5031         hgi, mass, hydrogenGroupSize,
  5032         rigidBondLength, atomFixed, *rattleList, rattleIndexes);
  5034     cudaCheck(cudaGetLastError());
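    // Summary of the RATTLE list construction above:
    //   1) the first buildRattleParams pass (fillIndexes = true) writes the index of
    //      every group that needs RATTLE into rattleIndexes (other entries stay -1);
    //   2) cub::DeviceSelect::If compacts rattleIndexes and counts them (nRattles);
    //   3) the second pass fills rattleList with per-constraint parameters
    //      (atom pair, squared bond length, reciprocal masses).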
  5037 // XXX TODO: JM: The memory access pattern in this function is poor, since memory
  5038 //           has to be accessed by hydrogen group. Improve it later.
  5040 void SequencerCUDAKernel::rattle1(
  5041   const bool doFixedAtoms,
  5042   const bool doEnergy,
  5043   const bool pressure, // pressure is only false for startRun1 and startRun2 right now
  5063   const int *hydrogenGroupSize,
  5064   const float *rigidBondLength,
  5066   const int *atomFixed,
  5068   size_t& settleListSize, 
  5069   int **consFailure_d,
  5070   size_t& consFailureSize, 
  5071   CudaRattleElem **rattleList,
  5072   size_t& rattleListSize,
  5076   cudaTensor *h_virial,
  5077   unsigned int* tbcatomic,
  5079   SettleParameters *sp,
  5082   const WaterModel water_model,
  5083   cudaStream_t stream)
  5085   //Calls the necessary functions to enforce rigid bonds
  5086   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  5088   cudaCheck(cudaStreamSynchronize(stream));
  5089   if(migration || !firstRattleDone){
  5090     // Rebuild the SETTLE/RATTLE lists (after migration, or on the first call)
  5092         doFixedAtoms, numAtoms, dt, invdt, mass, hydrogenGroupSize,
  5093         rigidBondLength, atomFixed, settleList, settleListSize,
  5094         consFailure_d, consFailureSize, rattleList, rattleListSize,
  5096         d_nHG, d_nSettles, d_nRattles, hgi, hgi_size,
  5097         d_rattleList_temp_storage, temp_storage_bytes,
  5098         rattleIndexes, rattleIndexes_size, 
  5100     firstRattleDone = true;
  5105     this->updateRigidArrays(
  5106       doFixedAtoms, dt, atomFixed,
  5107       vel_x, vel_y, vel_z,
  5108       pos_x, pos_y, pos_z,
  5109       velNew_x, velNew_y, velNew_z,
  5110       posNew_x, posNew_y, posNew_z,
  5114     // if(*nSettle != 0){
  5117             numAtoms, dt, invdt, *nSettle,
  5118             vel_x, vel_y, vel_z,
  5119             pos_x, pos_y, pos_z,
  5120             velNew_x, velNew_y, velNew_z,
  5121             posNew_x, posNew_y, posNew_z,
  5122             f_normal_x, f_normal_y, f_normal_z, virial, mass,
  5123             hydrogenGroupSize, rigidBondLength, atomFixed,
  5124             *settleList, sp, water_model, stream);
  5130         doEnergy, doFixedAtoms, *rattleList, *nRattle, hydrogenGroupSize,
  5131         atomFixed, pos_x, pos_y, pos_z,
  5132         posNew_x, posNew_y, posNew_z,
  5133         vel_x, vel_y, vel_z,
  5134         velNew_x, velNew_y,velNew_z,
  5135         f_normal_x, f_normal_y, f_normal_z, virial, mass,
  5136         invdt, tol2, maxiter, *consFailure_d,
  5137         consFailure, stream);
  5140       Rattle1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
  5141         dt, invdt, *nSettle,
  5142         vel_x, vel_y, vel_z,
  5143         pos_x, pos_y, pos_z,
  5144         velNew_x, velNew_y, velNew_z,
  5145         posNew_x, posNew_y, posNew_z,
  5146         f_normal_x, f_normal_y, f_normal_z,
  5148         hydrogenGroupSize, rigidBondLength, atomFixed,
  5150         *rattleList, *nRattle, tol2, maxiter, *consFailure_d);
  5155       // only for the first time step
  5156       copy_DtoD<double>(posNew_x, pos_x, numAtoms, stream);
  5157       copy_DtoD<double>(posNew_y, pos_y, numAtoms, stream);
  5158       copy_DtoD<double>(posNew_z, pos_z, numAtoms, stream);
  5159       cudaCheck(cudaStreamSynchronize(stream));
  5160     } else if (pressure == 0) { // pressure is false for the first time steps
  5161       copy_DtoD<double>(velNew_x, vel_x, numAtoms, stream);
  5162       copy_DtoD<double>(velNew_y, vel_y, numAtoms, stream);
  5163       copy_DtoD<double>(velNew_z, vel_z, numAtoms, stream);
  5164       cudaCheck(cudaStreamSynchronize(stream));
  5166 #define CALL(DOENERGY, DOFIXED) \
  5167   addRattleForce<DOENERGY, DOFIXED><<<grid, ATOM_BLOCKS, 0 ,stream>>>( \
  5168     numAtoms, invdt, atomFixed, mass, \
  5169     pos_x, pos_y, pos_z, \
  5170     vel_x, vel_y, vel_z, \
  5171     velNew_x, velNew_y, velNew_z, \
  5172     f_normal_x, f_normal_y, f_normal_z, virial, h_virial, tbcatomic)
  5173       if (doEnergy && doFixedAtoms) CALL(true, true);
  5174       else if (doEnergy && !doFixedAtoms) CALL(true, false);
  5175       else if (!doEnergy && doFixedAtoms) CALL(false, true);
  5176       else if (!doEnergy && !doFixedAtoms) CALL(false, false);
  5177       else NAMD_bug("addRattleForce was not called in SequencerCUDAKernel::rattle1!\n");
  5182     // Call a fused kernel without any virial tensor update.
  5183     // Compute the number of SETTLE and SHAKE blocks.
  5184     int nSettleBlocks = (*nSettle + 128 -1 ) / 128;
  5185     int nShakeBlocks  = (*nRattle + 128 -1 ) / 128;
  5186     // int nTotalBlocks  = (nSettleBlocks + nShakeBlocks);
  5187     if (doFixedAtoms) NAMD_bug("CallRattle1Kernel does not support fixed atoms!\n");
  5189       numAtoms, dt, invdt, *nSettle,
  5190       vel_x, vel_y, vel_z,
  5191       pos_x, pos_y, pos_z,
  5192       f_normal_x, f_normal_y, f_normal_z,
  5193       mass, hydrogenGroupSize, rigidBondLength, atomFixed,
  5195       *rattleList, *nRattle, tol2, maxiter, *consFailure_d, nSettleBlocks,
  5196       nShakeBlocks, water_model, stream
  5203 template <bool T_NORMALIZED, bool T_DOENERGY>
  5204 __global__ void eFieldKernel(
  5206   const double3     eField,
  5207   const double      eFieldOmega, 
  5208   const double      eFieldPhi, 
  5211   const char3*  __restrict     transform, 
  5212   const float*  __restrict     charges,
  5213   const double* __restrict     pos_x,
  5214   const double* __restrict     pos_y,
  5215   const double* __restrict     pos_z,
  5216   double*       __restrict     f_normal_x,
  5217   double*       __restrict     f_normal_y,
  5218   double*       __restrict     f_normal_z,
  5219   double3*      __restrict     d_extForce,
  5220   cudaTensor*   __restrict     d_extVirial,
  5221   double*       __restrict     d_extEnergy,
  5222   double3*      __restrict     h_extForce,
  5223   cudaTensor*   __restrict     h_extVirial,
  5224   double*       __restrict     h_extEnergy,
  5225   unsigned int*                tbcatomic)
  5227   int i = threadIdx.x + blockIdx.x*blockDim.x;
  5230   // This can all be moved outside of the kernel
  5233   double eCos = cos(eFieldOmega * t - eFieldPhi);
  5234   double charge = charges[i];
  5235   double3 r_extForce = {0, 0, 0};
  5236   double  r_extEnergy = 0;
  5237   cudaTensor r_extVirial = {0};
  5238   int totaltb = gridDim.x;
  5239   __shared__ bool isLastBlockDone;
  5240   eField1.x = eCos * eField.x;
  5241   eField1.y = eCos * eField.y;
  5242   eField1.z = eCos * eField.z;
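  // The applied field oscillates in time as E(t) = eField * cos(eFieldOmega * t - eFieldPhi),
  // and each atom then feels a force q_i * E(t); a static field is the special case
  // eFieldOmega == 0, eFieldPhi == 0.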
  5244   if(threadIdx.x == 0) {
  5245      isLastBlockDone = 0;
  5251     // This can be moved outside of the kernel
  5253       eField1 = Vector(lat.a_r()* (Vector)eField1, lat.b_r()* (Vector)eField1, lat.c_r()* (Vector)eField1);
  5256     fx = charge * eField1.x;
  5257     fy = charge * eField1.y;
  5258     fz = charge * eField1.z;
  5260     f_normal_x[i] += fx;
  5261     f_normal_y[i] += fy;
  5262     f_normal_z[i] += fz;
  5265       const char3 tr = transform[i];
  5271       // All threads in the block compute their contribution to the energy, extForce, and extVirial
  5272       vpos = lat.reverse_transform(vpos, tr);
  5273       double3 o = lat.origin();
  5274       r_extEnergy -= (fx * (vpos.x - o.x)) + (fy * (vpos.y - o.y)) + (fz * (vpos.z - o.z));
  5281         // Outer product of force and position gives the virial contribution
  5282         r_extVirial.xx = fx * vpos.x;
  5283         r_extVirial.xy = fx * vpos.y;
  5284         r_extVirial.xz = fx * vpos.z;
  5285         r_extVirial.yx = fy * vpos.x;
  5286         r_extVirial.yy = fy * vpos.y;
  5287         r_extVirial.yz = fy * vpos.z;
  5288         r_extVirial.zx = fz * vpos.x;
  5289         r_extVirial.zz = fz * vpos.z;
  5290         r_extVirial.zy = fz * vpos.y;
  5297       // use cub to reduce energies
  5298       typedef cub::BlockReduce<double, ATOM_BLOCKS> BlockReduce;
  5299       __shared__ typename BlockReduce::TempStorage temp_storage;
  5300       r_extEnergy  = BlockReduce(temp_storage).Sum(r_extEnergy);
  5303         // Do the remaining reductions
  5305         r_extForce.x  = BlockReduce(temp_storage).Sum(r_extForce.x);
  5307         r_extForce.y  = BlockReduce(temp_storage).Sum(r_extForce.y);
  5309         r_extForce.z  = BlockReduce(temp_storage).Sum(r_extForce.z);
  5312         r_extVirial.xx  = BlockReduce(temp_storage).Sum(r_extVirial.xx);
  5314         r_extVirial.xy  = BlockReduce(temp_storage).Sum(r_extVirial.xy);
  5316         r_extVirial.xz  = BlockReduce(temp_storage).Sum(r_extVirial.xz);
  5318         r_extVirial.yx  = BlockReduce(temp_storage).Sum(r_extVirial.yx);
  5320         r_extVirial.yy  = BlockReduce(temp_storage).Sum(r_extVirial.yy);
  5322         r_extVirial.yz  = BlockReduce(temp_storage).Sum(r_extVirial.yz);
  5324         r_extVirial.zx  = BlockReduce(temp_storage).Sum(r_extVirial.zx);
  5326         r_extVirial.zy  = BlockReduce(temp_storage).Sum(r_extVirial.zy);
  5328         r_extVirial.zz  = BlockReduce(temp_storage).Sum(r_extVirial.zz);
  5332       // Now atomically add the block results to global memory
  5333       if(threadIdx.x == 0 ){
  5334         const int bin = blockIdx.x % ATOMIC_BINS;
  5336         // printf("adding %lf to d_extEnergy\n", r_extEnergy);
  5337         atomicAdd(&d_extEnergy[bin], r_extEnergy);
  5340           // add force and virial as well
  5341           atomicAdd(&d_extForce[bin].x, r_extForce.x);
  5342           atomicAdd(&d_extForce[bin].y, r_extForce.y);
  5343           atomicAdd(&d_extForce[bin].z, r_extForce.z);
  5345           atomicAdd(&d_extVirial[bin].xx, r_extVirial.xx);
  5346           atomicAdd(&d_extVirial[bin].xy, r_extVirial.xy);
  5347           atomicAdd(&d_extVirial[bin].xz, r_extVirial.xz);
  5349           atomicAdd(&d_extVirial[bin].yx, r_extVirial.yx);
  5350           atomicAdd(&d_extVirial[bin].yy, r_extVirial.yy);
  5351           atomicAdd(&d_extVirial[bin].yz, r_extVirial.yz);
  5353           atomicAdd(&d_extVirial[bin].zx, r_extVirial.zx);
  5354           atomicAdd(&d_extVirial[bin].zy, r_extVirial.zy);
  5355           atomicAdd(&d_extVirial[bin].zz, r_extVirial.zz);
  5359         unsigned int value = atomicInc(&tbcatomic[2], totaltb);
  5360         isLastBlockDone = (value == (totaltb -1));
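        // "Last block done" pattern: atomicInc returns the previous counter value, so
        // exactly one block sees totaltb - 1 and takes the final branch below, which
        // combines the ATOMIC_BINS partial sums, writes the results back for the host,
        // and resets the counter for the next kernel launch.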
  5364       if(isLastBlockDone){
  5365         if(threadIdx.x < ATOMIC_BINS){
  5366           const int bin = threadIdx.x;
  5368           double e = d_extEnergy[bin];
  5372             f = d_extForce[bin];
  5373             v = d_extVirial[bin];
  5376           // sets device scalars back to zero
  5377           d_extEnergy[bin] = 0.0;
  5379             d_extForce[bin] = make_double3(0.0, 0.0, 0.0);
  5380             zero_cudaTensor(d_extVirial[bin]);
  5383           if(ATOMIC_BINS > 1){
  5384             typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
  5385             typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
  5386             __shared__ typename WarpReduce::TempStorage tempStorage;
  5387             __shared__ typename WarpReduceT::TempStorage tempStorageT;
  5389             e = WarpReduce(tempStorage).Sum(e);
  5391               f.x = WarpReduce(tempStorage).Sum(f.x);
  5392               f.y = WarpReduce(tempStorage).Sum(f.y);
  5393               f.z = WarpReduce(tempStorage).Sum(f.z);
  5394               v = WarpReduceT(tempStorageT).Sum(v);
  5398           if(threadIdx.x == 0){
  5405             //resets atomic counter
  5406             reset_atomic_counter(&(tbcatomic[2]));
  5413 // JM NOTE: Apply external electric field to every atom in the system equally
  5414 void SequencerCUDAKernel::apply_Efield(
  5416   const bool        normalized,
  5417   const bool        doEnergy,
  5418   const double3     eField,
  5419   const double      eFieldOmega, 
  5420   const double      eFieldPhi, 
  5423   const char3*      transform, 
  5424   const float*      charges,
  5425   const double*     pos_x,
  5426   const double*     pos_y,
  5427   const double*     pos_z,
  5431   double3*          d_extForce,
  5432   cudaTensor*       d_extVirial,
  5433   double*           d_extEnergy,
  5434   double3*          h_extForce,
  5435   cudaTensor*       h_extVirial,
  5436   double*           h_extEnergy,
  5437   unsigned int*     tbcatomic,
  5440   int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  5443       eFieldKernel<true, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
  5444         eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
  5445         pos_x, pos_y, pos_z,
  5446         f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
  5447         d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
  5449       eFieldKernel<true, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
  5450         eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
  5451         pos_x, pos_y, pos_z,
  5452         f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
  5453         d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
  5457       eFieldKernel<false, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
  5458         eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
  5459         pos_x, pos_y, pos_z,
  5460         f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
  5461         d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
  5463       eFieldKernel<false, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
  5464         eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
  5465         pos_x, pos_y, pos_z,
  5466         f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
  5467         d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
  5472 SequencerCUDAKernel::SequencerCUDAKernel() { 
  5473   firstRattleDone = false;
  5474   intConstInit = false;
  5483   d_rattleList_temp_storage = NULL;
  5484   temp_storage_bytes = 0;
  5486   rattleIndexes = NULL;
  5487   rattleIndexes_size = 0;
  5490 SequencerCUDAKernel::~SequencerCUDAKernel() {
  5491   deallocate_device<int>(&d_nHG); 
  5492   deallocate_device<int>(&d_nSettles); 
  5493   deallocate_device<int>(&d_nRattles); 
  5494   deallocate_device<int>(&hgi); 
  5495   deallocate_device<int>(&rattleIndexes); 
  5496   deallocate_device<char>(&d_rattleList_temp_storage); 
  5506 void SequencerCUDAKernel::set_pme_positions(
  5508   const bool isPmeDevice, 
  5510   const int  numPatchesHomeAndProxy,
  5511   const int  numPatchesHome,
  5516   const bool doAlchDecouple,
  5517   const bool doAlchSoftCore,
  5518   const bool handleBoundary,
  5519   const double* d_pos_x,
  5520   const double* d_pos_y,
  5521   const double* d_pos_z,
  5522 #ifndef NAMD_NCCL_ALLREDUCE
  5523   double**      d_peer_pos_x, 
  5524   double**      d_peer_pos_y, 
  5525   double**      d_peer_pos_z, 
  5526   float**       d_peer_charge, 
  5527   int**         d_peer_partition,
  5529   const float* charges,
  5530   const int* partition,
  5531   const double charge_scaling, 
  5532   const double3* patchCenter, 
  5533   const int* s_patchPositions, 
  5534   const int* s_pencilPatchIndex, 
  5535   const int* s_patchIDs, 
  5536   const int* patchSortOrder, 
  5537   const Lattice lattice,
  5543   CudaLocalRecord*                  localRecords,
  5544   CudaPeerRecord*                   peerRecords,
  5545   std::vector<int>& atomCounts,
  5548   // Launch PME setComputes
  5549   #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
  5550     setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
  5551     <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
  5552           d_pos_x, d_pos_y, d_pos_z, charges, \
  5553           d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
  5554           d_peer_partition, partition, charge_scaling,  \
  5555           s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
  5556           lattice, s_atoms, numTotalAtoms, s_partition, \
  5557           i, numAtoms, offset, handleBoundary \
  5559   // Only when PME long-range electrostatics is enabled (doSlow is true).
  5560   // The partition is needed for alchemical calculations.
  5561   if(doSlow && isPmeDevice) {
  5563     for (int i = 0; i < nDev; i++) {
  5564       const bool home = (i == devID);
  5565       const int numAtoms = atomCounts[i];
  5566       const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
  5567       const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
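      // The five booleans are packed into a 5-bit case index so a single switch can
      // dispatch to the matching template instantiation: bit 4 = home, bit 3 = doFEP,
      // bit 2 = doTI, bit 1 = doAlchDecouple, bit 0 = doAlchSoftCore. For example,
      // a home patch with TI and soft-core gives (1<<4) + (1<<2) + 1 = 21, i.e. CALL(1, 0, 1, 0, 1).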
  5570         case  0: CALL(0, 0, 0, 0, 0); break;
  5571         case  1: CALL(0, 0, 0, 0, 1); break;
  5572         case  2: CALL(0, 0, 0, 1, 0); break;
  5573         case  3: CALL(0, 0, 0, 1, 1); break;
  5574         case  4: CALL(0, 0, 1, 0, 0); break;
  5575         case  5: CALL(0, 0, 1, 0, 1); break;
  5576         case  6: CALL(0, 0, 1, 1, 0); break;
  5577         case  7: CALL(0, 0, 1, 1, 1); break;
  5578         case  8: CALL(0, 1, 0, 0, 0); break;
  5579         case  9: CALL(0, 1, 0, 0, 1); break;
  5580         case 10: CALL(0, 1, 0, 1, 0); break;
  5581         case 11: CALL(0, 1, 0, 1, 1); break;
  5582         case 12: CALL(0, 1, 1, 0, 0); break;
  5583         case 13: CALL(0, 1, 1, 0, 1); break;
  5584         case 14: CALL(0, 1, 1, 1, 0); break;
  5585         case 15: CALL(0, 1, 1, 1, 1); break;
  5586         case 16: CALL(1, 0, 0, 0, 0); break;
  5587         case 17: CALL(1, 0, 0, 0, 1); break;
  5588         case 18: CALL(1, 0, 0, 1, 0); break;
  5589         case 19: CALL(1, 0, 0, 1, 1); break;
  5590         case 20: CALL(1, 0, 1, 0, 0); break;
  5591         case 21: CALL(1, 0, 1, 0, 1); break;
  5592         case 22: CALL(1, 0, 1, 1, 0); break;
  5593         case 23: CALL(1, 0, 1, 1, 1); break;
  5594         case 24: CALL(1, 1, 0, 0, 0); break;
  5595         case 25: CALL(1, 1, 0, 0, 1); break;
  5596         case 26: CALL(1, 1, 0, 1, 0); break;
  5597         case 27: CALL(1, 1, 0, 1, 1); break;
  5598         case 28: CALL(1, 1, 1, 0, 0); break;
  5599         case 29: CALL(1, 1, 1, 0, 1); break;
  5600         case 30: CALL(1, 1, 1, 1, 0); break;
  5601         case 31: CALL(1, 1, 1, 1, 1); break;
  5603           NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
  5612 template <bool doGlobal, bool doForcesOutput>
  5613 __global__ void copyForcesToHostSOAKernel(
  5614   const int               numPatches,
  5615   CudaLocalRecord*        localRecords,
  5616   const int               maxForceNumber, 
  5617   const double*           d_f_normal_x,
  5618   const double*           d_f_normal_y,
  5619   const double*           d_f_normal_z,
  5620   const double*           d_f_nbond_x,
  5621   const double*           d_f_nbond_y,
  5622   const double*           d_f_nbond_z,
  5623   const double*           d_f_slow_x,
  5624   const double*           d_f_slow_y,
  5625   const double*           d_f_slow_z,
  5626   PatchDataSOA*           d_HostPatchDataSOA
  5628   __shared__ CudaLocalRecord s_record;
  5629   using AccessType = int32_t;
  5630   AccessType* s_record_buffer = (AccessType*)  &s_record;
  5632   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
  5633     // Read in the CudaLocalRecord using multiple threads. This should 
  5635     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
  5636       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
  5640     const int numAtoms = s_record.numAtoms;
  5641     const int offset = s_record.bufferOffset;
  5643     for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
  5644       if (maxForceNumber >= 0) {
  5645         d_HostPatchDataSOA[patchIndex].f_normal_x[i] = d_f_normal_x[offset + i];
  5646         d_HostPatchDataSOA[patchIndex].f_normal_y[i] = d_f_normal_y[offset + i];
  5647         d_HostPatchDataSOA[patchIndex].f_normal_z[i] = d_f_normal_z[offset + i];
  5649       if (maxForceNumber >= 1) {
  5651           d_HostPatchDataSOA[patchIndex].f_saved_nbond_x[i] = d_f_nbond_x[offset + i];
  5652           d_HostPatchDataSOA[patchIndex].f_saved_nbond_y[i] = d_f_nbond_y[offset + i];
  5653           d_HostPatchDataSOA[patchIndex].f_saved_nbond_z[i] = d_f_nbond_z[offset + i];
  5655         if (doForcesOutput) {
  5656           d_HostPatchDataSOA[patchIndex].f_nbond_x[i] = d_f_nbond_x[offset + i];
  5657           d_HostPatchDataSOA[patchIndex].f_nbond_y[i] = d_f_nbond_y[offset + i];
  5658           d_HostPatchDataSOA[patchIndex].f_nbond_z[i] = d_f_nbond_z[offset + i];
  5661       if (maxForceNumber >= 2) {
  5663           d_HostPatchDataSOA[patchIndex].f_saved_slow_x[i] = d_f_slow_x[offset + i];
  5664           d_HostPatchDataSOA[patchIndex].f_saved_slow_y[i] = d_f_slow_y[offset + i];
  5665           d_HostPatchDataSOA[patchIndex].f_saved_slow_z[i] = d_f_slow_z[offset + i];
  5667         if (doForcesOutput) {
  5668           d_HostPatchDataSOA[patchIndex].f_slow_x[i] = d_f_slow_x[offset + i];
  5669           d_HostPatchDataSOA[patchIndex].f_slow_y[i] = d_f_slow_y[offset + i];
  5670           d_HostPatchDataSOA[patchIndex].f_slow_z[i] = d_f_slow_z[offset + i];
  5679 void SequencerCUDAKernel::copyForcesToHostSOA(
  5680   const int               numPatches,
  5681   CudaLocalRecord*        localRecords,
  5682   const int               maxForceNumber, 
  5683   const double*           d_f_normal_x,
  5684   const double*           d_f_normal_y,
  5685   const double*           d_f_normal_z,
  5686   const double*           d_f_nbond_x,
  5687   const double*           d_f_nbond_y,
  5688   const double*           d_f_nbond_z,
  5689   const double*           d_f_slow_x,
  5690   const double*           d_f_slow_y,
  5691   const double*           d_f_slow_z,
  5692   PatchDataSOA*           d_HostPatchDataSOA,
  5693   const bool              doGlobal,
  5694   const bool              doForcesOutput,
  5697 #define CALL(GLOBAL, FORCESOUTPUT)                                                          \
  5698   do {copyForcesToHostSOAKernel<GLOBAL, FORCESOUTPUT><<<numPatches, PATCH_BLOCKS, 0, stream>>>( \
  5711     d_HostPatchDataSOA                                                                      \
  5713   if (doGlobal && !doForcesOutput) CALL(true, false);
  5714   else if (doGlobal && doForcesOutput) CALL(true, true);
  5715   else if (!doGlobal && doForcesOutput) CALL(false, true);
  5716   else NAMD_bug("No kernel is called in copyForcesToHostSOA");
  5720 __global__ void copyPositionsToHostSOAKernel(
  5721   const int               numPatches,
  5722   CudaLocalRecord*        localRecords,
  5723   const double*           pos_x, 
  5724   const double*           pos_y, 
  5725   const double*           pos_z, 
  5726   PatchDataSOA*           d_HostPatchDataSOA
  5728   __shared__ CudaLocalRecord s_record;
  5729   using AccessType = int32_t;
  5730   AccessType* s_record_buffer = (AccessType*)  &s_record;
  5732   for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
  5733     // Read in the CudaLocalRecord using multiple threads. This should 
  5735     for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
  5736       s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
  5740     const int numAtoms = s_record.numAtoms;
  5741     const int offset = s_record.bufferOffset;
  5743     for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
  5744       d_HostPatchDataSOA[patchIndex].pos_x[i] = pos_x[offset + i];
  5745       d_HostPatchDataSOA[patchIndex].pos_y[i] = pos_y[offset + i];
  5746       d_HostPatchDataSOA[patchIndex].pos_z[i] = pos_z[offset + i];
  5752 void SequencerCUDAKernel::copyPositionsToHostSOA(
  5753   const int               numPatches,
  5754   CudaLocalRecord*        localRecords,
  5755   const double*           pos_x, 
  5756   const double*           pos_y, 
  5757   const double*           pos_z, 
  5758   PatchDataSOA*           d_HostPatchDataSOA,
  5761   copyPositionsToHostSOAKernel<<<numPatches, PATCH_BLOCKS, 0, stream>>>(
  5764     pos_x, pos_y, pos_z,
  5769 // Adapted from HomePatch::redistrib_lp_water_force
  5770 /* Redistribute forces from the massless lonepair charge particle onto
  5771  * the other atoms of the water.
  5773  * This is done using the same algorithm CHARMM uses for TIP4P lonepairs.
  5775  * Pass by reference the forces (O H1 H2 LP) to be modified,
  5776  * pass by constant reference the corresponding positions.
  5778 template <bool doVirial>
  5779 __device__ void redistrib_lp_water_force(
  5780   double f_ox[3], double f_h1[3], double f_h2[3], double f_lp[3],
  5781   const double p_ox[3], const double p_h1[3], const double p_h2[3],
  5782   const double p_lp[3], cudaTensor& virial) {
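  // Overview of the redistribution implemented below: the lonepair force f_lp is split
  // into a radial component along the O->LP direction, which is assigned entirely to
  // the oxygen, and the remaining angular component, which is divided between the
  // oxygen and the two hydrogens according to the position of the lonepair along the
  // line from the hydrogens' midpoint to the oxygen.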
  5783   // Accumulate force adjustments
  5784   double fad_ox[3] = {};
  5785   double fad_h[3] = {};
  5786   // Calculate the radial component of the force and add it to the oxygen
  5787   const double r_ox_lp[3] = {p_lp[0] - p_ox[0],
  5790   double invlen2_r_ox_lp = rnorm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
  5791   invlen2_r_ox_lp *= invlen2_r_ox_lp;
  5792   const double rad_factor =
  5793     (f_lp[0] * r_ox_lp[0] + f_lp[1] * r_ox_lp[1] + f_lp[2] * r_ox_lp[2]) * invlen2_r_ox_lp;
  5794   const double f_rad[3] = {r_ox_lp[0] * rad_factor,
  5795                            r_ox_lp[1] * rad_factor,
  5796                            r_ox_lp[2] * rad_factor};
  5797   fad_ox[0] += f_rad[0];
  5798   fad_ox[1] += f_rad[1];
  5799   fad_ox[2] += f_rad[2];
  5800   // Calculate the angular component
  5801   const double r_hcom_ox[3] = {
  5802     p_ox[0] - ( (p_h1[0] + p_h2[0]) * 0.5 ),
  5803     p_ox[1] - ( (p_h1[1] + p_h2[1]) * 0.5 ),
  5804     p_ox[2] - ( (p_h1[2] + p_h2[2]) * 0.5 )};
  5805   const double f_ang[3] = {
  5808     f_lp[2] - f_rad[2]};
  5809   // Now split this component between the oxygen and the hydrogens
  5810   const double len_r_ox_lp = norm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
  5811   const double invlen_r_hcom_ox = rnorm3d(
  5812     r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]);
  5813   const double oxcomp =
  5814     (norm3d(r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]) - len_r_ox_lp) *
  5816   const double hydcomp = 0.5 * len_r_ox_lp * invlen_r_hcom_ox;
  5817   fad_ox[0] += (f_ang[0] * oxcomp);
  5818   fad_ox[1] += (f_ang[1] * oxcomp);
  5819   fad_ox[2] += (f_ang[2] * oxcomp);
  5820   fad_h[0] += (f_ang[0] * hydcomp);
  5821   fad_h[1] += (f_ang[1] * hydcomp);
  5822   fad_h[2] += (f_ang[2] * hydcomp);
  5824     virial.xx = fad_ox[0] * p_ox[0] + fad_h[0] * p_h1[0] + fad_h[0] * p_h2[0] - f_lp[0] * p_lp[0];
  5825     virial.xy = fad_ox[0] * p_ox[1] + fad_h[0] * p_h1[1] + fad_h[0] * p_h2[1] - f_lp[0] * p_lp[1];
  5826     virial.xz = fad_ox[0] * p_ox[2] + fad_h[0] * p_h1[2] + fad_h[0] * p_h2[2] - f_lp[0] * p_lp[2];
  5827     virial.yx = fad_ox[1] * p_ox[0] + fad_h[1] * p_h1[0] + fad_h[1] * p_h2[0] - f_lp[1] * p_lp[0];
  5828     virial.yy = fad_ox[1] * p_ox[1] + fad_h[1] * p_h1[1] + fad_h[1] * p_h2[1] - f_lp[1] * p_lp[1];
  5829     virial.yz = fad_ox[1] * p_ox[2] + fad_h[1] * p_h1[2] + fad_h[1] * p_h2[2] - f_lp[1] * p_lp[2];
  5830     virial.zx = fad_ox[2] * p_ox[0] + fad_h[2] * p_h1[0] + fad_h[2] * p_h2[0] - f_lp[2] * p_lp[0];
  5831     virial.zy = fad_ox[2] * p_ox[1] + fad_h[2] * p_h1[1] + fad_h[2] * p_h2[1] - f_lp[2] * p_lp[1];
  5832     virial.zz = fad_ox[2] * p_ox[2] + fad_h[2] * p_h1[2] + fad_h[2] * p_h2[2] - f_lp[2] * p_lp[2];
  5837   f_ox[0] += fad_ox[0];
  5838   f_ox[1] += fad_ox[1];
  5839   f_ox[2] += fad_ox[2];
  5840   f_h1[0] += fad_h[0];
  5841   f_h1[1] += fad_h[1];
  5842   f_h1[2] += fad_h[2];
  5843   f_h2[0] += fad_h[0];
  5844   f_h2[1] += fad_h[1];
  5845   f_h2[2] += fad_h[2];
  5848 template <bool doVirial>
  5849 __global__ void redistributeTip4pForcesKernel2(
  5853   cudaTensor*    d_virial,
  5854   const double*  d_pos_x,
  5855   const double*  d_pos_y,
  5856   const double*  d_pos_z,
  5857   const float*   d_mass,
  5860   const int i = threadIdx.x + blockIdx.x * blockDim.x;
  5861   cudaTensor lVirial = {0};
  5863     if (d_mass[i] < 0.01f) {
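      // A near-zero mass identifies the lonepair site; with the atom ordering assumed
      // by this kernel, the three preceding atoms are the water oxygen and the two
      // hydrogens (O, H1, H2, LP).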
  5864       double f_ox[3] = {d_f_x[i-3], d_f_y[i-3], d_f_z[i-3]};
  5865       double f_h1[3] = {d_f_x[i-2], d_f_y[i-2], d_f_z[i-2]};
  5866       double f_h2[3] = {d_f_x[i-1], d_f_y[i-1], d_f_z[i-1]};
  5867       double f_lp[3] = {d_f_x[i],   d_f_y[i],   d_f_z[i]};
  5868       const double p_ox[3] = {d_pos_x[i-3], d_pos_y[i-3], d_pos_z[i-3]};
  5869       const double p_h1[3] = {d_pos_x[i-2], d_pos_y[i-2], d_pos_z[i-2]};
  5870       const double p_h2[3] = {d_pos_x[i-1], d_pos_y[i-1], d_pos_z[i-1]};
  5871       const double p_lp[3] = {d_pos_x[i],   d_pos_y[i],   d_pos_z[i]};
  5872       redistrib_lp_water_force<doVirial>(
  5873         f_ox, f_h1, f_h2, f_lp, p_ox, p_h1, p_h2, p_lp, lVirial);
  5874       // copy the force back
  5875       d_f_x[i-3] = f_ox[0];
  5876       d_f_x[i-2] = f_h1[0];
  5877       d_f_x[i-1] = f_h2[0];
  5879       d_f_y[i-3] = f_ox[1];
  5880       d_f_y[i-2] = f_h1[1];
  5881       d_f_y[i-1] = f_h2[1];
  5883       d_f_z[i-3] = f_ox[2];
  5884       d_f_z[i-2] = f_h1[2];
  5885       d_f_z[i-1] = f_h2[2];
  5889   typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
  5890   __shared__ typename BlockReduce::TempStorage temp_storage;
  5892     lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx);
  5894     lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy);
  5896     lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz);
  5898     lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx);
  5900     lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
  5902     lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz);
  5904     lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx);
  5906     lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy);
  5908     lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz);
  5910     if (threadIdx.x == 0) {
  5911       atomicAdd(&(d_virial->xx), lVirial.xx);
  5912       atomicAdd(&(d_virial->xy), lVirial.xy);
  5913       atomicAdd(&(d_virial->xz), lVirial.xz);
  5914       atomicAdd(&(d_virial->yx), lVirial.yx);
  5915       atomicAdd(&(d_virial->yy), lVirial.yy);
  5916       atomicAdd(&(d_virial->yz), lVirial.yz);
  5917       atomicAdd(&(d_virial->zx), lVirial.zx);
  5918       atomicAdd(&(d_virial->zy), lVirial.zy);
  5919       atomicAdd(&(d_virial->zz), lVirial.zz);
  5924 void SequencerCUDAKernel::redistributeTip4pForces(
  5925   double*        d_f_normal_x,
  5926   double*        d_f_normal_y,
  5927   double*        d_f_normal_z,
  5928   double*        d_f_nbond_x,
  5929   double*        d_f_nbond_y,
  5930   double*        d_f_nbond_z,
  5934   cudaTensor*    d_virial_normal,
  5935   cudaTensor*    d_virial_nbond,
  5936   cudaTensor*    d_virial_slow,
  5937   const double*  d_pos_x,
  5938   const double*  d_pos_y,
  5939   const double*  d_pos_z,
  5940   const float*   d_mass,
  5943   const int      maxForceNumber,
  5946   const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  5947   switch (maxForceNumber) {
  5949       if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  5950       else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  5952       if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  5953       else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  5955       if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  5956       else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  5960 __forceinline__ __device__ void calcVirial(
  5961   const int i, const double factor,
  5962   const double* __restrict f_x, const double* __restrict f_y, const double* __restrict f_z,
  5963   const double* __restrict r_x, const double* __restrict r_y, const double* __restrict r_z,
  5964   cudaTensor& tensor_out) {
  5965   tensor_out.xx = factor * f_x[i] * r_x[i];
  5966   tensor_out.xy = factor * f_x[i] * r_y[i];
  5967   tensor_out.xz = factor * f_x[i] * r_z[i];
  5968   tensor_out.yx = factor * f_y[i] * r_x[i];
  5969   tensor_out.yy = factor * f_y[i] * r_y[i];
  5970   tensor_out.yz = factor * f_y[i] * r_z[i];
  5971   tensor_out.zx = factor * f_z[i] * r_x[i];
  5972   tensor_out.zy = factor * f_z[i] * r_y[i];
  5973   tensor_out.zz = factor * f_z[i] * r_z[i];
  5976 template <bool doNbond, bool doSlow>
  5977 __global__ void calcFixVirialKernel(
  5979   const int*    __restrict  d_atomFixed,
  5980   const double* __restrict  d_fixedPosition_x,
  5981   const double* __restrict  d_fixedPosition_y,
  5982   const double* __restrict  d_fixedPosition_z,
  5983   const double* __restrict  d_f_normal_x,
  5984   const double* __restrict  d_f_normal_y,
  5985   const double* __restrict  d_f_normal_z,
  5986   const double* __restrict  d_f_nbond_x,
  5987   const double* __restrict  d_f_nbond_y,
  5988   const double* __restrict  d_f_nbond_z,
  5989   const double* __restrict  d_f_slow_x,
  5990   const double* __restrict  d_f_slow_y,
  5991   const double* __restrict  d_f_slow_z,
  5992   cudaTensor*   __restrict  d_virial_normal,
  5993   cudaTensor*   __restrict  d_virial_nbond,
  5994   cudaTensor*   __restrict  d_virial_slow,
  5995   double3*      __restrict  d_extForce_normal,
  5996   double3*      __restrict  d_extForce_nbond,
  5997   double3*      __restrict  d_extForce_slow)
  5999   const int i = threadIdx.x + blockIdx.x * blockDim.x;
  6000   cudaTensor vir_normal = {0};
  6001   cudaTensor vir_nbond  = {0};
  6002   cudaTensor vir_slow   = {0};
  6003   double3 f_sum_normal  = {0, 0, 0};
  6004   double3 f_sum_nbond;
  6006     f_sum_nbond = {0, 0, 0};
  6010     f_sum_slow = {0, 0, 0};
  6013     if (d_atomFixed[i]) {
  6014       calcVirial(i, -1.0, d_f_normal_x, d_f_normal_y, d_f_normal_z,
  6015                  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_normal);
  6016       f_sum_normal.x = -1.0 * d_f_normal_x[i];
  6017       f_sum_normal.y = -1.0 * d_f_normal_y[i];
  6018       f_sum_normal.z = -1.0 * d_f_normal_z[i];
  6020         calcVirial(i, -1.0, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
  6021                    d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_nbond);
  6022         f_sum_nbond.x = -1.0 * d_f_nbond_x[i];
  6023         f_sum_nbond.y = -1.0 * d_f_nbond_y[i];
  6024         f_sum_nbond.z = -1.0 * d_f_nbond_z[i];
  6027         calcVirial(i, -1.0, d_f_slow_x, d_f_slow_y, d_f_slow_z,
  6028                    d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_slow);
  6029         f_sum_slow.x = -1.0 * d_f_slow_x[i];
  6030         f_sum_slow.y = -1.0 * d_f_slow_y[i];
  6031         f_sum_slow.z = -1.0 * d_f_slow_z[i];
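        // Note: for each fixed atom this kernel accumulates the outer product of -f with
        // the fixed position into the virial, and -f into the external-force sum, i.e.
        // it subtracts the contribution of forces acting on atoms that are held fixed.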
  6035   // Haochuan: is it OK to use lambda to reduce the cudaTensor as a whole instead of component-wise??
  6036   typedef cub::BlockReduce<cudaTensor, ATOM_BLOCKS> BlockReduceCudaTensor;
  6037   __shared__ typename BlockReduceCudaTensor::TempStorage temp_storage;
  6038   vir_normal = BlockReduceCudaTensor(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
  6041     vir_nbond = BlockReduceCudaTensor(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
  6045     vir_slow = BlockReduceCudaTensor(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
  6048   typedef cub::BlockReduce<double3, ATOM_BLOCKS> BlockReduceDouble3;
  6049   __shared__ typename BlockReduceDouble3::TempStorage temp_storage2;
  6050   f_sum_normal = BlockReduceDouble3(temp_storage2).Reduce(f_sum_normal, [](const double3& a, const double3& b){return a+b;});
  6053     f_sum_nbond = BlockReduceDouble3(temp_storage2).Reduce(f_sum_nbond, [](const double3& a, const double3& b){return a+b;});
  6057     f_sum_slow = BlockReduceDouble3(temp_storage2).Reduce(f_sum_slow, [](const double3& a, const double3& b){return a+b;});
  6060   if (threadIdx.x == 0) {
  6061     atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
  6062     atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
  6063     atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
  6064     atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
  6065     atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
  6066     atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
  6067     atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
  6068     atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
  6069     atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
  6070     atomicAdd(&(d_extForce_normal->x), f_sum_normal.x);
  6071     atomicAdd(&(d_extForce_normal->y), f_sum_normal.y);
  6072     atomicAdd(&(d_extForce_normal->z), f_sum_normal.z);
  6074       atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
  6075       atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
  6076       atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
  6077       atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
  6078       atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
  6079       atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
  6080       atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
  6081       atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
  6082       atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
  6083       atomicAdd(&(d_extForce_nbond->x), f_sum_nbond.x);
  6084       atomicAdd(&(d_extForce_nbond->y), f_sum_nbond.y);
  6085       atomicAdd(&(d_extForce_nbond->z), f_sum_nbond.z);
  6088       atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
  6089       atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
  6090       atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
  6091       atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
  6092       atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
  6093       atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
  6094       atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
  6095       atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
  6096       atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
  6097       atomicAdd(&(d_extForce_slow->x), f_sum_slow.x);
  6098       atomicAdd(&(d_extForce_slow->y), f_sum_slow.y);
  6099       atomicAdd(&(d_extForce_slow->z), f_sum_slow.z);
  6104 __global__ void copyForcesToDeviceKernel(
  6106   const double*           d_f_nbond_x,
  6107   const double*           d_f_nbond_y,
  6108   const double*           d_f_nbond_z,
  6109   const double*           d_f_slow_x,
  6110   const double*           d_f_slow_y,
  6111   const double*           d_f_slow_z,
  6112   double*                 d_f_saved_nbond_x,
  6113   double*                 d_f_saved_nbond_y,
  6114   double*                 d_f_saved_nbond_z,
  6115   double*                 d_f_saved_slow_x,
  6116   double*                 d_f_saved_slow_y,
  6117   double*                 d_f_saved_slow_z,
  6118   const int               maxForceNumber
  6120   const int i = threadIdx.x + blockIdx.x * blockDim.x;
  6122     if (maxForceNumber >= 2) {
  6123       d_f_saved_slow_x[i] = d_f_slow_x[i];
  6124       d_f_saved_slow_y[i] = d_f_slow_y[i];
  6125       d_f_saved_slow_z[i] = d_f_slow_z[i];
  6127     if (maxForceNumber >= 1) {
  6128       d_f_saved_nbond_x[i] = d_f_nbond_x[i];
  6129       d_f_saved_nbond_y[i] = d_f_nbond_y[i];
  6130       d_f_saved_nbond_z[i] = d_f_nbond_z[i];
  6135 void SequencerCUDAKernel::calcFixVirial(
  6136   const int       maxForceNumber,
  6138   const int*      d_atomFixed,
  6139   const double*   d_fixedPosition_x,
  6140   const double*   d_fixedPosition_y,
  6141   const double*   d_fixedPosition_z,
  6142   const double*   d_f_normal_x,
  6143   const double*   d_f_normal_y,
  6144   const double*   d_f_normal_z,
  6145   const double*   d_f_nbond_x,
  6146   const double*   d_f_nbond_y,
  6147   const double*   d_f_nbond_z,
  6148   const double*   d_f_slow_x,
  6149   const double*   d_f_slow_y,
  6150   const double*   d_f_slow_z,
  6151   cudaTensor*     d_virial_normal,
  6152   cudaTensor*     d_virial_nbond,
  6153   cudaTensor*     d_virial_slow,
  6154   double3*        d_extForce_normal,
  6155   double3*        d_extForce_nbond,
  6156   double3*        d_extForce_slow,
  6157   cudaStream_t   stream)
  6159   const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  6160 #define CALL(doNbond, doSlow) \
  6161   calcFixVirialKernel<doNbond, doSlow><<<grid, ATOM_BLOCKS, 0, stream>>>( \
  6162     numAtoms, d_atomFixed, \
  6163     d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, \
  6164     d_f_normal_x, d_f_normal_y, d_f_normal_z, \
  6165     d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
  6166     d_f_slow_x, d_f_slow_y, d_f_slow_z, \
  6167     d_virial_normal, d_virial_nbond, d_virial_slow, \
  6168     d_extForce_normal, d_extForce_nbond, d_extForce_slow \
  6170   switch (maxForceNumber) {
  6171     case 0: CALL(false, false); break;
  6172     case 1: CALL(true, false); break;
  6173     case 2: CALL(true, true); break;
  6174     default: NAMD_bug("No kernel is called in SequencerCUDAKernel::calcFixVirial!\n");
  6179 void SequencerCUDAKernel::copyForcesToDevice(
  6181   const double*           d_f_nbond_x,
  6182   const double*           d_f_nbond_y,
  6183   const double*           d_f_nbond_z,
  6184   const double*           d_f_slow_x,
  6185   const double*           d_f_slow_y,
  6186   const double*           d_f_slow_z,
  6187   double*                 d_f_saved_nbond_x,
  6188   double*                 d_f_saved_nbond_y,
  6189   double*                 d_f_saved_nbond_z,
  6190   double*                 d_f_saved_slow_x,
  6191   double*                 d_f_saved_slow_y,
  6192   double*                 d_f_saved_slow_z,
  6193   const int               maxForceNumber,
  6196   const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  6197   copyForcesToDeviceKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
  6198     numAtoms, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
  6199     d_f_slow_x, d_f_slow_y, d_f_slow_z,
  6200     d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
  6201     d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z,
  6207 #endif // NODEGROUP_FORCE_REGISTER