2 ***  Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
     3 ***  The Board of Trustees of the University of Illinois.
     4 ***  All rights reserved.
    13 #include <hip/hip_runtime.h>
    19 #include "MShakeKernel.h"
    20 #include "HipDefines.h"
    24 typedef float   BigReal;
    26 typedef double  BigReal;
    29 //__constant__ SettleParameters constSP;
    31 // Haochuan: this function is not used so I disable it
    33 __global__ void rattlePair(
    35   const double * __restrict vel_x,
    36   const double * __restrict vel_y,
    37   const double * __restrict vel_z,
    38   const double * __restrict pos_x,
    39   const double * __restrict pos_y,
    40   const double * __restrict pos_z,
    41   double * __restrict velNew_x, 
    42   double * __restrict velNew_y,
    43   double * __restrict velNew_z,
    44   double * __restrict posNew_x,
    45   double * __restrict posNew_y,
    46   double * __restrict posNew_z,
    47   const int   * __restrict hydrogenGroupSize,
    48   const float  * __restrict rigidBondLength,
    49   const int   * __restrict atomFixed,
    50   CudaRattleElem    * __restrict rattlePairList,
    53     int tid = threadIdx.x + blockIdx.x*blockDim.x;
    54     int stride = blockDim.x * gridDim.x;
    55     for(int i = tid; i < nRattlePairs; i+= stride){
    57         int ig = rattlePairList[i].ig;
    58         CudaRattleParam param = rattlePairList[i].params[0];
    59         int a = ig + param.ia;
    60         int b = ig + param.ib;
    61         BigReal pabx = posNew_x[a] - posNew_x[b];
    62         BigReal paby = posNew_y[a] - posNew_y[b];
    63         BigReal pabz = posNew_z[a] - posNew_z[b];
    64         BigReal pabsq = pabx*pabx + paby*paby + pabz*pabz;
    65         BigReal rabsq = param.dsq;
    66         BigReal diffsq = rabsq - pabsq;
    67         BigReal rabx = pos_x[a] - pos_x[b];
    68         BigReal raby = pos_y[a] - pos_y[b];
    69         BigReal rabz = pos_z[a] - pos_z[b];
    71         BigReal refsq = rabx*rabx + raby*raby + rabz*rabz;
    72         BigReal rpab = rabx*pabx + raby*paby + rabz*pabz;
    74         BigReal rma = param.rma;
    75         BigReal rmb = param.rmb;
    78         BigReal sqrtarg = rpab*rpab + refsq*diffsq;
    84           gab = (-rpab + sqrt(sqrtarg))/(refsq*(rma + rmb));
    86         BigReal dpx = rabx * gab;
    87         BigReal dpy = raby * gab;
    88         BigReal dpz = rabz * gab;
    89         posNew_x[a] += rma * dpx;
    90         posNew_y[a] += rma * dpy;
    91         posNew_z[a] += rma * dpz;
    92         posNew_x[b] -= rmb * dpx;
    93         posNew_y[b] -= rmb * dpy;
    94         posNew_z[b] -= rmb * dpz;
    99 // Haochuan: this function does not support SWM4 and TIP4,
   100 // and is not used anywhere, so I disable it.
   102 __global__ void settleInit(int waterIndex, const float *mass,
   103     const int *hydrogenGroupSize, const float *rigidBondLength, 
   104     SettleParameters *sp){
   106     //This initializes the water parameters
   108    //Water index points to oxygen, oatm poins to a hydrogen in the group
   109    float pmO, pmH, hhdist, ohdist;
   110    int oatm = waterIndex+1;
   111    oatm += 1*(mass[waterIndex] < 0.5 || mass[waterIndex+1] < 0.5);
   113    //Assigns values to everyone
   114    pmO = mass[waterIndex];
   116    hhdist = rigidBondLength[waterIndex];
   117    ohdist = rigidBondLength[oatm];
   118    float rmT = 1.f / (pmO+pmH+pmH);
   119    sp->mOrmT = pmO * rmT;
   120    sp->mHrmT = pmH * rmT;
   121    BigReal t1 = 0.5f*pmO/pmH;
   122    sp->rc = 0.5f*hhdist;
   123    sp->ra = sqrt(ohdist*ohdist-sp->rc*sp->rc)/(1.0+t1);
   125    sp->rra = 1.f / sp->ra;
   129 __forceinline__ __device__ void settle1(
   130     const double ref[3][3],
   132     const SettleParameters *sp) {
   134     // swiped from Settle.C
   155     double mOrmT = sp->mOrmT;
   156     double mHrmT = sp->mHrmT;
   157     double rra = sp->rra;
   182     // vectors in the plane of the original positions
   183     BigReal b0x = ref1x - ref0x;
   184     BigReal b0y = ref1y - ref0y;
   185     BigReal b0z = ref1z - ref0z;
   187     BigReal c0x = ref2x - ref0x;
   188     BigReal c0y = ref2y - ref0y;
   189     BigReal c0z = ref2z - ref0z;
   191     // new center of mass
   192     BigReal d0x = pos0x*mOrmT + ((pos1x + pos2x)*mHrmT);
   193     BigReal d0y = pos0y*mOrmT + ((pos1y + pos2y)*mHrmT);
   194     BigReal d0z = pos0z*mOrmT + ((pos1z + pos2z)*mHrmT);
   196     BigReal a1x = pos0x - d0x;
   197     BigReal a1y = pos0y - d0y;
   198     BigReal a1z = pos0z - d0z;
   200     BigReal b1x = pos1x - d0x;
   201     BigReal b1y = pos1y - d0y;
   202     BigReal b1z = pos1z - d0z;
   204     BigReal c1x = pos2x - d0x;
   205     BigReal c1y = pos2y - d0y;
   206     BigReal c1z = pos2z - d0z;
   208     // Vectors describing transformation from original coordinate system to
   209     // the 'primed' coordinate system as in the diagram.
   211     BigReal n0x = b0y*c0z-c0y*b0z;
   212     BigReal n0y = c0x*b0z-b0x*c0z;
   213     BigReal n0z = b0x*c0y-c0x*b0y;
   216     BigReal n1x = a1y*n0z-n0y*a1z;
   217     BigReal n1y = n0x*a1z-a1x*n0z;
   218     BigReal n1z = a1x*n0y-n0x*a1y;
   220     BigReal n2x = n0y*n1z-n1y*n0z;
   221     BigReal n2y = n1x*n0z-n0x*n1z;
   222     BigReal n2z = n0x*n1y-n1x*n0y;
   224     //Change by rsqrtf later
   225     BigReal n0inv = 1.0/sqrt(n0x*n0x + n0y*n0y + n0z*n0z);
   230     BigReal n1inv = 1.0/sqrt(n1x*n1x + n1y*n1y + n1z*n1z);
   235     BigReal n2inv = 1.0/sqrt(n2x*n2x + n2y*n2y + n2z*n2z);
   240     //b0 = Vector(n1*b0, n2*b0, n0*b0); // note: b0.z is never referenced again
   241     BigReal n1b0 = n1x*b0x + n1y*b0y + n1z*b0z;
   242     BigReal n2b0 = n2x*b0x + n2y*b0y + n2z*b0z;
   244     //c0 = Vector(n1*c0, n2*c0, n0*c0); // note: c0.z is never referenced again
   245     BigReal n1c0 = n1x*c0x + n1y*c0y + n1z*c0z;
   246     BigReal n2c0 = n2x*c0x + n2y*c0y + n2z*c0z;
   248     BigReal A1Z = n0x*a1x + n0y*a1y + n0z*a1z;
   250     //b1 = Vector(n1*b1, n2*b1, n0*b1);
   251     BigReal n1b1 = n1x*b1x + n1y*b1y + n1z*b1z;
   252     BigReal n2b1 = n2x*b1x + n2y*b1y + n2z*b1z;
   253     BigReal n0b1 = n0x*b1x + n0y*b1y + n0z*b1z;
   255     //c1 = Vector(n1*c1, n2*c1, n0*c1);
   256     BigReal n1c1 = n1x*c1x + n1y*c1y + n1z*c1z;
   257     BigReal n2c1 = n2x*c1x + n2y*c1y + n2z*c1z;
   258     BigReal n0c1 = n0x*c1x + n0y*c1y + n0z*c1z;
   260     // now we can compute positions of canonical water
   261     BigReal sinphi = A1Z * rra;
   262     BigReal tmp = 1.0-sinphi*sinphi;
   263     BigReal cosphi = sqrt(tmp);
   264     BigReal sinpsi = (n0b1 - n0c1)/(2.0*rc*cosphi);
   265     tmp = 1.0-sinpsi*sinpsi;
   266     BigReal cospsi = sqrt(tmp);
   268     BigReal rbphi = -rb*cosphi;
   269     BigReal tmp1 = rc*sinpsi*sinphi;
   270     //BigReal tmp2 = rc*sinpsi*cosphi;
   272     //Vector a2(0, ra*cosphi, ra*sinphi);
   273     BigReal a2y = ra*cosphi;
   275     //Vector b2(-rc*cospsi, rbphi - tmp1, -rb*sinphi + tmp2);
   276     BigReal b2x = -rc*cospsi;
   277     BigReal b2y = rbphi - tmp1;
   279     //Vector c2( rc*cosphi, rbphi + tmp1, -rb*sinphi - tmp2);
   280     BigReal c2y = rbphi + tmp1;
   282     // there are no a0 terms because we've already subtracted the term off
   283     // when we first defined b0 and c0.
   284     BigReal alpha = b2x*(n1b0 - n1c0) + n2b0*b2y + n2c0*c2y;
   285     BigReal beta  = b2x*(n2c0 - n2b0) + n1b0*b2y + n1c0*c2y;
   286     BigReal gama  = n1b0*n2b1 - n1b1*n2b0 + n1c0*n2c1 - n1c1*n2c0;
   288     BigReal a2b2 = alpha*alpha + beta*beta;
   289     BigReal sintheta = (alpha*gama - beta*sqrt(a2b2 - gama*gama))/a2b2;
   290     BigReal costheta = sqrt(1.0 - sintheta*sintheta);
   292     //Vector a3( -a2y*sintheta,
   295     BigReal a3x = -a2y*sintheta;
   296     BigReal a3y = a2y*costheta;
   299     // Vector b3(b2x*costheta - b2y*sintheta,
   300     //             b2x*sintheta + b2y*costheta,
   302     BigReal b3x = b2x*costheta - b2y*sintheta;
   303     BigReal b3y = b2x*sintheta + b2y*costheta;
   306     // Vector c3(-b2x*costheta - c2y*sintheta,
   307     //           -b2x*sintheta + c2y*costheta,
   309     BigReal c3x = -b2x*costheta - c2y*sintheta;
   310     BigReal c3y = -b2x*sintheta + c2y*costheta;
   313     // undo the transformation; generate new normal vectors from the transpose.
   314     // Vector m1(n1.x, n2.x, n0.x);
   319     // Vector m2(n1.y, n2.y, n0.y);
   324     // Vector m0(n1.z, n2.z, n0.z);
   329     //pos[i*3+0] = Vector(a3*m1, a3*m2, a3*m0) + d0;
   330     pos0x = a3x*m1x + a3y*m1y + a3z*m1z + d0x;
   331     pos0y = a3x*m2x + a3y*m2y + a3z*m2z + d0y;
   332     pos0z = a3x*m0x + a3y*m0y + a3z*m0z + d0z;
   334     // pos[i*3+1] = Vector(b3*m1, b3*m2, b3*m0) + d0;
   335     pos1x = b3x*m1x + b3y*m1y + b3z*m1z + d0x;
   336     pos1y = b3x*m2x + b3y*m2y + b3z*m2z + d0y;
   337     pos1z = b3x*m0x + b3y*m0y + b3z*m0z + d0z;
   339     // pos[i*3+2] = Vector(c3*m1, c3*m2, c3*m0) + d0;
   340     pos2x = c3x*m1x + c3y*m1y + c3z*m1z + d0x;
   341     pos2y = c3x*m2x + c3y*m2y + c3z*m2z + d0y;
   342     pos2z = c3x*m0x + c3y*m0y + c3z*m0z + d0z;
   355 // swipe from HomePatch::tip4_omrepos
   356 /* Reposition the om particle of a tip4p water
   357  * A little geometry shows that the appropriate position is given by
   358  * R_O + (1 / 2 r_ohc) * ( 0.5 (R_H1 + R_H2) - R_O )
   359  * Here r_om is the distance from the oxygen to Om site, and r_ohc
   360  * is the altitude from the oxygen to the hydrogen center of mass
   361  * Those quantities are precalculated upon initialization of HomePatch
   363  * Ordering of TIP4P atoms: O, H1, H2, LP.
   365 __device__ void tip4_Om_reposition(
   367   const double r_om, const double r_ohc) {
   368   const double factor = r_om / r_ohc;
   369   pos[3][0] = pos[0][0] + (0.5 * (pos[1][0] + pos[2][0]) - pos[0][0]) * factor;
   370   pos[3][1] = pos[0][1] + (0.5 * (pos[1][1] + pos[2][1]) - pos[0][1]) * factor;
   371   pos[3][2] = pos[0][2] + (0.5 * (pos[1][2] + pos[2][2]) - pos[0][2]) * factor;
   374 // swipe from HomePatch::swm4_omrepos
   375 /* Reposition lonepair (Om) particle of Drude SWM4 water.
   376  * Same comments apply as to tip4_omrepos(), but the ordering of atoms
   377  * is different: O, D, LP, H1, H2.
   379 __device__ void swm4_Om_reposition(
   381   const double r_om, const double r_ohc) {
   382   const double factor = r_om / r_ohc;
   383   pos[2][0] = pos[0][0] + (0.5 * (pos[3][0] + pos[4][0]) - pos[0][0]) * factor;
   384   pos[2][1] = pos[0][1] + (0.5 * (pos[3][1] + pos[4][1]) - pos[0][1]) * factor;
   385   pos[2][2] = pos[0][2] + (0.5 * (pos[3][2] + pos[4][2]) - pos[0][2]) * factor;
   389 void MSHAKEIterate(const int icnt, const CudaRattleElem* rattleList,
   390   const BigReal *refx, const BigReal *refy, const BigReal *refz,
   391   BigReal *posx, BigReal *posy, BigReal *posz,
   392   const BigReal tol2, const int maxiter,
   393   bool& done, bool& consFailure);
   395 __device__ inline BigReal det_3by3(BigReal A[4][4])
   398     A[0][0]*(A[1][1]*A[2][2]-A[1][2]*A[2][1])-
   399     A[0][1]*(A[1][0]*A[2][2]-A[1][2]*A[2][0])+
   400     A[0][2]*(A[1][0]*A[2][1]-A[1][1]*A[2][0]);
   403 __device__ void swap_row(BigReal A[4][4], BigReal b[4], int r1, int r2)
   406     for (int i = 0; i < 4; i++)
   408         BigReal* p1 = &A[r1][i];
   409         BigReal* p2 = &A[r2][i];
   422 __device__ void solve_4by4(BigReal lambda [4], BigReal A[4][4], BigReal sigma[4])
   426     for (int k = 0; k < 4; ++k)
   430         BigReal Max = A[k][k];
   432         for (int row = k + 1; row < 4; ++row)
   434             if ((tmp = fabs(A[row][k])) > Max)
   441             swap_row(A, sigma, k, piv_row);
   443         for (int row = k + 1; row < 4; ++row)
   445             tmp = A[row][k]/ A[k][k];
   446             for (int col = k+1; col < 4; col++)
   447                 A[row][col] -= tmp * A[k][col];
   449             sigma[row]-= tmp * sigma[k];
   452     for (int row = 3; row >= 0; --row)
   455         for (int j = 3; j > row; --j)
   456             tmp -= lambda[j] * A[row][j];
   457         lambda[row] = tmp / A[row][row];
   462 __device__ void solveMatrix(BigReal lambda [4], BigReal A[4][4], BigReal sigma[4], int icnt)
   468             lambda[0] = sigma[0]/A[0][0];
   474             BigReal det=1./(A[0][0]*A[1][1]-A[0][1]*A[1][0]);
   475             lambda[0]  = ( A[1][1]*sigma[0]-A[0][1]*sigma[1])*det;
   476             lambda[1]  = (-A[1][0]*sigma[0]+A[0][0]*sigma[1])*det;
   481             BigReal det = 1./det_3by3(A);
   482             lambda[0] = det*((A[1][1]*A[2][2]-A[1][2]*A[2][1])*sigma[0]-
   483                              (A[0][1]*A[2][2]-A[0][2]*A[2][1])*sigma[1]+
   484                              (A[0][1]*A[1][2]-A[0][2]*A[1][1])*sigma[2]);
   486             lambda[1] = det*((A[1][2]*A[2][0]-A[1][0]*A[2][2])*sigma[0]+
   487                              (A[0][0]*A[2][2]-A[0][2]*A[2][0])*sigma[1]-
   488                              (A[0][0]*A[1][2]-A[0][2]*A[1][0])*sigma[2]);
   490             lambda[2] = det*((A[1][0]*A[2][1]-A[1][1]*A[2][0])*sigma[0]-
   491                              (A[0][0]*A[2][1]-A[0][1]*A[2][0])*sigma[1]+
   492                              (A[0][0]*A[1][1]-A[0][1]*A[1][0])*sigma[2]);
   497             solve_4by4(lambda, A, sigma);
   504 // The choice of launch bounds is determined based on nvcc output of how many
   505 // registers the kernel is using.  We see optimal performance using a register
   506 // limit of 128, which corresponds to __launch_bounds__(512,1).
   508 template <bool DOENERGY, bool DOFIXED>
   511 __launch_bounds__(512,1)
   513 //TODO:HIP - tune for HIP
   514 // __launch_bounds__(512,1)
   516 void MSHAKE_CUDA_Kernel(
   517     const CudaRattleElem *rattleList, 
   520     const BigReal* __restrict__ refx_d, 
   521     const BigReal* __restrict__ refy_d, 
   522     const BigReal* __restrict__ refz_d,
   523     BigReal* __restrict__ posx_d, 
   524     BigReal* __restrict__ posy_d, 
   525     BigReal* __restrict__ posz_d,
   526     const BigReal* __restrict__ ref_velx_d, // for fixed atoms icnt == 0
   527     const BigReal* __restrict__ ref_vely_d, // for fixed atoms icnt == 0
   528     const BigReal* __restrict__ ref_velz_d, // for fixed atoms icnt == 0
   529     BigReal* __restrict__ velx_d, 
   530     BigReal* __restrict__ vely_d,
   531     BigReal* __restrict__ velz_d, 
   532     BigReal* __restrict__ f_normal_x, 
   533     BigReal* __restrict__ f_normal_y, 
   534     BigReal* __restrict__ f_normal_z,
   535     cudaTensor* __restrict virial, 
   536     const float* __restrict mass, 
   538     const BigReal tol2_d, 
   541     const int* __restrict__ atomFixed)
   543 // TODO: why is the CUDA (non-shared) path broken?
   545       __shared__ BigReal sh_posx[128][4+1];
   546       __shared__ BigReal sh_posy[128][4+1];
   547       __shared__ BigReal sh_posz[128][4+1];
   548       __shared__ BigReal sh_refx[128][4+1];
   549       __shared__ BigReal sh_refy[128][4+1];
   550       __shared__ BigReal sh_refz[128][4+1];
   552       int idx = threadIdx.x + blockIdx.x * blockDim.x;
   553       // cudaTensor lVirial;
   554       // lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
   555       // lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
   556       // lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
   559           consFailure[idx] = 0;
   560           int ig   = rattleList[idx].ig;
   561           int icnt = rattleList[idx].icnt;
   562           BigReal tol2 = tol2_d;
   563           BigReal sigma[4] = {0}; 
   564           BigReal lambda[4]= {0};
   565           BigReal A[4][4]  = {0};
   566           // BigReal df[3] = {0};
   567           BigReal pabx[4] = {0};
   568           BigReal rabx[4] = {0};
   569           BigReal paby[4] = {0};
   570           BigReal raby[4] = {0};
   571           BigReal pabz[4] = {0};
   572           BigReal rabz[4] = {0};
   574           BigReal posx[4+1] = {0};
   575           BigReal posy[4+1] = {0};
   576           BigReal posz[4+1] = {0};
   577           BigReal refx[4+1] = {0};
   578           BigReal refy[4+1] = {0};
   579           BigReal refz[4+1] = {0};
   581           BigReal* posx = &sh_posx[threadIdx.x][0];
   582           BigReal* posy = &sh_posy[threadIdx.x][0];
   583           BigReal* posz = &sh_posz[threadIdx.x][0];
   584           BigReal* refx = &sh_refx[threadIdx.x][0];
   585           BigReal* refy = &sh_refy[threadIdx.x][0];
   586           BigReal* refz = &sh_refz[threadIdx.x][0];
   587           for(int i = 0; i < 4+1; ++i)
   597           //load datas from global memory to local memory
   598           CudaRattleParam rattleParam[4]; //Does this work?
   599           for(int i = 0; i < 4+1; ++i)
   600           //for(int i = 0; i < hgs[i]; ++i)
   602               //rattleParam[i] = rattleParam_d[4*idx+i];
   603               if (i < 4) rattleParam[i] = rattleList[idx].params[i];
   604               /*posx[i] = posx_d[i+4*idx];
   605               posy[i] = posy_d[i+4*idx];
   606               posz[i] = posz_d[i+4*idx];
   607               refx[i] = refx_d[i+4*idx]; 
   608               refx[i] = refx_d[i+4*idx]; 
   609               refx[i] = refx_d[i+4*idx]; 
   610               refy[i] = refy_d[i+4*idx];
   611               refz[i] = refz_d[i+4*idx];*/
   612               //I can pass the HGS and check if I is >= HGS[i]
   614                 posx[i] = posx_d[i+ig];
   615                 posy[i] = posy_d[i+ig];
   616                 posz[i] = posz_d[i+ig];
   617                 refx[i] = refx_d[i+ig]; 
   618                 refy[i] = refy_d[i+ig];
   619                 refz[i] = refz_d[i+ig];
   624           // Haochuan: there are atoms fixed in the group if icnt == 0,
   625           //           which should be in the noconstList in the CPU code
   626           //           and simulated without constraints, but the CUDA
   627           //           code here count all of them into the rattleList,
   628           //           so I check icnt == 0 and skip the whole MSHAKE.
   629           if (!DOFIXED || (DOFIXED && icnt > 0)) {
   631               for(int i = 0; i < 4; ++i)
   633                   int a = rattleParam[i].ia;
   634                   int b = rattleParam[i].ib;
   635                   pabx[i] = posx[a] - posx[b];
   636                   paby[i] = posy[a] - posy[b];
   637                   pabz[i] = posz[a] - posz[b];
   638                   rabx[i] = refx[a] - refx[b];
   639                   raby[i] = refy[a] - refy[b];
   640                   rabz[i] = refz[a] - refz[b];
   643               for(int i = 0; i < 4; ++i)
   645                   BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
   646                   BigReal rabsq = rattleParam[i].dsq;
   647                   BigReal diffsq = pabsq - rabsq;
   649                   if ( fabs(diffsq) > (rabsq * tol2)  && i < icnt)
   652               int maxiter = maxiter_d;
   653               for(loop = 0; loop < maxiter; ++loop)
   659                       for(int j = 0; j < 4; ++j)
   661                           BigReal rma = rattleParam[j].rma;
   663                           for(int i = 0; i < 4; ++i)
   665                               A[j][i] = 2.*(pabx[j]*rabx[i]+paby[j]*raby[i]+pabz[j]*rabz[i])*rma;
   667                           BigReal rmb = rattleParam[j].rmb;
   668                           A[j][j] += 2.*(pabx[j]*rabx[j]+paby[j]*raby[j]+pabz[j]*rabz[j])*rmb;
   671                       solveMatrix(lambda, A, sigma, icnt);
   673                       for(int i = 0; i < 4; ++i)
   675                           int a = rattleParam[i].ia;
   676                           int b = rattleParam[i].ib;
   677                           BigReal rma = rattleParam[i].rma * lambda[i];
   678                           BigReal rmb = rattleParam[i].rmb * lambda[i];
   680                           posx[a] -= rma * rabx[i];
   681                           posy[a] -= rma * raby[i];
   682                           posz[a] -= rma * rabz[i];
   683                           posx[b] += rmb * rabx[i];
   684                           posy[b] += rmb * raby[i];
   685                           posz[b] += rmb * rabz[i];
   692                   for(int i = 0; i < 4; ++i)
   694                       int a = rattleParam[i].ia;
   695                       int b = rattleParam[i].ib;
   696                       pabx[i] = posx[a] - posx[b];
   697                       paby[i] = posy[a] - posy[b];
   698                       pabz[i] = posz[a] - posz[b];
   699                       BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
   700                       BigReal rabsq = rattleParam[i].dsq;
   701                       BigReal diffsq = pabsq - rabsq;
   703                       if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
   708                   consFailure[idx] = 1;
   712                   for(int i = 0; i < hgs[ig]; ++i)
   714                     posx_d[i+ig] = posx[i];
   715                     posy_d[i+ig] = posy[i];
   716                     posz_d[i+ig] = posz[i];
   719                     // double vx  = velx_d[i + ig];
   720                     // double vy  = vely_d[i + ig];
   721                     // double vz  = velz_d[i + ig];
   722                     double vnx = (posx[i] - refx[i]) * invdt;
   723                     double vny = (posy[i] - refy[i]) * invdt;
   724                     double vnz = (posz[i] - refz[i]) * invdt;
   725                     // velx_d[i+ig] = (posx[i] - refx[i]) * invdt;
   726                     // vely_d[i+ig] = (posy[i] - refy[i]) * invdt;
   727                     // velz_d[i+ig] = (posz[i] - refz[i]) * invdt;
   729                     velx_d[i + ig] = vnx;
   730                     vely_d[i + ig] = vny;
   731                     velz_d[i + ig] = vnz;
   734                   consFailure[idx] = 0;
   735                   //printf("Loop is %d %d\n", loop, consFailure[idx]);
   738               // Fixed atoms without constraints
   739             for(int i = 0; i < hgs[ig]; ++i) {
   740               // Copy from the old/reference positions.
   741               posx_d[i+ig] = refx[i];
   742               posy_d[i+ig] = refy[i];
   743               posz_d[i+ig] = refz[i];
   744               // velx_d[i+ig] = ref_velx_d[i+ig];
   745               // vely_d[i+ig] = ref_vely_d[i+ig];
   746               // velz_d[i+ig] = ref_velz_d[i+ig];
   748           } // !DOFIXED || (DOFIXED && icnt > 0)
   752 // XXX TODO: Fix this, it's horrible
   754 void CheckConstraints(int* consFailure, int* consFailure_h, int size)
   756     __shared__ int result[128];
   757     int idx = threadIdx.x;
   759     for(int i = idx; i < size; i += 128)
   760         result[idx] += consFailure[i];
   763         result[idx] += result[idx+64];
   766         result[idx] += result[idx+32];
   769         result[idx] += result[idx+16];
   772         result[idx] += result[idx+8];
   775         result[idx] += result[idx+4];
   778         result[idx] += result[idx+2];
   781         result[idx] += result[idx+1];
   785         consFailure_h[0] += result[0];
   786         //printf("Constraints %d\n", onsFailure[0]);
   793     const CudaRattleElem* rattleList, 
   795     const int *hydrogenGroupSize, 
   796     const int *atomFixed,
   803     const double *ref_velx,
   804     const double *ref_vely,
   805     const double *ref_velz,
   812     cudaTensor* rigidVirial, 
   822          //fprintf(stderr, "No rattles, returning\n");
   826     int gridDim = int(size/128)+(size%128==0 ? 0 : 1);
   828 #define CALL(DOENERGY, DOFIXED) \
   829   MSHAKE_CUDA_Kernel<DOENERGY, DOFIXED><<<gridDim, blockDim, 0, stream>>>( \
   830     rattleList, size, hydrogenGroupSize, \
   831     refx, refy, refz, posx, posy, posz, \
   832     ref_velx, ref_vely, ref_velz, \
   834     f_normal_x, f_normal_y, f_normal_z, rigidVirial, \
   835     mass, invdt, tol2, maxiter, consFailure_d, atomFixed)
   836   if      ( doEnergy &&  doFixed) CALL(true, true);
   837   else if ( doEnergy && !doFixed) CALL(true, false);
   838   else if (!doEnergy &&  doFixed) CALL(false, true);
   839   else if (!doEnergy && !doFixed) CALL(false, false);
   840   else NAMD_bug("No kernel function is called in MSHAKE_CUDA.\n");
   842     // cudaCheck(cudaGetLastError());
   843     // JM NOTE: We can check the consFailure flag for every migration step
   844     // CheckConstraints<<<1,128, 0, stream>>>(consFailure_d, consFailure, size);
   845     // I can't do this here
   846     // why do I need this variable?????
   847     //done = !consFailure;
   851 // Perform a warp-wide read on 3-element clusters from global memory
   852 // into shared memory.
   853 // a - global memory buffer
   854 // buf - shared memory buffer, dimension BLOCK*3
   855 // list - index offset to beginning of each 3-element cluster
   856 // n - total number of clusters
   858 // We apply this to improve reading of water (positions and velocities).
   859 // Data in global memory is stored in SOA form: pos_x, pos_y, pos_z, etc,
   860 // so we call this routine six times to read positions and velocities.
   861 // Reading is coalesced whenever water molecules are all consecutive.
   862 // Although waters are sorted to the end of each patch, there will
   863 // generally be gaps between patches due to solute.
   865 template <int VECTOR_SIZE>
   866 __device__ __forceinline__ void ld_vecN(const double *a, double buf[][VECTOR_SIZE], int *list, int n)
   868   int laneid = threadIdx.x % warpSize;
   869   int warpid = threadIdx.x / warpSize;  
   873   //TODO:HIP verify for HIP
   877   for (int j = 0; j < VECTOR_SIZE; j++) {
   878     int offset = laneid + j*warpSize;
   879     int ii = offset % VECTOR_SIZE;
   880     int jj = warpid*warpSize + offset/VECTOR_SIZE;
   881     if (blockIdx.x*blockDim.x + jj < n) {
   882       int idx = list[jj] + ii;
   883       buf[jj][ii] = a[idx];
   889   //TODO:HIP verify for HIP
   895 // Overload for float
   896 template <int VECTOR_SIZE>
   897 __device__ __forceinline__ void ld_vecN(const float *a, float buf[][VECTOR_SIZE], int *list, int n)
   899   int laneid = threadIdx.x % warpSize;
   900   int warpid = threadIdx.x / warpSize;
   904   //TODO:HIP verify for HIP
   908   for (int j = 0; j < VECTOR_SIZE; j++) {
   909     int offset = laneid + j*warpSize;
   910     int ii = offset % VECTOR_SIZE;
   911     int jj = warpid*warpSize + offset/VECTOR_SIZE;
   912     if (blockIdx.x*blockDim.x + jj < n) {
   913       int idx = list[jj] + ii;
   914       buf[jj][ii] = a[idx];
   920   //TODO:HIP verify for HIP
   927 // Same as previous, only this time we write to global memory from
   930 template <int VECTOR_SIZE>
   931 __device__ __forceinline__ void st_vecN(double *a, double buf[][VECTOR_SIZE], int *list, int n)
   933   int laneid = threadIdx.x % warpSize;
   934   int warpid = threadIdx.x / warpSize;  
   938   //TODO:HIP verify for HIP
   941   for (int j = 0; j < VECTOR_SIZE; j++) {
   942     int offset = laneid + j*warpSize;
   943     int ii = offset % VECTOR_SIZE;
   944     int jj = warpid*warpSize + offset/VECTOR_SIZE;
   945     if (blockIdx.x*blockDim.x + jj < n) {
   946       int idx = list[jj] + ii;
   947       a[idx] = buf[jj][ii];
   953   //TODO:HIP verify for HIP
   959 // Same as previous, but accumulates the value instead of copying
   961 template <int VECTOR_SIZE>
   962 __device__ __forceinline__ void acc_vecN(double* __restrict__ a, double buf[][VECTOR_SIZE], int *list, int n)
   964   int laneid = threadIdx.x % warpSize;
   965   int warpid = threadIdx.x / warpSize;  
   969   //TODO:HIP verify for HIP
   972   for (int j = 0; j < VECTOR_SIZE; j++) {
   973     int offset = laneid + j*warpSize;
   974     int ii = offset % VECTOR_SIZE;
   975     int jj = warpid*warpSize + offset/VECTOR_SIZE;
   976     if (blockIdx.x*blockDim.x + jj < n) {
   977       int idx = list[jj] + ii;
   978       a[idx] += buf[jj][ii];
   984   //TODO:HIP verify for HIP
   989 // Haochuan: fixed atoms in water are already excluded from the
   990 // settleList, so we don't need to do anything related to
   993 // The choice of launch bounds is determined based on nvcc output of how many
   994 // registers the kernel is using.  We see optimal performance using a register
   995 // limit of 128, which corresponds to __launch_bounds__(512,1).
   997 template <bool DOENERGY, int BLOCK,
   998           int WATER_GROUP_SIZE,
   999           WaterModel WATER_MODEL>
  1002 __launch_bounds__(512,1)
  1005 // __launch_bounds__(512,1)
  1012   const double * __restrict vel_x,
  1013   const double * __restrict vel_y,
  1014   const double * __restrict vel_z,
  1015   const double * __restrict pos_x,
  1016   const double * __restrict pos_y,
  1017   const double * __restrict pos_z,
  1018   double * __restrict velNew_x,
  1019   double * __restrict velNew_y,
  1020   double * __restrict velNew_z,
  1021   double * __restrict posNew_x,
  1022   double * __restrict posNew_y,
  1023   double * __restrict posNew_z,
  1024   double * __restrict f_normal_x, 
  1025   double * __restrict f_normal_y,
  1026   double * __restrict f_normal_z,
  1027   cudaTensor* __restrict virial, 
  1028   const float* __restrict mass,   
  1029   const int   * __restrict hydrogenGroupSize,
  1030   const float  * __restrict rigidBondLength,
  1031   const int   * __restrict atomFixed,
  1032   const int    * __restrict settleList,
  1033   const SettleParameters * __restrict sp)
  1035   int tid = threadIdx.x + blockIdx.x*blockDim.x;
  1036   double ref[WATER_GROUP_SIZE][3] = {0};
  1037   double pos[WATER_GROUP_SIZE][3] = {0};
  1038   double vel[WATER_GROUP_SIZE][3] = {0};
  1040   __shared__ int list[BLOCK];
  1041   __shared__ double buf[BLOCK][WATER_GROUP_SIZE];
  1042   if (tid < nSettles) {
  1043     list[threadIdx.x] = settleList[tid];
  1046   // Do a warp-wide read on pos* and vel* from global memory to shmem
  1047   //BEGIN OF FIRST PART OF RATTLE1: Settle_SIMD invocation
  1048   ld_vecN<WATER_GROUP_SIZE>(pos_x, buf, list, nSettles);
  1050   for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][0] = buf[threadIdx.x][i];
  1051   ld_vecN<WATER_GROUP_SIZE>(pos_y, buf, list, nSettles);
  1053   for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][1] = buf[threadIdx.x][i];
  1054   ld_vecN<WATER_GROUP_SIZE>(pos_z, buf, list, nSettles);
  1056   for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][2] = buf[threadIdx.x][i];
  1057   ld_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
  1059   for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][0] = buf[threadIdx.x][i];
  1060   ld_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
  1062   for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][1] = buf[threadIdx.x][i];
  1063   ld_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
  1065   for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][2] = buf[threadIdx.x][i];
  1068   for (int i = 0; i < WATER_GROUP_SIZE; ++i) {
  1069     pos[i][0] = ref[i][0] + vel[i][0]*dt;
  1070     pos[i][1] = ref[i][1] + vel[i][1]*dt;
  1071     pos[i][2] = ref[i][2] + vel[i][2]*dt;
  1074   switch (WATER_MODEL) {
  1075     case WaterModel::SWM4: {
  1076       // SWM4 ordering:  O D LP H1 H2
  1077       // do swap(O,LP) and call settle with subarray O H1 H2
  1081       for (int i = 0; i < 3; ++i) {
  1082         lp_ref[i] = ref[2][i];
  1083         lp_pos[i] = pos[2][i];
  1086       for (int i = 0; i < 3; ++i) {
  1087         ref[2][i] = ref[0][i];
  1088         pos[2][i] = pos[0][i];
  1090       settle1(ref+2, pos+2, sp);
  1091       // swap back after we return
  1093       for (int i = 0; i < 3; ++i) {
  1094         ref[0][i] = ref[2][i];
  1095         pos[0][i] = pos[2][i];
  1098       for (int i = 0; i < 3; ++i) {
  1099         ref[2][i] = lp_ref[i];
  1100         pos[2][i] = lp_pos[i];
  1102       swm4_Om_reposition(pos, sp->r_om, sp->r_ohc);
  1105     case WaterModel::TIP4: {
  1106       settle1(ref, pos, sp);
  1107       tip4_Om_reposition(pos, sp->r_om, sp->r_ohc);
  1111       settle1(ref, pos, sp);
  1115   // Update position and velocities with stuff calculated by settle1
  1117   // Do a warp-wide write on pos* and vel* to global memory from shmem
  1119   for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][0] - ref[i][0])*invdt;
  1120   st_vecN<WATER_GROUP_SIZE>(velNew_x, buf, list, nSettles);
  1122   for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][1] - ref[i][1])*invdt;
  1123   st_vecN<WATER_GROUP_SIZE>(velNew_y, buf, list, nSettles);
  1125   for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][2] - ref[i][2])*invdt;
  1126   st_vecN<WATER_GROUP_SIZE>(velNew_z, buf, list, nSettles);
  1128   for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][0];
  1129   st_vecN<WATER_GROUP_SIZE>(posNew_x, buf, list, nSettles);
  1131   for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][1];
  1132   st_vecN<WATER_GROUP_SIZE>(posNew_y, buf, list, nSettles);
  1134   for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][2];
  1135   st_vecN<WATER_GROUP_SIZE>(posNew_z, buf, list, nSettles);
  1137     // force+velocity update
  1138   if (tid < nSettles) {
  1139     double vnx, vny, vnz;
  1141     for(int i = 0 ; i < 3; i++){
  1142         vnx = (pos[i][0] - ref[i][0]) * invdt;
  1143         vny = (pos[i][1] - ref[i][1]) * invdt;
  1144         vnz = (pos[i][2] - ref[i][2]) * invdt;
  1146         df[0] = (vnx - vel[i][0]) * mass[i] * invdt;
  1147         df[1] = (vny - vel[i][1]) * mass[i] * invdt;
  1148         df[2] = (vnz - vel[i][2]) * mass[i] * invdt;
  1150         velNew_x[ig+i] = vnx;
  1151         velNew_y[ig+i] = vny;
  1152         velNew_z[ig+i] = vnx;
  1153         posNew_x[ig+i] = pos[i][0];
  1154         posNew_y[ig+i] = pos[i][1];
  1155         posNew_z[ig+i] = pos[i][2];
  1156         f_normal_x[ig+i] += df[0];
  1157         f_normal_y[ig+i] += df[1];
  1158         f_normal_z[ig+i] += df[2];
  1161             lVirial.xx +=  df[0] * pos[i][0];
  1162             // lVirial.xy +=  df[0] * pos[i][1];
  1163             // lVirial.xz +=  df[0] * pos[i][2];
  1164             lVirial.yx +=  df[1] * pos[i][0];
  1165             lVirial.yy +=  df[1] * pos[i][1];
  1166             // lVirial.yz +=  df[1] * pos[i][2];
  1167             lVirial.zx +=  df[2] * pos[i][0];
  1168             lVirial.zy +=  df[2] * pos[i][1];
  1169             lVirial.zz +=  df[2] * pos[i][2];
  1174         typedef cub::BlockReduce<BigReal, 128> BlockReduce;
  1175         __shared__ typename BlockReduce::TempStorage temp_storage;
  1176         lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
  1177         // lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
  1178         // lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
  1179         lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
  1180         lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy); __syncthreads();
  1181         // lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
  1182         lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
  1183         lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
  1184         lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
  1186         // Every block has a locally reduced blockVirial
  1187         // Now every thread does an atomicAdd to get a global virial
  1188         if(threadIdx.x == 0){
  1189           atomicAdd(&(virial->xx), lVirial.xx);
  1190           atomicAdd(&(virial->xy), lVirial.yx);
  1191           atomicAdd(&(virial->xz), lVirial.zx);
  1192           atomicAdd(&(virial->yx), lVirial.yx);
  1193           atomicAdd(&(virial->yy), lVirial.yy);
  1194           atomicAdd(&(virial->yz), lVirial.zy);
  1195           atomicAdd(&(virial->zx), lVirial.zx);
  1196           atomicAdd(&(virial->zy), lVirial.zy);
  1197           atomicAdd(&(virial->zz), lVirial.zz);
  1205     const bool doEnergy, 
  1210     const double *  vel_x, 
  1211     const double *  vel_y,  
  1212     const double *  vel_z, 
  1213     const double *  pos_x, 
  1214     const double *  pos_y,  
  1215     const double *  pos_z, 
  1222     double *  f_normal_x, 
  1223     double *  f_normal_y, 
  1224     double *  f_normal_z, 
  1227     const int   *  hydrogenGroupSize, 
  1228     const float *  rigidBondLength,
  1229     const int   *  atomFixed, 
  1231     const SettleParameters * sp,
  1232     const WaterModel water_model,
  1233     cudaStream_t stream)
  1237     const int blocks = 128;
  1239     const int blocks = 128;
  1241     int grid = (nSettles + blocks - 1) / blocks;
  1242 #define CALL(DOENERGY, WATER_MODEL) \
  1243   SettleKernel<DOENERGY, blocks, getWaterModelGroupSize(WATER_MODEL), WATER_MODEL> \
  1244   <<< grid, blocks, 0, stream >>> \
  1245   (numAtoms, dt, invdt, nSettles, \
  1246     vel_x, vel_y, vel_z, \
  1247     pos_x, pos_y, pos_z, \
  1248     velNew_x, velNew_y, velNew_z, \
  1249     posNew_x, posNew_y, posNew_z, \
  1250     f_normal_x, f_normal_y, f_normal_z, virial, mass, \
  1251     hydrogenGroupSize, rigidBondLength, atomFixed, \
  1254   switch (water_model) {
  1255     case WaterModel::SWM4: {
  1256       if ( doEnergy) CALL(true, WaterModel::SWM4);
  1257       if (!doEnergy) CALL(false, WaterModel::SWM4);
  1260     case WaterModel::TIP4: {
  1261       if ( doEnergy) CALL(true, WaterModel::TIP4);
  1262       if (!doEnergy) CALL(false, WaterModel::TIP4);
  1266       if ( doEnergy) CALL(true, WaterModel::TIP3);
  1267       if (!doEnergy) CALL(false, WaterModel::TIP3);
  1271   cudaCheck(cudaStreamSynchronize(stream));
  1274 // Fused kernel for rigid bonds constraints
  1275 // Each block will do a single operation -> either SETTLE or MSHAKE.
  1276 template <int WATER_GROUP_SIZE,
  1277           WaterModel WATER_MODEL,
  1280 __launch_bounds__(512,1)
  1283 // __launch_bounds__(512,1)
  1285 __global__ void Rattle1Kernel(
  1287   // Settle Parameters
  1291   double * __restrict                   vel_x,
  1292   double * __restrict                   vel_y,
  1293   double * __restrict                   vel_z,
  1294   const double * __restrict             pos_x,
  1295   const double * __restrict             pos_y,
  1296   const double * __restrict             pos_z,
  1297   double * __restrict                   f_normal_x,
  1298   double * __restrict                   f_normal_y,
  1299   double * __restrict                   f_normal_z,
  1300   const float * __restrict              mass,
  1301   const int   * __restrict              hydrogenGroupSize,
  1302   const float * __restrict              rigidBondLength,
  1303   const int   * __restrict              atomFixed,
  1304   int * __restrict                      settleList,
  1305   const SettleParameters *              sp,
  1306   const CudaRattleElem * __restrict     rattleList,
  1308   const BigReal                         tol2_d,
  1309   const int                             maxiter_d,
  1311   const int                             nSettleBlocks
  1313   // Great, first step is fetching value
  1314   int tid = threadIdx.x + blockIdx.x * blockDim.x;
  1315   bool isSettle = blockIdx.x < nSettleBlocks;
  1316   __shared__ int list[128];
  1317   __shared__ double buf[128][WATER_GROUP_SIZE];
  1319   __shared__ BigReal sh_posx[128][4+1];
  1320   __shared__ BigReal sh_posy[128][4+1];
  1321   __shared__ BigReal sh_posz[128][4+1];
  1322   __shared__ BigReal sh_refx[128][4+1];
  1323   __shared__ BigReal sh_refy[128][4+1];
  1324   __shared__ BigReal sh_refz[128][4+1];
  1329     list[threadIdx.x] = settleList[tid];
  1331     double ref[WATER_GROUP_SIZE][3] = {0};
  1332     double pos[WATER_GROUP_SIZE][3] = {0};
  1333     double vel[WATER_GROUP_SIZE][3] = {0};
  1334     float tmp_mass[WATER_GROUP_SIZE];
  1336     ld_vecN<WATER_GROUP_SIZE>(pos_x, buf, list, nSettles);
  1338     for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][0] = buf[threadIdx.x][i];
  1339     ld_vecN<WATER_GROUP_SIZE>(pos_y, buf, list, nSettles);
  1341     for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][1] = buf[threadIdx.x][i];
  1342     ld_vecN<WATER_GROUP_SIZE>(pos_z, buf, list, nSettles);
  1344     for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][2] = buf[threadIdx.x][i];
  1345     ld_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
  1347     for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][0] = buf[threadIdx.x][i];
  1348     ld_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
  1350     for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][1] = buf[threadIdx.x][i];
  1351     ld_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
  1353     for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][2] = buf[threadIdx.x][i];
  1355     for (int i = 0; i < WATER_GROUP_SIZE; ++i) {
  1356       pos[i][0] = ref[i][0] + vel[i][0]*dt;
  1357       pos[i][1] = ref[i][1] + vel[i][1]*dt;
  1358       pos[i][2] = ref[i][2] + vel[i][2]*dt;
  1360     switch (WATER_MODEL) {
  1361       case WaterModel::SWM4: {
  1362         // SWM4 ordering:  O D LP H1 H2
  1363         // do swap(O,LP) and call settle with subarray O H1 H2
  1367         for (int i = 0; i < 3; ++i) {
  1368           lp_ref[i] = ref[2][i];
  1369           lp_pos[i] = pos[2][i];
  1372         for (int i = 0; i < 3; ++i) {
  1373           ref[2][i] = ref[0][i];
  1374           pos[2][i] = pos[0][i];
  1376         settle1(ref+2, pos+2, sp);
  1377         // swap back after we return
  1379         for (int i = 0; i < 3; ++i) {
  1380           ref[0][i] = ref[2][i];
  1381           pos[0][i] = pos[2][i];
  1384         for (int i = 0; i < 3; ++i) {
  1385           ref[2][i] = lp_ref[i];
  1386           pos[2][i] = lp_pos[i];
  1388         swm4_Om_reposition(pos, sp->r_om, sp->r_ohc);
  1391       case WaterModel::TIP4: {
  1392         settle1(ref, pos, sp);
  1393         tip4_Om_reposition(pos, sp->r_om, sp->r_ohc);
  1397         settle1(ref, pos, sp);
  1400     // load the mass to calculate the force
  1401     if (WATER_MODEL != WaterModel::TIP3) {
  1402       __shared__ float mass_buf[128][WATER_GROUP_SIZE];
  1403       ld_vecN<WATER_GROUP_SIZE>(mass, mass_buf, list, nSettles);
  1405       for (int i = 0; i < WATER_GROUP_SIZE; ++i) tmp_mass[i] = mass_buf[threadIdx.x][i];
  1407     // ------ Update the atomic velocities along X
  1409     for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][0] - ref[i][0])*invdt;
  1411     st_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
  1413     // buf here holds the new velocities. now we calculate the force contributions
  1414     // maybe it's this mass?
  1415     if (WATER_MODEL != WaterModel::TIP3) {
  1417       for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][0]) * (tmp_mass[i]*invdt);
  1419       buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][0]) * (sp->mO*invdt);
  1420       buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][0]) * (sp->mH*invdt);
  1421       buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][0]) * (sp->mH*invdt);
  1423     acc_vecN<WATER_GROUP_SIZE>(f_normal_x, buf, list, nSettles);
  1425     // ------ Update the atomic velocities along Y
  1427     for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][1] - ref[i][1])*invdt;
  1428     st_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
  1429     if (WATER_MODEL != WaterModel::TIP3) {
  1431       for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][1]) * (tmp_mass[i]*invdt);
  1433       buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][1]) * (sp->mO*invdt);
  1434       buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][1]) * (sp->mH*invdt);
  1435       buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][1]) * (sp->mH*invdt);
  1437     acc_vecN<WATER_GROUP_SIZE>(f_normal_y, buf, list, nSettles);
  1440     // ------ Update the atomic velocities along Z
  1442     for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][2] - ref[i][2])*invdt;
  1443     st_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
  1444     if (WATER_MODEL != WaterModel::TIP3) {
  1446       for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][2]) * (tmp_mass[i]*invdt);
  1448       buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][2]) * (sp->mO*invdt);
  1449       buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][2]) * (sp->mH*invdt);
  1450       buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][2]) * (sp->mH*invdt);
  1452     acc_vecN<WATER_GROUP_SIZE>(f_normal_z, buf, list, nSettles);
  1455     // Remaining threadblocks do MSHAKE.
  1456     // instead of tid - nSettle, IDX needs to be the first threadBlock that does MSHAKE instead of settle
  1457     int idx = tid - blockDim.x * nSettleBlocks; // Numbers of blocks that did SETTLE
  1458     // Ok, now we follow the logic of MSHAKE_CUDA_KERNEL
  1460       // This needs to increase monotonically
  1461       // if (threadIdx.x == 0) printf("tid %d idx %d\n", tid, idx);
  1462       // consFailure[idx] = 0;
  1463       int ig   = rattleList[idx].ig;
  1464       int icnt = rattleList[idx].icnt;
  1465       BigReal tol2 = tol2_d;
  1466       BigReal sigma[4] = {0};
  1467       BigReal lambda[4]= {0};
  1468       BigReal A[4][4]  = {0};
  1469       BigReal velx[4+1] = {0};
  1470       BigReal vely[4+1] = {0};
  1471       BigReal velz[4+1] = {0};
  1472       BigReal pabx[4] = {0};
  1473       BigReal rabx[4] = {0};
  1474       BigReal paby[4] = {0};
  1475       BigReal raby[4] = {0};
  1476       BigReal pabz[4] = {0};
  1477       BigReal rabz[4] = {0};
  1478       BigReal df[3]   = {0};
  1480       BigReal posx[4+1] = {0};
  1481       BigReal posy[4+1] = {0};
  1482       BigReal posz[4+1] = {0};
  1483       BigReal refx[4+1] = {0};
  1484       BigReal refy[4+1] = {0};
  1485       BigReal refz[4+1] = {0};
  1487       BigReal* posx = &sh_posx[threadIdx.x][0];
  1488       BigReal* posy = &sh_posy[threadIdx.x][0];
  1489       BigReal* posz = &sh_posz[threadIdx.x][0];
  1490       BigReal* refx = &sh_refx[threadIdx.x][0];
  1491       BigReal* refy = &sh_refy[threadIdx.x][0];
  1492       BigReal* refz = &sh_refz[threadIdx.x][0];
  1493       for(int i = 0; i < 4+1; ++i)
  1503       //load datas from global memory to local memory
  1504       CudaRattleParam rattleParam[4];
  1506       for(int i = 0; i < 4+1; ++i)
  1507       //for(int i = 0; i < hgs[i]; ++i)
  1509           //rattleParam[i] = rattleParam_d[4*idx+i];
  1510           if (i < 4) rattleParam[i] = rattleList[idx].params[i];
  1511           /*posx[i] = posx_d[i+4*idx];
  1512           posy[i] = posy_d[i+4*idx];
  1513           posz[i] = posz_d[i+4*idx];
  1514           refx[i] = refx_d[i+4*idx];
  1515           refx[i] = refx_d[i+4*idx];
  1516           refx[i] = refx_d[i+4*idx];
  1517           refy[i] = refy_d[i+4*idx];
  1518           refz[i] = refz_d[i+4*idx];*/
  1519           //I can pass the HGS and check if I is >= HGS[i]
  1520           if(i < hydrogenGroupSize[ig]){
  1521             refx[i] = pos_x[i+ig];
  1522             refy[i] = pos_y[i+ig];
  1523             refz[i] = pos_z[i+ig];
  1524             velx[i] = vel_x[i + ig];
  1525             vely[i] = vel_y[i + ig];
  1526             velz[i] = vel_z[i + ig];
  1528               posx[i] = refx[i] + (velx[i] * dt);
  1529               posy[i] = refy[i] + (vely[i] * dt);
  1530               posz[i] = refz[i] + (velz[i] * dt);
  1532               // Haochuan: don't update the positions for fixed atoms
  1533               if (!atomFixed[ig+i]) {
  1534                 posx[i] = refx[i] + (velx[i] * dt);
  1535                 posy[i] = refy[i] + (vely[i] * dt);
  1536                 posz[i] = refz[i] + (velz[i] * dt);
  1548       for(int i = 0; i < 4; ++i)
  1550           int a = rattleParam[i].ia;
  1551           int b = rattleParam[i].ib;
  1552           pabx[i] = posx[a] - posx[b];
  1553           paby[i] = posy[a] - posy[b];
  1554           pabz[i] = posz[a] - posz[b];
  1555           rabx[i] = refx[a] - refx[b];
  1556           raby[i] = refy[a] - refy[b];
  1557           rabz[i] = refz[a] - refz[b];
  1561       for(int i = 0; i < 4; ++i)
  1563           BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
  1564           BigReal rabsq = rattleParam[i].dsq;
  1565           BigReal diffsq = pabsq - rabsq;
  1567           if ( fabs(diffsq) > (rabsq * tol2)  && i < icnt)
  1570       int maxiter = maxiter_d;
  1571       for(loop = 0; loop < maxiter; ++loop)
  1577               for(int j = 0; j < 4; ++j)
  1579                   BigReal rma = rattleParam[j].rma;
  1581                   for(int i = 0; i < 4; ++i)
  1583                       A[j][i] = 2.*(pabx[j]*rabx[i]+paby[j]*raby[i]+pabz[j]*rabz[i])*rma;
  1585                   BigReal rmb = rattleParam[j].rmb;
  1586                   A[j][j] += 2.*(pabx[j]*rabx[j]+paby[j]*raby[j]+pabz[j]*rabz[j])*rmb;
  1589               solveMatrix(lambda, A, sigma, icnt);
  1591               for(int i = 0; i < 4; ++i)
  1593                   int a = rattleParam[i].ia;
  1594                   int b = rattleParam[i].ib;
  1595                   BigReal rma = rattleParam[i].rma * lambda[i];
  1596                   BigReal rmb = rattleParam[i].rmb * lambda[i];
  1598                   posx[a] -= rma * rabx[i];
  1599                   posy[a] -= rma * raby[i];
  1600                   posz[a] -= rma * rabz[i];
  1601                   posx[b] += rmb * rabx[i];
  1602                   posy[b] += rmb * raby[i];
  1603                   posz[b] += rmb * rabz[i];
  1610           for(int i = 0; i < 4; ++i)
  1612               int a = rattleParam[i].ia;
  1613               int b = rattleParam[i].ib;
  1614               pabx[i] = posx[a] - posx[b];
  1615               paby[i] = posy[a] - posy[b];
  1616               pabz[i] = posz[a] - posz[b];
  1617               BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
  1618               BigReal rabsq = rattleParam[i].dsq;
  1619               BigReal diffsq = pabsq - rabsq;
  1621               if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
  1626           consFailure[idx] = 1;
  1630 //           double m = mass[ig];
  1631           for(int i = 0; i < hydrogenGroupSize[ig]; ++i)
  1633             // posNew_x[i+ig] = posx[i];
  1634             // posNew_y[i+ig] = posy[i];
  1635             // posNew_z[i+ig] = posz[i];
  1638             // double vx  = velx_d[i + ig];
  1639             // double vy  = vely_d[i + ig];
  1640             // double vz  = velz_d[i + ig];
  1641             const double m = mass[i + ig];
  1642             double vnx = (posx[i] - refx[i]) * invdt;
  1643             double vny = (posy[i] - refy[i]) * invdt;
  1644             double vnz = (posz[i] - refz[i]) * invdt;
  1645             df[0] = (vnx - velx[i]) * (m * invdt);
  1646             df[1] = (vny - vely[i]) * (m * invdt);
  1647             df[2] = (vnz - velz[i]) * (m * invdt);
  1650             // Updates velocity and force
  1651             vel_x[i + ig] = vnx;
  1652             vel_y[i + ig] = vny;
  1653             vel_z[i + ig] = vnz;
  1655             f_normal_x[i + ig] += df[0];
  1656             f_normal_y[i + ig] += df[1];
  1657             f_normal_z[i + ig] += df[2];
  1659             // velx_d[i+ig] = (posx[i] - refx[i]) * invdt;
  1660             // vely_d[i+ig] = (posy[i] - refy[i]) * invdt;
  1661             // velz_d[i+ig] = (posz[i] - refz[i]) * invdt;
  1663             // velNew_x[i + ig] = vnx;
  1664             // velNew_y[i + ig] = vny;
  1665             // velNew_z[i + ig] = vnz;
  1668           // consFailure[idx] = 0;
  1669           //printf("Loop is %d %d\n", loop, consFailure[idx]);
  1675 void CallRattle1Kernel(
  1678   // Settle Parameters
  1682   double * __restrict                   vel_x,
  1683   double * __restrict                   vel_y,
  1684   double * __restrict                   vel_z,
  1685   const double * __restrict             pos_x,
  1686   const double * __restrict             pos_y,
  1687   const double * __restrict             pos_z,
  1688   double * __restrict                   f_normal_x,
  1689   double * __restrict                   f_normal_y,
  1690   double * __restrict                   f_normal_z,
  1691   const float * __restrict              mass,
  1692   const int   * __restrict              hydrogenGroupSize,
  1693   const float * __restrict              rigidBondLength,
  1694   const int   * __restrict              atomFixed,
  1695   int * __restrict                      settleList,
  1696   const SettleParameters *              sp,
  1697   const CudaRattleElem * __restrict     rattleList,
  1699   const BigReal                         tol2_d,
  1700   const int                             maxiter_d,
  1702   const int                             nSettleBlocks,
  1703   const int                             nShakeBlocks,
  1704   const WaterModel                      water_model,
  1707   const int nTotalBlocks  = (nSettleBlocks + nShakeBlocks);
  1708 #define CALL(WATER_MODEL, DOFIXED) \
  1709     Rattle1Kernel<getWaterModelGroupSize(WATER_MODEL), WATER_MODEL, DOFIXED> \
  1710     <<<nTotalBlocks, 128, 0, stream>>> \
  1711     (numAtoms, dt, invdt, nSettles, \
  1712      vel_x, vel_y, vel_z, \
  1713      pos_x, pos_y, pos_z, \
  1714      f_normal_x, f_normal_y, f_normal_z, \
  1715      mass, hydrogenGroupSize, rigidBondLength, atomFixed, \
  1717      rattleList, nShakes, tol2_d, maxiter_d, consFailure, nSettleBlocks);
  1719       switch (water_model) {
  1720         case WaterModel::SWM4: CALL(WaterModel::SWM4, true); break;
  1721         case WaterModel::TIP4: CALL(WaterModel::TIP4, true); break;
  1722         case WaterModel::TIP3: CALL(WaterModel::TIP3, true); break;
  1725       switch (water_model) {
  1726         case WaterModel::SWM4: CALL(WaterModel::SWM4, false); break;
  1727         case WaterModel::TIP4: CALL(WaterModel::TIP4, false); break;
  1728         case WaterModel::TIP3: CALL(WaterModel::TIP3, false); break;
  1732   cudaStreamSynchronize(stream);