// ---------------------------------------------------------------------------
// NOTE(review): this is the interior of a pairwise Generalized-Born (GBIS)
// CUDA kernel; the kernel signature, closing brace, and many interior lines
// are outside this view.  The embedded numbers ("290", "291", ...) and the
// mid-statement line breaks are extraction artifacts, not code — they are
// preserved byte-for-byte here, with review comments added between them.
// Layout assumed by the visible code: blockDim = (WARPSIZE, NUM_WARP), one
// patch pair per thread block (indexed by blockIdx.x) — TODO confirm against
// the launch site.
// ---------------------------------------------------------------------------

// Patch-pair descriptor for this thread block, staged once into shared memory.
290   __shared__ patch_pair sh_patch_pair;
// Non-Kepler (no warp-shuffle) path: per-warp shared-memory tiles for the
// "j" atoms.  threadIdx.y selects the warp's row; WARPSIZE entries per row.
   291 #ifndef KEPLER_SHUFFLE   292   __shared__ atom sh_jpq_2d[NUM_WARP][
WARPSIZE];
// Born radii of the j-tile atoms (same per-warp layout).
   293   __shared__ 
float sh_jBornRad_2d[NUM_WARP][
WARPSIZE];
// Per-warp accumulators: force on each j atom, and its dEda sum.
// (sh_forceJ_2d is later reused as scratch for the energy reduction — see
// the sh_energy alias below.)
   295   __shared__ float3 sh_forceJ_2d[NUM_WARP][
WARPSIZE];
   296   __shared__ 
float sh_dEdaSumJ_2d[NUM_WARP][
WARPSIZE];
// Convenience views: each warp works on its own row of the 2D tiles.
// volatile because lanes of the same warp read entries written by other
// lanes without an intervening barrier.
   298 #ifndef KEPLER_SHUFFLE   299   volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
   300   volatile float* sh_jBornRad = sh_jBornRad_2d[threadIdx.y];
   302   volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
   303   volatile float* sh_dEdaSumJ = sh_dEdaSumJ_2d[threadIdx.y];
// Cooperative load of patch_pairs[blockIdx.x] into sh_patch_pair, one int
// per thread (the actual copy loop is in lines elided from this view).
// t is the flat thread index within the block.
   307     const int t = threadIdx.x + threadIdx.y*
WARPSIZE;
   308     if (t < PATCH_PAIR_SIZE) {
   309       int* src = (
int *)&patch_pairs[blockIdx.x];
   310       int* dst = (
int *)&sh_patch_pair;
// Convert the periodic-image offset from lattice coordinates to Cartesian
// using the lattice vectors lata/latb/latc (kernel arguments not visible
// here), overwriting sh_patch_pair.offset in place.
   317       float offx = sh_patch_pair.offset.x * lata.x 
   318                   + sh_patch_pair.offset.y * latb.x
   319                   + sh_patch_pair.offset.z * latc.x;
   320       float offy = sh_patch_pair.offset.x * lata.y
   321                   + sh_patch_pair.offset.y * latb.y
   322                   + sh_patch_pair.offset.z * latc.y;
   323       float offz = sh_patch_pair.offset.x * lata.z
   324                   + sh_patch_pair.offset.y * latb.z
   325                   + sh_patch_pair.offset.z * latc.z;
   326       sh_patch_pair.offset.x = offx;
   327       sh_patch_pair.offset.y = offy;
   328       sh_patch_pair.offset.z = offz;
// Precompute cutoff powers and reciprocal dielectrics used in the inner loop.
// r_cut_4 carries the 4x factor used by the switching-function derivative.
   336   float r_cut2 = r_cut*r_cut;
   337   float r_cut_2 = 1.f / r_cut2;
   338   float r_cut_4 = 4.f*r_cut_2*r_cut_2;
   339   float epsilon_s_i = 1.f / epsilon_s;
   340   float epsilon_p_i = 1.f / epsilon_p;
// Outer loop over WARPSIZE-wide tiles of patch-1 ("i") atoms.  Each warp
// (threadIdx.y) starts on a different tile; the whole block strides by
// BLOCK_SIZE so the warps interleave over the patch.
   343   for ( 
int blocki = threadIdx.y*
WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += BLOCK_SIZE ) {
   345     int nloopi = sh_patch_pair.patch1_size - blocki;
// Load this lane's i atom (declarations of i/atomi/bornRadI are in elided
// lines).  Position is shifted by the Cartesian patch offset; the charge is
// pre-multiplied by -scaling so the pair term below needs no extra sign —
// see the "/ (-scaling)" compensation in the diagonal self-energy term.
   354     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
   355       i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   356       float4 tmpa = ((float4*)atoms)[i];
   357       atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
   358       atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
   359       atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
   360       atomi.charge = - tmpa.w * scaling;
   361       bornRadI = bornRad[i];
// A self-pair (same patch, zero image offset) only needs the upper triangle
// of tiles, so start the j loop at blocki instead of 0.
   371     const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) && 
   372     (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
   373     int blockj = (diag_patch_pair) ? blocki : 0;
// Inner loop over WARPSIZE-wide tiles of patch-2 ("j") atoms.
   375     for (; blockj < sh_patch_pair.patch2_size; blockj += 
WARPSIZE) {
   377       int nloopj = min(sh_patch_pair.patch2_size - blockj, 
WARPSIZE);
// Load this lane's j atom.  On the KEPLER_SHUFFLE path it stays in
// registers (broadcast later via shuffles, elided here); otherwise it is
// written into the per-warp shared tile.
   379 #ifdef KEPLER_SHUFFLE   388       if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   389         int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   390         float4 tmpa = ((float4*)atoms)[j];
   391 #ifdef KEPLER_SHUFFLE   396         bornRadJ = bornRad[j];
   398         sh_jpq[threadIdx.x].position.x = tmpa.x;
   399         sh_jpq[threadIdx.x].position.y = tmpa.y;
   400         sh_jpq[threadIdx.x].position.z = tmpa.z;
   401         sh_jpq[threadIdx.x].charge = tmpa.w;
   402         sh_jBornRad[threadIdx.x] = bornRad[j];
// Zero this tile's per-j accumulators before the lane sweep.
   406       sh_forceJ[threadIdx.x].x = 0.f;
   407       sh_forceJ[threadIdx.x].y = 0.f;
   408       sh_forceJ[threadIdx.x].z = 0.f;
   409       sh_dEdaSumJ[threadIdx.x] = 0.f;
// diag_tile: tile on the self-pair diagonal (i-tile == j-tile), which
// contains i==j self entries and double-counted pairs.
   411       const bool diag_tile = diag_patch_pair && (blocki == blockj);
// Rotated j index: each lane starts at a different j so the lanes sweep the
// tile without colliding on sh_forceJ[j]/sh_dEdaSumJ[j] below.  The sweep
// loop header and modval are in elided lines — presumably
// modval = WARPSIZE (or nloopj); TODO confirm.
   414         int j = (t + threadIdx.x) % modval;
// Non-shuffle path: fetch the j atom from the shared tile.
   415 #ifndef KEPLER_SHUFFLE   416         float xj = sh_jpq[j].position.x;
   417         float yj = sh_jpq[j].position.y;
   418         float zj = sh_jpq[j].position.z;
   419         float chargej = sh_jpq[j].charge;
   420         float bornRadJ = sh_jBornRad[j];
// Both lane's i atom and this j must be in range.
   422         if (j < nloopj && threadIdx.x < nloopi)
   424           float dx = atomi.position.x - xj;
   425           float dy = atomi.position.y - yj;
   426           float dz = atomi.position.z - zj;
   427           float r2 = dx*dx + dy*dy + dz*dz;
// Cutoff test; the r2 > 0.01f floor presumably rejects the i==j self entry
// (and near-coincident positions) on diagonal tiles — TODO confirm.
   430           if (r2 < r_cut2 && r2 > 0.01f) {
// NOTE(review): double-precision sqrt/exp on float args below (sqrt, exp
// rather than sqrtf/expf/rsqrtf) — possibly intentional for accuracy, but
// worth checking; r_i is computed here while later terms use f_i and r,
// which are defined in elided lines.
   432             float r_i = 1.f / sqrt(r2);
// Generalized-Born pair energy: fij = sqrt(r2 + ai*aj*exp(-r2/(4*ai*aj)))
// (Still-type GB distance function), Debye screening via exp(-kappa*fij),
// dielectric factor Dij, energy gbEij = qi*qj*Dij/fij.
   437             float qiqj = atomi.charge*chargej;
   438             float aiaj = bornRadI*bornRadJ;
   439             float aiaj4 = 4*aiaj;
   440             float expr2aiaj4 = exp(-r2/aiaj4);
   441             float fij = sqrt(r2+aiaj*expr2aiaj4);
   443             float expkappa = exp(-kappa*fij);
   444             float Dij = epsilon_p_i - expkappa*epsilon_s_i;
   445             float gbEij = qiqj*Dij*f_i;
// d/dr of fij, 1/fij, Dij, and the pair energy (chain rule).
   448             float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
   449             float ddrf_i = -ddrfij*f_i*f_i;
   450             float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
   451             float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
// Smooth truncation near the cutoff: inside the smoothing region scale the
// energy by (r2/r_cut2 - 1)^? and add the switching-derivative term; the
// scale declaration, smoothDist computation, and part of the else branch
// are in elided lines.
   455             float ddrScale = 0.f;
   457             if (smoothDist > 0.f) {
   458               scale = r2 * r_cut_2 - 1.f;
   460               ddrScale = r*(r2-r_cut2)*r_cut_4;
   461               energyT += gbEij * scale;
   462               forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
// (else branch — header elided) unsmoothed force.
   465               forcedEdr = -ddrGbEij;
// Born-radius derivatives dE/da_i and dE/da_j (accumulation of dEdai into
// dEdaSumI happens in an elided line).  Lanes write distinct j slots thanks
// to the rotated index, so the += on shared memory is race-free within the
// warp — presumably; verify on the independent-thread-scheduling (Volta+)
// path.
   470               float dEdai = 0.5f*qiqj*f_i*f_i
   471                         *(kappa*epsilon_s_i*expkappa-Dij*f_i)
   472                         *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;
   474               float dEdaj = 0.5f*qiqj*f_i*f_i
   475                         *(kappa*epsilon_s_i*expkappa-Dij*f_i)
   476                         *(aiaj+0.25f*r2)*expr2aiaj4/bornRadJ*scale;
   477               sh_dEdaSumJ[j] += dEdaj;
// Pair force: equal and opposite on i (accumulation into forceI elided) and j.
   481             float tmpx = dx*forcedEdr;
   482             float tmpy = dy*forcedEdr;
   483             float tmpz = dz*forcedEdr;
   487             sh_forceJ[j].x -= tmpx;
   488             sh_forceJ[j].y -= tmpy;
   489             sh_forceJ[j].z -= tmpz;
// Diagonal self-energy term (enclosing condition elided — presumably the
// i==j case on a diag_tile): fij degenerates to the Born radius itself, and
// "/(-scaling)" undoes the scaling factor folded into atomi.charge so the
// 0.5*qi^2 self term gets only one factor of scaling.
   494               float fij = bornRadI;
   495               float expkappa = exp(-kappa*fij);
   496               float Dij = epsilon_p_i - expkappa*epsilon_s_i;
   497               float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
   498               energyT += 0.5f*gbEij;
// End of lane sweep: flush this tile's j accumulators to the global
// temporaries with atomics (other blocks may target the same atoms).  The
// KEPLER_SHUFFLE variant of this flush is elided.
   502 #ifdef KEPLER_SHUFFLE   510       if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   511         int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   512         atomicAdd(&tmp_dEdaSum[i_out], sh_dEdaSumJ[threadIdx.x]);
   513         atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
   514         atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
   515         atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
// Flush this lane's i-atom accumulators (forceI/dEdaSumI, declared in
// elided lines) to the global temporaries.
   521     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
   522       int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   523       atomicAdd(&tmp_dEdaSum[i_out], dEdaSumI);
   524       atomicAdd(&tmp_forces[i_out].x, forceI.x);
   525       atomicAdd(&tmp_forces[i_out].y, forceI.y);
   526       atomicAdd(&tmp_forces[i_out].z, forceI.z);
// Per-warp energy reduction, reusing the sh_forceJ_2d tile (done with its
// force role) as float scratch.  The reduction loop header over d is elided
// — presumably a standard halving tree; the (pos < WARPSIZE) guard zero-pads
// out-of-range partners.
   534     volatile float* sh_energy = (
float *)&sh_forceJ_2d[threadIdx.y][0].x;
   536     sh_energy[threadIdx.x] = (
float)energyT;
   538       int pos = threadIdx.x + d;
   539       float val = (pos < 
WARPSIZE) ? sh_energy[pos] : 0.0f;
   540       sh_energy[threadIdx.x] += val;
// Thread (0,0) sums the NUM_WARP per-warp partials (element 0 of each row)
// and adds the block's energy to the patch-1 bin.
   544     if (threadIdx.x == 0 && threadIdx.y == 0) {
   545       float tot_energy = 0.0f;
   547       for (
int i=0;i < NUM_WARP;++i) {
   548         tot_energy += ((
float *)&sh_forceJ_2d[i][0].x)[0];
   550       int patch1_ind = sh_patch_pair.patch1_ind;
   551       atomicAdd(&tmp_energy[patch1_ind], (
float)tot_energy);
// --- Patch-completion bookkeeping -----------------------------------------
// Each patch is touched by patch*_num_pairs blocks.  atomicInc with wrap
// value (num_pairs - 1) counts them modulo num_pairs, so exactly the last
// block to finish a patch observes old+1 == num_pairs and marks patch_done
// (and the counter is left at 0 for the next invocation).
   561     int patch1_ind = sh_patch_pair.patch1_ind;
   562     int patch2_ind = sh_patch_pair.patch2_ind;
   564     if (threadIdx.x == 0 && threadIdx.y == 0) {
   565       sh_patch_pair.patch_done[0] = 
false;
   566       sh_patch_pair.patch_done[1] = 
false;
   568       unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
   569       int patch1_old = atomicInc(&P2_counters[patch1_ind], patch1_num_pairs-1);
   570       if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = 
true;
// A self-pair increments its patch counter only once.
   571       if (patch1_ind != patch2_ind) {
   572         unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
   573         int patch2_old = atomicInc(&P2_counters[patch2_ind], patch2_num_pairs-1);
   574         if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = 
true;
// The finishing block for a patch publishes the accumulated temporaries to
// the final output arrays (all threads of the block cooperate, flat-index
// grid-style stride NUM_WARP*WARPSIZE).  A barrier making patch_done
// visible to all threads presumably sits in the elided lines — confirm.
   580     if (sh_patch_pair.patch_done[0]) {
   581       const int start = sh_patch_pair.patch1_start;
   582       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*
WARPSIZE) {
   583         forces[start+i] = tmp_forces[start+i];
   584         dEdaSum[start+i] = tmp_dEdaSum[start+i];
   586       energy[patch1_ind] = tmp_energy[patch1_ind];
   589     if (sh_patch_pair.patch_done[1]) {
   590       const int start = sh_patch_pair.patch2_start;
   591       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*
WARPSIZE) {
   592         forces[start+i] = tmp_forces[start+i];
   593         dEdaSum[start+i] = tmp_dEdaSum[start+i];
   595       energy[patch2_ind] = tmp_energy[patch2_ind];
// Make the published results visible to the host/other kernels; the
// system-scope fence is the pre-Fermi (< SM 2.0) path, the #else branch is
// elided from this view.
   598     if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
   600 #if __CUDA_ARCH__ < 200   603       __threadfence_system();
 
// Start of a warp-shuffle abstraction macro; its replacement body (likely
// continued on following lines outside this view) is not visible here.
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)