8 #if __CUDA_ARCH__ < 300    19   const patch_pair *patch_pairs, 
    30   unsigned int *P1_counters
    35   __shared__ patch_pair sh_patch_pair;
    36 #ifndef KEPLER_SHUFFLE    37   __shared__ atom sh_jpq_2d[NUM_WARP][
WARPSIZE];
    38   __shared__ 
float sh_intRad0j_2d[NUM_WARP][
WARPSIZE];
    41   volatile GBReal* sh_psiSumJ = sh_psiSumJ_2d[threadIdx.y];
    42 #ifndef KEPLER_SHUFFLE    43   volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
    44   volatile float* sh_intRad0j = sh_intRad0j_2d[threadIdx.y];
    49     const int t = threadIdx.x + threadIdx.y*
WARPSIZE;
    50     if (t < PATCH_PAIR_SIZE) {
    51       int* src = (
int *)&patch_pairs[blockIdx.x];
    52       int* dst = (
int *)&sh_patch_pair;
    59       float offx = sh_patch_pair.offset.x * lata.x 
    60                   + sh_patch_pair.offset.y * latb.x
    61                   + sh_patch_pair.offset.z * latc.x;
    62       float offy = sh_patch_pair.offset.x * lata.y
    63                   + sh_patch_pair.offset.y * latb.y
    64                   + sh_patch_pair.offset.z * latc.y;
    65       float offz = sh_patch_pair.offset.x * lata.z
    66                   + sh_patch_pair.offset.y * latb.z
    67                   + sh_patch_pair.offset.z * latc.z;
    68       sh_patch_pair.offset.x = offx;
    69       sh_patch_pair.offset.y = offy;
    70       sh_patch_pair.offset.z = offz;
    76   for (
int blocki = threadIdx.y*
WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*
WARPSIZE) {
    78     int nloopi = sh_patch_pair.patch1_size - blocki;
    87     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
    88       i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
    89       float4 tmpa = ((float4*)atoms)[i];
    90       atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
    91       atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
    92       atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
    93       atomi.charge = intRad0[i]; 
    94       intRadSi = intRadS[i];
   100     const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) && 
   101     (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
   102     int blockj = (diag_patch_pair) ? blocki : 0;
   105     for (; blockj < sh_patch_pair.patch2_size; blockj += 
WARPSIZE ) {
   107       int nloopj = min(sh_patch_pair.patch2_size - blockj, 
WARPSIZE);
   109 #ifdef KEPLER_SHUFFLE   118       if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   119         int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   120         float4 tmpa = ((float4*)atoms)[j];
   121 #ifdef KEPLER_SHUFFLE   125         chargej = intRadS[j];
   126         intRad0j_val = intRad0[j];
   128         sh_jpq[threadIdx.x].position.x = tmpa.x;
   129         sh_jpq[threadIdx.x].position.y = tmpa.y;
   130         sh_jpq[threadIdx.x].position.z = tmpa.z;
   131         sh_jpq[threadIdx.x].charge = intRadS[j];
   132         sh_intRad0j[threadIdx.x] = intRad0[j];
   136       const bool diag_tile = diag_patch_pair && (blocki == blockj);
   139       sh_psiSumJ[threadIdx.x] = 0.f;
   142       int t = diag_tile ? 1 : 0;
   143 #ifdef KEPLER_SHUFFLE   154         int j = (t + threadIdx.x) % modval;
   155 #ifndef KEPLER_SHUFFLE   156         float xj = sh_jpq[j].position.x;
   157         float yj = sh_jpq[j].position.y;
   158         float zj = sh_jpq[j].position.z;
   159         float chargej = sh_jpq[j].charge;
   160         float intRad0j_val = sh_intRad0j[j];
   162         if (j < nloopj && threadIdx.x < nloopi)
   164           float dx = atomi.position.x - xj;
   165           float dy = atomi.position.y - yj;
   166           float dz = atomi.position.z - zj;
   167           float r2 = dx*dx + dy*dy + dz*dz;
   172             float r_i = 1.f / sqrt(r2);
   176             CalcH(r,r2,r_i,a_cut,atomi.charge,chargej,hij,dij);
   180             CalcH(r,r2,r_i,a_cut,intRad0j_val,intRadSi,hji,dji);
   181             sh_psiSumJ[j] += hji;
   184 #ifdef KEPLER_SHUFFLE           193       if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   194         int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   195        atomicAdd(&tmp_psiSum[i_out], sh_psiSumJ[threadIdx.x]);
   202     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
   203       int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   204       atomicAdd(&tmp_psiSum[i_out], psiSumI);
   214     int patch1_ind = sh_patch_pair.patch1_ind;
   215     int patch2_ind = sh_patch_pair.patch2_ind;
   217     if (threadIdx.x == 0 && threadIdx.y == 0) {
   218       sh_patch_pair.patch_done[0] = 
false;
   219       sh_patch_pair.patch_done[1] = 
false;
   221       unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
   222       int patch1_old = atomicInc(&P1_counters[patch1_ind], patch1_num_pairs-1);
   223       if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = 
true;
   224       if (patch1_ind != patch2_ind) {
   225         unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
   226         int patch2_old = atomicInc(&P1_counters[patch2_ind], patch2_num_pairs-1);
   227         if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = 
true;
   233     if (sh_patch_pair.patch_done[0]) {
   234       const int start = sh_patch_pair.patch1_start;
   235       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*
WARPSIZE) {
   236         psiSum[start+i] = tmp_psiSum[start+i];
   240     if (sh_patch_pair.patch_done[1]) {
   241       const int start = sh_patch_pair.patch2_start;
   242       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*
WARPSIZE) {
   243         psiSum[start+i] = tmp_psiSum[start+i];
   247     if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
   249 #if __CUDA_ARCH__ < 200   252       __threadfence_system();
   265   const patch_pair *patch_pairs,
   267   const float *bornRad,         
   274   const float smoothDist,       
   275   const float epsilon_p,        
   276   const float epsilon_s,        
   281   const int doFullElec,         
   286   unsigned int *P2_counters
   290   __shared__ patch_pair sh_patch_pair;
   291 #ifndef KEPLER_SHUFFLE   292   __shared__ atom sh_jpq_2d[NUM_WARP][
WARPSIZE];
   293   __shared__ 
float sh_jBornRad_2d[NUM_WARP][
WARPSIZE];
   295   __shared__ float3 sh_forceJ_2d[NUM_WARP][
WARPSIZE];
   296   __shared__ 
float sh_dEdaSumJ_2d[NUM_WARP][
WARPSIZE];
   298 #ifndef KEPLER_SHUFFLE   299   volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
   300   volatile float* sh_jBornRad = sh_jBornRad_2d[threadIdx.y];
   302   volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
   303   volatile float* sh_dEdaSumJ = sh_dEdaSumJ_2d[threadIdx.y];
   307     const int t = threadIdx.x + threadIdx.y*
WARPSIZE;
   308     if (t < PATCH_PAIR_SIZE) {
   309       int* src = (
int *)&patch_pairs[blockIdx.x];
   310       int* dst = (
int *)&sh_patch_pair;
   317       float offx = sh_patch_pair.offset.x * lata.x 
   318                   + sh_patch_pair.offset.y * latb.x
   319                   + sh_patch_pair.offset.z * latc.x;
   320       float offy = sh_patch_pair.offset.x * lata.y
   321                   + sh_patch_pair.offset.y * latb.y
   322                   + sh_patch_pair.offset.z * latc.y;
   323       float offz = sh_patch_pair.offset.x * lata.z
   324                   + sh_patch_pair.offset.y * latb.z
   325                   + sh_patch_pair.offset.z * latc.z;
   326       sh_patch_pair.offset.x = offx;
   327       sh_patch_pair.offset.y = offy;
   328       sh_patch_pair.offset.z = offz;
   336   float r_cut2 = r_cut*r_cut;
   337   float r_cut_2 = 1.f / r_cut2;
   338   float r_cut_4 = 4.f*r_cut_2*r_cut_2;
   339   float epsilon_s_i = 1.f / epsilon_s;
   340   float epsilon_p_i = 1.f / epsilon_p;
   343   for ( 
int blocki = threadIdx.y*
WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += BLOCK_SIZE ) {
   345     int nloopi = sh_patch_pair.patch1_size - blocki;
   354     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
   355       i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   356       float4 tmpa = ((float4*)atoms)[i];
   357       atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
   358       atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
   359       atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
   360       atomi.charge = - tmpa.w * scaling;
   361       bornRadI = bornRad[i];
   371     const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) && 
   372     (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
   373     int blockj = (diag_patch_pair) ? blocki : 0;
   375     for (; blockj < sh_patch_pair.patch2_size; blockj += 
WARPSIZE) {
   377       int nloopj = min(sh_patch_pair.patch2_size - blockj, 
WARPSIZE);
   379 #ifdef KEPLER_SHUFFLE   388       if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   389         int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   390         float4 tmpa = ((float4*)atoms)[j];
   391 #ifdef KEPLER_SHUFFLE   396         bornRadJ = bornRad[j];
   398         sh_jpq[threadIdx.x].position.x = tmpa.x;
   399         sh_jpq[threadIdx.x].position.y = tmpa.y;
   400         sh_jpq[threadIdx.x].position.z = tmpa.z;
   401         sh_jpq[threadIdx.x].charge = tmpa.w;
   402         sh_jBornRad[threadIdx.x] = bornRad[j];
   406       sh_forceJ[threadIdx.x].x = 0.f;
   407       sh_forceJ[threadIdx.x].y = 0.f;
   408       sh_forceJ[threadIdx.x].z = 0.f;
   409       sh_dEdaSumJ[threadIdx.x] = 0.f;
   411       const bool diag_tile = diag_patch_pair && (blocki == blockj);
   414         int j = (t + threadIdx.x) % modval;
   415 #ifndef KEPLER_SHUFFLE   416         float xj = sh_jpq[j].position.x;
   417         float yj = sh_jpq[j].position.y;
   418         float zj = sh_jpq[j].position.z;
   419         float chargej = sh_jpq[j].charge;
   420         float bornRadJ = sh_jBornRad[j];
   422         if (j < nloopj && threadIdx.x < nloopi)
   424           float dx = atomi.position.x - xj;
   425           float dy = atomi.position.y - yj;
   426           float dz = atomi.position.z - zj;
   427           float r2 = dx*dx + dy*dy + dz*dz;
   430           if (r2 < r_cut2 && r2 > 0.01f) {
   432             float r_i = 1.f / sqrt(r2);
   437             float qiqj = atomi.charge*chargej;
   438             float aiaj = bornRadI*bornRadJ;
   439             float aiaj4 = 4*aiaj;
   440             float expr2aiaj4 = exp(-r2/aiaj4);
   441             float fij = sqrt(r2+aiaj*expr2aiaj4);
   443             float expkappa = exp(-kappa*fij);
   444             float Dij = epsilon_p_i - expkappa*epsilon_s_i;
   445             float gbEij = qiqj*Dij*f_i;
   448             float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
   449             float ddrf_i = -ddrfij*f_i*f_i;
   450             float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
   451             float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
   455             float ddrScale = 0.f;
   457             if (smoothDist > 0.f) {
   458               scale = r2 * r_cut_2 - 1.f;
   460               ddrScale = r*(r2-r_cut2)*r_cut_4;
   461               energyT += gbEij * scale;
   462               forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
   465               forcedEdr = -ddrGbEij;
   470               float dEdai = 0.5f*qiqj*f_i*f_i
   471                         *(kappa*epsilon_s_i*expkappa-Dij*f_i)
   472                         *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;
   474               float dEdaj = 0.5f*qiqj*f_i*f_i
   475                         *(kappa*epsilon_s_i*expkappa-Dij*f_i)
   476                         *(aiaj+0.25f*r2)*expr2aiaj4/bornRadJ*scale;
   477               sh_dEdaSumJ[j] += dEdaj;
   481             float tmpx = dx*forcedEdr;
   482             float tmpy = dy*forcedEdr;
   483             float tmpz = dz*forcedEdr;
   487             sh_forceJ[j].x -= tmpx;
   488             sh_forceJ[j].y -= tmpy;
   489             sh_forceJ[j].z -= tmpz;
   494               float fij = bornRadI;
   495               float expkappa = exp(-kappa*fij);
   496               float Dij = epsilon_p_i - expkappa*epsilon_s_i;
   497               float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
   498               energyT += 0.5f*gbEij;
   502 #ifdef KEPLER_SHUFFLE   510       if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   511         int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   512         atomicAdd(&tmp_dEdaSum[i_out], sh_dEdaSumJ[threadIdx.x]);
   513         atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
   514         atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
   515         atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
   521     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
   522       int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   523       atomicAdd(&tmp_dEdaSum[i_out], dEdaSumI);
   524       atomicAdd(&tmp_forces[i_out].x, forceI.x);
   525       atomicAdd(&tmp_forces[i_out].y, forceI.y);
   526       atomicAdd(&tmp_forces[i_out].z, forceI.z);
   534     volatile float* sh_energy = (
float *)&sh_forceJ_2d[threadIdx.y][0].x;
   536     sh_energy[threadIdx.x] = (
float)energyT;
   538       int pos = threadIdx.x + d;
   539       float val = (pos < 
WARPSIZE) ? sh_energy[pos] : 0.0f;
   540       sh_energy[threadIdx.x] += val;
   544     if (threadIdx.x == 0 && threadIdx.y == 0) {
   545       float tot_energy = 0.0f;
   547       for (
int i=0;i < NUM_WARP;++i) {
   548         tot_energy += ((
float *)&sh_forceJ_2d[i][0].x)[0];
   550       int patch1_ind = sh_patch_pair.patch1_ind;
   551       atomicAdd(&tmp_energy[patch1_ind], (
float)tot_energy);
   561     int patch1_ind = sh_patch_pair.patch1_ind;
   562     int patch2_ind = sh_patch_pair.patch2_ind;
   564     if (threadIdx.x == 0 && threadIdx.y == 0) {
   565       sh_patch_pair.patch_done[0] = 
false;
   566       sh_patch_pair.patch_done[1] = 
false;
   568       unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
   569       int patch1_old = atomicInc(&P2_counters[patch1_ind], patch1_num_pairs-1);
   570       if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = 
true;
   571       if (patch1_ind != patch2_ind) {
   572         unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
   573         int patch2_old = atomicInc(&P2_counters[patch2_ind], patch2_num_pairs-1);
   574         if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = 
true;
   580     if (sh_patch_pair.patch_done[0]) {
   581       const int start = sh_patch_pair.patch1_start;
   582       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*
WARPSIZE) {
   583         forces[start+i] = tmp_forces[start+i];
   584         dEdaSum[start+i] = tmp_dEdaSum[start+i];
   586       energy[patch1_ind] = tmp_energy[patch1_ind];
   589     if (sh_patch_pair.patch_done[1]) {
   590       const int start = sh_patch_pair.patch2_start;
   591       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*
WARPSIZE) {
   592         forces[start+i] = tmp_forces[start+i];
   593         dEdaSum[start+i] = tmp_dEdaSum[start+i];
   595       energy[patch2_ind] = tmp_energy[patch2_ind];
   598     if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
   600 #if __CUDA_ARCH__ < 200   603       __threadfence_system();
   616   const patch_pair *patch_pairs,  
   618   const float *intRad0,           
   619   const float *intRadS,           
   620   const float *dHdrPrefix,        
   629   unsigned int *P3_counters
   633   __shared__ patch_pair sh_patch_pair;
   634 #ifndef KEPLER_SHUFFLE   635   __shared__ atom sh_jpq_2d[NUM_WARP][
WARPSIZE];
   636   __shared__ 
float sh_intRadJ0_2d[NUM_WARP][
WARPSIZE];
   637   __shared__ 
float sh_jDHdrPrefix_2d[NUM_WARP][
WARPSIZE];
   639   __shared__ float3 sh_forceJ_2d[NUM_WARP][
WARPSIZE];
   641 #ifndef KEPLER_SHUFFLE   642   volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
   643   volatile float* sh_intRadJ0 = sh_intRadJ0_2d[threadIdx.y];
   644   volatile float* sh_jDHdrPrefix = sh_jDHdrPrefix_2d[threadIdx.y];
   646   volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
   650     const int t = threadIdx.x + threadIdx.y*
WARPSIZE;
   651     if (t < PATCH_PAIR_SIZE) {
   652       int* src = (
int *)&patch_pairs[blockIdx.x];
   653       int* dst = (
int *)&sh_patch_pair;
   660       float offx = sh_patch_pair.offset.x * lata.x 
   661                   + sh_patch_pair.offset.y * latb.x
   662                   + sh_patch_pair.offset.z * latc.x;
   663       float offy = sh_patch_pair.offset.x * lata.y
   664                   + sh_patch_pair.offset.y * latb.y
   665                   + sh_patch_pair.offset.z * latc.y;
   666       float offz = sh_patch_pair.offset.x * lata.z
   667                   + sh_patch_pair.offset.y * latb.z
   668                   + sh_patch_pair.offset.z * latc.z;
   669       sh_patch_pair.offset.x = offx;
   670       sh_patch_pair.offset.y = offy;
   671       sh_patch_pair.offset.z = offz;
   677   for ( 
int blocki = threadIdx.y*
WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*
WARPSIZE ) {
   679     int nloopi = sh_patch_pair.patch1_size - blocki;
   689     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
   690       i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   691       float4 tmpa = ((float4*)atoms)[i];
   692       atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
   693       atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
   694       atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
   695       atomi.charge = intRad0[i]; 
   696       intRadIS = intRadS[i];
   697       dHdrPrefixI = dHdrPrefix[i];
   706     const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) && 
   707     (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
   708     int blockj = (diag_patch_pair) ? blocki : 0;
   710     for (; blockj < sh_patch_pair.patch2_size; blockj += 
WARPSIZE ) {
   712       int nloopj = min(sh_patch_pair.patch2_size - blockj, 
WARPSIZE);
   714 #ifdef KEPLER_SHUFFLE   724       if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   725         int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   726         float4 tmpa = ((float4*)atoms)[j];
   727 #ifdef KEPLER_SHUFFLE   731         intRadSJ = intRadS[j];
   732         dHdrPrefixJ = dHdrPrefix[j];
   733         intRadJ0 = intRad0[j];
   735         sh_jpq[threadIdx.x].position.x = tmpa.x;
   736         sh_jpq[threadIdx.x].position.y = tmpa.y;
   737         sh_jpq[threadIdx.x].position.z = tmpa.z;
   738         sh_jpq[threadIdx.x].charge = intRadS[j];
   739         sh_jDHdrPrefix[threadIdx.x] = dHdrPrefix[j]; 
   740         sh_intRadJ0[threadIdx.x] = intRad0[j];
   744       sh_forceJ[threadIdx.x].x = 0.f;
   745       sh_forceJ[threadIdx.x].y = 0.f;
   746       sh_forceJ[threadIdx.x].z = 0.f;
   748       const bool diag_tile = diag_patch_pair && (blocki == blockj);
   750 #ifdef KEPLER_SHUFFLE   760       int t = diag_tile ? 1 : 0;
   762         int j = (t + threadIdx.x) % modval;
   763 #ifndef KEPLER_SHUFFLE   764         float xj = sh_jpq[j].position.x;
   765         float yj = sh_jpq[j].position.y;
   766         float zj = sh_jpq[j].position.z;
   767         float intRadSJ = sh_jpq[j].charge;
   768         float dHdrPrefixJ = sh_jDHdrPrefix[j];
   769         float intRadJ0 = sh_intRadJ0[j];
   771         if (j < nloopj && threadIdx.x < nloopi)
   773           float dx = atomi.position.x - xj;
   774           float dy = atomi.position.y - yj;
   775           float dz = atomi.position.z - zj;
   776           float r2 = dx*dx + dy*dy + dz*dz;
   781             float r_i = 1.f / sqrt(r2);
   785             CalcDH(r,r2,r_i,a_cut,atomi.charge,intRadSJ,dhij,dij);
   786             CalcDH(r,r2,r_i,a_cut,intRadJ0,intRadIS,dhji,dji);
   788             float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
   789             float tmpx = dx * forceAlpha;
   790             float tmpy = dy * forceAlpha;
   791             float tmpz = dz * forceAlpha;
   795             sh_forceJ[j].x -= tmpx;
   796             sh_forceJ[j].y -= tmpy;
   797             sh_forceJ[j].z -= tmpz;
   800 #ifdef KEPLER_SHUFFLE   809       if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
   810         int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
   811         atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
   812         atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
   813         atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
   818     if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
   819       int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
   820       atomicAdd(&tmp_forces[i_out].x, forceI.x);
   821       atomicAdd(&tmp_forces[i_out].y, forceI.y);
   822       atomicAdd(&tmp_forces[i_out].z, forceI.z);
   832     int patch1_ind = sh_patch_pair.patch1_ind;
   833     int patch2_ind = sh_patch_pair.patch2_ind;
   835     if (threadIdx.x == 0 && threadIdx.y == 0) {
   836       sh_patch_pair.patch_done[0] = 
false;
   837       sh_patch_pair.patch_done[1] = 
false;
   839       unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
   840       int patch1_old = atomicInc(&P3_counters[patch1_ind], patch1_num_pairs-1);
   841       if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = 
true;
   842       if (patch1_ind != patch2_ind) {
   843         unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
   844         int patch2_old = atomicInc(&P3_counters[patch2_ind], patch2_num_pairs-1);
   845         if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = 
true;
   851     if (sh_patch_pair.patch_done[0]) {
   852       const int start = sh_patch_pair.patch1_start;
   853       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*
WARPSIZE) {
   854         forces[start+i] = tmp_forces[start+i];
   858     if (sh_patch_pair.patch_done[1]) {
   859       const int start = sh_patch_pair.patch2_start;
   860       for (
int i=threadIdx.x+threadIdx.y*
WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*
WARPSIZE) {
   861         forces[start+i] = tmp_forces[start+i];
   865     if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
   867 #if __CUDA_ARCH__ < 200   870       __threadfence_system();
 static __global__ void GBIS_P3_Kernel(const patch_pair *patch_pairs, const atom *atoms, const float *intRad0, const float *intRadS, const float *dHdrPrefix, const float a_cut, const float rho_0, const float scaling, float3 lata, float3 latb, float3 latc, float4 *tmp_forces, float4 *forces, unsigned int *P3_counters)
 
static void CalcDH(float r, float r2, float ri, float rc, float r0, float rs, float &dh, int &d)
 
static void CalcH(float r, float r2, float ri, float rc, float r0, float rs, float &h, int &d)
 
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
 
static __global__ void GBIS_P1_Kernel(const patch_pair *patch_pairs, const atom *atoms, const float *intRad0, const float *intRadS, GBReal *tmp_psiSum, GBReal *psiSum, const float a_cut, const float rho_0, float3 lata, float3 latb, float3 latc, unsigned int *P1_counters)
 
static __global__ void GBIS_P2_Kernel(const patch_pair *patch_pairs, const atom *atoms, const float *bornRad, GBReal *tmp_dEdaSum, GBReal *dEdaSum, const float a_cut, const float r_cut, const float scaling, const float kappa, const float smoothDist, const float epsilon_p, const float epsilon_s, float3 lata, float3 latb, float3 latc, const int doEnergy, const int doFullElec, float4 *tmp_forces, float4 *forces, float *tmp_energy, float *energy, unsigned int *P2_counters)