7 AVXJTiles::AVXJTiles() : _numTiles(0), _numTilesAlloc(0) {
    10 AVXJTiles::~AVXJTiles() {
    18 void AVXJTiles::_realloc() {
    24   _numTilesAlloc = 1.2 * _numTiles;
    25   int e = posix_memalign((
void **)&excl, 64, 
    26                          sizeof(
unsigned int)*_numTilesAlloc << 4);
    27   e = e | posix_memalign((
void **)&atomStart, 64, 
sizeof(
int)*_numTilesAlloc);
    28   e = e | posix_memalign((
void **)&status, 64, 
sizeof(
int)*_numTilesAlloc);
    29   if (e) 
NAMD_die(
"Could not allocate memory for tiles.");
    34 AVXTileLists::AVXTileLists() : _numLists(0), _numListsAlloc(0), 
    35                                _numModifiedAlloc(0), _numExcludedAlloc(0),
    36                                _maxPairLists(0), _maxPairs(0)
    38   #ifndef MEM_OPT_VERSION    39   _exclFlyListBuffer = 0;
    45 AVXTileLists::~AVXTileLists() {
    50   if (_numModifiedAlloc) {
    54   if (_numExcludedAlloc) {
    58   if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
    66   #ifndef MEM_OPT_VERSION    67   delete []_exclFlyListBuffer;
    73 void AVXTileLists::setSimParams(
const float scale, 
const float scale14,
    74                                 const float c1, 
const float c3,
    75                                 const float switchOn2, 
float *fastTable,
    76                                 float *fastEnergyTable, 
float *slowTable,
    77                                 float *slowEnergyTable, 
float *eps4sigma,
    78                                 float *eps4sigma14, 
float *ljTable, 
    79                                 const float ljTableWidth, 
float *modifiedTable,
    80                                 float *modifiedEnergyTable,
    82                                 float *excludedEnergyTable,
    83                                 const int interpolationMode) {
    84   _interpolationMode = interpolationMode;
    86   if (_interpolationMode > 1)
    87     _paramScale = 2.0 * scale;
    90   _paramScale14 = scale14;
    93   _paramSwitchOn2 = switchOn2;
    94   if (_interpolationMode == 3) {
    95     _paramFastTable = fastTable;
    96     _paramFastEnergyTable = fastEnergyTable;
    98   _paramSlowTable = slowTable;
    99   _paramSlowEnergyTable = slowEnergyTable;
   100   _paramModifiedTable = modifiedTable;
   101   _paramModifiedEnergyTable = modifiedEnergyTable;
   102   _paramExcludedTable = excludedTable;
   103   _paramExcludedEnergyTable = excludedEnergyTable;
   104   if (_interpolationMode > 1) {
   105     _paramLjTable = ljTable;
   106     _paramLjWidth = ljTableWidth;
   108     _paramEps4Sigma = eps4sigma;
   109     _paramEps4Sigma14 = eps4sigma14;
   124 template <
bool count, 
bool partitionMode>
   125 int AVXTileLists::_buildBB() {
   130     if (partitionMode || jTiles.maxTiles() == 0)
   131       _buildBB<true, partitionMode>();
   134   const int maxJtiles = jTiles.maxTiles();
   135   const int numTileLists = tiles_p0->numTiles();
   136   const int numTiles2 = tiles_p1->numTiles();
   137   const bool zeroShift = ! (_shx*_shx + _shy*_shy + _shz*_shz > 0);
   138   const bool self = zeroShift && (tiles_p0 == tiles_p1);
   141   if (!count) out_i = 0;
   143   for (
int itileList = 0; itileList < numTileLists; itileList++) {
   144     if (partitionMode && count==
false && listDepth[itileList]==0)
   147     int itileListLen = 0;
   149     if (
self) jinit = itileList;
   155     const __m512 bbIx = _mm512_set1_ps(tiles_p0->bbox_x[itileList] + _shx);
   156     const __m512 bbIy = _mm512_set1_ps(tiles_p0->bbox_y[itileList] + _shy);
   157     const __m512 bbIz = _mm512_set1_ps(tiles_p0->bbox_z[itileList] + _shz);
   158     const __m512 bbIwx = _mm512_set1_ps(tiles_p0->bbox_wx[itileList]);
   159     const __m512 bbIwy = _mm512_set1_ps(tiles_p0->bbox_wy[itileList]);
   160     const __m512 bbIwz = _mm512_set1_ps(tiles_p0->bbox_wz[itileList]);
   163     __m512i jAtomStart = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
   164                                            128, 144, 160, 176, 192, 208, 224,
   166     jAtomStart = _mm512_add_epi32(jAtomStart, _mm512_set1_epi32(jinit<<4));
   167     int jtileStart = numJtiles;
   169     __mmask16 loopMask = 0xFFFF;
   170     for (
int j=jinit; j < numTiles2; j += 16) {
   172       if (numTiles2 - j < 16) 
   173         loopMask >>= (16 - (numTiles2 - j));
   176       const __m512 bbJx = _mm512_mask_loadu_ps(_mm512_undefined_ps(), 
   177         loopMask, tiles_p1->bbox_x + j);
   178       const __m512 bbJy = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
   179         loopMask, tiles_p1->bbox_y + j);
   180       const __m512 bbJz = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
   181         loopMask, tiles_p1->bbox_z + j);
   182       const __m512 bbJwx = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
   183         loopMask, tiles_p1->bbox_wx + j);
   184       const __m512 bbJwy = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
   185         loopMask, tiles_p1->bbox_wy + j);
   186       const __m512 bbJwz = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
   187         loopMask, tiles_p1->bbox_wz + j);
   190       const __m512 dx_one = _mm512_abs_ps(_mm512_sub_ps(bbIx, bbJx));
   191       const __m512 dx_two = _mm512_add_ps(bbIwx, bbJwx);
   192       const __mmask16 lxmask = _mm512_cmplt_ps_mask(dx_two, dx_one);
   193       const __m512 dx = _mm512_mask_sub_ps(_mm512_setzero_ps(), lxmask,
   196       const __m512 dy_one = _mm512_abs_ps(_mm512_sub_ps(bbIy, bbJy));
   197       const __m512 dy_two = _mm512_add_ps(bbIwy, bbJwy);
   198       const __mmask16 lymask = _mm512_cmplt_ps_mask(dy_two, dy_one);
   199       const __m512 dy = _mm512_mask_sub_ps(_mm512_setzero_ps(), lymask,
   202       const __m512 dz_one = _mm512_abs_ps(_mm512_sub_ps(bbIz, bbJz));
   203       const __m512 dz_two = _mm512_add_ps(bbIwz, bbJwz);
   204       const __mmask16 lzmask = _mm512_cmplt_ps_mask(dz_two, dz_one);
   205       const __m512 dz = _mm512_mask_sub_ps(_mm512_setzero_ps(), lzmask,
   208       const __m512 r2bb = _mm512_fmadd_ps(dz,dz,
   209         _mm512_fmadd_ps(dx, dx, _mm512_mul_ps(dy, dy)));
   211       const __mmask16 keep = _mm512_cmplt_ps_mask(r2bb,
   212         _mm512_set1_ps(_plcutoff2)) & loopMask;
   213       const int nKeep = _popcnt32(keep);
   215       if (count == 
false && nKeep)
   216         if (partitionMode || jtileStart + itileListLen + nKeep <= maxJtiles)
   217           _mm512_mask_compressstoreu_epi32(jTiles.atomStart+jtileStart+
   218                                            itileListLen, keep, jAtomStart);
   220       itileListLen += nKeep;
   222       jAtomStart = _mm512_add_epi32(jAtomStart, _mm512_set1_epi32(256));
   225     if (count == 
false) {
   226       if (itileListLen > 0) {
   227         listDepth[out_i] = (
unsigned int)itileListLen;
   228         lists[out_i].atomStart_i = itileList<<4;
   229         lists[out_i].jtileStart = jtileStart;
   232     } 
else if (partitionMode)
   233       listDepth[itileList] = (
unsigned int)itileListLen;
   235     numJtiles += itileListLen;
   240   if (partitionMode && count) {
   241     const int jtilesPerPart = 
static_cast<double>(numJtiles)/_numParts + 0.5;
   242     int jtileCount = 0, curPart = 0, nextPart = jtilesPerPart;
   245     for (
int j = 0; j < numTileLists; j++) {
   246       jtileCount += listDepth[j];
   247       if (curPart < _minPart || curPart >= _maxPart)
   250         numJtiles += listDepth[j];
   251       if (jtileCount >= nextPart) {
   253         nextPart += jtilesPerPart;
   254         if (curPart == _numParts) curPart--;
   260     jTiles.realloc(numJtiles);
   266   if (!partitionMode && jTiles.realloc(numJtiles))
   267     return _buildBB<false, false>();
   272     memset(jTiles.status, 0, numJtiles * 
sizeof(
int));
   273     lists[_numLists].jtileStart = lists[_numLists-1].jtileStart + 
   274       listDepth[_numLists - 1];
   280 void AVXTileLists::delEmptyLists() {
   283   const int numLists = _numLists;
   285   __mmask16 tripMask = 0xFFFF;
   288   for (
int ii = 0; ii < numLists; ii += 16) {
   289     if (numLists - ii < 16) {
   290       vecTrip = numLists - ii;
   291       tripMask >>= (16 - vecTrip);
   295     __m512i depth = _mm512_loadu_epi32(listDepth + ii);
   297     __mmask16 depth_mask = _mm512_cmpneq_epi32_mask(depth,
   298       _mm512_setzero_epi32()) & tripMask;
   300     depth = _mm512_and_epi32(depth, _mm512_set1_epi32((
int)65535));
   302     _mm512_mask_compressstoreu_epi32(listDepth + out_i, depth_mask, depth);
   304     for (
int vi = 0; vi < vecTrip; vi++) {
   305       if (*((
unsigned int*)&(depth) + vi)) {
   306         const int i = ii + vi;
   307         lists[out_i].atomStart_i = lists[i].atomStart_i;
   309         int start = totalTiles;
   310         int startOld = lists[i].jtileStart;
   311         int endOld = lists[i+1].jtileStart;
   312         lists[out_i].jtileStart = start;
   315         for (
int jtileOld=startOld; jtileOld < endOld; jtileOld++) {
   316           int jtile = jTiles.status[jtileOld];
   319             jTiles.atomStart[jtile0] = jTiles.atomStart[jtileOld];
   322             excl = _mm512_loadu_epi32(jTiles.excl + (jtileOld<<4));
   323             _mm512_store_epi32((
void *)(jTiles.excl + (jtile0<<4)), excl);
   328         totalTiles += (*((
unsigned int*)&(depth) + vi));
   332   jTiles.realloc(totalTiles);
   336     lists[_numLists].jtileStart = lists[_numLists-1].jtileStart + 
   337       listDepth[_numLists - 1];
   340 void AVXTileLists::build() {
   341   tiles_p0->buildBoundingBoxes(_lastBuild);
   342   if (tiles_p0 != tiles_p1)
   343     tiles_p1->buildBoundingBoxes(_lastBuild);
   345   if (_maxPart - _minPart < _numParts)
   346     _buildBB<false,true>();
   348     _buildBB<false,false>();
   351   if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
   352     for (
int i = 0; i < _maxPairLists; i++) _numPairs[i] = 0;
   353     int end = tiles_p0->numAtoms() & 15;
   354     if (end <= NAMD_AVXTILES_PAIR_THRESHOLD) {
   355       const int start = tiles_p0->numAtoms() - end;
   357       for (
int i = start; i < end; i++) _pair_i[i-start] = i;
   362 void AVXTileLists::_realloc() {
   363   if (_numListsAlloc) {
   367   _numListsAlloc = 1.2 * _numLists;
   368   int e = posix_memalign((
void **)&lists, 64, 
sizeof(List)*(_numListsAlloc+1));
   369   e = e | posix_memalign((
void **)&listDepth, 64,
   370                  sizeof(
unsigned int)*(_numListsAlloc+1));
   371   if (e) 
NAMD_die(
"Could not allocate memory for tiles.");
   373   if (_numModifiedAlloc == 0) {
   374     int newNum = tiles_p0->numAtoms();
   375     if (newNum < tiles_p1->numAtoms()) newNum = tiles_p1->numAtoms();
   376     reallocModified(2 * newNum);
   378     reallocExcluded(2.2 * newNum);
   380   if (NAMD_AVXTILES_PAIR_THRESHOLD > 0)
   381     _reallocPairLists(tiles_p0->numAtoms() & 15, NAMD_AVXTILES_IPAIRCOUNT);
   386 void AVXTileLists::_reallocModified() {
   387   int numModifiedAlloc = 1.2 * _numModified;
   390   if (posix_memalign((
void **)&newp, 64, 
sizeof(
int) * numModifiedAlloc))
   391     NAMD_die(
"Could not allocate memory for tiles.");
   393   if (_numModifiedAlloc) { 
   394     memcpy(newp, _modified_i, 
sizeof(
int) * _numModifiedAlloc);
   399   if (posix_memalign((
void **)&newp, 64, 
sizeof(
int) * numModifiedAlloc))
   400     NAMD_die(
"Could not allocate memory for tiles.");
   402   if (_numModifiedAlloc) { 
   403     memcpy(newp, _modified_j, 
sizeof(
int) * _numModifiedAlloc);
   408   _numModifiedAlloc = numModifiedAlloc;
   413 void AVXTileLists::_reallocExcluded() {
   414   int numExcludedAlloc = 1.2 * _numExcluded;
   417   if (posix_memalign((
void **)&newp, 64, 
sizeof(
int) * numExcludedAlloc))
   418     NAMD_die(
"Could not allocate memory for tiles.");
   420   if (_numExcludedAlloc) { 
   421     memcpy(newp, _excluded_i, 
sizeof(
int) * _numExcludedAlloc);
   426   if (posix_memalign((
void **)&newp, 64, 
sizeof(
int) * numExcludedAlloc))
   427     NAMD_die(
"Could not allocate memory for tiles.");
   429   if (_numExcludedAlloc) { 
   430     memcpy(newp, _excluded_j, 
sizeof(
int) * _numExcludedAlloc);
   435   _numExcludedAlloc = numExcludedAlloc;
   440 void AVXTileLists::_reallocPairLists(
const int numPairLists, 
   441                                      const int maxPairs) {
   442   if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
   443     if (!_maxPairLists) {
   444       _maxPairLists = NAMD_AVXTILES_PAIR_THRESHOLD;
   445       _maxPairs = maxPairs;
   446       int e = posix_memalign((
void **)&_pair_i, 64, 
sizeof(
int)*_maxPairLists);
   447       e = e | posix_memalign((
void **)&_numPairs, 64, 
   448                              sizeof(
int)*_maxPairLists);
   449       e = e | posix_memalign((
void **)&_pairStart, 64,
   450                              sizeof(
int)*_maxPairLists);
   451       e = e | posix_memalign((
void **)&_pairLists, 64,
   452                      sizeof(
int)*_maxPairLists*_maxPairs);
   453       if (e) 
NAMD_die(
"Could not allocate memory for tiles.");
   456       for (
int i = 0; i < _maxPairLists; i++) {
   457         _pairStart[i] = listStart;
   458         listStart += _maxPairs;
   460     } 
else if (maxPairs > _maxPairs) {
   462       if (posix_memalign((
void **)&hold, 64, 
   463                          sizeof(
int)*_maxPairLists*maxPairs))
   464         NAMD_die(
"Could not allocate memory for tiles.");
   467       for (
int i = 0; i < _maxPairLists; i++) {
   468         if (_numPairs[i]) memcpy(hold + listStart, _pairLists + _pairStart[i],
   469                                  sizeof(
int) * _numPairs[i]);
   470         _pairStart[i] = listStart;
   471         listStart += maxPairs;
   475       _maxPairs = maxPairs;
   480 #endif // NAMD_AVXTILES void NAMD_die(const char *err_msg)