11 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)    12 #include <emmintrin.h>      13 #if defined(__INTEL_COMPILER)    14 #define __align(X) __declspec(align(X) )    16 #define __align(X)  __attribute__((aligned(X) ))    17 #define MISSING_mm_cvtsd_f64    18 #elif defined(__GNUC__)    19 #define __align(X)  __attribute__((aligned(X) ))    21 #define MISSING_mm_cvtsd_f64    24 #define __align(X) __declspec(align(X) )    33 #define DEBUG_MSM_MIGRATE    34 #undef DEBUG_MSM_MIGRATE    36 #define MSM_MAX_BLOCK_SIZE 8    37 #define MSM_MAX_BLOCK_VOLUME \    38   (MSM_MAX_BLOCK_SIZE * MSM_MAX_BLOCK_SIZE * MSM_MAX_BLOCK_SIZE)    40 #define MSM_C1VECTOR_MAX_BLOCK_SIZE (MSM_MAX_BLOCK_SIZE / 2)    41 #define MSM_C1VECTOR_MAX_BLOCK_VOLUME \    42   (MSM_C1VECTOR_MAX_BLOCK_SIZE * \    43    MSM_C1VECTOR_MAX_BLOCK_SIZE * \    44    MSM_C1VECTOR_MAX_BLOCK_SIZE)    49 #define DEBUG_MSM_VERBOSE    50 #undef DEBUG_MSM_VERBOSE    52 #define DEBUG_MSM_GRID    58 #define ASSERT(expr) \    62       snprintf(msg, sizeof(msg), "ASSERT: \"%s\" " \    63           "(%s, %d)\n", #expr, __FILE__, __LINE__); \   120 #if 1 && (defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE))   126         __m128 melem4 = _mm_load_ps(&m.
melem[k]);
   127         __m128 uelem4 = _mm_load_ps(&u.
velem[0]);
   128         __m128 tmp4 = _mm_mul_ps(melem4, uelem4); 
   129         melem4 = _mm_load_ps(&m.
melem[k+4]);
   130         uelem4 = _mm_load_ps(&u.
velem[4]);
   131         tmp4 = _mm_add_ps(tmp4, _mm_mul_ps(melem4, uelem4)); 
   135         sum4 = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(2, 3, 0, 1));
   136         sum4 = _mm_add_ps(sum4, tmp4);
   138         sum4 = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(1, 0, 3, 2));
   139         sum4 = _mm_add_ps(sum4, tmp4);
   143         _mm_store_ss(&sum, sum4); 
   147 #elif 0 && (defined(__AVX__) && ! defined(NAMD_DISABLE_SSE))   153         __m256 melem8 = _mm256_load_ps(&m.
melem[k]);
   154         __m256 uelem8 = _mm256_load_ps(&u.
velem[0]);
   155         __m256 tmp8 = _mm256_mul_ps(melem8, uelem8); 
   160         sum8 = _mm256_hadd_ps(sum8, sum8);
   161         sum8 = _mm256_hadd_ps(sum8, sum8);
   163         tmp8 = _mm256_permute2f128_ps(tmp8, tmp8, 1);
   164         sum8 = _mm256_hadd_ps(tmp8, sum8);
   168         _mm_store_ss(&sum, sum8); 
   173 #if defined(__INTEL_COMPILER)   174 #pragma vector always   191 #define C1INDEX(drj,dri)  ((drj)*C1_VECTOR_SIZE + (dri))   215         if (
this != &a) 
copy(a);  
   228         if (i < 0 || i >= 
alen) {
   230           snprintf(msg, 
sizeof(msg), 
"Array index:  alen=%d, i=%d\n", 
alen, i);
   243         if (i < 0 || i >= 
alen) {
   245           snprintf(msg, 
sizeof(msg), 
"Array index:  alen=%d, i=%d\n", 
alen, i);
   268       void print(
const char *s=0)
 const {
   269         if (s) printf(
"PRINTING DATA FOR ARRAY \"%s\":\n", s);
   270         printf(
"abuffer=%p\n  alen=%d  amax=%d\n",
   282     if (m == amax) 
return;
   284       T *newbuffer = 
new T[m];
   287         snprintf(msg, 
sizeof(msg),
   288             "Can't allocate %lu KB for msm::Array\n",
   289             (
unsigned long)(m * 
sizeof(T) / 1024));
   292       if (alen > m) alen = m;  
   293       for (
int i = 0;  i < alen;  i++) {
   294         newbuffer[i] = abuffer[i];
   312     for (
int i = 0;  i < alen;  i++) {
   324     tmpn = s.astate;  s.astate = t.astate;  t.astate = tmpn;
   340         if (nelems > 0)  
init(nelems);
   354         int last = a.len() - 1;
   355         if (last < 0) 
return;
   357         if (last > 0) a[0] = a[last];
   366           int parent = (n-1) / 2;
   367           if (a[parent] <= a[n]) 
break;
   379         int right = left + 1;
   381           if (right < len && a[right] <= a[left]) {
   382             if (a[n] <= a[right]) 
break;
   389             if (a[n] <= a[left]) 
break;
   413     Ivec(
int ni, 
int nj, 
int nk) : 
i(ni), 
j(nj), 
k(nk) { }
   416     void pup(PUP::er& p) {
   426       void set(
int pia, 
int pni, 
int pja, 
int pnj, 
int pka, 
int pnk) {
   427         ASSERT(pni >= 0 && pnj >= 0 && pnk >= 0);
   431       void setbounds(
int pia, 
int pib, 
int pja, 
int pjb, 
int pka, 
int pkb) {
   432         set(pia, pib-pia+1, pja, pjb-pja+1, pka, pkb-pka+1);
   448         return ( 
ia() >= n.
ia() && 
ib() <= n.
ib() &&
   453       void pup(PUP::er& p) {
   466   template <
class T, 
int N>
   476       void set(
int pia, 
int pni, 
int pja, 
int pnj, 
int pka, 
int pnk) {
   480       void setbounds(
int pia, 
int pib, 
int pja, 
int pjb, 
int pka, 
int pkb) {
   494       const T& 
elem(
int i, 
int j, 
int k)
 const {
   495         if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
   497           snprintf(msg, 
sizeof(msg), 
"Grid indexing:\n"   498               "ia=%d, ib=%d, i=%d\n"   499               "ja=%d, jb=%d, j=%d\n"   500               "ka=%d, kb=%d, k=%d\n",
   517         if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
   519           snprintf(msg, 
sizeof(msg), 
"Grid indexing:\n"   520               "ia=%d, ib=%d, i=%d\n"   521               "ja=%d, jb=%d, j=%d\n"   522               "ka=%d, kb=%d, k=%d\n",
   529         return ((k-
ka())*
nj() + (j-
ja()))*
ni() + (i-
ia());
   531       const T *
buffer()
 const { 
return gdata; }
   537         for (
int n = 0;  n < len;  n++) { gdata[n] = t; }
   555         const T *gbuf = g.gdata.buffer();
   556         T *buf = gdata.buffer();
   557         for (
int k = 0;  k < gnk;  k++) {
   558           int jkoff = k * nij + koff;
   559           for (
int j = 0;  j < gnj;  j++) {
   560             int ijkoff = j * 
ni + jkoff;
   561             for (
int i = 0;  i < gni;  i++, index++) {
   562               buf[i + ijkoff] += gbuf[index];
   581         T *gbuf = g.gdata.buffer();
   582         const T *buf = gdata.buffer();
   583         for (
int k = 0;  k < gnk;  k++) {
   584           int jkoff = k * nij + koff;
   585           for (
int j = 0;  j < gnj;  j++) {
   586             int ijkoff = j * 
ni + jkoff;
   587             for (
int i = 0;  i < gni;  i++, index++) {
   588               gbuf[index] = buf[i + ijkoff];
   600   class Grid : 
public IndexRange {
   608       void set(
int pia, 
int pni, 
int pja, 
int pnj, 
int pka, 
int pnk) {
   612       void setbounds(
int pia, 
int pib, 
int pja, 
int pjb, 
int pka, 
int pkb) {
   629       const T& 
elem(
int i, 
int j, 
int k)
 const {
   630         if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
   632           snprintf(msg, 
sizeof(msg), 
"Grid indexing:\n"   633               "ia=%d, ib=%d, i=%d\n"   634               "ja=%d, jb=%d, j=%d\n"   635               "ka=%d, kb=%d, k=%d\n",
   652         if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
   654           snprintf(msg, 
sizeof(msg), 
"Grid indexing:\n"   655               "ia=%d, ib=%d, i=%d\n"   656               "ja=%d, jb=%d, j=%d\n"   657               "ka=%d, kb=%d, k=%d\n",
   664         return ((k-
ka())*
nj() + (j-
ja()))*
ni() + (i-
ia());
   671         T *buf = gdata.buffer();
   673         for (
int n = 0;  n < len;  n++) { buf[n] = t; }
   689           if (
ia > g.nlower.i) 
ia = g.nlower.i;
   691           if (
ja > g.nlower.j) 
ja = g.nlower.j;
   693           if (
ka > g.nlower.k) 
ka = g.nlower.k;
   695           int gib1 = g.nlower.i + g.nextent.i;
   696           if (ib1 < gib1) ib1 = gib1;
   698           int gjb1 = g.nlower.j + g.nextent.j;
   699           if (jb1 < gjb1) jb1 = gjb1;
   701           int gkb1 = g.nlower.k + g.nextent.k;
   702           if (kb1 < gkb1) kb1 = gkb1;
   709           int koff = (tmp.nlower.k - 
nlower.
k) * nij
   711           const T *gbuf = tmp.gdata.buffer();
   712           T *buf = gdata.buffer();
   713           for (
int k = 0;  k < tmp.nextent.k;  k++) {
   714             int jkoff = k * nij + koff;
   715             for (
int j = 0;  j < tmp.nextent.j;  j++) {
   716               int ijkoff = j * 
ni + jkoff;
   717               for (
int i = 0;  i < tmp.nextent.i;  i++, index++) {
   718                 buf[i + ijkoff] = gbuf[index];
   724         int gni = g.nextent.i;
   725         int gnj = g.nextent.j;
   726         int gnk = g.nextent.k;
   730         int koff = (g.nlower.k - 
nlower.
k) * nij
   732         const T *gbuf = g.gdata.buffer();
   733         T *buf = gdata.buffer();
   734         for (
int k = 0;  k < gnk;  k++) {
   735           int jkoff = k * nij + koff;
   736           for (
int j = 0;  j < gnj;  j++) {
   737             int ijkoff = j * 
ni + jkoff;
   738             for (
int i = 0;  i < gni;  i++, index++) {
   739               buf[i + ijkoff] += gbuf[index];
   750         int gni = g.nextent.i;
   751         int gnj = g.nextent.j;
   752         int gnk = g.nextent.k;
   756         int koff = (g.nlower.k - 
nlower.
k) * nij
   758         T *gbuf = g.gdata.buffer();
   759         const T *buf = gdata.buffer();
   760         for (
int k = 0;  k < gnk;  k++) {
   761           int jkoff = k * nij + koff;
   762           for (
int j = 0;  j < gnj;  j++) {
   763             int ijkoff = j * 
ni + jkoff;
   764             for (
int i = 0;  i < gni;  i++, index++) {
   765               gbuf[index] = buf[i + ijkoff];
   784         const T *gbuf = g.
buffer();
   785         T *buf = gdata.buffer();
   786         for (
int k = 0;  k < gnk;  k++) {
   787           int jkoff = k * nij + koff;
   788           for (
int j = 0;  j < gnj;  j++) {
   789             int ijkoff = j * 
ni + jkoff;
   790             for (
int i = 0;  i < gni;  i++, index++) {
   791               buf[i + ijkoff] += gbuf[index];
   812         const T *buf = gdata.buffer();
   813         for (
int k = 0;  k < gnk;  k++) {
   814           int jkoff = k * nij + koff;
   815           for (
int j = 0;  j < gnj;  j++) {
   816             int ijkoff = j * 
ni + jkoff;
   817             for (
int i = 0;  i < gni;  i++, index++) {
   818               gbuf[index] = buf[i + ijkoff];
   842     void pup(PUP::er& p) {
   856     void set(
int i, 
int j, 
int k) {
   860       if (i > 1 || j > 1 || k > 1) 
active = 1;
   878     void pup(PUP::er& p) {
   972         if (pn.i < a) pn.i = a;
   973         if (pn.i > b) pn.i = b;
   978         if (pn.j < a) pn.j = a;
   979         if (pn.j > b) pn.j = b;
   984         if (pn.k < a) pn.k = a;
   985         if (pn.k > b) pn.k = b;
   997       bn.n.i = (d >= 0 ? d / 
bsx[level] : -((-d+
bsx[level]-1) / 
bsx[level]));
   999       bn.n.j = (d >= 0 ? d / 
bsy[level] : -((-d+
bsy[level]-1) / 
bsy[level]));
  1001       bn.n.k = (d >= 0 ? d / 
bsz[level] : -((-d+
bsz[level]-1) / 
bsz[level]));
  1017       bn.n.i = (d >= 0 ? d / bsi : -((-d+bsi-1) / bsi));
  1019       bn.n.j = (d >= 0 ? d / bsj : -((-d+bsj-1) / bsj));
  1021       bn.n.k = (d >= 0 ? d / bsk : -((-d+bsk-1) / bsk));
  1048       nr.
set(ia, bsi, ja, bsj, ka, bsk);
  1056       int nia = nrange.
ia();
  1057       int nib = nrange.
ib();
  1058       int nja = nrange.
ja();
  1059       int njb = nrange.
jb();
  1060       int nka = nrange.
ka();
  1061       int nkb = nrange.
kb();
  1063       if (ia < nia) ia = nia;
  1065       if (ib > nib) ib = nib;
  1067       if (ja < nja) ja = nja;
  1069       if (jb > njb) jb = njb;
  1071       if (ka < nka) ka = nka;
  1073       if (kb > nkb) kb = nkb;
  1083       int nia = nrange.
ia();
  1084       int nib = nrange.
ib();
  1085       int nja = nrange.
ja();
  1086       int njb = nrange.
jb();
  1087       int nka = nrange.
ka();
  1088       int nkb = nrange.
kb();
  1090       if (ia < nia) ia = nia;
  1092       if (ib > nib) ib = nib;
  1094       if (ja < nja) ja = nja;
  1096       if (jb > njb) jb = njb;
  1098       if (ka < nka) ka = nka;
  1100       if (kb > nkb) kb = nkb;
  1114       int di=0, dj=0, dk=0;
  1116         while (nb.
n.
i < 0) {
  1118           di += ni * 
bsx[level];
  1120         while (nb.
n.
i >= ni) {
  1122           di -= ni * 
bsx[level];
  1126         while (nb.
n.
j < 0) {
  1128           dj += nj * 
bsy[level];
  1130         while (nb.
n.
j >= nj) {
  1132           dj -= nj * 
bsy[level];
  1136         while (nb.
n.
k < 0) {
  1138           dk += nk * 
bsz[level];
  1140         while (nb.
n.
k >= nk) {
  1142           dk -= nk * 
bsz[level];
  1151       nr.
setbounds(ia + di, ib + di, ja + dj, jb + dj, ka + dk, kb + dk);
  1169       int bsi = foldi * 
bsx[level];
  1170       int bsj = foldj * 
bsy[level];
  1171       int bsk = foldk * 
bsz[level];
  1172       int di=0, dj=0, dk=0;
  1174         while (nb.
n.
i < 0) {
  1178         while (nb.
n.
i >= ni) {
  1184         while (nb.
n.
j < 0) {
  1188         while (nb.
n.
j >= nj) {
  1194         while (nb.
n.
k < 0) {
  1198         while (nb.
n.
k >= nk) {
  1209       nr.
setbounds(ia + di, ib + di, ja + dj, jb + dj, ka + dk, kb + dk);
  1215       int level = bn.
level;
  1220         while (bn.
n.
i < 0) {
  1223         while (bn.
n.
i >= ni) {
  1228         while (bn.
n.
j < 0) {
  1231         while (bn.
n.
j >= nj) {
  1236         while (bn.
n.
k < 0) {
  1239         while (bn.
n.
k >= nk) {
 
GridFixed< T, N > & operator+=(const GridFixed< T, N > &g)
 
Array< Grid< C1Matrix > > gpro_c1hermite
 
Array< PatchDiagram > patchList
 
Array< int > recvGridCutoff
 
const T & operator()(int i, int j, int k) const
 
BlockIndex blockOfGridIndexFold(const Ivec &n, int level) const
 
const T & elem(int i, int j, int k) const
 
Ivec clipIndexToLevel(const Ivec &n, int level) const
 
Grid< T > & operator+=(const GridFixed< T, N > &g)
 
IndexRange clipBlockToIndexRangeFold(const BlockIndex &nb, const IndexRange &nrange) const
 
friend C1Vector operator+(const C1Vector &u, const C1Vector &v)
 
Array & operator=(const Array &a)
 
T & operator()(int i, int j, int k)
 
IndexRange indexRangeOfBlock(const BlockIndex &nb) const
 
int operator==(const Ivec &n)
 
const T & operator()(int i, int j, int k) const
 
int flatindex(int i, int j, int k) const
 
void wrapBlockSend(BlockSend &bs) const
 
Array< BlockSend > sendUp
 
void set(int pia, int pni, int pja, int pnj, int pka, int pnk)
 
void extract(Grid< T > &g)
 
void setbounds(int pia, int pib, int pja, int pjb, int pka, int pkb)
 
void swap(Array< T > &s, Array< T > &t)
 
void wrapBlockIndex(BlockIndex &bn) const
 
void setbounds(int pia, int pib, int pja, int pjb, int pka, int pkb)
 
Array< Force > ForceArray
 
T & operator()(int i, int j, int k)
 
void setbounds(int pia, int pib, int pja, int pjb, int pka, int pkb)
 
Array< Grid< BlockDiagram > > blockLevel
 
T & operator()(const Ivec &n)
 
friend C1Vector operator*(const C1Matrix &m, const C1Vector &u)
 
void init(const IndexRange &n)
 
Array< BlockSend > sendAcross
 
Array< Grid< C1Matrix > > gres_c1hermite
 
Array< AtomCoord > AtomCoordArray
 
IndexRange indexRangeOfBlockFold(const BlockIndex &nb) const
 
PriorityQueue(int nelems=0)
 
const T * buffer(int &n) const
 
Array< PatchData * > PatchPtrArray
 
void updateLower(const Ivec &n)
 
IndexRange nrangeRestricted
 
void extract(GridFixed< T, N > &g)
 
BlockIndex(int ll, const Ivec &nn)
 
void extract(GridFixed< T, N > &g)
 
Array< Grid< C1Matrix > > gvc_c1hermite
 
Array< IndexRange > gridrange
 
Array< PatchSend > sendPatch
 
void NAMD_die(const char *err_msg)
 
BlockIndex blockOfGridIndex(const Ivec &n, int level) const
 
FoldFactor(int i, int j, int k)
 
const T & operator[](int i) const
 
const T & elem(int i, int j, int k) const
 
const Array< T > & data() const
 
void updateLower(const Ivec &n)
 
Float melem[C1_MATRIX_SIZE]
 
void copy(const Array &a)
 
IndexRange clipBlockToIndexRange(const BlockIndex &nb, const IndexRange &nrange) const
 
T & operator()(const Ivec &n)
 
Array< FoldFactor > foldfactor
 
Grid< T > & operator+=(const Grid< T > &g)
 
Array< int > indexGridCutoff
 
Array< Grid< Float > > gc
 
int flatindex(int i, int j, int k) const
 
const T & operator()(const Ivec &n) const
 
void wrapBlockSendFold(BlockSend &bs) const
 
const T & operator()(const Ivec &n) const
 
Array< Grid< Float > > gvc
 
friend Float operator*(const C1Vector &u, const C1Vector &v)
 
void init(const IndexRange &n)
 
Array< Grid< C1Matrix > > gc_c1hermite
 
int operator<=(const IndexRange &n)
 
C1Vector & operator+=(const C1Vector &v)
 
const T & elem(int i) const
 
IndexRange nrangeProlongated
 
Ivec(int ni, int nj, int nk)
 
Float velem[C1_VECTOR_SIZE]
 
Array< BlockSend > sendDown
 
T & elem(int i, int j, int k)
 
T & elem(int i, int j, int k)