#include "CudaGlobalMasterServerKernel.h"

#include <string>

#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)

#define ATOM_BLOCKS 128
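
// Copy the selected per-atom fields (positions, masses, charges, transforms,
// velocities) from the SoA device arrays into each client's buffer. One
// thread handles one copy-list tuple; the x/y/z components of a vector field
// are stored at offsets 0, sz and 2*sz within the destination buffer.
// Positions are mapped back to their unwrapped values with
// lat.reverse_transform before being stored.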
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernel(
    const double *__restrict x, const double *__restrict y,
    const double *__restrict z, const double *__restrict v_x,
    const double *__restrict v_y, const double *__restrict v_z,
    const char3 *__restrict d_transform, const float *__restrict d_mass,
    const float *__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcIndex], y[srcIndex], z[srcIndex]};
      const char3 t = d_transform[srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] = d_charge[srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] = v_y[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] = v_z[srcIndex];
    }
  }
}
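
// Host-side dispatcher for copyAtomsToClientsKernel: encodes the five copy
// flags into a 5-bit options value and launches the matching template
// instantiation on the given stream.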
void copyAtomsToClientsCUDA(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double *d_pos_x, const double *d_pos_y,
    const double *d_pos_z, const double *d_vel_x, const double *d_vel_y,
    const double *d_vel_z, const char3 *d_transform, const float *d_mass,
    const float *d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                      \
  copyAtomsToClientsKernel<bool(POSITION), bool(MASS), bool(CHARGE),           \
                           bool(TRANSFORM), bool(VELOCITY)>                    \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z, d_transform,   \
          d_mass, d_charge, lat, d_copyList, numCopyTuples, d_clientBuffers)
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  // Bits of options, high to low: positions, masses, charges, transforms,
  // velocities; each non-zero combination selects its own instantiation.
  switch (options) {
    case 0:
      break; // Nothing is copied
    case 1:  CALL(0, 0, 0, 0, 1); break;
    case 2:  CALL(0, 0, 0, 1, 0); break;
    case 3:  CALL(0, 0, 0, 1, 1); break;
    case 4:  CALL(0, 0, 1, 0, 0); break;
    case 5:  CALL(0, 0, 1, 0, 1); break;
    case 6:  CALL(0, 0, 1, 1, 0); break;
    case 7:  CALL(0, 0, 1, 1, 1); break;
    case 8:  CALL(0, 1, 0, 0, 0); break;
    case 9:  CALL(0, 1, 0, 0, 1); break;
    case 10: CALL(0, 1, 0, 1, 0); break;
    case 11: CALL(0, 1, 0, 1, 1); break;
    case 12: CALL(0, 1, 1, 0, 0); break;
    case 13: CALL(0, 1, 1, 0, 1); break;
    case 14: CALL(0, 1, 1, 1, 0); break;
    case 15: CALL(0, 1, 1, 1, 1); break;
    case 16: CALL(1, 0, 0, 0, 0); break;
    case 17: CALL(1, 0, 0, 0, 1); break;
    case 18: CALL(1, 0, 0, 1, 0); break;
    case 19: CALL(1, 0, 0, 1, 1); break;
    case 20: CALL(1, 0, 1, 0, 0); break;
    case 21: CALL(1, 0, 1, 0, 1); break;
    case 22: CALL(1, 0, 1, 1, 0); break;
    case 23: CALL(1, 0, 1, 1, 1); break;
    case 24: CALL(1, 1, 0, 0, 0); break;
    case 25: CALL(1, 1, 0, 0, 1); break;
    case 26: CALL(1, 1, 0, 1, 0); break;
    case 27: CALL(1, 1, 0, 1, 1); break;
    case 28: CALL(1, 1, 1, 0, 0); break;
    case 29: CALL(1, 1, 1, 0, 1); break;
    case 30: CALL(1, 1, 1, 1, 0); break;
    case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}
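
// Multi-GPU variant of copyAtomsToClientsKernel: the source arrays are
// per-device peer pointers, so each tuple also carries the index of the
// device (m_src_dev_index) holding the source atom.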
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernelMGPU(
    const double **__restrict x, const double **__restrict y,
    const double **__restrict z, const double **__restrict v_x,
    const double **__restrict v_y, const double **__restrict v_z,
    const char3 **__restrict d_transform, const float **__restrict d_mass,
    const float **__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcDevIdx][srcIndex], y[srcDevIdx][srcIndex],
                           z[srcDevIdx][srcIndex]};
      const char3 t = d_transform[srcDevIdx][srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcDevIdx][srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] =
          d_charge[srcDevIdx][srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcDevIdx][srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcDevIdx][srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcDevIdx][srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] =
          v_y[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] =
          v_z[srcDevIdx][srcIndex];
    }
  }
}
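
// Host-side dispatcher for copyAtomsToClientsKernelMGPU; mirrors
// copyAtomsToClientsCUDA but takes peer-pointer arrays.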
void copyAtomsToClientsCUDAMGPU(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double **d_peer_pos_x,
    const double **d_peer_pos_y, const double **d_peer_pos_z,
    const double **d_peer_vel_x, const double **d_peer_vel_y,
    const double **d_peer_vel_z, const char3 **d_peer_transform,
    const float **d_peer_mass, const float **d_peer_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                      \
  copyAtomsToClientsKernelMGPU<bool(POSITION), bool(MASS), bool(CHARGE),       \
                               bool(TRANSFORM), bool(VELOCITY)>                \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_vel_x,              \
          d_peer_vel_y, d_peer_vel_z, d_peer_transform, d_peer_mass,           \
          d_peer_charge, lat, d_copyList, numCopyTuples, d_clientBuffers)
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  // Same 5-bit encoding as in copyAtomsToClientsCUDA.
  switch (options) {
    case 0:
      break; // Nothing is copied
    case 1:  CALL(0, 0, 0, 0, 1); break;
    case 2:  CALL(0, 0, 0, 1, 0); break;
    case 3:  CALL(0, 0, 0, 1, 1); break;
    case 4:  CALL(0, 0, 1, 0, 0); break;
    case 5:  CALL(0, 0, 1, 0, 1); break;
    case 6:  CALL(0, 0, 1, 1, 0); break;
    case 7:  CALL(0, 0, 1, 1, 1); break;
    case 8:  CALL(0, 1, 0, 0, 0); break;
    case 9:  CALL(0, 1, 0, 0, 1); break;
    case 10: CALL(0, 1, 0, 1, 0); break;
    case 11: CALL(0, 1, 0, 1, 1); break;
    case 12: CALL(0, 1, 1, 0, 0); break;
    case 13: CALL(0, 1, 1, 0, 1); break;
    case 14: CALL(0, 1, 1, 1, 0); break;
    case 15: CALL(0, 1, 1, 1, 1); break;
    case 16: CALL(1, 0, 0, 0, 0); break;
    case 17: CALL(1, 0, 0, 0, 1); break;
    case 18: CALL(1, 0, 0, 1, 0); break;
    case 19: CALL(1, 0, 0, 1, 1); break;
    case 20: CALL(1, 0, 1, 0, 0); break;
    case 21: CALL(1, 0, 1, 0, 1); break;
    case 22: CALL(1, 0, 1, 1, 0); break;
    case 23: CALL(1, 0, 1, 1, 1); break;
    case 24: CALL(1, 1, 0, 0, 0); break;
    case 25: CALL(1, 1, 0, 0, 1); break;
    case 26: CALL(1, 1, 0, 1, 0); break;
    case 27: CALL(1, 1, 0, 1, 1); break;
    case 28: CALL(1, 1, 1, 0, 0); break;
    case 29: CALL(1, 1, 1, 0, 1); break;
    case 30: CALL(1, 1, 1, 1, 0); break;
    case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}
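
// Sum the normal, nonbonded and slow force components of each requested atom
// and store the total force into the client's buffer. When FIXEDATOM is set,
// fixed atoms report a zero total force.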
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernel(
    const double *__restrict d_f_normal_x,
    const double *__restrict d_f_normal_y,
    const double *__restrict d_f_normal_z, const double *__restrict d_f_nbond_x,
    const double *__restrict d_f_nbond_y, const double *__restrict d_f_nbond_z,
    const double *__restrict d_f_slow_x, const double *__restrict d_f_slow_y,
    const double *__restrict d_f_slow_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcIndex]) {
        fx = fy = fz = 0; // Fixed atoms report zero total force
      } else {
        fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] +
             d_f_slow_x[srcIndex];
        fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] +
             d_f_slow_y[srcIndex];
        fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] +
             d_f_slow_z[srcIndex];
      }
    } else {
      fx =
          d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] + d_f_slow_x[srcIndex];
      fy =
          d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] + d_f_slow_y[srcIndex];
      fz =
          d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] + d_f_slow_z[srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}
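
// Host-side wrapper that selects the FIXEDATOM template parameter of
// copyForcesToClientsKernel from the runtime fixedOn flag.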
void copyTotalForcesToClientsCUDA(
    bool fixedOn, const double *d_f_normal_x, const double *d_f_normal_y,
    const double *d_f_normal_z, const double *d_f_nbond_x,
    const double *d_f_nbond_y, const double *d_f_nbond_z,
    const double *d_f_slow_x, const double *d_f_slow_y,
    const double *d_f_slow_z, const int *d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  }
}
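
// Multi-GPU variant of copyForcesToClientsKernel, reading the force
// components through per-device peer pointers.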
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernelMGPU(
    const double **__restrict d_f_normal_x,
    const double **__restrict d_f_normal_y,
    const double **__restrict d_f_normal_z,
    const double **__restrict d_f_nbond_x,
    const double **__restrict d_f_nbond_y,
    const double **__restrict d_f_nbond_z, const double **__restrict d_f_slow_x,
    const double **__restrict d_f_slow_y, const double **__restrict d_f_slow_z,
    const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcDevIdx][srcIndex]) {
        fx = fy = fz = 0; // Fixed atoms report zero total force
      } else {
        fx = d_f_normal_x[srcDevIdx][srcIndex] +
             d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
        fy = d_f_normal_y[srcDevIdx][srcIndex] +
             d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
        fz = d_f_normal_z[srcDevIdx][srcIndex] +
             d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcDevIdx][srcIndex] +
           d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
      fy = d_f_normal_y[srcDevIdx][srcIndex] +
           d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
      fz = d_f_normal_z[srcDevIdx][srcIndex] +
           d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}
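
// Host-side wrapper for copyForcesToClientsKernelMGPU; mirrors
// copyTotalForcesToClientsCUDA but takes peer-pointer arrays.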
void copyTotalForcesToClientsCUDAMGPU(
    bool fixedOn, const double **d_peer_f_normal_x,
    const double **d_peer_f_normal_y, const double **d_peer_f_normal_z,
    const double **d_peer_f_nbond_x, const double **d_peer_f_nbond_y,
    const double **d_peer_f_nbond_z, const double **d_peer_f_slow_x,
    const double **d_peer_f_slow_y, const double **d_peer_f_slow_z,
    const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernelMGPU<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernelMGPU<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  }
}
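
// Scatter the forces computed by the clients back into the global force
// arrays. When UNIQUE_ATOMS is set, no two tuples target the same atom and
// plain additions suffice; otherwise atomicAdd is required because several
// clients may act on the same atom.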
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernel(
    double *__restrict d_f_global_x, double *__restrict d_f_global_y,
    double *__restrict d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstIndex]) {
        fx = fy = fz = 0; // No force is applied to a fixed atom
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstIndex] += fx;
      d_f_global_y[dstIndex] += fy;
      d_f_global_z[dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstIndex]), fz);
    }
  }
}
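
// Host-side dispatcher for addGlobalForcesFromClientsKernel: encodes fixedOn
// and uniqueAtoms into a 2-bit switch over the four template instantiations.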
void addGlobalForcesFromClients(
    const int fixedOn, const int uniqueAtoms,
    double *d_f_global_x, double *d_f_global_y,
    double *d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS) \
  addGlobalForcesFromClientsKernel<FIXEDATOM, UNIQUE_ATOMS> \
    <<<grid, ATOM_BLOCKS, 0, stream>>>( \
    d_f_global_x, d_f_global_y, d_f_global_z, d_atomFixed, d_copyList, \
    numCopyTuples, d_clientBuffers)
  switch (options) {
    case ((0<<1) + 0): CALL(0, 0); break;
    case ((0<<1) + 1): CALL(0, 1); break;
    case ((1<<1) + 0): CALL(1, 0); break;
    case ((1<<1) + 1): CALL(1, 1); break;
    default: {
      const std::string error =
          "Error in addGlobalForcesFromClients. No kernel is called.\n";
      NAMD_bug(error.c_str());
      break;
    }
  }
#undef CALL
}
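
// Multi-GPU variant of addGlobalForcesFromClientsKernel: the destination
// force arrays are peer pointers indexed by the device that owns the atom.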
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernelMGPU(
    double **__restrict d_f_global_x, double **__restrict d_f_global_y,
    double **__restrict d_f_global_z, const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstDev = d_copyList[i].m_src_dev_index;
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstDev][dstIndex]) {
        fx = fy = fz = 0; // No force is applied to a fixed atom
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstDev][dstIndex] += fx;
      d_f_global_y[dstDev][dstIndex] += fy;
      d_f_global_z[dstDev][dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstDev][dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstDev][dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstDev][dstIndex]), fz);
    }
  }
}
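
// Host-side dispatcher for addGlobalForcesFromClientsKernelMGPU; mirrors
// addGlobalForcesFromClients but takes peer-pointer arrays.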
void addGlobalForcesFromClientsMGPU(
    const int fixedOn, const int uniqueAtoms,
    double **d_peer_f_global_x, double **d_peer_f_global_y,
    double **d_peer_f_global_z, const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS) \
  addGlobalForcesFromClientsKernelMGPU<FIXEDATOM, UNIQUE_ATOMS> \
    <<<grid, ATOM_BLOCKS, 0, stream>>>( \
    d_peer_f_global_x, d_peer_f_global_y, d_peer_f_global_z, \
    d_peer_atomFixed, d_copyList, numCopyTuples, d_clientBuffers)
  switch (options) {
    case ((0<<1) + 0): CALL(0, 0); break;
    case ((0<<1) + 1): CALL(0, 1); break;
    case ((1<<1) + 0): CALL(1, 0); break;
    case ((1<<1) + 1): CALL(1, 1); break;
    default: {
      const std::string error =
          "Error in addGlobalForcesFromClientsMGPU. No kernel is called.\n";
      NAMD_bug(error.c_str());
      break;
    }
  }
#undef CALL
}

#endif // (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)