9 __global__ void read_CUDA_ARCH_kernel(int *cuda_arch) {
    10   if (threadIdx.x == 0) {
    12 #if __CUDA_ARCH__ == 100
    14 #elif __CUDA_ARCH__ == 110
    16 #elif __CUDA_ARCH__ == 120
    18 #elif __CUDA_ARCH__ == 130
    20 #elif __CUDA_ARCH__ == 200
    22 #elif __CUDA_ARCH__ == 210
    24 #elif __CUDA_ARCH__ == 300
    26 #elif __CUDA_ARCH__ == 350
    28 #elif __CUDA_ARCH__ == 500
    38 // Reads the value of __CUDA_ARCH__ from device code
    40 int read_CUDA_ARCH() {
    43   allocate_device<int>(&d_cuda_arch, 1);
    45   read_CUDA_ARCH_kernel <<< 1, 1 >>> (d_cuda_arch);
    46   cudaError_t err = cudaGetLastError();
    47   if (err != cudaSuccess) {
    49     sprintf(str, "Error executing CUDA kernel read_CUDA_ARCH_kernel in file %s\nError string: %s\nPossible cause: Device compute capability is less than the compute capability the code was compiled for.\n",
    50       __FILE__,cudaGetErrorString(err));
    53   cudaCheck(cudaDeviceSynchronize());
    55   copy_DtoH_sync<int>(d_cuda_arch, &h_cuda_arch, 1);
    56   deallocate_device<int>(&d_cuda_arch);