下面的cuda代码为什么指定compute capability = 8.0 后在A100上跑结果不对?
nvcc a.cu # 正确
nvcc a.cu -gencode arch=compute_80,code=sm_80 # 结果不正确
// a.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>using ValueType = float;__constant__ ValueType col_idx[10];__global__ void aa_kernel(ValueType *x) {int idx = threadIdx.x;printf("idx = %d\n", idx);if(idx < 10)x[idx] = col_idx[idx];
}int main() {ValueType *col_h = new ValueType[10];col_h[0] = 1.0;for(int i=1; i<10; i++) col_h[i] = 2*col_h[i-1];ValueType * x_d;cudaError_t err = cudaSuccess;err = cudaMalloc(&x_d, 32*sizeof(ValueType));if(err != cudaSuccess) printf("error\n");cudaMemcpyToSymbol(col_idx, col_h, 10*sizeof(ValueType));cudaStream_t stream;cudaStreamCreate(&stream);aa_kernel<<<1, 32, 0, stream>>>(x_d);cudaDeviceSynchronize();ValueType x[32] = {0};cudaMemcpy(x, x_d, 32*sizeof(ValueType), cudaMemcpyDeviceToHost);for(int i=0; i<12; i++) printf("%f\n", x[i]);return 0;
}