使用 cublas 批量求逆

如何解决使用 cublas 批量求逆

我是 cuda 的新手，我正在尝试找到三个矩阵的逆矩阵。矩阵是 [4 8;3 9]。我想编写一个 cuda 内核来计算 GPU 上所有三个矩阵的逆。这是我写的代码。但我收到此错误：ptxas fatal : Unresolved extern function 'cublasCreate_v2'

#include <stdio.h>

#include <cublas_v2.h>

__global__ void copy(float** a,float* b,float** c,float* d,int count){
int idx = threadIdx.x + blockIdx.x * blockDim.x;

if(idx < count){
    a[idx] = b + idx*2*2 ;
    c[idx] = d + idx*2*2;
}
__syncthreads();


}

__global__ void inv(float** a,float** c){
int Nmatrices = 3; int N = 2;
cublasHandle_t handle;
cublasCreate(&handle);
int *h_PivotArray = (int *)malloc(N*Nmatrices*sizeof(int));
int *h_InfoArray  = (int *)malloc(  Nmatrices*sizeof(int));
cublasSgetrfBatched(handle,N,a,h_PivotArray,h_InfoArray,Nmatrices);
cublasSgetriBatched(handle,c,Nmatrices);

cublasDestroy(handle);
}
int main() {

 int N = 2;

int Nmatrices = 3;//number of batches



// --- Matrices to be inverted
float *h_A = new float[N*N*Nmatrices];
float *r_A = new float[N*N*Nmatrices];//result

h_A[0] = 4.f;
h_A[1] = 3.f;
h_A[2] = 8.f;
h_A[3] = 9.f;
h_A[4] = 4.f;
h_A[5] = 3.f;
h_A[6] = 8.f;
h_A[7] = 9.f;
h_A[8] = 4.f;
h_A[9] = 3.f;
h_A[10] = 8.f;
h_A[11] = 9.f;


 int count = Nmatrices;
// --- Allocate device matrices
float *d_A; cudamalloc((void**)&d_A,N*N*Nmatrices*sizeof(float));
float *c_A; cudamalloc((void**)&c_A,N*N*Nmatrices*sizeof(float));

// --- Move the matrix to be inverted from host to device

// --- Creating the array of pointers needed as input to the batched getrf


float **d_inout_pointers;
cudamalloc((void**)&d_inout_pointers,Nmatrices*sizeof(float *));


float **rd_inout_pointers;
cudamalloc((void**)&rd_inout_pointers,Nmatrices*sizeof(float *));


int *d_PivotArray; cudamalloc((void**)&d_PivotArray,N*Nmatrices*sizeof(int));
int *d_InfoArray;  cudamalloc((void**)&d_InfoArray,Nmatrices*sizeof(int));



for(int i = 0; i<2; i++){
cudamemcpy(d_A,h_A,N*N*Nmatrices*sizeof(float),cudamemcpyHostToDevice);

copy<<<1,10>>>(d_inout_pointers,d_A,rd_inout_pointers,c_A,count);
inv<<<1,1>>>(d_inout_pointers,rd_inout_pointers);


cudamemcpy(h_A,N*N*sizeof(float),cudamemcpyDevicetoHost);
cudamemcpy(r_A,Nmatrices*N*N*sizeof(float),cudamemcpyDevicetoHost);


for (int i=0; i<N*N*Nmatrices; i++) printf("A[%i]=%f\n",i,r_A[i]);
h_A = r_A;


}
cudaFree(c_A);
cudaFree(d_A);
cudaFree(d_inout_pointers);
cudaFree(rd_inout_pointers);
cudaFree(d_InfoArray);
cudaFree(d_PivotArray);



return 0;
}