CPU: vectoradd.cu — #include <stdio.h>, #include <stdlib.h>

// Host-side program (vectoradd.cu): adds two float vectors once on the CPU and
// once on the GPU, then checks that both results agree element by element.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
// The kernel source is included directly; the name matches the kernel file
// (vectoradd_kernel.cu).
#include "vectoradd_kernel.cu"

// Maximum absolute difference tolerated between the CPU and GPU results.
#define eps 1e-9

// Evaluates a CUDA runtime call and terminates with file, line and the CUDA
// error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Entry point.
 *
 * Fills two host vectors with random values in [0, 1], computes their sum on
 * the CPU (C) and on the GPU (D, via vectorAdd), and compares the two.
 *
 * argc, argv: unused.
 * Returns 0 on success; exits with EXIT_FAILURE on allocation failure, on any
 * CUDA runtime error, or when the CPU and GPU results differ by more than eps.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addtion of %d elements]\n", numElements);

    // Host vectors: A and B are inputs, C is the CPU result, D the GPU result.
    float *A = (float *)malloc(size);
    float *B = (float *)malloc(size);
    float *C = (float *)malloc(size);
    float *D = (float *)malloc(size);
    if (A == NULL || B == NULL || C == NULL || D == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < numElements; ++i) {
        A[i] = rand() / (float)RAND_MAX;
        B[i] = rand() / (float)RAND_MAX;
    }

    // Reference result computed on the CPU.
    printf("CPU result\n");
    for (int i = 0; i < numElements; ++i) {
        C[i] = A[i] + B[i];
    }

    // Allocate the device input vectors A and B and the device output vector C.
    float *d_A = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    float *d_B = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    printf("copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // Launch the vector-add kernel; the block count is rounded up so that
    // every element is covered even when numElements is not a multiple of
    // threadsPerBlock.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n",
           blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // A kernel launch returns no status itself; launch errors are reported
    // through cudaGetLastError(), so its result must be checked.
    CUDA_CHECK(cudaGetLastError());

    // Copy the result from device memory to host memory. cudaMemcpy takes the
    // destination first: D (host) <- d_C (device). The copy is issued to the
    // same default stream as the kernel, so it runs after the kernel and any
    // execution error surfaces here.
    printf("copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK(cudaMemcpy(D, d_C, size, cudaMemcpyDeviceToHost));

    // Verify that the GPU result matches the CPU reference.
    for (int i = 0; i < numElements; ++i) {
        if (fabs(C[i] - D[i]) > eps) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // Free device global memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory (D was previously leaked).
    free(A);
    free(B);
    free(C);
    free(D);

    // Reset the device and exit.
    CUDA_CHECK(cudaDeviceReset());
    printf("Done\n");
    return 0;
}

// Device-side program (vectoradd_kernel.cu).
#ifndef __VECTORADD_KERNEL__
#define __VECTORADD_KERNEL__

/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < numElements.
 *
 * Each thread handles at most one element, selected by its flat 1D index
 * blockDim.x * blockIdx.x + threadIdx.x; threads whose index falls outside
 * the vectors do nothing. The launch must therefore supply at least
 * numElements threads in total along x for every element to be written.
 *
 * A, B:        input vectors (read only).
 * C:           output vector.
 * numElements: number of elements in each vector; values <= 0 make the
 *              kernel a no-op.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Keep the index unsigned: the built-in index variables are unsigned, and
    // storing their product in a signed int could yield a negative value that
    // passes an "i < numElements" guard on very large launches.
    const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (numElements > 0 && i < (unsigned int)numElements)
        C[i] = A[i] + B[i];
}

#endif

// Build: nvcc -g filename.cu -o filename
// e.g.:  nvcc -g vectoradd.cu -o vectoradd