template.cu
资源名称:VectorAdd.zip [点击查看]
上传用户:cjzyds
上传日期:2022-03-19
资源大小:8k
文件大小:1k
源码类别:并行计算
开发平台:C/C++
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
// Number of float elements in each vector.
#define N 100000
/*
 * Element-wise vector addition on the device: C[k] = A[k] + B[k] for
 * every k in [0, N).
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks; each thread
 * handles at most one element, so the launch must supply at least N threads
 * in total. Threads whose global index falls at or beyond N do nothing.
 *
 * A, B : input arrays, each readable for at least N floats from device code.
 * C    : output array, writable for at least N floats from device code.
 */
__global__ void VecAdd(float* A, float* B, float* C)
{
    // Flat global index of this thread across the whole 1-D grid.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // The grid is rounded up to whole blocks, so the tail threads must bail out.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
/*
 * Host driver for the vector-addition example.
 *
 * Allocates three N-element float vectors on the host and on the device,
 * fills the two inputs with pseudo-random whole numbers in [0, 9] (fixed seed,
 * so runs are reproducible), launches VecAdd, copies the sum back and prints
 * every 10000th element as "index: a + b = c".
 *
 * Returns 0 on success, or EXIT_FAILURE if a host allocation fails.
 * argc/argv are accepted but not used.
 */
int
main( int argc, char** argv)
{
    int i;
    size_t size = N * sizeof(float);
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;

    // Host buffers. Check them before use: the fill loop below writes through
    // these pointers unconditionally.
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n",
                (unsigned long)size);
        // free(NULL) is a no-op, so releasing all three is safe.
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Device buffers.
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_A, size));
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_B, size));
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_C, size));

    // Fixed seed keeps the generated inputs identical from run to run.
    srand(2009);
    for (i = 0; i < N; i++)
    {
        h_A[i] = (float)(rand() % 10);
        h_B[i] = (float)(rand() % 10);
    }

    CUDA_SAFE_CALL( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Ceil-divide so that a partial final block still covers the tail of the
    // vectors; VecAdd bounds-checks the surplus threads.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    // NOTE(review): CUT_CHECK_ERROR comes from cutil.h, which is not visible
    // here; presumably it reports launch failures -- confirm it is active in
    // the build configuration being used.
    CUT_CHECK_ERROR("Kernel execution failed");

    // Blocking copy on the default stream: returns only after the kernel's
    // results are available in h_C.
    CUDA_SAFE_CALL( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Print a sparse sample of the results, one per line.
    for (i = 0; i < N; i += 10000)
    {
        printf("%d: %f + %f = %f\n", i, h_A[i], h_B[i], h_C[i]);
    }

    CUDA_SAFE_CALL( cudaFree(d_A));
    CUDA_SAFE_CALL( cudaFree(d_B));
    CUDA_SAFE_CALL( cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}