OpenCL-4-小试牛刀

前四章已经讲了那么多的概念,下面该真刀真枪地试一试了。

下面是宿主机端程序:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
//创建2个输入向量
int i;
const int LIST_SIZE=1024;
int *A=(int*)malloc(sizeof(int)*LIST_SIZE);
int *B=(int*)malloc(sizeof(int)*LIST_SIZE);
for (i=0; i<LIST_SIZE; i++) {
A[i]=i;
B[i]=LIST_SIZE-i;
}
//载入内核源码到source_str
FILE *fp;
char *source_str;
size_t source_size;
fp=fopen("vector_add_kernel.cl","r");
if (!fp) {
fprintf(stderr, "Failed to load kernel\n");
exit(1);
}
source_str=(char*)malloc(MAX_SOURCE_SIZE);
source_size=fread(source_str,1,MAX_SOURCE_SIZE,fp);
fclose(fp);
//获得平台和设备信息
cl_platform_id platform_id=NULL;
cl_device_id device_id=NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret=clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret=clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
//创建OpenCL上下文
cl_context context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
//创建命令队列
cl_command_queue command_queue=clCreateCommandQueue(context,device_id,0,&ret);
//创建内存缓冲对象,在设备上为每个向量
cl_mem a_mem_obj=clCreateBuffer(context,CL_MEM_READ_ONLY,
LIST_SIZE*sizeof(int),NULL,&ret);
cl_mem b_mem_obj=clCreateBuffer(context,CL_MEM_READ_ONLY,
LIST_SIZE*sizeof(int),NULL,&ret);
cl_mem c_mem_obj=clCreateBuffer(context,CL_MEM_WRITE_ONLY,
LIST_SIZE*sizeof(int),NULL,&ret);
//拷贝数据A和B到对应的内存缓冲
ret=clEnqueueWriteBuffer(command_queue,a_mem_obj,CL_TRUE,0,
LIST_SIZE*sizeof(int),A,0,NULL,NULL);
ret=clEnqueueWriteBuffer(command_queue,a_mem_obj,CL_TRUE,0,
LIST_SIZE*sizeof(int),B,0,NULL,NULL);
//创建程序
cl_program program=clCreateProgramWithSource(context,1,
(const char**)&source_str,(const size_t*)&source_size, &ret);
//构建程序
ret=clBuildProgram(program,1,&device_id,NULL,NULL,NULL);
//创建OpenCL内核
cl_kernel kernel=clCreateKernel(program,"vector_add",&ret);
//设置内核参数
ret=clSetKernelArg(kernel,0,sizeof(cl_mem),(void*)&a_mem_obj);
ret=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&b_mem_obj);
ret=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&c_mem_obj);
//执行内核
size_t global_item_size=LIST_SIZE; //处理整个列表
size_t local_item_size=64; //分割为64个组
ret=clEnqueueNDRangeKernel(command_queue,kernel,1,NULL,
&global_item_size,&local_item_size,0,NULL,NULL);
//读取内存缓冲C到本地变量C
int *C=(int*)malloc(sizeof(int)*LIST_SIZE);
ret=clEnqueueReadBuffer(command_queue,c_mem_obj,CL_TRUE,0,
LIST_SIZE*sizeof(int),C,0,NULL,NULL);
//显示结果
for (i=0; i<LIST_SIZE; i++) {
printf("%d + %d = %d\n", A[i], B[i], C[i]);
}
//清理资源
ret=clFlush(command_queue);
ret=clFinish(command_queue);
ret=clReleaseKernel(kernel);
ret=clReleaseProgram(program);
ret=clReleaseMemObject(a_mem_obj);
ret=clReleaseMemObject(b_mem_obj);
ret=clReleaseMemObject(c_mem_obj);
ret=clReleaseCommandQueue(command_queue);
ret=clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}

下面是内核程序(vector_add_kernel.cl):

1
2
3
4
5
6
/*
 * Element-wise vector addition: each work-item adds one pair of elements,
 * C[gid] = A[gid] + B[gid], where gid is the work-item's global id in
 * dimension 0.
 */
__kernel void vector_add(__global const int *A,
                         __global const int *B,
                         __global int *C)
{
    /* Index of the element this work-item handles. */
    int gid = get_global_id(0);

    /* Compute the sum for that element. */
    int sum = A[gid] + B[gid];
    C[gid] = sum;
}

之后使用命令:

1
gcc -o Test Test.c -I<Header_to_OpenCL_Path> -lOpenCL

进行编译。

版权声明:本文为博主原创文章,转载需声明为转载内容并添加原文地址。

原文地址:http://coderdock.com

Dock wechat
欢迎您扫一扫上面的微信公众号,订阅我的公众号