Mercurial > hg > Members > yuuhi > OpenCL
comparison fft_fixstart/main.cc @ 3:f3cfea46e585
add fft_fixstar sample
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 04 Feb 2013 02:59:58 +0900 |
parents | |
children | 8df0d3128672 |
comparison
equal
deleted
inserted
replaced
2:ccea4e6a1945 | 3:f3cfea46e585 |
---|---|
1 #include <stdio.h> | |
2 #include <stdlib.h> | |
3 #include <math.h> | |
4 #include <sys/stat.h> | |
5 #include <fcntl.h> | |
6 | |
7 #ifdef __APPLE__ | |
8 #include <OpenCL/opencl.h> | |
9 #else | |
10 #include <CL/cl.h> | |
11 #endif | |
12 | |
13 #include "pgm.h" | |
14 | |
15 #define PI 3.14159265358979 | |
16 | |
17 #define MAX_SOURCE_SIZE (0x100000) | |
18 | |
19 #define AMP(a, b) (sqrt((a)*(a)+(b)*(b))) | |
20 | |
21 cl_device_id device_id = NULL; | |
22 cl_context context = NULL; | |
23 cl_command_queue queue = NULL; | |
24 cl_program program = NULL; | |
25 cl_device_type device_type = CL_DEVICE_TYPE_GPU; | |
26 | |
27 enum Mode { | |
28 forward = 0, | |
29 inverse = 1 | |
30 }; | |
31 | |
32 int setWorkSize(size_t* gws, size_t* lws, cl_int x, cl_int y) | |
33 { | |
34 switch(y) { | |
35 case 1: | |
36 gws[0] = x; | |
37 gws[1] = 1; | |
38 lws[0] = 1; | |
39 lws[1] = 1; | |
40 break; | |
41 default: | |
42 gws[0] = x; | |
43 gws[1] = y; | |
44 lws[0] = 1; | |
45 lws[1] = 1; | |
46 break; | |
47 } | |
48 | |
49 return 0; | |
50 } | |
51 | |
52 int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction) | |
53 { | |
54 cl_int ret; | |
55 | |
56 cl_int iter; | |
57 cl_uint flag; | |
58 | |
59 cl_int n = 1<<m; | |
60 | |
61 cl_event kernelDone; | |
62 | |
63 cl_kernel brev = NULL; | |
64 cl_kernel bfly = NULL; | |
65 cl_kernel norm = NULL; | |
66 | |
67 brev = clCreateKernel(program, "bitReverse", &ret); | |
68 bfly = clCreateKernel(program, "butterfly", &ret); | |
69 norm = clCreateKernel(program, "norm", &ret); | |
70 | |
71 size_t gws[2]; | |
72 size_t lws[2]; | |
73 | |
74 switch (direction) { | |
75 case forward:flag = 0x00000000; break; | |
76 case inverse:flag = 0x80000000; break; | |
77 } | |
78 | |
79 ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst); | |
80 ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src); | |
81 ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m); | |
82 ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n); | |
83 | |
84 ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst); | |
85 ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin); | |
86 ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m); | |
87 ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n); | |
88 ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag); | |
89 | |
90 ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst); | |
91 ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n); | |
92 | |
93 /* Reverse bit ordering */ | |
94 setWorkSize(gws, lws, n, n); | |
95 ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL); | |
96 | |
97 /* Perform Butterfly Operations*/ | |
98 setWorkSize(gws, lws, n/2, n); | |
99 for (iter=1; iter <= m; iter++) { | |
100 ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter); | |
101 ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone); | |
102 ret = clWaitForEvents(1, &kernelDone); | |
103 } | |
104 | |
105 if (direction == inverse) { | |
106 setWorkSize(gws, lws, n, n); | |
107 ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone); | |
108 ret = clWaitForEvents(1, &kernelDone); | |
109 } | |
110 | |
111 ret = clReleaseKernel(bfly); | |
112 ret = clReleaseKernel(brev); | |
113 ret = clReleaseKernel(norm); | |
114 | |
115 return 0; | |
116 } | |
117 | |
118 char * | |
119 init(int argc, char**argv){ | |
120 | |
121 char *filename = 0; | |
122 | |
123 for (int i = 1; argv[i]; ++i) { | |
124 if (strcmp(argv[i], "-file") == 0) { | |
125 filename = argv[i+1]; | |
126 } else if (strcmp(argv[i], "-cpu") == 0) { | |
127 device_type = CL_DEVICE_TYPE_CPU; | |
128 } else if (strcmp(argv[i], "-gpu") == 0) { | |
129 device_type = CL_DEVICE_TYPE_GPU; | |
130 } | |
131 } | |
132 if ( (argc == 1)||(filename==0)) { | |
133 printf("Usage: ./fft [image filename]\n"); | |
134 exit(-1); | |
135 } | |
136 | |
137 return filename; | |
138 } | |
139 | |
140 int main(int argc, char** argv) { | |
141 cl_mem xmobj = NULL; | |
142 cl_mem rmobj = NULL; | |
143 cl_mem wmobj = NULL; | |
144 cl_kernel sfac = NULL; | |
145 cl_kernel trns = NULL; | |
146 cl_kernel hpfl = NULL; | |
147 | |
148 cl_platform_id platform_id = NULL; | |
149 | |
150 cl_uint ret_num_devices; | |
151 cl_uint ret_num_platforms; | |
152 | |
153 cl_int ret; | |
154 | |
155 cl_float2 *xm; | |
156 cl_float2 *rm; | |
157 cl_float2 *wm; | |
158 | |
159 pgm_t ipgm; | |
160 pgm_t opgm; | |
161 | |
162 const char fileName[] = "./fft.cl"; | |
163 size_t source_size; | |
164 char *source_str; | |
165 cl_int i, j; | |
166 cl_int n; | |
167 cl_int m; | |
168 | |
169 size_t gws[2]; | |
170 size_t lws[2]; | |
171 | |
172 /* Load kernel source code */ | |
173 int fd = open(fileName, O_RDONLY); | |
174 | |
175 if (fd<0) { | |
176 fprintf(stderr, "Failed to load kernel %s.\n",fileName); | |
177 exit(1); | |
178 } | |
179 struct stat stats; | |
180 fstat(fd, &stats); | |
181 off_t size = stats.st_size; | |
182 if (size<=0) { | |
183 fprintf(stderr, "Failed to load kernel.\n"); | |
184 exit(1); | |
185 } | |
186 source_str = (char*)alloca(size); | |
187 source_size = read(fd, source_str, size); | |
188 close( fd ); | |
189 | |
190 char * pgm_file = init(argc,argv); | |
191 | |
192 /* Read image */ | |
193 int err = readPGM(&ipgm, pgm_file); | |
194 if (err<0) { | |
195 fprintf(stderr, "Failed to read image file.\n"); | |
196 exit(1); | |
197 } | |
198 | |
199 n = ipgm.width; | |
200 m = (cl_int)(log((double)n)/log(2.0)); | |
201 | |
202 xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); | |
203 rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); | |
204 wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); | |
205 | |
206 for (i=0; i < n; i++) { | |
207 for (j=0; j < n; j++) { | |
208 ((float*)xm)[(2*n*j)+2*i+0] = (float)ipgm.buf[n*j+i]; | |
209 ((float*)xm)[(2*n*j)+2*i+1] = (float)0; | |
210 } | |
211 } | |
212 | |
213 /* Get platform/device */ | |
214 ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); | |
215 ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); | |
216 | |
217 /* Create OpenCL context */ | |
218 context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); | |
219 | |
220 /* Create Command queue */ | |
221 queue = clCreateCommandQueue(context, device_id, 0, &ret); | |
222 | |
223 /* Create Buffer Objects */ | |
224 xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); | |
225 rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); | |
226 wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret); | |
227 | |
228 /* Transfer data to memory buffer */ | |
229 ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); | |
230 | |
231 /* Create kernel program from source */ | |
232 program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); | |
233 | |
234 /* Build kernel program */ | |
235 ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); | |
236 | |
237 if (ret<0) { | |
238 size_t size; | |
239 clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); | |
240 | |
241 char *log = new char[size]; | |
242 clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); | |
243 printf("%s ",log); | |
244 exit (ret); | |
245 } | |
246 | |
247 /* Create OpenCL Kernel */ | |
248 sfac = clCreateKernel(program, "spinFact", &ret); | |
249 trns = clCreateKernel(program, "transpose", &ret); | |
250 hpfl = clCreateKernel(program, "highPassFilter", &ret); | |
251 | |
252 /* Create spin factor */ | |
253 ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj); | |
254 ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n); | |
255 setWorkSize(gws, lws, n/2, 1); | |
256 ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL); | |
257 | |
258 /* Butterfly Operation */ | |
259 fftCore(rmobj, xmobj, wmobj, m, forward); | |
260 | |
261 /* Transpose matrix */ | |
262 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj); | |
263 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj); | |
264 ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n); | |
265 setWorkSize(gws, lws, n, n); | |
266 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); | |
267 | |
268 /* Butterfly Operation */ | |
269 fftCore(rmobj, xmobj, wmobj, m, forward); | |
270 | |
271 /* Apply high-pass filter */ | |
272 cl_int radius = n/8; | |
273 ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj); | |
274 ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n); | |
275 ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius); | |
276 setWorkSize(gws, lws, n, n); | |
277 ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL); | |
278 | |
279 /* Inverse FFT */ | |
280 | |
281 /* Butterfly Operation */ | |
282 fftCore(xmobj, rmobj, wmobj, m, inverse); | |
283 | |
284 /* Transpose matrix */ | |
285 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj); | |
286 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj); | |
287 setWorkSize(gws, lws, n, n); | |
288 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); | |
289 | |
290 /* Butterfly Operation */ | |
291 | |
292 fftCore(xmobj, rmobj, wmobj, m, inverse); | |
293 | |
294 /* Read data from memory buffer */ | |
295 ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); | |
296 | |
297 /* */ | |
298 float* ampd; | |
299 ampd = (float*)malloc(n*n*sizeof(float)); | |
300 for (i=0; i < n; i++) { | |
301 for (j=0; j < n; j++) { | |
302 ampd[n*((i))+((j))] = (AMP(((float*)xm)[(2*n*i)+2*j], ((float*)xm)[(2*n*i)+2*j+1])); | |
303 } | |
304 } | |
305 opgm.width = n; | |
306 opgm.height = n; | |
307 normalizeF2PGM(&opgm, ampd); | |
308 free(ampd); | |
309 | |
310 /* Write out image */ | |
311 writePGM(&opgm, "output.pgm"); | |
312 | |
313 /* Finalizations*/ | |
314 ret = clFlush(queue); | |
315 ret = clFinish(queue); | |
316 ret = clReleaseKernel(hpfl); | |
317 ret = clReleaseKernel(trns); | |
318 ret = clReleaseKernel(sfac); | |
319 ret = clReleaseProgram(program); | |
320 ret = clReleaseMemObject(xmobj); | |
321 ret = clReleaseMemObject(rmobj); | |
322 ret = clReleaseMemObject(wmobj); | |
323 ret = clReleaseCommandQueue(queue); | |
324 ret = clReleaseContext(context); | |
325 | |
326 destroyPGM(&ipgm); | |
327 destroyPGM(&opgm); | |
328 | |
329 free(wm); | |
330 free(rm); | |
331 free(xm); | |
332 | |
333 fprintf(stdout, "image out put succeeded.\n"); | |
334 return 0; | |
335 } |