Mercurial > hg > Members > yuuhi > OpenCL
comparison fft_fixstart/main.cc @ 10:e38bef2012bc
fix sync bug. and NDrange option
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 11 Feb 2013 06:52:24 +0900 |
parents | ed3d4a769bf3 |
children |
comparison
equal
deleted
inserted
replaced
9:ed3d4a769bf3 | 10:e38bef2012bc |
---|---|
19 | 19 |
20 #define AMP(a, b) (sqrt((a)*(a)+(b)*(b))) | 20 #define AMP(a, b) (sqrt((a)*(a)+(b)*(b))) |
21 | 21 |
22 static double st_time; | 22 static double st_time; |
23 static double ed_time; | 23 static double ed_time; |
24 int ndrange_flag; | |
24 | 25 |
25 cl_device_id device_id = NULL; | 26 cl_device_id device_id = NULL; |
26 cl_context context = NULL; | 27 cl_context context = NULL; |
27 cl_command_queue queue = NULL; | 28 cl_command_queue queue = NULL; |
28 cl_program program = NULL; | 29 cl_program program = NULL; |
100 ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag); | 101 ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag); |
101 | 102 |
102 ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst); | 103 ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst); |
103 ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n); | 104 ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n); |
104 | 105 |
105 /* Reverse bit ordering */ | 106 /* Reversee bit ordering */ |
106 setWorkSize(gws, lws, n, n); | 107 if (ndrange_flag == 1) { |
107 ret = clEnqueueTask(queue, brev, 0, NULL, NULL); | 108 setWorkSize(gws, lws, n, n); |
109 ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL); | |
110 } else { | |
111 ret = clEnqueueTask(queue, brev, 0, NULL, NULL); | |
112 } | |
108 | 113 |
109 /* Perform Butterfly Operations*/ | 114 /* Perform Butterfly Operations*/ |
110 setWorkSize(gws, lws, n/2, n); | 115 setWorkSize(gws, lws, n/2, n); |
111 for (iter=1; iter <= m; iter++) { | 116 for (iter=1; iter <= m; iter++) { |
112 ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter); | 117 ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter); |
113 ret = clEnqueueTask(queue, bfly, 0, NULL, NULL); | 118 if (ndrange_flag == 1) { |
119 ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone); | |
120 } else { | |
121 ret = clEnqueueTask(queue, bfly, 0, NULL, &kernelDone); | |
122 } | |
114 ret = clWaitForEvents(1, &kernelDone); | 123 ret = clWaitForEvents(1, &kernelDone); |
115 } | 124 } |
116 | 125 |
117 if (direction == inverse) { | 126 if (direction == inverse) { |
118 setWorkSize(gws, lws, n, n); | 127 if (ndrange_flag == 1) { |
119 ret = clEnqueueTask(queue, norm, 0, NULL, NULL); | 128 setWorkSize(gws, lws, n, n); |
129 ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone); | |
130 } else { | |
131 ret = clEnqueueTask(queue, norm, 0, NULL, &kernelDone); | |
132 } | |
120 ret = clWaitForEvents(1, &kernelDone); | 133 ret = clWaitForEvents(1, &kernelDone); |
121 } | 134 } |
122 | 135 |
123 ret = clReleaseKernel(bfly); | 136 ret = clReleaseKernel(bfly); |
124 ret = clReleaseKernel(brev); | 137 ret = clReleaseKernel(brev); |
139 device_type = CL_DEVICE_TYPE_CPU; | 152 device_type = CL_DEVICE_TYPE_CPU; |
140 } else if (strcmp(argv[i], "-gpu") == 0) { | 153 } else if (strcmp(argv[i], "-gpu") == 0) { |
141 device_type = CL_DEVICE_TYPE_GPU; | 154 device_type = CL_DEVICE_TYPE_GPU; |
142 } else if (strcmp(argv[i], "-all") == 0) { | 155 } else if (strcmp(argv[i], "-all") == 0) { |
143 device_type = CL_DEVICE_TYPE_ALL; | 156 device_type = CL_DEVICE_TYPE_ALL; |
157 } else if (strcmp(argv[i], "-nd") == 0 ) { | |
158 ndrange_flag = 1; | |
144 } | 159 } |
145 } | 160 } |
146 if ( (argc == 1)||(filename==0)) { | 161 if ( (argc == 1)||(filename==0)) { |
147 printf("Usage: ./fft -file [image filename] -cpu or -gpu \n"); | 162 printf("Usage: ./fft -file [image filename] -cpu or -gpu \n"); |
148 exit(-1); | 163 exit(-1); |
164 cl_uint ret_num_devices; | 179 cl_uint ret_num_devices; |
165 cl_uint ret_num_platforms; | 180 cl_uint ret_num_platforms; |
166 | 181 |
167 cl_int ret; | 182 cl_int ret; |
168 | 183 |
184 | |
169 cl_float2 *xm; | 185 cl_float2 *xm; |
170 cl_float2 *rm; | 186 cl_float2 *rm; |
171 cl_float2 *wm; | 187 cl_float2 *wm; |
188 | |
189 /* | |
190 * typedef union | |
191 * { | |
192 * cl_float CL_ALIGNED(8) s[2]; | |
193 * #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) | |
194 * __extension__ struct{ cl_float x, y; }; | |
195 * __extension__ struct{ cl_float s0, s1; }; | |
196 * __extension__ struct{ cl_float lo, hi; }; | |
197 * #endif | |
198 * #if defined( __CL_FLOAT2__) | |
199 * __cl_float2 v2; | |
200 * #endif | |
201 * }cl_float2; | |
202 */ | |
172 | 203 |
173 pgm_t ipgm; | 204 pgm_t ipgm; |
174 pgm_t opgm; | 205 pgm_t opgm; |
175 | 206 |
176 const char fileName[] = "./fft.cl"; | 207 const char fileName[] = "./fft.cl"; |
238 st_time = getTime(); | 269 st_time = getTime(); |
239 /* Create Buffer Objects */ | 270 /* Create Buffer Objects */ |
240 xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); | 271 xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); |
241 rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); | 272 rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); |
242 wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret); | 273 wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret); |
243 | 274 |
244 /* Transfer data to memory buffer */ | 275 /* Transfer data to memory buffer */ |
245 ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); | 276 ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); |
246 | 277 |
247 /* Create kernel program from source */ | 278 /* Create kernel program from source */ |
248 program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); | 279 program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); |
266 hpfl = clCreateKernel(program, "highPassFilter", &ret); | 297 hpfl = clCreateKernel(program, "highPassFilter", &ret); |
267 | 298 |
268 /* Create spin factor */ | 299 /* Create spin factor */ |
269 ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj); | 300 ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj); |
270 ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n); | 301 ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n); |
271 setWorkSize(gws, lws, n/2, 1); | 302 if (ndrange_flag == 1) { |
272 ret = clEnqueueTask(queue, sfac, 0, NULL, NULL); | 303 setWorkSize(gws, lws, n/2, 1); |
304 ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL); | |
305 } else { | |
306 ret = clEnqueueTask(queue, sfac, 0, NULL, NULL); | |
307 } | |
273 | 308 |
274 /* Butterfly Operation */ | 309 /* Butterfly Operation */ |
275 fftCore(rmobj, xmobj, wmobj, m, forward); | 310 fftCore(rmobj, xmobj, wmobj, m, forward); |
276 | 311 |
277 /* Transpose matrix */ | 312 /* Transpose matrix */ |
278 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj); | 313 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj); |
279 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj); | 314 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj); |
280 ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n); | 315 ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n); |
281 setWorkSize(gws, lws, n, n); | 316 if (ndrange_flag == 1) { |
282 ret = clEnqueueTask(queue, trns, 0, NULL, NULL); | 317 setWorkSize(gws, lws, n, n); |
318 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); | |
319 } else { | |
320 ret = clEnqueueTask(queue, trns, 0, NULL, NULL); | |
321 } | |
283 | 322 |
284 /* Butterfly Operation */ | 323 /* Butterfly Operation */ |
285 fftCore(rmobj, xmobj, wmobj, m, forward); | 324 fftCore(rmobj, xmobj, wmobj, m, forward); |
286 | 325 |
287 /* Apply high-pass filter */ | 326 /* Apply high-pass filter */ |
288 cl_int radius = n/8; | 327 cl_int radius = n/8; |
289 ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj); | 328 ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj); |
290 ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n); | 329 ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n); |
291 ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius); | 330 ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius); |
292 setWorkSize(gws, lws, n, n); | 331 if (ndrange_flag == 1) { |
293 ret = clEnqueueTask(queue, hpfl, 0, NULL, NULL); | 332 setWorkSize(gws, lws, n, n); |
333 ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL); | |
334 } else { | |
335 ret = clEnqueueTask(queue, hpfl, 0, NULL, NULL); | |
336 } | |
294 | 337 |
295 /* Inverse FFT */ | 338 /* Inverse FFT */ |
296 | 339 |
297 /* Butterfly Operation */ | 340 /* Butterfly Operation */ |
298 fftCore(xmobj, rmobj, wmobj, m, inverse); | 341 fftCore(xmobj, rmobj, wmobj, m, inverse); |
299 | 342 |
300 /* Transpose matrix */ | 343 /* Transpose matrix */ |
301 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj); | 344 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj); |
302 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj); | 345 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj); |
303 setWorkSize(gws, lws, n, n); | 346 if (ndrange_flag == 1) { |
304 ret = clEnqueueTask(queue, trns, 0, NULL, NULL); | 347 setWorkSize(gws, lws, n, n); |
305 | 348 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); |
349 } else { | |
350 ret = clEnqueueTask(queue, trns, 0, NULL, NULL); | |
351 } | |
306 /* Butterfly Operation */ | 352 /* Butterfly Operation */ |
307 | 353 |
308 fftCore(xmobj, rmobj, wmobj, m, inverse); | 354 fftCore(xmobj, rmobj, wmobj, m, inverse); |
309 | 355 |
310 /* Read data from memory buffer */ | 356 /* Read data from memory buffer */ |