comparison fft_fixstart/main.cc @ 10:e38bef2012bc

fix sync bug. and NDrange option
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Mon, 11 Feb 2013 06:52:24 +0900
parents ed3d4a769bf3
children
comparison
equal deleted inserted replaced
9:ed3d4a769bf3 10:e38bef2012bc
19 19
20 #define AMP(a, b) (sqrt((a)*(a)+(b)*(b))) 20 #define AMP(a, b) (sqrt((a)*(a)+(b)*(b)))
21 21
22 static double st_time; 22 static double st_time;
23 static double ed_time; 23 static double ed_time;
24 int ndrange_flag;
24 25
25 cl_device_id device_id = NULL; 26 cl_device_id device_id = NULL;
26 cl_context context = NULL; 27 cl_context context = NULL;
27 cl_command_queue queue = NULL; 28 cl_command_queue queue = NULL;
28 cl_program program = NULL; 29 cl_program program = NULL;
100 ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag); 101 ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag);
101 102
102 ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst); 103 ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst);
103 ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n); 104 ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n);
104 105
105 /* Reverse bit ordering */ 106 /* Reversee bit ordering */
106 setWorkSize(gws, lws, n, n); 107 if (ndrange_flag == 1) {
107 ret = clEnqueueTask(queue, brev, 0, NULL, NULL); 108 setWorkSize(gws, lws, n, n);
109 ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL);
110 } else {
111 ret = clEnqueueTask(queue, brev, 0, NULL, NULL);
112 }
108 113
109 /* Perform Butterfly Operations*/ 114 /* Perform Butterfly Operations*/
110 setWorkSize(gws, lws, n/2, n); 115 setWorkSize(gws, lws, n/2, n);
111 for (iter=1; iter <= m; iter++) { 116 for (iter=1; iter <= m; iter++) {
112 ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter); 117 ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter);
113 ret = clEnqueueTask(queue, bfly, 0, NULL, NULL); 118 if (ndrange_flag == 1) {
119 ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone);
120 } else {
121 ret = clEnqueueTask(queue, bfly, 0, NULL, &kernelDone);
122 }
114 ret = clWaitForEvents(1, &kernelDone); 123 ret = clWaitForEvents(1, &kernelDone);
115 } 124 }
116 125
117 if (direction == inverse) { 126 if (direction == inverse) {
118 setWorkSize(gws, lws, n, n); 127 if (ndrange_flag == 1) {
119 ret = clEnqueueTask(queue, norm, 0, NULL, NULL); 128 setWorkSize(gws, lws, n, n);
129 ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone);
130 } else {
131 ret = clEnqueueTask(queue, norm, 0, NULL, &kernelDone);
132 }
120 ret = clWaitForEvents(1, &kernelDone); 133 ret = clWaitForEvents(1, &kernelDone);
121 } 134 }
122 135
123 ret = clReleaseKernel(bfly); 136 ret = clReleaseKernel(bfly);
124 ret = clReleaseKernel(brev); 137 ret = clReleaseKernel(brev);
139 device_type = CL_DEVICE_TYPE_CPU; 152 device_type = CL_DEVICE_TYPE_CPU;
140 } else if (strcmp(argv[i], "-gpu") == 0) { 153 } else if (strcmp(argv[i], "-gpu") == 0) {
141 device_type = CL_DEVICE_TYPE_GPU; 154 device_type = CL_DEVICE_TYPE_GPU;
142 } else if (strcmp(argv[i], "-all") == 0) { 155 } else if (strcmp(argv[i], "-all") == 0) {
143 device_type = CL_DEVICE_TYPE_ALL; 156 device_type = CL_DEVICE_TYPE_ALL;
157 } else if (strcmp(argv[i], "-nd") == 0 ) {
158 ndrange_flag = 1;
144 } 159 }
145 } 160 }
146 if ( (argc == 1)||(filename==0)) { 161 if ( (argc == 1)||(filename==0)) {
147 printf("Usage: ./fft -file [image filename] -cpu or -gpu \n"); 162 printf("Usage: ./fft -file [image filename] -cpu or -gpu \n");
148 exit(-1); 163 exit(-1);
164 cl_uint ret_num_devices; 179 cl_uint ret_num_devices;
165 cl_uint ret_num_platforms; 180 cl_uint ret_num_platforms;
166 181
167 cl_int ret; 182 cl_int ret;
168 183
184
169 cl_float2 *xm; 185 cl_float2 *xm;
170 cl_float2 *rm; 186 cl_float2 *rm;
171 cl_float2 *wm; 187 cl_float2 *wm;
188
189 /*
190 * typedef union
191 * {
192 * cl_float CL_ALIGNED(8) s[2];
193 * #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
194 * __extension__ struct{ cl_float x, y; };
195 * __extension__ struct{ cl_float s0, s1; };
196 * __extension__ struct{ cl_float lo, hi; };
197 * #endif
198 * #if defined( __CL_FLOAT2__)
199 * __cl_float2 v2;
200 * #endif
201 * }cl_float2;
202 */
172 203
173 pgm_t ipgm; 204 pgm_t ipgm;
174 pgm_t opgm; 205 pgm_t opgm;
175 206
176 const char fileName[] = "./fft.cl"; 207 const char fileName[] = "./fft.cl";
238 st_time = getTime(); 269 st_time = getTime();
239 /* Create Buffer Objects */ 270 /* Create Buffer Objects */
240 xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); 271 xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
241 rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); 272 rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
242 wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret); 273 wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret);
243 274
244 /* Transfer data to memory buffer */ 275 /* Transfer data to memory buffer */
245 ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); 276 ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);
246 277
247 /* Create kernel program from source */ 278 /* Create kernel program from source */
248 program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); 279 program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
266 hpfl = clCreateKernel(program, "highPassFilter", &ret); 297 hpfl = clCreateKernel(program, "highPassFilter", &ret);
267 298
268 /* Create spin factor */ 299 /* Create spin factor */
269 ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj); 300 ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj);
270 ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n); 301 ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n);
271 setWorkSize(gws, lws, n/2, 1); 302 if (ndrange_flag == 1) {
272 ret = clEnqueueTask(queue, sfac, 0, NULL, NULL); 303 setWorkSize(gws, lws, n/2, 1);
304 ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL);
305 } else {
306 ret = clEnqueueTask(queue, sfac, 0, NULL, NULL);
307 }
273 308
274 /* Butterfly Operation */ 309 /* Butterfly Operation */
275 fftCore(rmobj, xmobj, wmobj, m, forward); 310 fftCore(rmobj, xmobj, wmobj, m, forward);
276 311
277 /* Transpose matrix */ 312 /* Transpose matrix */
278 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj); 313 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj);
279 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj); 314 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj);
280 ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n); 315 ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n);
281 setWorkSize(gws, lws, n, n); 316 if (ndrange_flag == 1) {
282 ret = clEnqueueTask(queue, trns, 0, NULL, NULL); 317 setWorkSize(gws, lws, n, n);
318 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);
319 } else {
320 ret = clEnqueueTask(queue, trns, 0, NULL, NULL);
321 }
283 322
284 /* Butterfly Operation */ 323 /* Butterfly Operation */
285 fftCore(rmobj, xmobj, wmobj, m, forward); 324 fftCore(rmobj, xmobj, wmobj, m, forward);
286 325
287 /* Apply high-pass filter */ 326 /* Apply high-pass filter */
288 cl_int radius = n/8; 327 cl_int radius = n/8;
289 ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj); 328 ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj);
290 ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n); 329 ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n);
291 ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius); 330 ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius);
292 setWorkSize(gws, lws, n, n); 331 if (ndrange_flag == 1) {
293 ret = clEnqueueTask(queue, hpfl, 0, NULL, NULL); 332 setWorkSize(gws, lws, n, n);
333 ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL);
334 } else {
335 ret = clEnqueueTask(queue, hpfl, 0, NULL, NULL);
336 }
294 337
295 /* Inverse FFT */ 338 /* Inverse FFT */
296 339
297 /* Butterfly Operation */ 340 /* Butterfly Operation */
298 fftCore(xmobj, rmobj, wmobj, m, inverse); 341 fftCore(xmobj, rmobj, wmobj, m, inverse);
299 342
300 /* Transpose matrix */ 343 /* Transpose matrix */
301 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj); 344 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj);
302 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj); 345 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj);
303 setWorkSize(gws, lws, n, n); 346 if (ndrange_flag == 1) {
304 ret = clEnqueueTask(queue, trns, 0, NULL, NULL); 347 setWorkSize(gws, lws, n, n);
305 348 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);
349 } else {
350 ret = clEnqueueTask(queue, trns, 0, NULL, NULL);
351 }
306 /* Butterfly Operation */ 352 /* Butterfly Operation */
307 353
308 fftCore(xmobj, rmobj, wmobj, m, inverse); 354 fftCore(xmobj, rmobj, wmobj, m, inverse);
309 355
310 /* Read data from memory buffer */ 356 /* Read data from memory buffer */