3
|
1 #include <stdio.h>
|
|
2 #include <stdlib.h>
|
|
3 #include <math.h>
|
|
4 #include <sys/stat.h>
|
|
5 #include <fcntl.h>
|
|
6
|
|
7 #ifdef __APPLE__
|
|
8 #include <OpenCL/opencl.h>
|
|
9 #else
|
|
10 #include <CL/cl.h>
|
|
11 #endif
|
|
12
|
|
13 #include "pgm.h"
|
|
14
|
|
15 #define PI 3.14159265358979
|
|
16
|
|
17 #define MAX_SOURCE_SIZE (0x100000)
|
|
18
|
|
19 #define AMP(a, b) (sqrt((a)*(a)+(b)*(b)))
|
|
20
|
|
21 cl_device_id device_id = NULL;
|
|
22 cl_context context = NULL;
|
|
23 cl_command_queue queue = NULL;
|
|
24 cl_program program = NULL;
|
|
25 cl_device_type device_type = CL_DEVICE_TYPE_GPU;
|
|
26
|
|
27 enum Mode {
|
|
28 forward = 0,
|
|
29 inverse = 1
|
|
30 };
|
|
31
|
|
32 int setWorkSize(size_t* gws, size_t* lws, cl_int x, cl_int y)
|
|
33 {
|
|
34 switch(y) {
|
|
35 case 1:
|
|
36 gws[0] = x;
|
|
37 gws[1] = 1;
|
|
38 lws[0] = 1;
|
|
39 lws[1] = 1;
|
|
40 break;
|
|
41 default:
|
|
42 gws[0] = x;
|
|
43 gws[1] = y;
|
|
44 lws[0] = 1;
|
|
45 lws[1] = 1;
|
|
46 break;
|
|
47 }
|
|
48
|
|
49 return 0;
|
|
50 }
|
|
51
|
|
52 int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction)
|
|
53 {
|
|
54 cl_int ret;
|
|
55
|
|
56 cl_int iter;
|
|
57 cl_uint flag;
|
|
58
|
|
59 cl_int n = 1<<m;
|
|
60
|
|
61 cl_event kernelDone;
|
|
62
|
|
63 cl_kernel brev = NULL;
|
|
64 cl_kernel bfly = NULL;
|
|
65 cl_kernel norm = NULL;
|
|
66
|
|
67 brev = clCreateKernel(program, "bitReverse", &ret);
|
|
68 bfly = clCreateKernel(program, "butterfly", &ret);
|
|
69 norm = clCreateKernel(program, "norm", &ret);
|
|
70
|
|
71 size_t gws[2];
|
|
72 size_t lws[2];
|
|
73
|
|
74 switch (direction) {
|
|
75 case forward:flag = 0x00000000; break;
|
|
76 case inverse:flag = 0x80000000; break;
|
|
77 }
|
|
78
|
|
79 ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst);
|
|
80 ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src);
|
|
81 ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m);
|
|
82 ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n);
|
|
83
|
|
84 ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst);
|
|
85 ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin);
|
|
86 ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m);
|
|
87 ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n);
|
|
88 ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag);
|
|
89
|
|
90 ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst);
|
|
91 ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n);
|
|
92
|
|
93 /* Reverse bit ordering */
|
|
94 setWorkSize(gws, lws, n, n);
|
|
95 ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL);
|
|
96
|
|
97 /* Perform Butterfly Operations*/
|
|
98 setWorkSize(gws, lws, n/2, n);
|
|
99 for (iter=1; iter <= m; iter++) {
|
|
100 ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter);
|
|
101 ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone);
|
|
102 ret = clWaitForEvents(1, &kernelDone);
|
|
103 }
|
|
104
|
|
105 if (direction == inverse) {
|
|
106 setWorkSize(gws, lws, n, n);
|
|
107 ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone);
|
|
108 ret = clWaitForEvents(1, &kernelDone);
|
|
109 }
|
|
110
|
|
111 ret = clReleaseKernel(bfly);
|
|
112 ret = clReleaseKernel(brev);
|
|
113 ret = clReleaseKernel(norm);
|
|
114
|
|
115 return 0;
|
|
116 }
|
|
117
|
|
118 char *
|
|
119 init(int argc, char**argv){
|
|
120
|
|
121 char *filename = 0;
|
|
122
|
|
123 for (int i = 1; argv[i]; ++i) {
|
|
124 if (strcmp(argv[i], "-file") == 0) {
|
|
125 filename = argv[i+1];
|
|
126 } else if (strcmp(argv[i], "-cpu") == 0) {
|
|
127 device_type = CL_DEVICE_TYPE_CPU;
|
|
128 } else if (strcmp(argv[i], "-gpu") == 0) {
|
|
129 device_type = CL_DEVICE_TYPE_GPU;
|
|
130 }
|
|
131 }
|
|
132 if ( (argc == 1)||(filename==0)) {
|
|
133 printf("Usage: ./fft [image filename]\n");
|
|
134 exit(-1);
|
|
135 }
|
|
136
|
|
137 return filename;
|
|
138 }
|
|
139
|
|
140 int main(int argc, char** argv) {
|
|
141 cl_mem xmobj = NULL;
|
|
142 cl_mem rmobj = NULL;
|
|
143 cl_mem wmobj = NULL;
|
|
144 cl_kernel sfac = NULL;
|
|
145 cl_kernel trns = NULL;
|
|
146 cl_kernel hpfl = NULL;
|
|
147
|
|
148 cl_platform_id platform_id = NULL;
|
|
149
|
|
150 cl_uint ret_num_devices;
|
|
151 cl_uint ret_num_platforms;
|
|
152
|
|
153 cl_int ret;
|
|
154
|
|
155 cl_float2 *xm;
|
|
156 cl_float2 *rm;
|
|
157 cl_float2 *wm;
|
|
158
|
|
159 pgm_t ipgm;
|
|
160 pgm_t opgm;
|
|
161
|
|
162 const char fileName[] = "./fft.cl";
|
|
163 size_t source_size;
|
|
164 char *source_str;
|
|
165 cl_int i, j;
|
|
166 cl_int n;
|
|
167 cl_int m;
|
|
168
|
|
169 size_t gws[2];
|
|
170 size_t lws[2];
|
|
171
|
|
172 /* Load kernel source code */
|
|
173 int fd = open(fileName, O_RDONLY);
|
|
174
|
|
175 if (fd<0) {
|
|
176 fprintf(stderr, "Failed to load kernel %s.\n",fileName);
|
|
177 exit(1);
|
|
178 }
|
|
179 struct stat stats;
|
|
180 fstat(fd, &stats);
|
|
181 off_t size = stats.st_size;
|
|
182 if (size<=0) {
|
|
183 fprintf(stderr, "Failed to load kernel.\n");
|
|
184 exit(1);
|
|
185 }
|
|
186 source_str = (char*)alloca(size);
|
|
187 source_size = read(fd, source_str, size);
|
|
188 close( fd );
|
|
189
|
|
190 char * pgm_file = init(argc,argv);
|
|
191
|
|
192 /* Read image */
|
|
193 int err = readPGM(&ipgm, pgm_file);
|
|
194 if (err<0) {
|
|
195 fprintf(stderr, "Failed to read image file.\n");
|
|
196 exit(1);
|
|
197 }
|
|
198
|
|
199 n = ipgm.width;
|
|
200 m = (cl_int)(log((double)n)/log(2.0));
|
|
201
|
|
202 xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
|
|
203 rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
|
|
204 wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));
|
|
205
|
|
206 for (i=0; i < n; i++) {
|
|
207 for (j=0; j < n; j++) {
|
|
208 ((float*)xm)[(2*n*j)+2*i+0] = (float)ipgm.buf[n*j+i];
|
|
209 ((float*)xm)[(2*n*j)+2*i+1] = (float)0;
|
|
210 }
|
|
211 }
|
|
212
|
|
213 /* Get platform/device */
|
|
214 ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
|
|
215 ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
|
|
216
|
|
217 /* Create OpenCL context */
|
|
218 context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
|
|
219
|
|
220 /* Create Command queue */
|
|
221 queue = clCreateCommandQueue(context, device_id, 0, &ret);
|
|
222
|
|
223 /* Create Buffer Objects */
|
|
224 xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
|
|
225 rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
|
|
226 wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret);
|
|
227
|
|
228 /* Transfer data to memory buffer */
|
|
229 ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);
|
|
230
|
|
231 /* Create kernel program from source */
|
|
232 program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
|
|
233
|
|
234 /* Build kernel program */
|
|
235 ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
|
|
236
|
|
237 if (ret<0) {
|
|
238 size_t size;
|
|
239 clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
|
|
240
|
|
241 char *log = new char[size];
|
|
242 clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
|
|
243 printf("%s ",log);
|
|
244 exit (ret);
|
|
245 }
|
|
246
|
|
247 /* Create OpenCL Kernel */
|
|
248 sfac = clCreateKernel(program, "spinFact", &ret);
|
|
249 trns = clCreateKernel(program, "transpose", &ret);
|
|
250 hpfl = clCreateKernel(program, "highPassFilter", &ret);
|
|
251
|
|
252 /* Create spin factor */
|
|
253 ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj);
|
|
254 ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n);
|
|
255 setWorkSize(gws, lws, n/2, 1);
|
|
256 ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL);
|
|
257
|
|
258 /* Butterfly Operation */
|
|
259 fftCore(rmobj, xmobj, wmobj, m, forward);
|
|
260
|
|
261 /* Transpose matrix */
|
|
262 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj);
|
|
263 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj);
|
|
264 ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n);
|
|
265 setWorkSize(gws, lws, n, n);
|
|
266 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);
|
|
267
|
|
268 /* Butterfly Operation */
|
|
269 fftCore(rmobj, xmobj, wmobj, m, forward);
|
|
270
|
|
271 /* Apply high-pass filter */
|
|
272 cl_int radius = n/8;
|
|
273 ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj);
|
|
274 ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n);
|
|
275 ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius);
|
|
276 setWorkSize(gws, lws, n, n);
|
|
277 ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL);
|
|
278
|
|
279 /* Inverse FFT */
|
|
280
|
|
281 /* Butterfly Operation */
|
|
282 fftCore(xmobj, rmobj, wmobj, m, inverse);
|
|
283
|
|
284 /* Transpose matrix */
|
|
285 ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj);
|
|
286 ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj);
|
|
287 setWorkSize(gws, lws, n, n);
|
|
288 ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);
|
|
289
|
|
290 /* Butterfly Operation */
|
|
291
|
|
292 fftCore(xmobj, rmobj, wmobj, m, inverse);
|
|
293
|
|
294 /* Read data from memory buffer */
|
|
295 ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);
|
|
296
|
|
297 /* */
|
|
298 float* ampd;
|
|
299 ampd = (float*)malloc(n*n*sizeof(float));
|
|
300 for (i=0; i < n; i++) {
|
|
301 for (j=0; j < n; j++) {
|
|
302 ampd[n*((i))+((j))] = (AMP(((float*)xm)[(2*n*i)+2*j], ((float*)xm)[(2*n*i)+2*j+1]));
|
|
303 }
|
|
304 }
|
|
305 opgm.width = n;
|
|
306 opgm.height = n;
|
|
307 normalizeF2PGM(&opgm, ampd);
|
|
308 free(ampd);
|
|
309
|
|
310 /* Write out image */
|
|
311 writePGM(&opgm, "output.pgm");
|
|
312
|
|
313 /* Finalizations*/
|
|
314 ret = clFlush(queue);
|
|
315 ret = clFinish(queue);
|
|
316 ret = clReleaseKernel(hpfl);
|
|
317 ret = clReleaseKernel(trns);
|
|
318 ret = clReleaseKernel(sfac);
|
|
319 ret = clReleaseProgram(program);
|
|
320 ret = clReleaseMemObject(xmobj);
|
|
321 ret = clReleaseMemObject(rmobj);
|
|
322 ret = clReleaseMemObject(wmobj);
|
|
323 ret = clReleaseCommandQueue(queue);
|
|
324 ret = clReleaseContext(context);
|
|
325
|
|
326 destroyPGM(&ipgm);
|
|
327 destroyPGM(&opgm);
|
|
328
|
|
329 free(wm);
|
|
330 free(rm);
|
|
331 free(xm);
|
|
332
|
|
333 fprintf(stdout, "image out put succeeded.\n");
|
|
334 return 0;
|
|
335 }
|