diff fft_fixstart/main.cc @ 3:f3cfea46e585

add fft_fixstar sample
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Mon, 04 Feb 2013 02:59:58 +0900
parents
children 8df0d3128672
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_fixstart/main.cc	Mon Feb 04 02:59:58 2013 +0900
@@ -0,0 +1,335 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "pgm.h"
+
+#define PI 3.14159265358979
+
+#define MAX_SOURCE_SIZE (0x100000)
+
+#define AMP(a, b) (sqrt((a)*(a)+(b)*(b)))
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue queue = NULL;
+cl_program program = NULL;
+cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+
+enum Mode {
+    forward = 0,
+    inverse = 1
+};
+
+int setWorkSize(size_t* gws, size_t* lws, cl_int x, cl_int y)
+{
+    switch(y) {
+    case 1:
+        gws[0] = x;
+        gws[1] = 1;
+        lws[0] = 1;
+        lws[1] = 1;
+        break;
+    default:
+        gws[0] = x;
+        gws[1] = y;
+        lws[0] = 1;
+        lws[1] = 1;
+        break;
+    }
+
+    return 0;
+}
+
+int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction)
+{
+    cl_int ret;
+
+    cl_int iter;
+    cl_uint flag;
+
+    cl_int n = 1<<m;
+
+    cl_event kernelDone;
+
+    cl_kernel brev = NULL;
+    cl_kernel bfly = NULL;
+    cl_kernel norm = NULL;
+
+    brev = clCreateKernel(program, "bitReverse", &ret);
+    bfly = clCreateKernel(program, "butterfly", &ret);
+    norm = clCreateKernel(program, "norm", &ret);
+
+    size_t gws[2];
+    size_t lws[2];
+
+    switch (direction) {
+    case forward:flag = 0x00000000; break;
+    case inverse:flag = 0x80000000; break;
+    }
+
+    ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst);
+    ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src);
+    ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m);
+    ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n);
+
+    ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst);
+    ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin);
+    ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m);
+    ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n);
+    ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag);
+
+    ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst);
+    ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n);
+
+    /* Reverse bit ordering */
+    setWorkSize(gws, lws, n, n);
+    ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL);
+
+    /* Perform Butterfly Operations*/
+    setWorkSize(gws, lws, n/2, n);
+    for (iter=1; iter <= m; iter++) {
+        ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter);
+        ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone);
+        ret = clWaitForEvents(1, &kernelDone);
+    }
+
+    if (direction == inverse) {
+        setWorkSize(gws, lws, n, n);
+        ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone);
+        ret = clWaitForEvents(1, &kernelDone);
+    }
+
+    ret = clReleaseKernel(bfly);
+    ret = clReleaseKernel(brev);
+    ret = clReleaseKernel(norm);
+
+    return 0;
+}
+
+char *
+init(int argc, char**argv){
+    
+    char *filename = 0;
+
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "-file") == 0) {
+            filename = argv[i+1];
+        } else if (strcmp(argv[i], "-cpu") == 0) {
+            device_type = CL_DEVICE_TYPE_CPU;
+        } else if (strcmp(argv[i], "-gpu") == 0) {
+            device_type = CL_DEVICE_TYPE_GPU;
+        }
+    }
+    if ( (argc == 1)||(filename==0)) {
+        printf("Usage: ./fft [image filename]\n");
+        exit(-1);
+    }
+
+    return filename;
+}
+
+int main(int argc, char** argv) {
+    cl_mem xmobj = NULL;
+    cl_mem rmobj = NULL;
+    cl_mem wmobj = NULL;
+    cl_kernel sfac = NULL;
+    cl_kernel trns = NULL;
+    cl_kernel hpfl = NULL;
+
+    cl_platform_id platform_id = NULL;
+
+    cl_uint ret_num_devices;
+    cl_uint ret_num_platforms;
+
+    cl_int ret;
+
+    cl_float2 *xm;
+    cl_float2 *rm;
+    cl_float2 *wm;
+
+    pgm_t ipgm;
+    pgm_t opgm;
+    
+    const char fileName[] = "./fft.cl";
+    size_t source_size;
+    char *source_str;
+    cl_int i, j;
+    cl_int n;
+    cl_int m;
+
+    size_t gws[2];
+    size_t lws[2];
+
+    /* Load kernel source code */
+    int fd = open(fileName, O_RDONLY);
+    
+    if (fd<0) {
+        fprintf(stderr, "Failed to load kernel %s.\n",fileName);
+        exit(1);
+    }
+    struct stat stats;
+    fstat(fd, &stats);
+    off_t size = stats.st_size;
+    if (size<=0) {
+        fprintf(stderr, "Failed to load kernel.\n");
+        exit(1);
+    }
+    source_str = (char*)alloca(size);
+    source_size = read(fd, source_str, size);
+    close( fd );
+
+    char * pgm_file = init(argc,argv);
+    
+    /* Read image */
+    int err = readPGM(&ipgm, pgm_file);
+    if (err<0) {
+        fprintf(stderr, "Failed to read image file.\n");
+        exit(1);
+    }
+
+    n = ipgm.width;
+    m = (cl_int)(log((double)n)/log(2.0));
+
+    xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
+    rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
+    wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));
+
+    for (i=0; i < n; i++) {
+        for (j=0; j < n; j++) {
+            ((float*)xm)[(2*n*j)+2*i+0] = (float)ipgm.buf[n*j+i];
+            ((float*)xm)[(2*n*j)+2*i+1] = (float)0;
+        }
+    }
+
+    /* Get platform/device  */
+    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
+    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
+
+    /* Create OpenCL context */
+    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
+
+    /* Create Command queue */
+    queue = clCreateCommandQueue(context, device_id, 0, &ret);
+
+    /* Create Buffer Objects */
+    xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
+    rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
+    wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret);
+
+    /* Transfer data to memory buffer */
+    ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);
+
+    /* Create kernel program from source */
+    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
+
+    /* Build kernel program */
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+
+    if (ret<0) {
+        size_t size;
+        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
+        
+        char *log = new char[size];
+        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
+        printf("%s ",log);
+        exit (ret);
+    }
+
+    /* Create OpenCL Kernel */
+    sfac = clCreateKernel(program, "spinFact", &ret);
+    trns = clCreateKernel(program, "transpose", &ret);
+    hpfl = clCreateKernel(program, "highPassFilter", &ret);
+
+    /* Create spin factor */
+    ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj);
+    ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n);
+    setWorkSize(gws, lws, n/2, 1);
+    ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL);
+
+    /* Butterfly Operation */
+    fftCore(rmobj, xmobj, wmobj, m, forward);
+
+    /* Transpose matrix */
+    ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj);
+    ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj);
+    ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n);
+    setWorkSize(gws, lws, n, n);
+    ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);
+
+    /* Butterfly Operation */
+    fftCore(rmobj, xmobj, wmobj, m, forward);
+
+    /* Apply high-pass filter */
+    cl_int radius = n/8;
+    ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj);
+    ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n);
+    ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius);
+    setWorkSize(gws, lws, n, n);
+    ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL);
+
+    /* Inverse FFT */
+
+    /* Butterfly Operation */
+    fftCore(xmobj, rmobj, wmobj, m, inverse);
+
+    /* Transpose matrix */
+    ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj);
+    ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj);
+    setWorkSize(gws, lws, n, n);
+    ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);
+
+    /* Butterfly Operation */
+    
+    fftCore(xmobj, rmobj, wmobj, m, inverse);
+    
+    /* Read data from memory buffer */ 
+    ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);
+
+    /*  */
+    float* ampd;
+    ampd = (float*)malloc(n*n*sizeof(float));
+    for (i=0; i < n; i++) {
+        for (j=0; j < n; j++) {
+            ampd[n*((i))+((j))] = (AMP(((float*)xm)[(2*n*i)+2*j], ((float*)xm)[(2*n*i)+2*j+1]));
+        }
+    }
+    opgm.width = n;
+    opgm.height = n;
+    normalizeF2PGM(&opgm, ampd);
+    free(ampd);
+
+    /* Write out image */
+    writePGM(&opgm, "output.pgm");
+
+    /* Finalizations*/
+    ret = clFlush(queue);
+    ret = clFinish(queue);
+    ret = clReleaseKernel(hpfl);
+    ret = clReleaseKernel(trns);
+    ret = clReleaseKernel(sfac);
+    ret = clReleaseProgram(program);
+    ret = clReleaseMemObject(xmobj);
+    ret = clReleaseMemObject(rmobj);
+    ret = clReleaseMemObject(wmobj);
+    ret = clReleaseCommandQueue(queue);
+    ret = clReleaseContext(context);
+
+    destroyPGM(&ipgm);
+    destroyPGM(&opgm);
+
+    free(wm);
+    free(rm);
+    free(xm);
+
+    fprintf(stdout, "image out put succeeded.\n");    
+    return 0;
+}