view fft_fixstart/main.cc @ 4:8df0d3128672

add time measurement function
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Mon, 04 Feb 2013 03:43:17 +0900
parents f3cfea46e585
children 3602b23914ad
line wrap: on
line source

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include "pgm.h"

#define PI 3.14159265358979

#define MAX_SOURCE_SIZE (0x100000)

#define AMP(a, b) (sqrt((a)*(a)+(b)*(b)))

static double st_time;
static double ed_time;

cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue queue = NULL;
cl_program program = NULL;
cl_device_type device_type = CL_DEVICE_TYPE_GPU;

enum Mode {
    forward = 0,
    inverse = 1
};

static double
getTime()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + (double)tv.tv_usec*1e-6;
}

int setWorkSize(size_t* gws, size_t* lws, cl_int x, cl_int y)
{
    switch(y) {
    case 1:
        gws[0] = x;
        gws[1] = 1;
        lws[0] = 1;
        lws[1] = 1;
        break;
    default:
        gws[0] = x;
        gws[1] = y;
        lws[0] = 1;
        lws[1] = 1;
        break;
    }

    return 0;
}

int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction)
{
    cl_int ret;

    cl_int iter;
    cl_uint flag;

    cl_int n = 1<<m;

    cl_event kernelDone;

    cl_kernel brev = NULL;
    cl_kernel bfly = NULL;
    cl_kernel norm = NULL;

    brev = clCreateKernel(program, "bitReverse", &ret);
    bfly = clCreateKernel(program, "butterfly", &ret);
    norm = clCreateKernel(program, "norm", &ret);

    size_t gws[2];
    size_t lws[2];

    switch (direction) {
    case forward:flag = 0x00000000; break;
    case inverse:flag = 0x80000000; break;
    }

    ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst);
    ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src);
    ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m);
    ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n);

    ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst);
    ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin);
    ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m);
    ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n);
    ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag);

    ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst);
    ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n);

    /* Reverse bit ordering */
    setWorkSize(gws, lws, n, n);
    ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL);

    /* Perform Butterfly Operations*/
    setWorkSize(gws, lws, n/2, n);
    for (iter=1; iter <= m; iter++) {
        ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter);
        ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone);
        ret = clWaitForEvents(1, &kernelDone);
    }

    if (direction == inverse) {
        setWorkSize(gws, lws, n, n);
        ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone);
        ret = clWaitForEvents(1, &kernelDone);
    }

    ret = clReleaseKernel(bfly);
    ret = clReleaseKernel(brev);
    ret = clReleaseKernel(norm);

    return 0;
}

char *
init(int argc, char**argv){
    
    char *filename = 0;

    for (int i = 1; argv[i]; ++i) {
        if (strcmp(argv[i], "-file") == 0) {
            filename = argv[i+1];
        } else if (strcmp(argv[i], "-cpu") == 0) {
            device_type = CL_DEVICE_TYPE_CPU;
        } else if (strcmp(argv[i], "-gpu") == 0) {
            device_type = CL_DEVICE_TYPE_GPU;
        }
    }
    if ( (argc == 1)||(filename==0)) {
        printf("Usage: ./fft [image filename]\n");
        exit(-1);
    }

    return filename;
}

int main(int argc, char** argv) {
    cl_mem xmobj = NULL;
    cl_mem rmobj = NULL;
    cl_mem wmobj = NULL;
    cl_kernel sfac = NULL;
    cl_kernel trns = NULL;
    cl_kernel hpfl = NULL;

    cl_platform_id platform_id = NULL;

    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;

    cl_int ret;

    cl_float2 *xm;
    cl_float2 *rm;
    cl_float2 *wm;

    pgm_t ipgm;
    pgm_t opgm;
    
    const char fileName[] = "./fft.cl";
    size_t source_size;
    char *source_str;
    cl_int i, j;
    cl_int n;
    cl_int m;

    size_t gws[2];
    size_t lws[2];

    /* Load kernel source code */
    int fd = open(fileName, O_RDONLY);
    
    if (fd<0) {
        fprintf(stderr, "Failed to load kernel %s.\n",fileName);
        exit(1);
    }
    struct stat stats;
    fstat(fd, &stats);
    off_t size = stats.st_size;
    if (size<=0) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)alloca(size);
    source_size = read(fd, source_str, size);
    close( fd );

    char * pgm_file = init(argc,argv);
    
    /* Read image */
    int err = readPGM(&ipgm, pgm_file);
    if (err<0) {
        fprintf(stderr, "Failed to read image file.\n");
        exit(1);
    }

    n = ipgm.width;
    m = (cl_int)(log((double)n)/log(2.0));

    xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
    rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
    wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));

    for (i=0; i < n; i++) {
        for (j=0; j < n; j++) {
            ((float*)xm)[(2*n*j)+2*i+0] = (float)ipgm.buf[n*j+i];
            ((float*)xm)[(2*n*j)+2*i+1] = (float)0;
        }
    }

    st_time = getTime();
    /* Get platform/device  */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* Create OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command queue */
    queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /* Create Buffer Objects */
    xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
    rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret);
    wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret);

    /* Transfer data to memory buffer */
    ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);

    /* Create kernel program from source */
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);

    /* Build kernel program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    if (ret<0) {
        size_t size;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
        
        char *log = new char[size];
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
        printf("%s ",log);
        exit (ret);
    }

    /* Create OpenCL Kernel */
    sfac = clCreateKernel(program, "spinFact", &ret);
    trns = clCreateKernel(program, "transpose", &ret);
    hpfl = clCreateKernel(program, "highPassFilter", &ret);

    /* Create spin factor */
    ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj);
    ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n);
    setWorkSize(gws, lws, n/2, 1);
    ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL);

    /* Butterfly Operation */
    fftCore(rmobj, xmobj, wmobj, m, forward);

    /* Transpose matrix */
    ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj);
    ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj);
    ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n);
    setWorkSize(gws, lws, n, n);
    ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);

    /* Butterfly Operation */
    fftCore(rmobj, xmobj, wmobj, m, forward);

    /* Apply high-pass filter */
    cl_int radius = n/8;
    ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj);
    ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n);
    ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius);
    setWorkSize(gws, lws, n, n);
    ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL);

    /* Inverse FFT */

    /* Butterfly Operation */
    fftCore(xmobj, rmobj, wmobj, m, inverse);

    /* Transpose matrix */
    ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj);
    ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj);
    setWorkSize(gws, lws, n, n);
    ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL);

    /* Butterfly Operation */
    
    fftCore(xmobj, rmobj, wmobj, m, inverse);
    
    /* Read data from memory buffer */ 
    ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL);

    /*  */
    float* ampd;
    ampd = (float*)malloc(n*n*sizeof(float));
    for (i=0; i < n; i++) {
        for (j=0; j < n; j++) {
            ampd[n*((i))+((j))] = (AMP(((float*)xm)[(2*n*i)+2*j], ((float*)xm)[(2*n*i)+2*j+1]));
        }
    }
    opgm.width = n;
    opgm.height = n;
    normalizeF2PGM(&opgm, ampd);
    free(ampd);

    /* Write out image */
    writePGM(&opgm, "output.pgm");

    /* Finalizations*/
    ret = clFlush(queue);
    ret = clFinish(queue);
    ed_time = getTime();
    ret = clReleaseKernel(hpfl);
    ret = clReleaseKernel(trns);
    ret = clReleaseKernel(sfac);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(xmobj);
    ret = clReleaseMemObject(rmobj);
    ret = clReleaseMemObject(wmobj);
    ret = clReleaseCommandQueue(queue);
    ret = clReleaseContext(context);

    destroyPGM(&ipgm);
    destroyPGM(&opgm);

    free(wm);
    free(rm);
    free(xm);

    fprintf(stdout, "image out put succeeded.\n");
    printf("Time: %0.6f\n",ed_time-st_time);
    return 0;
}