changeset 2008:2c8eab01cc78 draft

implement fft using cuda
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 03 Jun 2014 18:10:19 +0900
parents bc2121b09cbc
children 113b1edd2a9a
files example/cuda_fft/main.cc example/cuda_fft/pgm.h
diffstat 2 files changed, 243 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/example/cuda_fft/main.cc	Tue Jun 03 16:02:06 2014 +0900
+++ b/example/cuda_fft/main.cc	Tue Jun 03 18:10:19 2014 +0900
@@ -2,6 +2,7 @@
 #include <sys/time.h>
 #include <string.h>
 #include <cuda.h>
+#include <vector_types.h>
 
 #include "pgm.h"
 
@@ -16,16 +17,6 @@
     inverse = 1
 };
 
-struct int2 {
-    int x;
-    int y;
-};
-
-struct float2 {
-    float x;
-    float y;
-};
-
 CUmodule module;
 
 static double
@@ -40,12 +31,12 @@
 {
     switch(y) {
     case 1:
-        block = x;
-        thread = 1;
+        *block = x;
+        *thread = 1;
         break;
     default:
-        block = x;
-        thread = y;
+        *block = x;
+        *thread = y;
         break;
     }
 
@@ -66,37 +57,38 @@
     setWorkSize(&block, &thread, n, n);
 
     CUfunction bitReverse;
-    cuModuleGetFunction(bitReverse, module, "bitReverse");
+    cuModuleGetFunction(&bitReverse, module, "bitReverse");
     
-    void* kernel_args[] = {&dst, &src, &m, &n};
+    void* bitReverse_args[] = {&dst, &src, &m, &n};
 
     cuLaunchKernel(bitReverse,
                    block, 1, 1,
                    thread, 1, 1,
-                   0, NULL, kernel_args, NULL);
+                   0, NULL, bitReverse_args, NULL);
 
     CUfunction butterfly;
-    cuModuleGetFunction(butterfly, module, "butterfly");
+    cuModuleGetFunction(&butterfly, module, "butterfly");
 
     setWorkSize(&block, &thread, n/2, n);
+    void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
     for (int i=1;i<=m;i++) {
-        kernel_args[] = {&dst, &spin, &m, &n, &i, &flag};
+        butterfly_args[4] = &i;
         cuLaunchKernel(butterfly,
                        block, 1, 1,
                        thread, 1, 1,
-                       0, NULL, kernel_args, NULL);
+                       0, NULL, butterfly_args, NULL);
     }
     
     CUfunction norm;
-    cuModuleGetFunction(norm, module, "norm");
+    cuModuleGetFunction(&norm, module, "norm");
 
+    void* norm_args[] = {&dst, &m};
     if (direction == inverse) {
         setWorkSize(&block, &thread, n, n);
-        kernel_args[] = {&dst, &m};
         cuLaunchKernel(norm,
                        block, 1, 1,
                        thread, 1, 1,
-                       0, NULL, kernel_args, NULL);
+                       0, NULL, norm_args, NULL);
     }
 
     return 0;
@@ -132,7 +124,7 @@
 
     cuModuleLoad(&module, "fft.ptx");
 
-    char* pgm_file = init(argc, argv);
+    char* pgm_file = init(args, argv);
 
     pgm_t ipgm;
     int err = readPGM(&ipgm, pgm_file);
@@ -171,74 +163,88 @@
     cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));
 
     CUfunction spinFact;
-    cuModuleGetFunction(spinFact, module, "spinFact");
+    cuModuleGetFunction(&spinFact, module, "spinFact");
     
     int block, thread;
     setWorkSize(&block, &thread, n/2, 1);
 
-    void* kernel_args[] = {&xmobj, &n};
+    void* spinFact_args[] = {&xmobj, &n};
     cuLaunchKernel(spinFact,
                    block, 1, 1,
                    thread, 1, 1,
-                   0, NULL, kernel_args, NULL);
+                   0, NULL, spinFact_args, NULL);
     
     fftCore(rmobj, xmobj, wmobj, m, forward);
     
-    CUfunction transfer;
-    cuModuleGetFunction(transfer, module, "transfer");
+    CUfunction transpose;
+    cuModuleGetFunction(&transpose, module, "transpose");
 
     setWorkSize(&block, &thread, n, n);
 
-    kernel_args[] = {&xmobj, &rmobj, &n};
-    cuLaunchKernel(transfer,
+    void* transpose_args[] = {&xmobj, &rmobj, &n};
+    cuLaunchKernel(transpose,
                    block, 1, 1,
                    thread, 1, 1,
-                   0, NULL, kernel_args, NULL);
+                   0, NULL, transpose_args, NULL);
 
     fftCore(rmobj, xmobj, wmobj, m, forward);
 
     CUfunction highPassFilter;
-    cuModuleGetFunction(transfer, module, "highPassFilter");
+    cuModuleGetFunction(&highPassFilter, module, "highPassFilter");
 
     setWorkSize(&block, &thread, n, n);
 
     int radius = n/8;
-    kernel_args[] = {&rmobj, &n, &radius};
+    void*highPassFilter_args[] = {&rmobj, &n, &radius};
     cuLaunchKernel(highPassFilter,
                    block, 1, 1,
                    thread, 1, 1,
-                   0, NULL, kernel_args, NULL);
+                   0, NULL, highPassFilter_args, NULL);
 
     fftCore(xmobj, rmobj, wmobj, m, inverse);
 
     setWorkSize(&block, &thread, n, n);
     
-    kernel_args[] = {&rmobj, &xmobj};
-    cuLaunchKernel(transfer,
+    void* transpose2_args[] = {&rmobj, &xmobj, &n};
+    cuLaunchKernel(transpose,
                    block, 1, 1,
                    thread, 1, 1,
-                   0, NULL, kernel_args, NULL);
+                   0, NULL, transpose2_args, NULL);
     
     fftCore(xmobj, rmobj, wmobj, m, inverse);
 
+    cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));
+
+    float* ampd;
+    ampd = (float*)malloc(n*n*sizeof(float));
+
+    for (int i=0;i<n*n;i++)
+        ampd[i] = (AMP(xm[i].x, xm[i].y));
     
+    opgm.width = n;
+    opgm.height = n;
+    normalizeF2PGM(&opgm, ampd);
+    free(ampd);
+
+    ed_time = getTime();
+
+    writePGM(&opgm, "output.pgm");
 
     // memory release
-    cuMemFree(devA);
-    for (int i=0;i<num_exec;i++) {
-        cuMemFree(devB[i]);
-        cuMemFree(devOut[i]);
-    }
-    for (int i=0;i<num_stream;i++)
-        cuStreamDestroy(stream[i]);
+    cuMemFree(xmobj);
+    cuMemFree(rmobj);
+    cuMemFree(wmobj);
     cuModuleUnload(module);
     cuCtxDestroy(context);
 
-    delete[] A;
-    delete[] B;
-    for (int i=0;i<num_exec;i++)
-        delete[] result[i];
-    delete[] result;
+    destroyPGM(&ipgm);
+    destroyPGM(&opgm);
+
+    free(xm);
+    free(rm);
+    free(wm);
+
+    printf("Time: %0.6f\n", ed_time-st_time);
 
     return 0;
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/cuda_fft/pgm.h	Tue Jun 03 18:10:19 2014 +0900
@@ -0,0 +1,187 @@
+#ifndef _PGM_H_
+#define _PGM_H_
+ 
+#include <math.h>
+#include <string.h>
+ 
+#define PGM_MAGIC "P5"
+ 
+#ifdef _WIN32
+#define STRTOK_R(ptr, del, saveptr) strtok_s(ptr, del, saveptr)
+#else
+#define STRTOK_R(ptr, del, saveptr) strtok_r(ptr, del, saveptr)
+#endif
+ 
+typedef struct _pgm_t {
+    int width;
+    int height;
+    unsigned char *buf;
+} pgm_t;
+ 
+int readPGM(pgm_t* pgm, const char* filename)
+{
+    char *token, *pc, *saveptr;
+    char *buf;
+    size_t bufsize;
+    char del[] = " \t\n";
+    unsigned char *dot;
+ 
+    long begin, end;
+    int filesize;
+    int i, w, h, luma, pixs;
+ 
+ 
+    FILE* fp;
+    if ((fp = fopen(filename, "rb"))==NULL) {
+        fprintf(stderr, "Failed to open file\n");
+        return -1;
+    }
+ 
+    fseek(fp, 0, SEEK_SET);
+    begin = ftell(fp);
+    fseek(fp, 0, SEEK_END);
+    end = ftell(fp);
+    filesize = (int)(end - begin);
+ 
+    buf = (char*)malloc(filesize * sizeof(char));
+    fseek(fp, 0, SEEK_SET);
+    bufsize = fread(buf, filesize * sizeof(char), 1, fp);
+ 
+    fclose(fp);
+ 
+    token = (char *)STRTOK_R(buf, del, &saveptr);
+    if (strncmp(token, PGM_MAGIC, 2) != 0) {
+        return -1;
+    }
+ 
+    token = (char *)STRTOK_R(NULL, del, &saveptr);
+    if (token[0] == '#' ) {
+        token = (char *)STRTOK_R(NULL, "\n", &saveptr);
+        token = (char *)STRTOK_R(NULL, del, &saveptr);
+    }
+ 
+    w = strtoul(token, &pc, 10);
+    token = (char *)STRTOK_R(NULL, del, &saveptr);
+    h = strtoul(token, &pc, 10);
+    token = (char *)STRTOK_R(NULL, del, &saveptr);
+    luma = strtoul(token, &pc, 10);
+ 
+    token = pc + 1;
+    pixs = w * h;
+ 
+    pgm->buf = (unsigned char *)malloc(pixs * sizeof(unsigned char));
+ 
+    dot = pgm->buf;
+ 
+    for (i=0; i< pixs; i++, dot++) {
+        *dot = *token++;
+    }
+ 
+    pgm->width = w;
+    pgm->height = h;
+ 
+    return 0;
+}
+ 
+int writePGM(pgm_t* pgm, const char* filename)
+{
+    int i, w, h, pixs;
+    FILE* fp;
+    unsigned char* dot;
+ 
+    w = pgm->width;
+    h = pgm->height;
+    pixs = w * h;
+ 
+    if ((fp = fopen(filename, "wb+")) ==NULL) {
+        fprintf(stderr, "Failed to open file\n");
+        return -1;
+    }
+ 
+    fprintf (fp, "%s\n%d %d\n255\n", PGM_MAGIC, w, h);
+ 
+    dot = pgm->buf;
+ 
+    for (i=0; i<pixs; i++, dot++) {
+        putc((unsigned char)*dot, fp);
+    }
+ 
+    fclose(fp);
+ 
+    return 0;
+}
+ 
+int normalizeD2PGM(pgm_t* pgm, double* x)
+{
+    int i, j, w, h;
+ 
+    w = pgm->width;
+    h = pgm->height;
+ 
+    pgm->buf = (unsigned char*)malloc(w * h * sizeof(unsigned char));
+ 
+    double min = 0;
+    double max = 0;
+    for (i=0; i < h; i++) {
+        for (j=0; j < w; j++) {
+            if (max < x[i*w+j])
+                max = x[i*w+j];
+            if (min > x[i*w+j])
+                min = x[i*w+j];
+        }
+    }
+ 
+    for (i=0; i < h; i++) {
+        for (j=0; j < w; j++) {
+            if((max-min)!=0)
+                pgm->buf[i*w+j] = (unsigned char)(255*(x[i*w+j]-min)/(max-min));
+            else
+                pgm->buf[i*w+j]= 0;
+        }
+    }
+ 
+    return 0;
+}
+ 
+int normalizeF2PGM(pgm_t* pgm, float* x)
+{
+    int i, j, w, h;
+ 
+    w = pgm->width;
+    h = pgm->height;
+ 
+    pgm->buf = (unsigned char*)malloc(w * h * sizeof(unsigned char));
+ 
+    float min = 0;
+    float max = 0;
+    for (i=0; i < h; i++) {
+        for (j=0; j < w; j++) {
+            if (max < x[i*w+j])
+                max = x[i*w+j];
+            if (min > x[i*w+j])
+                min = x[i*w+j];
+        }
+    }
+ 
+    for (i=0; i < h; i++) {
+        for (j=0; j < w; j++) {
+            if((max-min)!=0)
+                pgm->buf[i*w+j] = (unsigned char)(255*(x[i*w+j]-min)/(max-min));
+            else
+                pgm->buf[i*w+j]= 0;
+        }
+    }
+ 
+    return 0;
+}
+ 
+int destroyPGM(pgm_t* pgm)
+{
+    if (pgm->buf) {
+        free(pgm->buf);
+    }
+ 
+    return 0;
+}
+ 
+#endif /* _PGM_H_ */