changeset 2006:f6aa6d6a3fa2 draft

add fft using cuda, not running
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 03 Jun 2014 12:07:00 +0900
parents 4d1bc8cd3a62
children bc2121b09cbc
files example/Cuda/main.cc example/Cuda/multiply.cu example/cuda_fft/Makefile example/cuda_fft/Makefile.def example/cuda_fft/fft.cu example/cuda_fft/main.cc
diffstat 6 files changed, 492 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/example/Cuda/main.cc	Tue May 27 14:23:44 2014 +0900
+++ b/example/Cuda/main.cc	Tue Jun 03 12:07:00 2014 +0900
@@ -3,8 +3,8 @@
 #include <string.h>
 #include <cuda.h>
 
-#define LENGTH 10000
-#define THREAD 100
+#define LENGTH 10
+#define THREAD 10
 
 static double
 getTime() {
--- a/example/Cuda/multiply.cu	Tue May 27 14:23:44 2014 +0900
+++ b/example/Cuda/multiply.cu	Tue Jun 03 12:07:00 2014 +0900
@@ -1,5 +1,6 @@
 extern "C" {
-    __global__ void multiply(float* A, float* B, float* C) {
+    __global__ void multiply(float* A, float* B, float* C,int* i) {
+        printf("%d %d\n",i[0],i[1]);
         int index = blockIdx.x * blockDim.x + threadIdx.x;
         C[index] = A[index] * B[0];
     }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/cuda_fft/Makefile	Tue Jun 03 12:07:00 2014 +0900
@@ -0,0 +1,37 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE = # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+CUDA_SRCS_TMP = $(wildcard *.cu)
+CUDA_SRCS_EXCLUDE = # 除外するファイルを書く
+CUDA_SRCS = $(filter-out $(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP))
+CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx)
+
+CC += $(ABI)
+
+LIBS = -F/Library/Frameworks -framework CUDA
+INCLUDE = -I$(CUDA_PATH)
+
+.SUFFIXES: .cc .o .cu .ptx
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+.cu.ptx:
+	$(NVCC) $(NVCCFLAGS) $<
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET)
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	rm -f *~ \#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/cuda_fft/Makefile.def	Tue Jun 03 12:07:00 2014 +0900
@@ -0,0 +1,8 @@
+TARGET = multiply
+
+OPT = -g -O0
+
+CC = clang++
+NVCC = nvcc
+CFLAGS = -Wall $(OPT)
+NVCCFLAGS = -ptx -arch=sm_20
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/cuda_fft/fft.cu	Tue Jun 03 12:07:00 2014 +0900
@@ -0,0 +1,198 @@
+extern "C" {
+    
+    __global__ void
+    bitReverse(long* param, float* src, float* dst)
+    {
+        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
+        
+        unsigned int j = gid;
+        
+        unsigned long m = param[0];
+        unsigned long n = param[1];
+        
+        j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
+        j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
+        j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
+        j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
+        j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;
+        
+        j >>= (32-m);
+        
+        dst[(nid*n+j)*2] = src[(nid*n+gid)*2];
+        dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1];
+    }
+}
+extern "C" {
+    
+    __global__ void
+    bitReverse(long* param, float* src, float* dst)
+    {
+        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
+        
+        unsigned int j = gid;
+        
+        unsigned long m = param[0];
+        unsigned long n = param[1];
+        
+        j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
+        j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
+        j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
+        j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
+        j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;
+        
+        j >>= (32-m);
+        
+        dst[(nid*n+j)*2] = src[(nid*n+gid)*2];
+        dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1];
+    }
+}
+extern "C" {
+    __global__ void
+    butterfly(long* param, float* x_in, float* w, float* x_out)
+    {
+        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
+        
+        long n = param[0];
+        long direction_flag = param[1];
+        long iter = param[2];
+        
+        int butterflySize = 1 << (iter-1);
+        int butterflyGrpDist = 1 << iter;
+        int butterflyGrpNum = n >> iter;
+        int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist);
+        int butterflyGrpOffset = gid & (butterflySize-1);
+        
+        int a = nid * n + butterflyGrpBase + butterflyGrpOffset;
+        int b = a + butterflySize;
+        
+        int l = butterflyGrpNum * butterflyGrpOffset;
+        
+        float xa[2], xb[2], xbxx[2], xbyy[2], wab[2], wayx[2], wbyx[2], resa[2], resb[2];
+        
+        xa[0] = x_in[2*a];
+        xa[1] = x_in[2*a+1];
+        xb[0] = x_in[2*b];
+        xb[1] = x_in[2*b+1];
+        xbxx[0] = xbxx[1] = xb[0];
+        xbyy[0] = xbyy[1] = xb[1];
+        
+        wab[0] = w[2*l];
+        if(direction_flag == 0x80000000) {
+            wab[1] = -w[2*l+1];
+        } else {
+            wab[1] = w[2*l+1];
+        }
+        
+        wayx[0] = -wab[1];
+        wayx[1] = wab[0];
+        
+        wbyx[0] = wab[1];
+        wbyx[1] = -wab[0];
+        
+        resa[0] = xa[0] + xbxx[0]*wab[0] + xbyy[0]*wayx[0];
+        resa[1] = xa[1] + xbxx[1]*wab[1] + xbyy[1]*wayx[1];
+        
+        resb[0] = xa[0] - xbxx[0]*wab[0] + xbyy[0]*wbyx[0];
+        resb[1] = xa[1] - xbxx[1]*wab[1] + xbyy[1]*wbyx[1];
+
+        x_out[2*a] = resa[0];
+        x_out[2*a+1] = resa[1];
+        x_out[2*b] = resb[0];
+        x_out[2*b+1] = resb[1];
+    }
+}
+extern "C" {
+    __global__ void
+    highPassFilter(long* param, float* in, float* image)
+    {
+        unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
+
+        long n = param[0];
+        long radius = param[1];
+        
+        int n_2[2];
+        n_2[0] = n_2[1] = n>>1;
+    
+        int mask[2];
+        mask[0] = mask[1] = n-1;
+
+        int gid[2];
+        gid[0] = (xgid + n_2[0]) & mask[0];
+        gid[1] = (ygid + n_2[1]) & mask[1];
+
+        int diff[2];
+        diff[0] = n_2[0] - gid[0];
+        diff[1] = n_2[1] - gid[1];
+    
+        int diff2[2];
+        diff2[0] = diff[0] * diff[0];
+        diff2[1] = diff[1] * diff[1];
+
+        int dist2 = diff2[0] + diff2[1];
+
+        int window[2];
+
+        if (dist2 < radius*radius) {
+            window[0] = window[1] = (int)0L;
+        } else {
+            window[0] = window[1] = (int)-1L;
+        }
+
+        image[(ygid*n+xgid)*2] = (float)((int)in[(ygid*n+xgid)*2] & window[0]);
+        image[(ygid*n+xgid)*2+1] = (float)((int)in[(ygid*n+xgid)*2+1] & window[1]);
+    }
+}
+extern "C" {
+    __global__ void
+    norm(long* param, float* in_x,float* out_x)
+    {
+        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1);
+        
+        long n = param[0];
+        
+        out_x[(nid*n+gid)*2] = in_x[(nid*n+gid)*2] / (float)n;
+        out_x[(nid*n+gid)*2+1] = in_x[(nid*n+gid)*2+1] / (float)n;
+    }
+}
+extern "C" {
+#include <math.h>
+    
+#define PI 3.14159265358979323846
+#define PI_2 1.57079632679489661923
+    
+    __global__ void
+    spinFact(long* param, float* w)
+    {
+        unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+    
+        long n = param[0];
+
+        float angle[2];
+        angle[0] = (float)(2*i*PI/(float)n);
+        angle[1] = (float)((2*i*PI/(float)n) + PI_2);
+
+        w[2*i] = cos(angle[0]);
+        w[2*i+1] = cos(angle[1]);
+    }
+}
+extern "C" {
+    __global__ void 
+    transpose(long* param, float* src, float* dst)
+    {
+        unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1);
+
+        long n = param[0];
+
+        unsigned int iid = ygid * n + xgid;
+        unsigned int oid = xgid * n + ygid;
+
+        dst[2*oid] = src[2*iid];
+        dst[2*oid+1] = src[2*iid+1];
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/cuda_fft/main.cc	Tue Jun 03 12:07:00 2014 +0900
@@ -0,0 +1,245 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <cuda.h>
+
+#include "pgm.h"
+
+#define PI 3.14159265358979
+#define MAX_SOURCE_SIZE (0x100000)
+#define AMP(a, b) (sqrt((a)*(a)+(b)))
+
+static double st_time;
+static double ed_time;
+enum Mode {
+    forward = 0,
+    inverse = 1
+};
+
+struct int2 {
+    int x;
+    int y;
+};
+
+struct float2 {
+    float x;
+    float y;
+};
+
+CUmodule module;
+
+static double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+int
+setWorkSize(int* block, int* thread, int x, int y)
+{
+    switch(y) {
+    case 1:
+        block = x;
+        thread = 1;
+        break;
+    default:
+        block = x;
+        thread = y;
+        break;
+    }
+
+    return 0;
+}
+int
+fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
+{
+
+    unsigned int flag;
+    switch (direction) {
+    case forward:flag = 0x00000000; break;
+    case inverse:flag = 0x80000000; break;
+    }
+
+    int n = 1<<m;
+    int block, thread;
+    setWorkSize(&block, &thread, n, n);
+
+    CUfunction bitReverse;
+    cuModuleGetFunction(bitReverse, module, "bitReverse");
+    
+    void* kernel_args[] = {&dst, &src, &m, &n};
+
+    cuLaunchKernel(bitReverse,
+                   block, 1, 1,
+                   thread, 1, 1,
+                   0, NULL, kernel_args, NULL);
+
+    CUfunction butterfly;
+    cuModuleGetFunction(butterfly, module, "butterfly");
+
+    setWorkSize(&block, &thread, n/2, n);
+    for (int i=1;i<=m;i++) {
+        kernel_args[] = {&dst, &spin, &m, &n, &i, &flag};
+        cuLaunchKernel(butterfly,
+                       block, 1, 1,
+                       thread, 1, 1,
+                       0, NULL, kernel_args, NULL);
+    }
+    
+    CUfunction norm;
+    cuModuleGetFunction(norm, module, "norm");
+
+    if (direction == inverse) {
+        setWorkSize(&block, &thread, n, n);
+        kernel_args[] = {&dst, &m};
+        cuLaunchKernel(norm,
+                       block, 1, 1,
+                       thread, 1, 1,
+                       0, NULL, kernel_args, NULL);
+    }
+
+    return 0;
+}
+
+char*
+init(int argc, char**argv){
+
+    char *filename = 0;
+
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "-file") == 0) {
+            filename = argv[i+1];
+        }
+    }
+
+    if ( (argc == 1)||(filename==0)) {
+        printf("Usage: ./fft -file [image filename] \n");
+        exit(-1);
+    }
+
+    return filename;
+}
+
+int main(int args, char* argv[]) {
+    // initialize and load kernel
+    cuInit(0);
+
+    CUdevice device;
+    cuDeviceGet(&device, 0);
+
+    CUcontext context;
+    cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
+
+    cuModuleLoad(&module, "fft.ptx");
+
+    char* pgm_file = init(argc, argv);
+
+    pgm_t ipgm;
+    int err = readPGM(&ipgm, pgm_file);
+    if (err<0) {
+        fprintf(stderr, "Failed to read image file.\n");
+        exit(1);
+    }
+
+    int n = ipgm.width;
+    int m = (int)(log((double)n)/log(2.0));
+
+    pgm_t opgm;
+
+    float2* xm = (float2*)malloc(n*n*sizeof(float2));
+    float2* rm = (float2*)malloc(n*n*sizeof(float2));
+    float2* wm = (float2*)malloc(n/2*sizeof(float2));
+
+    for (int i=0; i<n*n; i++) {
+        xm[i].x = (float)ipgm.buf[i];
+        xm[i].y = (float)0;
+    }
+    
+    st_time = getTime();
+
+    // memory allocate
+    CUdeviceptr xmobj;
+    cuMemAlloc(&xmobj, n*n*sizeof(float2));
+
+    CUdeviceptr rmobj;
+    cuMemAlloc(&rmobj, n*n*sizeof(float2));
+
+    CUdeviceptr wmobj;
+    cuMemAlloc(&wmobj, (n/2)*sizeof(float2));
+
+    // Synchronous data transfer(host to device)
+    cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));
+
+    CUfunction spinFact;
+    cuModuleGetFunction(spinFact, module, "spinFact");
+    
+    int block, thread;
+    setWorkSize(&block, &thread, n/2, 1);
+
+    void* kernel_args[] = {&xmobj, &n};
+    cuLaunchKernel(spinFact,
+                   block, 1, 1,
+                   thread, 1, 1,
+                   0, NULL, kernel_args, NULL);
+    
+    fftCore(rmobj, xmobj, wmobj, m, forward);
+    
+    CUfunction transfer;
+    cuModuleGetFunction(transfer, module, "transfer");
+
+    setWorkSize(&block, &thread, n, n);
+
+    kernel_args[] = {&xmobj, &rmobj, &n};
+    cuLaunchKernel(transfer,
+                   block, 1, 1,
+                   thread, 1, 1,
+                   0, NULL, kernel_args, NULL);
+
+    fftCore(rmobj, xmobj, wmobj, m, forward);
+
+    CUfunction highPassFilter;
+    cuModuleGetFunction(transfer, module, "highPassFilter");
+
+    setWorkSize(&block, &thread, n, n);
+
+    int radius = n/8;
+    kernel_args[] = {&rmobj, &n, &radius};
+    cuLaunchKernel(highPassFilter,
+                   block, 1, 1,
+                   thread, 1, 1,
+                   0, NULL, kernel_args, NULL);
+
+    fftCore(xmobj, rmobj, wmobj, m, inverse);
+
+    setWorkSize(&block, &thread, n, n);
+    
+    kernel_args[] = {&rmobj, &xmobj};
+    cuLaunchKernel(transfer,
+                   block, 1, 1,
+                   thread, 1, 1,
+                   0, NULL, kernel_args, NULL);
+    
+    fftCore(xmobj, rmobj, wmobj, m, inverse);
+
+    
+
+    // memory release
+    cuMemFree(devA);
+    for (int i=0;i<num_exec;i++) {
+        cuMemFree(devB[i]);
+        cuMemFree(devOut[i]);
+    }
+    for (int i=0;i<num_stream;i++)
+        cuStreamDestroy(stream[i]);
+    cuModuleUnload(module);
+    cuCtxDestroy(context);
+
+    delete[] A;
+    delete[] B;
+    for (int i=0;i<num_exec;i++)
+        delete[] result[i];
+    delete[] result;
+
+    return 0;
+}