Mercurial > hg > Game > Cerium

--- a/example/cuda_fft/fft.cu	Tue Jun 03 18:12:25 2014 +0900
+++ b/example/cuda_fft/fft.cu	Wed Jun 11 11:24:58 2014 +0900
@@ -123,7 +123,6 @@
     spinFact(float2* w, int n)
     {
         unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
-
         float2 angle;
         angle.x = (float)(2*i*PI/(float)n);
         angle.y = (float)((2*i*PI/(float)n) + PI_2);
--- a/example/cuda_fft/main.cc	Tue Jun 03 18:12:25 2014 +0900
+++ b/example/cuda_fft/main.cc	Wed Jun 11 11:24:58 2014 +0900
@@ -27,21 +27,22 @@
 }

 int
-setWorkSize(int* block, int* thread, int x, int y)
+setWorkSize(int* xblocks, int* yblocks, int x, int y)
 {
     switch(y) {
     case 1:
-        *block = x;
-        *thread = 1;
+        *xblocks = x;
+        *yblocks = 1;
         break;
     default:
-        *block = x;
-        *thread = y;
+        *xblocks = x;
+        *yblocks = y;
         break;
     }

     return 0;
 }
+
 int
 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
 {
@@ -53,8 +54,8 @@
     }

     int n = 1<<m;
-    int block, thread;
-    setWorkSize(&block, &thread, n, n);
+    int xblocks, yblocks;
+    setWorkSize(&xblocks, &yblocks, n, n);

     CUfunction bitReverse;
     cuModuleGetFunction(&bitReverse, module, "bitReverse");
@@ -62,32 +63,32 @@
     void* bitReverse_args[] = {&dst, &src, &m, &n};

     cuLaunchKernel(bitReverse,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, bitReverse_args, NULL);

     CUfunction butterfly;
     cuModuleGetFunction(&butterfly, module, "butterfly");

-    setWorkSize(&block, &thread, n/2, n);
+    setWorkSize(&xblocks, &yblocks, n/2, n);
     void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
     for (int i=1;i<=m;i++) {
         butterfly_args[4] = &i;
         cuLaunchKernel(butterfly,
-                       block, 1, 1,
-                       thread, 1, 1,
+                       xblocks, yblocks, 1,
+                       1, 1, 1,
                        0, NULL, butterfly_args, NULL);
     }

     CUfunction norm;
     cuModuleGetFunction(&norm, module, "norm");

-    void* norm_args[] = {&dst, &m};
+    void* norm_args[] = {&dst, &n};
     if (direction == inverse) {
-        setWorkSize(&block, &thread, n, n);
+        setWorkSize(&xblocks, &yblocks, n, n);
         cuLaunchKernel(norm,
-                       block, 1, 1,
-                       thread, 1, 1,
+                       xblocks, yblocks, 1,
+                       1, 1, 1,
                        0, NULL, norm_args, NULL);
     }

@@ -122,7 +123,7 @@
     CUcontext context;
     cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);

-    cuModuleLoad(&module, "fft.ptx");
+    printf("%u\n", cuModuleLoad(&module, "fft.ptx"));

     char* pgm_file = init(args, argv);

@@ -152,69 +153,76 @@
     // memory allocate
     CUdeviceptr xmobj;
     cuMemAlloc(&xmobj, n*n*sizeof(float2));
-
+
     CUdeviceptr rmobj;
     cuMemAlloc(&rmobj, n*n*sizeof(float2));

     CUdeviceptr wmobj;
-    cuMemAlloc(&wmobj, (n/2)*sizeof(float2));
+    cuMemAlloc(&wmobj, n/2*sizeof(float2));
+
+    CUfunction spinFact;
+    cuModuleGetFunction(&spinFact, module, "spinFact");
+
+    int xblocks, yblocks;
+    setWorkSize(&xblocks, &yblocks, n/2, 1);

     // Synchronous data transfer(host to device)
     cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));

-    CUfunction spinFact;
-    cuModuleGetFunction(&spinFact, module, "spinFact");
-
-    int block, thread;
-    setWorkSize(&block, &thread, n/2, 1);
-
-    void* spinFact_args[] = {&xmobj, &n};
+    void* spinFact_args[] = {&wmobj, &n};
     cuLaunchKernel(spinFact,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, spinFact_args, NULL);

+
     fftCore(rmobj, xmobj, wmobj, m, forward);

     CUfunction transpose;
     cuModuleGetFunction(&transpose, module, "transpose");

-    setWorkSize(&block, &thread, n, n);
+    setWorkSize(&xblocks, &yblocks, n, n);

     void* transpose_args[] = {&xmobj, &rmobj, &n};
     cuLaunchKernel(transpose,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, transpose_args, NULL);

+
     fftCore(rmobj, xmobj, wmobj, m, forward);
+

     CUfunction highPassFilter;
     cuModuleGetFunction(&highPassFilter, module, "highPassFilter");

-    setWorkSize(&block, &thread, n, n);
+    setWorkSize(&xblocks, &yblocks, n, n);

     int radius = n/8;
     void*highPassFilter_args[] = {&rmobj, &n, &radius};
     cuLaunchKernel(highPassFilter,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, highPassFilter_args, NULL);

+
     fftCore(xmobj, rmobj, wmobj, m, inverse);

-    setWorkSize(&block, &thread, n, n);
+    setWorkSize(&xblocks, &yblocks, n, n);

     void* transpose2_args[] = {&rmobj, &xmobj, &n};
     cuLaunchKernel(transpose,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, transpose2_args, NULL);

     fftCore(xmobj, rmobj, wmobj, m, inverse);

+
     cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));

+    cuStreamSynchronize(NULL);
+
     float* ampd;
     ampd = (float*)malloc(n*n*sizeof(float));