# HG changeset patch # User Yuhi TOMARI # Date 1360533144 -32400 # Node ID e38bef2012bc9450a89897388cd03ad240dcafac # Parent ed3d4a769bf3c40987766e531ca3149adb84c0f0 fix sync bug. and NDrange option diff -r ed3d4a769bf3 -r e38bef2012bc fft_fixstart/main.cc --- a/fft_fixstart/main.cc Tue Feb 05 17:21:18 2013 +0900 +++ b/fft_fixstart/main.cc Mon Feb 11 06:52:24 2013 +0900 @@ -21,6 +21,7 @@ static double st_time; static double ed_time; +int ndrange_flag; cl_device_id device_id = NULL; cl_context context = NULL; @@ -102,21 +103,33 @@ ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst); ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n); - /* Reverse bit ordering */ - setWorkSize(gws, lws, n, n); - ret = clEnqueueTask(queue, brev, 0, NULL, NULL); + /* Reversee bit ordering */ + if (ndrange_flag == 1) { + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL); + } else { + ret = clEnqueueTask(queue, brev, 0, NULL, NULL); + } /* Perform Butterfly Operations*/ setWorkSize(gws, lws, n/2, n); for (iter=1; iter <= m; iter++) { ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter); - ret = clEnqueueTask(queue, bfly, 0, NULL, NULL); + if (ndrange_flag == 1) { + ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone); + } else { + ret = clEnqueueTask(queue, bfly, 0, NULL, &kernelDone); + } ret = clWaitForEvents(1, &kernelDone); } if (direction == inverse) { - setWorkSize(gws, lws, n, n); - ret = clEnqueueTask(queue, norm, 0, NULL, NULL); + if (ndrange_flag == 1) { + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone); + } else { + ret = clEnqueueTask(queue, norm, 0, NULL, &kernelDone); + } ret = clWaitForEvents(1, &kernelDone); } @@ -141,6 +154,8 @@ device_type = CL_DEVICE_TYPE_GPU; } else if (strcmp(argv[i], "-all") == 0) { device_type = CL_DEVICE_TYPE_ALL; + } else if (strcmp(argv[i], "-nd") == 0 ) { + ndrange_flag = 1; } } if ( (argc == 1)||(filename==0)) { @@ -166,10 +181,26 @@ cl_int ret; + cl_float2 *xm; cl_float2 *rm; cl_float2 *wm; + /* + * typedef union + * { + * cl_float CL_ALIGNED(8) s[2]; + * #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + * __extension__ struct{ cl_float x, y; }; + * __extension__ struct{ cl_float s0, s1; }; + * __extension__ struct{ cl_float lo, hi; }; + * #endif + * #if defined( __CL_FLOAT2__) + * __cl_float2 v2; + * #endif + * }cl_float2; + */ + pgm_t ipgm; pgm_t opgm; @@ -240,7 +271,7 @@ xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret); - + /* Transfer data to memory buffer */ ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); @@ -268,8 +299,12 @@ /* Create spin factor */ ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj); ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n); - setWorkSize(gws, lws, n/2, 1); - ret = clEnqueueTask(queue, sfac, 0, NULL, NULL); + if (ndrange_flag == 1) { + setWorkSize(gws, lws, n/2, 1); + ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL); + } else { + ret = clEnqueueTask(queue, sfac, 0, NULL, NULL); + } /* Butterfly Operation */ fftCore(rmobj, xmobj, wmobj, m, forward); @@ -278,8 +313,12 @@ ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj); ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj); ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n); - setWorkSize(gws, lws, n, n); - ret = clEnqueueTask(queue, trns, 0, NULL, NULL); + if (ndrange_flag == 1) { + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); + } else { + ret = clEnqueueTask(queue, trns, 0, NULL, NULL); + } /* Butterfly Operation */ fftCore(rmobj, xmobj, wmobj, m, forward); @@ -289,8 +328,12 @@ ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj); ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n); ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius); - setWorkSize(gws, lws, n, n); - ret = clEnqueueTask(queue, hpfl, 0, NULL, NULL); + if (ndrange_flag == 1) { + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL); + } else { + ret = clEnqueueTask(queue, hpfl, 0, NULL, NULL); + } /* Inverse FFT */ @@ -300,9 +343,12 @@ /* Transpose matrix */ ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj); ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj); - setWorkSize(gws, lws, n, n); - ret = clEnqueueTask(queue, trns, 0, NULL, NULL); - + if (ndrange_flag == 1) { + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); + } else { + ret = clEnqueueTask(queue, trns, 0, NULL, NULL); + } /* Butterfly Operation */ fftCore(xmobj, rmobj, wmobj, m, inverse);