Mercurial > hg > Members > yuuhi > OpenCL
diff fft_Example/fft_internal.h @ 2:ccea4e6a1945
add OpenCL example
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 22 Jan 2013 23:19:41 +0900 |
parents | |
children | ea2e7ce9d5bb |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft_Example/fft_internal.h Tue Jan 22 23:19:41 2013 +0900 @@ -0,0 +1,163 @@ + +// +// File: fft_internal.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_INTERNAL_H +#define __CLFFT_INTERNAL_H + +#include "clFFT.h" +#include <iostream> +#include <string> +#include <sstream> + +using namespace std; + +typedef enum kernel_dir_t +{ + cl_fft_kernel_x, + cl_fft_kernel_y, + cl_fft_kernel_z +}cl_fft_kernel_dir; + +typedef struct kernel_info_t +{ + cl_kernel kernel; + char *kernel_name; + unsigned lmem_size; + unsigned num_workgroups; + unsigned num_xforms_per_workgroup; + unsigned num_workitems_per_workgroup; + cl_fft_kernel_dir dir; + int in_place_possible; + kernel_info_t *next; +}cl_fft_kernel_info; + +typedef struct +{ + // context in which fft resources are created and kernels are executed + cl_context context; + + // size of signal + clFFT_Dim3 n; + + // dimension of transform ... must be either 1D, 2D or 3D + clFFT_Dimension dim; + + // data format ... must be either interleaved or plannar + clFFT_DataFormat format; + + // string containing kernel source. Generated at runtime based on + // n, dim, format and other parameters + string *kernel_string; + + // CL program containing source and kernel this particular + // n, dim, data format + cl_program program; + + // linked list of kernels which needs to be executed for this fft + cl_fft_kernel_info *kernel_info; + + // number of kernels + int num_kernels; + + // twist kernel for virtualizing fft of very large sizes that do not + // fit in GPU global memory + cl_kernel twist_kernel; + + // flag indicating if temporary intermediate buffer is needed or not. + // this depends on fft kernels being executed and if transform is + // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... + // one that does not require global transpose do not need temporary buffer) + // 2D 1024x1024 out-of-place fft however do require intermediate buffer. + // If temp buffer is needed, its allocation is lazy i.e. its not allocated + // until its needed + cl_int temp_buffer_needed; + + // Batch size is runtime parameter and size of temporary buffer (if needed) + // depends on batch size. Allocation of temporary buffer is lazy i.e. its + // only created when needed. Once its created at first call of clFFT_Executexxx + // it is not allocated next time if next time clFFT_Executexxx is called with + // batch size different than the first call. last_batch_size caches the last + // batch size with which this plan is used so that we dont keep allocating/deallocating + // temp buffer if same batch size is used again and again. + unsigned last_batch_size; + + // temporary buffer for interleaved plan + cl_mem tempmemobj; + + // temporary buffer for planner plan. Only one of tempmemobj or + // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending + // data format of plan (plannar or interleaved) + cl_mem tempmemobj_real, tempmemobj_imag; + + // Maximum size of signal for which local memory transposed based + // fft is sufficient i.e. no global mem transpose (communication) + // is needed + unsigned max_localmem_fft_size; + + // Maximum work items per work group allowed. This, along with max_radix below controls + // maximum local memory being used by fft kernels of this plan. Set to 256 by default + unsigned max_work_item_per_workgroup; + + // Maximum base radix for local memory fft ... this controls the maximum register + // space used by work items. Currently defaults to 16 + unsigned max_radix; + + // Device depended parameter that tells how many work-items need to be read consecutive + // values to make sure global memory access by work-items of a work-group result in + // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 + unsigned min_mem_coalesce_width; + + // Number of local memory banks. This is used to geneate kernel with local memory + // transposes with appropriate padding to avoid bank conflicts to local memory + // e.g. on NVidia it is 16. + unsigned num_local_mem_banks; +}cl_fft_plan; + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); + +#endif