diff fft_Example/fft_internal.h @ 2:ccea4e6a1945

add OpenCL example
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Jan 2013 23:19:41 +0900
parents
children ea2e7ce9d5bb
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_internal.h	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,163 @@
+
+//
+// File:       fft_internal.h
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CLFFT_INTERNAL_H
+#define __CLFFT_INTERNAL_H
+
+#include "clFFT.h"
+#include <iostream>
+#include <string>
+#include <sstream>
+
+using namespace std;
+
+typedef enum kernel_dir_t	// direction tag stored in cl_fft_kernel_info::dir; x/y/z presumably name the transform axis (TODO confirm)
+{
+	cl_fft_kernel_x,	// implicit value 0
+	cl_fft_kernel_y,	// implicit value 1
+	cl_fft_kernel_z	// implicit value 2
+}cl_fft_kernel_dir;
+
+typedef struct kernel_info_t	// one node of the singly linked kernel list that cl_fft_plan::kernel_info points to
+{
+	cl_kernel kernel;	// OpenCL kernel object for this node
+	char *kernel_name;	// kernel name as a C string; ownership is not visible in this header
+	unsigned lmem_size;	// presumably local memory required by the kernel; units not shown here (TODO confirm)
+	unsigned num_workgroups;	// presumably number of work-groups to launch (TODO confirm)
+    unsigned num_xforms_per_workgroup;	// presumably transforms handled by one work-group (TODO confirm)
+	unsigned num_workitems_per_workgroup;	// presumably work-group size in work-items (TODO confirm)
+	cl_fft_kernel_dir dir;	// direction tag, one of cl_fft_kernel_x / _y / _z
+	int in_place_possible;	// int used as a flag; presumably non-zero when the kernel can run in-place (TODO confirm)
+	kernel_info_t *next;	// next node in the list; the bare tag name (no `struct`) requires compiling as C++
+}cl_fft_kernel_info;
+
+typedef struct 
+{
+	// context in which fft resources are created and kernels are executed
+	cl_context              context;
+	
+	// size of signal
+	clFFT_Dim3              n;
+	
+	// dimension of transform ... must be either 1D, 2D or 3D
+	clFFT_Dimension			dim;
+	
+	// data format ... must be either interleaved or planar
+	clFFT_DataFormat		format;
+	
+	// string containing kernel source. Generated at runtime based on
+	// n, dim, format and other parameters
+	string                  *kernel_string;
+	
+	// CL program containing the source and kernels for this particular
+	// n, dim, data format
+	cl_program				program;
+	
+	// linked list of kernels which need to be executed for this fft
+	cl_fft_kernel_info		*kernel_info;
+	
+	// number of kernels
+	int                     num_kernels;
+	
+	// twist kernel for virtualizing fft of very large sizes that do not
+	// fit in GPU global memory
+	cl_kernel				twist_kernel;
+	
+	// flag indicating if a temporary intermediate buffer is needed or not.
+	// this depends on the fft kernels being executed and whether the transform is
+	// in-place or out-of-place. e.g. a local memory fft (say 1D 1024 ...
+	// one that does not require a global transpose) does not need a temporary buffer;
+	// a 2D 1024x1024 out-of-place fft however does require an intermediate buffer.
+	// If a temp buffer is needed, its allocation is lazy i.e. it is not allocated
+	// until it is needed
+	cl_int                  temp_buffer_needed;
+	
+	// Batch size is a runtime parameter and the size of the temporary buffer (if needed)
+	// depends on batch size. Allocation of the temporary buffer is lazy i.e. it is
+	// only created when needed. Once it is created at the first call of clFFT_Executexxx
+	// it is not allocated again unless a later clFFT_Executexxx call uses a different
+	// batch size (NOTE(review): inferred from last_batch_size -- confirm in clFFT_Executexxx).
+	// last_batch_size caches the last batch size with which this plan was used so that we
+	// don't keep allocating/deallocating the temp buffer if the same batch size is used repeatedly.
+	unsigned                  last_batch_size;
+	
+	// temporary buffer for interleaved plan
+	cl_mem   				tempmemobj;
+	
+	// temporary buffers for planar plan. Only one of tempmemobj or the
+	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending on the
+	// data format of the plan (planar or interleaved)
+	cl_mem                  tempmemobj_real, tempmemobj_imag;
+	
+	// Maximum size of signal for which a local memory transpose based
+	// fft is sufficient i.e. no global mem transpose (communication)
+	// is needed
+	unsigned					max_localmem_fft_size;
+	
+	// Maximum work items per work group allowed. This, along with max_radix below, controls
+	// the maximum local memory used by fft kernels of this plan. Set to 256 by default
+	unsigned                  max_work_item_per_workgroup;
+	
+	// Maximum base radix for local memory fft ... this controls the maximum register 
+	// space used by work items. Currently defaults to 16
+	unsigned                  max_radix;
+	
+	// Device dependent parameter that tells how many work-items need to read consecutive
+	// values to make sure global memory accesses by work-items of a work-group result in
+	// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
+	unsigned                  min_mem_coalesce_width;
+	
+	// Number of local memory banks. This is used to generate kernels with local memory
+	// transposes with appropriate padding to avoid bank conflicts to local memory
+	// e.g. on NVidia it is 16.
+	unsigned                  num_local_mem_banks;
+}cl_fft_plan;
+
+void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);	// declaration only (defined elsewhere); takes a non-const plan and a direction tag -- presumably builds the 1D kernels for that direction (TODO confirm)
+
+#endif