Mercurial > hg > Members > yuuhi > OpenCL
comparison fft_Example/fft_internal.h @ 2:ccea4e6a1945
add OpenCL example
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 22 Jan 2013 23:19:41 +0900 |
parents | |
children | ea2e7ce9d5bb |
comparison
equal
deleted
inserted
replaced
1:b511640282d2 | 2:ccea4e6a1945 |
---|---|
1 | |
2 // | |
3 // File: fft_internal.h | |
4 // | |
5 // Version: <1.0> | |
6 // | |
7 // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") | |
8 // in consideration of your agreement to the following terms, and your use, | |
9 // installation, modification or redistribution of this Apple software | |
10 // constitutes acceptance of these terms. If you do not agree with these | |
11 // terms, please do not use, install, modify or redistribute this Apple | |
12 // software. | |
13 // | |
14 // In consideration of your agreement to abide by the following terms, and | |
15 // subject to these terms, Apple grants you a personal, non - exclusive | |
16 // license, under Apple's copyrights in this original Apple software ( the | |
17 // "Apple Software" ), to use, reproduce, modify and redistribute the Apple | |
18 // Software, with or without modifications, in source and / or binary forms; | |
19 // provided that if you redistribute the Apple Software in its entirety and | |
20 // without modifications, you must retain this notice and the following text | |
21 // and disclaimers in all such redistributions of the Apple Software. Neither | |
22 // the name, trademarks, service marks or logos of Apple Inc. may be used to | |
23 // endorse or promote products derived from the Apple Software without specific | |
24 // prior written permission from Apple. Except as expressly stated in this | |
25 // notice, no other rights or licenses, express or implied, are granted by | |
26 // Apple herein, including but not limited to any patent rights that may be | |
27 // infringed by your derivative works or by other works in which the Apple | |
28 // Software may be incorporated. | |
29 // | |
30 // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO | |
31 // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED | |
32 // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A | |
33 // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION | |
34 // ALONE OR IN COMBINATION WITH YOUR PRODUCTS. | |
35 // | |
36 // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR | |
37 // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
38 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
39 // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION | |
40 // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER | |
41 // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR | |
42 // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
43 // | |
44 // Copyright ( C ) 2008 Apple Inc. All Rights Reserved. | |
45 // | |
46 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
47 | |
48 | |
49 #ifndef __CLFFT_INTERNAL_H | |
50 #define __CLFFT_INTERNAL_H | |
51 | |
52 #include "clFFT.h" | |
53 #include <iostream> | |
54 #include <string> | |
55 #include <sstream> | |
56 | |
57 using namespace std; | |
58 | |
// Enumerates the three kernel directions: x, y and z. | |
// NOTE(review): presumably the signal axis a kernel works along -- confirm against the code that sets cl_fft_kernel_info::dir. | |
59 typedef enum kernel_dir_t | |
60 { | |
61 cl_fft_kernel_x, | |
62 cl_fft_kernel_y, | |
63 cl_fft_kernel_z | |
64 }cl_fft_kernel_dir; | |
65 | |
// Describes a single kernel. Records are chained through `next`; cl_fft_plan::kernel_info | |
// points at such a record and documents the chain as the list of kernels to execute. | |
// NOTE(review): the per-field notes below are inferred from the field names only -- confirm | |
// against the code that populates this struct. | |
66 typedef struct kernel_info_t | |
67 { | |
// OpenCL kernel object for this entry | |
68 cl_kernel kernel; | |
// name of the kernel; NOTE(review): ownership of this string is not visible here -- confirm who frees it | |
69 char *kernel_name; | |
// presumably the local memory required by the kernel -- TODO confirm units (bytes vs. elements) | |
70 unsigned lmem_size; | |
// presumably the work-group / work-item geometry used when the kernel is enqueued -- TODO confirm | |
71 unsigned num_workgroups; | |
72 unsigned num_xforms_per_workgroup; | |
73 unsigned num_workitems_per_workgroup; | |
// direction (x, y or z) associated with this kernel | |
74 cl_fft_kernel_dir dir; | |
// presumably non-zero when the kernel may run in-place -- TODO confirm | |
75 int in_place_possible; | |
// next record in the chain; NOTE(review): presumably NULL at the tail -- confirm | |
76 kernel_info_t *next; | |
77 }cl_fft_kernel_info; | |
78 | |
// State of one FFT plan: the signal description (n, dim, format), the generated kernel | |
// source and CL program, the list of kernels to execute, lazily allocated temporary | |
// buffers, and the tuning parameters used when kernels are generated. | |
79 typedef struct | |
80 { | |
81 // context in which fft resources are created and kernels are executed | |
82 cl_context context; | |
83 | |
84 // size of signal | |
85 clFFT_Dim3 n; | |
86 | |
87 // dimension of transform ... must be either 1D, 2D or 3D | |
88 clFFT_Dimension dim; | |
89 | |
90 // data format ... must be either interleaved or planar | |
91 clFFT_DataFormat format; | |
92 | |
93 // string containing kernel source. Generated at runtime based on | |
94 // n, dim, format and other parameters | |
95 string *kernel_string; | |
96 | |
97 // CL program containing the source and kernels for this particular | |
98 // n, dim, data format | |
99 cl_program program; | |
100 | |
101 // linked list of kernels which need to be executed for this fft | |
102 cl_fft_kernel_info *kernel_info; | |
103 | |
104 // number of kernels | |
105 int num_kernels; | |
106 | |
107 // twist kernel for virtualizing fft of very large sizes that do not | |
108 // fit in GPU global memory | |
109 cl_kernel twist_kernel; | |
110 | |
111 // flag indicating if a temporary intermediate buffer is needed or not. | |
112 // this depends on the fft kernels being executed and on whether the transform is | |
113 // in-place or out-of-place. e.g. a local memory fft (say 1D 1024 ... | |
114 // one that does not require a global transpose) does not need a temporary buffer; | |
115 // a 2D 1024x1024 out-of-place fft, however, does require an intermediate buffer. | |
116 // If a temp buffer is needed, its allocation is lazy, i.e. it is not allocated | |
117 // until it is needed | |
118 cl_int temp_buffer_needed; | |
119 | |
120 // Batch size is a runtime parameter and the size of the temporary buffer (if needed) | |
121 // depends on the batch size. Allocation of the temporary buffer is lazy, i.e. it is | |
122 // only created when needed. Once it is created at the first call of clFFT_Executexxx | |
123 // it is not allocated next time if next time clFFT_Executexxx is called with | |
124 // batch size different than the first call. last_batch_size caches the last | |
125 // batch size with which this plan is used so that we don't keep allocating/deallocating | |
126 // the temp buffer if the same batch size is used again and again. | |
// NOTE(review): the sentence about a "different" batch size reads as contradicting the | |
// caching rationale that follows it; presumably the buffer is re-created only when the | |
// batch size changes -- confirm against the clFFT_Executexxx implementation. | |
127 unsigned last_batch_size; | |
128 | |
129 // temporary buffer for interleaved plan | |
130 cl_mem tempmemobj; | |
131 | |
132 // temporary buffers for planar plan. Only one of tempmemobj or the | |
133 // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated), depending on the | |
134 // data format of the plan (planar or interleaved) | |
135 cl_mem tempmemobj_real, tempmemobj_imag; | |
136 | |
137 // Maximum size of signal for which a local memory transpose based | |
138 // fft is sufficient, i.e. no global mem transpose (communication) | |
139 // is needed | |
140 unsigned max_localmem_fft_size; | |
141 | |
142 // Maximum work items per work group allowed. This, along with max_radix below, controls | |
143 // the maximum local memory being used by fft kernels of this plan. Set to 256 by default | |
144 unsigned max_work_item_per_workgroup; | |
145 | |
146 // Maximum base radix for local memory fft ... this controls the maximum register | |
147 // space used by work items. Currently defaults to 16 | |
148 unsigned max_radix; | |
149 | |
150 // Device-dependent parameter that tells how many work-items need to read consecutive | |
151 // values to make sure global memory accesses by work-items of a work-group result in | |
152 // coalesced memory access to utilize full bandwidth, e.g. on NVidia Tesla this is 16 | |
153 unsigned min_mem_coalesce_width; | |
154 | |
155 // Number of local memory banks. This is used to generate kernels with local memory | |
156 // transposes with appropriate padding to avoid bank conflicts in local memory, | |
157 // e.g. on NVidia it is 16. | |
158 unsigned num_local_mem_banks; | |
159 }cl_fft_plan; | |
160 | |
// Declared here, defined elsewhere. Takes the plan to operate on and a kernel direction. | |
// NOTE(review): presumably sets up the 1D FFT kernel(s) of `plan` for direction `dir` -- | |
// confirm against the definition, including whether `plan` may be NULL. | |
161 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); | |
162 | |
163 #endif |