Mercurial > hg > Members > yuuhi > OpenCL
comparison fft_Example/fft_internal.h @ 7:ea2e7ce9d5bb
add sample.pgm
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 05 Feb 2013 15:19:02 +0900 |
parents | ccea4e6a1945 |
children |
comparison
equal
deleted
inserted
replaced
6:db074091ed0b | 7:ea2e7ce9d5bb |
---|---|
78 | 78 |
79 typedef struct | 79 typedef struct |
80 { | 80 { |
81 // context in which fft resources are created and kernels are executed | 81 // context in which fft resources are created and kernels are executed |
82 cl_context context; | 82 cl_context context; |
83 | 83 |
84 // size of signal | 84 // size of signal |
85 clFFT_Dim3 n; | 85 clFFT_Dim3 n; |
86 | 86 |
87 // dimension of transform ... must be either 1D, 2D or 3D | 87 // dimension of transform ... must be either 1D, 2D or 3D |
88 clFFT_Dimension dim; | 88 clFFT_Dimension dim; |
89 | 89 |
90 // data format ... must be either interleaved or planar | 90 // data format ... must be either interleaved or planar |
91 clFFT_DataFormat format; | 91 clFFT_DataFormat format; |
92 | 92 |
93 // string containing kernel source. Generated at runtime based on | 93 // string containing kernel source. Generated at runtime based on |
94 // n, dim, format and other parameters | 94 // n, dim, format and other parameters |
95 string *kernel_string; | 95 string *kernel_string; |
96 | 96 |
97 // CL program containing source and kernel for this particular | 97 // CL program containing source and kernel for this particular |
98 // n, dim, data format | 98 // n, dim, data format |
99 cl_program program; | 99 cl_program program; |
100 | 100 |
101 // linked list of kernels which need to be executed for this fft | 101 // linked list of kernels which need to be executed for this fft |
102 cl_fft_kernel_info *kernel_info; | 102 cl_fft_kernel_info *kernel_info; |
103 | 103 |
104 // number of kernels | 104 // number of kernels |
105 int num_kernels; | 105 int num_kernels; |
106 | 106 |
107 // twist kernel for virtualizing fft of very large sizes that do not | 107 // twist kernel for virtualizing fft of very large sizes that do not |
108 // fit in GPU global memory | 108 // fit in GPU global memory |
109 cl_kernel twist_kernel; | 109 cl_kernel twist_kernel; |
110 | 110 |
111 // flag indicating if temporary intermediate buffer is needed or not. | 111 // flag indicating if temporary intermediate buffer is needed or not. |
112 // this depends on fft kernels being executed and if transform is | 112 // this depends on fft kernels being executed and if transform is |
113 // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... | 113 // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... |
114 // one that does not require global transpose do not need temporary buffer) | 114 // one that does not require global transpose do not need temporary buffer) |
115 // 2D 1024x1024 out-of-place fft however does require intermediate buffer. | 115 // 2D 1024x1024 out-of-place fft however does require intermediate buffer. |
116 // If temp buffer is needed, its allocation is lazy i.e. it's not allocated | 116 // If temp buffer is needed, its allocation is lazy i.e. it's not allocated |
117 // until it's needed | 117 // until it's needed |
118 cl_int temp_buffer_needed; | 118 cl_int temp_buffer_needed; |
119 | 119 |
120 // Batch size is runtime parameter and size of temporary buffer (if needed) | 120 // Batch size is runtime parameter and size of temporary buffer (if needed) |
121 // depends on batch size. Allocation of temporary buffer is lazy i.e. it's | 121 // depends on batch size. Allocation of temporary buffer is lazy i.e. it's |
122 // only created when needed. Once it's created at first call of clFFT_Executexxx | 122 // only created when needed. Once it's created at first call of clFFT_Executexxx |
123 // it is not allocated next time if next time clFFT_Executexxx is called with | 123 // it is not allocated next time if next time clFFT_Executexxx is called with |
124 // batch size different than the first call. last_batch_size caches the last | 124 // batch size different than the first call. last_batch_size caches the last |
125 // batch size with which this plan is used so that we don't keep allocating/deallocating | 125 // batch size with which this plan is used so that we don't keep allocating/deallocating |
126 // temp buffer if same batch size is used again and again. | 126 // temp buffer if same batch size is used again and again. |
127 unsigned last_batch_size; | 127 unsigned last_batch_size; |
128 | 128 |
129 // temporary buffer for interleaved plan | 129 // temporary buffer for interleaved plan |
130 cl_mem tempmemobj; | 130 cl_mem tempmemobj; |
131 | 131 |
132 // temporary buffer for planar plan. Only one of tempmemobj or | 132 // temporary buffer for planar plan. Only one of tempmemobj or |
133 // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending | 133 // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending |
134 // data format of plan (planar or interleaved) | 134 // data format of plan (planar or interleaved) |
135 cl_mem tempmemobj_real, tempmemobj_imag; | 135 cl_mem tempmemobj_real, tempmemobj_imag; |
136 | 136 |
137 // Maximum size of signal for which local memory transposed based | 137 // Maximum size of signal for which local memory transposed based |
138 // fft is sufficient i.e. no global mem transpose (communication) | 138 // fft is sufficient i.e. no global mem transpose (communication) |
139 // is needed | 139 // is needed |
140 unsigned max_localmem_fft_size; | 140 unsigned max_localmem_fft_size; |
141 | 141 |
142 // Maximum work items per work group allowed. This, along with max_radix below controls | 142 // Maximum work items per work group allowed. This, along with max_radix below controls |
143 // maximum local memory being used by fft kernels of this plan. Set to 256 by default | 143 // maximum local memory being used by fft kernels of this plan. Set to 256 by default |
144 unsigned max_work_item_per_workgroup; | 144 unsigned max_work_item_per_workgroup; |
145 | 145 |
146 // Maximum base radix for local memory fft ... this controls the maximum register | 146 // Maximum base radix for local memory fft ... this controls the maximum register |
147 // space used by work items. Currently defaults to 16 | 147 // space used by work items. Currently defaults to 16 |
148 unsigned max_radix; | 148 unsigned max_radix; |
149 | 149 |
150 // Device-dependent parameter that tells how many work-items need to read consecutive | 150 // Device-dependent parameter that tells how many work-items need to read consecutive |
151 // values to make sure global memory access by work-items of a work-group result in | 151 // values to make sure global memory access by work-items of a work-group result in |
152 // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 | 152 // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 |
153 unsigned min_mem_coalesce_width; | 153 unsigned min_mem_coalesce_width; |
154 | 154 |
155 // Number of local memory banks. This is used to generate kernel with local memory | 155 // Number of local memory banks. This is used to generate kernel with local memory |
156 // transposes with appropriate padding to avoid bank conflicts to local memory | 156 // transposes with appropriate padding to avoid bank conflicts to local memory |
157 // e.g. on NVidia it is 16. | 157 // e.g. on NVidia it is 16. |
158 unsigned num_local_mem_banks; | 158 unsigned num_local_mem_banks; |
159 }cl_fft_plan; | 159 }cl_fft_plan; |
160 | 160 |
161 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); | 161 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); |
162 | 162 |
163 #endif | 163 #endif |