comparison fft_Example/fft_internal.h @ 7:ea2e7ce9d5bb

add sample.pgm
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 05 Feb 2013 15:19:02 +0900
parents ccea4e6a1945
children
comparison
equal deleted inserted replaced
6:db074091ed0b 7:ea2e7ce9d5bb
78 78
79 typedef struct 79 typedef struct
80 { 80 {
81 // context in which fft resources are created and kernels are executed 81 // context in which fft resources are created and kernels are executed
82 cl_context context; 82 cl_context context;
83 83
84 // size of signal 84 // size of signal
85 clFFT_Dim3 n; 85 clFFT_Dim3 n;
86 86
87 // dimension of transform ... must be either 1D, 2D or 3D 87 // dimension of transform ... must be either 1D, 2D or 3D
88 clFFT_Dimension dim; 88 clFFT_Dimension dim;
89 89
90 // data format ... must be either interleaved or planar 90 // data format ... must be either interleaved or planar
91 clFFT_DataFormat format; 91 clFFT_DataFormat format;
92 92
93 // string containing kernel source. Generated at runtime based on 93 // string containing kernel source. Generated at runtime based on
94 // n, dim, format and other parameters 94 // n, dim, format and other parameters
95 string *kernel_string; 95 string *kernel_string;
96 96
97 // CL program containing source and kernel for this particular 97 // CL program containing source and kernel for this particular
98 // n, dim, data format 98 // n, dim, data format
99 cl_program program; 99 cl_program program;
100 100
101 // linked list of kernels which need to be executed for this fft 101 // linked list of kernels which need to be executed for this fft
102 cl_fft_kernel_info *kernel_info; 102 cl_fft_kernel_info *kernel_info;
103 103
104 // number of kernels 104 // number of kernels
105 int num_kernels; 105 int num_kernels;
106 106
107 // twist kernel for virtualizing fft of very large sizes that do not 107 // twist kernel for virtualizing fft of very large sizes that do not
108 // fit in GPU global memory 108 // fit in GPU global memory
109 cl_kernel twist_kernel; 109 cl_kernel twist_kernel;
110 110
111 // flag indicating if temporary intermediate buffer is needed or not. 111 // flag indicating if temporary intermediate buffer is needed or not.
112 // this depends on fft kernels being executed and if transform is 112 // this depends on fft kernels being executed and if transform is
113 // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... 113 // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
114 // one that does not require global transpose does not need temporary buffer) 114 // one that does not require global transpose does not need temporary buffer)
115 // 2D 1024x1024 out-of-place fft however does require intermediate buffer. 115 // 2D 1024x1024 out-of-place fft however does require intermediate buffer.
116 // If temp buffer is needed, its allocation is lazy i.e. it's not allocated 116 // If temp buffer is needed, its allocation is lazy i.e. it's not allocated
117 // until it's needed 117 // until it's needed
118 cl_int temp_buffer_needed; 118 cl_int temp_buffer_needed;
119 119
120 // Batch size is runtime parameter and size of temporary buffer (if needed) 120 // Batch size is runtime parameter and size of temporary buffer (if needed)
121 // depends on batch size. Allocation of temporary buffer is lazy i.e. it's 121 // depends on batch size. Allocation of temporary buffer is lazy i.e. it's
122 // only created when needed. Once it's created at first call of clFFT_Executexxx 122 // only created when needed. Once it's created at first call of clFFT_Executexxx
123 // it is not allocated next time if next time clFFT_Executexxx is called with 123 // it is not allocated next time if next time clFFT_Executexxx is called with
124 // batch size different than the first call. last_batch_size caches the last 124 // batch size different than the first call. last_batch_size caches the last
125 // batch size with which this plan is used so that we don't keep allocating/deallocating 125 // batch size with which this plan is used so that we don't keep allocating/deallocating
126 // temp buffer if same batch size is used again and again. 126 // temp buffer if same batch size is used again and again.
127 unsigned last_batch_size; 127 unsigned last_batch_size;
128 128
129 // temporary buffer for interleaved plan 129 // temporary buffer for interleaved plan
130 cl_mem tempmemobj; 130 cl_mem tempmemobj;
131 131
132 // temporary buffer for planar plan. Only one of tempmemobj or 132 // temporary buffer for planar plan. Only one of tempmemobj or
133 // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending 133 // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
134 // on data format of plan (planar or interleaved) 134 // on data format of plan (planar or interleaved)
135 cl_mem tempmemobj_real, tempmemobj_imag; 135 cl_mem tempmemobj_real, tempmemobj_imag;
136 136
137 // Maximum size of signal for which local memory transpose based 137 // Maximum size of signal for which local memory transpose based
138 // fft is sufficient i.e. no global mem transpose (communication) 138 // fft is sufficient i.e. no global mem transpose (communication)
139 // is needed 139 // is needed
140 unsigned max_localmem_fft_size; 140 unsigned max_localmem_fft_size;
141 141
142 // Maximum work items per work group allowed. This, along with max_radix below controls 142 // Maximum work items per work group allowed. This, along with max_radix below controls
143 // maximum local memory being used by fft kernels of this plan. Set to 256 by default 143 // maximum local memory being used by fft kernels of this plan. Set to 256 by default
144 unsigned max_work_item_per_workgroup; 144 unsigned max_work_item_per_workgroup;
145 145
146 // Maximum base radix for local memory fft ... this controls the maximum register 146 // Maximum base radix for local memory fft ... this controls the maximum register
147 // space used by work items. Currently defaults to 16 147 // space used by work items. Currently defaults to 16
148 unsigned max_radix; 148 unsigned max_radix;
149 149
150 // Device dependent parameter that tells how many work-items need to read consecutive 150 // Device dependent parameter that tells how many work-items need to read consecutive
151 // values to make sure global memory access by work-items of a work-group results in 151 // values to make sure global memory access by work-items of a work-group results in
152 // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 152 // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
153 unsigned min_mem_coalesce_width; 153 unsigned min_mem_coalesce_width;
154 154
155 // Number of local memory banks. This is used to generate kernel with local memory 155 // Number of local memory banks. This is used to generate kernel with local memory
156 // transposes with appropriate padding to avoid bank conflicts to local memory 156 // transposes with appropriate padding to avoid bank conflicts to local memory
157 // e.g. on NVidia it is 16. 157 // e.g. on NVidia it is 16.
158 unsigned num_local_mem_banks; 158 unsigned num_local_mem_banks;
159 }cl_fft_plan; 159 }cl_fft_plan;
160 160
161 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); 161 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
162 162
163 #endif 163 #endif