Mercurial > hg > Members > yuuhi > OpenCL
comparison parallel_Prefix_Sum_Example/scan_kernel.cl @ 2:ccea4e6a1945
add OpenCL example
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 22 Jan 2013 23:19:41 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:b511640282d2 | 2:ccea4e6a1945 |
---|---|
1 // | |
2 // File: scan_kernel.cl | |
3 // | |
4 // Abstract: This example shows how to perform an efficient parallel prefix sum (aka Scan | |
5 // using OpenCL. Scan is a common data parallel primitive which can be used for | |
6 // variety of different operations -- this example uses local memory for storing | |
7 // partial sums and avoids memory bank conflicts on architectures which serialize | |
8 // memory operations that are serviced on the same memory bank by offsetting the | |
9 // loads and stores based on the size of the local group and the number of | |
10 // memory banks (see appropriate macro definition). As a result, this example | |
11 // requires that the local group size > 1. | |
12 // | |
13 // Version: <1.0> | |
14 // | |
15 // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") | |
16 // in consideration of your agreement to the following terms, and your use, | |
17 // installation, modification or redistribution of this Apple software | |
18 // constitutes acceptance of these terms. If you do not agree with these | |
19 // terms, please do not use, install, modify or redistribute this Apple | |
20 // software. | |
21 // | |
22 // In consideration of your agreement to abide by the following terms, and | |
23 // subject to these terms, Apple grants you a personal, non - exclusive | |
24 // license, under Apple's copyrights in this original Apple software ( the | |
25 // "Apple Software" ), to use, reproduce, modify and redistribute the Apple | |
26 // Software, with or without modifications, in source and / or binary forms; | |
27 // provided that if you redistribute the Apple Software in its entirety and | |
28 // without modifications, you must retain this notice and the following text | |
29 // and disclaimers in all such redistributions of the Apple Software. Neither | |
30 // the name, trademarks, service marks or logos of Apple Inc. may be used to | |
31 // endorse or promote products derived from the Apple Software without specific | |
32 // prior written permission from Apple. Except as expressly stated in this | |
33 // notice, no other rights or licenses, express or implied, are granted by | |
34 // Apple herein, including but not limited to any patent rights that may be | |
35 // infringed by your derivative works or by other works in which the Apple | |
36 // Software may be incorporated. | |
37 // | |
38 // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO | |
39 // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED | |
40 // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A | |
41 // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION | |
42 // ALONE OR IN COMBINATION WITH YOUR PRODUCTS. | |
43 // | |
44 // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR | |
45 // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
46 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
47 // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION | |
48 // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER | |
49 // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR | |
50 // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 // | |
52 // Copyright ( C ) 2008 Apple Inc. All Rights Reserved. | |
53 // | |
54 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
55 | |
56 #define MEMORY_BANK_COUNT (16) // Adjust to your architecture | |
57 #define LOG2_MEMORY_BANK_COUNT (4) // Set to log2(MEMORY_BANK_COUNT) | |
58 #define ELIMINATE_CONFLICTS (0) // Enable for slow address calculation, but zero bank conflicts | |
59 | |
60 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
61 | |
62 #if (ELIMINATE_CONFLICTS) | |
63 #define MEMORY_BANK_OFFSET(index) ((index) >> LOG2_MEMORY_BANK_COUNT + (index) >> (2*LOG2_MEMORY_BANK_COUNT)) | |
64 #else | |
65 #define MEMORY_BANK_OFFSET(index) ((index) >> LOG2_MEMORY_BANK_COUNT) | |
66 #endif | |
67 | |
68 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
69 | |
70 uint4 | |
71 GetAddressMapping(int index) | |
72 { | |
73 const uint local_id = get_local_id(0); | |
74 const uint group_id = get_global_id(0) / get_local_size(0); | |
75 const uint group_size = get_local_size(0); | |
76 | |
77 uint2 global_index; | |
78 global_index.x = index + local_id; | |
79 global_index.y = global_index.x + group_size; | |
80 | |
81 uint2 local_index; | |
82 local_index.x = local_id; | |
83 local_index.y = local_id + group_size; | |
84 | |
85 return (uint4)(global_index.x, global_index.y, local_index.x, local_index.y); | |
86 } | |
87 | |
88 void | |
89 LoadLocalFromGlobal( | |
90 __local float *shared_data, | |
91 __global const float *input_data, | |
92 const uint4 address_pair, | |
93 const uint n) | |
94 { | |
95 const uint global_index_a = address_pair.x; | |
96 const uint global_index_b = address_pair.y; | |
97 | |
98 const uint local_index_a = address_pair.z; | |
99 const uint local_index_b = address_pair.w; | |
100 | |
101 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a); | |
102 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b); | |
103 | |
104 shared_data[local_index_a + bank_offset_a] = input_data[global_index_a]; | |
105 shared_data[local_index_b + bank_offset_b] = input_data[global_index_b]; | |
106 } | |
107 | |
108 void | |
109 LoadLocalFromGlobalNonPowerOfTwo( | |
110 __local float *shared_data, | |
111 __global const float *input_data, | |
112 const uint4 address_pair, | |
113 const uint n) | |
114 { | |
115 const uint global_index_a = address_pair.x; | |
116 const uint global_index_b = address_pair.y; | |
117 | |
118 const uint local_index_a = address_pair.z; | |
119 const uint local_index_b = address_pair.w; | |
120 | |
121 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a); | |
122 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b); | |
123 | |
124 shared_data[local_index_a + bank_offset_a] = input_data[global_index_a]; | |
125 shared_data[local_index_b + bank_offset_b] = (local_index_b < n) ? input_data[global_index_b] : 0; | |
126 | |
127 barrier(CLK_LOCAL_MEM_FENCE); | |
128 } | |
129 | |
130 void | |
131 StoreLocalToGlobal( | |
132 __global float* output_data, | |
133 __local const float* shared_data, | |
134 const uint4 address_pair, | |
135 const uint n) | |
136 { | |
137 barrier(CLK_LOCAL_MEM_FENCE); | |
138 | |
139 const uint global_index_a = address_pair.x; | |
140 const uint global_index_b = address_pair.y; | |
141 | |
142 const uint local_index_a = address_pair.z; | |
143 const uint local_index_b = address_pair.w; | |
144 | |
145 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a); | |
146 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b); | |
147 | |
148 output_data[global_index_a] = shared_data[local_index_a + bank_offset_a]; | |
149 output_data[global_index_b] = shared_data[local_index_b + bank_offset_b]; | |
150 } | |
151 | |
152 void | |
153 StoreLocalToGlobalNonPowerOfTwo( | |
154 __global float* output_data, | |
155 __local const float* shared_data, | |
156 const uint4 address_pair, | |
157 const uint n) | |
158 { | |
159 barrier(CLK_LOCAL_MEM_FENCE); | |
160 | |
161 const uint global_index_a = address_pair.x; | |
162 const uint global_index_b = address_pair.y; | |
163 | |
164 const uint local_index_a = address_pair.z; | |
165 const uint local_index_b = address_pair.w; | |
166 | |
167 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a); | |
168 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b); | |
169 | |
170 output_data[global_index_a] = shared_data[local_index_a + bank_offset_a]; | |
171 if(local_index_b < n) | |
172 output_data[global_index_b] = shared_data[local_index_b + bank_offset_b]; | |
173 } | |
174 | |
175 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
176 | |
177 void | |
178 ClearLastElement( | |
179 __local float* shared_data, | |
180 int group_index) | |
181 { | |
182 const uint local_id = get_local_id(0); | |
183 const uint group_id = get_global_id(0) / get_local_size(0); | |
184 const uint group_size = get_local_size(0); | |
185 | |
186 if (local_id == 0) | |
187 { | |
188 int index = (group_size << 1) - 1; | |
189 index += MEMORY_BANK_OFFSET(index); | |
190 shared_data[index] = 0; | |
191 } | |
192 } | |
193 | |
194 void | |
195 ClearLastElementStoreSum( | |
196 __local float* shared_data, | |
197 __global float *partial_sums, | |
198 int group_index) | |
199 { | |
200 const uint group_id = get_global_id(0) / get_local_size(0); | |
201 const uint group_size = get_local_size(0); | |
202 const uint local_id = get_local_id(0); | |
203 | |
204 if (local_id == 0) | |
205 { | |
206 int index = (group_size << 1) - 1; | |
207 index += MEMORY_BANK_OFFSET(index); | |
208 partial_sums[group_index] = shared_data[index]; | |
209 shared_data[index] = 0; | |
210 } | |
211 } | |
212 | |
213 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
214 | |
215 uint | |
216 BuildPartialSum( | |
217 __local float *shared_data) | |
218 { | |
219 const uint local_id = get_local_id(0); | |
220 const uint group_size = get_local_size(0); | |
221 const uint two = 2; | |
222 uint stride = 1; | |
223 | |
224 for (uint j = group_size; j > 0; j >>= 1) | |
225 { | |
226 barrier(CLK_LOCAL_MEM_FENCE); | |
227 | |
228 if (local_id < j) | |
229 { | |
230 int i = mul24(mul24(two, stride), local_id); | |
231 | |
232 uint local_index_a = i + stride - 1; | |
233 uint local_index_b = local_index_a + stride; | |
234 | |
235 local_index_a += MEMORY_BANK_OFFSET(local_index_a); | |
236 local_index_b += MEMORY_BANK_OFFSET(local_index_b); | |
237 | |
238 shared_data[local_index_b] += shared_data[local_index_a]; | |
239 } | |
240 | |
241 stride *= two; | |
242 } | |
243 | |
244 return stride; | |
245 } | |
246 | |
247 void | |
248 ScanRootToLeaves( | |
249 __local float *shared_data, | |
250 uint stride) | |
251 { | |
252 const uint local_id = get_local_id(0); | |
253 const uint group_id = get_global_id(0) / get_local_size(0); | |
254 const uint group_size = get_local_size(0); | |
255 const uint two = 2; | |
256 | |
257 for (uint j = 1; j <= group_size; j *= two) | |
258 { | |
259 stride >>= 1; | |
260 | |
261 barrier(CLK_LOCAL_MEM_FENCE); | |
262 | |
263 if (local_id < j) | |
264 { | |
265 int i = mul24(mul24(two, stride), local_id); | |
266 | |
267 uint local_index_a = i + stride - 1; | |
268 uint local_index_b = local_index_a + stride; | |
269 | |
270 local_index_a += MEMORY_BANK_OFFSET(local_index_a); | |
271 local_index_b += MEMORY_BANK_OFFSET(local_index_b); | |
272 | |
273 float t = shared_data[local_index_a]; | |
274 shared_data[local_index_a] = shared_data[local_index_b]; | |
275 shared_data[local_index_b] += t; | |
276 } | |
277 } | |
278 } | |
279 | |
280 void | |
281 PreScanGroup( | |
282 __local float *shared_data, | |
283 int group_index) | |
284 { | |
285 const uint group_id = get_global_id(0) / get_local_size(0); | |
286 | |
287 int stride = BuildPartialSum(shared_data); | |
288 ClearLastElement(shared_data, (group_index == 0) ? group_id : group_index); | |
289 ScanRootToLeaves(shared_data, stride); | |
290 } | |
291 | |
292 void | |
293 PreScanGroupStoreSum( | |
294 __global float *partial_sums, | |
295 __local float *shared_data, | |
296 int group_index) | |
297 { | |
298 const uint group_id = get_global_id(0) / get_local_size(0); | |
299 | |
300 int stride = BuildPartialSum(shared_data); | |
301 ClearLastElementStoreSum(shared_data, partial_sums, (group_index == 0) ? group_id : group_index); | |
302 ScanRootToLeaves(shared_data, stride); | |
303 } | |
304 | |
305 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
306 | |
307 __kernel void | |
308 PreScanKernel( | |
309 __global float *output_data, | |
310 __global const float *input_data, | |
311 __local float* shared_data, | |
312 const uint group_index, | |
313 const uint base_index, | |
314 const uint n) | |
315 { | |
316 const uint group_id = get_global_id(0) / get_local_size(0); | |
317 const uint group_size = get_local_size(0); | |
318 | |
319 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index; | |
320 uint4 address_pair = GetAddressMapping(local_index); | |
321 | |
322 LoadLocalFromGlobal(shared_data, input_data, address_pair, n); | |
323 PreScanGroup(shared_data, group_index); | |
324 StoreLocalToGlobal(output_data, shared_data, address_pair, n); | |
325 } | |
326 | |
327 __kernel void | |
328 PreScanStoreSumKernel( | |
329 __global float *output_data, | |
330 __global const float *input_data, | |
331 __global float *partial_sums, | |
332 __local float* shared_data, | |
333 const uint group_index, | |
334 const uint base_index, | |
335 const uint n) | |
336 { | |
337 const uint group_id = get_global_id(0) / get_local_size(0); | |
338 const uint group_size = get_local_size(0); | |
339 | |
340 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index; | |
341 uint4 address_pair = GetAddressMapping(local_index); | |
342 | |
343 LoadLocalFromGlobal(shared_data, input_data, address_pair, n); | |
344 PreScanGroupStoreSum(partial_sums, shared_data, group_index); | |
345 StoreLocalToGlobal(output_data, shared_data, address_pair, n); | |
346 } | |
347 | |
348 __kernel void | |
349 PreScanStoreSumNonPowerOfTwoKernel( | |
350 __global float *output_data, | |
351 __global const float *input_data, | |
352 __global float *partial_sums, | |
353 __local float* shared_data, | |
354 const uint group_index, | |
355 const uint base_index, | |
356 const uint n) | |
357 { | |
358 const uint local_id = get_local_id(0); | |
359 const uint group_id = get_global_id(0) / get_local_size(0); | |
360 const uint group_size = get_local_size(0); | |
361 | |
362 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index; | |
363 uint4 address_pair = GetAddressMapping(local_index); | |
364 | |
365 LoadLocalFromGlobalNonPowerOfTwo(shared_data, input_data, address_pair, n); | |
366 PreScanGroupStoreSum(partial_sums, shared_data, group_index); | |
367 StoreLocalToGlobalNonPowerOfTwo(output_data, shared_data, address_pair, n); | |
368 } | |
369 | |
370 __kernel void | |
371 PreScanNonPowerOfTwoKernel( | |
372 __global float *output_data, | |
373 __global const float *input_data, | |
374 __local float* shared_data, | |
375 const uint group_index, | |
376 const uint base_index, | |
377 const uint n) | |
378 { | |
379 const uint local_id = get_local_id(0); | |
380 const uint group_id = get_global_id(0) / get_local_size(0); | |
381 const uint group_size = get_local_size(0); | |
382 | |
383 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index; | |
384 uint4 address_pair = GetAddressMapping(local_index); | |
385 | |
386 LoadLocalFromGlobalNonPowerOfTwo(shared_data, input_data, address_pair, n); | |
387 PreScanGroup(shared_data, group_index); | |
388 StoreLocalToGlobalNonPowerOfTwo(output_data, shared_data, address_pair, n); | |
389 } | |
390 | |
391 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
392 | |
393 __kernel void UniformAddKernel( | |
394 __global float *output_data, | |
395 __global float *input_data, | |
396 __local float *shared_data, | |
397 const uint group_offset, | |
398 const uint base_index, | |
399 const uint n) | |
400 { | |
401 const uint local_id = get_local_id(0); | |
402 const uint group_id = get_global_id(0) / get_local_size(0); | |
403 const uint group_size = get_local_size(0); | |
404 | |
405 if (local_id == 0) | |
406 shared_data[0] = input_data[group_id + group_offset]; | |
407 | |
408 barrier(CLK_LOCAL_MEM_FENCE); | |
409 | |
410 uint address = mul24(group_id, (group_size << 1)) + base_index + local_id; | |
411 | |
412 output_data[address] += shared_data[0]; | |
413 if( (local_id + group_size) < n) | |
414 output_data[address + group_size] += shared_data[0]; | |
415 } | |
416 | |
417 //////////////////////////////////////////////////////////////////////////////////////////////////// | |
418 | |
419 |