comparison parallel_Prefix_Sum_Example/scan_kernel.cl @ 2:ccea4e6a1945

add OpenCL example
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Jan 2013 23:19:41 +0900
parents
children
comparison
equal deleted inserted replaced
1:b511640282d2 2:ccea4e6a1945
1 //
2 // File: scan_kernel.cl
3 //
4 // Abstract: This example shows how to perform an efficient parallel prefix sum (aka Scan
5 // using OpenCL. Scan is a common data parallel primitive which can be used for
6 // variety of different operations -- this example uses local memory for storing
7 // partial sums and avoids memory bank conflicts on architectures which serialize
8 // memory operations that are serviced on the same memory bank by offsetting the
9 // loads and stores based on the size of the local group and the number of
10 // memory banks (see appropriate macro definition). As a result, this example
11 // requires that the local group size > 1.
12 //
13 // Version: <1.0>
14 //
15 // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
16 // in consideration of your agreement to the following terms, and your use,
17 // installation, modification or redistribution of this Apple software
18 // constitutes acceptance of these terms. If you do not agree with these
19 // terms, please do not use, install, modify or redistribute this Apple
20 // software.
21 //
22 // In consideration of your agreement to abide by the following terms, and
23 // subject to these terms, Apple grants you a personal, non - exclusive
24 // license, under Apple's copyrights in this original Apple software ( the
25 // "Apple Software" ), to use, reproduce, modify and redistribute the Apple
26 // Software, with or without modifications, in source and / or binary forms;
27 // provided that if you redistribute the Apple Software in its entirety and
28 // without modifications, you must retain this notice and the following text
29 // and disclaimers in all such redistributions of the Apple Software. Neither
30 // the name, trademarks, service marks or logos of Apple Inc. may be used to
31 // endorse or promote products derived from the Apple Software without specific
32 // prior written permission from Apple. Except as expressly stated in this
33 // notice, no other rights or licenses, express or implied, are granted by
34 // Apple herein, including but not limited to any patent rights that may be
35 // infringed by your derivative works or by other works in which the Apple
36 // Software may be incorporated.
37 //
38 // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
39 // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
40 // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
41 // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
42 // ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
43 //
44 // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
45 // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
46 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
47 // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
48 // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
49 // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
50 // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51 //
52 // Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
53 //
54 ////////////////////////////////////////////////////////////////////////////////////////////////////
55
56 #define MEMORY_BANK_COUNT (16) // Adjust to your architecture
57 #define LOG2_MEMORY_BANK_COUNT (4) // Set to log2(MEMORY_BANK_COUNT)
58 #define ELIMINATE_CONFLICTS (0) // Enable for slow address calculation, but zero bank conflicts
59
60 ////////////////////////////////////////////////////////////////////////////////////////////////////
61
62 #if (ELIMINATE_CONFLICTS)
63 #define MEMORY_BANK_OFFSET(index) ((index) >> LOG2_MEMORY_BANK_COUNT + (index) >> (2*LOG2_MEMORY_BANK_COUNT))
64 #else
65 #define MEMORY_BANK_OFFSET(index) ((index) >> LOG2_MEMORY_BANK_COUNT)
66 #endif
67
68 ////////////////////////////////////////////////////////////////////////////////////////////////////
69
70 uint4
71 GetAddressMapping(int index)
72 {
73 const uint local_id = get_local_id(0);
74 const uint group_id = get_global_id(0) / get_local_size(0);
75 const uint group_size = get_local_size(0);
76
77 uint2 global_index;
78 global_index.x = index + local_id;
79 global_index.y = global_index.x + group_size;
80
81 uint2 local_index;
82 local_index.x = local_id;
83 local_index.y = local_id + group_size;
84
85 return (uint4)(global_index.x, global_index.y, local_index.x, local_index.y);
86 }
87
88 void
89 LoadLocalFromGlobal(
90 __local float *shared_data,
91 __global const float *input_data,
92 const uint4 address_pair,
93 const uint n)
94 {
95 const uint global_index_a = address_pair.x;
96 const uint global_index_b = address_pair.y;
97
98 const uint local_index_a = address_pair.z;
99 const uint local_index_b = address_pair.w;
100
101 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
102 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
103
104 shared_data[local_index_a + bank_offset_a] = input_data[global_index_a];
105 shared_data[local_index_b + bank_offset_b] = input_data[global_index_b];
106 }
107
108 void
109 LoadLocalFromGlobalNonPowerOfTwo(
110 __local float *shared_data,
111 __global const float *input_data,
112 const uint4 address_pair,
113 const uint n)
114 {
115 const uint global_index_a = address_pair.x;
116 const uint global_index_b = address_pair.y;
117
118 const uint local_index_a = address_pair.z;
119 const uint local_index_b = address_pair.w;
120
121 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
122 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
123
124 shared_data[local_index_a + bank_offset_a] = input_data[global_index_a];
125 shared_data[local_index_b + bank_offset_b] = (local_index_b < n) ? input_data[global_index_b] : 0;
126
127 barrier(CLK_LOCAL_MEM_FENCE);
128 }
129
130 void
131 StoreLocalToGlobal(
132 __global float* output_data,
133 __local const float* shared_data,
134 const uint4 address_pair,
135 const uint n)
136 {
137 barrier(CLK_LOCAL_MEM_FENCE);
138
139 const uint global_index_a = address_pair.x;
140 const uint global_index_b = address_pair.y;
141
142 const uint local_index_a = address_pair.z;
143 const uint local_index_b = address_pair.w;
144
145 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
146 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
147
148 output_data[global_index_a] = shared_data[local_index_a + bank_offset_a];
149 output_data[global_index_b] = shared_data[local_index_b + bank_offset_b];
150 }
151
152 void
153 StoreLocalToGlobalNonPowerOfTwo(
154 __global float* output_data,
155 __local const float* shared_data,
156 const uint4 address_pair,
157 const uint n)
158 {
159 barrier(CLK_LOCAL_MEM_FENCE);
160
161 const uint global_index_a = address_pair.x;
162 const uint global_index_b = address_pair.y;
163
164 const uint local_index_a = address_pair.z;
165 const uint local_index_b = address_pair.w;
166
167 const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
168 const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
169
170 output_data[global_index_a] = shared_data[local_index_a + bank_offset_a];
171 if(local_index_b < n)
172 output_data[global_index_b] = shared_data[local_index_b + bank_offset_b];
173 }
174
175 ////////////////////////////////////////////////////////////////////////////////////////////////////
176
177 void
178 ClearLastElement(
179 __local float* shared_data,
180 int group_index)
181 {
182 const uint local_id = get_local_id(0);
183 const uint group_id = get_global_id(0) / get_local_size(0);
184 const uint group_size = get_local_size(0);
185
186 if (local_id == 0)
187 {
188 int index = (group_size << 1) - 1;
189 index += MEMORY_BANK_OFFSET(index);
190 shared_data[index] = 0;
191 }
192 }
193
194 void
195 ClearLastElementStoreSum(
196 __local float* shared_data,
197 __global float *partial_sums,
198 int group_index)
199 {
200 const uint group_id = get_global_id(0) / get_local_size(0);
201 const uint group_size = get_local_size(0);
202 const uint local_id = get_local_id(0);
203
204 if (local_id == 0)
205 {
206 int index = (group_size << 1) - 1;
207 index += MEMORY_BANK_OFFSET(index);
208 partial_sums[group_index] = shared_data[index];
209 shared_data[index] = 0;
210 }
211 }
212
213 ////////////////////////////////////////////////////////////////////////////////////////////////////
214
215 uint
216 BuildPartialSum(
217 __local float *shared_data)
218 {
219 const uint local_id = get_local_id(0);
220 const uint group_size = get_local_size(0);
221 const uint two = 2;
222 uint stride = 1;
223
224 for (uint j = group_size; j > 0; j >>= 1)
225 {
226 barrier(CLK_LOCAL_MEM_FENCE);
227
228 if (local_id < j)
229 {
230 int i = mul24(mul24(two, stride), local_id);
231
232 uint local_index_a = i + stride - 1;
233 uint local_index_b = local_index_a + stride;
234
235 local_index_a += MEMORY_BANK_OFFSET(local_index_a);
236 local_index_b += MEMORY_BANK_OFFSET(local_index_b);
237
238 shared_data[local_index_b] += shared_data[local_index_a];
239 }
240
241 stride *= two;
242 }
243
244 return stride;
245 }
246
247 void
248 ScanRootToLeaves(
249 __local float *shared_data,
250 uint stride)
251 {
252 const uint local_id = get_local_id(0);
253 const uint group_id = get_global_id(0) / get_local_size(0);
254 const uint group_size = get_local_size(0);
255 const uint two = 2;
256
257 for (uint j = 1; j <= group_size; j *= two)
258 {
259 stride >>= 1;
260
261 barrier(CLK_LOCAL_MEM_FENCE);
262
263 if (local_id < j)
264 {
265 int i = mul24(mul24(two, stride), local_id);
266
267 uint local_index_a = i + stride - 1;
268 uint local_index_b = local_index_a + stride;
269
270 local_index_a += MEMORY_BANK_OFFSET(local_index_a);
271 local_index_b += MEMORY_BANK_OFFSET(local_index_b);
272
273 float t = shared_data[local_index_a];
274 shared_data[local_index_a] = shared_data[local_index_b];
275 shared_data[local_index_b] += t;
276 }
277 }
278 }
279
280 void
281 PreScanGroup(
282 __local float *shared_data,
283 int group_index)
284 {
285 const uint group_id = get_global_id(0) / get_local_size(0);
286
287 int stride = BuildPartialSum(shared_data);
288 ClearLastElement(shared_data, (group_index == 0) ? group_id : group_index);
289 ScanRootToLeaves(shared_data, stride);
290 }
291
292 void
293 PreScanGroupStoreSum(
294 __global float *partial_sums,
295 __local float *shared_data,
296 int group_index)
297 {
298 const uint group_id = get_global_id(0) / get_local_size(0);
299
300 int stride = BuildPartialSum(shared_data);
301 ClearLastElementStoreSum(shared_data, partial_sums, (group_index == 0) ? group_id : group_index);
302 ScanRootToLeaves(shared_data, stride);
303 }
304
305 ////////////////////////////////////////////////////////////////////////////////////////////////////
306
307 __kernel void
308 PreScanKernel(
309 __global float *output_data,
310 __global const float *input_data,
311 __local float* shared_data,
312 const uint group_index,
313 const uint base_index,
314 const uint n)
315 {
316 const uint group_id = get_global_id(0) / get_local_size(0);
317 const uint group_size = get_local_size(0);
318
319 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
320 uint4 address_pair = GetAddressMapping(local_index);
321
322 LoadLocalFromGlobal(shared_data, input_data, address_pair, n);
323 PreScanGroup(shared_data, group_index);
324 StoreLocalToGlobal(output_data, shared_data, address_pair, n);
325 }
326
327 __kernel void
328 PreScanStoreSumKernel(
329 __global float *output_data,
330 __global const float *input_data,
331 __global float *partial_sums,
332 __local float* shared_data,
333 const uint group_index,
334 const uint base_index,
335 const uint n)
336 {
337 const uint group_id = get_global_id(0) / get_local_size(0);
338 const uint group_size = get_local_size(0);
339
340 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
341 uint4 address_pair = GetAddressMapping(local_index);
342
343 LoadLocalFromGlobal(shared_data, input_data, address_pair, n);
344 PreScanGroupStoreSum(partial_sums, shared_data, group_index);
345 StoreLocalToGlobal(output_data, shared_data, address_pair, n);
346 }
347
348 __kernel void
349 PreScanStoreSumNonPowerOfTwoKernel(
350 __global float *output_data,
351 __global const float *input_data,
352 __global float *partial_sums,
353 __local float* shared_data,
354 const uint group_index,
355 const uint base_index,
356 const uint n)
357 {
358 const uint local_id = get_local_id(0);
359 const uint group_id = get_global_id(0) / get_local_size(0);
360 const uint group_size = get_local_size(0);
361
362 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
363 uint4 address_pair = GetAddressMapping(local_index);
364
365 LoadLocalFromGlobalNonPowerOfTwo(shared_data, input_data, address_pair, n);
366 PreScanGroupStoreSum(partial_sums, shared_data, group_index);
367 StoreLocalToGlobalNonPowerOfTwo(output_data, shared_data, address_pair, n);
368 }
369
370 __kernel void
371 PreScanNonPowerOfTwoKernel(
372 __global float *output_data,
373 __global const float *input_data,
374 __local float* shared_data,
375 const uint group_index,
376 const uint base_index,
377 const uint n)
378 {
379 const uint local_id = get_local_id(0);
380 const uint group_id = get_global_id(0) / get_local_size(0);
381 const uint group_size = get_local_size(0);
382
383 uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
384 uint4 address_pair = GetAddressMapping(local_index);
385
386 LoadLocalFromGlobalNonPowerOfTwo(shared_data, input_data, address_pair, n);
387 PreScanGroup(shared_data, group_index);
388 StoreLocalToGlobalNonPowerOfTwo(output_data, shared_data, address_pair, n);
389 }
390
391 ////////////////////////////////////////////////////////////////////////////////////////////////////
392
393 __kernel void UniformAddKernel(
394 __global float *output_data,
395 __global float *input_data,
396 __local float *shared_data,
397 const uint group_offset,
398 const uint base_index,
399 const uint n)
400 {
401 const uint local_id = get_local_id(0);
402 const uint group_id = get_global_id(0) / get_local_size(0);
403 const uint group_size = get_local_size(0);
404
405 if (local_id == 0)
406 shared_data[0] = input_data[group_id + group_offset];
407
408 barrier(CLK_LOCAL_MEM_FENCE);
409
410 uint address = mul24(group_id, (group_size << 1)) + base_index + local_id;
411
412 output_data[address] += shared_data[0];
413 if( (local_id + group_size) < n)
414 output_data[address + group_size] += shared_data[0];
415 }
416
417 ////////////////////////////////////////////////////////////////////////////////////////////////////
418
419