111
|
1 /* workitems.c -- The main runtime entry that performs work-item execution in
|
|
2 various ways and the builtin functions closely related to the
|
|
3 implementation.
|
|
4
|
145
|
5 Copyright (C) 2015-2020 Free Software Foundation, Inc.
|
111
|
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
|
|
7 for General Processor Tech.
|
|
8
|
|
9 Permission is hereby granted, free of charge, to any person obtaining a
|
|
10 copy of this software and associated documentation files
|
|
11 (the "Software"), to deal in the Software without restriction, including
|
|
12 without limitation the rights to use, copy, modify, merge, publish,
|
|
13 distribute, sublicense, and/or sell copies of the Software, and to
|
|
14 permit persons to whom the Software is furnished to do so, subject to
|
|
15 the following conditions:
|
|
16
|
|
17 The above copyright notice and this permission notice shall be included
|
|
18 in all copies or substantial portions of the Software.
|
|
19
|
|
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
|
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
|
26 USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
27 */
|
|
28
|
|
29 /* The fiber based multiple work-item work-group execution uses ucontext
|
|
30 based user mode threading. However, if gccbrig is able to optimize the
|
|
31 kernel to a much faster work-group function that implements the multiple
|
|
32 WI execution using loops instead of fibers requiring slow context switches,
|
|
33 the fiber-based implementation won't be called.
|
|
34 */
|
|
35
|
|
36 #include <stdlib.h>
|
|
37 #include <signal.h>
|
|
38 #include <string.h>
|
|
39
|
|
40 #include "workitems.h"
|
|
41 #include "phsa-rt.h"
|
|
42
|
|
43 #ifdef HAVE_FIBERS
|
|
44 #include "fibers.h"
|
|
45 #endif
|
|
46
|
|
47 #ifdef BENCHMARK_PHSA_RT
|
|
48 #include <stdio.h>
|
|
49 #include <time.h>
|
|
50
|
|
51 static uint64_t wi_count = 0;
|
|
52 static uint64_t wis_skipped = 0;
|
|
53 static uint64_t wi_total = 0;
|
|
54 static clock_t start_time;
|
|
55
|
|
56 #endif
|
|
57
|
|
58 #ifdef DEBUG_PHSA_RT
|
|
59 #include <stdio.h>
|
|
60 #endif
|
|
61
|
|
/* Alignment (bytes) of the per-WG private segment allocation.  */
#define PRIVATE_SEGMENT_ALIGN 256
/* Stack size (bytes) of each work-item fiber.  */
#define FIBER_STACK_SIZE (64*1024)
/* Alignment (bytes) of the group segment allocation.  */
#define GROUP_SEGMENT_ALIGN 256

/* Preserve this amount of additional space in the alloca stack as we need to
   store the alloca frame pointer to the alloca frame, thus must preserve
   space for it.  This thus supports at most 1024 functions with allocas in
   a call chain.  Parenthesized so the macro expands safely inside larger
   expressions (e.g. division or multiplication).  */
#define ALLOCA_OVERHEAD (1024*4)
|
|
71
|
111
|
72 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
|
|
73
|
|
74 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
|
|
75
|
|
76 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
|
|
77
|
|
78 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
|
|
79
|
|
80 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
|
|
81
|
|
/* Abort the whole process with the given exit CODE.  Used for
   unrecoverable runtime conditions such as allocation failures or
   too large work-groups.  Never returns.  */

void
phsa_fatal_error (int code)
{
  exit (code);
}
|
|
87
|
|
88 #ifdef HAVE_FIBERS
|
|
89 /* ucontext-based work-item thread implementation. Runs all work-items in
|
|
90 separate fibers. */
|
|
91
|
|
/* Entry point of a single work-item fiber.  ARG0/ARG1 carry the split
   pointer to this fiber's PHSAWorkItem context (see fibers.h).  The fiber
   loops over all work-groups assigned to the gang: it waits on the start
   barrier, executes the kernel for its WI position (unless the position
   falls outside a partial WG), then waits on the completion barrier.  The
   fiber that returns 0 from the completion barrier advances the shared WG
   coordinates for the next round.  */

static void
phsa_work_item_thread (int arg0, int arg1)
{
  void *arg = fiber_int_args_to_ptr (arg0, arg1);

  PHSAWorkItem *wi = (PHSAWorkItem *) arg;
  /* The WG state is shared and mutated by sibling fibers between
     barriers, thus accessed through a volatile pointer.  */
  volatile PHSAWorkGroup *wg = wi->wg;
  PHSAKernelLaunchData *l_data = wi->launch_data;

  do
    {
      int retcode
	= fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);

      /* At this point the threads can assume that either more_wgs is 0 or
	 the current_work_group_* is set to point to the WG executed next.  */
      if (!wi->wg->more_wgs)
	break;

      /* Snapshot the WG coordinates chosen by the previous round's
	 "first" fiber into this WI's context.  */
      wi->group_x = wg->x;
      wi->group_y = wg->y;
      wi->group_z = wg->z;

      wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
      wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
      wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);

#ifdef DEBUG_PHSA_RT
      printf (
	"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
	wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
	l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
#endif

      /* Skip WIs that fall outside a partial (grid-edge) work-group.  */
      if (wi->x < __hsail_currentworkgroupsize (0, wi)
	  && wi->y < __hsail_currentworkgroupsize (1, wi)
	  && wi->z < __hsail_currentworkgroupsize (2, wi))
	{
	  l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
			  wg->initial_group_offset, wg->private_base_ptr);
#ifdef DEBUG_PHSA_RT
	  printf ("done.\n");
#endif
#ifdef BENCHMARK_PHSA_RT
	  wi_count++;
#endif
	}
      else
	{
#ifdef DEBUG_PHSA_RT
	  printf ("skipped (partial WG).\n");
#endif
#ifdef BENCHMARK_PHSA_RT
	  wis_skipped++;
#endif
	}

      retcode
	= fiber_barrier_reach ((fiber_barrier_t *)
			       l_data->wg_completion_barrier);

      /* The first thread updates the WG to execute next etc.  */

      if (retcode == 0)
	{
#ifdef EXECUTE_WGS_BACKWARDS
	  /* Iterate the WG grid from the max corner towards the min
	     corner, x fastest.  */
	  if (wg->x == l_data->wg_min_x)
	    {
	      wg->x = l_data->wg_max_x - 1;
	      if (wg->y == l_data->wg_min_y)
		{
		  wg->y = l_data->wg_max_y - 1;
		  if (wg->z == l_data->wg_min_z)
		    wg->more_wgs = 0;
		  else
		    wg->z--;
		}
	      else
		wg->y--;
	    }
	  else
	    wg->x--;
#else
	  /* Iterate the WG grid in row-major order, x fastest; when all
	     dimensions wrap, signal that no WGs remain.  */
	  if (wg->x + 1 >= l_data->wg_max_x)
	    {
	      wg->x = l_data->wg_min_x;
	      if (wg->y + 1 >= l_data->wg_max_y)
		{
		  wg->y = l_data->wg_min_y;
		  if (wg->z + 1 >= l_data->wg_max_z)
		    wg->more_wgs = 0;
		  else
		    wg->z++;
		}
	      else
		wg->y++;
	    }
	  else
	    wg->x++;
#endif
	  wi->group_x = wg->x;
	  wi->group_y = wg->y;
	  wi->group_z = wg->z;

	  wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
	  wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
	  wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);

	  /* Reinitialize the work-group barrier according to the new WG's
	     size, which might not be the same as the previous ones, due
	     to "partial WGs".  */
	  size_t wg_size = __hsail_currentworkgroupsize (0, wi)
			   * __hsail_currentworkgroupsize (1, wi)
			   * __hsail_currentworkgroupsize (2, wi);

#ifdef DEBUG_PHSA_RT
	  printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
#endif
	  fiber_barrier_init ((fiber_barrier_t *)
			      wi->launch_data->wg_sync_barrier,
			      wg_size);

#ifdef BENCHMARK_PHSA_RT
	  if (wi_count % 1000 == 0)
	    {
	      clock_t spent_time = clock () - start_time;
	      double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
	      double wis_per_sec = wi_count / spent_time_sec;
	      uint64_t eta_sec
		= (wi_total - wi_count - wis_skipped) / wis_per_sec;

	      printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
		      "%lu s)\n",
		      wi_count, wis_skipped, (uint64_t) spent_time_sec,
		      (uint64_t) wis_per_sec, (uint64_t) eta_sec);
	    }
#endif
	}
    }
  while (1);

  fiber_exit ();
}
|
|
235 #endif
|
|
236
|
|
/* Minimum/maximum of two values.  Arguments and the whole expansion are
   parenthesized so operands with lower-precedence operators (e.g.
   MIN (a & b, c)) expand correctly.  NOTE: each argument may be
   evaluated twice; do not pass expressions with side effects.  */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
|
|
239
|
|
240 #ifdef HAVE_FIBERS
|
|
241 /* Spawns a given number of work-items to execute a set of work-groups,
|
|
242 blocks until their completion. */
|
|
243
|
|
244 static void
|
|
245 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
246 uint32_t group_local_offset, size_t wg_size_x,
|
|
247 size_t wg_size_y, size_t wg_size_z)
|
|
248 {
|
|
249 PHSAWorkItem *wi_threads = NULL;
|
|
250 PHSAWorkGroup wg;
|
|
251 size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
|
131
|
252 uint32_t group_x, group_y, group_z;
|
111
|
253 fiber_barrier_t wg_start_barrier;
|
|
254 fiber_barrier_t wg_completion_barrier;
|
|
255 fiber_barrier_t wg_sync_barrier;
|
|
256
|
|
257 max_x = wg_size_x == 0 ? 1 : wg_size_x;
|
|
258 max_y = wg_size_y == 0 ? 1 : wg_size_y;
|
|
259 max_z = wg_size_z == 0 ? 1 : wg_size_z;
|
|
260
|
|
261 size_t wg_size = max_x * max_y * max_z;
|
|
262 if (wg_size > PHSA_MAX_WG_SIZE)
|
|
263 phsa_fatal_error (2);
|
|
264
|
|
265 wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
|
|
266 if (wg.private_segment_total_size > 0
|
|
267 && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
|
|
268 wg.private_segment_total_size)
|
|
269 != 0)
|
|
270 phsa_fatal_error (3);
|
|
271
|
131
|
272 wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD;
|
111
|
273 wg.alloca_frame_p = wg.alloca_stack_p;
|
|
274 wg.initial_group_offset = group_local_offset;
|
|
275
|
|
276 #ifdef EXECUTE_WGS_BACKWARDS
|
131
|
277 group_x = context->wg_max_x - 1;
|
|
278 group_y = context->wg_max_y - 1;
|
|
279 group_z = context->wg_max_z - 1;
|
111
|
280 #else
|
131
|
281 group_x = context->wg_min_x;
|
|
282 group_y = context->wg_min_y;
|
|
283 group_z = context->wg_min_z;
|
111
|
284 #endif
|
|
285
|
|
286 fiber_barrier_init (&wg_sync_barrier, wg_size);
|
|
287 fiber_barrier_init (&wg_start_barrier, wg_size);
|
|
288 fiber_barrier_init (&wg_completion_barrier, wg_size);
|
|
289
|
|
290 context->wg_start_barrier = &wg_start_barrier;
|
|
291 context->wg_sync_barrier = &wg_sync_barrier;
|
|
292 context->wg_completion_barrier = &wg_completion_barrier;
|
|
293
|
|
294 wg.more_wgs = 1;
|
|
295 wg.group_base_ptr = group_base_ptr;
|
|
296
|
|
297 #ifdef BENCHMARK_PHSA_RT
|
|
298 wi_count = 0;
|
|
299 wis_skipped = 0;
|
|
300 start_time = clock ();
|
|
301 #endif
|
|
302 wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
|
|
303 for (x = 0; x < max_x; ++x)
|
|
304 for (y = 0; y < max_y; ++y)
|
|
305 for (z = 0; z < max_z; ++z)
|
|
306 {
|
|
307 PHSAWorkItem *wi = &wi_threads[flat_wi_id];
|
|
308 wi->launch_data = context;
|
|
309 wi->wg = &wg;
|
131
|
310
|
|
311 wg.x = wi->group_x = group_x;
|
|
312 wg.y = wi->group_y = group_y;
|
|
313 wg.z = wi->group_z = group_z;
|
|
314
|
|
315 wi->wg_size_x = context->dp->workgroup_size_x;
|
|
316 wi->wg_size_y = context->dp->workgroup_size_y;
|
|
317 wi->wg_size_z = context->dp->workgroup_size_z;
|
|
318
|
|
319 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
|
|
320 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
|
|
321 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
|
|
322
|
111
|
323 wi->x = x;
|
|
324 wi->y = y;
|
|
325 wi->z = z;
|
|
326
|
|
327 /* TODO: set the stack size according to the private
|
|
328 segment size. Too big stack consumes huge amount of
|
|
329 memory in case of huge number of WIs and a too small stack
|
|
330 will fail in mysterious and potentially dangerous ways. */
|
|
331
|
|
332 fiber_init (&wi->fiber, phsa_work_item_thread, wi,
|
|
333 FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
|
|
334 ++flat_wi_id;
|
|
335 }
|
|
336
|
|
337 do
|
|
338 {
|
|
339 --flat_wi_id;
|
|
340 fiber_join (&wi_threads[flat_wi_id].fiber);
|
|
341 }
|
|
342 while (flat_wi_id > 0);
|
|
343
|
|
344 if (wg.private_segment_total_size > 0)
|
|
345 free (wg.private_base_ptr);
|
|
346
|
|
347 free (wi_threads);
|
|
348 }
|
|
349
|
|
350 /* Spawn the work-item threads to execute work-groups and let
|
|
351 them execute all the WGs, including a potential partial WG. */
|
|
352
|
|
353 static void
|
|
354 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
355 uint32_t group_local_offset)
|
|
356 {
|
|
357 hsa_kernel_dispatch_packet_t *dp = context->dp;
|
|
358 size_t x, y, z;
|
|
359
|
|
360 context->group_segment_start_addr = (size_t) group_base_ptr;
|
|
361
|
|
362 /* HSA seems to allow the WG size to be larger than the grid size. We need to
|
|
363 saturate the effective WG size to the grid size to prevent the extra WIs
|
|
364 from executing. */
|
|
365 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
|
|
366 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
|
|
367 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
|
|
368 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
|
|
369 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
|
|
370
|
|
371 #ifdef BENCHMARK_PHSA_RT
|
|
372 wi_total = (uint64_t) dp->grid_size_x
|
|
373 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
|
|
374 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
|
|
375 #endif
|
|
376
|
|
377 /* For now execute all work groups in a single coarse thread (does not utilize
|
|
378 multicore/multithread). */
|
|
379 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
|
|
380
|
|
381 int dims = dp->setup & 0x3;
|
|
382
|
|
383 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
|
|
384 / dp->workgroup_size_x;
|
|
385
|
|
386 context->wg_max_y
|
|
387 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
|
|
388 / dp->workgroup_size_y;
|
|
389
|
|
390 context->wg_max_z
|
|
391 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
|
|
392 / dp->workgroup_size_z;
|
|
393
|
|
394 #ifdef DEBUG_PHSA_RT
|
|
395 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
|
|
396 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
|
|
397 context->wg_min_x, context->wg_min_y, context->wg_min_z,
|
|
398 context->wg_max_x, context->wg_max_y, context->wg_max_z,
|
|
399 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
|
|
400 dp->grid_size_y, dp->grid_size_z);
|
|
401 #endif
|
|
402
|
|
403 phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
|
|
404 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
|
|
405 }
|
|
406 #endif
|
|
407
|
|
408 /* Executes the given work-group function for all work groups in the grid.
|
|
409
|
|
410 A work-group function is a version of the original kernel which executes
|
|
411 the kernel for all work-items in a work-group. It is produced by gccbrig
|
|
412 if it can handle the kernel's barrier usage and is much faster way to
|
|
413 execute massive numbers of work-items in a non-SPMD machine than fibers
|
|
414 (easily 100x faster). */
|
|
415 static void
|
|
416 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
417 uint32_t group_local_offset)
|
|
418 {
|
|
419 hsa_kernel_dispatch_packet_t *dp = context->dp;
|
|
420 size_t x, y, z, wg_x, wg_y, wg_z;
|
|
421
|
|
422 context->group_segment_start_addr = (size_t) group_base_ptr;
|
|
423
|
|
424 /* HSA seems to allow the WG size to be larger than the grid size. We need
|
|
425 to saturate the effective WG size to the grid size to prevent the extra WIs
|
|
426 from executing. */
|
|
427 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
|
|
428 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
|
|
429 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
|
|
430 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
|
|
431 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
|
|
432
|
|
433 #ifdef BENCHMARK_PHSA_RT
|
|
434 wi_total = (uint64_t) dp->grid_size_x
|
|
435 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
|
|
436 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
|
|
437 #endif
|
|
438
|
|
439 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
|
|
440
|
|
441 int dims = dp->setup & 0x3;
|
|
442
|
|
443 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
|
|
444 / dp->workgroup_size_x;
|
|
445
|
|
446 context->wg_max_y
|
|
447 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
|
|
448 / dp->workgroup_size_y;
|
|
449
|
|
450 context->wg_max_z
|
|
451 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
|
|
452 / dp->workgroup_size_z;
|
|
453
|
|
454 #ifdef DEBUG_PHSA_RT
|
|
455 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
|
|
456 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
|
|
457 context->wg_min_x, context->wg_min_y, context->wg_min_z,
|
|
458 context->wg_max_x, context->wg_max_y, context->wg_max_z,
|
|
459 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
|
|
460 dp->grid_size_y, dp->grid_size_z);
|
|
461 #endif
|
|
462
|
|
463 PHSAWorkItem wi;
|
|
464 PHSAWorkGroup wg;
|
|
465 wi.wg = &wg;
|
|
466 wi.x = wi.y = wi.z = 0;
|
|
467 wi.launch_data = context;
|
|
468
|
|
469 #ifdef BENCHMARK_PHSA_RT
|
|
470 start_time = clock ();
|
|
471 uint64_t wg_count = 0;
|
|
472 #endif
|
|
473
|
|
474 size_t wg_size = __hsail_workgroupsize (0, &wi)
|
|
475 * __hsail_workgroupsize (1, &wi)
|
|
476 * __hsail_workgroupsize (2, &wi);
|
|
477
|
|
478 void *private_base_ptr = NULL;
|
|
479 if (dp->private_segment_size > 0
|
|
480 && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
|
|
481 dp->private_segment_size * wg_size)
|
|
482 != 0)
|
|
483 phsa_fatal_error (3);
|
|
484
|
131
|
485 wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD;
|
111
|
486 wg.alloca_frame_p = wg.alloca_stack_p;
|
|
487
|
|
488 wg.private_base_ptr = private_base_ptr;
|
|
489 wg.group_base_ptr = group_base_ptr;
|
|
490
|
|
491 #ifdef DEBUG_PHSA_RT
|
|
492 printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
|
|
493 wg_size, private_base_ptr);
|
|
494 #endif
|
|
495
|
|
496 for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
|
|
497 for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
|
|
498 for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
|
|
499 {
|
131
|
500 wi.group_x = wg_x;
|
|
501 wi.group_y = wg_y;
|
|
502 wi.group_z = wg_z;
|
|
503
|
|
504 wi.wg_size_x = context->dp->workgroup_size_x;
|
|
505 wi.wg_size_y = context->dp->workgroup_size_y;
|
|
506 wi.wg_size_z = context->dp->workgroup_size_z;
|
|
507
|
|
508 wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
|
|
509 wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
|
|
510 wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
|
111
|
511
|
|
512 context->kernel (context->kernarg_addr, &wi, group_base_ptr,
|
|
513 group_local_offset, private_base_ptr);
|
|
514
|
|
515 #if defined (BENCHMARK_PHSA_RT)
|
|
516 wg_count++;
|
|
517 if (wg_count % 1000000 == 0)
|
|
518 {
|
|
519 clock_t spent_time = clock () - start_time;
|
|
520 uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
|
|
521 + wg_z * sat_wg_size_z;
|
|
522 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
|
|
523 double wis_per_sec = wi_count / spent_time_sec;
|
|
524 uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
|
|
525
|
|
526 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
|
|
527 wi_count, (uint64_t) spent_time_sec,
|
|
528 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
|
|
529 }
|
|
530 #endif
|
|
531 }
|
|
532
|
|
533 #ifdef BENCHMARK_PHSA_RT
|
|
534 clock_t spent_time = clock () - start_time;
|
|
535 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
|
|
536 double wis_per_sec = wi_total / spent_time_sec;
|
|
537
|
|
538 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
|
|
539 (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
|
|
540 #endif
|
|
541 free (private_base_ptr);
|
|
542 private_base_ptr = NULL;
|
|
543 }
|
|
544
|
|
545 /* gccbrig generates the following from each HSAIL kernel:
|
|
546
|
|
547 1) The actual kernel function (a single work-item kernel or a work-group
|
|
548 function) generated from HSAIL (BRIG).
|
|
549
|
|
550 static void _Kernel (void* args, void* context, void* group_base_ptr)
|
|
551 {
|
|
552 ...
|
|
553 }
|
|
554
|
|
555 2) A public facing kernel function that is called from the PHSA runtime:
|
|
556
|
|
557 a) A single work-item function (that requires fibers for multi-WI):
|
|
558
|
|
559 void Kernel (void* context)
|
|
560 {
|
|
561 __launch_launch_kernel (_Kernel, context);
|
|
562 }
|
|
563
|
|
564 or
|
|
565
|
|
566 b) a when gccbrig could generate a work-group function:
|
|
567
|
|
568 void Kernel (void* context)
|
|
569 {
|
|
570 __hsail_launch_wg_function (_Kernel, context);
|
|
571 }
|
|
572 */
|
|
573
|
|
574 #ifdef HAVE_FIBERS
|
|
575
|
|
576 void
|
|
577 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
|
|
578 void *group_base_ptr, uint32_t group_local_offset)
|
|
579 {
|
|
580 context->kernel = kernel;
|
|
581 phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
|
|
582 }
|
|
583 #endif
|
|
584
|
|
585 void
|
|
586 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
|
|
587 PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
588 uint32_t group_local_offset)
|
|
589 {
|
|
590 context->kernel = kernel;
|
|
591 phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
|
|
592 }
|
|
593
|
|
594 uint32_t
|
|
595 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
|
|
596 {
|
|
597 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
598
|
|
599 uint32_t id;
|
|
600 switch (dim)
|
|
601 {
|
|
602 default:
|
|
603 case 0:
|
|
604 /* Overflow semantics in the case of WG dim > grid dim. */
|
131
|
605 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
|
111
|
606 % dp->grid_size_x;
|
|
607 break;
|
|
608 case 1:
|
131
|
609 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
|
111
|
610 % dp->grid_size_y;
|
|
611 break;
|
|
612 case 2:
|
131
|
613 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
|
111
|
614 % dp->grid_size_z;
|
|
615 break;
|
|
616 }
|
|
617 return id;
|
|
618 }
|
|
619
|
|
620 uint64_t
|
|
621 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
|
|
622 {
|
|
623 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
624
|
|
625 uint64_t id;
|
|
626 switch (dim)
|
|
627 {
|
|
628 default:
|
|
629 case 0:
|
|
630 /* Overflow semantics in the case of WG dim > grid dim. */
|
131
|
631 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
|
111
|
632 % dp->grid_size_x;
|
|
633 break;
|
|
634 case 1:
|
131
|
635 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
|
111
|
636 % dp->grid_size_y;
|
|
637 break;
|
|
638 case 2:
|
131
|
639 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
|
111
|
640 % dp->grid_size_z;
|
|
641 break;
|
|
642 }
|
|
643 return id;
|
|
644 }
|
|
645
|
|
646
|
|
647 uint32_t
|
|
648 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
|
|
649 {
|
|
650 PHSAWorkItem *c = (PHSAWorkItem *) context;
|
|
651 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
652
|
|
653 /* The number of dimensions is in the two least significant bits. */
|
|
654 int dims = dp->setup & 0x3;
|
|
655
|
|
656 uint32_t id;
|
|
657 switch (dim)
|
|
658 {
|
|
659 default:
|
|
660 case 0:
|
|
661 id = c->x;
|
|
662 break;
|
|
663 case 1:
|
|
664 id = dims < 2 ? 0 : c->y;
|
|
665 break;
|
|
666 case 2:
|
|
667 id = dims < 3 ? 0 : c->z;
|
|
668 break;
|
|
669 }
|
|
670 return id;
|
|
671 }
|
|
672
|
|
673 uint32_t
|
|
674 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
|
|
675 {
|
|
676 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
677 int dims = dp->setup & 0x3;
|
|
678
|
|
679 uint32_t id;
|
|
680 switch (dim)
|
|
681 {
|
|
682 default:
|
|
683 case 0:
|
|
684 id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
|
|
685 break;
|
|
686 case 1:
|
|
687 id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
|
|
688 / dp->workgroup_size_y;
|
|
689 break;
|
|
690 case 2:
|
|
691 id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
|
|
692 / dp->workgroup_size_z;
|
|
693 break;
|
|
694 }
|
|
695 return id;
|
|
696 }
|
|
697
|
|
698 uint32_t
|
|
699 __hsail_workitemflatid (PHSAWorkItem *c)
|
|
700 {
|
|
701 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
|
|
702
|
|
703 return c->x + c->y * dp->workgroup_size_x
|
|
704 + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
|
|
705 }
|
|
706
|
|
707 uint32_t
|
|
708 __hsail_currentworkitemflatid (PHSAWorkItem *c)
|
|
709 {
|
|
710 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
|
|
711
|
|
712 return c->x + c->y * __hsail_currentworkgroupsize (0, c)
|
|
713 + c->z * __hsail_currentworkgroupsize (0, c)
|
|
714 * __hsail_currentworkgroupsize (1, c);
|
|
715 }
|
|
716
|
|
717 void
|
|
718 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
|
|
719 {
|
|
720 switch (dim)
|
|
721 {
|
|
722 default:
|
|
723 case 0:
|
|
724 context->x = id;
|
|
725 break;
|
|
726 case 1:
|
|
727 context->y = id;
|
|
728 break;
|
|
729 case 2:
|
|
730 context->z = id;
|
|
731 break;
|
|
732 }
|
|
733 }
|
|
734
|
|
735 uint64_t
|
|
736 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
|
|
737 {
|
|
738 PHSAWorkItem *c = (PHSAWorkItem *) context;
|
|
739 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
740
|
|
741 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
|
|
742 uint64_t id0 = __hsail_workitemabsid (0, context);
|
|
743 uint64_t id1 = __hsail_workitemabsid (1, context);
|
|
744 uint64_t id2 = __hsail_workitemabsid (2, context);
|
|
745
|
|
746 uint64_t max0 = dp->grid_size_x;
|
|
747 uint64_t max1 = dp->grid_size_y;
|
|
748 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
|
|
749
|
|
750 return id;
|
|
751 }
|
|
752
|
|
753 uint32_t
|
|
754 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
|
|
755 {
|
|
756 PHSAWorkItem *c = (PHSAWorkItem *) context;
|
|
757 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
758
|
|
759 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
|
|
760 uint64_t id0 = __hsail_workitemabsid (0, context);
|
|
761 uint64_t id1 = __hsail_workitemabsid (1, context);
|
|
762 uint64_t id2 = __hsail_workitemabsid (2, context);
|
|
763
|
|
764 uint64_t max0 = dp->grid_size_x;
|
|
765 uint64_t max1 = dp->grid_size_y;
|
|
766 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
|
|
767 return (uint32_t) id;
|
|
768 }
|
|
769
|
|
770 uint32_t
|
|
771 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
|
|
772 {
|
|
773 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
774 uint32_t wg_size = 0;
|
|
775 switch (dim)
|
|
776 {
|
|
777 default:
|
|
778 case 0:
|
131
|
779 if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
|
111
|
780 wg_size = dp->workgroup_size_x; /* Full WG. */
|
|
781 else
|
|
782 wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
|
|
783 break;
|
|
784 case 1:
|
131
|
785 if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
|
111
|
786 wg_size = dp->workgroup_size_y; /* Full WG. */
|
|
787 else
|
|
788 wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
|
|
789 break;
|
|
790 case 2:
|
131
|
791 if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
|
111
|
792 wg_size = dp->workgroup_size_z; /* Full WG. */
|
|
793 else
|
|
794 wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
|
|
795 break;
|
|
796 }
|
|
797 return wg_size;
|
|
798 }
|
|
799
|
|
800 uint32_t
|
|
801 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
|
|
802 {
|
|
803 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
804 switch (dim)
|
|
805 {
|
|
806 default:
|
|
807 case 0:
|
|
808 return dp->workgroup_size_x;
|
|
809 case 1:
|
|
810 return dp->workgroup_size_y;
|
|
811 case 2:
|
|
812 return dp->workgroup_size_z;
|
|
813 }
|
|
814 }
|
|
815
|
|
816 uint32_t
|
|
817 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
|
|
818 {
|
|
819 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
820 switch (dim)
|
|
821 {
|
|
822 default:
|
|
823 case 0:
|
|
824 return dp->grid_size_x;
|
|
825 case 1:
|
|
826 return dp->grid_size_y;
|
|
827 case 2:
|
|
828 return dp->grid_size_z;
|
|
829 }
|
|
830 }
|
|
831
|
|
832 uint32_t
|
|
833 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
|
|
834 {
|
|
835 switch (dim)
|
|
836 {
|
|
837 default:
|
|
838 case 0:
|
131
|
839 return wi->group_x;
|
111
|
840 case 1:
|
131
|
841 return wi->group_y;
|
111
|
842 case 2:
|
131
|
843 return wi->group_z;
|
111
|
844 }
|
|
845 }
|
|
846
|
|
847 uint32_t
|
|
848 __hsail_dim (PHSAWorkItem *wi)
|
|
849 {
|
|
850 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
851 return dp->setup & 0x3;
|
|
852 }
|
|
853
|
|
854 uint64_t
|
|
855 __hsail_packetid (PHSAWorkItem *wi)
|
|
856 {
|
|
857 return wi->launch_data->packet_id;
|
|
858 }
|
|
859
|
|
860 uint32_t
|
|
861 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
|
|
862 {
|
|
863 return (uint32_t) wi->launch_data->dp->completion_signal.handle;
|
|
864 }
|
|
865
|
|
866 uint64_t
|
|
867 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
|
|
868 {
|
|
869 return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
|
|
870 }
|
|
871
|
|
872 #ifdef HAVE_FIBERS
|
|
873 void
|
|
874 __hsail_barrier (PHSAWorkItem *wi)
|
|
875 {
|
|
876 fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
|
|
877 }
|
|
878 #endif
|
|
879
|
|
880 /* Return a 32b private segment address that points to a dynamically
|
|
881 allocated chunk of 'size' with 'align'.
|
|
882
|
|
883 Allocates the space from the end of the private segment allocated
|
|
884 for the whole work group. In implementations with separate private
|
|
885 memories per WI, we will need to have a stack pointer per WI. But in
|
|
886 the current implementation, the segment is shared, so we possibly
|
|
887 save some space in case all WIs do not call the alloca.
|
|
888
|
|
889 The "alloca frames" are organized as follows:
|
|
890
|
|
891 wg->alloca_stack_p points to the last allocated data (initially
|
|
892 outside the private segment)
|
|
893 wg->alloca_frame_p points to the first address _outside_ the current
|
|
894 function's allocations (initially to the same as alloca_stack_p)
|
|
895
|
|
896 The data is allocated downwards from the end of the private segment.
|
|
897
|
|
898 In the beginning of a new function which has allocas, a new alloca
|
|
899 frame is pushed which adds the current alloca_frame_p (the current
|
|
900 function's frame starting point) to the top of the alloca stack and
|
|
901 alloca_frame_p is set to the current stack position.
|
|
902
|
|
903 At the exit points of a function with allocas, the alloca frame
|
|
904 is popped before returning. This involves popping the alloca_frame_p
|
|
905 to the one of the previous function in the call stack, and alloca_stack_p
|
|
906 similarly, to the position of the last word alloca'd by the previous
|
|
907 function.
|
|
908 */
|
|
909
|
|
/* Allocate SIZE bytes aligned to ALIGN from the work-group's shared
   alloca stack and return the resulting private-segment offset.
   Terminates the process when the stack would underflow offset 0.  */

uint32_t
__hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
{
  volatile PHSAWorkGroup *wg = wi->wg;
  /* Signed so that running below offset zero is detectable below.  */
  int64_t new_pos = wg->alloca_stack_p - size;
  /* Step downwards until aligned; a loop (not masking) because ALIGN
     is not guaranteed to be a power of two here.  */
  while (new_pos % align != 0)
    new_pos--;
  if (new_pos < 0)
    phsa_fatal_error (2);

  wg->alloca_stack_p = new_pos;

#ifdef DEBUG_ALLOCA
  printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
	  wg->alloca_stack_p, wg->alloca_frame_p);
#endif
  return new_pos;
}
|
|
928
|
|
929 /* Initializes a new "alloca frame" in the private segment.
|
|
930 This should be called at all the function entry points in case
|
|
931 the function contains at least one call to alloca. */
|
|
932
|
|
/* Push a new alloca frame for WI's work-group: save the current frame
   pointer in a 4-byte link word on the alloca stack and make the new
   frame start at that word.  Called at entry of any function that uses
   alloca.  */

void
__hsail_alloca_push_frame (PHSAWorkItem *wi)
{
  volatile PHSAWorkGroup *wg = wi->wg;

  /* Store the alloca_frame_p without any alignment padding so
     we know exactly where the previous frame ended after popping
     it.  */
#ifdef DEBUG_ALLOCA
  printf ("--- push frame ");
#endif
  /* Reserve the unaligned 4-byte link slot on the alloca stack.  */
  uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
  /* Save the caller's frame pointer into the slot; memcpy because the
     slot may be unaligned and the source is volatile-qualified.  */
  memcpy (wg->private_base_ptr + last_word_offs,
	  (const void *) &wg->alloca_frame_p, 4);
  /* The new frame begins at the saved link word.  */
  wg->alloca_frame_p = last_word_offs;

#ifdef DEBUG_ALLOCA
  printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
#endif
}
|
|
953
|
|
954 /* Frees the current "alloca frame" and restores the frame
|
|
955 pointer.
|
|
956 This should be called at all the function return points in case
|
|
957 the function contains at least one call to alloca. Restores the
|
|
958 alloca stack to the condition it was before pushing the frame
|
|
959 the last time. */
|
|
void
__hsail_alloca_pop_frame (PHSAWorkItem *wi)
{
  volatile PHSAWorkGroup *wg = wi->wg;

  /* Release everything this frame alloca'd: the stack pointer returns
     to the frame start (the link word pushed by push_frame).  */
  wg->alloca_stack_p = wg->alloca_frame_p;
  /* Restore the caller's frame pointer from the 4-byte link word.  */
  memcpy ((void *) &wg->alloca_frame_p,
	  (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
  /* Now frame_p points to the beginning of the previous function's
     frame and stack_p to its end.  */

  /* Also release the link word itself.  */
  wg->alloca_stack_p += 4;

#ifdef DEBUG_ALLOCA
  printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
	  wg->alloca_frame_p);
#endif
}
|