annotate libhsail-rt/rt/workitems.c @ 158:494b0b89df80 default tip

...
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 25 May 2020 18:13:55 +0900
parents 1830386684a0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* workitems.c -- The main runtime entry that performs work-item execution in
kono
parents:
diff changeset
2 various ways and the builtin functions closely related to the
kono
parents:
diff changeset
3 implementation.
kono
parents:
diff changeset
4
145
1830386684a0 gcc-9.2.0
anatofuz
parents: 131
diff changeset
5 Copyright (C) 2015-2020 Free Software Foundation, Inc.
111
kono
parents:
diff changeset
6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
kono
parents:
diff changeset
7 for General Processor Tech.
kono
parents:
diff changeset
8
kono
parents:
diff changeset
9 Permission is hereby granted, free of charge, to any person obtaining a
kono
parents:
diff changeset
10 copy of this software and associated documentation files
kono
parents:
diff changeset
11 (the "Software"), to deal in the Software without restriction, including
kono
parents:
diff changeset
12 without limitation the rights to use, copy, modify, merge, publish,
kono
parents:
diff changeset
13 distribute, sublicense, and/or sell copies of the Software, and to
kono
parents:
diff changeset
14 permit persons to whom the Software is furnished to do so, subject to
kono
parents:
diff changeset
15 the following conditions:
kono
parents:
diff changeset
16
kono
parents:
diff changeset
17 The above copyright notice and this permission notice shall be included
kono
parents:
diff changeset
18 in all copies or substantial portions of the Software.
kono
parents:
diff changeset
19
kono
parents:
diff changeset
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
kono
parents:
diff changeset
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
kono
parents:
diff changeset
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
kono
parents:
diff changeset
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
kono
parents:
diff changeset
24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
kono
parents:
diff changeset
25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
kono
parents:
diff changeset
26 USE OR OTHER DEALINGS IN THE SOFTWARE.
kono
parents:
diff changeset
27 */
kono
parents:
diff changeset
28
kono
parents:
diff changeset
29 /* The fiber based multiple work-item work-group execution uses ucontext
kono
parents:
diff changeset
30 based user mode threading. However, if gccbrig is able to optimize the
kono
parents:
diff changeset
31 kernel to a much faster work-group function that implements the multiple
kono
parents:
diff changeset
32 WI execution using loops instead of fibers requiring slow context switches,
kono
parents:
diff changeset
33 the fiber-based implementation won't be called.
kono
parents:
diff changeset
34 */
kono
parents:
diff changeset
35
kono
parents:
diff changeset
36 #include <stdlib.h>
kono
parents:
diff changeset
37 #include <signal.h>
kono
parents:
diff changeset
38 #include <string.h>
kono
parents:
diff changeset
39
kono
parents:
diff changeset
40 #include "workitems.h"
kono
parents:
diff changeset
41 #include "phsa-rt.h"
kono
parents:
diff changeset
42
kono
parents:
diff changeset
43 #ifdef HAVE_FIBERS
kono
parents:
diff changeset
44 #include "fibers.h"
kono
parents:
diff changeset
45 #endif
kono
parents:
diff changeset
46
kono
parents:
diff changeset
47 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
48 #include <stdio.h>
kono
parents:
diff changeset
49 #include <time.h>
kono
parents:
diff changeset
50
kono
parents:
diff changeset
51 static uint64_t wi_count = 0;
kono
parents:
diff changeset
52 static uint64_t wis_skipped = 0;
kono
parents:
diff changeset
53 static uint64_t wi_total = 0;
kono
parents:
diff changeset
54 static clock_t start_time;
kono
parents:
diff changeset
55
kono
parents:
diff changeset
56 #endif
kono
parents:
diff changeset
57
kono
parents:
diff changeset
58 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
59 #include <stdio.h>
kono
parents:
diff changeset
60 #endif
kono
parents:
diff changeset
61
kono
parents:
diff changeset
62 #define PRIVATE_SEGMENT_ALIGN 256
kono
parents:
diff changeset
63 #define FIBER_STACK_SIZE (64*1024)
kono
parents:
diff changeset
64 #define GROUP_SEGMENT_ALIGN 256
kono
parents:
diff changeset
65
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
66 /* Preserve this amount of additional space in the alloca stack as we need to
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
67 store the alloca frame pointer to the alloca frame, thus must preserve
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
68 space for it. This thus supports at most 1024 functions with allocas in
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
69 a call chain. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
70 #define ALLOCA_OVERHEAD 1024*4
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
71
111
kono
parents:
diff changeset
72 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
kono
parents:
diff changeset
73
kono
parents:
diff changeset
74 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
kono
parents:
diff changeset
75
kono
parents:
diff changeset
76 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
kono
parents:
diff changeset
77
kono
parents:
diff changeset
78 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
kono
parents:
diff changeset
79
kono
parents:
diff changeset
80 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
kono
parents:
diff changeset
81
kono
parents:
diff changeset
82 void
kono
parents:
diff changeset
83 phsa_fatal_error (int code)
kono
parents:
diff changeset
84 {
kono
parents:
diff changeset
85 exit (code);
kono
parents:
diff changeset
86 }
kono
parents:
diff changeset
87
kono
parents:
diff changeset
88 #ifdef HAVE_FIBERS
kono
parents:
diff changeset
89 /* ucontext-based work-item thread implementation. Runs all work-items in
kono
parents:
diff changeset
90 separate fibers. */
kono
parents:
diff changeset
91
kono
parents:
diff changeset
92 static void
kono
parents:
diff changeset
93 phsa_work_item_thread (int arg0, int arg1)
kono
parents:
diff changeset
94 {
kono
parents:
diff changeset
95 void *arg = fiber_int_args_to_ptr (arg0, arg1);
kono
parents:
diff changeset
96
kono
parents:
diff changeset
97 PHSAWorkItem *wi = (PHSAWorkItem *) arg;
kono
parents:
diff changeset
98 volatile PHSAWorkGroup *wg = wi->wg;
kono
parents:
diff changeset
99 PHSAKernelLaunchData *l_data = wi->launch_data;
kono
parents:
diff changeset
100
kono
parents:
diff changeset
101 do
kono
parents:
diff changeset
102 {
kono
parents:
diff changeset
103 int retcode
kono
parents:
diff changeset
104 = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
kono
parents:
diff changeset
105
kono
parents:
diff changeset
106 /* At this point the threads can assume that either more_wgs is 0 or
kono
parents:
diff changeset
107 the current_work_group_* is set to point to the WG executed next. */
kono
parents:
diff changeset
108 if (!wi->wg->more_wgs)
kono
parents:
diff changeset
109 break;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
110
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
111 wi->group_x = wg->x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
112 wi->group_y = wg->y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
113 wi->group_z = wg->z;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
114
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
115 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
116 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
117 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
118
111
kono
parents:
diff changeset
119 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
120 printf (
kono
parents:
diff changeset
121 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
122 wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
123 l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
111
kono
parents:
diff changeset
124 #endif
kono
parents:
diff changeset
125
kono
parents:
diff changeset
126 if (wi->x < __hsail_currentworkgroupsize (0, wi)
kono
parents:
diff changeset
127 && wi->y < __hsail_currentworkgroupsize (1, wi)
kono
parents:
diff changeset
128 && wi->z < __hsail_currentworkgroupsize (2, wi))
kono
parents:
diff changeset
129 {
kono
parents:
diff changeset
130 l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
kono
parents:
diff changeset
131 wg->initial_group_offset, wg->private_base_ptr);
kono
parents:
diff changeset
132 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
133 printf ("done.\n");
kono
parents:
diff changeset
134 #endif
kono
parents:
diff changeset
135 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
136 wi_count++;
kono
parents:
diff changeset
137 #endif
kono
parents:
diff changeset
138 }
kono
parents:
diff changeset
139 else
kono
parents:
diff changeset
140 {
kono
parents:
diff changeset
141 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
142 printf ("skipped (partial WG).\n");
kono
parents:
diff changeset
143 #endif
kono
parents:
diff changeset
144 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
145 wis_skipped++;
kono
parents:
diff changeset
146 #endif
kono
parents:
diff changeset
147 }
kono
parents:
diff changeset
148
kono
parents:
diff changeset
149 retcode
kono
parents:
diff changeset
150 = fiber_barrier_reach ((fiber_barrier_t *)
kono
parents:
diff changeset
151 l_data->wg_completion_barrier);
kono
parents:
diff changeset
152
kono
parents:
diff changeset
153 /* The first thread updates the WG to execute next etc. */
kono
parents:
diff changeset
154
kono
parents:
diff changeset
155 if (retcode == 0)
kono
parents:
diff changeset
156 {
kono
parents:
diff changeset
157 #ifdef EXECUTE_WGS_BACKWARDS
kono
parents:
diff changeset
158 if (wg->x == l_data->wg_min_x)
kono
parents:
diff changeset
159 {
kono
parents:
diff changeset
160 wg->x = l_data->wg_max_x - 1;
kono
parents:
diff changeset
161 if (wg->y == l_data->wg_min_y)
kono
parents:
diff changeset
162 {
kono
parents:
diff changeset
163 wg->y = l_data->wg_max_y - 1;
kono
parents:
diff changeset
164 if (wg->z == l_data->wg_min_z)
kono
parents:
diff changeset
165 wg->more_wgs = 0;
kono
parents:
diff changeset
166 else
kono
parents:
diff changeset
167 wg->z--;
kono
parents:
diff changeset
168 }
kono
parents:
diff changeset
169 else
kono
parents:
diff changeset
170 wg->y--;
kono
parents:
diff changeset
171 }
kono
parents:
diff changeset
172 else
kono
parents:
diff changeset
173 wg->x--;
kono
parents:
diff changeset
174 #else
kono
parents:
diff changeset
175 if (wg->x + 1 >= l_data->wg_max_x)
kono
parents:
diff changeset
176 {
kono
parents:
diff changeset
177 wg->x = l_data->wg_min_x;
kono
parents:
diff changeset
178 if (wg->y + 1 >= l_data->wg_max_y)
kono
parents:
diff changeset
179 {
kono
parents:
diff changeset
180 wg->y = l_data->wg_min_y;
kono
parents:
diff changeset
181 if (wg->z + 1 >= l_data->wg_max_z)
kono
parents:
diff changeset
182 wg->more_wgs = 0;
kono
parents:
diff changeset
183 else
kono
parents:
diff changeset
184 wg->z++;
kono
parents:
diff changeset
185 }
kono
parents:
diff changeset
186 else
kono
parents:
diff changeset
187 wg->y++;
kono
parents:
diff changeset
188 }
kono
parents:
diff changeset
189 else
kono
parents:
diff changeset
190 wg->x++;
kono
parents:
diff changeset
191 #endif
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
192 wi->group_x = wg->x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
193 wi->group_y = wg->y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
194 wi->group_z = wg->z;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
195
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
196 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
197 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
198 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
111
kono
parents:
diff changeset
199
kono
parents:
diff changeset
200 /* Reinitialize the work-group barrier according to the new WG's
kono
parents:
diff changeset
201 size, which might not be the same as the previous ones, due
kono
parents:
diff changeset
202 to "partial WGs". */
kono
parents:
diff changeset
203 size_t wg_size = __hsail_currentworkgroupsize (0, wi)
kono
parents:
diff changeset
204 * __hsail_currentworkgroupsize (1, wi)
kono
parents:
diff changeset
205 * __hsail_currentworkgroupsize (2, wi);
kono
parents:
diff changeset
206
kono
parents:
diff changeset
207 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
208 printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
kono
parents:
diff changeset
209 #endif
kono
parents:
diff changeset
210 fiber_barrier_init ((fiber_barrier_t *)
kono
parents:
diff changeset
211 wi->launch_data->wg_sync_barrier,
kono
parents:
diff changeset
212 wg_size);
kono
parents:
diff changeset
213
kono
parents:
diff changeset
214 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
215 if (wi_count % 1000 == 0)
kono
parents:
diff changeset
216 {
kono
parents:
diff changeset
217 clock_t spent_time = clock () - start_time;
kono
parents:
diff changeset
218 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
kono
parents:
diff changeset
219 double wis_per_sec = wi_count / spent_time_sec;
kono
parents:
diff changeset
220 uint64_t eta_sec
kono
parents:
diff changeset
221 = (wi_total - wi_count - wis_skipped) / wis_per_sec;
kono
parents:
diff changeset
222
kono
parents:
diff changeset
223 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
kono
parents:
diff changeset
224 "%lu s)\n",
kono
parents:
diff changeset
225 wi_count, wis_skipped, (uint64_t) spent_time_sec,
kono
parents:
diff changeset
226 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
kono
parents:
diff changeset
227 }
kono
parents:
diff changeset
228 #endif
kono
parents:
diff changeset
229 }
kono
parents:
diff changeset
230 }
kono
parents:
diff changeset
231 while (1);
kono
parents:
diff changeset
232
kono
parents:
diff changeset
233 fiber_exit ();
kono
parents:
diff changeset
234 }
kono
parents:
diff changeset
235 #endif
kono
parents:
diff changeset
236
kono
parents:
diff changeset
237 #define MIN(a, b) ((a < b) ? a : b)
kono
parents:
diff changeset
238 #define MAX(a, b) ((a > b) ? a : b)
kono
parents:
diff changeset
239
kono
parents:
diff changeset
240 #ifdef HAVE_FIBERS
kono
parents:
diff changeset
241 /* Spawns a given number of work-items to execute a set of work-groups,
kono
parents:
diff changeset
242 blocks until their completion. */
kono
parents:
diff changeset
243
kono
parents:
diff changeset
244 static void
kono
parents:
diff changeset
245 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
kono
parents:
diff changeset
246 uint32_t group_local_offset, size_t wg_size_x,
kono
parents:
diff changeset
247 size_t wg_size_y, size_t wg_size_z)
kono
parents:
diff changeset
248 {
kono
parents:
diff changeset
249 PHSAWorkItem *wi_threads = NULL;
kono
parents:
diff changeset
250 PHSAWorkGroup wg;
kono
parents:
diff changeset
251 size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
252 uint32_t group_x, group_y, group_z;
111
kono
parents:
diff changeset
253 fiber_barrier_t wg_start_barrier;
kono
parents:
diff changeset
254 fiber_barrier_t wg_completion_barrier;
kono
parents:
diff changeset
255 fiber_barrier_t wg_sync_barrier;
kono
parents:
diff changeset
256
kono
parents:
diff changeset
257 max_x = wg_size_x == 0 ? 1 : wg_size_x;
kono
parents:
diff changeset
258 max_y = wg_size_y == 0 ? 1 : wg_size_y;
kono
parents:
diff changeset
259 max_z = wg_size_z == 0 ? 1 : wg_size_z;
kono
parents:
diff changeset
260
kono
parents:
diff changeset
261 size_t wg_size = max_x * max_y * max_z;
kono
parents:
diff changeset
262 if (wg_size > PHSA_MAX_WG_SIZE)
kono
parents:
diff changeset
263 phsa_fatal_error (2);
kono
parents:
diff changeset
264
kono
parents:
diff changeset
265 wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
kono
parents:
diff changeset
266 if (wg.private_segment_total_size > 0
kono
parents:
diff changeset
267 && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
kono
parents:
diff changeset
268 wg.private_segment_total_size)
kono
parents:
diff changeset
269 != 0)
kono
parents:
diff changeset
270 phsa_fatal_error (3);
kono
parents:
diff changeset
271
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
272 wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD;
111
kono
parents:
diff changeset
273 wg.alloca_frame_p = wg.alloca_stack_p;
kono
parents:
diff changeset
274 wg.initial_group_offset = group_local_offset;
kono
parents:
diff changeset
275
kono
parents:
diff changeset
276 #ifdef EXECUTE_WGS_BACKWARDS
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
277 group_x = context->wg_max_x - 1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
278 group_y = context->wg_max_y - 1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
279 group_z = context->wg_max_z - 1;
111
kono
parents:
diff changeset
280 #else
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
281 group_x = context->wg_min_x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
282 group_y = context->wg_min_y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
283 group_z = context->wg_min_z;
111
kono
parents:
diff changeset
284 #endif
kono
parents:
diff changeset
285
kono
parents:
diff changeset
286 fiber_barrier_init (&wg_sync_barrier, wg_size);
kono
parents:
diff changeset
287 fiber_barrier_init (&wg_start_barrier, wg_size);
kono
parents:
diff changeset
288 fiber_barrier_init (&wg_completion_barrier, wg_size);
kono
parents:
diff changeset
289
kono
parents:
diff changeset
290 context->wg_start_barrier = &wg_start_barrier;
kono
parents:
diff changeset
291 context->wg_sync_barrier = &wg_sync_barrier;
kono
parents:
diff changeset
292 context->wg_completion_barrier = &wg_completion_barrier;
kono
parents:
diff changeset
293
kono
parents:
diff changeset
294 wg.more_wgs = 1;
kono
parents:
diff changeset
295 wg.group_base_ptr = group_base_ptr;
kono
parents:
diff changeset
296
kono
parents:
diff changeset
297 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
298 wi_count = 0;
kono
parents:
diff changeset
299 wis_skipped = 0;
kono
parents:
diff changeset
300 start_time = clock ();
kono
parents:
diff changeset
301 #endif
kono
parents:
diff changeset
302 wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
kono
parents:
diff changeset
303 for (x = 0; x < max_x; ++x)
kono
parents:
diff changeset
304 for (y = 0; y < max_y; ++y)
kono
parents:
diff changeset
305 for (z = 0; z < max_z; ++z)
kono
parents:
diff changeset
306 {
kono
parents:
diff changeset
307 PHSAWorkItem *wi = &wi_threads[flat_wi_id];
kono
parents:
diff changeset
308 wi->launch_data = context;
kono
parents:
diff changeset
309 wi->wg = &wg;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
310
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
311 wg.x = wi->group_x = group_x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
312 wg.y = wi->group_y = group_y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
313 wg.z = wi->group_z = group_z;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
314
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
315 wi->wg_size_x = context->dp->workgroup_size_x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
316 wi->wg_size_y = context->dp->workgroup_size_y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
317 wi->wg_size_z = context->dp->workgroup_size_z;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
318
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
319 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
320 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
321 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
322
111
kono
parents:
diff changeset
323 wi->x = x;
kono
parents:
diff changeset
324 wi->y = y;
kono
parents:
diff changeset
325 wi->z = z;
kono
parents:
diff changeset
326
kono
parents:
diff changeset
327 /* TODO: set the stack size according to the private
kono
parents:
diff changeset
328 segment size. Too big stack consumes huge amount of
kono
parents:
diff changeset
329 memory in case of huge number of WIs and a too small stack
kono
parents:
diff changeset
330 will fail in mysterious and potentially dangerous ways. */
kono
parents:
diff changeset
331
kono
parents:
diff changeset
332 fiber_init (&wi->fiber, phsa_work_item_thread, wi,
kono
parents:
diff changeset
333 FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
kono
parents:
diff changeset
334 ++flat_wi_id;
kono
parents:
diff changeset
335 }
kono
parents:
diff changeset
336
kono
parents:
diff changeset
337 do
kono
parents:
diff changeset
338 {
kono
parents:
diff changeset
339 --flat_wi_id;
kono
parents:
diff changeset
340 fiber_join (&wi_threads[flat_wi_id].fiber);
kono
parents:
diff changeset
341 }
kono
parents:
diff changeset
342 while (flat_wi_id > 0);
kono
parents:
diff changeset
343
kono
parents:
diff changeset
344 if (wg.private_segment_total_size > 0)
kono
parents:
diff changeset
345 free (wg.private_base_ptr);
kono
parents:
diff changeset
346
kono
parents:
diff changeset
347 free (wi_threads);
kono
parents:
diff changeset
348 }
kono
parents:
diff changeset
349
kono
parents:
diff changeset
350 /* Spawn the work-item threads to execute work-groups and let
kono
parents:
diff changeset
351 them execute all the WGs, including a potential partial WG. */
kono
parents:
diff changeset
352
kono
parents:
diff changeset
353 static void
kono
parents:
diff changeset
354 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
kono
parents:
diff changeset
355 uint32_t group_local_offset)
kono
parents:
diff changeset
356 {
kono
parents:
diff changeset
357 hsa_kernel_dispatch_packet_t *dp = context->dp;
kono
parents:
diff changeset
358 size_t x, y, z;
kono
parents:
diff changeset
359
kono
parents:
diff changeset
360 context->group_segment_start_addr = (size_t) group_base_ptr;
kono
parents:
diff changeset
361
kono
parents:
diff changeset
362 /* HSA seems to allow the WG size to be larger than the grid size. We need to
kono
parents:
diff changeset
363 saturate the effective WG size to the grid size to prevent the extra WIs
kono
parents:
diff changeset
364 from executing. */
kono
parents:
diff changeset
365 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
kono
parents:
diff changeset
366 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
kono
parents:
diff changeset
367 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
kono
parents:
diff changeset
368 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
kono
parents:
diff changeset
369 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
kono
parents:
diff changeset
370
kono
parents:
diff changeset
371 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
372 wi_total = (uint64_t) dp->grid_size_x
kono
parents:
diff changeset
373 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
kono
parents:
diff changeset
374 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
kono
parents:
diff changeset
375 #endif
kono
parents:
diff changeset
376
kono
parents:
diff changeset
377 /* For now execute all work groups in a single coarse thread (does not utilize
kono
parents:
diff changeset
378 multicore/multithread). */
kono
parents:
diff changeset
379 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
kono
parents:
diff changeset
380
kono
parents:
diff changeset
381 int dims = dp->setup & 0x3;
kono
parents:
diff changeset
382
kono
parents:
diff changeset
383 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
kono
parents:
diff changeset
384 / dp->workgroup_size_x;
kono
parents:
diff changeset
385
kono
parents:
diff changeset
386 context->wg_max_y
kono
parents:
diff changeset
387 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
kono
parents:
diff changeset
388 / dp->workgroup_size_y;
kono
parents:
diff changeset
389
kono
parents:
diff changeset
390 context->wg_max_z
kono
parents:
diff changeset
391 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
kono
parents:
diff changeset
392 / dp->workgroup_size_z;
kono
parents:
diff changeset
393
kono
parents:
diff changeset
394 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
395 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
kono
parents:
diff changeset
396 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
kono
parents:
diff changeset
397 context->wg_min_x, context->wg_min_y, context->wg_min_z,
kono
parents:
diff changeset
398 context->wg_max_x, context->wg_max_y, context->wg_max_z,
kono
parents:
diff changeset
399 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
kono
parents:
diff changeset
400 dp->grid_size_y, dp->grid_size_z);
kono
parents:
diff changeset
401 #endif
kono
parents:
diff changeset
402
kono
parents:
diff changeset
403 phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
kono
parents:
diff changeset
404 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
kono
parents:
diff changeset
405 }
kono
parents:
diff changeset
406 #endif
kono
parents:
diff changeset
407
kono
parents:
diff changeset
408 /* Executes the given work-group function for all work groups in the grid.
kono
parents:
diff changeset
409
kono
parents:
diff changeset
410 A work-group function is a version of the original kernel which executes
kono
parents:
diff changeset
411 the kernel for all work-items in a work-group. It is produced by gccbrig
kono
parents:
diff changeset
412 if it can handle the kernel's barrier usage and is much faster way to
kono
parents:
diff changeset
413 execute massive numbers of work-items in a non-SPMD machine than fibers
kono
parents:
diff changeset
414 (easily 100x faster). */
kono
parents:
diff changeset
415 static void
kono
parents:
diff changeset
416 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
kono
parents:
diff changeset
417 uint32_t group_local_offset)
kono
parents:
diff changeset
418 {
kono
parents:
diff changeset
419 hsa_kernel_dispatch_packet_t *dp = context->dp;
kono
parents:
diff changeset
420 size_t x, y, z, wg_x, wg_y, wg_z;
kono
parents:
diff changeset
421
kono
parents:
diff changeset
422 context->group_segment_start_addr = (size_t) group_base_ptr;
kono
parents:
diff changeset
423
kono
parents:
diff changeset
424 /* HSA seems to allow the WG size to be larger than the grid size. We need
kono
parents:
diff changeset
425 to saturate the effective WG size to the grid size to prevent the extra WIs
kono
parents:
diff changeset
426 from executing. */
kono
parents:
diff changeset
427 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
kono
parents:
diff changeset
428 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
kono
parents:
diff changeset
429 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
kono
parents:
diff changeset
430 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
kono
parents:
diff changeset
431 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
kono
parents:
diff changeset
432
kono
parents:
diff changeset
433 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
434 wi_total = (uint64_t) dp->grid_size_x
kono
parents:
diff changeset
435 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
kono
parents:
diff changeset
436 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
kono
parents:
diff changeset
437 #endif
kono
parents:
diff changeset
438
kono
parents:
diff changeset
439 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
kono
parents:
diff changeset
440
kono
parents:
diff changeset
441 int dims = dp->setup & 0x3;
kono
parents:
diff changeset
442
kono
parents:
diff changeset
443 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
kono
parents:
diff changeset
444 / dp->workgroup_size_x;
kono
parents:
diff changeset
445
kono
parents:
diff changeset
446 context->wg_max_y
kono
parents:
diff changeset
447 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
kono
parents:
diff changeset
448 / dp->workgroup_size_y;
kono
parents:
diff changeset
449
kono
parents:
diff changeset
450 context->wg_max_z
kono
parents:
diff changeset
451 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
kono
parents:
diff changeset
452 / dp->workgroup_size_z;
kono
parents:
diff changeset
453
kono
parents:
diff changeset
454 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
455 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
kono
parents:
diff changeset
456 "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
kono
parents:
diff changeset
457 context->wg_min_x, context->wg_min_y, context->wg_min_z,
kono
parents:
diff changeset
458 context->wg_max_x, context->wg_max_y, context->wg_max_z,
kono
parents:
diff changeset
459 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
kono
parents:
diff changeset
460 dp->grid_size_y, dp->grid_size_z);
kono
parents:
diff changeset
461 #endif
kono
parents:
diff changeset
462
kono
parents:
diff changeset
463 PHSAWorkItem wi;
kono
parents:
diff changeset
464 PHSAWorkGroup wg;
kono
parents:
diff changeset
465 wi.wg = &wg;
kono
parents:
diff changeset
466 wi.x = wi.y = wi.z = 0;
kono
parents:
diff changeset
467 wi.launch_data = context;
kono
parents:
diff changeset
468
kono
parents:
diff changeset
469 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
470 start_time = clock ();
kono
parents:
diff changeset
471 uint64_t wg_count = 0;
kono
parents:
diff changeset
472 #endif
kono
parents:
diff changeset
473
kono
parents:
diff changeset
474 size_t wg_size = __hsail_workgroupsize (0, &wi)
kono
parents:
diff changeset
475 * __hsail_workgroupsize (1, &wi)
kono
parents:
diff changeset
476 * __hsail_workgroupsize (2, &wi);
kono
parents:
diff changeset
477
kono
parents:
diff changeset
478 void *private_base_ptr = NULL;
kono
parents:
diff changeset
479 if (dp->private_segment_size > 0
kono
parents:
diff changeset
480 && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
kono
parents:
diff changeset
481 dp->private_segment_size * wg_size)
kono
parents:
diff changeset
482 != 0)
kono
parents:
diff changeset
483 phsa_fatal_error (3);
kono
parents:
diff changeset
484
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
485 wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD;
111
kono
parents:
diff changeset
486 wg.alloca_frame_p = wg.alloca_stack_p;
kono
parents:
diff changeset
487
kono
parents:
diff changeset
488 wg.private_base_ptr = private_base_ptr;
kono
parents:
diff changeset
489 wg.group_base_ptr = group_base_ptr;
kono
parents:
diff changeset
490
kono
parents:
diff changeset
491 #ifdef DEBUG_PHSA_RT
kono
parents:
diff changeset
492 printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
kono
parents:
diff changeset
493 wg_size, private_base_ptr);
kono
parents:
diff changeset
494 #endif
kono
parents:
diff changeset
495
kono
parents:
diff changeset
496 for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
kono
parents:
diff changeset
497 for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
kono
parents:
diff changeset
498 for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
kono
parents:
diff changeset
499 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
500 wi.group_x = wg_x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
501 wi.group_y = wg_y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
502 wi.group_z = wg_z;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
503
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
504 wi.wg_size_x = context->dp->workgroup_size_x;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
505 wi.wg_size_y = context->dp->workgroup_size_y;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
506 wi.wg_size_z = context->dp->workgroup_size_z;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
507
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
508 wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
509 wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
510 wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
111
kono
parents:
diff changeset
511
kono
parents:
diff changeset
512 context->kernel (context->kernarg_addr, &wi, group_base_ptr,
kono
parents:
diff changeset
513 group_local_offset, private_base_ptr);
kono
parents:
diff changeset
514
kono
parents:
diff changeset
515 #if defined (BENCHMARK_PHSA_RT)
kono
parents:
diff changeset
516 wg_count++;
kono
parents:
diff changeset
517 if (wg_count % 1000000 == 0)
kono
parents:
diff changeset
518 {
kono
parents:
diff changeset
519 clock_t spent_time = clock () - start_time;
kono
parents:
diff changeset
520 uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
kono
parents:
diff changeset
521 + wg_z * sat_wg_size_z;
kono
parents:
diff changeset
522 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
kono
parents:
diff changeset
523 double wis_per_sec = wi_count / spent_time_sec;
kono
parents:
diff changeset
524 uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
kono
parents:
diff changeset
525
kono
parents:
diff changeset
526 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
kono
parents:
diff changeset
527 wi_count, (uint64_t) spent_time_sec,
kono
parents:
diff changeset
528 (uint64_t) wis_per_sec, (uint64_t) eta_sec);
kono
parents:
diff changeset
529 }
kono
parents:
diff changeset
530 #endif
kono
parents:
diff changeset
531 }
kono
parents:
diff changeset
532
kono
parents:
diff changeset
533 #ifdef BENCHMARK_PHSA_RT
kono
parents:
diff changeset
534 clock_t spent_time = clock () - start_time;
kono
parents:
diff changeset
535 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
kono
parents:
diff changeset
536 double wis_per_sec = wi_total / spent_time_sec;
kono
parents:
diff changeset
537
kono
parents:
diff changeset
538 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
kono
parents:
diff changeset
539 (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
kono
parents:
diff changeset
540 #endif
kono
parents:
diff changeset
541 free (private_base_ptr);
kono
parents:
diff changeset
542 private_base_ptr = NULL;
kono
parents:
diff changeset
543 }
kono
parents:
diff changeset
544
kono
parents:
diff changeset
545 /* gccbrig generates the following from each HSAIL kernel:
kono
parents:
diff changeset
546
kono
parents:
diff changeset
547 1) The actual kernel function (a single work-item kernel or a work-group
kono
parents:
diff changeset
548 function) generated from HSAIL (BRIG).
kono
parents:
diff changeset
549
kono
parents:
diff changeset
550 static void _Kernel (void* args, void* context, void* group_base_ptr)
kono
parents:
diff changeset
551 {
kono
parents:
diff changeset
552 ...
kono
parents:
diff changeset
553 }
kono
parents:
diff changeset
554
kono
parents:
diff changeset
555 2) A public facing kernel function that is called from the PHSA runtime:
kono
parents:
diff changeset
556
kono
parents:
diff changeset
557 a) A single work-item function (that requires fibers for multi-WI):
kono
parents:
diff changeset
558
kono
parents:
diff changeset
559 void Kernel (void* context)
kono
parents:
diff changeset
560 {
kono
parents:
diff changeset
561 __launch_launch_kernel (_Kernel, context);
kono
parents:
diff changeset
562 }
kono
parents:
diff changeset
563
kono
parents:
diff changeset
564 or
kono
parents:
diff changeset
565
kono
parents:
diff changeset
566 b) a when gccbrig could generate a work-group function:
kono
parents:
diff changeset
567
kono
parents:
diff changeset
568 void Kernel (void* context)
kono
parents:
diff changeset
569 {
kono
parents:
diff changeset
570 __hsail_launch_wg_function (_Kernel, context);
kono
parents:
diff changeset
571 }
kono
parents:
diff changeset
572 */
kono
parents:
diff changeset
573
kono
parents:
diff changeset
574 #ifdef HAVE_FIBERS
kono
parents:
diff changeset
575
kono
parents:
diff changeset
576 void
kono
parents:
diff changeset
577 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
kono
parents:
diff changeset
578 void *group_base_ptr, uint32_t group_local_offset)
kono
parents:
diff changeset
579 {
kono
parents:
diff changeset
580 context->kernel = kernel;
kono
parents:
diff changeset
581 phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
kono
parents:
diff changeset
582 }
kono
parents:
diff changeset
583 #endif
kono
parents:
diff changeset
584
kono
parents:
diff changeset
585 void
kono
parents:
diff changeset
586 __hsail_launch_wg_function (gccbrigKernelFunc kernel,
kono
parents:
diff changeset
587 PHSAKernelLaunchData *context, void *group_base_ptr,
kono
parents:
diff changeset
588 uint32_t group_local_offset)
kono
parents:
diff changeset
589 {
kono
parents:
diff changeset
590 context->kernel = kernel;
kono
parents:
diff changeset
591 phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
kono
parents:
diff changeset
592 }
kono
parents:
diff changeset
593
kono
parents:
diff changeset
594 uint32_t
kono
parents:
diff changeset
595 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
kono
parents:
diff changeset
596 {
kono
parents:
diff changeset
597 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
kono
parents:
diff changeset
598
kono
parents:
diff changeset
599 uint32_t id;
kono
parents:
diff changeset
600 switch (dim)
kono
parents:
diff changeset
601 {
kono
parents:
diff changeset
602 default:
kono
parents:
diff changeset
603 case 0:
kono
parents:
diff changeset
604 /* Overflow semantics in the case of WG dim > grid dim. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
605 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
111
kono
parents:
diff changeset
606 % dp->grid_size_x;
kono
parents:
diff changeset
607 break;
kono
parents:
diff changeset
608 case 1:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
609 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
111
kono
parents:
diff changeset
610 % dp->grid_size_y;
kono
parents:
diff changeset
611 break;
kono
parents:
diff changeset
612 case 2:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
613 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
111
kono
parents:
diff changeset
614 % dp->grid_size_z;
kono
parents:
diff changeset
615 break;
kono
parents:
diff changeset
616 }
kono
parents:
diff changeset
617 return id;
kono
parents:
diff changeset
618 }
kono
parents:
diff changeset
619
kono
parents:
diff changeset
620 uint64_t
kono
parents:
diff changeset
621 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
kono
parents:
diff changeset
622 {
kono
parents:
diff changeset
623 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
kono
parents:
diff changeset
624
kono
parents:
diff changeset
625 uint64_t id;
kono
parents:
diff changeset
626 switch (dim)
kono
parents:
diff changeset
627 {
kono
parents:
diff changeset
628 default:
kono
parents:
diff changeset
629 case 0:
kono
parents:
diff changeset
630 /* Overflow semantics in the case of WG dim > grid dim. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
631 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
111
kono
parents:
diff changeset
632 % dp->grid_size_x;
kono
parents:
diff changeset
633 break;
kono
parents:
diff changeset
634 case 1:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
635 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
111
kono
parents:
diff changeset
636 % dp->grid_size_y;
kono
parents:
diff changeset
637 break;
kono
parents:
diff changeset
638 case 2:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
639 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
111
kono
parents:
diff changeset
640 % dp->grid_size_z;
kono
parents:
diff changeset
641 break;
kono
parents:
diff changeset
642 }
kono
parents:
diff changeset
643 return id;
kono
parents:
diff changeset
644 }
kono
parents:
diff changeset
645
kono
parents:
diff changeset
646
kono
parents:
diff changeset
647 uint32_t
kono
parents:
diff changeset
648 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
kono
parents:
diff changeset
649 {
kono
parents:
diff changeset
650 PHSAWorkItem *c = (PHSAWorkItem *) context;
kono
parents:
diff changeset
651 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
kono
parents:
diff changeset
652
kono
parents:
diff changeset
653 /* The number of dimensions is in the two least significant bits. */
kono
parents:
diff changeset
654 int dims = dp->setup & 0x3;
kono
parents:
diff changeset
655
kono
parents:
diff changeset
656 uint32_t id;
kono
parents:
diff changeset
657 switch (dim)
kono
parents:
diff changeset
658 {
kono
parents:
diff changeset
659 default:
kono
parents:
diff changeset
660 case 0:
kono
parents:
diff changeset
661 id = c->x;
kono
parents:
diff changeset
662 break;
kono
parents:
diff changeset
663 case 1:
kono
parents:
diff changeset
664 id = dims < 2 ? 0 : c->y;
kono
parents:
diff changeset
665 break;
kono
parents:
diff changeset
666 case 2:
kono
parents:
diff changeset
667 id = dims < 3 ? 0 : c->z;
kono
parents:
diff changeset
668 break;
kono
parents:
diff changeset
669 }
kono
parents:
diff changeset
670 return id;
kono
parents:
diff changeset
671 }
kono
parents:
diff changeset
672
kono
parents:
diff changeset
673 uint32_t
kono
parents:
diff changeset
674 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
kono
parents:
diff changeset
675 {
kono
parents:
diff changeset
676 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
kono
parents:
diff changeset
677 int dims = dp->setup & 0x3;
kono
parents:
diff changeset
678
kono
parents:
diff changeset
679 uint32_t id;
kono
parents:
diff changeset
680 switch (dim)
kono
parents:
diff changeset
681 {
kono
parents:
diff changeset
682 default:
kono
parents:
diff changeset
683 case 0:
kono
parents:
diff changeset
684 id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
kono
parents:
diff changeset
685 break;
kono
parents:
diff changeset
686 case 1:
kono
parents:
diff changeset
687 id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
kono
parents:
diff changeset
688 / dp->workgroup_size_y;
kono
parents:
diff changeset
689 break;
kono
parents:
diff changeset
690 case 2:
kono
parents:
diff changeset
691 id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
kono
parents:
diff changeset
692 / dp->workgroup_size_z;
kono
parents:
diff changeset
693 break;
kono
parents:
diff changeset
694 }
kono
parents:
diff changeset
695 return id;
kono
parents:
diff changeset
696 }
kono
parents:
diff changeset
697
kono
parents:
diff changeset
698 uint32_t
kono
parents:
diff changeset
699 __hsail_workitemflatid (PHSAWorkItem *c)
kono
parents:
diff changeset
700 {
kono
parents:
diff changeset
701 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
kono
parents:
diff changeset
702
kono
parents:
diff changeset
703 return c->x + c->y * dp->workgroup_size_x
kono
parents:
diff changeset
704 + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
kono
parents:
diff changeset
705 }
kono
parents:
diff changeset
706
kono
parents:
diff changeset
707 uint32_t
kono
parents:
diff changeset
708 __hsail_currentworkitemflatid (PHSAWorkItem *c)
kono
parents:
diff changeset
709 {
kono
parents:
diff changeset
710 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
kono
parents:
diff changeset
711
kono
parents:
diff changeset
712 return c->x + c->y * __hsail_currentworkgroupsize (0, c)
kono
parents:
diff changeset
713 + c->z * __hsail_currentworkgroupsize (0, c)
kono
parents:
diff changeset
714 * __hsail_currentworkgroupsize (1, c);
kono
parents:
diff changeset
715 }
kono
parents:
diff changeset
716
kono
parents:
diff changeset
717 void
kono
parents:
diff changeset
718 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
kono
parents:
diff changeset
719 {
kono
parents:
diff changeset
720 switch (dim)
kono
parents:
diff changeset
721 {
kono
parents:
diff changeset
722 default:
kono
parents:
diff changeset
723 case 0:
kono
parents:
diff changeset
724 context->x = id;
kono
parents:
diff changeset
725 break;
kono
parents:
diff changeset
726 case 1:
kono
parents:
diff changeset
727 context->y = id;
kono
parents:
diff changeset
728 break;
kono
parents:
diff changeset
729 case 2:
kono
parents:
diff changeset
730 context->z = id;
kono
parents:
diff changeset
731 break;
kono
parents:
diff changeset
732 }
kono
parents:
diff changeset
733 }
kono
parents:
diff changeset
734
kono
parents:
diff changeset
735 uint64_t
kono
parents:
diff changeset
736 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
kono
parents:
diff changeset
737 {
kono
parents:
diff changeset
738 PHSAWorkItem *c = (PHSAWorkItem *) context;
kono
parents:
diff changeset
739 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
kono
parents:
diff changeset
740
kono
parents:
diff changeset
741 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
kono
parents:
diff changeset
742 uint64_t id0 = __hsail_workitemabsid (0, context);
kono
parents:
diff changeset
743 uint64_t id1 = __hsail_workitemabsid (1, context);
kono
parents:
diff changeset
744 uint64_t id2 = __hsail_workitemabsid (2, context);
kono
parents:
diff changeset
745
kono
parents:
diff changeset
746 uint64_t max0 = dp->grid_size_x;
kono
parents:
diff changeset
747 uint64_t max1 = dp->grid_size_y;
kono
parents:
diff changeset
748 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
kono
parents:
diff changeset
749
kono
parents:
diff changeset
750 return id;
kono
parents:
diff changeset
751 }
kono
parents:
diff changeset
752
kono
parents:
diff changeset
753 uint32_t
kono
parents:
diff changeset
754 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
kono
parents:
diff changeset
755 {
kono
parents:
diff changeset
756 PHSAWorkItem *c = (PHSAWorkItem *) context;
kono
parents:
diff changeset
757 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
kono
parents:
diff changeset
758
kono
parents:
diff changeset
759 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
kono
parents:
diff changeset
760 uint64_t id0 = __hsail_workitemabsid (0, context);
kono
parents:
diff changeset
761 uint64_t id1 = __hsail_workitemabsid (1, context);
kono
parents:
diff changeset
762 uint64_t id2 = __hsail_workitemabsid (2, context);
kono
parents:
diff changeset
763
kono
parents:
diff changeset
764 uint64_t max0 = dp->grid_size_x;
kono
parents:
diff changeset
765 uint64_t max1 = dp->grid_size_y;
kono
parents:
diff changeset
766 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
kono
parents:
diff changeset
767 return (uint32_t) id;
kono
parents:
diff changeset
768 }
kono
parents:
diff changeset
769
kono
parents:
diff changeset
770 uint32_t
kono
parents:
diff changeset
771 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
kono
parents:
diff changeset
772 {
kono
parents:
diff changeset
773 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
kono
parents:
diff changeset
774 uint32_t wg_size = 0;
kono
parents:
diff changeset
775 switch (dim)
kono
parents:
diff changeset
776 {
kono
parents:
diff changeset
777 default:
kono
parents:
diff changeset
778 case 0:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
779 if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
111
kono
parents:
diff changeset
780 wg_size = dp->workgroup_size_x; /* Full WG. */
kono
parents:
diff changeset
781 else
kono
parents:
diff changeset
782 wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
kono
parents:
diff changeset
783 break;
kono
parents:
diff changeset
784 case 1:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
785 if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
111
kono
parents:
diff changeset
786 wg_size = dp->workgroup_size_y; /* Full WG. */
kono
parents:
diff changeset
787 else
kono
parents:
diff changeset
788 wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
kono
parents:
diff changeset
789 break;
kono
parents:
diff changeset
790 case 2:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
791 if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
111
kono
parents:
diff changeset
792 wg_size = dp->workgroup_size_z; /* Full WG. */
kono
parents:
diff changeset
793 else
kono
parents:
diff changeset
794 wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
kono
parents:
diff changeset
795 break;
kono
parents:
diff changeset
796 }
kono
parents:
diff changeset
797 return wg_size;
kono
parents:
diff changeset
798 }
kono
parents:
diff changeset
799
kono
parents:
diff changeset
800 uint32_t
kono
parents:
diff changeset
801 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
kono
parents:
diff changeset
802 {
kono
parents:
diff changeset
803 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
kono
parents:
diff changeset
804 switch (dim)
kono
parents:
diff changeset
805 {
kono
parents:
diff changeset
806 default:
kono
parents:
diff changeset
807 case 0:
kono
parents:
diff changeset
808 return dp->workgroup_size_x;
kono
parents:
diff changeset
809 case 1:
kono
parents:
diff changeset
810 return dp->workgroup_size_y;
kono
parents:
diff changeset
811 case 2:
kono
parents:
diff changeset
812 return dp->workgroup_size_z;
kono
parents:
diff changeset
813 }
kono
parents:
diff changeset
814 }
kono
parents:
diff changeset
815
kono
parents:
diff changeset
816 uint32_t
kono
parents:
diff changeset
817 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
kono
parents:
diff changeset
818 {
kono
parents:
diff changeset
819 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
kono
parents:
diff changeset
820 switch (dim)
kono
parents:
diff changeset
821 {
kono
parents:
diff changeset
822 default:
kono
parents:
diff changeset
823 case 0:
kono
parents:
diff changeset
824 return dp->grid_size_x;
kono
parents:
diff changeset
825 case 1:
kono
parents:
diff changeset
826 return dp->grid_size_y;
kono
parents:
diff changeset
827 case 2:
kono
parents:
diff changeset
828 return dp->grid_size_z;
kono
parents:
diff changeset
829 }
kono
parents:
diff changeset
830 }
kono
parents:
diff changeset
831
kono
parents:
diff changeset
832 uint32_t
kono
parents:
diff changeset
833 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
kono
parents:
diff changeset
834 {
kono
parents:
diff changeset
835 switch (dim)
kono
parents:
diff changeset
836 {
kono
parents:
diff changeset
837 default:
kono
parents:
diff changeset
838 case 0:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
839 return wi->group_x;
111
kono
parents:
diff changeset
840 case 1:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
841 return wi->group_y;
111
kono
parents:
diff changeset
842 case 2:
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
843 return wi->group_z;
111
kono
parents:
diff changeset
844 }
kono
parents:
diff changeset
845 }
kono
parents:
diff changeset
846
kono
parents:
diff changeset
847 uint32_t
kono
parents:
diff changeset
848 __hsail_dim (PHSAWorkItem *wi)
kono
parents:
diff changeset
849 {
kono
parents:
diff changeset
850 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
kono
parents:
diff changeset
851 return dp->setup & 0x3;
kono
parents:
diff changeset
852 }
kono
parents:
diff changeset
853
kono
parents:
diff changeset
854 uint64_t
kono
parents:
diff changeset
855 __hsail_packetid (PHSAWorkItem *wi)
kono
parents:
diff changeset
856 {
kono
parents:
diff changeset
857 return wi->launch_data->packet_id;
kono
parents:
diff changeset
858 }
kono
parents:
diff changeset
859
kono
parents:
diff changeset
860 uint32_t
kono
parents:
diff changeset
861 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
kono
parents:
diff changeset
862 {
kono
parents:
diff changeset
863 return (uint32_t) wi->launch_data->dp->completion_signal.handle;
kono
parents:
diff changeset
864 }
kono
parents:
diff changeset
865
kono
parents:
diff changeset
866 uint64_t
kono
parents:
diff changeset
867 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
kono
parents:
diff changeset
868 {
kono
parents:
diff changeset
869 return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
kono
parents:
diff changeset
870 }
kono
parents:
diff changeset
871
kono
parents:
diff changeset
872 #ifdef HAVE_FIBERS
kono
parents:
diff changeset
873 void
kono
parents:
diff changeset
874 __hsail_barrier (PHSAWorkItem *wi)
kono
parents:
diff changeset
875 {
kono
parents:
diff changeset
876 fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
kono
parents:
diff changeset
877 }
kono
parents:
diff changeset
878 #endif
kono
parents:
diff changeset
879
kono
parents:
diff changeset
880 /* Return a 32b private segment address that points to a dynamically
kono
parents:
diff changeset
881 allocated chunk of 'size' with 'align'.
kono
parents:
diff changeset
882
kono
parents:
diff changeset
883 Allocates the space from the end of the private segment allocated
kono
parents:
diff changeset
884 for the whole work group. In implementations with separate private
kono
parents:
diff changeset
885 memories per WI, we will need to have a stack pointer per WI. But in
kono
parents:
diff changeset
886 the current implementation, the segment is shared, so we possibly
kono
parents:
diff changeset
887 save some space in case all WIs do not call the alloca.
kono
parents:
diff changeset
888
kono
parents:
diff changeset
889 The "alloca frames" are organized as follows:
kono
parents:
diff changeset
890
kono
parents:
diff changeset
891 wg->alloca_stack_p points to the last allocated data (initially
kono
parents:
diff changeset
892 outside the private segment)
kono
parents:
diff changeset
893 wg->alloca_frame_p points to the first address _outside_ the current
kono
parents:
diff changeset
894 function's allocations (initially to the same as alloca_stack_p)
kono
parents:
diff changeset
895
kono
parents:
diff changeset
896 The data is allocated downwards from the end of the private segment.
kono
parents:
diff changeset
897
kono
parents:
diff changeset
898 In the beginning of a new function which has allocas, a new alloca
kono
parents:
diff changeset
899 frame is pushed which adds the current alloca_frame_p (the current
kono
parents:
diff changeset
900 function's frame starting point) to the top of the alloca stack and
kono
parents:
diff changeset
901 alloca_frame_p is set to the current stack position.
kono
parents:
diff changeset
902
kono
parents:
diff changeset
903 At the exit points of a function with allocas, the alloca frame
kono
parents:
diff changeset
904 is popped before returning. This involves popping the alloca_frame_p
kono
parents:
diff changeset
905 to the one of the previous function in the call stack, and alloca_stack_p
kono
parents:
diff changeset
906 similarly, to the position of the last word alloca'd by the previous
kono
parents:
diff changeset
907 function.
kono
parents:
diff changeset
908 */
kono
parents:
diff changeset
909
kono
parents:
diff changeset
910 uint32_t
kono
parents:
diff changeset
911 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
kono
parents:
diff changeset
912 {
kono
parents:
diff changeset
913 volatile PHSAWorkGroup *wg = wi->wg;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
914 int64_t new_pos = wg->alloca_stack_p - size;
111
kono
parents:
diff changeset
915 while (new_pos % align != 0)
kono
parents:
diff changeset
916 new_pos--;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
917 if (new_pos < 0)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
918 phsa_fatal_error (2);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
919
111
kono
parents:
diff changeset
920 wg->alloca_stack_p = new_pos;
kono
parents:
diff changeset
921
kono
parents:
diff changeset
922 #ifdef DEBUG_ALLOCA
kono
parents:
diff changeset
923 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
kono
parents:
diff changeset
924 wg->alloca_stack_p, wg->alloca_frame_p);
kono
parents:
diff changeset
925 #endif
kono
parents:
diff changeset
926 return new_pos;
kono
parents:
diff changeset
927 }
kono
parents:
diff changeset
928
kono
parents:
diff changeset
929 /* Initializes a new "alloca frame" in the private segment.
kono
parents:
diff changeset
930 This should be called at all the function entry points in case
kono
parents:
diff changeset
931 the function contains at least one call to alloca. */
kono
parents:
diff changeset
932
kono
parents:
diff changeset
933 void
kono
parents:
diff changeset
934 __hsail_alloca_push_frame (PHSAWorkItem *wi)
kono
parents:
diff changeset
935 {
kono
parents:
diff changeset
936 volatile PHSAWorkGroup *wg = wi->wg;
kono
parents:
diff changeset
937
kono
parents:
diff changeset
938 /* Store the alloca_frame_p without any alignment padding so
kono
parents:
diff changeset
939 we know exactly where the previous frame ended after popping
kono
parents:
diff changeset
940 it. */
kono
parents:
diff changeset
941 #ifdef DEBUG_ALLOCA
kono
parents:
diff changeset
942 printf ("--- push frame ");
kono
parents:
diff changeset
943 #endif
kono
parents:
diff changeset
944 uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
kono
parents:
diff changeset
945 memcpy (wg->private_base_ptr + last_word_offs,
kono
parents:
diff changeset
946 (const void *) &wg->alloca_frame_p, 4);
kono
parents:
diff changeset
947 wg->alloca_frame_p = last_word_offs;
kono
parents:
diff changeset
948
kono
parents:
diff changeset
949 #ifdef DEBUG_ALLOCA
kono
parents:
diff changeset
950 printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
kono
parents:
diff changeset
951 #endif
kono
parents:
diff changeset
952 }
kono
parents:
diff changeset
953
kono
parents:
diff changeset
954 /* Frees the current "alloca frame" and restores the frame
kono
parents:
diff changeset
955 pointer.
kono
parents:
diff changeset
956 This should be called at all the function return points in case
kono
parents:
diff changeset
957 the function contains at least one call to alloca. Restores the
kono
parents:
diff changeset
958 alloca stack to the condition it was before pushing the frame
kono
parents:
diff changeset
959 the last time. */
kono
parents:
diff changeset
960 void
kono
parents:
diff changeset
961 __hsail_alloca_pop_frame (PHSAWorkItem *wi)
kono
parents:
diff changeset
962 {
kono
parents:
diff changeset
963 volatile PHSAWorkGroup *wg = wi->wg;
kono
parents:
diff changeset
964
kono
parents:
diff changeset
965 wg->alloca_stack_p = wg->alloca_frame_p;
kono
parents:
diff changeset
966 memcpy ((void *) &wg->alloca_frame_p,
kono
parents:
diff changeset
967 (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
kono
parents:
diff changeset
968 /* Now frame_p points to the beginning of the previous function's
kono
parents:
diff changeset
969 frame and stack_p to its end. */
kono
parents:
diff changeset
970
kono
parents:
diff changeset
971 wg->alloca_stack_p += 4;
kono
parents:
diff changeset
972
kono
parents:
diff changeset
973 #ifdef DEBUG_ALLOCA
kono
parents:
diff changeset
974 printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
kono
parents:
diff changeset
975 wg->alloca_frame_p);
kono
parents:
diff changeset
976 #endif
kono
parents:
diff changeset
977 }