145
|
1 /* Run a stand-alone AMD GCN kernel.
|
|
2
|
|
3 Copyright 2017 Mentor Graphics Corporation
|
|
4 Copyright (C) 2018-2020 Free Software Foundation, Inc.
|
|
5
|
|
6 This program is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 This program is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
|
18
|
|
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
|
|
20
|
|
21 The kernel entry point's signature must use a standard main signature:
|
|
22
|
|
23 int main(int argc, char **argv)
|
|
24 */
|
|
25
|
|
26 #include <stdint.h>
|
|
27 #include <stdbool.h>
|
|
28 #include <stdlib.h>
|
|
29 #include <malloc.h>
|
|
30 #include <stdio.h>
|
|
31 #include <string.h>
|
|
32 #include <dlfcn.h>
|
|
33 #include <unistd.h>
|
|
34 #include <elf.h>
|
|
35 #include <signal.h>
|
|
36
|
|
37 /* These probably won't be in elf.h for a while. */
|
|
38 #ifndef R_AMDGPU_NONE
|
|
39 #define R_AMDGPU_NONE 0
|
|
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
|
|
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
|
|
42 #define R_AMDGPU_ABS64 3 /* S + A */
|
|
43 #define R_AMDGPU_REL32 4 /* S + A - P */
|
|
44 #define R_AMDGPU_REL64 5 /* S + A - P */
|
|
45 #define R_AMDGPU_ABS32 6 /* S + A */
|
|
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
|
|
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
|
|
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
|
|
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
|
|
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
|
|
51 #define reserved 12
|
|
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
|
|
53 #endif
|
|
54
|
|
55 #include "hsa.h"
|
|
56
|
|
57 #ifndef HSA_RUNTIME_LIB
|
|
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
|
|
59 #endif
|
|
60
|
|
61 #ifndef VERSION_STRING
|
|
62 #define VERSION_STRING "(version unknown)"
|
|
63 #endif
|
|
64
|
|
65 bool debug = false;
|
|
66
|
|
67 hsa_agent_t device = { 0 };
|
|
68 hsa_queue_t *queue = NULL;
|
|
69 uint64_t init_array_kernel = 0;
|
|
70 uint64_t fini_array_kernel = 0;
|
|
71 uint64_t main_kernel = 0;
|
|
72 hsa_executable_t executable = { 0 };
|
|
73
|
|
74 hsa_region_t kernargs_region = { 0 };
|
|
75 hsa_region_t heap_region = { 0 };
|
|
76 uint32_t kernarg_segment_size = 0;
|
|
77 uint32_t group_segment_size = 0;
|
|
78 uint32_t private_segment_size = 0;
|
|
79
|
|
80 static void
|
|
81 usage (const char *progname)
|
|
82 {
|
|
83 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
|
|
84 "Options:\n"
|
|
85 " --help\n"
|
|
86 " --version\n"
|
|
87 " --debug\n", progname);
|
|
88 }
|
|
89
|
|
90 static void
|
|
91 version (const char *progname)
|
|
92 {
|
|
93 printf ("%s " VERSION_STRING "\n", progname);
|
|
94 }
|
|
95
|
|
96 /* As an HSA runtime is dlopened, following structure defines the necessary
|
|
97 function pointers.
|
|
98 Code adapted from libgomp. */
|
|
99
|
|
100 struct hsa_runtime_fn_info
|
|
101 {
|
|
102 /* HSA runtime. */
|
|
103 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
|
|
104 const char **status_string);
|
|
105 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
|
|
106 hsa_agent_info_t attribute,
|
|
107 void *value);
|
|
108 hsa_status_t (*hsa_init_fn) (void);
|
|
109 hsa_status_t (*hsa_iterate_agents_fn)
|
|
110 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
|
|
111 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
|
|
112 hsa_region_info_t attribute,
|
|
113 void *value);
|
|
114 hsa_status_t (*hsa_queue_create_fn)
|
|
115 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
|
|
116 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
|
|
117 void *data, uint32_t private_segment_size,
|
|
118 uint32_t group_segment_size, hsa_queue_t **queue);
|
|
119 hsa_status_t (*hsa_agent_iterate_regions_fn)
|
|
120 (hsa_agent_t agent,
|
|
121 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
|
|
122 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
|
|
123 hsa_status_t (*hsa_executable_create_fn)
|
|
124 (hsa_profile_t profile, hsa_executable_state_t executable_state,
|
|
125 const char *options, hsa_executable_t *executable);
|
|
126 hsa_status_t (*hsa_executable_global_variable_define_fn)
|
|
127 (hsa_executable_t executable, const char *variable_name, void *address);
|
|
128 hsa_status_t (*hsa_executable_load_code_object_fn)
|
|
129 (hsa_executable_t executable, hsa_agent_t agent,
|
|
130 hsa_code_object_t code_object, const char *options);
|
|
131 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
|
|
132 const char *options);
|
|
133 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
|
|
134 uint32_t num_consumers,
|
|
135 const hsa_agent_t *consumers,
|
|
136 hsa_signal_t *signal);
|
|
137 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
|
|
138 void **ptr);
|
|
139 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
|
|
140 hsa_access_permission_t access);
|
|
141 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
|
|
142 size_t size);
|
|
143 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
|
|
144 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
|
|
145 hsa_status_t (*hsa_executable_get_symbol_fn)
|
|
146 (hsa_executable_t executable, const char *module_name,
|
|
147 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
|
|
148 hsa_executable_symbol_t *symbol);
|
|
149 hsa_status_t (*hsa_executable_symbol_get_info_fn)
|
|
150 (hsa_executable_symbol_t executable_symbol,
|
|
151 hsa_executable_symbol_info_t attribute, void *value);
|
|
152 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
|
|
153 hsa_signal_value_t value);
|
|
154 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
|
|
155 (hsa_signal_t signal, hsa_signal_condition_t condition,
|
|
156 hsa_signal_value_t compare_value, uint64_t timeout_hint,
|
|
157 hsa_wait_state_t wait_state_hint);
|
|
158 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
|
|
159 (hsa_signal_t signal, hsa_signal_condition_t condition,
|
|
160 hsa_signal_value_t compare_value, uint64_t timeout_hint,
|
|
161 hsa_wait_state_t wait_state_hint);
|
|
162 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
|
|
163 hsa_status_t (*hsa_code_object_deserialize_fn)
|
|
164 (void *serialized_code_object, size_t serialized_code_object_size,
|
|
165 const char *options, hsa_code_object_t *code_object);
|
|
166 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
|
|
167 (const hsa_queue_t *queue);
|
|
168 void (*hsa_queue_store_write_index_relaxed_fn)
|
|
169 (const hsa_queue_t *queue, uint64_t value);
|
|
170 hsa_status_t (*hsa_shut_down_fn) ();
|
|
171 };
|
|
172
|
|
173 /* HSA runtime functions that are initialized in init_hsa_context.
|
|
174 Code adapted from libgomp. */
|
|
175
|
|
176 static struct hsa_runtime_fn_info hsa_fns;
|
|
177
|
|
178 #define DLSYM_FN(function) \
|
|
179 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
|
|
180 if (hsa_fns.function##_fn == NULL) \
|
|
181 goto fail;
|
|
182
|
|
183 static void
|
|
184 init_hsa_runtime_functions (void)
|
|
185 {
|
|
186 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
|
|
187 if (handle == NULL)
|
|
188 {
|
|
189 fprintf (stderr,
|
|
190 "The HSA runtime is required to run GCN kernels on hardware.\n"
|
|
191 "%s: File not found or could not be opened\n",
|
|
192 HSA_RUNTIME_LIB);
|
|
193 exit (1);
|
|
194 }
|
|
195
|
|
196 DLSYM_FN (hsa_status_string)
|
|
197 DLSYM_FN (hsa_agent_get_info)
|
|
198 DLSYM_FN (hsa_init)
|
|
199 DLSYM_FN (hsa_iterate_agents)
|
|
200 DLSYM_FN (hsa_region_get_info)
|
|
201 DLSYM_FN (hsa_queue_create)
|
|
202 DLSYM_FN (hsa_agent_iterate_regions)
|
|
203 DLSYM_FN (hsa_executable_destroy)
|
|
204 DLSYM_FN (hsa_executable_create)
|
|
205 DLSYM_FN (hsa_executable_global_variable_define)
|
|
206 DLSYM_FN (hsa_executable_load_code_object)
|
|
207 DLSYM_FN (hsa_executable_freeze)
|
|
208 DLSYM_FN (hsa_signal_create)
|
|
209 DLSYM_FN (hsa_memory_allocate)
|
|
210 DLSYM_FN (hsa_memory_assign_agent)
|
|
211 DLSYM_FN (hsa_memory_copy)
|
|
212 DLSYM_FN (hsa_memory_free)
|
|
213 DLSYM_FN (hsa_signal_destroy)
|
|
214 DLSYM_FN (hsa_executable_get_symbol)
|
|
215 DLSYM_FN (hsa_executable_symbol_get_info)
|
|
216 DLSYM_FN (hsa_signal_wait_acquire)
|
|
217 DLSYM_FN (hsa_signal_wait_relaxed)
|
|
218 DLSYM_FN (hsa_signal_store_relaxed)
|
|
219 DLSYM_FN (hsa_queue_destroy)
|
|
220 DLSYM_FN (hsa_code_object_deserialize)
|
|
221 DLSYM_FN (hsa_queue_load_write_index_relaxed)
|
|
222 DLSYM_FN (hsa_queue_store_write_index_relaxed)
|
|
223 DLSYM_FN (hsa_shut_down)
|
|
224
|
|
225 return;
|
|
226
|
|
227 fail:
|
|
228 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
|
|
229 exit (1);
|
|
230 }
|
|
231
|
|
232 #undef DLSYM_FN
|
|
233
|
|
234 /* Report a fatal error STR together with the HSA error corresponding to
|
|
235 STATUS and terminate execution of the current process. */
|
|
236
|
|
237 static void
|
|
238 hsa_fatal (const char *str, hsa_status_t status)
|
|
239 {
|
|
240 const char *hsa_error_msg;
|
|
241 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
|
|
242 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
|
|
243 hsa_error_msg);
|
|
244 exit (1);
|
|
245 }
|
|
246
|
|
247 /* Helper macros to ensure we check the return values from the HSA Runtime.
|
|
248 These just keep the rest of the code a bit cleaner. */
|
|
249
|
|
250 #define XHSA_CMP(FN, CMP, MSG) \
|
|
251 do { \
|
|
252 hsa_status_t status = (FN); \
|
|
253 if (!(CMP)) \
|
|
254 hsa_fatal ((MSG), status); \
|
|
255 else if (debug) \
|
|
256 fprintf (stderr, "%s: OK\n", (MSG)); \
|
|
257 } while (0)
|
|
258 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
|
|
259
|
|
260 /* Callback of hsa_iterate_agents.
|
|
261 Called once for each available device, and returns "break" when a
|
|
262 suitable one has been found. */
|
|
263
|
|
264 static hsa_status_t
|
|
265 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
|
|
266 {
|
|
267 hsa_device_type_t device_type;
|
|
268 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
|
|
269 &device_type),
|
|
270 "Get agent type");
|
|
271
|
|
272 /* Select only GPU devices. */
|
|
273 /* TODO: support selecting from multiple GPUs. */
|
|
274 if (HSA_DEVICE_TYPE_GPU == device_type)
|
|
275 {
|
|
276 device = agent;
|
|
277 return HSA_STATUS_INFO_BREAK;
|
|
278 }
|
|
279
|
|
280 /* The device was not suitable. */
|
|
281 return HSA_STATUS_SUCCESS;
|
|
282 }
|
|
283
|
|
284 /* Callback of hsa_iterate_regions.
|
|
285 Called once for each available memory region, and returns "break" when a
|
|
286 suitable one has been found. */
|
|
287
|
|
288 static hsa_status_t
|
|
289 get_memory_region (hsa_region_t region, hsa_region_t *retval,
|
|
290 hsa_region_global_flag_t kind)
|
|
291 {
|
|
292 /* Reject non-global regions. */
|
|
293 hsa_region_segment_t segment;
|
|
294 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
|
|
295 if (HSA_REGION_SEGMENT_GLOBAL != segment)
|
|
296 return HSA_STATUS_SUCCESS;
|
|
297
|
|
298 /* Find a region with the KERNARG flag set. */
|
|
299 hsa_region_global_flag_t flags;
|
|
300 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
|
|
301 &flags);
|
|
302 if (flags & kind)
|
|
303 {
|
|
304 *retval = region;
|
|
305 return HSA_STATUS_INFO_BREAK;
|
|
306 }
|
|
307
|
|
308 /* The region was not suitable. */
|
|
309 return HSA_STATUS_SUCCESS;
|
|
310 }
|
|
311
|
|
312 static hsa_status_t
|
|
313 get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
|
|
314 {
|
|
315 return get_memory_region (region, &kernargs_region,
|
|
316 HSA_REGION_GLOBAL_FLAG_KERNARG);
|
|
317 }
|
|
318
|
|
319 static hsa_status_t
|
|
320 get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
|
|
321 {
|
|
322 return get_memory_region (region, &heap_region,
|
|
323 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
|
|
324 }
|
|
325
|
|
326 /* Initialize the HSA Runtime library and GPU device. */
|
|
327
|
|
328 static void
|
|
329 init_device ()
|
|
330 {
|
|
331 /* Load the shared library and find the API functions. */
|
|
332 init_hsa_runtime_functions ();
|
|
333
|
|
334 /* Initialize the HSA Runtime. */
|
|
335 XHSA (hsa_fns.hsa_init_fn (),
|
|
336 "Initialize run-time");
|
|
337
|
|
338 /* Select a suitable device.
|
|
339 The call-back function, get_gpu_agent, does the selection. */
|
|
340 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
|
|
341 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
|
|
342 "Find a device");
|
|
343
|
|
344 /* Initialize the queue used for launching kernels. */
|
|
345 uint32_t queue_size = 0;
|
|
346 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
|
|
347 &queue_size),
|
|
348 "Find max queue size");
|
|
349 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
|
|
350 HSA_QUEUE_TYPE_SINGLE, NULL,
|
|
351 NULL, UINT32_MAX, UINT32_MAX, &queue),
|
|
352 "Set up a device queue");
|
|
353
|
|
354 /* Select a memory region for the kernel arguments.
|
|
355 The call-back function, get_kernarg_region, does the selection. */
|
|
356 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
|
|
357 NULL),
|
|
358 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
|
|
359 "Locate kernargs memory");
|
|
360
|
|
361 /* Select a memory region for the kernel heap.
|
|
362 The call-back function, get_heap_region, does the selection. */
|
|
363 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
|
|
364 NULL),
|
|
365 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
|
|
366 "Locate device memory");
|
|
367 }
|
|
368
|
|
369
|
|
370 /* Read a whole input file.
|
|
371 Code copied from mkoffload. */
|
|
372
|
|
373 static char *
|
|
374 read_file (const char *filename, size_t *plen)
|
|
375 {
|
|
376 size_t alloc = 16384;
|
|
377 size_t base = 0;
|
|
378 char *buffer;
|
|
379
|
|
380 FILE *stream = fopen (filename, "rb");
|
|
381 if (!stream)
|
|
382 {
|
|
383 perror (filename);
|
|
384 exit (1);
|
|
385 }
|
|
386
|
|
387 if (!fseek (stream, 0, SEEK_END))
|
|
388 {
|
|
389 /* Get the file size. */
|
|
390 long s = ftell (stream);
|
|
391 if (s >= 0)
|
|
392 alloc = s + 100;
|
|
393 fseek (stream, 0, SEEK_SET);
|
|
394 }
|
|
395 buffer = malloc (alloc);
|
|
396
|
|
397 for (;;)
|
|
398 {
|
|
399 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
|
|
400
|
|
401 if (!n)
|
|
402 break;
|
|
403 base += n;
|
|
404 if (base + 1 == alloc)
|
|
405 {
|
|
406 alloc *= 2;
|
|
407 buffer = realloc (buffer, alloc);
|
|
408 }
|
|
409 }
|
|
410 buffer[base] = 0;
|
|
411 *plen = base;
|
|
412
|
|
413 fclose (stream);
|
|
414
|
|
415 return buffer;
|
|
416 }
|
|
417
|
|
418 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
|
|
419
|
|
420 static void
|
|
421 load_image (const char *filename)
|
|
422 {
|
|
423 size_t image_size;
|
|
424 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
|
|
425
|
|
426 /* An "executable" consists of one or more code objects. */
|
|
427 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
|
|
428 HSA_EXECUTABLE_STATE_UNFROZEN, "",
|
|
429 &executable),
|
|
430 "Initialize GCN executable");
|
|
431
|
|
432 /* Hide relocations from the HSA runtime loader.
|
|
433 Keep a copy of the unmodified section headers to use later. */
|
|
434 Elf64_Shdr *image_sections =
|
|
435 (Elf64_Shdr *) ((char *) image + image->e_shoff);
|
|
436 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
|
|
437 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
|
|
438 for (int i = image->e_shnum - 1; i >= 0; i--)
|
|
439 {
|
|
440 if (image_sections[i].sh_type == SHT_RELA
|
|
441 || image_sections[i].sh_type == SHT_REL)
|
|
442 /* Change section type to something harmless. */
|
|
443 image_sections[i].sh_type = SHT_NOTE;
|
|
444 }
|
|
445
|
|
446 /* Add the HSACO to the executable. */
|
|
447 hsa_code_object_t co = { 0 };
|
|
448 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
|
|
449 "Deserialize GCN code object");
|
|
450 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
|
|
451 ""),
|
|
452 "Load GCN code object");
|
|
453
|
|
454 /* We're done modifying he executable. */
|
|
455 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
|
|
456 "Freeze GCN executable");
|
|
457
|
|
458 /* Locate the "_init_array" function, and read the kernel's properties. */
|
|
459 hsa_executable_symbol_t symbol;
|
|
460 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
|
|
461 device, 0, &symbol),
|
|
462 "Find '_init_array' function");
|
|
463 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
464 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
|
|
465 "Extract '_init_array' kernel object kernel object");
|
|
466
|
|
467 /* Locate the "_fini_array" function, and read the kernel's properties. */
|
|
468 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
|
|
469 device, 0, &symbol),
|
|
470 "Find '_fini_array' function");
|
|
471 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
472 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
|
|
473 "Extract '_fini_array' kernel object kernel object");
|
|
474
|
|
475 /* Locate the "main" function, and read the kernel's properties. */
|
|
476 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
|
|
477 device, 0, &symbol),
|
|
478 "Find 'main' function");
|
|
479 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
480 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
|
|
481 "Extract 'main' kernel object");
|
|
482 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
483 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
|
|
484 &kernarg_segment_size),
|
|
485 "Extract kernarg segment size");
|
|
486 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
487 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
|
|
488 &group_segment_size),
|
|
489 "Extract group segment size");
|
|
490 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
491 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
|
|
492 &private_segment_size),
|
|
493 "Extract private segment size");
|
|
494
|
|
495 /* Find main function in ELF, and calculate actual load offset. */
|
|
496 Elf64_Addr load_offset;
|
|
497 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
|
|
498 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
|
|
499 &load_offset),
|
|
500 "Extract 'main' symbol address");
|
|
501 for (int i = 0; i < image->e_shnum; i++)
|
|
502 if (sections[i].sh_type == SHT_SYMTAB)
|
|
503 {
|
|
504 Elf64_Shdr *strtab = §ions[sections[i].sh_link];
|
|
505 char *strings = (char *) image + strtab->sh_offset;
|
|
506
|
|
507 for (size_t offset = 0;
|
|
508 offset < sections[i].sh_size;
|
|
509 offset += sections[i].sh_entsize)
|
|
510 {
|
|
511 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
|
|
512 + sections[i].sh_offset + offset);
|
|
513 if (strcmp ("main", strings + sym->st_name) == 0)
|
|
514 {
|
|
515 load_offset -= sym->st_value;
|
|
516 goto found_main;
|
|
517 }
|
|
518 }
|
|
519 }
|
|
520 /* We only get here when main was not found.
|
|
521 This should never happen. */
|
|
522 fprintf (stderr, "Error: main function not found.\n");
|
|
523 abort ();
|
|
524 found_main:;
|
|
525
|
|
526 /* Find dynamic symbol table. */
|
|
527 Elf64_Shdr *dynsym = NULL;
|
|
528 for (int i = 0; i < image->e_shnum; i++)
|
|
529 if (sections[i].sh_type == SHT_DYNSYM)
|
|
530 {
|
|
531 dynsym = §ions[i];
|
|
532 break;
|
|
533 }
|
|
534
|
|
535 /* Fix up relocations. */
|
|
536 for (int i = 0; i < image->e_shnum; i++)
|
|
537 {
|
|
538 if (sections[i].sh_type == SHT_RELA)
|
|
539 for (size_t offset = 0;
|
|
540 offset < sections[i].sh_size;
|
|
541 offset += sections[i].sh_entsize)
|
|
542 {
|
|
543 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
|
|
544 + sections[i].sh_offset
|
|
545 + offset);
|
|
546 Elf64_Sym *sym =
|
|
547 (dynsym
|
|
548 ? (Elf64_Sym *) ((char *) image
|
|
549 + dynsym->sh_offset
|
|
550 + (dynsym->sh_entsize
|
|
551 * ELF64_R_SYM (reloc->r_info))) : NULL);
|
|
552
|
|
553 int64_t S = (sym ? sym->st_value : 0);
|
|
554 int64_t P = reloc->r_offset + load_offset;
|
|
555 int64_t A = reloc->r_addend;
|
|
556 int64_t B = load_offset;
|
|
557 int64_t V, size;
|
|
558 switch (ELF64_R_TYPE (reloc->r_info))
|
|
559 {
|
|
560 case R_AMDGPU_ABS32_LO:
|
|
561 V = (S + A) & 0xFFFFFFFF;
|
|
562 size = 4;
|
|
563 break;
|
|
564 case R_AMDGPU_ABS32_HI:
|
|
565 V = (S + A) >> 32;
|
|
566 size = 4;
|
|
567 break;
|
|
568 case R_AMDGPU_ABS64:
|
|
569 V = S + A;
|
|
570 size = 8;
|
|
571 break;
|
|
572 case R_AMDGPU_REL32:
|
|
573 V = S + A - P;
|
|
574 size = 4;
|
|
575 break;
|
|
576 case R_AMDGPU_REL64:
|
|
577 /* FIXME
|
|
578 LLD seems to emit REL64 where the the assembler has ABS64.
|
|
579 This is clearly wrong because it's not what the compiler
|
|
580 is expecting. Let's assume, for now, that it's a bug.
|
|
581 In any case, GCN kernels are always self contained and
|
|
582 therefore relative relocations will have been resolved
|
|
583 already, so this should be a safe workaround. */
|
|
584 V = S + A /* - P */ ;
|
|
585 size = 8;
|
|
586 break;
|
|
587 case R_AMDGPU_ABS32:
|
|
588 V = S + A;
|
|
589 size = 4;
|
|
590 break;
|
|
591 /* TODO R_AMDGPU_GOTPCREL */
|
|
592 /* TODO R_AMDGPU_GOTPCREL32_LO */
|
|
593 /* TODO R_AMDGPU_GOTPCREL32_HI */
|
|
594 case R_AMDGPU_REL32_LO:
|
|
595 V = (S + A - P) & 0xFFFFFFFF;
|
|
596 size = 4;
|
|
597 break;
|
|
598 case R_AMDGPU_REL32_HI:
|
|
599 V = (S + A - P) >> 32;
|
|
600 size = 4;
|
|
601 break;
|
|
602 case R_AMDGPU_RELATIVE64:
|
|
603 V = B + A;
|
|
604 size = 8;
|
|
605 break;
|
|
606 default:
|
|
607 fprintf (stderr, "Error: unsupported relocation type.\n");
|
|
608 exit (1);
|
|
609 }
|
|
610 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
|
|
611 "Fix up relocation");
|
|
612 }
|
|
613 }
|
|
614 }
|
|
615
|
|
616 /* Allocate some device memory from the kernargs region.
|
|
617 The returned address will be 32-bit (with excess zeroed on 64-bit host),
|
|
618 and accessible via the same address on both host and target (via
|
|
619 __flat_scalar GCN address space). */
|
|
620
|
|
621 static void *
|
|
622 device_malloc (size_t size, hsa_region_t region)
|
|
623 {
|
|
624 void *result;
|
|
625 XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
|
|
626 "Allocate device memory");
|
|
627 return result;
|
|
628 }
|
|
629
|
|
630 /* These are the device pointers that will be transferred to the target.
|
|
631 The HSA Runtime points the kernargs register here.
|
|
632 They correspond to function signature:
|
|
633 int main (int argc, char *argv[], int *return_value)
|
|
634 The compiler expects this, for kernel functions, and will
|
|
635 automatically assign the exit value to *return_value. */
|
|
636 struct kernargs
|
|
637 {
|
|
638 /* Kernargs. */
|
|
639 int32_t argc;
|
|
640 int64_t argv;
|
|
641 int64_t out_ptr;
|
|
642 int64_t heap_ptr;
|
|
643
|
|
644 /* Output data. */
|
|
645 struct output
|
|
646 {
|
|
647 int return_value;
|
|
648 unsigned int next_output;
|
|
649 struct printf_data
|
|
650 {
|
|
651 int written;
|
|
652 char msg[128];
|
|
653 int type;
|
|
654 union
|
|
655 {
|
|
656 int64_t ivalue;
|
|
657 double dvalue;
|
|
658 char text[128];
|
|
659 };
|
|
660 } queue[1024];
|
|
661 unsigned int consumed;
|
|
662 } output_data;
|
|
663 };
|
|
664
|
|
665 struct heap
|
|
666 {
|
|
667 int64_t size;
|
|
668 char data[0];
|
|
669 } heap;
|
|
670
|
|
671 /* Print any console output from the kernel.
|
|
672 We print all entries from "consumed" to the next entry without a "written"
|
|
673 flag, or "next_output" is reached. The buffer is circular, but the
|
|
674 indices are absolute. It is assumed the kernel will stop writing data
|
|
675 if "next_output" wraps (becomes smaller than "consumed"). */
|
|
676 void
|
|
677 gomp_print_output (struct kernargs *kernargs, bool final)
|
|
678 {
|
|
679 unsigned int limit = (sizeof (kernargs->output_data.queue)
|
|
680 / sizeof (kernargs->output_data.queue[0]));
|
|
681
|
|
682 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
|
|
683 __ATOMIC_ACQUIRE);
|
|
684 unsigned int to = kernargs->output_data.next_output;
|
|
685
|
|
686 if (from > to)
|
|
687 {
|
|
688 /* Overflow. */
|
|
689 if (final)
|
|
690 printf ("GCN print buffer overflowed.\n");
|
|
691 return;
|
|
692 }
|
|
693
|
|
694 unsigned int i;
|
|
695 for (i = from; i < to; i++)
|
|
696 {
|
|
697 struct printf_data *data = &kernargs->output_data.queue[i%limit];
|
|
698
|
|
699 if (!data->written && !final)
|
|
700 break;
|
|
701
|
|
702 switch (data->type)
|
|
703 {
|
|
704 case 0:
|
|
705 printf ("%.128s%ld\n", data->msg, data->ivalue);
|
|
706 break;
|
|
707 case 1:
|
|
708 printf ("%.128s%f\n", data->msg, data->dvalue);
|
|
709 break;
|
|
710 case 2:
|
|
711 printf ("%.128s%.128s\n", data->msg, data->text);
|
|
712 break;
|
|
713 case 3:
|
|
714 printf ("%.128s%.128s", data->msg, data->text);
|
|
715 break;
|
|
716 default:
|
|
717 printf ("GCN print buffer error!\n");
|
|
718 break;
|
|
719 }
|
|
720
|
|
721 data->written = 0;
|
|
722 __atomic_store_n (&kernargs->output_data.consumed, i+1,
|
|
723 __ATOMIC_RELEASE);
|
|
724 }
|
|
725 fflush (stdout);
|
|
726 }
|
|
727
|
|
728 /* Execute an already-loaded kernel on the device. */
|
|
729
|
|
730 static void
|
|
731 run (uint64_t kernel, void *kernargs)
|
|
732 {
|
|
733 /* A "signal" is used to launch and monitor the kernel. */
|
|
734 hsa_signal_t signal;
|
|
735 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
|
|
736 "Create signal");
|
|
737
|
|
738 /* Configure for a single-worker kernel. */
|
|
739 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
|
|
740 const uint32_t queueMask = queue->size - 1;
|
|
741 hsa_kernel_dispatch_packet_t *dispatch_packet =
|
|
742 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
|
|
743 queueMask]);
|
|
744 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
|
745 dispatch_packet->workgroup_size_x = (uint16_t) 1;
|
|
746 dispatch_packet->workgroup_size_y = (uint16_t) 64;
|
|
747 dispatch_packet->workgroup_size_z = (uint16_t) 1;
|
|
748 dispatch_packet->grid_size_x = 1;
|
|
749 dispatch_packet->grid_size_y = 64;
|
|
750 dispatch_packet->grid_size_z = 1;
|
|
751 dispatch_packet->completion_signal = signal;
|
|
752 dispatch_packet->kernel_object = kernel;
|
|
753 dispatch_packet->kernarg_address = (void *) kernargs;
|
|
754 dispatch_packet->private_segment_size = private_segment_size;
|
|
755 dispatch_packet->group_segment_size = group_segment_size;
|
|
756
|
|
757 uint16_t header = 0;
|
|
758 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
|
759 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
|
760 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
|
761
|
|
762 __atomic_store_n ((uint32_t *) dispatch_packet,
|
|
763 header | (dispatch_packet->setup << 16),
|
|
764 __ATOMIC_RELEASE);
|
|
765
|
|
766 if (debug)
|
|
767 fprintf (stderr, "Launch kernel\n");
|
|
768
|
|
769 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
|
|
770 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
|
|
771 /* Kernel running ...... */
|
|
772 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
|
|
773 1, 1000000,
|
|
774 HSA_WAIT_STATE_ACTIVE) != 0)
|
|
775 {
|
|
776 usleep (10000);
|
|
777 gomp_print_output (kernargs, false);
|
|
778 }
|
|
779
|
|
780 gomp_print_output (kernargs, true);
|
|
781
|
|
782 if (debug)
|
|
783 fprintf (stderr, "Kernel exited\n");
|
|
784
|
|
785 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
|
|
786 "Clean up signal");
|
|
787 }
|
|
788
|
|
789 int
|
|
790 main (int argc, char *argv[])
|
|
791 {
|
|
792 int kernel_arg = 0;
|
|
793 for (int i = 1; i < argc; i++)
|
|
794 {
|
|
795 if (!strcmp (argv[i], "--help"))
|
|
796 {
|
|
797 usage (argv[0]);
|
|
798 return 0;
|
|
799 }
|
|
800 else if (!strcmp (argv[i], "--version"))
|
|
801 {
|
|
802 version (argv[0]);
|
|
803 return 0;
|
|
804 }
|
|
805 else if (!strcmp (argv[i], "--debug"))
|
|
806 debug = true;
|
|
807 else if (argv[i][0] == '-')
|
|
808 {
|
|
809 usage (argv[0]);
|
|
810 return 1;
|
|
811 }
|
|
812 else
|
|
813 {
|
|
814 kernel_arg = i;
|
|
815 break;
|
|
816 }
|
|
817 }
|
|
818
|
|
819 if (!kernel_arg)
|
|
820 {
|
|
821 /* No kernel arguments were found. */
|
|
822 usage (argv[0]);
|
|
823 return 1;
|
|
824 }
|
|
825
|
|
826 /* The remaining arguments are for the GCN kernel. */
|
|
827 int kernel_argc = argc - kernel_arg;
|
|
828 char **kernel_argv = &argv[kernel_arg];
|
|
829
|
|
830 init_device ();
|
|
831 load_image (kernel_argv[0]);
|
|
832
|
|
833 /* Calculate size of function parameters + argv data. */
|
|
834 size_t args_size = 0;
|
|
835 for (int i = 0; i < kernel_argc; i++)
|
|
836 args_size += strlen (kernel_argv[i]) + 1;
|
|
837
|
|
838 /* Allocate device memory for both function parameters and the argv
|
|
839 data. */
|
|
840 struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
|
|
841 kernargs_region);
|
|
842 struct argdata
|
|
843 {
|
|
844 int64_t argv_data[kernel_argc];
|
|
845 char strings[args_size];
|
|
846 } *args = device_malloc (sizeof (struct argdata), kernargs_region);
|
|
847
|
|
848 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
|
|
849 struct heap *heap = device_malloc (heap_size, heap_region);
|
|
850 XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
|
|
851 HSA_ACCESS_PERMISSION_RW),
|
|
852 "Assign heap to device agent");
|
|
853
|
|
854 /* Write the data to the target. */
|
|
855 kernargs->argc = kernel_argc;
|
|
856 kernargs->argv = (int64_t) args->argv_data;
|
|
857 kernargs->out_ptr = (int64_t) &kernargs->output_data;
|
|
858 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
|
|
859 kernargs->output_data.next_output = 0;
|
|
860 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
|
|
861 / sizeof (kernargs->output_data.queue[0])); i++)
|
|
862 kernargs->output_data.queue[i].written = 0;
|
|
863 kernargs->output_data.consumed = 0;
|
|
864 int offset = 0;
|
|
865 for (int i = 0; i < kernel_argc; i++)
|
|
866 {
|
|
867 size_t arg_len = strlen (kernel_argv[i]) + 1;
|
|
868 args->argv_data[i] = (int64_t) &args->strings[offset];
|
|
869 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
|
|
870 offset += arg_len;
|
|
871 }
|
|
872 kernargs->heap_ptr = (int64_t) heap;
|
|
873 hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
|
|
874
|
|
875 /* Run constructors on the GPU. */
|
|
876 run (init_array_kernel, kernargs);
|
|
877
|
|
878 /* Run the kernel on the GPU. */
|
|
879 run (main_kernel, kernargs);
|
|
880 unsigned int return_value =
|
|
881 (unsigned int) kernargs->output_data.return_value;
|
|
882
|
|
883 /* Run destructors on the GPU. */
|
|
884 run (fini_array_kernel, kernargs);
|
|
885
|
|
886 unsigned int upper = (return_value & ~0xffff) >> 16;
|
|
887 if (upper == 0xcafe)
|
|
888 {
|
|
889 printf ("Kernel exit value was never set\n");
|
|
890 return_value = 0xff;
|
|
891 }
|
|
892 else if (upper == 0xffff)
|
|
893 ; /* Set by exit. */
|
|
894 else if (upper == 0)
|
|
895 ; /* Set by return from main. */
|
|
896 else
|
|
897 printf ("Possible kernel exit value corruption, 2 most significant bytes "
|
|
898 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
|
|
899
|
|
900 if (upper == 0xffff)
|
|
901 {
|
|
902 unsigned int signal = (return_value >> 8) & 0xff;
|
|
903 if (signal == SIGABRT)
|
|
904 printf ("Kernel aborted\n");
|
|
905 else if (signal != 0)
|
|
906 printf ("Kernel received unkown signal\n");
|
|
907 }
|
|
908
|
|
909 if (debug)
|
|
910 printf ("Kernel exit value: %d\n", return_value & 0xff);
|
|
911
|
|
912 /* Clean shut down. */
|
|
913 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
|
|
914 "Clean up device memory");
|
|
915 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
|
|
916 "Clean up GCN executable");
|
|
917 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
|
|
918 "Clean up device queue");
|
|
919 XHSA (hsa_fns.hsa_shut_down_fn (),
|
|
920 "Shut down run-time");
|
|
921
|
|
922 return return_value & 0xff;
|
|
923 }
|