comparison gcc/config/gcn/gcn-run.c @ 145:1830386684a0

gcc-9.2.0
author anatofuz
date Thu, 13 Feb 2020 11:34:05 +0900
parents
children
comparison
equal deleted inserted replaced
131:84e7813d76e9 145:1830386684a0
1 /* Run a stand-alone AMD GCN kernel.
2
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright (C) 2018-2020 Free Software Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
20
21 The kernel entry point's signature must use a standard main signature:
22
23 int main(int argc, char **argv)
24 */
25
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
36
37 /* These probably won't be in elf.h for a while. */
38 #ifndef R_AMDGPU_NONE
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
51 #define reserved 12
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
53 #endif
54
55 #include "hsa.h"
56
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59 #endif
60
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
63 #endif
64
65 bool debug = false;
66
67 hsa_agent_t device = { 0 };
68 hsa_queue_t *queue = NULL;
69 uint64_t init_array_kernel = 0;
70 uint64_t fini_array_kernel = 0;
71 uint64_t main_kernel = 0;
72 hsa_executable_t executable = { 0 };
73
74 hsa_region_t kernargs_region = { 0 };
75 hsa_region_t heap_region = { 0 };
76 uint32_t kernarg_segment_size = 0;
77 uint32_t group_segment_size = 0;
78 uint32_t private_segment_size = 0;
79
80 static void
81 usage (const char *progname)
82 {
83 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
84 "Options:\n"
85 " --help\n"
86 " --version\n"
87 " --debug\n", progname);
88 }
89
90 static void
91 version (const char *progname)
92 {
93 printf ("%s " VERSION_STRING "\n", progname);
94 }
95
96 /* As an HSA runtime is dlopened, following structure defines the necessary
97 function pointers.
98 Code adapted from libgomp. */
99
100 struct hsa_runtime_fn_info
101 {
102 /* HSA runtime. */
103 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
104 const char **status_string);
105 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
106 hsa_agent_info_t attribute,
107 void *value);
108 hsa_status_t (*hsa_init_fn) (void);
109 hsa_status_t (*hsa_iterate_agents_fn)
110 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
111 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
112 hsa_region_info_t attribute,
113 void *value);
114 hsa_status_t (*hsa_queue_create_fn)
115 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
116 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
117 void *data, uint32_t private_segment_size,
118 uint32_t group_segment_size, hsa_queue_t **queue);
119 hsa_status_t (*hsa_agent_iterate_regions_fn)
120 (hsa_agent_t agent,
121 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
122 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
123 hsa_status_t (*hsa_executable_create_fn)
124 (hsa_profile_t profile, hsa_executable_state_t executable_state,
125 const char *options, hsa_executable_t *executable);
126 hsa_status_t (*hsa_executable_global_variable_define_fn)
127 (hsa_executable_t executable, const char *variable_name, void *address);
128 hsa_status_t (*hsa_executable_load_code_object_fn)
129 (hsa_executable_t executable, hsa_agent_t agent,
130 hsa_code_object_t code_object, const char *options);
131 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
132 const char *options);
133 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
134 uint32_t num_consumers,
135 const hsa_agent_t *consumers,
136 hsa_signal_t *signal);
137 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
138 void **ptr);
139 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
140 hsa_access_permission_t access);
141 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
142 size_t size);
143 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
144 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
145 hsa_status_t (*hsa_executable_get_symbol_fn)
146 (hsa_executable_t executable, const char *module_name,
147 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
148 hsa_executable_symbol_t *symbol);
149 hsa_status_t (*hsa_executable_symbol_get_info_fn)
150 (hsa_executable_symbol_t executable_symbol,
151 hsa_executable_symbol_info_t attribute, void *value);
152 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
153 hsa_signal_value_t value);
154 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
155 (hsa_signal_t signal, hsa_signal_condition_t condition,
156 hsa_signal_value_t compare_value, uint64_t timeout_hint,
157 hsa_wait_state_t wait_state_hint);
158 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
159 (hsa_signal_t signal, hsa_signal_condition_t condition,
160 hsa_signal_value_t compare_value, uint64_t timeout_hint,
161 hsa_wait_state_t wait_state_hint);
162 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
163 hsa_status_t (*hsa_code_object_deserialize_fn)
164 (void *serialized_code_object, size_t serialized_code_object_size,
165 const char *options, hsa_code_object_t *code_object);
166 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
167 (const hsa_queue_t *queue);
168 void (*hsa_queue_store_write_index_relaxed_fn)
169 (const hsa_queue_t *queue, uint64_t value);
170 hsa_status_t (*hsa_shut_down_fn) ();
171 };
172
173 /* HSA runtime functions that are initialized in init_hsa_context.
174 Code adapted from libgomp. */
175
176 static struct hsa_runtime_fn_info hsa_fns;
177
178 #define DLSYM_FN(function) \
179 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
180 if (hsa_fns.function##_fn == NULL) \
181 goto fail;
182
183 static void
184 init_hsa_runtime_functions (void)
185 {
186 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
187 if (handle == NULL)
188 {
189 fprintf (stderr,
190 "The HSA runtime is required to run GCN kernels on hardware.\n"
191 "%s: File not found or could not be opened\n",
192 HSA_RUNTIME_LIB);
193 exit (1);
194 }
195
196 DLSYM_FN (hsa_status_string)
197 DLSYM_FN (hsa_agent_get_info)
198 DLSYM_FN (hsa_init)
199 DLSYM_FN (hsa_iterate_agents)
200 DLSYM_FN (hsa_region_get_info)
201 DLSYM_FN (hsa_queue_create)
202 DLSYM_FN (hsa_agent_iterate_regions)
203 DLSYM_FN (hsa_executable_destroy)
204 DLSYM_FN (hsa_executable_create)
205 DLSYM_FN (hsa_executable_global_variable_define)
206 DLSYM_FN (hsa_executable_load_code_object)
207 DLSYM_FN (hsa_executable_freeze)
208 DLSYM_FN (hsa_signal_create)
209 DLSYM_FN (hsa_memory_allocate)
210 DLSYM_FN (hsa_memory_assign_agent)
211 DLSYM_FN (hsa_memory_copy)
212 DLSYM_FN (hsa_memory_free)
213 DLSYM_FN (hsa_signal_destroy)
214 DLSYM_FN (hsa_executable_get_symbol)
215 DLSYM_FN (hsa_executable_symbol_get_info)
216 DLSYM_FN (hsa_signal_wait_acquire)
217 DLSYM_FN (hsa_signal_wait_relaxed)
218 DLSYM_FN (hsa_signal_store_relaxed)
219 DLSYM_FN (hsa_queue_destroy)
220 DLSYM_FN (hsa_code_object_deserialize)
221 DLSYM_FN (hsa_queue_load_write_index_relaxed)
222 DLSYM_FN (hsa_queue_store_write_index_relaxed)
223 DLSYM_FN (hsa_shut_down)
224
225 return;
226
227 fail:
228 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
229 exit (1);
230 }
231
232 #undef DLSYM_FN
233
234 /* Report a fatal error STR together with the HSA error corresponding to
235 STATUS and terminate execution of the current process. */
236
237 static void
238 hsa_fatal (const char *str, hsa_status_t status)
239 {
240 const char *hsa_error_msg;
241 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
242 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
243 hsa_error_msg);
244 exit (1);
245 }
246
247 /* Helper macros to ensure we check the return values from the HSA Runtime.
248 These just keep the rest of the code a bit cleaner. */
249
250 #define XHSA_CMP(FN, CMP, MSG) \
251 do { \
252 hsa_status_t status = (FN); \
253 if (!(CMP)) \
254 hsa_fatal ((MSG), status); \
255 else if (debug) \
256 fprintf (stderr, "%s: OK\n", (MSG)); \
257 } while (0)
258 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
259
260 /* Callback of hsa_iterate_agents.
261 Called once for each available device, and returns "break" when a
262 suitable one has been found. */
263
264 static hsa_status_t
265 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
266 {
267 hsa_device_type_t device_type;
268 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
269 &device_type),
270 "Get agent type");
271
272 /* Select only GPU devices. */
273 /* TODO: support selecting from multiple GPUs. */
274 if (HSA_DEVICE_TYPE_GPU == device_type)
275 {
276 device = agent;
277 return HSA_STATUS_INFO_BREAK;
278 }
279
280 /* The device was not suitable. */
281 return HSA_STATUS_SUCCESS;
282 }
283
284 /* Callback of hsa_iterate_regions.
285 Called once for each available memory region, and returns "break" when a
286 suitable one has been found. */
287
288 static hsa_status_t
289 get_memory_region (hsa_region_t region, hsa_region_t *retval,
290 hsa_region_global_flag_t kind)
291 {
292 /* Reject non-global regions. */
293 hsa_region_segment_t segment;
294 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
295 if (HSA_REGION_SEGMENT_GLOBAL != segment)
296 return HSA_STATUS_SUCCESS;
297
298 /* Find a region with the KERNARG flag set. */
299 hsa_region_global_flag_t flags;
300 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
301 &flags);
302 if (flags & kind)
303 {
304 *retval = region;
305 return HSA_STATUS_INFO_BREAK;
306 }
307
308 /* The region was not suitable. */
309 return HSA_STATUS_SUCCESS;
310 }
311
312 static hsa_status_t
313 get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
314 {
315 return get_memory_region (region, &kernargs_region,
316 HSA_REGION_GLOBAL_FLAG_KERNARG);
317 }
318
319 static hsa_status_t
320 get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
321 {
322 return get_memory_region (region, &heap_region,
323 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
324 }
325
326 /* Initialize the HSA Runtime library and GPU device. */
327
328 static void
329 init_device ()
330 {
331 /* Load the shared library and find the API functions. */
332 init_hsa_runtime_functions ();
333
334 /* Initialize the HSA Runtime. */
335 XHSA (hsa_fns.hsa_init_fn (),
336 "Initialize run-time");
337
338 /* Select a suitable device.
339 The call-back function, get_gpu_agent, does the selection. */
340 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
341 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
342 "Find a device");
343
344 /* Initialize the queue used for launching kernels. */
345 uint32_t queue_size = 0;
346 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
347 &queue_size),
348 "Find max queue size");
349 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
350 HSA_QUEUE_TYPE_SINGLE, NULL,
351 NULL, UINT32_MAX, UINT32_MAX, &queue),
352 "Set up a device queue");
353
354 /* Select a memory region for the kernel arguments.
355 The call-back function, get_kernarg_region, does the selection. */
356 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
357 NULL),
358 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
359 "Locate kernargs memory");
360
361 /* Select a memory region for the kernel heap.
362 The call-back function, get_heap_region, does the selection. */
363 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
364 NULL),
365 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
366 "Locate device memory");
367 }
368
369
370 /* Read a whole input file.
371 Code copied from mkoffload. */
372
373 static char *
374 read_file (const char *filename, size_t *plen)
375 {
376 size_t alloc = 16384;
377 size_t base = 0;
378 char *buffer;
379
380 FILE *stream = fopen (filename, "rb");
381 if (!stream)
382 {
383 perror (filename);
384 exit (1);
385 }
386
387 if (!fseek (stream, 0, SEEK_END))
388 {
389 /* Get the file size. */
390 long s = ftell (stream);
391 if (s >= 0)
392 alloc = s + 100;
393 fseek (stream, 0, SEEK_SET);
394 }
395 buffer = malloc (alloc);
396
397 for (;;)
398 {
399 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
400
401 if (!n)
402 break;
403 base += n;
404 if (base + 1 == alloc)
405 {
406 alloc *= 2;
407 buffer = realloc (buffer, alloc);
408 }
409 }
410 buffer[base] = 0;
411 *plen = base;
412
413 fclose (stream);
414
415 return buffer;
416 }
417
418 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
419
420 static void
421 load_image (const char *filename)
422 {
423 size_t image_size;
424 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
425
426 /* An "executable" consists of one or more code objects. */
427 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
428 HSA_EXECUTABLE_STATE_UNFROZEN, "",
429 &executable),
430 "Initialize GCN executable");
431
432 /* Hide relocations from the HSA runtime loader.
433 Keep a copy of the unmodified section headers to use later. */
434 Elf64_Shdr *image_sections =
435 (Elf64_Shdr *) ((char *) image + image->e_shoff);
436 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
437 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
438 for (int i = image->e_shnum - 1; i >= 0; i--)
439 {
440 if (image_sections[i].sh_type == SHT_RELA
441 || image_sections[i].sh_type == SHT_REL)
442 /* Change section type to something harmless. */
443 image_sections[i].sh_type = SHT_NOTE;
444 }
445
446 /* Add the HSACO to the executable. */
447 hsa_code_object_t co = { 0 };
448 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
449 "Deserialize GCN code object");
450 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
451 ""),
452 "Load GCN code object");
453
454 /* We're done modifying he executable. */
455 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
456 "Freeze GCN executable");
457
458 /* Locate the "_init_array" function, and read the kernel's properties. */
459 hsa_executable_symbol_t symbol;
460 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
461 device, 0, &symbol),
462 "Find '_init_array' function");
463 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
464 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
465 "Extract '_init_array' kernel object kernel object");
466
467 /* Locate the "_fini_array" function, and read the kernel's properties. */
468 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
469 device, 0, &symbol),
470 "Find '_fini_array' function");
471 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
472 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
473 "Extract '_fini_array' kernel object kernel object");
474
475 /* Locate the "main" function, and read the kernel's properties. */
476 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
477 device, 0, &symbol),
478 "Find 'main' function");
479 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
480 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
481 "Extract 'main' kernel object");
482 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
483 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
484 &kernarg_segment_size),
485 "Extract kernarg segment size");
486 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
487 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
488 &group_segment_size),
489 "Extract group segment size");
490 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
491 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
492 &private_segment_size),
493 "Extract private segment size");
494
495 /* Find main function in ELF, and calculate actual load offset. */
496 Elf64_Addr load_offset;
497 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
498 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
499 &load_offset),
500 "Extract 'main' symbol address");
501 for (int i = 0; i < image->e_shnum; i++)
502 if (sections[i].sh_type == SHT_SYMTAB)
503 {
504 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
505 char *strings = (char *) image + strtab->sh_offset;
506
507 for (size_t offset = 0;
508 offset < sections[i].sh_size;
509 offset += sections[i].sh_entsize)
510 {
511 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
512 + sections[i].sh_offset + offset);
513 if (strcmp ("main", strings + sym->st_name) == 0)
514 {
515 load_offset -= sym->st_value;
516 goto found_main;
517 }
518 }
519 }
520 /* We only get here when main was not found.
521 This should never happen. */
522 fprintf (stderr, "Error: main function not found.\n");
523 abort ();
524 found_main:;
525
526 /* Find dynamic symbol table. */
527 Elf64_Shdr *dynsym = NULL;
528 for (int i = 0; i < image->e_shnum; i++)
529 if (sections[i].sh_type == SHT_DYNSYM)
530 {
531 dynsym = &sections[i];
532 break;
533 }
534
535 /* Fix up relocations. */
536 for (int i = 0; i < image->e_shnum; i++)
537 {
538 if (sections[i].sh_type == SHT_RELA)
539 for (size_t offset = 0;
540 offset < sections[i].sh_size;
541 offset += sections[i].sh_entsize)
542 {
543 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
544 + sections[i].sh_offset
545 + offset);
546 Elf64_Sym *sym =
547 (dynsym
548 ? (Elf64_Sym *) ((char *) image
549 + dynsym->sh_offset
550 + (dynsym->sh_entsize
551 * ELF64_R_SYM (reloc->r_info))) : NULL);
552
553 int64_t S = (sym ? sym->st_value : 0);
554 int64_t P = reloc->r_offset + load_offset;
555 int64_t A = reloc->r_addend;
556 int64_t B = load_offset;
557 int64_t V, size;
558 switch (ELF64_R_TYPE (reloc->r_info))
559 {
560 case R_AMDGPU_ABS32_LO:
561 V = (S + A) & 0xFFFFFFFF;
562 size = 4;
563 break;
564 case R_AMDGPU_ABS32_HI:
565 V = (S + A) >> 32;
566 size = 4;
567 break;
568 case R_AMDGPU_ABS64:
569 V = S + A;
570 size = 8;
571 break;
572 case R_AMDGPU_REL32:
573 V = S + A - P;
574 size = 4;
575 break;
576 case R_AMDGPU_REL64:
577 /* FIXME
578 LLD seems to emit REL64 where the the assembler has ABS64.
579 This is clearly wrong because it's not what the compiler
580 is expecting. Let's assume, for now, that it's a bug.
581 In any case, GCN kernels are always self contained and
582 therefore relative relocations will have been resolved
583 already, so this should be a safe workaround. */
584 V = S + A /* - P */ ;
585 size = 8;
586 break;
587 case R_AMDGPU_ABS32:
588 V = S + A;
589 size = 4;
590 break;
591 /* TODO R_AMDGPU_GOTPCREL */
592 /* TODO R_AMDGPU_GOTPCREL32_LO */
593 /* TODO R_AMDGPU_GOTPCREL32_HI */
594 case R_AMDGPU_REL32_LO:
595 V = (S + A - P) & 0xFFFFFFFF;
596 size = 4;
597 break;
598 case R_AMDGPU_REL32_HI:
599 V = (S + A - P) >> 32;
600 size = 4;
601 break;
602 case R_AMDGPU_RELATIVE64:
603 V = B + A;
604 size = 8;
605 break;
606 default:
607 fprintf (stderr, "Error: unsupported relocation type.\n");
608 exit (1);
609 }
610 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
611 "Fix up relocation");
612 }
613 }
614 }
615
616 /* Allocate some device memory from the kernargs region.
617 The returned address will be 32-bit (with excess zeroed on 64-bit host),
618 and accessible via the same address on both host and target (via
619 __flat_scalar GCN address space). */
620
621 static void *
622 device_malloc (size_t size, hsa_region_t region)
623 {
624 void *result;
625 XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
626 "Allocate device memory");
627 return result;
628 }
629
630 /* These are the device pointers that will be transferred to the target.
631 The HSA Runtime points the kernargs register here.
632 They correspond to function signature:
633 int main (int argc, char *argv[], int *return_value)
634 The compiler expects this, for kernel functions, and will
635 automatically assign the exit value to *return_value. */
636 struct kernargs
637 {
638 /* Kernargs. */
639 int32_t argc;
640 int64_t argv;
641 int64_t out_ptr;
642 int64_t heap_ptr;
643
644 /* Output data. */
645 struct output
646 {
647 int return_value;
648 unsigned int next_output;
649 struct printf_data
650 {
651 int written;
652 char msg[128];
653 int type;
654 union
655 {
656 int64_t ivalue;
657 double dvalue;
658 char text[128];
659 };
660 } queue[1024];
661 unsigned int consumed;
662 } output_data;
663 };
664
665 struct heap
666 {
667 int64_t size;
668 char data[0];
669 } heap;
670
671 /* Print any console output from the kernel.
672 We print all entries from "consumed" to the next entry without a "written"
673 flag, or "next_output" is reached. The buffer is circular, but the
674 indices are absolute. It is assumed the kernel will stop writing data
675 if "next_output" wraps (becomes smaller than "consumed"). */
676 void
677 gomp_print_output (struct kernargs *kernargs, bool final)
678 {
679 unsigned int limit = (sizeof (kernargs->output_data.queue)
680 / sizeof (kernargs->output_data.queue[0]));
681
682 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
683 __ATOMIC_ACQUIRE);
684 unsigned int to = kernargs->output_data.next_output;
685
686 if (from > to)
687 {
688 /* Overflow. */
689 if (final)
690 printf ("GCN print buffer overflowed.\n");
691 return;
692 }
693
694 unsigned int i;
695 for (i = from; i < to; i++)
696 {
697 struct printf_data *data = &kernargs->output_data.queue[i%limit];
698
699 if (!data->written && !final)
700 break;
701
702 switch (data->type)
703 {
704 case 0:
705 printf ("%.128s%ld\n", data->msg, data->ivalue);
706 break;
707 case 1:
708 printf ("%.128s%f\n", data->msg, data->dvalue);
709 break;
710 case 2:
711 printf ("%.128s%.128s\n", data->msg, data->text);
712 break;
713 case 3:
714 printf ("%.128s%.128s", data->msg, data->text);
715 break;
716 default:
717 printf ("GCN print buffer error!\n");
718 break;
719 }
720
721 data->written = 0;
722 __atomic_store_n (&kernargs->output_data.consumed, i+1,
723 __ATOMIC_RELEASE);
724 }
725 fflush (stdout);
726 }
727
728 /* Execute an already-loaded kernel on the device. */
729
730 static void
731 run (uint64_t kernel, void *kernargs)
732 {
733 /* A "signal" is used to launch and monitor the kernel. */
734 hsa_signal_t signal;
735 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
736 "Create signal");
737
738 /* Configure for a single-worker kernel. */
739 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
740 const uint32_t queueMask = queue->size - 1;
741 hsa_kernel_dispatch_packet_t *dispatch_packet =
742 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
743 queueMask]);
744 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
745 dispatch_packet->workgroup_size_x = (uint16_t) 1;
746 dispatch_packet->workgroup_size_y = (uint16_t) 64;
747 dispatch_packet->workgroup_size_z = (uint16_t) 1;
748 dispatch_packet->grid_size_x = 1;
749 dispatch_packet->grid_size_y = 64;
750 dispatch_packet->grid_size_z = 1;
751 dispatch_packet->completion_signal = signal;
752 dispatch_packet->kernel_object = kernel;
753 dispatch_packet->kernarg_address = (void *) kernargs;
754 dispatch_packet->private_segment_size = private_segment_size;
755 dispatch_packet->group_segment_size = group_segment_size;
756
757 uint16_t header = 0;
758 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
759 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
760 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
761
762 __atomic_store_n ((uint32_t *) dispatch_packet,
763 header | (dispatch_packet->setup << 16),
764 __ATOMIC_RELEASE);
765
766 if (debug)
767 fprintf (stderr, "Launch kernel\n");
768
769 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
770 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
771 /* Kernel running ...... */
772 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
773 1, 1000000,
774 HSA_WAIT_STATE_ACTIVE) != 0)
775 {
776 usleep (10000);
777 gomp_print_output (kernargs, false);
778 }
779
780 gomp_print_output (kernargs, true);
781
782 if (debug)
783 fprintf (stderr, "Kernel exited\n");
784
785 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
786 "Clean up signal");
787 }
788
789 int
790 main (int argc, char *argv[])
791 {
792 int kernel_arg = 0;
793 for (int i = 1; i < argc; i++)
794 {
795 if (!strcmp (argv[i], "--help"))
796 {
797 usage (argv[0]);
798 return 0;
799 }
800 else if (!strcmp (argv[i], "--version"))
801 {
802 version (argv[0]);
803 return 0;
804 }
805 else if (!strcmp (argv[i], "--debug"))
806 debug = true;
807 else if (argv[i][0] == '-')
808 {
809 usage (argv[0]);
810 return 1;
811 }
812 else
813 {
814 kernel_arg = i;
815 break;
816 }
817 }
818
819 if (!kernel_arg)
820 {
821 /* No kernel arguments were found. */
822 usage (argv[0]);
823 return 1;
824 }
825
826 /* The remaining arguments are for the GCN kernel. */
827 int kernel_argc = argc - kernel_arg;
828 char **kernel_argv = &argv[kernel_arg];
829
830 init_device ();
831 load_image (kernel_argv[0]);
832
833 /* Calculate size of function parameters + argv data. */
834 size_t args_size = 0;
835 for (int i = 0; i < kernel_argc; i++)
836 args_size += strlen (kernel_argv[i]) + 1;
837
838 /* Allocate device memory for both function parameters and the argv
839 data. */
840 struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
841 kernargs_region);
842 struct argdata
843 {
844 int64_t argv_data[kernel_argc];
845 char strings[args_size];
846 } *args = device_malloc (sizeof (struct argdata), kernargs_region);
847
848 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
849 struct heap *heap = device_malloc (heap_size, heap_region);
850 XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
851 HSA_ACCESS_PERMISSION_RW),
852 "Assign heap to device agent");
853
854 /* Write the data to the target. */
855 kernargs->argc = kernel_argc;
856 kernargs->argv = (int64_t) args->argv_data;
857 kernargs->out_ptr = (int64_t) &kernargs->output_data;
858 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
859 kernargs->output_data.next_output = 0;
860 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
861 / sizeof (kernargs->output_data.queue[0])); i++)
862 kernargs->output_data.queue[i].written = 0;
863 kernargs->output_data.consumed = 0;
864 int offset = 0;
865 for (int i = 0; i < kernel_argc; i++)
866 {
867 size_t arg_len = strlen (kernel_argv[i]) + 1;
868 args->argv_data[i] = (int64_t) &args->strings[offset];
869 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
870 offset += arg_len;
871 }
872 kernargs->heap_ptr = (int64_t) heap;
873 hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
874
875 /* Run constructors on the GPU. */
876 run (init_array_kernel, kernargs);
877
878 /* Run the kernel on the GPU. */
879 run (main_kernel, kernargs);
880 unsigned int return_value =
881 (unsigned int) kernargs->output_data.return_value;
882
883 /* Run destructors on the GPU. */
884 run (fini_array_kernel, kernargs);
885
886 unsigned int upper = (return_value & ~0xffff) >> 16;
887 if (upper == 0xcafe)
888 {
889 printf ("Kernel exit value was never set\n");
890 return_value = 0xff;
891 }
892 else if (upper == 0xffff)
893 ; /* Set by exit. */
894 else if (upper == 0)
895 ; /* Set by return from main. */
896 else
897 printf ("Possible kernel exit value corruption, 2 most significant bytes "
898 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
899
900 if (upper == 0xffff)
901 {
902 unsigned int signal = (return_value >> 8) & 0xff;
903 if (signal == SIGABRT)
904 printf ("Kernel aborted\n");
905 else if (signal != 0)
906 printf ("Kernel received unkown signal\n");
907 }
908
909 if (debug)
910 printf ("Kernel exit value: %d\n", return_value & 0xff);
911
912 /* Clean shut down. */
913 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
914 "Clean up device memory");
915 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
916 "Clean up GCN executable");
917 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
918 "Clean up device queue");
919 XHSA (hsa_fns.hsa_shut_down_fn (),
920 "Shut down run-time");
921
922 return return_value & 0xff;
923 }