Mercurial > hg > CbC > CbC_gcc
comparison gcc/config/gcn/gcn-run.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:84e7813d76e9 | 145:1830386684a0 |
---|---|
1 /* Run a stand-alone AMD GCN kernel. | |
2 | |
3 Copyright 2017 Mentor Graphics Corporation | |
4 Copyright (C) 2018-2020 Free Software Foundation, Inc. | |
5 | |
6 This program is free software: you can redistribute it and/or modify | |
7 it under the terms of the GNU General Public License as published by | |
8 the Free Software Foundation, either version 3 of the License, or | |
9 (at your option) any later version. | |
10 | |
11 This program is distributed in the hope that it will be useful, | |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 GNU General Public License for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License | |
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ | |
18 | |
19 /* This program will run a compiled stand-alone GCN kernel on a GPU. | |
20 | |
21 The kernel entry point's signature must use a standard main signature: | |
22 | |
23 int main(int argc, char **argv) | |
24 */ | |
25 | |
26 #include <stdint.h> | |
27 #include <stdbool.h> | |
28 #include <stdlib.h> | |
29 #include <malloc.h> | |
30 #include <stdio.h> | |
31 #include <string.h> | |
32 #include <dlfcn.h> | |
33 #include <unistd.h> | |
34 #include <elf.h> | |
35 #include <signal.h> | |
36 | |
37 /* These probably won't be in elf.h for a while. */ | |
38 #ifndef R_AMDGPU_NONE | |
39 #define R_AMDGPU_NONE 0 | |
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */ | |
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */ | |
42 #define R_AMDGPU_ABS64 3 /* S + A */ | |
43 #define R_AMDGPU_REL32 4 /* S + A - P */ | |
44 #define R_AMDGPU_REL64 5 /* S + A - P */ | |
45 #define R_AMDGPU_ABS32 6 /* S + A */ | |
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */ | |
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */ | |
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */ | |
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */ | |
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */ | |
51 #define reserved 12 | |
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */ | |
53 #endif | |
54 | |
55 #include "hsa.h" | |
56 | |
57 #ifndef HSA_RUNTIME_LIB | |
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so" | |
59 #endif | |
60 | |
61 #ifndef VERSION_STRING | |
62 #define VERSION_STRING "(version unknown)" | |
63 #endif | |
64 | |
65 bool debug = false; | |
66 | |
67 hsa_agent_t device = { 0 }; | |
68 hsa_queue_t *queue = NULL; | |
69 uint64_t init_array_kernel = 0; | |
70 uint64_t fini_array_kernel = 0; | |
71 uint64_t main_kernel = 0; | |
72 hsa_executable_t executable = { 0 }; | |
73 | |
74 hsa_region_t kernargs_region = { 0 }; | |
75 hsa_region_t heap_region = { 0 }; | |
76 uint32_t kernarg_segment_size = 0; | |
77 uint32_t group_segment_size = 0; | |
78 uint32_t private_segment_size = 0; | |
79 | |
80 static void | |
81 usage (const char *progname) | |
82 { | |
83 printf ("Usage: %s [options] kernel [kernel-args]\n\n" | |
84 "Options:\n" | |
85 " --help\n" | |
86 " --version\n" | |
87 " --debug\n", progname); | |
88 } | |
89 | |
90 static void | |
91 version (const char *progname) | |
92 { | |
93 printf ("%s " VERSION_STRING "\n", progname); | |
94 } | |
95 | |
96 /* As an HSA runtime is dlopened, following structure defines the necessary | |
97 function pointers. | |
98 Code adapted from libgomp. */ | |
99 | |
100 struct hsa_runtime_fn_info | |
101 { | |
102 /* HSA runtime. */ | |
103 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status, | |
104 const char **status_string); | |
105 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent, | |
106 hsa_agent_info_t attribute, | |
107 void *value); | |
108 hsa_status_t (*hsa_init_fn) (void); | |
109 hsa_status_t (*hsa_iterate_agents_fn) | |
110 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data); | |
111 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region, | |
112 hsa_region_info_t attribute, | |
113 void *value); | |
114 hsa_status_t (*hsa_queue_create_fn) | |
115 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, | |
116 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data), | |
117 void *data, uint32_t private_segment_size, | |
118 uint32_t group_segment_size, hsa_queue_t **queue); | |
119 hsa_status_t (*hsa_agent_iterate_regions_fn) | |
120 (hsa_agent_t agent, | |
121 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data); | |
122 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable); | |
123 hsa_status_t (*hsa_executable_create_fn) | |
124 (hsa_profile_t profile, hsa_executable_state_t executable_state, | |
125 const char *options, hsa_executable_t *executable); | |
126 hsa_status_t (*hsa_executable_global_variable_define_fn) | |
127 (hsa_executable_t executable, const char *variable_name, void *address); | |
128 hsa_status_t (*hsa_executable_load_code_object_fn) | |
129 (hsa_executable_t executable, hsa_agent_t agent, | |
130 hsa_code_object_t code_object, const char *options); | |
131 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable, | |
132 const char *options); | |
133 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value, | |
134 uint32_t num_consumers, | |
135 const hsa_agent_t *consumers, | |
136 hsa_signal_t *signal); | |
137 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, | |
138 void **ptr); | |
139 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent, | |
140 hsa_access_permission_t access); | |
141 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, | |
142 size_t size); | |
143 hsa_status_t (*hsa_memory_free_fn) (void *ptr); | |
144 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal); | |
145 hsa_status_t (*hsa_executable_get_symbol_fn) | |
146 (hsa_executable_t executable, const char *module_name, | |
147 const char *symbol_name, hsa_agent_t agent, int32_t call_convention, | |
148 hsa_executable_symbol_t *symbol); | |
149 hsa_status_t (*hsa_executable_symbol_get_info_fn) | |
150 (hsa_executable_symbol_t executable_symbol, | |
151 hsa_executable_symbol_info_t attribute, void *value); | |
152 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal, | |
153 hsa_signal_value_t value); | |
154 hsa_signal_value_t (*hsa_signal_wait_acquire_fn) | |
155 (hsa_signal_t signal, hsa_signal_condition_t condition, | |
156 hsa_signal_value_t compare_value, uint64_t timeout_hint, | |
157 hsa_wait_state_t wait_state_hint); | |
158 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn) | |
159 (hsa_signal_t signal, hsa_signal_condition_t condition, | |
160 hsa_signal_value_t compare_value, uint64_t timeout_hint, | |
161 hsa_wait_state_t wait_state_hint); | |
162 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue); | |
163 hsa_status_t (*hsa_code_object_deserialize_fn) | |
164 (void *serialized_code_object, size_t serialized_code_object_size, | |
165 const char *options, hsa_code_object_t *code_object); | |
166 uint64_t (*hsa_queue_load_write_index_relaxed_fn) | |
167 (const hsa_queue_t *queue); | |
168 void (*hsa_queue_store_write_index_relaxed_fn) | |
169 (const hsa_queue_t *queue, uint64_t value); | |
170 hsa_status_t (*hsa_shut_down_fn) (); | |
171 }; | |
172 | |
173 /* HSA runtime functions that are initialized in init_hsa_context. | |
174 Code adapted from libgomp. */ | |
175 | |
176 static struct hsa_runtime_fn_info hsa_fns; | |
177 | |
178 #define DLSYM_FN(function) \ | |
179 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \ | |
180 if (hsa_fns.function##_fn == NULL) \ | |
181 goto fail; | |
182 | |
183 static void | |
184 init_hsa_runtime_functions (void) | |
185 { | |
186 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY); | |
187 if (handle == NULL) | |
188 { | |
189 fprintf (stderr, | |
190 "The HSA runtime is required to run GCN kernels on hardware.\n" | |
191 "%s: File not found or could not be opened\n", | |
192 HSA_RUNTIME_LIB); | |
193 exit (1); | |
194 } | |
195 | |
196 DLSYM_FN (hsa_status_string) | |
197 DLSYM_FN (hsa_agent_get_info) | |
198 DLSYM_FN (hsa_init) | |
199 DLSYM_FN (hsa_iterate_agents) | |
200 DLSYM_FN (hsa_region_get_info) | |
201 DLSYM_FN (hsa_queue_create) | |
202 DLSYM_FN (hsa_agent_iterate_regions) | |
203 DLSYM_FN (hsa_executable_destroy) | |
204 DLSYM_FN (hsa_executable_create) | |
205 DLSYM_FN (hsa_executable_global_variable_define) | |
206 DLSYM_FN (hsa_executable_load_code_object) | |
207 DLSYM_FN (hsa_executable_freeze) | |
208 DLSYM_FN (hsa_signal_create) | |
209 DLSYM_FN (hsa_memory_allocate) | |
210 DLSYM_FN (hsa_memory_assign_agent) | |
211 DLSYM_FN (hsa_memory_copy) | |
212 DLSYM_FN (hsa_memory_free) | |
213 DLSYM_FN (hsa_signal_destroy) | |
214 DLSYM_FN (hsa_executable_get_symbol) | |
215 DLSYM_FN (hsa_executable_symbol_get_info) | |
216 DLSYM_FN (hsa_signal_wait_acquire) | |
217 DLSYM_FN (hsa_signal_wait_relaxed) | |
218 DLSYM_FN (hsa_signal_store_relaxed) | |
219 DLSYM_FN (hsa_queue_destroy) | |
220 DLSYM_FN (hsa_code_object_deserialize) | |
221 DLSYM_FN (hsa_queue_load_write_index_relaxed) | |
222 DLSYM_FN (hsa_queue_store_write_index_relaxed) | |
223 DLSYM_FN (hsa_shut_down) | |
224 | |
225 return; | |
226 | |
227 fail: | |
228 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n"); | |
229 exit (1); | |
230 } | |
231 | |
232 #undef DLSYM_FN | |
233 | |
234 /* Report a fatal error STR together with the HSA error corresponding to | |
235 STATUS and terminate execution of the current process. */ | |
236 | |
237 static void | |
238 hsa_fatal (const char *str, hsa_status_t status) | |
239 { | |
240 const char *hsa_error_msg; | |
241 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); | |
242 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str, | |
243 hsa_error_msg); | |
244 exit (1); | |
245 } | |
246 | |
247 /* Helper macros to ensure we check the return values from the HSA Runtime. | |
248 These just keep the rest of the code a bit cleaner. */ | |
249 | |
250 #define XHSA_CMP(FN, CMP, MSG) \ | |
251 do { \ | |
252 hsa_status_t status = (FN); \ | |
253 if (!(CMP)) \ | |
254 hsa_fatal ((MSG), status); \ | |
255 else if (debug) \ | |
256 fprintf (stderr, "%s: OK\n", (MSG)); \ | |
257 } while (0) | |
258 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG) | |
259 | |
260 /* Callback of hsa_iterate_agents. | |
261 Called once for each available device, and returns "break" when a | |
262 suitable one has been found. */ | |
263 | |
264 static hsa_status_t | |
265 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused))) | |
266 { | |
267 hsa_device_type_t device_type; | |
268 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, | |
269 &device_type), | |
270 "Get agent type"); | |
271 | |
272 /* Select only GPU devices. */ | |
273 /* TODO: support selecting from multiple GPUs. */ | |
274 if (HSA_DEVICE_TYPE_GPU == device_type) | |
275 { | |
276 device = agent; | |
277 return HSA_STATUS_INFO_BREAK; | |
278 } | |
279 | |
280 /* The device was not suitable. */ | |
281 return HSA_STATUS_SUCCESS; | |
282 } | |
283 | |
284 /* Callback of hsa_iterate_regions. | |
285 Called once for each available memory region, and returns "break" when a | |
286 suitable one has been found. */ | |
287 | |
288 static hsa_status_t | |
289 get_memory_region (hsa_region_t region, hsa_region_t *retval, | |
290 hsa_region_global_flag_t kind) | |
291 { | |
292 /* Reject non-global regions. */ | |
293 hsa_region_segment_t segment; | |
294 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment); | |
295 if (HSA_REGION_SEGMENT_GLOBAL != segment) | |
296 return HSA_STATUS_SUCCESS; | |
297 | |
298 /* Find a region with the KERNARG flag set. */ | |
299 hsa_region_global_flag_t flags; | |
300 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, | |
301 &flags); | |
302 if (flags & kind) | |
303 { | |
304 *retval = region; | |
305 return HSA_STATUS_INFO_BREAK; | |
306 } | |
307 | |
308 /* The region was not suitable. */ | |
309 return HSA_STATUS_SUCCESS; | |
310 } | |
311 | |
312 static hsa_status_t | |
313 get_kernarg_region (hsa_region_t region, void *data __attribute__((unused))) | |
314 { | |
315 return get_memory_region (region, &kernargs_region, | |
316 HSA_REGION_GLOBAL_FLAG_KERNARG); | |
317 } | |
318 | |
319 static hsa_status_t | |
320 get_heap_region (hsa_region_t region, void *data __attribute__((unused))) | |
321 { | |
322 return get_memory_region (region, &heap_region, | |
323 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); | |
324 } | |
325 | |
326 /* Initialize the HSA Runtime library and GPU device. */ | |
327 | |
328 static void | |
329 init_device () | |
330 { | |
331 /* Load the shared library and find the API functions. */ | |
332 init_hsa_runtime_functions (); | |
333 | |
334 /* Initialize the HSA Runtime. */ | |
335 XHSA (hsa_fns.hsa_init_fn (), | |
336 "Initialize run-time"); | |
337 | |
338 /* Select a suitable device. | |
339 The call-back function, get_gpu_agent, does the selection. */ | |
340 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL), | |
341 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, | |
342 "Find a device"); | |
343 | |
344 /* Initialize the queue used for launching kernels. */ | |
345 uint32_t queue_size = 0; | |
346 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, | |
347 &queue_size), | |
348 "Find max queue size"); | |
349 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size, | |
350 HSA_QUEUE_TYPE_SINGLE, NULL, | |
351 NULL, UINT32_MAX, UINT32_MAX, &queue), | |
352 "Set up a device queue"); | |
353 | |
354 /* Select a memory region for the kernel arguments. | |
355 The call-back function, get_kernarg_region, does the selection. */ | |
356 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region, | |
357 NULL), | |
358 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, | |
359 "Locate kernargs memory"); | |
360 | |
361 /* Select a memory region for the kernel heap. | |
362 The call-back function, get_heap_region, does the selection. */ | |
363 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region, | |
364 NULL), | |
365 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, | |
366 "Locate device memory"); | |
367 } | |
368 | |
369 | |
370 /* Read a whole input file. | |
371 Code copied from mkoffload. */ | |
372 | |
373 static char * | |
374 read_file (const char *filename, size_t *plen) | |
375 { | |
376 size_t alloc = 16384; | |
377 size_t base = 0; | |
378 char *buffer; | |
379 | |
380 FILE *stream = fopen (filename, "rb"); | |
381 if (!stream) | |
382 { | |
383 perror (filename); | |
384 exit (1); | |
385 } | |
386 | |
387 if (!fseek (stream, 0, SEEK_END)) | |
388 { | |
389 /* Get the file size. */ | |
390 long s = ftell (stream); | |
391 if (s >= 0) | |
392 alloc = s + 100; | |
393 fseek (stream, 0, SEEK_SET); | |
394 } | |
395 buffer = malloc (alloc); | |
396 | |
397 for (;;) | |
398 { | |
399 size_t n = fread (buffer + base, 1, alloc - base - 1, stream); | |
400 | |
401 if (!n) | |
402 break; | |
403 base += n; | |
404 if (base + 1 == alloc) | |
405 { | |
406 alloc *= 2; | |
407 buffer = realloc (buffer, alloc); | |
408 } | |
409 } | |
410 buffer[base] = 0; | |
411 *plen = base; | |
412 | |
413 fclose (stream); | |
414 | |
415 return buffer; | |
416 } | |
417 | |
418 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */ | |
419 | |
420 static void | |
421 load_image (const char *filename) | |
422 { | |
423 size_t image_size; | |
424 Elf64_Ehdr *image = (void *) read_file (filename, &image_size); | |
425 | |
426 /* An "executable" consists of one or more code objects. */ | |
427 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL, | |
428 HSA_EXECUTABLE_STATE_UNFROZEN, "", | |
429 &executable), | |
430 "Initialize GCN executable"); | |
431 | |
432 /* Hide relocations from the HSA runtime loader. | |
433 Keep a copy of the unmodified section headers to use later. */ | |
434 Elf64_Shdr *image_sections = | |
435 (Elf64_Shdr *) ((char *) image + image->e_shoff); | |
436 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum); | |
437 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum); | |
438 for (int i = image->e_shnum - 1; i >= 0; i--) | |
439 { | |
440 if (image_sections[i].sh_type == SHT_RELA | |
441 || image_sections[i].sh_type == SHT_REL) | |
442 /* Change section type to something harmless. */ | |
443 image_sections[i].sh_type = SHT_NOTE; | |
444 } | |
445 | |
446 /* Add the HSACO to the executable. */ | |
447 hsa_code_object_t co = { 0 }; | |
448 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co), | |
449 "Deserialize GCN code object"); | |
450 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co, | |
451 ""), | |
452 "Load GCN code object"); | |
453 | |
454 /* We're done modifying he executable. */ | |
455 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""), | |
456 "Freeze GCN executable"); | |
457 | |
458 /* Locate the "_init_array" function, and read the kernel's properties. */ | |
459 hsa_executable_symbol_t symbol; | |
460 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array", | |
461 device, 0, &symbol), | |
462 "Find '_init_array' function"); | |
463 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
464 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel), | |
465 "Extract '_init_array' kernel object kernel object"); | |
466 | |
467 /* Locate the "_fini_array" function, and read the kernel's properties. */ | |
468 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array", | |
469 device, 0, &symbol), | |
470 "Find '_fini_array' function"); | |
471 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
472 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel), | |
473 "Extract '_fini_array' kernel object kernel object"); | |
474 | |
475 /* Locate the "main" function, and read the kernel's properties. */ | |
476 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main", | |
477 device, 0, &symbol), | |
478 "Find 'main' function"); | |
479 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
480 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel), | |
481 "Extract 'main' kernel object"); | |
482 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
483 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
484 &kernarg_segment_size), | |
485 "Extract kernarg segment size"); | |
486 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
487 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
488 &group_segment_size), | |
489 "Extract group segment size"); | |
490 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
491 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
492 &private_segment_size), | |
493 "Extract private segment size"); | |
494 | |
495 /* Find main function in ELF, and calculate actual load offset. */ | |
496 Elf64_Addr load_offset; | |
497 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
498 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, | |
499 &load_offset), | |
500 "Extract 'main' symbol address"); | |
501 for (int i = 0; i < image->e_shnum; i++) | |
502 if (sections[i].sh_type == SHT_SYMTAB) | |
503 { | |
504 Elf64_Shdr *strtab = §ions[sections[i].sh_link]; | |
505 char *strings = (char *) image + strtab->sh_offset; | |
506 | |
507 for (size_t offset = 0; | |
508 offset < sections[i].sh_size; | |
509 offset += sections[i].sh_entsize) | |
510 { | |
511 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image | |
512 + sections[i].sh_offset + offset); | |
513 if (strcmp ("main", strings + sym->st_name) == 0) | |
514 { | |
515 load_offset -= sym->st_value; | |
516 goto found_main; | |
517 } | |
518 } | |
519 } | |
520 /* We only get here when main was not found. | |
521 This should never happen. */ | |
522 fprintf (stderr, "Error: main function not found.\n"); | |
523 abort (); | |
524 found_main:; | |
525 | |
526 /* Find dynamic symbol table. */ | |
527 Elf64_Shdr *dynsym = NULL; | |
528 for (int i = 0; i < image->e_shnum; i++) | |
529 if (sections[i].sh_type == SHT_DYNSYM) | |
530 { | |
531 dynsym = §ions[i]; | |
532 break; | |
533 } | |
534 | |
535 /* Fix up relocations. */ | |
536 for (int i = 0; i < image->e_shnum; i++) | |
537 { | |
538 if (sections[i].sh_type == SHT_RELA) | |
539 for (size_t offset = 0; | |
540 offset < sections[i].sh_size; | |
541 offset += sections[i].sh_entsize) | |
542 { | |
543 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image | |
544 + sections[i].sh_offset | |
545 + offset); | |
546 Elf64_Sym *sym = | |
547 (dynsym | |
548 ? (Elf64_Sym *) ((char *) image | |
549 + dynsym->sh_offset | |
550 + (dynsym->sh_entsize | |
551 * ELF64_R_SYM (reloc->r_info))) : NULL); | |
552 | |
553 int64_t S = (sym ? sym->st_value : 0); | |
554 int64_t P = reloc->r_offset + load_offset; | |
555 int64_t A = reloc->r_addend; | |
556 int64_t B = load_offset; | |
557 int64_t V, size; | |
558 switch (ELF64_R_TYPE (reloc->r_info)) | |
559 { | |
560 case R_AMDGPU_ABS32_LO: | |
561 V = (S + A) & 0xFFFFFFFF; | |
562 size = 4; | |
563 break; | |
564 case R_AMDGPU_ABS32_HI: | |
565 V = (S + A) >> 32; | |
566 size = 4; | |
567 break; | |
568 case R_AMDGPU_ABS64: | |
569 V = S + A; | |
570 size = 8; | |
571 break; | |
572 case R_AMDGPU_REL32: | |
573 V = S + A - P; | |
574 size = 4; | |
575 break; | |
576 case R_AMDGPU_REL64: | |
577 /* FIXME | |
578 LLD seems to emit REL64 where the the assembler has ABS64. | |
579 This is clearly wrong because it's not what the compiler | |
580 is expecting. Let's assume, for now, that it's a bug. | |
581 In any case, GCN kernels are always self contained and | |
582 therefore relative relocations will have been resolved | |
583 already, so this should be a safe workaround. */ | |
584 V = S + A /* - P */ ; | |
585 size = 8; | |
586 break; | |
587 case R_AMDGPU_ABS32: | |
588 V = S + A; | |
589 size = 4; | |
590 break; | |
591 /* TODO R_AMDGPU_GOTPCREL */ | |
592 /* TODO R_AMDGPU_GOTPCREL32_LO */ | |
593 /* TODO R_AMDGPU_GOTPCREL32_HI */ | |
594 case R_AMDGPU_REL32_LO: | |
595 V = (S + A - P) & 0xFFFFFFFF; | |
596 size = 4; | |
597 break; | |
598 case R_AMDGPU_REL32_HI: | |
599 V = (S + A - P) >> 32; | |
600 size = 4; | |
601 break; | |
602 case R_AMDGPU_RELATIVE64: | |
603 V = B + A; | |
604 size = 8; | |
605 break; | |
606 default: | |
607 fprintf (stderr, "Error: unsupported relocation type.\n"); | |
608 exit (1); | |
609 } | |
610 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size), | |
611 "Fix up relocation"); | |
612 } | |
613 } | |
614 } | |
615 | |
616 /* Allocate some device memory from the kernargs region. | |
617 The returned address will be 32-bit (with excess zeroed on 64-bit host), | |
618 and accessible via the same address on both host and target (via | |
619 __flat_scalar GCN address space). */ | |
620 | |
621 static void * | |
622 device_malloc (size_t size, hsa_region_t region) | |
623 { | |
624 void *result; | |
625 XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result), | |
626 "Allocate device memory"); | |
627 return result; | |
628 } | |
629 | |
630 /* These are the device pointers that will be transferred to the target. | |
631 The HSA Runtime points the kernargs register here. | |
632 They correspond to function signature: | |
633 int main (int argc, char *argv[], int *return_value) | |
634 The compiler expects this, for kernel functions, and will | |
635 automatically assign the exit value to *return_value. */ | |
636 struct kernargs | |
637 { | |
638 /* Kernargs. */ | |
639 int32_t argc; | |
640 int64_t argv; | |
641 int64_t out_ptr; | |
642 int64_t heap_ptr; | |
643 | |
644 /* Output data. */ | |
645 struct output | |
646 { | |
647 int return_value; | |
648 unsigned int next_output; | |
649 struct printf_data | |
650 { | |
651 int written; | |
652 char msg[128]; | |
653 int type; | |
654 union | |
655 { | |
656 int64_t ivalue; | |
657 double dvalue; | |
658 char text[128]; | |
659 }; | |
660 } queue[1024]; | |
661 unsigned int consumed; | |
662 } output_data; | |
663 }; | |
664 | |
665 struct heap | |
666 { | |
667 int64_t size; | |
668 char data[0]; | |
669 } heap; | |
670 | |
671 /* Print any console output from the kernel. | |
672 We print all entries from "consumed" to the next entry without a "written" | |
673 flag, or "next_output" is reached. The buffer is circular, but the | |
674 indices are absolute. It is assumed the kernel will stop writing data | |
675 if "next_output" wraps (becomes smaller than "consumed"). */ | |
676 void | |
677 gomp_print_output (struct kernargs *kernargs, bool final) | |
678 { | |
679 unsigned int limit = (sizeof (kernargs->output_data.queue) | |
680 / sizeof (kernargs->output_data.queue[0])); | |
681 | |
682 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed, | |
683 __ATOMIC_ACQUIRE); | |
684 unsigned int to = kernargs->output_data.next_output; | |
685 | |
686 if (from > to) | |
687 { | |
688 /* Overflow. */ | |
689 if (final) | |
690 printf ("GCN print buffer overflowed.\n"); | |
691 return; | |
692 } | |
693 | |
694 unsigned int i; | |
695 for (i = from; i < to; i++) | |
696 { | |
697 struct printf_data *data = &kernargs->output_data.queue[i%limit]; | |
698 | |
699 if (!data->written && !final) | |
700 break; | |
701 | |
702 switch (data->type) | |
703 { | |
704 case 0: | |
705 printf ("%.128s%ld\n", data->msg, data->ivalue); | |
706 break; | |
707 case 1: | |
708 printf ("%.128s%f\n", data->msg, data->dvalue); | |
709 break; | |
710 case 2: | |
711 printf ("%.128s%.128s\n", data->msg, data->text); | |
712 break; | |
713 case 3: | |
714 printf ("%.128s%.128s", data->msg, data->text); | |
715 break; | |
716 default: | |
717 printf ("GCN print buffer error!\n"); | |
718 break; | |
719 } | |
720 | |
721 data->written = 0; | |
722 __atomic_store_n (&kernargs->output_data.consumed, i+1, | |
723 __ATOMIC_RELEASE); | |
724 } | |
725 fflush (stdout); | |
726 } | |
727 | |
728 /* Execute an already-loaded kernel on the device. */ | |
729 | |
730 static void | |
731 run (uint64_t kernel, void *kernargs) | |
732 { | |
733 /* A "signal" is used to launch and monitor the kernel. */ | |
734 hsa_signal_t signal; | |
735 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal), | |
736 "Create signal"); | |
737 | |
738 /* Configure for a single-worker kernel. */ | |
739 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue); | |
740 const uint32_t queueMask = queue->size - 1; | |
741 hsa_kernel_dispatch_packet_t *dispatch_packet = | |
742 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index & | |
743 queueMask]); | |
744 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
745 dispatch_packet->workgroup_size_x = (uint16_t) 1; | |
746 dispatch_packet->workgroup_size_y = (uint16_t) 64; | |
747 dispatch_packet->workgroup_size_z = (uint16_t) 1; | |
748 dispatch_packet->grid_size_x = 1; | |
749 dispatch_packet->grid_size_y = 64; | |
750 dispatch_packet->grid_size_z = 1; | |
751 dispatch_packet->completion_signal = signal; | |
752 dispatch_packet->kernel_object = kernel; | |
753 dispatch_packet->kernarg_address = (void *) kernargs; | |
754 dispatch_packet->private_segment_size = private_segment_size; | |
755 dispatch_packet->group_segment_size = group_segment_size; | |
756 | |
757 uint16_t header = 0; | |
758 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; | |
759 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; | |
760 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; | |
761 | |
762 __atomic_store_n ((uint32_t *) dispatch_packet, | |
763 header | (dispatch_packet->setup << 16), | |
764 __ATOMIC_RELEASE); | |
765 | |
766 if (debug) | |
767 fprintf (stderr, "Launch kernel\n"); | |
768 | |
769 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1); | |
770 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index); | |
771 /* Kernel running ...... */ | |
772 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT, | |
773 1, 1000000, | |
774 HSA_WAIT_STATE_ACTIVE) != 0) | |
775 { | |
776 usleep (10000); | |
777 gomp_print_output (kernargs, false); | |
778 } | |
779 | |
780 gomp_print_output (kernargs, true); | |
781 | |
782 if (debug) | |
783 fprintf (stderr, "Kernel exited\n"); | |
784 | |
785 XHSA (hsa_fns.hsa_signal_destroy_fn (signal), | |
786 "Clean up signal"); | |
787 } | |
788 | |
789 int | |
790 main (int argc, char *argv[]) | |
791 { | |
792 int kernel_arg = 0; | |
793 for (int i = 1; i < argc; i++) | |
794 { | |
795 if (!strcmp (argv[i], "--help")) | |
796 { | |
797 usage (argv[0]); | |
798 return 0; | |
799 } | |
800 else if (!strcmp (argv[i], "--version")) | |
801 { | |
802 version (argv[0]); | |
803 return 0; | |
804 } | |
805 else if (!strcmp (argv[i], "--debug")) | |
806 debug = true; | |
807 else if (argv[i][0] == '-') | |
808 { | |
809 usage (argv[0]); | |
810 return 1; | |
811 } | |
812 else | |
813 { | |
814 kernel_arg = i; | |
815 break; | |
816 } | |
817 } | |
818 | |
819 if (!kernel_arg) | |
820 { | |
821 /* No kernel arguments were found. */ | |
822 usage (argv[0]); | |
823 return 1; | |
824 } | |
825 | |
826 /* The remaining arguments are for the GCN kernel. */ | |
827 int kernel_argc = argc - kernel_arg; | |
828 char **kernel_argv = &argv[kernel_arg]; | |
829 | |
830 init_device (); | |
831 load_image (kernel_argv[0]); | |
832 | |
833 /* Calculate size of function parameters + argv data. */ | |
834 size_t args_size = 0; | |
835 for (int i = 0; i < kernel_argc; i++) | |
836 args_size += strlen (kernel_argv[i]) + 1; | |
837 | |
838 /* Allocate device memory for both function parameters and the argv | |
839 data. */ | |
840 struct kernargs *kernargs = device_malloc (sizeof (*kernargs), | |
841 kernargs_region); | |
842 struct argdata | |
843 { | |
844 int64_t argv_data[kernel_argc]; | |
845 char strings[args_size]; | |
846 } *args = device_malloc (sizeof (struct argdata), kernargs_region); | |
847 | |
848 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ | |
849 struct heap *heap = device_malloc (heap_size, heap_region); | |
850 XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, | |
851 HSA_ACCESS_PERMISSION_RW), | |
852 "Assign heap to device agent"); | |
853 | |
854 /* Write the data to the target. */ | |
855 kernargs->argc = kernel_argc; | |
856 kernargs->argv = (int64_t) args->argv_data; | |
857 kernargs->out_ptr = (int64_t) &kernargs->output_data; | |
858 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */ | |
859 kernargs->output_data.next_output = 0; | |
860 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) | |
861 / sizeof (kernargs->output_data.queue[0])); i++) | |
862 kernargs->output_data.queue[i].written = 0; | |
863 kernargs->output_data.consumed = 0; | |
864 int offset = 0; | |
865 for (int i = 0; i < kernel_argc; i++) | |
866 { | |
867 size_t arg_len = strlen (kernel_argv[i]) + 1; | |
868 args->argv_data[i] = (int64_t) &args->strings[offset]; | |
869 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); | |
870 offset += arg_len; | |
871 } | |
872 kernargs->heap_ptr = (int64_t) heap; | |
873 hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); | |
874 | |
875 /* Run constructors on the GPU. */ | |
876 run (init_array_kernel, kernargs); | |
877 | |
878 /* Run the kernel on the GPU. */ | |
879 run (main_kernel, kernargs); | |
880 unsigned int return_value = | |
881 (unsigned int) kernargs->output_data.return_value; | |
882 | |
883 /* Run destructors on the GPU. */ | |
884 run (fini_array_kernel, kernargs); | |
885 | |
886 unsigned int upper = (return_value & ~0xffff) >> 16; | |
887 if (upper == 0xcafe) | |
888 { | |
889 printf ("Kernel exit value was never set\n"); | |
890 return_value = 0xff; | |
891 } | |
892 else if (upper == 0xffff) | |
893 ; /* Set by exit. */ | |
894 else if (upper == 0) | |
895 ; /* Set by return from main. */ | |
896 else | |
897 printf ("Possible kernel exit value corruption, 2 most significant bytes " | |
898 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value); | |
899 | |
900 if (upper == 0xffff) | |
901 { | |
902 unsigned int signal = (return_value >> 8) & 0xff; | |
903 if (signal == SIGABRT) | |
904 printf ("Kernel aborted\n"); | |
905 else if (signal != 0) | |
906 printf ("Kernel received unkown signal\n"); | |
907 } | |
908 | |
909 if (debug) | |
910 printf ("Kernel exit value: %d\n", return_value & 0xff); | |
911 | |
912 /* Clean shut down. */ | |
913 XHSA (hsa_fns.hsa_memory_free_fn (kernargs), | |
914 "Clean up device memory"); | |
915 XHSA (hsa_fns.hsa_executable_destroy_fn (executable), | |
916 "Clean up GCN executable"); | |
917 XHSA (hsa_fns.hsa_queue_destroy_fn (queue), | |
918 "Clean up device queue"); | |
919 XHSA (hsa_fns.hsa_shut_down_fn (), | |
920 "Shut down run-time"); | |
921 | |
922 return return_value & 0xff; | |
923 } |