Mercurial > hg > CbC > CbC_gcc
diff gcc/config/gcn/gcn-run.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gcc/config/gcn/gcn-run.c Thu Feb 13 11:34:05 2020 +0900 @@ -0,0 +1,923 @@ +/* Run a stand-alone AMD GCN kernel. + + Copyright 2017 Mentor Graphics Corporation + Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* This program will run a compiled stand-alone GCN kernel on a GPU. + + The kernel entry point's signature must use a standard main signature: + + int main(int argc, char **argv) +*/ + +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <malloc.h> +#include <stdio.h> +#include <string.h> +#include <dlfcn.h> +#include <unistd.h> +#include <elf.h> +#include <signal.h> + +/* These probably won't be in elf.h for a while. */ +#ifndef R_AMDGPU_NONE +#define R_AMDGPU_NONE 0 +#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */ +#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */ +#define R_AMDGPU_ABS64 3 /* S + A */ +#define R_AMDGPU_REL32 4 /* S + A - P */ +#define R_AMDGPU_REL64 5 /* S + A - P */ +#define R_AMDGPU_ABS32 6 /* S + A */ +#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */ +#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */ +#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */ +#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */ +#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */ +#define reserved 12 +#define R_AMDGPU_RELATIVE64 13 /* B + A */ +#endif + +#include "hsa.h" + +#ifndef HSA_RUNTIME_LIB +#define HSA_RUNTIME_LIB "libhsa-runtime64.so" +#endif + +#ifndef VERSION_STRING +#define VERSION_STRING "(version unknown)" +#endif + +bool debug = false; + +hsa_agent_t device = { 0 }; +hsa_queue_t *queue = NULL; +uint64_t init_array_kernel = 0; +uint64_t fini_array_kernel = 0; +uint64_t main_kernel = 0; +hsa_executable_t executable = { 0 }; + +hsa_region_t kernargs_region = { 0 }; +hsa_region_t heap_region = { 0 }; +uint32_t kernarg_segment_size = 0; +uint32_t group_segment_size = 0; +uint32_t private_segment_size = 0; + +static void +usage (const char *progname) +{ + printf ("Usage: %s [options] kernel [kernel-args]\n\n" + "Options:\n" + " --help\n" + " --version\n" + " --debug\n", progname); +} + +static void +version (const char *progname) +{ + printf ("%s " VERSION_STRING "\n", progname); +} + +/* As an HSA runtime is dlopened, following structure defines the necessary + function pointers. + Code adapted from libgomp. */ + +struct hsa_runtime_fn_info +{ + /* HSA runtime. */ + hsa_status_t (*hsa_status_string_fn) (hsa_status_t status, + const char **status_string); + hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent, + hsa_agent_info_t attribute, + void *value); + hsa_status_t (*hsa_init_fn) (void); + hsa_status_t (*hsa_iterate_agents_fn) + (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data); + hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region, + hsa_region_info_t attribute, + void *value); + hsa_status_t (*hsa_queue_create_fn) + (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, + void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data), + void *data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t **queue); + hsa_status_t (*hsa_agent_iterate_regions_fn) + (hsa_agent_t agent, + hsa_status_t (*callback) (hsa_region_t region, void *data), void *data); + hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable); + hsa_status_t (*hsa_executable_create_fn) + (hsa_profile_t profile, hsa_executable_state_t executable_state, + const char *options, hsa_executable_t *executable); + hsa_status_t (*hsa_executable_global_variable_define_fn) + (hsa_executable_t executable, const char *variable_name, void *address); + hsa_status_t (*hsa_executable_load_code_object_fn) + (hsa_executable_t executable, hsa_agent_t agent, + hsa_code_object_t code_object, const char *options); + hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable, + const char *options); + hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_t *signal); + hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, + void **ptr); + hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent, + hsa_access_permission_t access); + hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, + size_t size); + hsa_status_t (*hsa_memory_free_fn) (void *ptr); + hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal); + hsa_status_t (*hsa_executable_get_symbol_fn) + (hsa_executable_t executable, const char *module_name, + const char *symbol_name, hsa_agent_t agent, int32_t call_convention, + hsa_executable_symbol_t *symbol); + hsa_status_t (*hsa_executable_symbol_get_info_fn) + (hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, void *value); + void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal, + hsa_signal_value_t value); + hsa_signal_value_t (*hsa_signal_wait_acquire_fn) + (hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + hsa_signal_value_t (*hsa_signal_wait_relaxed_fn) + (hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue); + hsa_status_t (*hsa_code_object_deserialize_fn) + (void *serialized_code_object, size_t serialized_code_object_size, + const char *options, hsa_code_object_t *code_object); + uint64_t (*hsa_queue_load_write_index_relaxed_fn) + (const hsa_queue_t *queue); + void (*hsa_queue_store_write_index_relaxed_fn) + (const hsa_queue_t *queue, uint64_t value); + hsa_status_t (*hsa_shut_down_fn) (); +}; + +/* HSA runtime functions that are initialized in init_hsa_context. + Code adapted from libgomp. */ + +static struct hsa_runtime_fn_info hsa_fns; + +#define DLSYM_FN(function) \ + *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \ + if (hsa_fns.function##_fn == NULL) \ + goto fail; + +static void +init_hsa_runtime_functions (void) +{ + void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY); + if (handle == NULL) + { + fprintf (stderr, + "The HSA runtime is required to run GCN kernels on hardware.\n" + "%s: File not found or could not be opened\n", + HSA_RUNTIME_LIB); + exit (1); + } + + DLSYM_FN (hsa_status_string) + DLSYM_FN (hsa_agent_get_info) + DLSYM_FN (hsa_init) + DLSYM_FN (hsa_iterate_agents) + DLSYM_FN (hsa_region_get_info) + DLSYM_FN (hsa_queue_create) + DLSYM_FN (hsa_agent_iterate_regions) + DLSYM_FN (hsa_executable_destroy) + DLSYM_FN (hsa_executable_create) + DLSYM_FN (hsa_executable_global_variable_define) + DLSYM_FN (hsa_executable_load_code_object) + DLSYM_FN (hsa_executable_freeze) + DLSYM_FN (hsa_signal_create) + DLSYM_FN (hsa_memory_allocate) + DLSYM_FN (hsa_memory_assign_agent) + DLSYM_FN (hsa_memory_copy) + DLSYM_FN (hsa_memory_free) + DLSYM_FN (hsa_signal_destroy) + DLSYM_FN (hsa_executable_get_symbol) + DLSYM_FN (hsa_executable_symbol_get_info) + DLSYM_FN (hsa_signal_wait_acquire) + DLSYM_FN (hsa_signal_wait_relaxed) + DLSYM_FN (hsa_signal_store_relaxed) + DLSYM_FN (hsa_queue_destroy) + DLSYM_FN (hsa_code_object_deserialize) + DLSYM_FN (hsa_queue_load_write_index_relaxed) + DLSYM_FN (hsa_queue_store_write_index_relaxed) + DLSYM_FN (hsa_shut_down) + + return; + +fail: + fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n"); + exit (1); +} + +#undef DLSYM_FN + +/* Report a fatal error STR together with the HSA error corresponding to + STATUS and terminate execution of the current process. */ + +static void +hsa_fatal (const char *str, hsa_status_t status) +{ + const char *hsa_error_msg; + hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); + fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str, + hsa_error_msg); + exit (1); +} + +/* Helper macros to ensure we check the return values from the HSA Runtime. + These just keep the rest of the code a bit cleaner. */ + +#define XHSA_CMP(FN, CMP, MSG) \ + do { \ + hsa_status_t status = (FN); \ + if (!(CMP)) \ + hsa_fatal ((MSG), status); \ + else if (debug) \ + fprintf (stderr, "%s: OK\n", (MSG)); \ + } while (0) +#define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG) + +/* Callback of hsa_iterate_agents. + Called once for each available device, and returns "break" when a + suitable one has been found. */ + +static hsa_status_t +get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused))) +{ + hsa_device_type_t device_type; + XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, + &device_type), + "Get agent type"); + + /* Select only GPU devices. */ + /* TODO: support selecting from multiple GPUs. */ + if (HSA_DEVICE_TYPE_GPU == device_type) + { + device = agent; + return HSA_STATUS_INFO_BREAK; + } + + /* The device was not suitable. */ + return HSA_STATUS_SUCCESS; +} + +/* Callback of hsa_iterate_regions. + Called once for each available memory region, and returns "break" when a + suitable one has been found. */ + +static hsa_status_t +get_memory_region (hsa_region_t region, hsa_region_t *retval, + hsa_region_global_flag_t kind) +{ + /* Reject non-global regions. */ + hsa_region_segment_t segment; + hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment); + if (HSA_REGION_SEGMENT_GLOBAL != segment) + return HSA_STATUS_SUCCESS; + + /* Find a region with the KERNARG flag set. */ + hsa_region_global_flag_t flags; + hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, + &flags); + if (flags & kind) + { + *retval = region; + return HSA_STATUS_INFO_BREAK; + } + + /* The region was not suitable. */ + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t +get_kernarg_region (hsa_region_t region, void *data __attribute__((unused))) +{ + return get_memory_region (region, &kernargs_region, + HSA_REGION_GLOBAL_FLAG_KERNARG); +} + +static hsa_status_t +get_heap_region (hsa_region_t region, void *data __attribute__((unused))) +{ + return get_memory_region (region, &heap_region, + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); +} + +/* Initialize the HSA Runtime library and GPU device. */ + +static void +init_device () +{ + /* Load the shared library and find the API functions. */ + init_hsa_runtime_functions (); + + /* Initialize the HSA Runtime. */ + XHSA (hsa_fns.hsa_init_fn (), + "Initialize run-time"); + + /* Select a suitable device. + The call-back function, get_gpu_agent, does the selection. */ + XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL), + status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, + "Find a device"); + + /* Initialize the queue used for launching kernels. */ + uint32_t queue_size = 0; + XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, + &queue_size), + "Find max queue size"); + XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size, + HSA_QUEUE_TYPE_SINGLE, NULL, + NULL, UINT32_MAX, UINT32_MAX, &queue), + "Set up a device queue"); + + /* Select a memory region for the kernel arguments. + The call-back function, get_kernarg_region, does the selection. */ + XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region, + NULL), + status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, + "Locate kernargs memory"); + + /* Select a memory region for the kernel heap. + The call-back function, get_heap_region, does the selection. */ + XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region, + NULL), + status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, + "Locate device memory"); +} + + +/* Read a whole input file. + Code copied from mkoffload. */ + +static char * +read_file (const char *filename, size_t *plen) +{ + size_t alloc = 16384; + size_t base = 0; + char *buffer; + + FILE *stream = fopen (filename, "rb"); + if (!stream) + { + perror (filename); + exit (1); + } + + if (!fseek (stream, 0, SEEK_END)) + { + /* Get the file size. */ + long s = ftell (stream); + if (s >= 0) + alloc = s + 100; + fseek (stream, 0, SEEK_SET); + } + buffer = malloc (alloc); + + for (;;) + { + size_t n = fread (buffer + base, 1, alloc - base - 1, stream); + + if (!n) + break; + base += n; + if (base + 1 == alloc) + { + alloc *= 2; + buffer = realloc (buffer, alloc); + } + } + buffer[base] = 0; + *plen = base; + + fclose (stream); + + return buffer; +} + +/* Read a HSA Code Object (HSACO) from file, and load it into the device. */ + +static void +load_image (const char *filename) +{ + size_t image_size; + Elf64_Ehdr *image = (void *) read_file (filename, &image_size); + + /* An "executable" consists of one or more code objects. */ + XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL, + HSA_EXECUTABLE_STATE_UNFROZEN, "", + &executable), + "Initialize GCN executable"); + + /* Hide relocations from the HSA runtime loader. + Keep a copy of the unmodified section headers to use later. */ + Elf64_Shdr *image_sections = + (Elf64_Shdr *) ((char *) image + image->e_shoff); + Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum); + memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum); + for (int i = image->e_shnum - 1; i >= 0; i--) + { + if (image_sections[i].sh_type == SHT_RELA + || image_sections[i].sh_type == SHT_REL) + /* Change section type to something harmless. */ + image_sections[i].sh_type = SHT_NOTE; + } + + /* Add the HSACO to the executable. */ + hsa_code_object_t co = { 0 }; + XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co), + "Deserialize GCN code object"); + XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co, + ""), + "Load GCN code object"); + + /* We're done modifying he executable. */ + XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""), + "Freeze GCN executable"); + + /* Locate the "_init_array" function, and read the kernel's properties. */ + hsa_executable_symbol_t symbol; + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array", + device, 0, &symbol), + "Find '_init_array' function"); + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel), + "Extract '_init_array' kernel object kernel object"); + + /* Locate the "_fini_array" function, and read the kernel's properties. */ + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array", + device, 0, &symbol), + "Find '_fini_array' function"); + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel), + "Extract '_fini_array' kernel object kernel object"); + + /* Locate the "main" function, and read the kernel's properties. */ + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main", + device, 0, &symbol), + "Find 'main' function"); + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel), + "Extract 'main' kernel object"); + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernarg_segment_size), + "Extract kernarg segment size"); + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &group_segment_size), + "Extract group segment size"); + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size), + "Extract private segment size"); + + /* Find main function in ELF, and calculate actual load offset. */ + Elf64_Addr load_offset; + XHSA (hsa_fns.hsa_executable_symbol_get_info_fn + (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, + &load_offset), + "Extract 'main' symbol address"); + for (int i = 0; i < image->e_shnum; i++) + if (sections[i].sh_type == SHT_SYMTAB) + { + Elf64_Shdr *strtab = §ions[sections[i].sh_link]; + char *strings = (char *) image + strtab->sh_offset; + + for (size_t offset = 0; + offset < sections[i].sh_size; + offset += sections[i].sh_entsize) + { + Elf64_Sym *sym = (Elf64_Sym *) ((char *) image + + sections[i].sh_offset + offset); + if (strcmp ("main", strings + sym->st_name) == 0) + { + load_offset -= sym->st_value; + goto found_main; + } + } + } + /* We only get here when main was not found. + This should never happen. */ + fprintf (stderr, "Error: main function not found.\n"); + abort (); +found_main:; + + /* Find dynamic symbol table. */ + Elf64_Shdr *dynsym = NULL; + for (int i = 0; i < image->e_shnum; i++) + if (sections[i].sh_type == SHT_DYNSYM) + { + dynsym = §ions[i]; + break; + } + + /* Fix up relocations. */ + for (int i = 0; i < image->e_shnum; i++) + { + if (sections[i].sh_type == SHT_RELA) + for (size_t offset = 0; + offset < sections[i].sh_size; + offset += sections[i].sh_entsize) + { + Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image + + sections[i].sh_offset + + offset); + Elf64_Sym *sym = + (dynsym + ? (Elf64_Sym *) ((char *) image + + dynsym->sh_offset + + (dynsym->sh_entsize + * ELF64_R_SYM (reloc->r_info))) : NULL); + + int64_t S = (sym ? sym->st_value : 0); + int64_t P = reloc->r_offset + load_offset; + int64_t A = reloc->r_addend; + int64_t B = load_offset; + int64_t V, size; + switch (ELF64_R_TYPE (reloc->r_info)) + { + case R_AMDGPU_ABS32_LO: + V = (S + A) & 0xFFFFFFFF; + size = 4; + break; + case R_AMDGPU_ABS32_HI: + V = (S + A) >> 32; + size = 4; + break; + case R_AMDGPU_ABS64: + V = S + A; + size = 8; + break; + case R_AMDGPU_REL32: + V = S + A - P; + size = 4; + break; + case R_AMDGPU_REL64: + /* FIXME + LLD seems to emit REL64 where the the assembler has ABS64. + This is clearly wrong because it's not what the compiler + is expecting. Let's assume, for now, that it's a bug. + In any case, GCN kernels are always self contained and + therefore relative relocations will have been resolved + already, so this should be a safe workaround. */ + V = S + A /* - P */ ; + size = 8; + break; + case R_AMDGPU_ABS32: + V = S + A; + size = 4; + break; + /* TODO R_AMDGPU_GOTPCREL */ + /* TODO R_AMDGPU_GOTPCREL32_LO */ + /* TODO R_AMDGPU_GOTPCREL32_HI */ + case R_AMDGPU_REL32_LO: + V = (S + A - P) & 0xFFFFFFFF; + size = 4; + break; + case R_AMDGPU_REL32_HI: + V = (S + A - P) >> 32; + size = 4; + break; + case R_AMDGPU_RELATIVE64: + V = B + A; + size = 8; + break; + default: + fprintf (stderr, "Error: unsupported relocation type.\n"); + exit (1); + } + XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size), + "Fix up relocation"); + } + } +} + +/* Allocate some device memory from the kernargs region. + The returned address will be 32-bit (with excess zeroed on 64-bit host), + and accessible via the same address on both host and target (via + __flat_scalar GCN address space). */ + +static void * +device_malloc (size_t size, hsa_region_t region) +{ + void *result; + XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result), + "Allocate device memory"); + return result; +} + +/* These are the device pointers that will be transferred to the target. + The HSA Runtime points the kernargs register here. + They correspond to function signature: + int main (int argc, char *argv[], int *return_value) + The compiler expects this, for kernel functions, and will + automatically assign the exit value to *return_value. */ +struct kernargs +{ + /* Kernargs. */ + int32_t argc; + int64_t argv; + int64_t out_ptr; + int64_t heap_ptr; + + /* Output data. */ + struct output + { + int return_value; + unsigned int next_output; + struct printf_data + { + int written; + char msg[128]; + int type; + union + { + int64_t ivalue; + double dvalue; + char text[128]; + }; + } queue[1024]; + unsigned int consumed; + } output_data; +}; + +struct heap +{ + int64_t size; + char data[0]; +} heap; + +/* Print any console output from the kernel. + We print all entries from "consumed" to the next entry without a "written" + flag, or "next_output" is reached. The buffer is circular, but the + indices are absolute. It is assumed the kernel will stop writing data + if "next_output" wraps (becomes smaller than "consumed"). */ +void +gomp_print_output (struct kernargs *kernargs, bool final) +{ + unsigned int limit = (sizeof (kernargs->output_data.queue) + / sizeof (kernargs->output_data.queue[0])); + + unsigned int from = __atomic_load_n (&kernargs->output_data.consumed, + __ATOMIC_ACQUIRE); + unsigned int to = kernargs->output_data.next_output; + + if (from > to) + { + /* Overflow. */ + if (final) + printf ("GCN print buffer overflowed.\n"); + return; + } + + unsigned int i; + for (i = from; i < to; i++) + { + struct printf_data *data = &kernargs->output_data.queue[i%limit]; + + if (!data->written && !final) + break; + + switch (data->type) + { + case 0: + printf ("%.128s%ld\n", data->msg, data->ivalue); + break; + case 1: + printf ("%.128s%f\n", data->msg, data->dvalue); + break; + case 2: + printf ("%.128s%.128s\n", data->msg, data->text); + break; + case 3: + printf ("%.128s%.128s", data->msg, data->text); + break; + default: + printf ("GCN print buffer error!\n"); + break; + } + + data->written = 0; + __atomic_store_n (&kernargs->output_data.consumed, i+1, + __ATOMIC_RELEASE); + } + fflush (stdout); +} + +/* Execute an already-loaded kernel on the device. */ + +static void +run (uint64_t kernel, void *kernargs) +{ + /* A "signal" is used to launch and monitor the kernel. */ + hsa_signal_t signal; + XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal), + "Create signal"); + + /* Configure for a single-worker kernel. */ + uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue); + const uint32_t queueMask = queue->size - 1; + hsa_kernel_dispatch_packet_t *dispatch_packet = + &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index & + queueMask]); + dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + dispatch_packet->workgroup_size_x = (uint16_t) 1; + dispatch_packet->workgroup_size_y = (uint16_t) 64; + dispatch_packet->workgroup_size_z = (uint16_t) 1; + dispatch_packet->grid_size_x = 1; + dispatch_packet->grid_size_y = 64; + dispatch_packet->grid_size_z = 1; + dispatch_packet->completion_signal = signal; + dispatch_packet->kernel_object = kernel; + dispatch_packet->kernarg_address = (void *) kernargs; + dispatch_packet->private_segment_size = private_segment_size; + dispatch_packet->group_segment_size = group_segment_size; + + uint16_t header = 0; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + + __atomic_store_n ((uint32_t *) dispatch_packet, + header | (dispatch_packet->setup << 16), + __ATOMIC_RELEASE); + + if (debug) + fprintf (stderr, "Launch kernel\n"); + + hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1); + hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index); + /* Kernel running ...... */ + while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT, + 1, 1000000, + HSA_WAIT_STATE_ACTIVE) != 0) + { + usleep (10000); + gomp_print_output (kernargs, false); + } + + gomp_print_output (kernargs, true); + + if (debug) + fprintf (stderr, "Kernel exited\n"); + + XHSA (hsa_fns.hsa_signal_destroy_fn (signal), + "Clean up signal"); +} + +int +main (int argc, char *argv[]) +{ + int kernel_arg = 0; + for (int i = 1; i < argc; i++) + { + if (!strcmp (argv[i], "--help")) + { + usage (argv[0]); + return 0; + } + else if (!strcmp (argv[i], "--version")) + { + version (argv[0]); + return 0; + } + else if (!strcmp (argv[i], "--debug")) + debug = true; + else if (argv[i][0] == '-') + { + usage (argv[0]); + return 1; + } + else + { + kernel_arg = i; + break; + } + } + + if (!kernel_arg) + { + /* No kernel arguments were found. */ + usage (argv[0]); + return 1; + } + + /* The remaining arguments are for the GCN kernel. */ + int kernel_argc = argc - kernel_arg; + char **kernel_argv = &argv[kernel_arg]; + + init_device (); + load_image (kernel_argv[0]); + + /* Calculate size of function parameters + argv data. */ + size_t args_size = 0; + for (int i = 0; i < kernel_argc; i++) + args_size += strlen (kernel_argv[i]) + 1; + + /* Allocate device memory for both function parameters and the argv + data. */ + struct kernargs *kernargs = device_malloc (sizeof (*kernargs), + kernargs_region); + struct argdata + { + int64_t argv_data[kernel_argc]; + char strings[args_size]; + } *args = device_malloc (sizeof (struct argdata), kernargs_region); + + size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ + struct heap *heap = device_malloc (heap_size, heap_region); + XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, + HSA_ACCESS_PERMISSION_RW), + "Assign heap to device agent"); + + /* Write the data to the target. */ + kernargs->argc = kernel_argc; + kernargs->argv = (int64_t) args->argv_data; + kernargs->out_ptr = (int64_t) &kernargs->output_data; + kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */ + kernargs->output_data.next_output = 0; + for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) + / sizeof (kernargs->output_data.queue[0])); i++) + kernargs->output_data.queue[i].written = 0; + kernargs->output_data.consumed = 0; + int offset = 0; + for (int i = 0; i < kernel_argc; i++) + { + size_t arg_len = strlen (kernel_argv[i]) + 1; + args->argv_data[i] = (int64_t) &args->strings[offset]; + memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); + offset += arg_len; + } + kernargs->heap_ptr = (int64_t) heap; + hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); + + /* Run constructors on the GPU. */ + run (init_array_kernel, kernargs); + + /* Run the kernel on the GPU. */ + run (main_kernel, kernargs); + unsigned int return_value = + (unsigned int) kernargs->output_data.return_value; + + /* Run destructors on the GPU. */ + run (fini_array_kernel, kernargs); + + unsigned int upper = (return_value & ~0xffff) >> 16; + if (upper == 0xcafe) + { + printf ("Kernel exit value was never set\n"); + return_value = 0xff; + } + else if (upper == 0xffff) + ; /* Set by exit. */ + else if (upper == 0) + ; /* Set by return from main. */ + else + printf ("Possible kernel exit value corruption, 2 most significant bytes " + "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value); + + if (upper == 0xffff) + { + unsigned int signal = (return_value >> 8) & 0xff; + if (signal == SIGABRT) + printf ("Kernel aborted\n"); + else if (signal != 0) + printf ("Kernel received unkown signal\n"); + } + + if (debug) + printf ("Kernel exit value: %d\n", return_value & 0xff); + + /* Clean shut down. */ + XHSA (hsa_fns.hsa_memory_free_fn (kernargs), + "Clean up device memory"); + XHSA (hsa_fns.hsa_executable_destroy_fn (executable), + "Clean up GCN executable"); + XHSA (hsa_fns.hsa_queue_destroy_fn (queue), + "Clean up device queue"); + XHSA (hsa_fns.hsa_shut_down_fn (), + "Shut down run-time"); + + return return_value & 0xff; +}