Mercurial > hg > CbC > CbC_gcc
diff libgomp/oacc-mem.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | 84e7813d76e9 |
children |
line wrap: on
line diff
--- a/libgomp/oacc-mem.c Thu Oct 25 07:37:49 2018 +0900 +++ b/libgomp/oacc-mem.c Thu Feb 13 11:34:05 2020 +0900 @@ -1,6 +1,6 @@ /* OpenACC Runtime initialization routines - Copyright (C) 2013-2018 Free Software Foundation, Inc. + Copyright (C) 2013-2020 Free Software Foundation, Inc. Contributed by Mentor Embedded. @@ -27,11 +27,9 @@ <http://www.gnu.org/licenses/>. */ #include "openacc.h" -#include "config.h" #include "libgomp.h" #include "gomp-constants.h" #include "oacc-int.h" -#include <stdint.h> #include <string.h> #include <assert.h> @@ -52,43 +50,41 @@ return key; } -/* Return block containing [D->S), or NULL if not contained. - The list isn't ordered by device address, so we have to iterate - over the whole array. This is not expected to be a common - operation. The device lock associated with TGT must be locked on entry, and - remains locked on exit. */ +/* Helper for lookup_dev. Iterate over splay tree. */ static splay_tree_key -lookup_dev (struct target_mem_desc *tgt, void *d, size_t s) +lookup_dev_1 (splay_tree_node node, uintptr_t d, size_t s) { - int i; - struct target_mem_desc *t; + splay_tree_key key = &node->key; + if (d >= key->tgt->tgt_start && d + s <= key->tgt->tgt_end) + return key; + + key = NULL; + if (node->left) + key = lookup_dev_1 (node->left, d, s); + if (!key && node->right) + key = lookup_dev_1 (node->right, d, s); + + return key; +} - if (!tgt) +/* Return block containing [D->S), or NULL if not contained. + + This iterates over the splay tree. This is not expected to be a common + operation. + + The device lock associated with MEM_MAP must be locked on entry, and remains + locked on exit. 
*/ + +static splay_tree_key +lookup_dev (splay_tree mem_map, void *d, size_t s) +{ + if (!mem_map || !mem_map->root) return NULL; - for (t = tgt; t != NULL; t = t->prev) - { - if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s) - break; - } - - if (!t) - return NULL; + return lookup_dev_1 (mem_map->root, (uintptr_t) d, s); +} - for (i = 0; i < t->list_count; i++) - { - void * offset; - - splay_tree_key k = &t->array[i].key; - offset = d - t->tgt_start + k->tgt_offset; - - if (k->host_start + offset <= (void *) k->host_end) - return k; - } - - return NULL; -} /* OpenACC is silent on how memory exhaustion is indicated. We return NULL. */ @@ -108,12 +104,21 @@ if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return malloc (s); - return thr->dev->alloc_func (thr->dev->target_id, s); + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + + void *res = thr->dev->alloc_func (thr->dev->target_id, s); + + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } + + return res; } -/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event - the device address is mapped. We choose to check if it mapped, - and if it is, to unmap it. */ void acc_free (void *d) { @@ -131,30 +136,43 @@ if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return free (d); + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + gomp_mutex_lock (&acc_dev->lock); /* We don't have to call lazy open here, as the ptr value must have been returned by acc_malloc. It's not permitted to pass NULL in (unless you got that null from acc_malloc). 
*/ - if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1))) + if ((k = lookup_dev (&acc_dev->mem_map, d, 1))) { - void *offset; - - offset = d - k->tgt->tgt_start + k->tgt_offset; - + void *offset = d - k->tgt->tgt_start + k->tgt_offset; + void *h = k->host_start + offset; + size_t h_size = k->host_end - k->host_start; gomp_mutex_unlock (&acc_dev->lock); - - acc_unmap_data ((void *)(k->host_start + offset)); + /* PR92503 "[OpenACC] Behavior of 'acc_free' if the memory space is still + used in a mapping". */ + gomp_fatal ("refusing to free device memory space at %p that is still" + " mapped at [%p,+%d]", + d, h, (int) h_size); } else gomp_mutex_unlock (&acc_dev->lock); if (!acc_dev->free_func (acc_dev->target_id, d)) gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); + + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } } -void -acc_memcpy_to_device (void *d, void *h, size_t s) +static void +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, + const char *libfnname) { /* No need to call lazy open here, as the device pointer must have been obtained from a routine that did that. */ @@ -164,31 +182,57 @@ if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) { - memmove (d, h, s); + if (from) + memmove (h, d, s); + else + memmove (d, h, s); return; } - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) - gomp_fatal ("error in %s", __FUNCTION__); + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + if (profiling_p) + { + prof_info.async = async; + prof_info.async_queue = prof_info.async; + } + + goacc_aq aq = get_goacc_asyncqueue (async); + if (from) + gomp_copy_dev2host (thr->dev, aq, h, d, s); + else + gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? 
*/ NULL); + + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } +} + +void +acc_memcpy_to_device (void *d, void *h, size_t s) +{ + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); +} + +void +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) +{ + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); } void acc_memcpy_from_device (void *h, void *d, size_t s) { - /* No need to call lazy open here, as the device pointer must have - been obtained from a routine that did that. */ - struct goacc_thread *thr = goacc_thread (); - - assert (thr && thr->dev); + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); +} - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) - { - memmove (h, d, s); - return; - } - - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) - gomp_fatal ("error in %s", __FUNCTION__); +void +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) +{ + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); } /* Return the device pointer that corresponds to host data H. Or NULL @@ -209,6 +253,9 @@ if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return h; + /* In the following, no OpenACC Profiling Interface events can possibly be + generated. */ + gomp_mutex_lock (&dev->lock); n = lookup_host (dev, h, 1); @@ -246,9 +293,12 @@ if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return d; + /* In the following, no OpenACC Profiling Interface events can possibly be + generated. */ + gomp_mutex_lock (&acc_dev->lock); - n = lookup_dev (acc_dev->openacc.data_environ, d, 1); + n = lookup_dev (&acc_dev->mem_map, d, 1); if (!n) { @@ -283,6 +333,9 @@ if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return h != NULL; + /* In the following, no OpenACC Profiling Interface events can possibly be + generated. 
*/ + gomp_mutex_lock (&acc_dev->lock); n = lookup_host (acc_dev, h, s); @@ -327,6 +380,10 @@ gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map", (void *)h, (int)s, (void *)d, (int)s); + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + gomp_mutex_lock (&acc_dev->lock); if (lookup_host (acc_dev, h, s)) @@ -336,7 +393,7 @@ (int)s); } - if (lookup_dev (thr->dev->openacc.data_environ, d, s)) + if (lookup_dev (&thr->dev->mem_map, d, s)) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d, @@ -346,14 +403,20 @@ gomp_mutex_unlock (&acc_dev->lock); tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes, - &kinds, true, GOMP_MAP_VARS_OPENACC); - tgt->list[0].key->refcount = REFCOUNT_INFINITY; - } + &kinds, true, GOMP_MAP_VARS_ENTER_DATA); + assert (tgt); + splay_tree_key n = tgt->list[0].key; + assert (n->refcount == 1); + assert (n->virtual_refcount == 0); + /* Special reference counting behavior. 
*/ + n->refcount = REFCOUNT_INFINITY; - gomp_mutex_lock (&acc_dev->lock); - tgt->prev = acc_dev->openacc.data_environ; - acc_dev->openacc.data_environ = tgt; - gomp_mutex_unlock (&acc_dev->lock); + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } + } } void @@ -368,12 +431,13 @@ if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; - size_t host_size; + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); gomp_mutex_lock (&acc_dev->lock); splay_tree_key n = lookup_host (acc_dev, h, 1); - struct target_mem_desc *t; if (!n) { @@ -381,7 +445,7 @@ gomp_fatal ("%p is not a mapped block", (void *)h); } - host_size = n->host_end - n->host_start; + size_t host_size = n->host_end - n->host_start; if (n->host_start != (uintptr_t) h) { @@ -389,52 +453,56 @@ gomp_fatal ("[%p,%d] surrounds %p", (void *) n->host_start, (int) host_size, (void *) h); } - - /* Mark for removal. */ - n->refcount = 1; - - t = n->tgt; - - if (t->refcount == 2) + /* TODO This currently doesn't catch 'REFCOUNT_INFINITY' usage different from + 'acc_map_data'. Maybe 'virtual_refcount' can be used for disambiguating + the different 'REFCOUNT_INFINITY' cases, or simply separate + 'REFCOUNT_INFINITY' values per different usage ('REFCOUNT_ACC_MAP_DATA' + etc.)? */ + else if (n->refcount != REFCOUNT_INFINITY) { - struct target_mem_desc *tp; + gomp_mutex_unlock (&acc_dev->lock); + gomp_fatal ("refusing to unmap block [%p,+%d] that has not been mapped" + " by 'acc_map_data'", + (void *) h, (int) host_size); + } - /* This is the last reference, so pull the descriptor off the - chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from - freeing the device memory. 
*/ - t->tgt_end = 0; - t->to_free = 0; + splay_tree_remove (&acc_dev->mem_map, n); + + struct target_mem_desc *tgt = n->tgt; - for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; - tp = t, t = t->prev) - if (n->tgt == t) - { - if (tp) - tp->prev = t->prev; - else - acc_dev->openacc.data_environ = t->prev; - - break; - } + if (tgt->refcount == REFCOUNT_INFINITY) + { + gomp_mutex_unlock (&acc_dev->lock); + gomp_fatal ("cannot unmap target block"); + } + else if (tgt->refcount > 1) + tgt->refcount--; + else + { + free (tgt->array); + free (tgt); } gomp_mutex_unlock (&acc_dev->lock); - gomp_unmap_vars (t, true); + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } } -#define FLAG_PRESENT (1 << 0) -#define FLAG_CREATE (1 << 1) -#define FLAG_COPY (1 << 2) + +/* Enter dynamic mapping for a single datum. Return the device pointer. */ static void * -present_create_copy (unsigned f, void *h, size_t s) +goacc_enter_datum (void **hostaddrs, size_t *sizes, void *kinds, int async) { void *d; splay_tree_key n; - if (!h || !s) - gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s); + if (!hostaddrs[0] || !sizes[0]) + gomp_fatal ("[%p,+%d] is a bad range", hostaddrs[0], (int) sizes[0]); goacc_lazy_initialize (); @@ -442,66 +510,67 @@ struct gomp_device_descr *acc_dev = thr->dev; if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) - return h; + return hostaddrs[0]; + + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + if (profiling_p) + { + prof_info.async = async; + prof_info.async_queue = prof_info.async; + } gomp_mutex_lock (&acc_dev->lock); - n = lookup_host (acc_dev, h, s); + n = lookup_host (acc_dev, hostaddrs[0], sizes[0]); if (n) { - /* Present. 
*/ - d = (void *) (n->tgt->tgt_start + n->tgt_offset); + void *h = hostaddrs[0]; + size_t s = sizes[0]; - if (!(f & FLAG_PRESENT)) - { - gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("[%p,+%d] already mapped to [%p,+%d]", - (void *)h, (int)s, (void *)d, (int)s); - } + /* Present. */ + d = (void *) (n->tgt->tgt_start + n->tgt_offset + h - n->host_start); + if ((h + s) > (void *)n->host_end) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s); } + assert (n->refcount != REFCOUNT_LINK); if (n->refcount != REFCOUNT_INFINITY) { n->refcount++; - n->dynamic_refcount++; + n->virtual_refcount++; } + gomp_mutex_unlock (&acc_dev->lock); } - else if (!(f & FLAG_CREATE)) - { - gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s); - } else { - struct target_mem_desc *tgt; - size_t mapnum = 1; - unsigned short kinds; - void *hostaddrs = h; - - if (f & FLAG_COPY) - kinds = GOMP_MAP_TO; - else - kinds = GOMP_MAP_ALLOC; + const size_t mapnum = 1; gomp_mutex_unlock (&acc_dev->lock); - tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, - GOMP_MAP_VARS_OPENACC); - /* Initialize dynamic refcount. 
*/ - tgt->list[0].key->dynamic_refcount = 1; + goacc_aq aq = get_goacc_asyncqueue (async); + + gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds, + true, GOMP_MAP_VARS_OPENACC_ENTER_DATA); gomp_mutex_lock (&acc_dev->lock); + n = lookup_host (acc_dev, hostaddrs[0], sizes[0]); + assert (n != NULL); + assert (n->tgt_offset == 0); + assert ((uintptr_t) hostaddrs[0] == n->host_start); + d = (void *) n->tgt->tgt_start; + gomp_mutex_unlock (&acc_dev->lock); + } - d = tgt->to_free; - tgt->prev = acc_dev->openacc.data_environ; - acc_dev->openacc.data_environ = tgt; - - gomp_mutex_unlock (&acc_dev->lock); + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; } return d; @@ -510,175 +579,214 @@ void * acc_create (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); + unsigned short kinds[1] = { GOMP_MAP_ALLOC }; + return goacc_enter_datum (&h, &s, &kinds, acc_async_sync); +} + +void +acc_create_async (void *h, size_t s, int async) +{ + unsigned short kinds[1] = { GOMP_MAP_ALLOC }; + goacc_enter_datum (&h, &s, &kinds, async); } +/* acc_present_or_create used to be what acc_create is now. */ +/* acc_pcreate is acc_present_or_create by a different name. */ +#ifdef HAVE_ATTRIBUTE_ALIAS +strong_alias (acc_create, acc_present_or_create) +strong_alias (acc_create, acc_pcreate) +#else +void * +acc_present_or_create (void *h, size_t s) +{ + return acc_create (h, s); +} + +void * +acc_pcreate (void *h, size_t s) +{ + return acc_create (h, s); +} +#endif + void * acc_copyin (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); -} - -void * -acc_present_or_create (void *h, size_t s) -{ - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); + unsigned short kinds[1] = { GOMP_MAP_TO }; + return goacc_enter_datum (&h, &s, &kinds, acc_async_sync); } -/* acc_pcreate is acc_present_or_create by a different name. 
*/ +void +acc_copyin_async (void *h, size_t s, int async) +{ + unsigned short kinds[1] = { GOMP_MAP_TO }; + goacc_enter_datum (&h, &s, &kinds, async); +} + +/* acc_present_or_copyin used to be what acc_copyin is now. */ +/* acc_pcopyin is acc_present_or_copyin by a different name. */ #ifdef HAVE_ATTRIBUTE_ALIAS -strong_alias (acc_present_or_create, acc_pcreate) +strong_alias (acc_copyin, acc_present_or_copyin) +strong_alias (acc_copyin, acc_pcopyin) #else void * -acc_pcreate (void *h, size_t s) -{ - return acc_present_or_create (h, s); -} -#endif - -void * acc_present_or_copyin (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); + return acc_copyin (h, s); } -/* acc_pcopyin is acc_present_or_copyin by a different name. */ -#ifdef HAVE_ATTRIBUTE_ALIAS -strong_alias (acc_present_or_copyin, acc_pcopyin) -#else void * acc_pcopyin (void *h, size_t s) { - return acc_present_or_copyin (h, s); + return acc_copyin (h, s); } #endif -#define FLAG_COPYOUT (1 << 0) -#define FLAG_FINALIZE (1 << 1) + +/* Exit a dynamic mapping for a single variable. */ static void -delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) +goacc_exit_datum (void *h, size_t s, unsigned short kind, int async) { - size_t host_size; - splay_tree_key n; - void *d; + /* No need to call lazy open, as the data must already have been + mapped. */ + + kind &= 0xff; + struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; - gomp_mutex_lock (&acc_dev->lock); - - n = lookup_host (acc_dev, h, s); - - /* No need to call lazy open, as the data must already have been - mapped. 
*/ - - if (!n) + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + if (profiling_p) { - gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s); + prof_info.async = async; + prof_info.async_queue = prof_info.async; } - d = (void *) (n->tgt->tgt_start + n->tgt_offset - + (uintptr_t) h - n->host_start); + gomp_mutex_lock (&acc_dev->lock); - host_size = n->host_end - n->host_start; + splay_tree_key n = lookup_host (acc_dev, h, s); + if (!n) + /* PR92726, RP92970, PR92984: no-op. */ + goto out; - if (n->host_start != (uintptr_t) h || host_size != s) + if ((uintptr_t) h < n->host_start || (uintptr_t) h + s > n->host_end) { + size_t host_size = n->host_end - n->host_start; gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]", - (void *) n->host_start, (int) host_size, (void *) h, (int) s); + gomp_fatal ("[%p,+%d] outside mapped block [%p,+%d]", + (void *) h, (int) s, (void *) n->host_start, (int) host_size); } - if (n->refcount == REFCOUNT_INFINITY) + bool finalize = (kind == GOMP_MAP_DELETE + || kind == GOMP_MAP_FORCE_FROM); + if (finalize) { - n->refcount = 0; - n->dynamic_refcount = 0; - } - if (n->refcount < n->dynamic_refcount) - { - gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("Dynamic reference counting assert fail\n"); + if (n->refcount != REFCOUNT_INFINITY) + n->refcount -= n->virtual_refcount; + n->virtual_refcount = 0; } - if (f & FLAG_FINALIZE) + if (n->virtual_refcount > 0) { - n->refcount -= n->dynamic_refcount; - n->dynamic_refcount = 0; + if (n->refcount != REFCOUNT_INFINITY) + n->refcount--; + n->virtual_refcount--; } - else if (n->dynamic_refcount) - { - n->dynamic_refcount--; - n->refcount--; - } + else if (n->refcount > 0 && n->refcount != REFCOUNT_INFINITY) + n->refcount--; if (n->refcount == 0) { - if (n->tgt->refcount == 2) + goacc_aq aq = get_goacc_asyncqueue (async); + + bool copyout = (kind == 
GOMP_MAP_FROM + || kind == GOMP_MAP_FORCE_FROM); + if (copyout) { - struct target_mem_desc *tp, *t; - for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; - tp = t, t = t->prev) - if (n->tgt == t) - { - if (tp) - tp->prev = t->prev; - else - acc_dev->openacc.data_environ = t->prev; - break; - } + void *d = (void *) (n->tgt->tgt_start + n->tgt_offset + + (uintptr_t) h - n->host_start); + gomp_copy_dev2host (acc_dev, aq, h, d, s); } - if (f & FLAG_COPYOUT) - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); - - gomp_remove_var (acc_dev, n); + if (aq) + /* TODO We can't do the 'is_tgt_unmapped' checking -- see the + 'gomp_unref_tgt' comment in + <http://mid.mail-archive.com/878snl36eu.fsf@euler.schwinge.homeip.net>; + PR92881. */ + gomp_remove_var_async (acc_dev, n, aq); + else + { + bool is_tgt_unmapped = gomp_remove_var (acc_dev, n); + assert (is_tgt_unmapped); + } } + out: gomp_mutex_unlock (&acc_dev->lock); + + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } } void acc_delete (void *h , size_t s) { - delete_copyout (0, h, s, __FUNCTION__); + goacc_exit_datum (h, s, GOMP_MAP_RELEASE, acc_async_sync); +} + +void +acc_delete_async (void *h , size_t s, int async) +{ + goacc_exit_datum (h, s, GOMP_MAP_RELEASE, async); } void acc_delete_finalize (void *h , size_t s) { - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); + goacc_exit_datum (h, s, GOMP_MAP_DELETE, acc_async_sync); } void acc_delete_finalize_async (void *h , size_t s, int async) { - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); + goacc_exit_datum (h, s, GOMP_MAP_DELETE, async); } void acc_copyout (void *h, size_t s) { - delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); + goacc_exit_datum (h, s, GOMP_MAP_FROM, acc_async_sync); +} + +void +acc_copyout_async (void *h, size_t s, int async) +{ + goacc_exit_datum (h, s, GOMP_MAP_FROM, async); } void acc_copyout_finalize (void *h, size_t s) { - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); + 
goacc_exit_datum (h, s, GOMP_MAP_FORCE_FROM, acc_async_sync); } void acc_copyout_finalize_async (void *h, size_t s, int async) { - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); + goacc_exit_datum (h, s, GOMP_MAP_FORCE_FROM, async); } static void -update_dev_host (int is_dev, void *h, size_t s) +update_dev_host (int is_dev, void *h, size_t s, int async) { splay_tree_key n; void *d; @@ -691,6 +799,21 @@ if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; + /* Fortran optional arguments that are non-present result in a + NULL host address here. This can safely be ignored as it is + not possible to 'update' a non-present optional argument. */ + if (h == NULL) + return; + + acc_prof_info prof_info; + acc_api_info api_info; + bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info); + if (profiling_p) + { + prof_info.async = async; + prof_info.async_queue = prof_info.async; + } + gomp_mutex_lock (&acc_dev->lock); n = lookup_host (acc_dev, h, s); @@ -704,157 +827,496 @@ d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h - n->host_start); + goacc_aq aq = get_goacc_asyncqueue (async); + if (is_dev) - acc_dev->host2dev_func (acc_dev->target_id, d, h, s); + gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? 
*/ NULL); else - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + gomp_copy_dev2host (acc_dev, aq, h, d, s); gomp_mutex_unlock (&acc_dev->lock); + + if (profiling_p) + { + thr->prof_info = NULL; + thr->api_info = NULL; + } } void acc_update_device (void *h, size_t s) { - update_dev_host (1, h, s); + update_dev_host (1, h, s, acc_async_sync); +} + +void +acc_update_device_async (void *h, size_t s, int async) +{ + update_dev_host (1, h, s, async); } void acc_update_self (void *h, size_t s) { - update_dev_host (0, h, s); + update_dev_host (0, h, s, acc_async_sync); +} + +void +acc_update_self_async (void *h, size_t s, int async) +{ + update_dev_host (0, h, s, async); } void -gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, - void *kinds) +acc_attach_async (void **hostaddr, int async) { - struct target_mem_desc *tgt; struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; - - if (acc_is_present (*hostaddrs, *sizes)) - { - splay_tree_key n; - gomp_mutex_lock (&acc_dev->lock); - n = lookup_host (acc_dev, *hostaddrs, *sizes); - gomp_mutex_unlock (&acc_dev->lock); + goacc_aq aq = get_goacc_asyncqueue (async); - tgt = n->tgt; - for (size_t i = 0; i < tgt->list_count; i++) - if (tgt->list[i].key == n) - { - for (size_t j = 0; j < mapnum; j++) - if (i + j < tgt->list_count && tgt->list[i + j].key) - { - tgt->list[i + j].key->refcount++; - tgt->list[i + j].key->dynamic_refcount++; - } - return; - } - /* Should not reach here. */ - gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset"); - } + struct splay_tree_key_s cur_node; + splay_tree_key n; - gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__); - tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, - NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); - gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__); - - /* Initialize dynamic refcount. 
*/ - tgt->list[0].key->dynamic_refcount = 1; + if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + return; gomp_mutex_lock (&acc_dev->lock); - tgt->prev = acc_dev->openacc.data_environ; - acc_dev->openacc.data_environ = tgt; + + cur_node.host_start = (uintptr_t) hostaddr; + cur_node.host_end = cur_node.host_start + sizeof (void *); + n = splay_tree_lookup (&acc_dev->mem_map, &cur_node); + + if (n == NULL) + gomp_fatal ("struct not mapped for acc_attach"); + + gomp_attach_pointer (acc_dev, aq, &acc_dev->mem_map, n, (uintptr_t) hostaddr, + 0, NULL); + gomp_mutex_unlock (&acc_dev->lock); } void -gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async, - int finalize, int mapnum) +acc_attach (void **hostaddr) +{ + acc_attach_async (hostaddr, acc_async_sync); +} + +static void +goacc_detach_internal (void **hostaddr, int async, bool finalize) { struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; + struct splay_tree_key_s cur_node; splay_tree_key n; - struct target_mem_desc *t; - int minrefs = (mapnum == 1) ? 
2 : 3; + struct goacc_asyncqueue *aq = get_goacc_asyncqueue (async); - if (!acc_is_present (h, s)) + if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; gomp_mutex_lock (&acc_dev->lock); - n = lookup_host (acc_dev, h, 1); + cur_node.host_start = (uintptr_t) hostaddr; + cur_node.host_end = cur_node.host_start + sizeof (void *); + n = splay_tree_lookup (&acc_dev->mem_map, &cur_node); + + if (n == NULL) + gomp_fatal ("struct not mapped for acc_detach"); + + gomp_detach_pointer (acc_dev, aq, n, (uintptr_t) hostaddr, finalize, NULL); + + gomp_mutex_unlock (&acc_dev->lock); +} + +void +acc_detach (void **hostaddr) +{ + goacc_detach_internal (hostaddr, acc_async_sync, false); +} + +void +acc_detach_async (void **hostaddr, int async) +{ + goacc_detach_internal (hostaddr, async, false); +} + +void +acc_detach_finalize (void **hostaddr) +{ + goacc_detach_internal (hostaddr, acc_async_sync, true); +} + +void +acc_detach_finalize_async (void **hostaddr, int async) +{ + goacc_detach_internal (hostaddr, async, true); +} + +/* Some types of (pointer) variables use several consecutive mappings, which + must be treated as a group for enter/exit data directives. This function + returns the last mapping in such a group (inclusive), or POS for singleton + mappings. */ - if (!n) +static int +find_group_last (int pos, size_t mapnum, size_t *sizes, unsigned short *kinds) +{ + unsigned char kind0 = kinds[pos] & 0xff; + int first_pos = pos; + + switch (kind0) { - gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("%p is not a mapped block", (void *)h); + case GOMP_MAP_TO_PSET: + while (pos + 1 < mapnum && (kinds[pos + 1] & 0xff) == GOMP_MAP_POINTER) + pos++; + /* We expect at least one GOMP_MAP_POINTER after a GOMP_MAP_TO_PSET. */ + assert (pos > first_pos); + break; + + case GOMP_MAP_STRUCT: + pos += sizes[pos]; + break; + + case GOMP_MAP_POINTER: + case GOMP_MAP_ALWAYS_POINTER: + /* These mappings are only expected after some other mapping. 
If we + see one by itself, something has gone wrong. */ + gomp_fatal ("unexpected mapping"); + break; + + default: + /* GOMP_MAP_ALWAYS_POINTER can only appear directly after some other + mapping. */ + if (pos + 1 < mapnum) + { + unsigned char kind1 = kinds[pos + 1] & 0xff; + if (kind1 == GOMP_MAP_ALWAYS_POINTER) + return pos + 1; + } + + /* We can have zero or more GOMP_MAP_POINTER mappings after a to/from + (etc.) mapping. */ + while (pos + 1 < mapnum && (kinds[pos + 1] & 0xff) == GOMP_MAP_POINTER) + pos++; } - gomp_debug (0, " %s: restore mappings\n", __FUNCTION__); + return pos; +} + +/* Map variables for OpenACC "enter data". We can't just call + gomp_map_vars_async once, because individual mapped variables might have + "exit data" called for them at different times. */ - t = n->tgt; +static void +goacc_enter_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum, + void **hostaddrs, size_t *sizes, + unsigned short *kinds, goacc_aq aq) +{ + for (size_t i = 0; i < mapnum; i++) + { + int group_last = find_group_last (i, mapnum, sizes, kinds); + + gomp_map_vars_async (acc_dev, aq, + (group_last - i) + 1, + &hostaddrs[i], NULL, + &sizes[i], &kinds[i], true, + GOMP_MAP_VARS_OPENACC_ENTER_DATA); + + i = group_last; + } +} + +/* Unmap variables for OpenACC "exit data". */ - if (n->refcount < n->dynamic_refcount) +static void +goacc_exit_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum, + void **hostaddrs, size_t *sizes, + unsigned short *kinds, goacc_aq aq) +{ + gomp_mutex_lock (&acc_dev->lock); + + /* Handle "detach" before copyback/deletion of mapped data. */ + for (size_t i = 0; i < mapnum; ++i) { - gomp_mutex_unlock (&acc_dev->lock); - gomp_fatal ("Dynamic reference counting assert fail\n"); + unsigned char kind = kinds[i] & 0xff; + bool finalize = false; + switch (kind) + { + case GOMP_MAP_FORCE_DETACH: + finalize = true; + /* Fallthrough. 
*/ + + case GOMP_MAP_DETACH: + { + struct splay_tree_key_s cur_node; + uintptr_t hostaddr = (uintptr_t) hostaddrs[i]; + cur_node.host_start = hostaddr; + cur_node.host_end = cur_node.host_start + sizeof (void *); + splay_tree_key n + = splay_tree_lookup (&acc_dev->mem_map, &cur_node); + + if (n == NULL) + gomp_fatal ("struct not mapped for detach operation"); + + gomp_detach_pointer (acc_dev, aq, n, hostaddr, finalize, NULL); + } + break; + default: + ; + } } - if (finalize) + for (size_t i = 0; i < mapnum; ++i) { - n->refcount -= n->dynamic_refcount; - n->dynamic_refcount = 0; - } - else if (n->dynamic_refcount) - { - n->dynamic_refcount--; - n->refcount--; + unsigned char kind = kinds[i] & 0xff; + bool copyfrom = false; + bool finalize = false; + + if (kind == GOMP_MAP_FORCE_FROM + || kind == GOMP_MAP_DELETE + || kind == GOMP_MAP_FORCE_DETACH) + finalize = true; + + switch (kind) + { + case GOMP_MAP_FROM: + case GOMP_MAP_FORCE_FROM: + case GOMP_MAP_ALWAYS_FROM: + copyfrom = true; + /* Fallthrough. 
*/ + + case GOMP_MAP_TO_PSET: + case GOMP_MAP_POINTER: + case GOMP_MAP_DELETE: + case GOMP_MAP_RELEASE: + case GOMP_MAP_DETACH: + case GOMP_MAP_FORCE_DETACH: + { + struct splay_tree_key_s cur_node; + size_t size; + if (kind == GOMP_MAP_POINTER + || kind == GOMP_MAP_DETACH + || kind == GOMP_MAP_FORCE_DETACH) + size = sizeof (void *); + else + size = sizes[i]; + cur_node.host_start = (uintptr_t) hostaddrs[i]; + cur_node.host_end = cur_node.host_start + size; + splay_tree_key n + = splay_tree_lookup (&acc_dev->mem_map, &cur_node); + + if (n == NULL) + continue; + + if (finalize) + { + if (n->refcount != REFCOUNT_INFINITY) + n->refcount -= n->virtual_refcount; + n->virtual_refcount = 0; + } + + if (n->virtual_refcount > 0) + { + if (n->refcount != REFCOUNT_INFINITY) + n->refcount--; + n->virtual_refcount--; + } + else if (n->refcount > 0 && n->refcount != REFCOUNT_INFINITY) + n->refcount--; + + if (copyfrom + && (kind != GOMP_MAP_FROM || n->refcount == 0)) + gomp_copy_dev2host (acc_dev, aq, (void *) cur_node.host_start, + (void *) (n->tgt->tgt_start + n->tgt_offset + + cur_node.host_start + - n->host_start), + cur_node.host_end - cur_node.host_start); + + if (n->refcount == 0) + gomp_remove_var_async (acc_dev, n, aq); + } + break; + + case GOMP_MAP_STRUCT: + { + int elems = sizes[i]; + for (int j = 1; j <= elems; j++) + { + struct splay_tree_key_s k; + k.host_start = (uintptr_t) hostaddrs[i + j]; + k.host_end = k.host_start + sizes[i + j]; + splay_tree_key str; + str = splay_tree_lookup (&acc_dev->mem_map, &k); + if (str) + { + if (finalize) + { + if (str->refcount != REFCOUNT_INFINITY) + str->refcount -= str->virtual_refcount; + str->virtual_refcount = 0; + } + if (str->virtual_refcount > 0) + { + if (str->refcount != REFCOUNT_INFINITY) + str->refcount--; + str->virtual_refcount--; + } + else if (str->refcount > 0 + && str->refcount != REFCOUNT_INFINITY) + str->refcount--; + if (str->refcount == 0) + gomp_remove_var_async (acc_dev, str, aq); + } + } + i += elems; + } 
+          break;
+
+        default:
+          gomp_fatal (">>>> goacc_exit_data_internal UNHANDLED kind 0x%.2x",
+                          kind);
+        }
     }

   gomp_mutex_unlock (&acc_dev->lock);
+}

-  if (n->refcount == 0)
 /* Handle an OpenACC enter data or exit data request.

    FLAGS_M is decoded with GOACC_FLAGS_UNMARSHAL.  KINDS is scanned for up
    to MAPNUM entries to pick the direction; MAPNUM, HOSTADDRS, SIZES and
    KINDS are then passed on unchanged to goacc_enter_data_internal or
    goacc_exit_data_internal.  ASYNC is recorded in the profiling data,
    passed to goacc_wait and used to obtain the queue from
    get_goacc_asyncqueue.  If NUM_WAITS is nonzero, the trailing variadic
    arguments are handed to goacc_wait before any mapping work.

    The direction is taken from the first entry of KINDS that is not one of
    the skipped kinds.  If no entry decides (which includes MAPNUM == 0),
    data_enter keeps its initial value and the exit path is taken.  When
    profiling dispatch is enabled, a start event and an end event are
    dispatched around the work.  */
+void
+GOACC_enter_exit_data (int flags_m, size_t mapnum, void **hostaddrs,
+                       size_t *sizes, unsigned short *kinds, int async,
+                       int num_waits, ...)
+{
+  int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
+
+  struct goacc_thread *thr;
+  struct gomp_device_descr *acc_dev;
+  bool data_enter = false;
+  size_t i;
+
+  goacc_lazy_initialize ();
+
+  thr = goacc_thread ();
+  acc_dev = thr->dev;
+
+  /* Determine if this is an "acc enter data".  */
+  for (i = 0; i < mapnum; ++i)
     {
-      if (t->refcount == minrefs)
       /* Only the low eight bits of the entry are compared against the
          GOMP_MAP_* kinds below.  */
+      unsigned char kind = kinds[i] & 0xff;
+
       /* These kinds do not decide the direction; move on to the next
          entry.  */
+      if (kind == GOMP_MAP_POINTER
+          || kind == GOMP_MAP_TO_PSET
+          || kind == GOMP_MAP_STRUCT)
+        continue;
+
       /* Any of these kinds marks the request as enter data and ends the
          scan.  */
+      if (kind == GOMP_MAP_FORCE_ALLOC
+          || kind == GOMP_MAP_FORCE_PRESENT
+          || kind == GOMP_MAP_ATTACH
+          || kind == GOMP_MAP_FORCE_TO
+          || kind == GOMP_MAP_TO
+          || kind == GOMP_MAP_ALLOC)
         {
-          /* This is the last reference, so pull the descriptor off the
-             chain.  This prevents gomp_unmap_vars via gomp_unmap_tgt from
-             freeing the device memory.  */
-          struct target_mem_desc *tp;
-          for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
-               tp = t, t = t->prev)
-            {
-              if (n->tgt == t)
-                {
-                  if (tp)
-                    tp->prev = t->prev;
-                  else
-                    acc_dev->openacc.data_environ = t->prev;
-                  break;
-                }
-            }
+          data_enter = true;
+          break;
         }

-      /* Set refcount to 1 to allow gomp_unmap_vars to unmap it.  */
-      n->refcount = 1;
-      t->refcount = minrefs;
-      for (size_t i = 0; i < t->list_count; i++)
-        if (t->list[i].key == n)
-          {
-            t->list[i].copy_from = force_copyfrom ? 1 : 0;
-            break;
-          }
       /* Any of these kinds ends the scan with data_enter still false, so
          the exit path is used.  */
+      if (kind == GOMP_MAP_RELEASE
+          || kind == GOMP_MAP_DELETE
+          || kind == GOMP_MAP_DETACH
+          || kind == GOMP_MAP_FORCE_DETACH
+          || kind == GOMP_MAP_FROM
+          || kind == GOMP_MAP_FORCE_FROM)
+        break;

-      /* If running synchronously, unmap immediately.  */
-      if (async < acc_async_noval)
-        gomp_unmap_vars (t, true);
-      else
-        t->device_descr->openacc.register_async_cleanup_func (t, async);
       /* Every kind not listed above is reported through gomp_fatal.  */
+      gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
+                      kind);
     }

-  gomp_mutex_unlock (&acc_dev->lock);
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
   /* prof_info is filled in, and registered on the thread, only when
      profiling_p is set.  */
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type
+        = data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
   /* The event record copies its event type from prof_info.  */
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_p)
+    {
+      enter_exit_data_event_info.other_event.event_type
+        = prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+        = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+        = data_enter ? acc_construct_enter_data : acc_construct_exit_data;
+      enter_exit_data_event_info.other_event.implicit = 0;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+    }
   /* api_info is likewise filled in, and registered on the thread, only
      when profiling_p is set.  */
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;

-  gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
   /* Dispatch the start event (enter data start or exit data start).  */
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+                              &api_info);
+
   /* With a shared-memory device or a requested host fallback, neither the
      wait handling nor the enter/exit helpers run; control jumps straight
      to the end event.  The two device_type stores happen even when
      profiling_p is false, but both records are only read under
      profiling_p.  */
+  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+      || (flags & GOACC_FLAG_HOST_FALLBACK))
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+
+      goto out_prof;
+    }
+
   /* Hand the variadic wait arguments to goacc_wait before mapping.  */
+  if (num_waits)
+    {
+      va_list ap;
+
+      va_start (ap, num_waits);
+      goacc_wait (async, num_waits, &ap);
+      va_end (ap);
+    }
+
+  goacc_aq aq = get_goacc_asyncqueue (async);
+
   /* Both helpers receive the same argument list; only the direction
      decided above differs.  */
+  if (data_enter)
+    goacc_enter_data_internal (acc_dev, mapnum, hostaddrs, sizes, kinds, aq);
+  else
+    goacc_exit_data_internal (acc_dev, mapnum, hostaddrs, sizes, kinds, aq);
+
+ out_prof:
   /* Dispatch the matching end event, then clear the thread pointers that
      refer to the local profiling records.  */
+  if (profiling_p)
+    {
+      prof_info.event_type
+        = data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+                                &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }