annotate libgomp/plugin/plugin-nvptx.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Plugin for NVPTX execution.
kono
parents:
diff changeset
2
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
111
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 Contributed by Mentor Embedded.
kono
parents:
diff changeset
6
kono
parents:
diff changeset
7 This file is part of the GNU Offloading and Multi Processing Library
kono
parents:
diff changeset
8 (libgomp).
kono
parents:
diff changeset
9
kono
parents:
diff changeset
10 Libgomp is free software; you can redistribute it and/or modify it
kono
parents:
diff changeset
11 under the terms of the GNU General Public License as published by
kono
parents:
diff changeset
12 the Free Software Foundation; either version 3, or (at your option)
kono
parents:
diff changeset
13 any later version.
kono
parents:
diff changeset
14
kono
parents:
diff changeset
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
kono
parents:
diff changeset
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
kono
parents:
diff changeset
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
kono
parents:
diff changeset
18 more details.
kono
parents:
diff changeset
19
kono
parents:
diff changeset
20 Under Section 7 of GPL version 3, you are granted additional
kono
parents:
diff changeset
21 permissions described in the GCC Runtime Library Exception, version
kono
parents:
diff changeset
22 3.1, as published by the Free Software Foundation.
kono
parents:
diff changeset
23
kono
parents:
diff changeset
24 You should have received a copy of the GNU General Public License and
kono
parents:
diff changeset
25 a copy of the GCC Runtime Library Exception along with this program;
kono
parents:
diff changeset
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
kono
parents:
diff changeset
27 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
28
kono
parents:
diff changeset
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
kono
parents:
diff changeset
30 library appears to hold some implicit state, but the documentation
kono
parents:
diff changeset
31 is not clear as to what that state might be. Or how one might
kono
parents:
diff changeset
32 propagate it from one thread to another. */
kono
parents:
diff changeset
33
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
34 #define _GNU_SOURCE
111
kono
parents:
diff changeset
35 #include "openacc.h"
kono
parents:
diff changeset
36 #include "config.h"
kono
parents:
diff changeset
37 #include "libgomp-plugin.h"
kono
parents:
diff changeset
38 #include "oacc-plugin.h"
kono
parents:
diff changeset
39 #include "gomp-constants.h"
kono
parents:
diff changeset
40
kono
parents:
diff changeset
41 #include <pthread.h>
kono
parents:
diff changeset
42 #include <cuda.h>
kono
parents:
diff changeset
43 #include <stdbool.h>
kono
parents:
diff changeset
44 #include <stdint.h>
kono
parents:
diff changeset
45 #include <limits.h>
kono
parents:
diff changeset
46 #include <string.h>
kono
parents:
diff changeset
47 #include <stdio.h>
kono
parents:
diff changeset
48 #include <unistd.h>
kono
parents:
diff changeset
49 #include <assert.h>
kono
parents:
diff changeset
50 #include <errno.h>
kono
parents:
diff changeset
51
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
52 #if CUDA_VERSION < 6000
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
53 extern CUresult cuGetErrorString (CUresult, const char **);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
55 #endif
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
56
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
57 #if CUDA_VERSION >= 6050
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
58 #undef cuLinkCreate
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
59 #undef cuLinkAddData
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
61 const char *, unsigned, CUjit_option *, void **);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
63 #else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
64 typedef size_t (*CUoccupancyB2DSize)(int);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
66 const char *, unsigned, CUjit_option *, void **);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
69 CUoccupancyB2DSize, size_t, int);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
70 #endif
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
71
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
72 #define DO_PRAGMA(x) _Pragma (#x)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
73
111
kono
parents:
diff changeset
74 #if PLUGIN_NVPTX_DYNAMIC
kono
parents:
diff changeset
75 # include <dlfcn.h>
kono
parents:
diff changeset
76
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
77 struct cuda_lib_s {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
78
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
79 # define CUDA_ONE_CALL(call) \
111
kono
parents:
diff changeset
80 __typeof (call) *call;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
82 CUDA_ONE_CALL (call)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
83 #include "cuda-lib.def"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
84 # undef CUDA_ONE_CALL
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
85 # undef CUDA_ONE_CALL_MAYBE_NULL
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
86
111
kono
parents:
diff changeset
87 } cuda_lib;
kono
parents:
diff changeset
88
kono
parents:
diff changeset
89 /* -1 if init_cuda_lib has not been called yet, false
kono
parents:
diff changeset
90 if it has been and failed, true if it has been and succeeded. */
kono
parents:
diff changeset
91 static signed char cuda_lib_inited = -1;
kono
parents:
diff changeset
92
kono
parents:
diff changeset
93 /* Dynamically load the CUDA runtime library and initialize function
kono
parents:
diff changeset
94 pointers, return false if unsuccessful, true if successful. */
kono
parents:
diff changeset
95 static bool
kono
parents:
diff changeset
96 init_cuda_lib (void)
kono
parents:
diff changeset
97 {
kono
parents:
diff changeset
98 if (cuda_lib_inited != -1)
kono
parents:
diff changeset
99 return cuda_lib_inited;
kono
parents:
diff changeset
100 const char *cuda_runtime_lib = "libcuda.so.1";
kono
parents:
diff changeset
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
kono
parents:
diff changeset
102 cuda_lib_inited = false;
kono
parents:
diff changeset
103 if (h == NULL)
kono
parents:
diff changeset
104 return false;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
105
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
108 # define CUDA_ONE_CALL_1(call, allow_null) \
111
kono
parents:
diff changeset
109 cuda_lib.call = dlsym (h, #call); \
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
110 if (!allow_null && cuda_lib.call == NULL) \
111
kono
parents:
diff changeset
111 return false;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
112 #include "cuda-lib.def"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
113 # undef CUDA_ONE_CALL
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
114 # undef CUDA_ONE_CALL_1
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
115 # undef CUDA_ONE_CALL_MAYBE_NULL
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
116
111
kono
parents:
diff changeset
117 cuda_lib_inited = true;
kono
parents:
diff changeset
118 return true;
kono
parents:
diff changeset
119 }
kono
parents:
diff changeset
120 # define CUDA_CALL_PREFIX cuda_lib.
kono
parents:
diff changeset
121 #else
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
122
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
123 # define CUDA_ONE_CALL(call)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
125 #include "cuda-lib.def"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
126 #undef CUDA_ONE_CALL_MAYBE_NULL
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
127 #undef CUDA_ONE_CALL
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
128
111
kono
parents:
diff changeset
129 # define CUDA_CALL_PREFIX
kono
parents:
diff changeset
130 # define init_cuda_lib() true
kono
parents:
diff changeset
131 #endif
kono
parents:
diff changeset
132
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
133 #include "secure_getenv.h"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
134
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
135 #undef MIN
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
136 #undef MAX
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
139
111
kono
parents:
diff changeset
140 /* Convenience macros for the frequently used CUDA library call and
kono
parents:
diff changeset
141 error handling sequence as well as CUDA library calls that
kono
parents:
diff changeset
142 do the error checking themselves or don't do it at all. */
kono
parents:
diff changeset
143
kono
parents:
diff changeset
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
kono
parents:
diff changeset
145 do { \
kono
parents:
diff changeset
146 unsigned __r \
kono
parents:
diff changeset
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
kono
parents:
diff changeset
148 if (__r != CUDA_SUCCESS) \
kono
parents:
diff changeset
149 { \
kono
parents:
diff changeset
150 GOMP_PLUGIN_error (#FN " error: %s", \
kono
parents:
diff changeset
151 cuda_error (__r)); \
kono
parents:
diff changeset
152 return ERET; \
kono
parents:
diff changeset
153 } \
kono
parents:
diff changeset
154 } while (0)
kono
parents:
diff changeset
155
kono
parents:
diff changeset
156 #define CUDA_CALL(FN, ...) \
kono
parents:
diff changeset
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
kono
parents:
diff changeset
158
kono
parents:
diff changeset
159 #define CUDA_CALL_ASSERT(FN, ...) \
kono
parents:
diff changeset
160 do { \
kono
parents:
diff changeset
161 unsigned __r \
kono
parents:
diff changeset
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
kono
parents:
diff changeset
163 if (__r != CUDA_SUCCESS) \
kono
parents:
diff changeset
164 { \
kono
parents:
diff changeset
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
kono
parents:
diff changeset
166 cuda_error (__r)); \
kono
parents:
diff changeset
167 } \
kono
parents:
diff changeset
168 } while (0)
kono
parents:
diff changeset
169
kono
parents:
diff changeset
170 #define CUDA_CALL_NOCHECK(FN, ...) \
kono
parents:
diff changeset
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
kono
parents:
diff changeset
172
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
173 #define CUDA_CALL_EXISTS(FN) \
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
174 CUDA_CALL_PREFIX FN
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
175
111
kono
parents:
diff changeset
176 static const char *
kono
parents:
diff changeset
177 cuda_error (CUresult r)
kono
parents:
diff changeset
178 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
179 const char *fallback = "unknown cuda error";
111
kono
parents:
diff changeset
180 const char *desc;
kono
parents:
diff changeset
181
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
183 return fallback;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
184
111
kono
parents:
diff changeset
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
186 if (r == CUDA_SUCCESS)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
187 return desc;
111
kono
parents:
diff changeset
188
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
189 return fallback;
111
kono
parents:
diff changeset
190 }
kono
parents:
diff changeset
191
kono
parents:
diff changeset
192 static unsigned int instantiated_devices = 0;
kono
parents:
diff changeset
193 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
kono
parents:
diff changeset
194
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
/* One device-memory buffer holding the data mapping arguments for a
   kernel launch.  Buffers belonging to a stream are chained through
   NEXT.  */
struct cuda_map
{
  /* Device address filled in by cuMemAlloc in cuda_map_create.  */
  CUdeviceptr d;
  /* Size in bytes that was requested for D.  */
  size_t size;
  /* Set by map_push when the buffer is handed out, cleared by map_pop.  */
  bool active;
  /* Next buffer in the stream's list, or NULL for the last one.  */
  struct cuda_map *next;
};
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
202
111
kono
parents:
diff changeset
/* Per-stream bookkeeping: the CUDA stream itself plus the list of
   launch-argument buffers managed by the map_* routines.  */
struct ptx_stream
{
  /* CUDA stream handle; NULL for a device's null stream.  */
  CUstream stream;
  /* Host thread recorded for the stream (pthread_self () for the null
     stream).  */
  pthread_t host_thread;
  /* NOTE(review): presumably whether more than one host thread may use
     the stream; it is set to true for the null stream -- confirm.  */
  bool multithreaded;
  /* Head of this stream's list of launch-argument buffers; set up by
     map_init and released by map_fini.  */
  struct cuda_map *map;
  /* Next stream on the owning device's active_streams list.  */
  struct ptx_stream *next;
};
kono
parents:
diff changeset
211
kono
parents:
diff changeset
212 /* Thread-specific data for PTX. */
kono
parents:
diff changeset
213
kono
parents:
diff changeset
214 struct nvptx_thread
kono
parents:
diff changeset
215 {
kono
parents:
diff changeset
216 struct ptx_stream *current_stream;
kono
parents:
diff changeset
217 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
218 };
kono
parents:
diff changeset
219
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
220 static struct cuda_map *
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
221 cuda_map_create (size_t size)
111
kono
parents:
diff changeset
222 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
223 struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
224
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
225 assert (map);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
226
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
227 map->next = NULL;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
228 map->size = size;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
229 map->active = false;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
230
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
231 CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
232 assert (map->d);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
233
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
234 return map;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
235 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
236
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
237 static void
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
238 cuda_map_destroy (struct cuda_map *map)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
239 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
240 CUDA_CALL_ASSERT (cuMemFree, map->d);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
241 free (map);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
242 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
243
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
244 /* The following map_* routines manage the CUDA device memory that
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
245 contains the data mapping arguments for cuLaunchKernel. Each
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
246 asynchronous PTX stream may have multiple pending kernel
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
247 invocations, which are launched in a FIFO order. As such, the map
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
248 routines maintain a queue of cuLaunchKernel arguments.
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
249
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
250 Calls to map_push and map_pop must be guarded by ptx_event_lock.
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
251 Likewise, calls to map_init and map_fini are guarded by
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
252 ptx_dev_lock inside GOMP_OFFLOAD_init_device and
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
253 GOMP_OFFLOAD_fini_device, respectively. */
111
kono
parents:
diff changeset
254
kono
parents:
diff changeset
255 static bool
kono
parents:
diff changeset
256 map_init (struct ptx_stream *s)
kono
parents:
diff changeset
257 {
kono
parents:
diff changeset
258 int size = getpagesize ();
kono
parents:
diff changeset
259
kono
parents:
diff changeset
260 assert (s);
kono
parents:
diff changeset
261
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
262 s->map = cuda_map_create (size);
111
kono
parents:
diff changeset
263
kono
parents:
diff changeset
264 return true;
kono
parents:
diff changeset
265 }
kono
parents:
diff changeset
266
kono
parents:
diff changeset
267 static bool
kono
parents:
diff changeset
268 map_fini (struct ptx_stream *s)
kono
parents:
diff changeset
269 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
270 assert (s->map->next == NULL);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
271 assert (!s->map->active);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
272
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
273 cuda_map_destroy (s->map);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
274
111
kono
parents:
diff changeset
275 return true;
kono
parents:
diff changeset
276 }
kono
parents:
diff changeset
277
kono
parents:
diff changeset
278 static void
kono
parents:
diff changeset
279 map_pop (struct ptx_stream *s)
kono
parents:
diff changeset
280 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
281 struct cuda_map *next;
111
kono
parents:
diff changeset
282
kono
parents:
diff changeset
283 assert (s != NULL);
kono
parents:
diff changeset
284
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
285 if (s->map->next == NULL)
111
kono
parents:
diff changeset
286 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
287 s->map->active = false;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
288 return;
111
kono
parents:
diff changeset
289 }
kono
parents:
diff changeset
290
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
291 next = s->map->next;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
292 cuda_map_destroy (s->map);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
293 s->map = next;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
294 }
111
kono
parents:
diff changeset
295
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
296 static CUdeviceptr
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
297 map_push (struct ptx_stream *s, size_t size)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
298 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
299 struct cuda_map *map = NULL, *t = NULL;
111
kono
parents:
diff changeset
300
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
301 assert (s);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
302 assert (s->map);
111
kono
parents:
diff changeset
303
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
304 /* Each PTX stream requires a separate data region to store the
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
305 launch arguments for cuLaunchKernel. Allocate a new
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
306 cuda_map and push it to the end of the list. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
307 if (s->map->active)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
308 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
309 map = cuda_map_create (size);
111
kono
parents:
diff changeset
310
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
311 for (t = s->map; t->next != NULL; t = t->next)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
312 ;
111
kono
parents:
diff changeset
313
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
314 t->next = map;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
315 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
316 else if (s->map->size < size)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
317 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
318 cuda_map_destroy (s->map);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
319 map = cuda_map_create (size);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
320 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
321 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
322 map = s->map;
111
kono
parents:
diff changeset
323
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
324 s->map = map;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
325 s->map->active = true;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
326
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
327 return s->map->d;
111
kono
parents:
diff changeset
328 }
kono
parents:
diff changeset
329
kono
parents:
diff changeset
330 /* Target data function launch information. */
kono
parents:
diff changeset
331
kono
parents:
diff changeset
332 struct targ_fn_launch
kono
parents:
diff changeset
333 {
kono
parents:
diff changeset
334 const char *fn;
kono
parents:
diff changeset
335 unsigned short dim[GOMP_DIM_MAX];
kono
parents:
diff changeset
336 };
kono
parents:
diff changeset
337
kono
parents:
diff changeset
338 /* Target PTX object information. */
kono
parents:
diff changeset
339
kono
parents:
diff changeset
340 struct targ_ptx_obj
kono
parents:
diff changeset
341 {
kono
parents:
diff changeset
342 const char *code;
kono
parents:
diff changeset
343 size_t size;
kono
parents:
diff changeset
344 };
kono
parents:
diff changeset
345
kono
parents:
diff changeset
346 /* Target data image information. */
kono
parents:
diff changeset
347
kono
parents:
diff changeset
348 typedef struct nvptx_tdata
kono
parents:
diff changeset
349 {
kono
parents:
diff changeset
350 const struct targ_ptx_obj *ptx_objs;
kono
parents:
diff changeset
351 unsigned ptx_num;
kono
parents:
diff changeset
352
kono
parents:
diff changeset
353 const char *const *var_names;
kono
parents:
diff changeset
354 unsigned var_num;
kono
parents:
diff changeset
355
kono
parents:
diff changeset
356 const struct targ_fn_launch *fn_descs;
kono
parents:
diff changeset
357 unsigned fn_num;
kono
parents:
diff changeset
358 } nvptx_tdata_t;
kono
parents:
diff changeset
359
kono
parents:
diff changeset
360 /* Descriptor of a loaded function. */
kono
parents:
diff changeset
361
kono
parents:
diff changeset
362 struct targ_fn_descriptor
kono
parents:
diff changeset
363 {
kono
parents:
diff changeset
364 CUfunction fn;
kono
parents:
diff changeset
365 const struct targ_fn_launch *launch;
kono
parents:
diff changeset
366 int regs_per_thread;
kono
parents:
diff changeset
367 int max_threads_per_block;
kono
parents:
diff changeset
368 };
kono
parents:
diff changeset
369
kono
parents:
diff changeset
370 /* A loaded PTX image. */
kono
parents:
diff changeset
371 struct ptx_image_data
kono
parents:
diff changeset
372 {
kono
parents:
diff changeset
373 const void *target_data;
kono
parents:
diff changeset
374 CUmodule module;
kono
parents:
diff changeset
375
kono
parents:
diff changeset
376 struct targ_fn_descriptor *fns; /* Array of functions. */
kono
parents:
diff changeset
377
kono
parents:
diff changeset
378 struct ptx_image_data *next;
kono
parents:
diff changeset
379 };
kono
parents:
diff changeset
380
kono
parents:
diff changeset
381 struct ptx_device
kono
parents:
diff changeset
382 {
kono
parents:
diff changeset
383 CUcontext ctx;
kono
parents:
diff changeset
384 bool ctx_shared;
kono
parents:
diff changeset
385 CUdevice dev;
kono
parents:
diff changeset
386 struct ptx_stream *null_stream;
kono
parents:
diff changeset
387 /* All non-null streams associated with this device (actually context),
kono
parents:
diff changeset
388 either created implicitly or passed in from the user (via
kono
parents:
diff changeset
389 acc_set_cuda_stream). */
kono
parents:
diff changeset
390 struct ptx_stream *active_streams;
kono
parents:
diff changeset
391 struct {
kono
parents:
diff changeset
392 struct ptx_stream **arr;
kono
parents:
diff changeset
393 int size;
kono
parents:
diff changeset
394 } async_streams;
kono
parents:
diff changeset
395 /* A lock for use when manipulating the above stream list and array. */
kono
parents:
diff changeset
396 pthread_mutex_t stream_lock;
kono
parents:
diff changeset
397 int ord;
kono
parents:
diff changeset
398 bool overlap;
kono
parents:
diff changeset
399 bool map;
kono
parents:
diff changeset
400 bool concur;
kono
parents:
diff changeset
401 bool mkern;
kono
parents:
diff changeset
402 int mode;
kono
parents:
diff changeset
403 int clock_khz;
kono
parents:
diff changeset
404 int num_sms;
kono
parents:
diff changeset
405 int regs_per_block;
kono
parents:
diff changeset
406 int regs_per_sm;
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
407 int warp_size;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
408 int max_threads_per_block;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
409 int max_threads_per_multiprocessor;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
410 int default_dims[GOMP_DIM_MAX];
111
kono
parents:
diff changeset
411
kono
parents:
diff changeset
412 struct ptx_image_data *images; /* Images loaded on device. */
kono
parents:
diff changeset
413 pthread_mutex_t image_lock; /* Lock for above list. */
kono
parents:
diff changeset
414
kono
parents:
diff changeset
415 struct ptx_device *next;
kono
parents:
diff changeset
416 };
kono
parents:
diff changeset
417
kono
parents:
diff changeset
418 enum ptx_event_type
kono
parents:
diff changeset
419 {
kono
parents:
diff changeset
420 PTX_EVT_MEM,
kono
parents:
diff changeset
421 PTX_EVT_KNL,
kono
parents:
diff changeset
422 PTX_EVT_SYNC,
kono
parents:
diff changeset
423 PTX_EVT_ASYNC_CLEANUP
kono
parents:
diff changeset
424 };
kono
parents:
diff changeset
425
kono
parents:
diff changeset
426 struct ptx_event
kono
parents:
diff changeset
427 {
kono
parents:
diff changeset
428 CUevent *evt;
kono
parents:
diff changeset
429 int type;
kono
parents:
diff changeset
430 void *addr;
kono
parents:
diff changeset
431 int ord;
kono
parents:
diff changeset
432 int val;
kono
parents:
diff changeset
433
kono
parents:
diff changeset
434 struct ptx_event *next;
kono
parents:
diff changeset
435 };
kono
parents:
diff changeset
436
kono
parents:
diff changeset
437 static pthread_mutex_t ptx_event_lock;
kono
parents:
diff changeset
438 static struct ptx_event *ptx_events;
kono
parents:
diff changeset
439
kono
parents:
diff changeset
440 static struct ptx_device **ptx_devices;
kono
parents:
diff changeset
441
kono
parents:
diff changeset
/* Return the PTX thread-specific data for the calling thread, as
   obtained from GOMP_PLUGIN_acc_thread.  */

static inline struct nvptx_thread *
nvptx_thread (void)
{
  struct nvptx_thread *self
    = (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();

  return self;
}
kono
parents:
diff changeset
447
kono
parents:
diff changeset
448 static bool
kono
parents:
diff changeset
449 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
kono
parents:
diff changeset
450 {
kono
parents:
diff changeset
451 int i;
kono
parents:
diff changeset
452 struct ptx_stream *null_stream
kono
parents:
diff changeset
453 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
kono
parents:
diff changeset
454
kono
parents:
diff changeset
455 null_stream->stream = NULL;
kono
parents:
diff changeset
456 null_stream->host_thread = pthread_self ();
kono
parents:
diff changeset
457 null_stream->multithreaded = true;
kono
parents:
diff changeset
458 if (!map_init (null_stream))
kono
parents:
diff changeset
459 return false;
kono
parents:
diff changeset
460
kono
parents:
diff changeset
461 ptx_dev->null_stream = null_stream;
kono
parents:
diff changeset
462 ptx_dev->active_streams = NULL;
kono
parents:
diff changeset
463 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
kono
parents:
diff changeset
464
kono
parents:
diff changeset
465 if (concurrency < 1)
kono
parents:
diff changeset
466 concurrency = 1;
kono
parents:
diff changeset
467
kono
parents:
diff changeset
468 /* This is just a guess -- make space for as many async streams as the
kono
parents:
diff changeset
469 current device is capable of concurrently executing. This can grow
kono
parents:
diff changeset
470 later as necessary. No streams are created yet. */
kono
parents:
diff changeset
471 ptx_dev->async_streams.arr
kono
parents:
diff changeset
472 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
kono
parents:
diff changeset
473 ptx_dev->async_streams.size = concurrency;
kono
parents:
diff changeset
474
kono
parents:
diff changeset
475 for (i = 0; i < concurrency; i++)
kono
parents:
diff changeset
476 ptx_dev->async_streams.arr[i] = NULL;
kono
parents:
diff changeset
477
kono
parents:
diff changeset
478 return true;
kono
parents:
diff changeset
479 }
kono
parents:
diff changeset
480
kono
parents:
diff changeset
481 static bool
kono
parents:
diff changeset
482 fini_streams_for_device (struct ptx_device *ptx_dev)
kono
parents:
diff changeset
483 {
kono
parents:
diff changeset
484 free (ptx_dev->async_streams.arr);
kono
parents:
diff changeset
485
kono
parents:
diff changeset
486 bool ret = true;
kono
parents:
diff changeset
487 while (ptx_dev->active_streams != NULL)
kono
parents:
diff changeset
488 {
kono
parents:
diff changeset
489 struct ptx_stream *s = ptx_dev->active_streams;
kono
parents:
diff changeset
490 ptx_dev->active_streams = ptx_dev->active_streams->next;
kono
parents:
diff changeset
491
kono
parents:
diff changeset
492 ret &= map_fini (s);
kono
parents:
diff changeset
493
kono
parents:
diff changeset
494 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
kono
parents:
diff changeset
495 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
496 {
kono
parents:
diff changeset
497 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
kono
parents:
diff changeset
498 ret = false;
kono
parents:
diff changeset
499 }
kono
parents:
diff changeset
500 free (s);
kono
parents:
diff changeset
501 }
kono
parents:
diff changeset
502
kono
parents:
diff changeset
503 ret &= map_fini (ptx_dev->null_stream);
kono
parents:
diff changeset
504 free (ptx_dev->null_stream);
kono
parents:
diff changeset
505 return ret;
kono
parents:
diff changeset
506 }
kono
parents:
diff changeset
507
kono
parents:
diff changeset
508 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
kono
parents:
diff changeset
509 thread THREAD (and also current device/context). If CREATE is true, create
kono
parents:
diff changeset
510 the stream if it does not exist (or use EXISTING if it is non-NULL), and
kono
parents:
diff changeset
511 associate the stream with the same thread argument. Returns stream to use
kono
parents:
diff changeset
512 as result. */
kono
parents:
diff changeset
513
kono
parents:
diff changeset
514 static struct ptx_stream *
kono
parents:
diff changeset
515 select_stream_for_async (int async, pthread_t thread, bool create,
kono
parents:
diff changeset
516 CUstream existing)
kono
parents:
diff changeset
517 {
kono
parents:
diff changeset
518 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
519 /* Local copy of TLS variable. */
kono
parents:
diff changeset
520 struct ptx_device *ptx_dev = nvthd->ptx_dev;
kono
parents:
diff changeset
521 struct ptx_stream *stream = NULL;
kono
parents:
diff changeset
522 int orig_async = async;
kono
parents:
diff changeset
523
kono
parents:
diff changeset
524 /* The special value acc_async_noval (-1) maps (for now) to an
kono
parents:
diff changeset
525 implicitly-created stream, which is then handled the same as any other
kono
parents:
diff changeset
526 numbered async stream. Other options are available, e.g. using the null
kono
parents:
diff changeset
527 stream for anonymous async operations, or choosing an idle stream from an
kono
parents:
diff changeset
528 active set. But, stick with this for now. */
kono
parents:
diff changeset
529 if (async > acc_async_sync)
kono
parents:
diff changeset
530 async++;
kono
parents:
diff changeset
531
kono
parents:
diff changeset
532 if (create)
kono
parents:
diff changeset
533 pthread_mutex_lock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
534
kono
parents:
diff changeset
535 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
kono
parents:
diff changeset
536 null stream, and in fact better performance may be obtainable if it doesn't
kono
parents:
diff changeset
537 (because the null stream enforces overly-strict synchronisation with
kono
parents:
diff changeset
538 respect to other streams for legacy reasons, and that's probably not
kono
parents:
diff changeset
539 needed with OpenACC). Maybe investigate later. */
kono
parents:
diff changeset
540 if (async == acc_async_sync)
kono
parents:
diff changeset
541 stream = ptx_dev->null_stream;
kono
parents:
diff changeset
542 else if (async >= 0 && async < ptx_dev->async_streams.size
kono
parents:
diff changeset
543 && ptx_dev->async_streams.arr[async] && !(create && existing))
kono
parents:
diff changeset
544 stream = ptx_dev->async_streams.arr[async];
kono
parents:
diff changeset
545 else if (async >= 0 && create)
kono
parents:
diff changeset
546 {
kono
parents:
diff changeset
547 if (async >= ptx_dev->async_streams.size)
kono
parents:
diff changeset
548 {
kono
parents:
diff changeset
549 int i, newsize = ptx_dev->async_streams.size * 2;
kono
parents:
diff changeset
550
kono
parents:
diff changeset
551 if (async >= newsize)
kono
parents:
diff changeset
552 newsize = async + 1;
kono
parents:
diff changeset
553
kono
parents:
diff changeset
554 ptx_dev->async_streams.arr
kono
parents:
diff changeset
555 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
kono
parents:
diff changeset
556 newsize * sizeof (struct ptx_stream *));
kono
parents:
diff changeset
557
kono
parents:
diff changeset
558 for (i = ptx_dev->async_streams.size; i < newsize; i++)
kono
parents:
diff changeset
559 ptx_dev->async_streams.arr[i] = NULL;
kono
parents:
diff changeset
560
kono
parents:
diff changeset
561 ptx_dev->async_streams.size = newsize;
kono
parents:
diff changeset
562 }
kono
parents:
diff changeset
563
kono
parents:
diff changeset
564 /* Create a new stream on-demand if there isn't one already, or if we're
kono
parents:
diff changeset
565 setting a particular async value to an existing (externally-provided)
kono
parents:
diff changeset
566 stream. */
kono
parents:
diff changeset
567 if (!ptx_dev->async_streams.arr[async] || existing)
kono
parents:
diff changeset
568 {
kono
parents:
diff changeset
569 CUresult r;
kono
parents:
diff changeset
570 struct ptx_stream *s
kono
parents:
diff changeset
571 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
kono
parents:
diff changeset
572
kono
parents:
diff changeset
573 if (existing)
kono
parents:
diff changeset
574 s->stream = existing;
kono
parents:
diff changeset
575 else
kono
parents:
diff changeset
576 {
kono
parents:
diff changeset
577 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
kono
parents:
diff changeset
578 CU_STREAM_DEFAULT);
kono
parents:
diff changeset
579 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
580 {
kono
parents:
diff changeset
581 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
582 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
kono
parents:
diff changeset
583 cuda_error (r));
kono
parents:
diff changeset
584 }
kono
parents:
diff changeset
585 }
kono
parents:
diff changeset
586
kono
parents:
diff changeset
587 /* If CREATE is true, we're going to be queueing some work on this
kono
parents:
diff changeset
588 stream. Associate it with the current host thread. */
kono
parents:
diff changeset
589 s->host_thread = thread;
kono
parents:
diff changeset
590 s->multithreaded = false;
kono
parents:
diff changeset
591
kono
parents:
diff changeset
592 if (!map_init (s))
kono
parents:
diff changeset
593 {
kono
parents:
diff changeset
594 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
595 GOMP_PLUGIN_fatal ("map_init fail");
kono
parents:
diff changeset
596 }
kono
parents:
diff changeset
597
kono
parents:
diff changeset
598 s->next = ptx_dev->active_streams;
kono
parents:
diff changeset
599 ptx_dev->active_streams = s;
kono
parents:
diff changeset
600 ptx_dev->async_streams.arr[async] = s;
kono
parents:
diff changeset
601 }
kono
parents:
diff changeset
602
kono
parents:
diff changeset
603 stream = ptx_dev->async_streams.arr[async];
kono
parents:
diff changeset
604 }
kono
parents:
diff changeset
605 else if (async < 0)
kono
parents:
diff changeset
606 {
kono
parents:
diff changeset
607 if (create)
kono
parents:
diff changeset
608 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
609 GOMP_PLUGIN_fatal ("bad async %d", async);
kono
parents:
diff changeset
610 }
kono
parents:
diff changeset
611
kono
parents:
diff changeset
612 if (create)
kono
parents:
diff changeset
613 {
kono
parents:
diff changeset
614 assert (stream != NULL);
kono
parents:
diff changeset
615
kono
parents:
diff changeset
616 /* If we're trying to use the same stream from different threads
kono
parents:
diff changeset
617 simultaneously, set stream->multithreaded to true. This affects the
kono
parents:
diff changeset
618 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
kono
parents:
diff changeset
619 only wait for asynchronous launches from the same host thread they are
kono
parents:
diff changeset
620 invoked on. If multiple threads use the same async value, we make note
kono
parents:
diff changeset
621 of that here and fall back to testing/waiting for all threads in those
kono
parents:
diff changeset
622 functions. */
kono
parents:
diff changeset
623 if (thread != stream->host_thread)
kono
parents:
diff changeset
624 stream->multithreaded = true;
kono
parents:
diff changeset
625
kono
parents:
diff changeset
626 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
627 }
kono
parents:
diff changeset
628 else if (stream && !stream->multithreaded
kono
parents:
diff changeset
629 && !pthread_equal (stream->host_thread, thread))
kono
parents:
diff changeset
630 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
kono
parents:
diff changeset
631
kono
parents:
diff changeset
632 return stream;
kono
parents:
diff changeset
633 }
kono
parents:
diff changeset
634
kono
parents:
diff changeset
635 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
kono
parents:
diff changeset
636 should be locked on entry and remains locked on exit. */
kono
parents:
diff changeset
637
kono
parents:
diff changeset
638 static bool
kono
parents:
diff changeset
639 nvptx_init (void)
kono
parents:
diff changeset
640 {
kono
parents:
diff changeset
641 int ndevs;
kono
parents:
diff changeset
642
kono
parents:
diff changeset
643 if (instantiated_devices != 0)
kono
parents:
diff changeset
644 return true;
kono
parents:
diff changeset
645
kono
parents:
diff changeset
646 ptx_events = NULL;
kono
parents:
diff changeset
647 pthread_mutex_init (&ptx_event_lock, NULL);
kono
parents:
diff changeset
648
kono
parents:
diff changeset
649 if (!init_cuda_lib ())
kono
parents:
diff changeset
650 return false;
kono
parents:
diff changeset
651
kono
parents:
diff changeset
652 CUDA_CALL (cuInit, 0);
kono
parents:
diff changeset
653
kono
parents:
diff changeset
654 CUDA_CALL (cuDeviceGetCount, &ndevs);
kono
parents:
diff changeset
655 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
kono
parents:
diff changeset
656 * ndevs);
kono
parents:
diff changeset
657 return true;
kono
parents:
diff changeset
658 }
kono
parents:
diff changeset
659
kono
parents:
diff changeset
660 /* Select the N'th PTX device for the current host thread. The device must
kono
parents:
diff changeset
661 have been previously opened before calling this function. */
kono
parents:
diff changeset
662
kono
parents:
diff changeset
663 static bool
kono
parents:
diff changeset
664 nvptx_attach_host_thread_to_device (int n)
kono
parents:
diff changeset
665 {
kono
parents:
diff changeset
666 CUdevice dev;
kono
parents:
diff changeset
667 CUresult r;
kono
parents:
diff changeset
668 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
669 CUcontext thd_ctx;
kono
parents:
diff changeset
670
kono
parents:
diff changeset
671 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
kono
parents:
diff changeset
672 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
kono
parents:
diff changeset
673 {
kono
parents:
diff changeset
674 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
kono
parents:
diff changeset
675 return false;
kono
parents:
diff changeset
676 }
kono
parents:
diff changeset
677
kono
parents:
diff changeset
678 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
kono
parents:
diff changeset
679 return true;
kono
parents:
diff changeset
680 else
kono
parents:
diff changeset
681 {
kono
parents:
diff changeset
682 CUcontext old_ctx;
kono
parents:
diff changeset
683
kono
parents:
diff changeset
684 ptx_dev = ptx_devices[n];
kono
parents:
diff changeset
685 if (!ptx_dev)
kono
parents:
diff changeset
686 {
kono
parents:
diff changeset
687 GOMP_PLUGIN_error ("device %d not found", n);
kono
parents:
diff changeset
688 return false;
kono
parents:
diff changeset
689 }
kono
parents:
diff changeset
690
kono
parents:
diff changeset
691 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
kono
parents:
diff changeset
692
kono
parents:
diff changeset
693 /* We don't necessarily have a current context (e.g. if it has been
kono
parents:
diff changeset
694 destroyed. Pop it if we do though. */
kono
parents:
diff changeset
695 if (thd_ctx != NULL)
kono
parents:
diff changeset
696 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
kono
parents:
diff changeset
697
kono
parents:
diff changeset
698 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
kono
parents:
diff changeset
699 }
kono
parents:
diff changeset
700 return true;
kono
parents:
diff changeset
701 }
kono
parents:
diff changeset
702
kono
parents:
diff changeset
703 static struct ptx_device *
kono
parents:
diff changeset
704 nvptx_open_device (int n)
kono
parents:
diff changeset
705 {
kono
parents:
diff changeset
706 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
707 CUdevice dev, ctx_dev;
kono
parents:
diff changeset
708 CUresult r;
kono
parents:
diff changeset
709 int async_engines, pi;
kono
parents:
diff changeset
710
kono
parents:
diff changeset
711 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
kono
parents:
diff changeset
712
kono
parents:
diff changeset
713 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
kono
parents:
diff changeset
714
kono
parents:
diff changeset
715 ptx_dev->ord = n;
kono
parents:
diff changeset
716 ptx_dev->dev = dev;
kono
parents:
diff changeset
717 ptx_dev->ctx_shared = false;
kono
parents:
diff changeset
718
kono
parents:
diff changeset
719 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
kono
parents:
diff changeset
720 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
kono
parents:
diff changeset
721 {
kono
parents:
diff changeset
722 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
kono
parents:
diff changeset
723 return NULL;
kono
parents:
diff changeset
724 }
kono
parents:
diff changeset
725
kono
parents:
diff changeset
726 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
kono
parents:
diff changeset
727 {
kono
parents:
diff changeset
728 /* The current host thread has an active context for a different device.
kono
parents:
diff changeset
729 Detach it. */
kono
parents:
diff changeset
730 CUcontext old_ctx;
kono
parents:
diff changeset
731 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
kono
parents:
diff changeset
732 }
kono
parents:
diff changeset
733
kono
parents:
diff changeset
734 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
kono
parents:
diff changeset
735
kono
parents:
diff changeset
736 if (!ptx_dev->ctx)
kono
parents:
diff changeset
737 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
kono
parents:
diff changeset
738 else
kono
parents:
diff changeset
739 ptx_dev->ctx_shared = true;
kono
parents:
diff changeset
740
kono
parents:
diff changeset
741 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
742 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
kono
parents:
diff changeset
743 ptx_dev->overlap = pi;
kono
parents:
diff changeset
744
kono
parents:
diff changeset
745 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
746 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
kono
parents:
diff changeset
747 ptx_dev->map = pi;
kono
parents:
diff changeset
748
kono
parents:
diff changeset
749 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
750 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
kono
parents:
diff changeset
751 ptx_dev->concur = pi;
kono
parents:
diff changeset
752
kono
parents:
diff changeset
753 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
754 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
kono
parents:
diff changeset
755 ptx_dev->mode = pi;
kono
parents:
diff changeset
756
kono
parents:
diff changeset
757 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
758 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
kono
parents:
diff changeset
759 ptx_dev->mkern = pi;
kono
parents:
diff changeset
760
kono
parents:
diff changeset
761 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
762 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
kono
parents:
diff changeset
763 ptx_dev->clock_khz = pi;
kono
parents:
diff changeset
764
kono
parents:
diff changeset
765 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
766 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
kono
parents:
diff changeset
767 ptx_dev->num_sms = pi;
kono
parents:
diff changeset
768
kono
parents:
diff changeset
769 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
770 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
kono
parents:
diff changeset
771 ptx_dev->regs_per_block = pi;
kono
parents:
diff changeset
772
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
773 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
111
kono
parents:
diff changeset
774 in CUDA 6.0 and newer. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
775 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
776 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
777 dev);
111
kono
parents:
diff changeset
778 /* Fallback: use limit of registers per block, which is usually equal. */
kono
parents:
diff changeset
779 if (r == CUDA_ERROR_INVALID_VALUE)
kono
parents:
diff changeset
780 pi = ptx_dev->regs_per_block;
kono
parents:
diff changeset
781 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
782 {
kono
parents:
diff changeset
783 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
kono
parents:
diff changeset
784 return NULL;
kono
parents:
diff changeset
785 }
kono
parents:
diff changeset
786 ptx_dev->regs_per_sm = pi;
kono
parents:
diff changeset
787
kono
parents:
diff changeset
788 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
789 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
kono
parents:
diff changeset
790 if (pi != 32)
kono
parents:
diff changeset
791 {
kono
parents:
diff changeset
792 GOMP_PLUGIN_error ("Only warp size 32 is supported");
kono
parents:
diff changeset
793 return NULL;
kono
parents:
diff changeset
794 }
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
795 ptx_dev->warp_size = pi;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
796
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
797 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
798 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
799 ptx_dev->max_threads_per_block = pi;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
800
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
801 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
802 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
803 ptx_dev->max_threads_per_multiprocessor = pi;
111
kono
parents:
diff changeset
804
kono
parents:
diff changeset
805 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
kono
parents:
diff changeset
806 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
kono
parents:
diff changeset
807 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
808 async_engines = 1;
kono
parents:
diff changeset
809
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
810 for (int i = 0; i != GOMP_DIM_MAX; i++)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
811 ptx_dev->default_dims[i] = 0;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
812
111
kono
parents:
diff changeset
813 ptx_dev->images = NULL;
kono
parents:
diff changeset
814 pthread_mutex_init (&ptx_dev->image_lock, NULL);
kono
parents:
diff changeset
815
kono
parents:
diff changeset
816 if (!init_streams_for_device (ptx_dev, async_engines))
kono
parents:
diff changeset
817 return NULL;
kono
parents:
diff changeset
818
kono
parents:
diff changeset
819 return ptx_dev;
kono
parents:
diff changeset
820 }
kono
parents:
diff changeset
821
kono
parents:
diff changeset
822 static bool
kono
parents:
diff changeset
823 nvptx_close_device (struct ptx_device *ptx_dev)
kono
parents:
diff changeset
824 {
kono
parents:
diff changeset
825 if (!ptx_dev)
kono
parents:
diff changeset
826 return true;
kono
parents:
diff changeset
827
kono
parents:
diff changeset
828 if (!fini_streams_for_device (ptx_dev))
kono
parents:
diff changeset
829 return false;
kono
parents:
diff changeset
830
kono
parents:
diff changeset
831 pthread_mutex_destroy (&ptx_dev->image_lock);
kono
parents:
diff changeset
832
kono
parents:
diff changeset
833 if (!ptx_dev->ctx_shared)
kono
parents:
diff changeset
834 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
kono
parents:
diff changeset
835
kono
parents:
diff changeset
836 free (ptx_dev);
kono
parents:
diff changeset
837 return true;
kono
parents:
diff changeset
838 }
kono
parents:
diff changeset
839
kono
parents:
diff changeset
840 static int
kono
parents:
diff changeset
841 nvptx_get_num_devices (void)
kono
parents:
diff changeset
842 {
kono
parents:
diff changeset
843 int n;
kono
parents:
diff changeset
844
kono
parents:
diff changeset
845 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
kono
parents:
diff changeset
846 configurations. */
kono
parents:
diff changeset
847 if (sizeof (void *) != 8)
kono
parents:
diff changeset
848 {
kono
parents:
diff changeset
849 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
kono
parents:
diff changeset
850 " only 64-bit configurations are supported\n");
kono
parents:
diff changeset
851 return 0;
kono
parents:
diff changeset
852 }
kono
parents:
diff changeset
853
kono
parents:
diff changeset
854 /* This function will be called before the plugin has been initialized in
kono
parents:
diff changeset
855 order to enumerate available devices, but CUDA API routines can't be used
kono
parents:
diff changeset
856 until cuInit has been called. Just call it now (but don't yet do any
kono
parents:
diff changeset
857 further initialization). */
kono
parents:
diff changeset
858 if (instantiated_devices == 0)
kono
parents:
diff changeset
859 {
kono
parents:
diff changeset
860 if (!init_cuda_lib ())
kono
parents:
diff changeset
861 return 0;
kono
parents:
diff changeset
862 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
kono
parents:
diff changeset
863 /* This is not an error: e.g. we may have CUDA libraries installed but
kono
parents:
diff changeset
864 no devices available. */
kono
parents:
diff changeset
865 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
866 {
kono
parents:
diff changeset
867 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
kono
parents:
diff changeset
868 cuda_error (r));
kono
parents:
diff changeset
869 return 0;
kono
parents:
diff changeset
870 }
kono
parents:
diff changeset
871 }
kono
parents:
diff changeset
872
kono
parents:
diff changeset
873 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
kono
parents:
diff changeset
874 return n;
kono
parents:
diff changeset
875 }
kono
parents:
diff changeset
876
kono
parents:
diff changeset
/* Emit a debug message showing the value ENV_VAR of the environment
   variable named VAR_NAME, or a "<Not defined>" marker when ENV_VAR is
   NULL.  */

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var != NULL)
    {
      GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
      return;
    }

  GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
}
kono
parents:
diff changeset
885
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
/* Parse the GOMP_NVPTX_JIT environment variable.  The value is a
   space-separated list of options; the only option recognized is "-O<n>"
   with <n> in 0..4, whose level is stored in *GOMP_NVPTX_O (the last
   occurrence wins).  *GOMP_NVPTX_O is left untouched if the variable is
   unset or holds no valid option.  Anything else is reported as a parse
   error and stops the scan.  Leading, trailing and repeated spaces are
   accepted.  */

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
        c++;

      /* Only spaces were left (trailing blanks or an all-blank value):
         that is the end of the list, not a malformed option.  */
      if (*c == '\0')
        break;

      /* The && chain stops at the first mismatch, so no character beyond
         the terminating NUL is ever read.  */
      if (c[0] == '-' && c[1] == 'O'
          && '0' <= c[2] && c[2] <= '4'
          && (c[3] == '\0' || c[3] == ' '))
        {
          *gomp_nvptx_o = c[2] - '0';
          c += 3;
          continue;
        }

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
915
111
kono
parents:
diff changeset
916 static bool
kono
parents:
diff changeset
917 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
kono
parents:
diff changeset
918 unsigned num_objs)
kono
parents:
diff changeset
919 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
920 CUjit_option opts[7];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
921 void *optvals[7];
111
kono
parents:
diff changeset
922 float elapsed = 0.0;
kono
parents:
diff changeset
923 char elog[1024];
kono
parents:
diff changeset
924 char ilog[16384];
kono
parents:
diff changeset
925 CUlinkState linkstate;
kono
parents:
diff changeset
926 CUresult r;
kono
parents:
diff changeset
927 void *linkout;
kono
parents:
diff changeset
928 size_t linkoutsize __attribute__ ((unused));
kono
parents:
diff changeset
929
kono
parents:
diff changeset
930 opts[0] = CU_JIT_WALL_TIME;
kono
parents:
diff changeset
931 optvals[0] = &elapsed;
kono
parents:
diff changeset
932
kono
parents:
diff changeset
933 opts[1] = CU_JIT_INFO_LOG_BUFFER;
kono
parents:
diff changeset
934 optvals[1] = &ilog[0];
kono
parents:
diff changeset
935
kono
parents:
diff changeset
936 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
kono
parents:
diff changeset
937 optvals[2] = (void *) sizeof ilog;
kono
parents:
diff changeset
938
kono
parents:
diff changeset
939 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
kono
parents:
diff changeset
940 optvals[3] = &elog[0];
kono
parents:
diff changeset
941
kono
parents:
diff changeset
942 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
kono
parents:
diff changeset
943 optvals[4] = (void *) sizeof elog;
kono
parents:
diff changeset
944
kono
parents:
diff changeset
945 opts[5] = CU_JIT_LOG_VERBOSE;
kono
parents:
diff changeset
946 optvals[5] = (void *) 1;
kono
parents:
diff changeset
947
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
948 static intptr_t gomp_nvptx_o = -1;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
949
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
950 static bool init_done = false;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
951 if (!init_done)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
952 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
953 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
954 init_done = true;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
955 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
956
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
957 int nopts = 6;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
958 if (gomp_nvptx_o != -1)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
959 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
960 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
961 optvals[nopts] = (void *) gomp_nvptx_o;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
962 nopts++;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
963 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
964
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
965 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
966 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
967 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
968 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
111
kono
parents:
diff changeset
969
kono
parents:
diff changeset
970 for (; num_objs--; ptx_objs++)
kono
parents:
diff changeset
971 {
kono
parents:
diff changeset
972 /* cuLinkAddData's 'data' argument erroneously omits the const
kono
parents:
diff changeset
973 qualifier. */
kono
parents:
diff changeset
974 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
975 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
976 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
977 (char *) ptx_objs->code, ptx_objs->size,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
978 0, 0, 0, 0);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
979 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
980 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
981 (char *) ptx_objs->code, ptx_objs->size,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
982 0, 0, 0, 0);
111
kono
parents:
diff changeset
983 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
984 {
kono
parents:
diff changeset
985 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
kono
parents:
diff changeset
986 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
kono
parents:
diff changeset
987 cuda_error (r));
kono
parents:
diff changeset
988 return false;
kono
parents:
diff changeset
989 }
kono
parents:
diff changeset
990 }
kono
parents:
diff changeset
991
kono
parents:
diff changeset
992 GOMP_PLUGIN_debug (0, "Linking\n");
kono
parents:
diff changeset
993 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
kono
parents:
diff changeset
994
kono
parents:
diff changeset
995 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
kono
parents:
diff changeset
996 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
kono
parents:
diff changeset
997
kono
parents:
diff changeset
998 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
999 {
kono
parents:
diff changeset
1000 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
kono
parents:
diff changeset
1001 return false;
kono
parents:
diff changeset
1002 }
kono
parents:
diff changeset
1003
kono
parents:
diff changeset
1004 CUDA_CALL (cuModuleLoadData, module, linkout);
kono
parents:
diff changeset
1005 CUDA_CALL (cuLinkDestroy, linkstate);
kono
parents:
diff changeset
1006 return true;
kono
parents:
diff changeset
1007 }
kono
parents:
diff changeset
1008
kono
parents:
diff changeset
1009 static void
kono
parents:
diff changeset
1010 event_gc (bool memmap_lockable)
kono
parents:
diff changeset
1011 {
kono
parents:
diff changeset
1012 struct ptx_event *ptx_event = ptx_events;
kono
parents:
diff changeset
1013 struct ptx_event *async_cleanups = NULL;
kono
parents:
diff changeset
1014 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1015
kono
parents:
diff changeset
1016 pthread_mutex_lock (&ptx_event_lock);
kono
parents:
diff changeset
1017
kono
parents:
diff changeset
1018 while (ptx_event != NULL)
kono
parents:
diff changeset
1019 {
kono
parents:
diff changeset
1020 CUresult r;
kono
parents:
diff changeset
1021 struct ptx_event *e = ptx_event;
kono
parents:
diff changeset
1022
kono
parents:
diff changeset
1023 ptx_event = ptx_event->next;
kono
parents:
diff changeset
1024
kono
parents:
diff changeset
1025 if (e->ord != nvthd->ptx_dev->ord)
kono
parents:
diff changeset
1026 continue;
kono
parents:
diff changeset
1027
kono
parents:
diff changeset
1028 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
kono
parents:
diff changeset
1029 if (r == CUDA_SUCCESS)
kono
parents:
diff changeset
1030 {
kono
parents:
diff changeset
1031 bool append_async = false;
kono
parents:
diff changeset
1032 CUevent *te;
kono
parents:
diff changeset
1033
kono
parents:
diff changeset
1034 te = e->evt;
kono
parents:
diff changeset
1035
kono
parents:
diff changeset
1036 switch (e->type)
kono
parents:
diff changeset
1037 {
kono
parents:
diff changeset
1038 case PTX_EVT_MEM:
kono
parents:
diff changeset
1039 case PTX_EVT_SYNC:
kono
parents:
diff changeset
1040 break;
kono
parents:
diff changeset
1041
kono
parents:
diff changeset
1042 case PTX_EVT_KNL:
kono
parents:
diff changeset
1043 map_pop (e->addr);
kono
parents:
diff changeset
1044 break;
kono
parents:
diff changeset
1045
kono
parents:
diff changeset
1046 case PTX_EVT_ASYNC_CLEANUP:
kono
parents:
diff changeset
1047 {
kono
parents:
diff changeset
1048 /* The function gomp_plugin_async_unmap_vars needs to claim the
kono
parents:
diff changeset
1049 memory-map splay tree lock for the current device, so we
kono
parents:
diff changeset
1050 can't call it when one of our callers has already claimed
kono
parents:
diff changeset
1051 the lock. In that case, just delay the GC for this event
kono
parents:
diff changeset
1052 until later. */
kono
parents:
diff changeset
1053 if (!memmap_lockable)
kono
parents:
diff changeset
1054 continue;
kono
parents:
diff changeset
1055
kono
parents:
diff changeset
1056 append_async = true;
kono
parents:
diff changeset
1057 }
kono
parents:
diff changeset
1058 break;
kono
parents:
diff changeset
1059 }
kono
parents:
diff changeset
1060
kono
parents:
diff changeset
1061 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
kono
parents:
diff changeset
1062 free ((void *)te);
kono
parents:
diff changeset
1063
kono
parents:
diff changeset
1064 /* Unlink 'e' from ptx_events list. */
kono
parents:
diff changeset
1065 if (ptx_events == e)
kono
parents:
diff changeset
1066 ptx_events = ptx_events->next;
kono
parents:
diff changeset
1067 else
kono
parents:
diff changeset
1068 {
kono
parents:
diff changeset
1069 struct ptx_event *e_ = ptx_events;
kono
parents:
diff changeset
1070 while (e_->next != e)
kono
parents:
diff changeset
1071 e_ = e_->next;
kono
parents:
diff changeset
1072 e_->next = e_->next->next;
kono
parents:
diff changeset
1073 }
kono
parents:
diff changeset
1074
kono
parents:
diff changeset
1075 if (append_async)
kono
parents:
diff changeset
1076 {
kono
parents:
diff changeset
1077 e->next = async_cleanups;
kono
parents:
diff changeset
1078 async_cleanups = e;
kono
parents:
diff changeset
1079 }
kono
parents:
diff changeset
1080 else
kono
parents:
diff changeset
1081 free (e);
kono
parents:
diff changeset
1082 }
kono
parents:
diff changeset
1083 }
kono
parents:
diff changeset
1084
kono
parents:
diff changeset
1085 pthread_mutex_unlock (&ptx_event_lock);
kono
parents:
diff changeset
1086
kono
parents:
diff changeset
1087 /* We have to do these here, after ptx_event_lock is released. */
kono
parents:
diff changeset
1088 while (async_cleanups)
kono
parents:
diff changeset
1089 {
kono
parents:
diff changeset
1090 struct ptx_event *e = async_cleanups;
kono
parents:
diff changeset
1091 async_cleanups = async_cleanups->next;
kono
parents:
diff changeset
1092
kono
parents:
diff changeset
1093 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
kono
parents:
diff changeset
1094 free (e);
kono
parents:
diff changeset
1095 }
kono
parents:
diff changeset
1096 }
kono
parents:
diff changeset
1097
kono
parents:
diff changeset
1098 static void
kono
parents:
diff changeset
1099 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
kono
parents:
diff changeset
1100 {
kono
parents:
diff changeset
1101 struct ptx_event *ptx_event;
kono
parents:
diff changeset
1102 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1103
kono
parents:
diff changeset
1104 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
kono
parents:
diff changeset
1105 || type == PTX_EVT_ASYNC_CLEANUP);
kono
parents:
diff changeset
1106
kono
parents:
diff changeset
1107 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
kono
parents:
diff changeset
1108 ptx_event->type = type;
kono
parents:
diff changeset
1109 ptx_event->evt = e;
kono
parents:
diff changeset
1110 ptx_event->addr = h;
kono
parents:
diff changeset
1111 ptx_event->ord = nvthd->ptx_dev->ord;
kono
parents:
diff changeset
1112 ptx_event->val = val;
kono
parents:
diff changeset
1113
kono
parents:
diff changeset
1114 pthread_mutex_lock (&ptx_event_lock);
kono
parents:
diff changeset
1115
kono
parents:
diff changeset
1116 ptx_event->next = ptx_events;
kono
parents:
diff changeset
1117 ptx_events = ptx_event;
kono
parents:
diff changeset
1118
kono
parents:
diff changeset
1119 pthread_mutex_unlock (&ptx_event_lock);
kono
parents:
diff changeset
1120 }
kono
parents:
diff changeset
1121
kono
parents:
diff changeset
1122 static void
kono
parents:
diff changeset
1123 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
kono
parents:
diff changeset
1124 int async, unsigned *dims, void *targ_mem_desc)
kono
parents:
diff changeset
1125 {
kono
parents:
diff changeset
1126 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
kono
parents:
diff changeset
1127 CUfunction function;
kono
parents:
diff changeset
1128 CUresult r;
kono
parents:
diff changeset
1129 int i;
kono
parents:
diff changeset
1130 struct ptx_stream *dev_str;
kono
parents:
diff changeset
1131 void *kargs[1];
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1132 void *hp;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1133 CUdeviceptr dp;
111
kono
parents:
diff changeset
1134 struct nvptx_thread *nvthd = nvptx_thread ();
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1135 int warp_size = nvthd->ptx_dev->warp_size;
111
kono
parents:
diff changeset
1136 const char *maybe_abort_msg = "(perhaps abort was called)";
kono
parents:
diff changeset
1137
kono
parents:
diff changeset
1138 function = targ_fn->fn;
kono
parents:
diff changeset
1139
kono
parents:
diff changeset
1140 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1141 assert (dev_str == nvthd->current_stream);
kono
parents:
diff changeset
1142
kono
parents:
diff changeset
1143 /* Initialize the launch dimensions. Typically this is constant,
kono
parents:
diff changeset
1144 provided by the device compiler, but we must permit runtime
kono
parents:
diff changeset
1145 values. */
kono
parents:
diff changeset
1146 int seen_zero = 0;
kono
parents:
diff changeset
1147 for (i = 0; i != GOMP_DIM_MAX; i++)
kono
parents:
diff changeset
1148 {
kono
parents:
diff changeset
1149 if (targ_fn->launch->dim[i])
kono
parents:
diff changeset
1150 dims[i] = targ_fn->launch->dim[i];
kono
parents:
diff changeset
1151 if (!dims[i])
kono
parents:
diff changeset
1152 seen_zero = 1;
kono
parents:
diff changeset
1153 }
kono
parents:
diff changeset
1154
kono
parents:
diff changeset
1155 if (seen_zero)
kono
parents:
diff changeset
1156 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1157 pthread_mutex_lock (&ptx_dev_lock);
111
kono
parents:
diff changeset
1158
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1159 static int gomp_openacc_dims[GOMP_DIM_MAX];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1160 if (!gomp_openacc_dims[0])
111
kono
parents:
diff changeset
1161 {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1162 /* See if the user provided GOMP_OPENACC_DIM environment
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1163 variable to specify runtime defaults. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1164 for (int i = 0; i < GOMP_DIM_MAX; ++i)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1165 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1166 }
111
kono
parents:
diff changeset
1167
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1168 if (!nvthd->ptx_dev->default_dims[0])
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1169 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1170 int default_dims[GOMP_DIM_MAX];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1171 for (int i = 0; i < GOMP_DIM_MAX; ++i)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1172 default_dims[i] = gomp_openacc_dims[i];
111
kono
parents:
diff changeset
1173
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1174 int gang, worker, vector;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1175 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1176 int block_size = nvthd->ptx_dev->max_threads_per_block;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1177 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1178 int dev_size = nvthd->ptx_dev->num_sms;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1179 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1180 " dev_size=%d, cpu_size=%d\n",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1181 warp_size, block_size, dev_size, cpu_size);
111
kono
parents:
diff changeset
1182
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1183 gang = (cpu_size / block_size) * dev_size;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1184 worker = block_size / warp_size;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1185 vector = warp_size;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1186 }
111
kono
parents:
diff changeset
1187
kono
parents:
diff changeset
1188 /* There is no upper bound on the gang size. The best size
kono
parents:
diff changeset
1189 matches the hardware configuration. Logical gangs are
kono
parents:
diff changeset
1190 scheduled onto physical hardware. To maximize usage, we
kono
parents:
diff changeset
1191 should guess a large number. */
kono
parents:
diff changeset
1192 if (default_dims[GOMP_DIM_GANG] < 1)
kono
parents:
diff changeset
1193 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
kono
parents:
diff changeset
1194 /* The worker size must not exceed the hardware. */
kono
parents:
diff changeset
1195 if (default_dims[GOMP_DIM_WORKER] < 1
kono
parents:
diff changeset
1196 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
kono
parents:
diff changeset
1197 default_dims[GOMP_DIM_WORKER] = worker;
kono
parents:
diff changeset
1198 /* The vector size must exactly match the hardware. */
kono
parents:
diff changeset
1199 if (default_dims[GOMP_DIM_VECTOR] < 1
kono
parents:
diff changeset
1200 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
kono
parents:
diff changeset
1201 default_dims[GOMP_DIM_VECTOR] = vector;
kono
parents:
diff changeset
1202
kono
parents:
diff changeset
1203 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
kono
parents:
diff changeset
1204 default_dims[GOMP_DIM_GANG],
kono
parents:
diff changeset
1205 default_dims[GOMP_DIM_WORKER],
kono
parents:
diff changeset
1206 default_dims[GOMP_DIM_VECTOR]);
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1207
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1208 for (i = 0; i != GOMP_DIM_MAX; i++)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1209 nvthd->ptx_dev->default_dims[i] = default_dims[i];
111
kono
parents:
diff changeset
1210 }
kono
parents:
diff changeset
1211 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1212
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1213 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1214 bool default_dim_p[GOMP_DIM_MAX];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1215 for (i = 0; i != GOMP_DIM_MAX; i++)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1216 default_dim_p[i] = !dims[i];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1217
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1218 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1219 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1220 for (i = 0; i != GOMP_DIM_MAX; i++)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1221 if (default_dim_p[i])
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1222 dims[i] = nvthd->ptx_dev->default_dims[i];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1223
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1224 if (default_dim_p[GOMP_DIM_VECTOR])
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1225 dims[GOMP_DIM_VECTOR]
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1226 = MIN (dims[GOMP_DIM_VECTOR],
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1227 (targ_fn->max_threads_per_block / warp_size
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1228 * warp_size));
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1229
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1230 if (default_dim_p[GOMP_DIM_WORKER])
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1231 dims[GOMP_DIM_WORKER]
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1232 = MIN (dims[GOMP_DIM_WORKER],
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1233 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1234 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1235 else
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1236 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1237 /* Handle the case that the compiler allows the runtime to choose
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1238 the vector-length conservatively, by ignoring
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1239 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1240 it. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1241 int vectors = 0;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1242 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1243 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1244 exceed targ_fn->max_threads_per_block. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1245 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1246 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1247 int grids, blocks;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1248
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1249 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1250 &blocks, function, NULL, 0,
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1251 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1252 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1253 "grid = %d, block = %d\n", grids, blocks);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1254
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1255 /* Keep the num_gangs proportional to the block size. In
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1256 the case were a block size is limited by shared-memory
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1257 or the register file capacity, the runtime will not
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1258 excessively over assign gangs to the multiprocessor
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1259 units if their state is going to be swapped out even
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1260 more than necessary. The constant factor 2 is there to
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1261 prevent threads from idling when there is insufficient
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1262 work for them. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1263 if (gangs == 0)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1264 gangs = 2 * grids * (blocks / warp_size);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1265
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1266 if (vectors == 0)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1267 vectors = warp_size;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1268
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1269 if (workers == 0)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1270 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1271 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1272 ? vectors
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1273 : dims[GOMP_DIM_VECTOR]);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1274 workers = blocks / actual_vectors;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1275 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1276
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1277 for (i = 0; i != GOMP_DIM_MAX; i++)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1278 if (default_dim_p[i])
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1279 switch (i)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1280 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1281 case GOMP_DIM_GANG: dims[i] = gangs; break;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1282 case GOMP_DIM_WORKER: dims[i] = workers; break;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1283 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1284 default: GOMP_PLUGIN_fatal ("invalid dim");
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1285 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1286 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1287 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1288 }
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1289
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1290 /* Check if the accelerator has sufficient hardware resources to
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1291 launch the offloaded kernel. */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1292 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1293 > targ_fn->max_threads_per_block)
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1294 {
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1295 int suggest_workers
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1296 = targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR];
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1297 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1298 " launch '%s' with num_workers = %d; recompile the"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1299 " program with 'num_workers = %d' on that offloaded"
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1300 " region or '-fopenacc-dim=:%d'",
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1301 targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1302 suggest_workers, suggest_workers);
111
kono
parents:
diff changeset
1303 }
kono
parents:
diff changeset
1304
kono
parents:
diff changeset
1305 /* This reserves a chunk of a pre-allocated page of memory mapped on both
kono
parents:
diff changeset
1306 the host and the device. HP is a host pointer to the new chunk, and DP is
kono
parents:
diff changeset
1307 the corresponding device pointer. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1308 pthread_mutex_lock (&ptx_event_lock);
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1309 dp = map_push (dev_str, mapnum * sizeof (void *));
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1310 pthread_mutex_unlock (&ptx_event_lock);
111
kono
parents:
diff changeset
1311
kono
parents:
diff changeset
1312 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
kono
parents:
diff changeset
1313
kono
parents:
diff changeset
1314 /* Copy the array of arguments to the mapped page. */
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1315 hp = alloca(sizeof(void *) * mapnum);
111
kono
parents:
diff changeset
1316 for (i = 0; i < mapnum; i++)
kono
parents:
diff changeset
1317 ((void **) hp)[i] = devaddrs[i];
kono
parents:
diff changeset
1318
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1319 /* Copy the (device) pointers to arguments to the device */
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
1320 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
111
kono
parents:
diff changeset
1321 mapnum * sizeof (void *));
kono
parents:
diff changeset
1322 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
kono
parents:
diff changeset
1323 " gangs=%u, workers=%u, vectors=%u\n",
kono
parents:
diff changeset
1324 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
kono
parents:
diff changeset
1325 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
kono
parents:
diff changeset
1326
kono
parents:
diff changeset
1327 // OpenACC CUDA
kono
parents:
diff changeset
1328 //
kono
parents:
diff changeset
1329 // num_gangs nctaid.x
kono
parents:
diff changeset
1330 // num_workers ntid.y
kono
parents:
diff changeset
1331 // vector length ntid.x
kono
parents:
diff changeset
1332
kono
parents:
diff changeset
1333 kargs[0] = &dp;
kono
parents:
diff changeset
1334 CUDA_CALL_ASSERT (cuLaunchKernel, function,
kono
parents:
diff changeset
1335 dims[GOMP_DIM_GANG], 1, 1,
kono
parents:
diff changeset
1336 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
kono
parents:
diff changeset
1337 0, dev_str->stream, kargs, 0);
kono
parents:
diff changeset
1338
kono
parents:
diff changeset
1339 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1340 if (async < acc_async_noval)
kono
parents:
diff changeset
1341 {
kono
parents:
diff changeset
1342 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
kono
parents:
diff changeset
1343 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
1344 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
1345 maybe_abort_msg);
kono
parents:
diff changeset
1346 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1347 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
kono
parents:
diff changeset
1348 }
kono
parents:
diff changeset
1349 else
kono
parents:
diff changeset
1350 {
kono
parents:
diff changeset
1351 CUevent *e;
kono
parents:
diff changeset
1352
kono
parents:
diff changeset
1353 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1354
kono
parents:
diff changeset
1355 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1356 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
1357 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
1358 maybe_abort_msg);
kono
parents:
diff changeset
1359 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1360 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
kono
parents:
diff changeset
1361
kono
parents:
diff changeset
1362 event_gc (true);
kono
parents:
diff changeset
1363
kono
parents:
diff changeset
1364 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
kono
parents:
diff changeset
1365
kono
parents:
diff changeset
1366 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
kono
parents:
diff changeset
1367 }
kono
parents:
diff changeset
1368 #else
kono
parents:
diff changeset
1369 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
kono
parents:
diff changeset
1370 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
1371 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
1372 maybe_abort_msg);
kono
parents:
diff changeset
1373 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1374 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
kono
parents:
diff changeset
1375 #endif
kono
parents:
diff changeset
1376
kono
parents:
diff changeset
1377 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
kono
parents:
diff changeset
1378 targ_fn->launch->fn);
kono
parents:
diff changeset
1379
kono
parents:
diff changeset
1380 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1381 if (async < acc_async_noval)
kono
parents:
diff changeset
1382 #endif
kono
parents:
diff changeset
1383 map_pop (dev_str);
kono
parents:
diff changeset
1384 }
kono
parents:
diff changeset
1385
kono
parents:
diff changeset
1386 void * openacc_get_current_cuda_context (void);
kono
parents:
diff changeset
1387
kono
parents:
diff changeset
1388 static void *
kono
parents:
diff changeset
1389 nvptx_alloc (size_t s)
kono
parents:
diff changeset
1390 {
kono
parents:
diff changeset
1391 CUdeviceptr d;
kono
parents:
diff changeset
1392
kono
parents:
diff changeset
1393 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
kono
parents:
diff changeset
1394 return (void *) d;
kono
parents:
diff changeset
1395 }
kono
parents:
diff changeset
1396
kono
parents:
diff changeset
1397 static bool
kono
parents:
diff changeset
1398 nvptx_free (void *p)
kono
parents:
diff changeset
1399 {
kono
parents:
diff changeset
1400 CUdeviceptr pb;
kono
parents:
diff changeset
1401 size_t ps;
kono
parents:
diff changeset
1402
kono
parents:
diff changeset
1403 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
kono
parents:
diff changeset
1404 if ((CUdeviceptr) p != pb)
kono
parents:
diff changeset
1405 {
kono
parents:
diff changeset
1406 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1407 return false;
kono
parents:
diff changeset
1408 }
kono
parents:
diff changeset
1409
kono
parents:
diff changeset
1410 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
kono
parents:
diff changeset
1411 return true;
kono
parents:
diff changeset
1412 }
kono
parents:
diff changeset
1413
kono
parents:
diff changeset
1414
kono
parents:
diff changeset
1415 static bool
kono
parents:
diff changeset
1416 nvptx_host2dev (void *d, const void *h, size_t s)
kono
parents:
diff changeset
1417 {
kono
parents:
diff changeset
1418 CUdeviceptr pb;
kono
parents:
diff changeset
1419 size_t ps;
kono
parents:
diff changeset
1420 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1421
kono
parents:
diff changeset
1422 if (!s)
kono
parents:
diff changeset
1423 return true;
kono
parents:
diff changeset
1424 if (!d)
kono
parents:
diff changeset
1425 {
kono
parents:
diff changeset
1426 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1427 return false;
kono
parents:
diff changeset
1428 }
kono
parents:
diff changeset
1429
kono
parents:
diff changeset
1430 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
kono
parents:
diff changeset
1431
kono
parents:
diff changeset
1432 if (!pb)
kono
parents:
diff changeset
1433 {
kono
parents:
diff changeset
1434 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1435 return false;
kono
parents:
diff changeset
1436 }
kono
parents:
diff changeset
1437 if (!h)
kono
parents:
diff changeset
1438 {
kono
parents:
diff changeset
1439 GOMP_PLUGIN_error ("invalid host address");
kono
parents:
diff changeset
1440 return false;
kono
parents:
diff changeset
1441 }
kono
parents:
diff changeset
1442 if (d == h)
kono
parents:
diff changeset
1443 {
kono
parents:
diff changeset
1444 GOMP_PLUGIN_error ("invalid host or device address");
kono
parents:
diff changeset
1445 return false;
kono
parents:
diff changeset
1446 }
kono
parents:
diff changeset
1447 if ((void *)(d + s) > (void *)(pb + ps))
kono
parents:
diff changeset
1448 {
kono
parents:
diff changeset
1449 GOMP_PLUGIN_error ("invalid size");
kono
parents:
diff changeset
1450 return false;
kono
parents:
diff changeset
1451 }
kono
parents:
diff changeset
1452
kono
parents:
diff changeset
1453 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1454 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
kono
parents:
diff changeset
1455 {
kono
parents:
diff changeset
1456 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1457 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1458 event_gc (false);
kono
parents:
diff changeset
1459 CUDA_CALL (cuMemcpyHtoDAsync,
kono
parents:
diff changeset
1460 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
kono
parents:
diff changeset
1461 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
kono
parents:
diff changeset
1462 event_add (PTX_EVT_MEM, e, (void *)h, 0);
kono
parents:
diff changeset
1463 }
kono
parents:
diff changeset
1464 else
kono
parents:
diff changeset
1465 #endif
kono
parents:
diff changeset
1466 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
kono
parents:
diff changeset
1467
kono
parents:
diff changeset
1468 return true;
kono
parents:
diff changeset
1469 }
kono
parents:
diff changeset
1470
kono
parents:
diff changeset
1471 static bool
kono
parents:
diff changeset
1472 nvptx_dev2host (void *h, const void *d, size_t s)
kono
parents:
diff changeset
1473 {
kono
parents:
diff changeset
1474 CUdeviceptr pb;
kono
parents:
diff changeset
1475 size_t ps;
kono
parents:
diff changeset
1476 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1477
kono
parents:
diff changeset
1478 if (!s)
kono
parents:
diff changeset
1479 return true;
kono
parents:
diff changeset
1480 if (!d)
kono
parents:
diff changeset
1481 {
kono
parents:
diff changeset
1482 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1483 return false;
kono
parents:
diff changeset
1484 }
kono
parents:
diff changeset
1485
kono
parents:
diff changeset
1486 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
kono
parents:
diff changeset
1487
kono
parents:
diff changeset
1488 if (!pb)
kono
parents:
diff changeset
1489 {
kono
parents:
diff changeset
1490 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1491 return false;
kono
parents:
diff changeset
1492 }
kono
parents:
diff changeset
1493 if (!h)
kono
parents:
diff changeset
1494 {
kono
parents:
diff changeset
1495 GOMP_PLUGIN_error ("invalid host address");
kono
parents:
diff changeset
1496 return false;
kono
parents:
diff changeset
1497 }
kono
parents:
diff changeset
1498 if (d == h)
kono
parents:
diff changeset
1499 {
kono
parents:
diff changeset
1500 GOMP_PLUGIN_error ("invalid host or device address");
kono
parents:
diff changeset
1501 return false;
kono
parents:
diff changeset
1502 }
kono
parents:
diff changeset
1503 if ((void *)(d + s) > (void *)(pb + ps))
kono
parents:
diff changeset
1504 {
kono
parents:
diff changeset
1505 GOMP_PLUGIN_error ("invalid size");
kono
parents:
diff changeset
1506 return false;
kono
parents:
diff changeset
1507 }
kono
parents:
diff changeset
1508
kono
parents:
diff changeset
1509 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1510 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
kono
parents:
diff changeset
1511 {
kono
parents:
diff changeset
1512 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1513 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1514 event_gc (false);
kono
parents:
diff changeset
1515 CUDA_CALL (cuMemcpyDtoHAsync,
kono
parents:
diff changeset
1516 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
kono
parents:
diff changeset
1517 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
kono
parents:
diff changeset
1518 event_add (PTX_EVT_MEM, e, (void *)h, 0);
kono
parents:
diff changeset
1519 }
kono
parents:
diff changeset
1520 else
kono
parents:
diff changeset
1521 #endif
kono
parents:
diff changeset
1522 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
kono
parents:
diff changeset
1523
kono
parents:
diff changeset
1524 return true;
kono
parents:
diff changeset
1525 }
kono
parents:
diff changeset
1526
kono
parents:
diff changeset
1527 static void
kono
parents:
diff changeset
1528 nvptx_set_async (int async)
kono
parents:
diff changeset
1529 {
kono
parents:
diff changeset
1530 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1531 nvthd->current_stream
kono
parents:
diff changeset
1532 = select_stream_for_async (async, pthread_self (), true, NULL);
kono
parents:
diff changeset
1533 }
kono
parents:
diff changeset
1534
kono
parents:
diff changeset
1535 static int
kono
parents:
diff changeset
1536 nvptx_async_test (int async)
kono
parents:
diff changeset
1537 {
kono
parents:
diff changeset
1538 CUresult r;
kono
parents:
diff changeset
1539 struct ptx_stream *s;
kono
parents:
diff changeset
1540
kono
parents:
diff changeset
1541 s = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1542
kono
parents:
diff changeset
1543 if (!s)
kono
parents:
diff changeset
1544 GOMP_PLUGIN_fatal ("unknown async %d", async);
kono
parents:
diff changeset
1545
kono
parents:
diff changeset
1546 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
kono
parents:
diff changeset
1547 if (r == CUDA_SUCCESS)
kono
parents:
diff changeset
1548 {
kono
parents:
diff changeset
1549 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
kono
parents:
diff changeset
1550 whether all work has completed on this stream, and if so omits the call
kono
parents:
diff changeset
1551 to the wait hook. If that happens, event_gc might not get called
kono
parents:
diff changeset
1552 (which prevents variables from getting unmapped and their associated
kono
parents:
diff changeset
1553 device storage freed), so call it here. */
kono
parents:
diff changeset
1554 event_gc (true);
kono
parents:
diff changeset
1555 return 1;
kono
parents:
diff changeset
1556 }
kono
parents:
diff changeset
1557 else if (r == CUDA_ERROR_NOT_READY)
kono
parents:
diff changeset
1558 return 0;
kono
parents:
diff changeset
1559
kono
parents:
diff changeset
1560 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
kono
parents:
diff changeset
1561
kono
parents:
diff changeset
1562 return 0;
kono
parents:
diff changeset
1563 }
kono
parents:
diff changeset
1564
kono
parents:
diff changeset
1565 static int
kono
parents:
diff changeset
1566 nvptx_async_test_all (void)
kono
parents:
diff changeset
1567 {
kono
parents:
diff changeset
1568 struct ptx_stream *s;
kono
parents:
diff changeset
1569 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1570 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1571
kono
parents:
diff changeset
1572 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1573
kono
parents:
diff changeset
1574 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
kono
parents:
diff changeset
1575 {
kono
parents:
diff changeset
1576 if ((s->multithreaded || pthread_equal (s->host_thread, self))
kono
parents:
diff changeset
1577 && CUDA_CALL_NOCHECK (cuStreamQuery,
kono
parents:
diff changeset
1578 s->stream) == CUDA_ERROR_NOT_READY)
kono
parents:
diff changeset
1579 {
kono
parents:
diff changeset
1580 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1581 return 0;
kono
parents:
diff changeset
1582 }
kono
parents:
diff changeset
1583 }
kono
parents:
diff changeset
1584
kono
parents:
diff changeset
1585 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1586
kono
parents:
diff changeset
1587 event_gc (true);
kono
parents:
diff changeset
1588
kono
parents:
diff changeset
1589 return 1;
kono
parents:
diff changeset
1590 }
kono
parents:
diff changeset
1591
kono
parents:
diff changeset
1592 static void
kono
parents:
diff changeset
1593 nvptx_wait (int async)
kono
parents:
diff changeset
1594 {
kono
parents:
diff changeset
1595 struct ptx_stream *s;
kono
parents:
diff changeset
1596
kono
parents:
diff changeset
1597 s = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1598 if (!s)
kono
parents:
diff changeset
1599 GOMP_PLUGIN_fatal ("unknown async %d", async);
kono
parents:
diff changeset
1600
kono
parents:
diff changeset
1601 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
kono
parents:
diff changeset
1602
kono
parents:
diff changeset
1603 event_gc (true);
kono
parents:
diff changeset
1604 }
kono
parents:
diff changeset
1605
kono
parents:
diff changeset
1606 static void
kono
parents:
diff changeset
1607 nvptx_wait_async (int async1, int async2)
kono
parents:
diff changeset
1608 {
kono
parents:
diff changeset
1609 CUevent *e;
kono
parents:
diff changeset
1610 struct ptx_stream *s1, *s2;
kono
parents:
diff changeset
1611 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1612
kono
parents:
diff changeset
1613 /* The stream that is waiting (rather than being waited for) doesn't
kono
parents:
diff changeset
1614 necessarily have to exist already. */
kono
parents:
diff changeset
1615 s2 = select_stream_for_async (async2, self, true, NULL);
kono
parents:
diff changeset
1616
kono
parents:
diff changeset
1617 s1 = select_stream_for_async (async1, self, false, NULL);
kono
parents:
diff changeset
1618 if (!s1)
kono
parents:
diff changeset
1619 GOMP_PLUGIN_fatal ("invalid async 1\n");
kono
parents:
diff changeset
1620
kono
parents:
diff changeset
1621 if (s1 == s2)
kono
parents:
diff changeset
1622 GOMP_PLUGIN_fatal ("identical parameters");
kono
parents:
diff changeset
1623
kono
parents:
diff changeset
1624 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1625
kono
parents:
diff changeset
1626 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1627
kono
parents:
diff changeset
1628 event_gc (true);
kono
parents:
diff changeset
1629
kono
parents:
diff changeset
1630 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
kono
parents:
diff changeset
1631
kono
parents:
diff changeset
1632 event_add (PTX_EVT_SYNC, e, NULL, 0);
kono
parents:
diff changeset
1633
kono
parents:
diff changeset
1634 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
kono
parents:
diff changeset
1635 }
kono
parents:
diff changeset
1636
kono
parents:
diff changeset
1637 static void
kono
parents:
diff changeset
1638 nvptx_wait_all (void)
kono
parents:
diff changeset
1639 {
kono
parents:
diff changeset
1640 CUresult r;
kono
parents:
diff changeset
1641 struct ptx_stream *s;
kono
parents:
diff changeset
1642 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1643 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1644
kono
parents:
diff changeset
1645 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1646
kono
parents:
diff changeset
1647 /* Wait for active streams initiated by this thread (or by multiple threads)
kono
parents:
diff changeset
1648 to complete. */
kono
parents:
diff changeset
1649 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
kono
parents:
diff changeset
1650 {
kono
parents:
diff changeset
1651 if (s->multithreaded || pthread_equal (s->host_thread, self))
kono
parents:
diff changeset
1652 {
kono
parents:
diff changeset
1653 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
kono
parents:
diff changeset
1654 if (r == CUDA_SUCCESS)
kono
parents:
diff changeset
1655 continue;
kono
parents:
diff changeset
1656 else if (r != CUDA_ERROR_NOT_READY)
kono
parents:
diff changeset
1657 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
kono
parents:
diff changeset
1658
kono
parents:
diff changeset
1659 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
kono
parents:
diff changeset
1660 }
kono
parents:
diff changeset
1661 }
kono
parents:
diff changeset
1662
kono
parents:
diff changeset
1663 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1664
kono
parents:
diff changeset
1665 event_gc (true);
kono
parents:
diff changeset
1666 }
kono
parents:
diff changeset
1667
kono
parents:
diff changeset
1668 static void
kono
parents:
diff changeset
1669 nvptx_wait_all_async (int async)
kono
parents:
diff changeset
1670 {
kono
parents:
diff changeset
1671 struct ptx_stream *waiting_stream, *other_stream;
kono
parents:
diff changeset
1672 CUevent *e;
kono
parents:
diff changeset
1673 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1674 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1675
kono
parents:
diff changeset
1676 /* The stream doing the waiting. This could be the first mention of the
kono
parents:
diff changeset
1677 stream, so create it if necessary. */
kono
parents:
diff changeset
1678 waiting_stream
kono
parents:
diff changeset
1679 = select_stream_for_async (async, pthread_self (), true, NULL);
kono
parents:
diff changeset
1680
kono
parents:
diff changeset
1681 /* Launches on the null stream already block on other streams in the
kono
parents:
diff changeset
1682 context. */
kono
parents:
diff changeset
1683 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
kono
parents:
diff changeset
1684 return;
kono
parents:
diff changeset
1685
kono
parents:
diff changeset
1686 event_gc (true);
kono
parents:
diff changeset
1687
kono
parents:
diff changeset
1688 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1689
kono
parents:
diff changeset
1690 for (other_stream = nvthd->ptx_dev->active_streams;
kono
parents:
diff changeset
1691 other_stream != NULL;
kono
parents:
diff changeset
1692 other_stream = other_stream->next)
kono
parents:
diff changeset
1693 {
kono
parents:
diff changeset
1694 if (!other_stream->multithreaded
kono
parents:
diff changeset
1695 && !pthread_equal (other_stream->host_thread, self))
kono
parents:
diff changeset
1696 continue;
kono
parents:
diff changeset
1697
kono
parents:
diff changeset
1698 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1699
kono
parents:
diff changeset
1700 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1701
kono
parents:
diff changeset
1702 /* Record an event on the waited-for stream. */
kono
parents:
diff changeset
1703 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
kono
parents:
diff changeset
1704
kono
parents:
diff changeset
1705 event_add (PTX_EVT_SYNC, e, NULL, 0);
kono
parents:
diff changeset
1706
kono
parents:
diff changeset
1707 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
kono
parents:
diff changeset
1708 }
kono
parents:
diff changeset
1709
kono
parents:
diff changeset
1710 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1711 }
kono
parents:
diff changeset
1712
kono
parents:
diff changeset
1713 static void *
kono
parents:
diff changeset
1714 nvptx_get_current_cuda_device (void)
kono
parents:
diff changeset
1715 {
kono
parents:
diff changeset
1716 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1717
kono
parents:
diff changeset
1718 if (!nvthd || !nvthd->ptx_dev)
kono
parents:
diff changeset
1719 return NULL;
kono
parents:
diff changeset
1720
kono
parents:
diff changeset
1721 return &nvthd->ptx_dev->dev;
kono
parents:
diff changeset
1722 }
kono
parents:
diff changeset
1723
kono
parents:
diff changeset
1724 static void *
kono
parents:
diff changeset
1725 nvptx_get_current_cuda_context (void)
kono
parents:
diff changeset
1726 {
kono
parents:
diff changeset
1727 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1728
kono
parents:
diff changeset
1729 if (!nvthd || !nvthd->ptx_dev)
kono
parents:
diff changeset
1730 return NULL;
kono
parents:
diff changeset
1731
kono
parents:
diff changeset
1732 return nvthd->ptx_dev->ctx;
kono
parents:
diff changeset
1733 }
kono
parents:
diff changeset
1734
kono
parents:
diff changeset
1735 static void *
kono
parents:
diff changeset
1736 nvptx_get_cuda_stream (int async)
kono
parents:
diff changeset
1737 {
kono
parents:
diff changeset
1738 struct ptx_stream *s;
kono
parents:
diff changeset
1739 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1740
kono
parents:
diff changeset
1741 if (!nvthd || !nvthd->ptx_dev)
kono
parents:
diff changeset
1742 return NULL;
kono
parents:
diff changeset
1743
kono
parents:
diff changeset
1744 s = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1745
kono
parents:
diff changeset
1746 return s ? s->stream : NULL;
kono
parents:
diff changeset
1747 }
kono
parents:
diff changeset
1748
kono
parents:
diff changeset
1749 static int
kono
parents:
diff changeset
1750 nvptx_set_cuda_stream (int async, void *stream)
kono
parents:
diff changeset
1751 {
kono
parents:
diff changeset
1752 struct ptx_stream *oldstream;
kono
parents:
diff changeset
1753 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1754 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1755
kono
parents:
diff changeset
1756 if (async < 0)
kono
parents:
diff changeset
1757 GOMP_PLUGIN_fatal ("bad async %d", async);
kono
parents:
diff changeset
1758
kono
parents:
diff changeset
1759 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1760
kono
parents:
diff changeset
1761 /* We have a list of active streams and an array mapping async values to
kono
parents:
diff changeset
1762 entries of that list. We need to take "ownership" of the passed-in stream,
kono
parents:
diff changeset
1763 and add it to our list, removing the previous entry also (if there was one)
kono
parents:
diff changeset
1764 in order to prevent resource leaks. Note the potential for surprise
kono
parents:
diff changeset
1765 here: maybe we should keep track of passed-in streams and leave it up to
kono
parents:
diff changeset
1766 the user to tidy those up, but that doesn't work for stream handles
kono
parents:
diff changeset
1767 returned from acc_get_cuda_stream above... */
kono
parents:
diff changeset
1768
kono
parents:
diff changeset
1769 oldstream = select_stream_for_async (async, self, false, NULL);
kono
parents:
diff changeset
1770
kono
parents:
diff changeset
1771 if (oldstream)
kono
parents:
diff changeset
1772 {
kono
parents:
diff changeset
1773 if (nvthd->ptx_dev->active_streams == oldstream)
kono
parents:
diff changeset
1774 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
kono
parents:
diff changeset
1775 else
kono
parents:
diff changeset
1776 {
kono
parents:
diff changeset
1777 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
kono
parents:
diff changeset
1778 while (s->next != oldstream)
kono
parents:
diff changeset
1779 s = s->next;
kono
parents:
diff changeset
1780 s->next = s->next->next;
kono
parents:
diff changeset
1781 }
kono
parents:
diff changeset
1782
kono
parents:
diff changeset
1783 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
kono
parents:
diff changeset
1784
kono
parents:
diff changeset
1785 if (!map_fini (oldstream))
kono
parents:
diff changeset
1786 GOMP_PLUGIN_fatal ("error when freeing host memory");
kono
parents:
diff changeset
1787
kono
parents:
diff changeset
1788 free (oldstream);
kono
parents:
diff changeset
1789 }
kono
parents:
diff changeset
1790
kono
parents:
diff changeset
1791 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1792
kono
parents:
diff changeset
1793 (void) select_stream_for_async (async, self, true, (CUstream) stream);
kono
parents:
diff changeset
1794
kono
parents:
diff changeset
1795 return 1;
kono
parents:
diff changeset
1796 }
kono
parents:
diff changeset
1797
kono
parents:
diff changeset
1798 /* Plugin entry points. */
kono
parents:
diff changeset
1799
kono
parents:
diff changeset
/* Return the name of this plugin, the constant string "nvptx".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char plugin_name[] = "nvptx";

  return plugin_name;
}
kono
parents:
diff changeset
1805
kono
parents:
diff changeset
1806 unsigned int
kono
parents:
diff changeset
1807 GOMP_OFFLOAD_get_caps (void)
kono
parents:
diff changeset
1808 {
kono
parents:
diff changeset
1809 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
kono
parents:
diff changeset
1810 }
kono
parents:
diff changeset
1811
kono
parents:
diff changeset
1812 int
kono
parents:
diff changeset
1813 GOMP_OFFLOAD_get_type (void)
kono
parents:
diff changeset
1814 {
kono
parents:
diff changeset
1815 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
kono
parents:
diff changeset
1816 }
kono
parents:
diff changeset
1817
kono
parents:
diff changeset
/* Return the device count reported by nvptx_get_num_devices.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  int count = nvptx_get_num_devices ();

  return count;
}
kono
parents:
diff changeset
1823
kono
parents:
diff changeset
1824 bool
kono
parents:
diff changeset
1825 GOMP_OFFLOAD_init_device (int n)
kono
parents:
diff changeset
1826 {
kono
parents:
diff changeset
1827 struct ptx_device *dev;
kono
parents:
diff changeset
1828
kono
parents:
diff changeset
1829 pthread_mutex_lock (&ptx_dev_lock);
kono
parents:
diff changeset
1830
kono
parents:
diff changeset
1831 if (!nvptx_init () || ptx_devices[n] != NULL)
kono
parents:
diff changeset
1832 {
kono
parents:
diff changeset
1833 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1834 return false;
kono
parents:
diff changeset
1835 }
kono
parents:
diff changeset
1836
kono
parents:
diff changeset
1837 dev = nvptx_open_device (n);
kono
parents:
diff changeset
1838 if (dev)
kono
parents:
diff changeset
1839 {
kono
parents:
diff changeset
1840 ptx_devices[n] = dev;
kono
parents:
diff changeset
1841 instantiated_devices++;
kono
parents:
diff changeset
1842 }
kono
parents:
diff changeset
1843
kono
parents:
diff changeset
1844 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1845
kono
parents:
diff changeset
1846 return dev != NULL;
kono
parents:
diff changeset
1847 }
kono
parents:
diff changeset
1848
kono
parents:
diff changeset
1849 bool
kono
parents:
diff changeset
1850 GOMP_OFFLOAD_fini_device (int n)
kono
parents:
diff changeset
1851 {
kono
parents:
diff changeset
1852 pthread_mutex_lock (&ptx_dev_lock);
kono
parents:
diff changeset
1853
kono
parents:
diff changeset
1854 if (ptx_devices[n] != NULL)
kono
parents:
diff changeset
1855 {
kono
parents:
diff changeset
1856 if (!nvptx_attach_host_thread_to_device (n)
kono
parents:
diff changeset
1857 || !nvptx_close_device (ptx_devices[n]))
kono
parents:
diff changeset
1858 {
kono
parents:
diff changeset
1859 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1860 return false;
kono
parents:
diff changeset
1861 }
kono
parents:
diff changeset
1862 ptx_devices[n] = NULL;
kono
parents:
diff changeset
1863 instantiated_devices--;
kono
parents:
diff changeset
1864 }
kono
parents:
diff changeset
1865
kono
parents:
diff changeset
1866 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1867 return true;
kono
parents:
diff changeset
1868 }
kono
parents:
diff changeset
1869
kono
parents:
diff changeset
1870 /* Return the libgomp version number we're compatible with. There is
kono
parents:
diff changeset
1871 no requirement for cross-version compatibility. */
kono
parents:
diff changeset
1872
kono
parents:
diff changeset
1873 unsigned
kono
parents:
diff changeset
1874 GOMP_OFFLOAD_version (void)
kono
parents:
diff changeset
1875 {
kono
parents:
diff changeset
1876 return GOMP_VERSION;
kono
parents:
diff changeset
1877 }
kono
parents:
diff changeset
1878
kono
parents:
diff changeset
1879 /* Initialize __nvptx_clocktick, if present in MODULE. */
kono
parents:
diff changeset
1880
kono
parents:
diff changeset
1881 static void
kono
parents:
diff changeset
1882 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
kono
parents:
diff changeset
1883 {
kono
parents:
diff changeset
1884 CUdeviceptr dptr;
kono
parents:
diff changeset
1885 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
kono
parents:
diff changeset
1886 module, "__nvptx_clocktick");
kono
parents:
diff changeset
1887 if (r == CUDA_ERROR_NOT_FOUND)
kono
parents:
diff changeset
1888 return;
kono
parents:
diff changeset
1889 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1890 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
kono
parents:
diff changeset
1891 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
kono
parents:
diff changeset
1892 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
kono
parents:
diff changeset
1893 sizeof (__nvptx_clocktick));
kono
parents:
diff changeset
1894 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1895 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
kono
parents:
diff changeset
1896 }
kono
parents:
diff changeset
1897
kono
parents:
diff changeset
1898 /* Load the (partial) program described by TARGET_DATA to device
kono
parents:
diff changeset
1899 number ORD. Allocate and return TARGET_TABLE. */
kono
parents:
diff changeset
1900
kono
parents:
diff changeset
1901 int
kono
parents:
diff changeset
1902 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
kono
parents:
diff changeset
1903 struct addr_pair **target_table)
kono
parents:
diff changeset
1904 {
kono
parents:
diff changeset
1905 CUmodule module;
kono
parents:
diff changeset
1906 const char *const *var_names;
kono
parents:
diff changeset
1907 const struct targ_fn_launch *fn_descs;
kono
parents:
diff changeset
1908 unsigned int fn_entries, var_entries, i, j;
kono
parents:
diff changeset
1909 struct targ_fn_descriptor *targ_fns;
kono
parents:
diff changeset
1910 struct addr_pair *targ_tbl;
kono
parents:
diff changeset
1911 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
kono
parents:
diff changeset
1912 struct ptx_image_data *new_image;
kono
parents:
diff changeset
1913 struct ptx_device *dev;
kono
parents:
diff changeset
1914
kono
parents:
diff changeset
1915 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
kono
parents:
diff changeset
1916 {
kono
parents:
diff changeset
1917 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
kono
parents:
diff changeset
1918 " (expected %u, received %u)",
kono
parents:
diff changeset
1919 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
kono
parents:
diff changeset
1920 return -1;
kono
parents:
diff changeset
1921 }
kono
parents:
diff changeset
1922
kono
parents:
diff changeset
1923 if (!nvptx_attach_host_thread_to_device (ord)
kono
parents:
diff changeset
1924 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
kono
parents:
diff changeset
1925 return -1;
kono
parents:
diff changeset
1926
kono
parents:
diff changeset
1927 dev = ptx_devices[ord];
kono
parents:
diff changeset
1928
kono
parents:
diff changeset
1929 /* The mkoffload utility emits a struct of pointers/integers at the
kono
parents:
diff changeset
1930 start of each offload image. The array of kernel names and the
kono
parents:
diff changeset
1931 functions addresses form a one-to-one correspondence. */
kono
parents:
diff changeset
1932
kono
parents:
diff changeset
1933 var_entries = img_header->var_num;
kono
parents:
diff changeset
1934 var_names = img_header->var_names;
kono
parents:
diff changeset
1935 fn_entries = img_header->fn_num;
kono
parents:
diff changeset
1936 fn_descs = img_header->fn_descs;
kono
parents:
diff changeset
1937
kono
parents:
diff changeset
1938 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
kono
parents:
diff changeset
1939 * (fn_entries + var_entries));
kono
parents:
diff changeset
1940 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
kono
parents:
diff changeset
1941 * fn_entries);
kono
parents:
diff changeset
1942
kono
parents:
diff changeset
1943 *target_table = targ_tbl;
kono
parents:
diff changeset
1944
kono
parents:
diff changeset
1945 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
kono
parents:
diff changeset
1946 new_image->target_data = target_data;
kono
parents:
diff changeset
1947 new_image->module = module;
kono
parents:
diff changeset
1948 new_image->fns = targ_fns;
kono
parents:
diff changeset
1949
kono
parents:
diff changeset
1950 pthread_mutex_lock (&dev->image_lock);
kono
parents:
diff changeset
1951 new_image->next = dev->images;
kono
parents:
diff changeset
1952 dev->images = new_image;
kono
parents:
diff changeset
1953 pthread_mutex_unlock (&dev->image_lock);
kono
parents:
diff changeset
1954
kono
parents:
diff changeset
1955 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
kono
parents:
diff changeset
1956 {
kono
parents:
diff changeset
1957 CUfunction function;
kono
parents:
diff changeset
1958 int nregs, mthrs;
kono
parents:
diff changeset
1959
kono
parents:
diff changeset
1960 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
kono
parents:
diff changeset
1961 fn_descs[i].fn);
kono
parents:
diff changeset
1962 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
kono
parents:
diff changeset
1963 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
kono
parents:
diff changeset
1964 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
kono
parents:
diff changeset
1965 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
kono
parents:
diff changeset
1966
kono
parents:
diff changeset
1967 targ_fns->fn = function;
kono
parents:
diff changeset
1968 targ_fns->launch = &fn_descs[i];
kono
parents:
diff changeset
1969 targ_fns->regs_per_thread = nregs;
kono
parents:
diff changeset
1970 targ_fns->max_threads_per_block = mthrs;
kono
parents:
diff changeset
1971
kono
parents:
diff changeset
1972 targ_tbl->start = (uintptr_t) targ_fns;
kono
parents:
diff changeset
1973 targ_tbl->end = targ_tbl->start + 1;
kono
parents:
diff changeset
1974 }
kono
parents:
diff changeset
1975
kono
parents:
diff changeset
1976 for (j = 0; j < var_entries; j++, targ_tbl++)
kono
parents:
diff changeset
1977 {
kono
parents:
diff changeset
1978 CUdeviceptr var;
kono
parents:
diff changeset
1979 size_t bytes;
kono
parents:
diff changeset
1980
kono
parents:
diff changeset
1981 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
kono
parents:
diff changeset
1982 &var, &bytes, module, var_names[j]);
kono
parents:
diff changeset
1983
kono
parents:
diff changeset
1984 targ_tbl->start = (uintptr_t) var;
kono
parents:
diff changeset
1985 targ_tbl->end = targ_tbl->start + bytes;
kono
parents:
diff changeset
1986 }
kono
parents:
diff changeset
1987
kono
parents:
diff changeset
1988 nvptx_set_clocktick (module, dev);
kono
parents:
diff changeset
1989
kono
parents:
diff changeset
1990 return fn_entries + var_entries;
kono
parents:
diff changeset
1991 }
kono
parents:
diff changeset
1992
kono
parents:
diff changeset
1993 /* Unload the program described by TARGET_DATA. DEV_DATA is the
kono
parents:
diff changeset
1994 function descriptors allocated by G_O_load_image. */
kono
parents:
diff changeset
1995
kono
parents:
diff changeset
1996 bool
kono
parents:
diff changeset
1997 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
kono
parents:
diff changeset
1998 {
kono
parents:
diff changeset
1999 struct ptx_image_data *image, **prev_p;
kono
parents:
diff changeset
2000 struct ptx_device *dev = ptx_devices[ord];
kono
parents:
diff changeset
2001
kono
parents:
diff changeset
2002 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
kono
parents:
diff changeset
2003 {
kono
parents:
diff changeset
2004 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
kono
parents:
diff changeset
2005 " (expected %u, received %u)",
kono
parents:
diff changeset
2006 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
kono
parents:
diff changeset
2007 return false;
kono
parents:
diff changeset
2008 }
kono
parents:
diff changeset
2009
kono
parents:
diff changeset
2010 bool ret = true;
kono
parents:
diff changeset
2011 pthread_mutex_lock (&dev->image_lock);
kono
parents:
diff changeset
2012 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
kono
parents:
diff changeset
2013 if (image->target_data == target_data)
kono
parents:
diff changeset
2014 {
kono
parents:
diff changeset
2015 *prev_p = image->next;
kono
parents:
diff changeset
2016 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
kono
parents:
diff changeset
2017 ret = false;
kono
parents:
diff changeset
2018 free (image->fns);
kono
parents:
diff changeset
2019 free (image);
kono
parents:
diff changeset
2020 break;
kono
parents:
diff changeset
2021 }
kono
parents:
diff changeset
2022 pthread_mutex_unlock (&dev->image_lock);
kono
parents:
diff changeset
2023 return ret;
kono
parents:
diff changeset
2024 }
kono
parents:
diff changeset
2025
kono
parents:
diff changeset
/* Allocate SIZE bytes on device number ORD through nvptx_alloc.  Returns
   NULL when the host thread cannot be attached to the device.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  void *mem = NULL;

  if (nvptx_attach_host_thread_to_device (ord))
    mem = nvptx_alloc (size);

  return mem;
}
kono
parents:
diff changeset
2033
kono
parents:
diff changeset
/* Free PTR on device number ORD through nvptx_free.  Returns false when the
   host thread cannot be attached to the device or when nvptx_free fails.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_free (ptr);
}
kono
parents:
diff changeset
2040
kono
parents:
diff changeset
/* Copy N bytes from device address SRC on device number ORD to host address
   DST.  Returns false when the host thread cannot be attached to the device
   or when nvptx_dev2host fails.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_dev2host (dst, src, n);
}
kono
parents:
diff changeset
2047
kono
parents:
diff changeset
/* Copy N bytes from host address SRC to device address DST on device number
   ORD.  Returns false when the host thread cannot be attached to the device
   or when nvptx_host2dev fails.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_host2dev (dst, src, n);
}
kono
parents:
diff changeset
2054
kono
parents:
diff changeset
2055 bool
kono
parents:
diff changeset
2056 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
kono
parents:
diff changeset
2057 {
kono
parents:
diff changeset
2058 struct ptx_device *ptx_dev = ptx_devices[ord];
kono
parents:
diff changeset
2059 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
kono
parents:
diff changeset
2060 ptx_dev->null_stream->stream);
kono
parents:
diff changeset
2061 return true;
kono
parents:
diff changeset
2062 }
kono
parents:
diff changeset
2063
kono
parents:
diff changeset
/* Globally visible function pointer taking (N, FN_PTR, VARS); starts out
   NULL.  NOTE(review): nothing in the neighbouring entry points reads or
   assigns it -- confirm whether it is still needed.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
kono
parents:
diff changeset
2065
kono
parents:
diff changeset
/* OpenACC kernel launch entry point.  Every argument is forwarded
   unchanged, and in the same order, to nvptx_exec.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn,
	      mapnum,
	      hostaddrs,
	      devaddrs,
	      async,
	      dims,
	      targ_mem_desc);
}
kono
parents:
diff changeset
2073
kono
parents:
diff changeset
2074 void
kono
parents:
diff changeset
2075 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
kono
parents:
diff changeset
2076 {
kono
parents:
diff changeset
2077 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
2078 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
2079
kono
parents:
diff changeset
2080 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
2081 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
kono
parents:
diff changeset
2082 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
kono
parents:
diff changeset
2083 }
kono
parents:
diff changeset
2084
kono
parents:
diff changeset
/* Return the result of nvptx_async_test for the queue ASYNC.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  int status = nvptx_async_test (async);

  return status;
}
kono
parents:
diff changeset
2090
kono
parents:
diff changeset
/* Return the result of nvptx_async_test_all.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  int status = nvptx_async_test_all ();

  return status;
}
kono
parents:
diff changeset
2096
kono
parents:
diff changeset
/* Forward a wait request for the queue ASYNC to nvptx_wait.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
  return;
}
kono
parents:
diff changeset
2102
kono
parents:
diff changeset
/* Forward the pair (ASYNC1, ASYNC2) to nvptx_wait_async, in that
   order.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
  return;
}
kono
parents:
diff changeset
2108
kono
parents:
diff changeset
/* Forward a wait-for-everything request to nvptx_wait_all.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
  return;
}
kono
parents:
diff changeset
2114
kono
parents:
diff changeset
/* Forward ASYNC to nvptx_wait_all_async.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
  return;
}
kono
parents:
diff changeset
2120
kono
parents:
diff changeset
/* Forward ASYNC to nvptx_set_async.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
  return;
}
kono
parents:
diff changeset
2126
kono
parents:
diff changeset
2127 void *
kono
parents:
diff changeset
2128 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
kono
parents:
diff changeset
2129 {
kono
parents:
diff changeset
2130 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
2131 struct nvptx_thread *nvthd
kono
parents:
diff changeset
2132 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
kono
parents:
diff changeset
2133 CUcontext thd_ctx;
kono
parents:
diff changeset
2134
kono
parents:
diff changeset
2135 ptx_dev = ptx_devices[ord];
kono
parents:
diff changeset
2136
kono
parents:
diff changeset
2137 assert (ptx_dev);
kono
parents:
diff changeset
2138
kono
parents:
diff changeset
2139 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
kono
parents:
diff changeset
2140
kono
parents:
diff changeset
2141 assert (ptx_dev->ctx);
kono
parents:
diff changeset
2142
kono
parents:
diff changeset
2143 if (!thd_ctx)
kono
parents:
diff changeset
2144 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
kono
parents:
diff changeset
2145
kono
parents:
diff changeset
2146 nvthd->current_stream = ptx_dev->null_stream;
kono
parents:
diff changeset
2147 nvthd->ptx_dev = ptx_dev;
kono
parents:
diff changeset
2148
kono
parents:
diff changeset
2149 return (void *) nvthd;
kono
parents:
diff changeset
2150 }
kono
parents:
diff changeset
2151
kono
parents:
diff changeset
/* Release the per-thread state DATA with free.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
  return;
}
kono
parents:
diff changeset
2157
kono
parents:
diff changeset
/* Return whatever nvptx_get_current_cuda_device reports.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  void *device = nvptx_get_current_cuda_device ();

  return device;
}
kono
parents:
diff changeset
2163
kono
parents:
diff changeset
/* Return whatever nvptx_get_current_cuda_context reports.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  void *context = nvptx_get_current_cuda_context ();

  return context;
}
kono
parents:
diff changeset
2169
kono
parents:
diff changeset
/* Return the result of nvptx_get_cuda_stream for the queue ASYNC.
   NOTE: The value handed back is a CUstream, not a ptx_stream
   pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  void *cuda_stream = nvptx_get_cuda_stream (async);

  return cuda_stream;
}
kono
parents:
diff changeset
2177
kono
parents:
diff changeset
/* Pass ASYNC and STREAM to nvptx_set_cuda_stream and return its result.
   NOTE: STREAM is expected to be a CUstream, not a ptx_stream
   pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  int status = nvptx_set_cuda_stream (async, stream);

  return status;
}
kono
parents:
diff changeset
2185
kono
parents:
diff changeset
2186 /* Adjust launch dimensions: pick good values for number of blocks and warps
kono
parents:
diff changeset
2187 and ensure that number of warps does not exceed CUDA limits as well as GCC's
kono
parents:
diff changeset
2188 own limits. */
kono
parents:
diff changeset
2189
kono
parents:
diff changeset
2190 static void
kono
parents:
diff changeset
2191 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
kono
parents:
diff changeset
2192 struct ptx_device *ptx_dev,
kono
parents:
diff changeset
2193 int *teams_p, int *threads_p)
kono
parents:
diff changeset
2194 {
kono
parents:
diff changeset
2195 int max_warps_block = fn->max_threads_per_block / 32;
kono
parents:
diff changeset
2196 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
kono
parents:
diff changeset
2197 and libgcc, which matches documented limit of all GPUs as of 2015. */
kono
parents:
diff changeset
2198 if (max_warps_block > 32)
kono
parents:
diff changeset
2199 max_warps_block = 32;
kono
parents:
diff changeset
2200 if (*threads_p <= 0)
kono
parents:
diff changeset
2201 *threads_p = 8;
kono
parents:
diff changeset
2202 if (*threads_p > max_warps_block)
kono
parents:
diff changeset
2203 *threads_p = max_warps_block;
kono
parents:
diff changeset
2204
kono
parents:
diff changeset
2205 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
kono
parents:
diff changeset
2206 /* This is an estimate of how many blocks the device can host simultaneously.
kono
parents:
diff changeset
2207 Actual limit, which may be lower, can be queried with "occupancy control"
kono
parents:
diff changeset
2208 driver interface (since CUDA 6.0). */
kono
parents:
diff changeset
2209 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
kono
parents:
diff changeset
2210 if (*teams_p <= 0 || *teams_p > max_blocks)
kono
parents:
diff changeset
2211 *teams_p = max_blocks;
kono
parents:
diff changeset
2212 }
kono
parents:
diff changeset
2213
kono
parents:
diff changeset
/* Return the size in bytes of each per-warp stack (see gcc -msoft-stack)
   used for OpenMP target regions.  Currently a fixed 128 KiB.  */

static size_t
nvptx_stacks_size ()
{
  enum { bytes_per_kib = 1024, stack_kib = 128 };

  return stack_kib * bytes_per_kib;
}
kono
parents:
diff changeset
2222
kono
parents:
diff changeset
2223 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
kono
parents:
diff changeset
2224
kono
parents:
diff changeset
2225 static void *
kono
parents:
diff changeset
2226 nvptx_stacks_alloc (size_t size, int num)
kono
parents:
diff changeset
2227 {
kono
parents:
diff changeset
2228 CUdeviceptr stacks;
kono
parents:
diff changeset
2229 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
kono
parents:
diff changeset
2230 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2231 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
kono
parents:
diff changeset
2232 return (void *) stacks;
kono
parents:
diff changeset
2233 }
kono
parents:
diff changeset
2234
kono
parents:
diff changeset
2235 /* Release storage previously allocated by nvptx_stacks_alloc. */
kono
parents:
diff changeset
2236
kono
parents:
diff changeset
2237 static void
kono
parents:
diff changeset
2238 nvptx_stacks_free (void *p, int num)
kono
parents:
diff changeset
2239 {
kono
parents:
diff changeset
2240 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
kono
parents:
diff changeset
2241 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2242 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
kono
parents:
diff changeset
2243 }
kono
parents:
diff changeset
2244
kono
parents:
diff changeset
2245 void
kono
parents:
diff changeset
2246 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
kono
parents:
diff changeset
2247 {
kono
parents:
diff changeset
2248 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
kono
parents:
diff changeset
2249 CUresult r;
kono
parents:
diff changeset
2250 struct ptx_device *ptx_dev = ptx_devices[ord];
kono
parents:
diff changeset
2251 const char *maybe_abort_msg = "(perhaps abort was called)";
kono
parents:
diff changeset
2252 int teams = 0, threads = 0;
kono
parents:
diff changeset
2253
kono
parents:
diff changeset
2254 if (!args)
kono
parents:
diff changeset
2255 GOMP_PLUGIN_fatal ("No target arguments provided");
kono
parents:
diff changeset
2256 while (*args)
kono
parents:
diff changeset
2257 {
kono
parents:
diff changeset
2258 intptr_t id = (intptr_t) *args++, val;
kono
parents:
diff changeset
2259 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
kono
parents:
diff changeset
2260 val = (intptr_t) *args++;
kono
parents:
diff changeset
2261 else
kono
parents:
diff changeset
2262 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
kono
parents:
diff changeset
2263 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
kono
parents:
diff changeset
2264 continue;
kono
parents:
diff changeset
2265 val = val > INT_MAX ? INT_MAX : val;
kono
parents:
diff changeset
2266 id &= GOMP_TARGET_ARG_ID_MASK;
kono
parents:
diff changeset
2267 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
kono
parents:
diff changeset
2268 teams = val;
kono
parents:
diff changeset
2269 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
kono
parents:
diff changeset
2270 threads = val;
kono
parents:
diff changeset
2271 }
kono
parents:
diff changeset
2272 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
kono
parents:
diff changeset
2273
kono
parents:
diff changeset
2274 size_t stack_size = nvptx_stacks_size ();
kono
parents:
diff changeset
2275 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
kono
parents:
diff changeset
2276 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
kono
parents:
diff changeset
2277 size_t fn_args_size = sizeof fn_args;
kono
parents:
diff changeset
2278 void *config[] = {
kono
parents:
diff changeset
2279 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
kono
parents:
diff changeset
2280 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
kono
parents:
diff changeset
2281 CU_LAUNCH_PARAM_END
kono
parents:
diff changeset
2282 };
kono
parents:
diff changeset
2283 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
kono
parents:
diff changeset
2284 32, threads, 1, 0, ptx_dev->null_stream->stream,
kono
parents:
diff changeset
2285 NULL, config);
kono
parents:
diff changeset
2286 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2287 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
kono
parents:
diff changeset
2288
kono
parents:
diff changeset
2289 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
kono
parents:
diff changeset
2290 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
2291 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
2292 maybe_abort_msg);
kono
parents:
diff changeset
2293 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2294 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
kono
parents:
diff changeset
2295 nvptx_stacks_free (stacks, teams * threads);
kono
parents:
diff changeset
2296 }
kono
parents:
diff changeset
2297
kono
parents:
diff changeset
/* Asynchronous counterpart of GOMP_OFFLOAD_run.  Not implemented: every
   call ends in GOMP_PLUGIN_fatal and none of the arguments are
   inspected.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  static const char unimplemented[] = "GOMP_OFFLOAD_async_run unimplemented";

  GOMP_PLUGIN_fatal (unimplemented);
}