/* Plugin for NVPTX execution.

   Copyright (C) 2013-2018 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif

#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize (int *, int *, CUfunction,
					   CUoccupancyB2DSize, size_t, int);
#endif

#define DO_PRAGMA(x) _Pragma (#x)

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
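
/* Whichever branch above is used, every driver entry point listed in
   cuda-lib.def is afterwards reachable as CUDA_CALL_PREFIX <name>:
   either through the cuda_lib function-pointer table filled in via
   dlopen/dlsym, or by linking directly against libcuda, with the
   CUDA_ONE_CALL_MAYBE_NULL entries declared as weak symbols so that
   their availability can still be tested at run time.  */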

#include "secure_getenv.h"

#undef MIN
#undef MAX
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN
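
/* For illustration, a call such as

     CUDA_CALL (cuMemAlloc, &d, s);

   expands (inside a do { ... } while (0) statement) to roughly

     unsigned __r = CUDA_CALL_PREFIX cuMemAlloc (&d, s);
     if (__r != CUDA_SUCCESS)
       {
	 GOMP_PLUGIN_error ("cuMemAlloc error: %s", cuda_error (__r));
	 return false;
       }

   so CUDA_CALL only suits functions returning bool; use CUDA_CALL_ERET
   to return a different error value, CUDA_CALL_ASSERT where failure is
   fatal, and CUDA_CALL_NOCHECK where the caller inspects the CUresult
   itself.  */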

static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    return desc;

  return fallback;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct cuda_map
{
  CUdeviceptr d;
  size_t size;
  bool active;
  struct cuda_map *next;
};

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;
  struct cuda_map *map;
  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

static struct cuda_map *
cuda_map_create (size_t size)
{
  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));

  assert (map);

  map->next = NULL;
  map->size = size;
  map->active = false;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
  assert (map->d);

  return map;
}

static void
cuda_map_destroy (struct cuda_map *map)
{
  CUDA_CALL_ASSERT (cuMemFree, map->d);
  free (map);
}

/* The following map_* routines manage the CUDA device memory that
   contains the data mapping arguments for cuLaunchKernel.  Each
   asynchronous PTX stream may have multiple pending kernel
   invocations, which are launched in FIFO order.  As such, the map
   routines maintain a queue of cuLaunchKernel arguments.

   Calls to map_push and map_pop must be guarded by ptx_event_lock.
   Likewise, calls to map_init and map_fini are guarded by
   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
   GOMP_OFFLOAD_fini_device, respectively.  */

static bool
map_init (struct ptx_stream *s)
{
  int size = getpagesize ();

  assert (s);

  s->map = cuda_map_create (size);

  return true;
}

static bool
map_fini (struct ptx_stream *s)
{
  assert (s->map->next == NULL);
  assert (!s->map->active);

  cuda_map_destroy (s->map);

  return true;
}

static void
map_pop (struct ptx_stream *s)
{
  struct cuda_map *next;

  assert (s != NULL);

  if (s->map->next == NULL)
    {
      s->map->active = false;
      return;
    }

  next = s->map->next;
  cuda_map_destroy (s->map);
  s->map = next;
}

static CUdeviceptr
map_push (struct ptx_stream *s, size_t size)
{
  struct cuda_map *map = NULL, *t = NULL;

  assert (s);
  assert (s->map);

  /* Each PTX stream requires a separate data region to store the
     launch arguments for cuLaunchKernel.  Allocate a new
     cuda_map and push it to the end of the list.  */
  if (s->map->active)
    {
      map = cuda_map_create (size);

      for (t = s->map; t->next != NULL; t = t->next)
	;

      t->next = map;
    }
  else if (s->map->size < size)
    {
      cuda_map_destroy (s->map);
      map = cuda_map_create (size);
    }
  else
    map = s->map;

  s->map = map;
  s->map->active = true;

  return s->map->d;
}
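
/* In outline: map_push hands out the device buffer for one kernel's
   argument block and marks it active; if the current buffer is still
   active because a launch is pending, a fresh cuda_map is allocated and
   chained onto the list instead.  When the kernel's completion event is
   later collected by event_gc, map_pop either clears the active flag
   (single buffer) or destroys the retired buffer and advances s->map.  */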

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};
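
/* The bool and int members between ORD and DEFAULT_DIMS cache the
   CU_DEVICE_ATTRIBUTE_* values queried via cuDeviceGetAttribute in
   nvptx_open_device below (overlap, host-memory mapping, concurrent
   kernels, compute mode, clock rate, multiprocessor count, register
   limits, warp size and thread limits), so each attribute is read from
   the driver only once per device.  */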

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;
  int val;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

static bool
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  if (!map_init (null_stream))
    return false;

  ptx_dev->null_stream = null_stream;
  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;

  return true;
}

static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  bool ret = true;
  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      ret &= map_fini (s);

      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
	  ret = false;
	}
      free (s);
    }

  ret &= map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
  return ret;
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;
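
  /* Given acc_async_sync == -2 and acc_async_noval == -1 (from openacc.h),
     this shift maps acc_async_noval to array slot 0 and a user async
     value N >= 0 to slot N + 1, so the implicit stream and the numbered
     streams share one zero-based array.  */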

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
				     CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		{
		  pthread_mutex_unlock (&ptx_dev->stream_lock);
		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
				     cuda_error (r));
		}
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  if (!map_init (s))
	    {
	      pthread_mutex_unlock (&ptx_dev->stream_lock);
	      GOMP_PLUGIN_fatal ("map_init fail");
	    }

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    {
      if (create)
	pthread_mutex_unlock (&ptx_dev->stream_lock);
      GOMP_PLUGIN_fatal ("bad async %d", async);
    }

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  ptx_events = NULL;
  pthread_mutex_init (&ptx_event_lock, NULL);

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);
  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  if (!init_streams_for_device (ptx_dev, async_engines))
    return NULL;

  return ptx_dev;
}

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  if (!fini_streams_for_device (ptx_dev))
    return false;

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    {
      GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
			 " only 64-bit configurations are supported\n");
      return 0;
    }

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
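
/* For example, running a program with

     GOMP_NVPTX_JIT=-O2

   in the environment makes the parser above record optimization level 2,
   which link_ptx below passes to the PTX JIT via
   CU_JIT_OPTIMIZATION_LEVEL.  Only -O0 through -O4 are recognized;
   anything else is reported as a parse error.  */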

static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct ptx_event *async_cleanups = NULL;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
      if (r == CUDA_SUCCESS)
	{
	  bool append_async = false;
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		append_async = true;
	      }
	      break;
	    }

	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
	  free ((void *) te);

	  /* Unlink 'e' from ptx_events list.  */
	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  if (append_async)
	    {
	      e->next = async_cleanups;
	      async_cleanups = e;
	    }
	  else
	    free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);

  /* We have to do these here, after ptx_event_lock is released.  */
  while (async_cleanups)
    {
      struct ptx_event *e = async_cleanups;
      async_cleanups = async_cleanups->next;

      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
      free (e);
    }
}

static void
event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;
  ptx_event->val = val;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp;
  CUdeviceptr dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }
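
	  /* As a worked example with hypothetical values: block_size = 1024,
	     cpu_size = 2048, dev_size = 20 and warp_size = 32 would give
	     gang = (2048 / 1024) * 20 = 40, worker = 1024 / 32 = 32 and
	     vector = 32, i.e. enough gangs to occupy every multiprocessor
	     at its full thread capacity.  */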

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);
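
	    /* E.g., hypothetical values grids = 40 and blocks = 256 with
	       warp_size = 32 would yield gangs = 2 * 40 * (256 / 32) = 640.  */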

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      int suggest_workers
	= targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR];
      GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
			 " launch '%s' with num_workers = %d; recompile the"
			 " program with 'num_workers = %d' on that offloaded"
			 " region or '-fopenacc-dim=:%d'",
			 targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 suggest_workers, suggest_workers);
    }

  /* Reserve a chunk of device memory on this stream's argument queue to
     hold the cuLaunchKernel argument array.  DP is the device pointer to
     that chunk; HP (below) is a host-side staging buffer.  */
  pthread_mutex_lock (&ptx_event_lock);
  dp = map_push (dev_str, mapnum * sizeof (void *));
  pthread_mutex_unlock (&ptx_event_lock);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments into the host staging buffer.  */
  hp = alloca (sizeof (void *) * mapnum);
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device.  */
  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
		    mapnum * sizeof (void *));
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, dev_str->stream, kargs, 0);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);

      event_add (PTX_EVT_KNL, e, (void *) dev_str, 0);
    }
#else
  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
|
|
1385
|
|
1386 void * openacc_get_current_cuda_context (void);
|
|
1387
|
|
1388 static void *
|
|
1389 nvptx_alloc (size_t s)
|
|
1390 {
|
|
1391 CUdeviceptr d;
|
|
1392
|
|
1393 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
|
|
1394 return (void *) d;
|
|
1395 }
|
|
1396
|
|
1397 static bool
|
|
1398 nvptx_free (void *p)
|
|
1399 {
|
|
1400 CUdeviceptr pb;
|
|
1401 size_t ps;
|
|
1402
|
|
1403 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
|
|
1404 if ((CUdeviceptr) p != pb)
|
|
1405 {
|
|
1406 GOMP_PLUGIN_error ("invalid device address");
|
|
1407 return false;
|
|
1408 }
|
|
1409
|
|
1410 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
|
|
1411 return true;
|
|
1412 }
|
|
1413
|
|
1414
|
|
1415 static bool
|
|
1416 nvptx_host2dev (void *d, const void *h, size_t s)
|
|
1417 {
|
|
1418 CUdeviceptr pb;
|
|
1419 size_t ps;
|
|
1420 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1421
|
|
1422 if (!s)
|
|
1423 return true;
|
|
1424 if (!d)
|
|
1425 {
|
|
1426 GOMP_PLUGIN_error ("invalid device address");
|
|
1427 return false;
|
|
1428 }
|
|
1429
|
|
1430 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
|
|
1431
|
|
1432 if (!pb)
|
|
1433 {
|
|
1434 GOMP_PLUGIN_error ("invalid device address");
|
|
1435 return false;
|
|
1436 }
|
|
1437 if (!h)
|
|
1438 {
|
|
1439 GOMP_PLUGIN_error ("invalid host address");
|
|
1440 return false;
|
|
1441 }
|
|
1442 if (d == h)
|
|
1443 {
|
|
1444 GOMP_PLUGIN_error ("invalid host or device address");
|
|
1445 return false;
|
|
1446 }
|
|
1447 if ((void *)(d + s) > (void *)(pb + ps))
|
|
1448 {
|
|
1449 GOMP_PLUGIN_error ("invalid size");
|
|
1450 return false;
|
|
1451 }
|
|
1452
|
|
1453 #ifndef DISABLE_ASYNC
|
|
1454 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
|
|
1455 {
|
|
1456 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
|
|
1457 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
|
|
1458 event_gc (false);
|
|
1459 CUDA_CALL (cuMemcpyHtoDAsync,
|
|
1460 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
|
|
1461 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
|
|
1462 event_add (PTX_EVT_MEM, e, (void *)h, 0);
|
|
1463 }
|
|
1464 else
|
|
1465 #endif
|
|
1466 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
|
|
1467
|
|
1468 return true;
|
|
1469 }
|
|
1470
|
|
1471 static bool
|
|
1472 nvptx_dev2host (void *h, const void *d, size_t s)
|
|
1473 {
|
|
1474 CUdeviceptr pb;
|
|
1475 size_t ps;
|
|
1476 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1477
|
|
1478 if (!s)
|
|
1479 return true;
|
|
1480 if (!d)
|
|
1481 {
|
|
1482 GOMP_PLUGIN_error ("invalid device address");
|
|
1483 return false;
|
|
1484 }
|
|
1485
|
|
1486 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
|
|
1487
|
|
1488 if (!pb)
|
|
1489 {
|
|
1490 GOMP_PLUGIN_error ("invalid device address");
|
|
1491 return false;
|
|
1492 }
|
|
1493 if (!h)
|
|
1494 {
|
|
1495 GOMP_PLUGIN_error ("invalid host address");
|
|
1496 return false;
|
|
1497 }
|
|
1498 if (d == h)
|
|
1499 {
|
|
1500 GOMP_PLUGIN_error ("invalid host or device address");
|
|
1501 return false;
|
|
1502 }
|
|
1503 if ((void *)(d + s) > (void *)(pb + ps))
|
|
1504 {
|
|
1505 GOMP_PLUGIN_error ("invalid size");
|
|
1506 return false;
|
|
1507 }
|
|
1508
|
|
1509 #ifndef DISABLE_ASYNC
|
|
1510 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
|
|
1511 {
|
|
1512 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
|
|
1513 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
|
|
1514 event_gc (false);
|
|
1515 CUDA_CALL (cuMemcpyDtoHAsync,
|
|
1516 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
|
|
1517 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
|
|
1518 event_add (PTX_EVT_MEM, e, (void *)h, 0);
|
|
1519 }
|
|
1520 else
|
|
1521 #endif
|
|
1522 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
|
|
1523
|
|
1524 return true;
|
|
1525 }
|
|
1526
|
|
1527 static void
|
|
1528 nvptx_set_async (int async)
|
|
1529 {
|
|
1530 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1531 nvthd->current_stream
|
|
1532 = select_stream_for_async (async, pthread_self (), true, NULL);
|
|
1533 }
|
|
1534
|
|
1535 static int
|
|
1536 nvptx_async_test (int async)
|
|
1537 {
|
|
1538 CUresult r;
|
|
1539 struct ptx_stream *s;
|
|
1540
|
|
1541 s = select_stream_for_async (async, pthread_self (), false, NULL);
|
|
1542
|
|
1543 if (!s)
|
|
1544 GOMP_PLUGIN_fatal ("unknown async %d", async);
|
|
1545
|
|
1546 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
|
|
1547 if (r == CUDA_SUCCESS)
|
|
1548 {
|
|
1549 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
|
|
1550 whether all work has completed on this stream, and if so omits the call
|
|
1551 to the wait hook. If that happens, event_gc might not get called
|
|
1552 (which prevents variables from getting unmapped and their associated
|
|
1553 device storage freed), so call it here. */
|
|
1554 event_gc (true);
|
|
1555 return 1;
|
|
1556 }
|
|
1557 else if (r == CUDA_ERROR_NOT_READY)
|
|
1558 return 0;
|
|
1559
|
|
1560 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
|
|
1561
|
|
1562 return 0;
|
|
1563 }
|
|
1564
|
|
1565 static int
|
|
1566 nvptx_async_test_all (void)
|
|
1567 {
|
|
1568 struct ptx_stream *s;
|
|
1569 pthread_t self = pthread_self ();
|
|
1570 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1571
|
|
1572 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
|
|
1573
|
|
1574 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
|
|
1575 {
|
|
1576 if ((s->multithreaded || pthread_equal (s->host_thread, self))
|
|
1577 && CUDA_CALL_NOCHECK (cuStreamQuery,
|
|
1578 s->stream) == CUDA_ERROR_NOT_READY)
|
|
1579 {
|
|
1580 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
|
|
1581 return 0;
|
|
1582 }
|
|
1583 }
|
|
1584
|
|
1585 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
|
|
1586
|
|
1587 event_gc (true);
|
|
1588
|
|
1589 return 1;
|
|
1590 }
|
|
1591
|
|
1592 static void
|
|
1593 nvptx_wait (int async)
|
|
1594 {
|
|
1595 struct ptx_stream *s;
|
|
1596
|
|
1597 s = select_stream_for_async (async, pthread_self (), false, NULL);
|
|
1598 if (!s)
|
|
1599 GOMP_PLUGIN_fatal ("unknown async %d", async);
|
|
1600
|
|
1601 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
|
|
1602
|
|
1603 event_gc (true);
|
|
1604 }
|
|
1605
|
|
1606 static void
|
|
1607 nvptx_wait_async (int async1, int async2)
|
|
1608 {
|
|
1609 CUevent *e;
|
|
1610 struct ptx_stream *s1, *s2;
|
|
1611 pthread_t self = pthread_self ();
|
|
1612
|
|
1613 /* The stream that is waiting (rather than being waited for) doesn't
|
|
1614 necessarily have to exist already. */
|
|
1615 s2 = select_stream_for_async (async2, self, true, NULL);
|
|
1616
|
|
1617 s1 = select_stream_for_async (async1, self, false, NULL);
|
|
1618 if (!s1)
|
|
1619 GOMP_PLUGIN_fatal ("invalid async 1\n");
|
|
1620
|
|
1621 if (s1 == s2)
|
|
1622 GOMP_PLUGIN_fatal ("identical parameters");
|
|
1623
|
|
1624 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
|
|
1625
|
|
1626 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
|
|
1627
|
|
1628 event_gc (true);
|
|
1629
|
|
1630 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
|
|
1631
|
|
1632 event_add (PTX_EVT_SYNC, e, NULL, 0);
|
|
1633
|
|
1634 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
|
|
1635 }
|
|
1636
|
|
1637 static void
|
|
1638 nvptx_wait_all (void)
|
|
1639 {
|
|
1640 CUresult r;
|
|
1641 struct ptx_stream *s;
|
|
1642 pthread_t self = pthread_self ();
|
|
1643 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1644
|
|
1645 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
|
|
1646
|
|
1647 /* Wait for active streams initiated by this thread (or by multiple threads)
|
|
1648 to complete. */
|
|
1649 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
|
|
1650 {
|
|
1651 if (s->multithreaded || pthread_equal (s->host_thread, self))
|
|
1652 {
|
|
1653 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
|
|
1654 if (r == CUDA_SUCCESS)
|
|
1655 continue;
|
|
1656 else if (r != CUDA_ERROR_NOT_READY)
|
|
1657 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
|
|
1658
|
|
1659 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
|
|
1660 }
|
|
1661 }
|
|
1662
|
|
1663 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
|
|
1664
|
|
1665 event_gc (true);
|
|
1666 }
|
|
1667
|
|
1668 static void
|
|
1669 nvptx_wait_all_async (int async)
|
|
1670 {
|
|
1671 struct ptx_stream *waiting_stream, *other_stream;
|
|
1672 CUevent *e;
|
|
1673 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1674 pthread_t self = pthread_self ();
|
|
1675
|
|
1676 /* The stream doing the waiting. This could be the first mention of the
|
|
1677 stream, so create it if necessary. */
|
|
1678 waiting_stream
|
|
1679 = select_stream_for_async (async, pthread_self (), true, NULL);
|
|
1680
|
|
1681 /* Launches on the null stream already block on other streams in the
|
|
1682 context. */
|
|
1683 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
|
|
1684 return;
|
|
1685
|
|
1686 event_gc (true);
|
|
1687
|
|
1688 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
|
|
1689
|
|
1690 for (other_stream = nvthd->ptx_dev->active_streams;
|
|
1691 other_stream != NULL;
|
|
1692 other_stream = other_stream->next)
|
|
1693 {
|
|
1694 if (!other_stream->multithreaded
|
|
1695 && !pthread_equal (other_stream->host_thread, self))
|
|
1696 continue;
|
|
1697
|
|
1698 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
|
|
1699
|
|
1700 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
|
|
1701
|
|
1702 /* Record an event on the waited-for stream. */
|
|
1703 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
|
|
1704
|
|
1705 event_add (PTX_EVT_SYNC, e, NULL, 0);
|
|
1706
|
|
1707 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
|
|
1708 }
|
|
1709
|
|
1710 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
|
|
1711 }
|
|
1712
|
|
1713 static void *
|
|
1714 nvptx_get_current_cuda_device (void)
|
|
1715 {
|
|
1716 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1717
|
|
1718 if (!nvthd || !nvthd->ptx_dev)
|
|
1719 return NULL;
|
|
1720
|
|
1721 return &nvthd->ptx_dev->dev;
|
|
1722 }
|
|
1723
|
|
1724 static void *
|
|
1725 nvptx_get_current_cuda_context (void)
|
|
1726 {
|
|
1727 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1728
|
|
1729 if (!nvthd || !nvthd->ptx_dev)
|
|
1730 return NULL;
|
|
1731
|
|
1732 return nvthd->ptx_dev->ctx;
|
|
1733 }
|
|
1734
|
|
1735 static void *
|
|
1736 nvptx_get_cuda_stream (int async)
|
|
1737 {
|
|
1738 struct ptx_stream *s;
|
|
1739 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1740
|
|
1741 if (!nvthd || !nvthd->ptx_dev)
|
|
1742 return NULL;
|
|
1743
|
|
1744 s = select_stream_for_async (async, pthread_self (), false, NULL);
|
|
1745
|
|
1746 return s ? s->stream : NULL;
|
|
1747 }
|
|
1748
|
|
1749 static int
|
|
1750 nvptx_set_cuda_stream (int async, void *stream)
|
|
1751 {
|
|
1752 struct ptx_stream *oldstream;
|
|
1753 pthread_t self = pthread_self ();
|
|
1754 struct nvptx_thread *nvthd = nvptx_thread ();
|
|
1755
|
|
1756 if (async < 0)
|
|
1757 GOMP_PLUGIN_fatal ("bad async %d", async);
|
|
1758
|
|
1759 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
|
|
1760
|
|
  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in
     stream, adding it to our list and also removing the previous entry (if
     there was one) in order to prevent resource leaks.  Note the potential
     for surprise here: perhaps we should merely keep track of passed-in
     streams and leave it to the user to tidy them up, but that does not work
     for stream handles returned by acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);

      if (!map_fini (oldstream))
	GOMP_PLUGIN_fatal ("error when freeing host memory");

      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
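
/* From user code, the two helpers above surface through the OpenACC runtime
   API as acc_get_cuda_stream and acc_set_cuda_stream.  A minimal usage
   sketch (illustrative only, not part of the plugin; my_kernel is a
   hypothetical CUDA kernel):

     #include <openacc.h>

     CUstream s = (CUstream) acc_get_cuda_stream (1);
     my_kernel<<<grid, block, 0, (cudaStream_t) s>>> (...);  // CUDA work
     #pragma acc parallel async (1)                          // OpenACC work
     ;
     acc_wait (1);  // orders both, since they share one stream

   Conversely, acc_set_cuda_stream (async, stream) hands an existing CUDA
   stream to the plugin, which then owns and eventually destroys it (see
   nvptx_set_cuda_stream above).  */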

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
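
/* As a concrete check of the formula above: clock_khz is the SM clock rate
   in kHz, so 1e-3 / clock_khz converts it to a tick period in seconds.  For
   a device clocked at 1 GHz (clock_khz == 1e6), __nvptx_clocktick is
   1e-3 / 1e6 = 1e-9, i.e. one nanosecond per clock tick.  */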

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}
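
/* For reference, the address table built above has this shape (a sketch,
   assuming an image with two kernels and one global variable):

     target_table[0] = { start = &targ_fns[0], end = start + 1 }  // kernel 0
     target_table[1] = { start = &targ_fns[1], end = start + 1 }  // kernel 1
     target_table[2] = { start = <device addr>, end = start + <size> }  // var

   Function entries carry a dummy one-byte extent because only the descriptor
   pointer matters; variable entries carry the real device address range.  */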

/* Unload the program described by TARGET_DATA.  DEV_DATA holds the
   function descriptors allocated by GOMP_OFFLOAD_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  struct ptx_device *ptx_dev = ptx_devices[ord];
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
	     ptx_dev->null_stream->stream);
  return true;
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}

/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that the number of warps does not exceed CUDA limits or GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, and matches the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     via the "occupancy control" driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
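
/* To make the register arithmetic above concrete, a worked example with
   made-up but representative numbers: a kernel using 32 registers per
   thread, launched with *threads_p == 8 warps, gives regs_per_block
   = 32 * 32 * 8 = 8192 registers per block.  On a device with 65536
   registers per SM and 16 SMs, max_blocks = 65536 / 8192 * 16 = 128, so any
   requested team count above 128 (or an unspecified one) is clamped to
   128.  */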

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size (void)
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, ptx_dev->null_stream->stream,
			 NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}
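
/* The argument decoding loop above handles the packed format libgomp uses
   for GOMP_TARGET_ARG_*: small values are shifted into the identifier word
   itself, larger ones follow as a separate pointer-sized word.  A sketch of
   the compact encoding for "num_teams = 4" (illustrative only; the device
   field's low bits are zero, i.e. GOMP_TARGET_ARG_DEVICE_ALL):

     intptr_t id = GOMP_TARGET_ARG_NUM_TEAMS
		   | (4 << GOMP_TARGET_ARG_VALUE_SHIFT);
     intptr_t val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;  // decodes back to 4

   Each kernel launch is TEAMS x 1 x 1 blocks of 32 x THREADS x 1 threads,
   i.e. 32 lanes per warp and THREADS warps per team, matching the
   -msoft-stack storage allocated above (one stack per warp).  */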

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}