/* Plugin for NVPTX execution.

   Copyright (C) 2013-2017 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

# define CUDA_CALLS \
CUDA_ONE_CALL (cuCtxCreate)             \
CUDA_ONE_CALL (cuCtxDestroy)            \
CUDA_ONE_CALL (cuCtxGetCurrent)         \
CUDA_ONE_CALL (cuCtxGetDevice)          \
CUDA_ONE_CALL (cuCtxPopCurrent)         \
CUDA_ONE_CALL (cuCtxPushCurrent)        \
CUDA_ONE_CALL (cuCtxSynchronize)        \
CUDA_ONE_CALL (cuDeviceGet)             \
CUDA_ONE_CALL (cuDeviceGetAttribute)    \
CUDA_ONE_CALL (cuDeviceGetCount)        \
CUDA_ONE_CALL (cuEventCreate)           \
CUDA_ONE_CALL (cuEventDestroy)          \
CUDA_ONE_CALL (cuEventElapsedTime)      \
CUDA_ONE_CALL (cuEventQuery)            \
CUDA_ONE_CALL (cuEventRecord)           \
CUDA_ONE_CALL (cuEventSynchronize)      \
CUDA_ONE_CALL (cuFuncGetAttribute)      \
CUDA_ONE_CALL (cuGetErrorString)        \
CUDA_ONE_CALL (cuInit)                  \
CUDA_ONE_CALL (cuLaunchKernel)          \
CUDA_ONE_CALL (cuLinkAddData)           \
CUDA_ONE_CALL (cuLinkComplete)          \
CUDA_ONE_CALL (cuLinkCreate)            \
CUDA_ONE_CALL (cuLinkDestroy)           \
CUDA_ONE_CALL (cuMemAlloc)              \
CUDA_ONE_CALL (cuMemAllocHost)          \
CUDA_ONE_CALL (cuMemcpy)                \
CUDA_ONE_CALL (cuMemcpyDtoDAsync)       \
CUDA_ONE_CALL (cuMemcpyDtoH)            \
CUDA_ONE_CALL (cuMemcpyDtoHAsync)       \
CUDA_ONE_CALL (cuMemcpyHtoD)            \
CUDA_ONE_CALL (cuMemcpyHtoDAsync)       \
CUDA_ONE_CALL (cuMemFree)               \
CUDA_ONE_CALL (cuMemFreeHost)           \
CUDA_ONE_CALL (cuMemGetAddressRange)    \
CUDA_ONE_CALL (cuMemHostGetDevicePointer) \
CUDA_ONE_CALL (cuModuleGetFunction)     \
CUDA_ONE_CALL (cuModuleGetGlobal)       \
CUDA_ONE_CALL (cuModuleLoad)            \
CUDA_ONE_CALL (cuModuleLoadData)        \
CUDA_ONE_CALL (cuModuleUnload)          \
CUDA_ONE_CALL (cuStreamCreate)          \
CUDA_ONE_CALL (cuStreamDestroy)         \
CUDA_ONE_CALL (cuStreamQuery)           \
CUDA_ONE_CALL (cuStreamSynchronize)     \
CUDA_ONE_CALL (cuStreamWaitEvent)
# define CUDA_ONE_CALL(call) \
  __typeof (call) *call;
struct cuda_lib_s {
  CUDA_CALLS
} cuda_lib;
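
/* Each expansion of CUDA_ONE_CALL above declares one member of
   cuda_lib; e.g. CUDA_ONE_CALL (cuInit) expands to

     __typeof (cuInit) *cuInit;

   a function pointer with the same type as the real cuInit.  The same
   CUDA_CALLS list is reused below (with CUDA_ONE_CALL redefined) to
   generate the dlsym initialization of each pointer.  */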

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA driver library libcuda.so.1 and initialize
   the function pointers in cuda_lib; return false if unsuccessful,
   true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;
# undef CUDA_ONE_CALL
# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
# define CUDA_ONE_CALL_1(call) \
  cuda_lib.call = dlsym (h, #call); \
  if (cuda_lib.call == NULL)        \
    return false;
  CUDA_CALLS
  cuda_lib_inited = true;
  return true;
}
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# define CUDA_CALL_PREFIX cuda_lib.
#else
# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)           \
  do {                                          \
    unsigned __r                                \
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
    if (__r != CUDA_SUCCESS)                    \
      {                                         \
        GOMP_PLUGIN_error (#FN " error: %s",    \
                           cuda_error (__r));   \
        return ERET;                            \
      }                                         \
  } while (0)

#define CUDA_CALL(FN, ...)                      \
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)               \
  do {                                          \
    unsigned __r                                \
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
    if (__r != CUDA_SUCCESS)                    \
      {                                         \
        GOMP_PLUGIN_fatal (#FN " error: %s",    \
                           cuda_error (__r));   \
      }                                         \
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)              \
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

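/* For example, in a function returning bool,

     CUDA_CALL (cuMemAlloc, &d, s);

   calls cuMemAlloc (directly, or through cuda_lib.cuMemAlloc when the
   library is dlopened), reports any failure via GOMP_PLUGIN_error and
   makes the enclosing function return false.  CUDA_CALL_ERET is the
   same with a caller-chosen error return value, CUDA_CALL_ASSERT
   aborts via GOMP_PLUGIN_fatal instead, and CUDA_CALL_NOCHECK leaves
   checking the result to the caller.  */
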
/* Return a string describing the CUDA error R.  */

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* A CUDA stream, together with a page of host memory (also mapped on
   the device) used as a ring buffer of argument-mapping records for
   kernel launches on the stream (see map_init, map_push and map_pop
   below).  */

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

/* One entry in a stream's mapping ring buffer: the async value it was
   queued with, its total size in bytes (including this header), and
   the mapped data itself.  */

struct map
{
  int async;
  size_t size;
  char mappings[0];
};

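/* A sketch of the ring buffer owned by a ptx_stream (positions are
   illustrative only): entries are pushed at h_next, the oldest live
   entry starts at h_tail, and h_prev points at the most recently
   pushed entry so that its size can be padded when a push wraps
   around from h_end back to h_begin:

     h_begin      h_tail                 h_prev  h_next       h_end
     |  (free)    | map | map | map ... | map    |  (free)    |   */
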
/* Allocate the page-sized mapping buffer for stream S, host-mapped on
   the device, and initialize the ring-buffer pointers.  Return true
   on success.  */

static bool
map_init (struct ptx_stream *s)
{
  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  CUDA_CALL (cuMemAllocHost, &s->h, size);
  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
  return true;
}

/* Release the mapping buffer of stream S.  Return true on success.  */

static bool
map_fini (struct ptx_stream *s)
{
  CUDA_CALL (cuMemFreeHost, s->h);
  return true;
}

/* Retire the oldest mapping entry of stream S from the ring buffer.  */

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

/* Reserve SIZE bytes (plus the struct map header) in the ring buffer
   of stream S for a mapping queued with async value ASYNC.  Return
   the host and device addresses of the reservation in *H and *D.  */

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      /* Not enough space before the end of the buffer: pad the
         previous entry up to h_end and wrap around.  */
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}

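/* A typical launch (see nvptx_exec below) pairs these as, roughly:

     map_push (stream, async, mapnum * sizeof (void *), &hp, &dp);
     ...fill the host copy HP, copy it to DP, launch the kernel...
     map_pop (stream);

   where for asynchronous launches the map_pop is deferred until the
   stream's completion event fires (see event_gc).  */
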
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;              /* Device ordinal.  */
  bool overlap;         /* CU_DEVICE_ATTRIBUTE_GPU_OVERLAP.  */
  bool map;             /* CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY.  */
  bool concur;          /* CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS.  */
  bool mkern;           /* CU_DEVICE_ATTRIBUTE_INTEGRATED.  */
  int mode;             /* CU_DEVICE_ATTRIBUTE_COMPUTE_MODE.  */
  int clock_khz;        /* CU_DEVICE_ATTRIBUTE_CLOCK_RATE.  */
  int num_sms;          /* CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.  */
  int regs_per_block;   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK.  */
  int regs_per_sm;      /* Registers per multiprocessor; see
                           nvptx_open_device.  */

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

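/* The kind of operation tracked by a ptx_event, used by event_gc to
   decide what cleanup a completed event requires.  */
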
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;
  int val;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Set up the NULL stream and the async-stream array for PTX_DEV,
   sizing the array for CONCURRENCY streams.  Return true on
   success.  */

static bool
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  if (!map_init (null_stream))
    return false;

  ptx_dev->null_stream = null_stream;
  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;

  return true;
}

/* Destroy all streams of PTX_DEV and free their mapping buffers.
   Return true if no errors were encountered.  */

static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  bool ret = true;
  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      ret &= map_fini (s);

      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
          ret = false;
        }
      free (s);
    }

  ret &= map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
  return ret;
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true,
   create the stream if it does not exist (or use EXISTING if it is
   non-NULL), and associate the stream with the same thread argument.
   Return the stream to use.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;
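  /* After this renumbering, acc_async_noval indexes slot 0 of the
     async_streams array and user async value N indexes slot N + 1,
     while acc_async_sync stays negative and is handled below.  */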

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it
     doesn't (because the null stream enforces overly-strict synchronisation
     with respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
                                     CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                {
                  pthread_mutex_unlock (&ptx_dev->stream_lock);
                  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
                                     cuda_error (r));
                }
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          if (!map_init (s))
            {
              pthread_mutex_unlock (&ptx_dev->stream_lock);
              GOMP_PLUGIN_fatal ("map_init fail");
            }

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    {
      if (create)
        pthread_mutex_unlock (&ptx_dev->stream_lock);
      GOMP_PLUGIN_fatal ("bad async %d", async);
    }

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed
         to only wait for asynchronous launches from the same host thread they
         are invoked on.  If multiple threads use the same async value, we make
         note of that here and fall back to testing/waiting for all threads in
         those functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the CUDA library and the device array.  Return TRUE on
   success, else FALSE.  PTX_DEV_LOCK should be locked on entry and
   remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  ptx_events = NULL;
  pthread_mutex_init (&ptx_event_lock, NULL);

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);
  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
        {
          GOMP_PLUGIN_error ("device %d not found", n);
          return false;
        }

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

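/* Open the N'th PTX device: create or adopt a CUDA context for it,
   query the device attributes cached in struct ptx_device, and set up
   its streams.  Return the new ptx_device, or NULL on error.  */
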
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
                         CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  if (!init_streams_for_device (ptx_dev, async_engines))
    return NULL;

  return ptx_dev;
}

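/* Close PTX_DEV: destroy its streams and its context (unless the
   context was adopted from elsewhere), then free it.  Return true on
   success.  */
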
static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  if (!fini_streams_for_device (ptx_dev))
    return false;

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    {
      GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
                         " only 64-bit configurations are supported\n");
      return 0;
    }

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
        return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
                             cuda_error (r));
          return 0;
        }
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

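/* Log the value of the environment variable VAR_NAME (ENV_VAR, as
   returned by getenv) to the debug output.  */
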
static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}

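/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS into a CUDA module,
   returned in *MODULE.  Return true on success.  */
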
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
                             (char *) ptx_objs->code, ptx_objs->size,
                             0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
          return false;
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

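/* Garbage-collect completed events on the current device: destroy each
   event that has fired and perform the cleanup its type requires
   (e.g. popping a kernel's argument mapping).  If MEMMAP_LOCKABLE is
   false, defer PTX_EVT_ASYNC_CLEANUP events, since handling those
   requires taking the memory-map lock that a caller may already
   hold.  */
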
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct ptx_event *async_cleanups = NULL;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
      if (r == CUDA_SUCCESS)
        {
          bool append_async = false;
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function GOMP_PLUGIN_async_unmap_vars needs to claim
                   the memory-map splay tree lock for the current device, so
                   we can't call it when one of our callers has already
                   claimed the lock.  In that case, just delay the GC for this
                   event until later.  */
                if (!memmap_lockable)
                  continue;

                append_async = true;
              }
              break;
            }

          CUDA_CALL_NOCHECK (cuEventDestroy, *te);
          free ((void *)te);

          /* Unlink 'e' from ptx_events list.  */
          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          if (append_async)
            {
              e->next = async_cleanups;
              async_cleanups = e;
            }
          else
            free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);

  /* We have to do these here, after ptx_event_lock is released.  */
  while (async_cleanups)
    {
      struct ptx_event *e = async_cleanups;
      async_cleanups = async_cleanups->next;

      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
      free (e);
    }
}

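/* Record a pending event of kind TYPE for the just-recorded CUDA event
   E on the current device, with associated address H and value VAL,
   for event_gc to process once it completes.  */
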
static void
event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;
  ptx_event->val = val;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

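/* Launch the kernel described by FN with MAPNUM argument pointers
   DEVADDRS on the stream selected by ASYNC, using launch geometry
   DIMS and filling in runtime defaults where a dimension is zero.  */
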
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
        dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
        seen_zero = 1;
    }

  if (seen_zero)
    {
      /* See if the user provided GOMP_OPENACC_DIM environment
         variable to specify runtime defaults.  */
      static int default_dims[GOMP_DIM_MAX];

      pthread_mutex_lock (&ptx_dev_lock);
      if (!default_dims[0])
        {
          const char *var_name = "GOMP_OPENACC_DIM";
          /* We only read the environment variable once.  You can't
             change it in the middle of execution.  The syntax is
             the same as for the -fopenacc-dim compilation option.  */
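          /* For example, "GOMP_OPENACC_DIM=5120:1:32" sets all three
             dimensions (gang, worker, vector), while a position left
             empty, as in "::32", keeps the computed default for that
             dimension.  */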
          const char *env_var = getenv (var_name);
          notify_var (var_name, env_var);
          if (env_var)
            {
              const char *pos = env_var;

              for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
                {
                  if (i && *pos++ != ':')
                    break;
                  if (*pos != ':')
                    {
                      const char *eptr;

                      errno = 0;
                      long val = strtol (pos, (char **)&eptr, 10);
                      if (errno || val < 0 || (unsigned)val != val)
                        break;
                      default_dims[i] = (int)val;
                      pos = eptr;
                    }
                }
            }

          int warp_size, block_size, dev_size, cpu_size;
          CUdevice dev = nvptx_thread ()->ptx_dev->dev;
          /* 32 is the default for known hardware.  */
          int gang = 0, worker = 32, vector = 32;
          CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;

          cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
          cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
          cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
          cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;

          if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
                                 dev) == CUDA_SUCCESS
              && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
                                    dev) == CUDA_SUCCESS
              && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
                                    dev) == CUDA_SUCCESS
              && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
                                    dev) == CUDA_SUCCESS)
            {
              GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
                                 " dev_size=%d, cpu_size=%d\n",
                                 warp_size, block_size, dev_size, cpu_size);
              gang = (cpu_size / block_size) * dev_size;
              worker = block_size / warp_size;
              vector = warp_size;
            }

          /* There is no upper bound on the gang size.  The best size
             matches the hardware configuration.  Logical gangs are
             scheduled onto physical hardware.  To maximize usage, we
             should guess a large number.  */
          if (default_dims[GOMP_DIM_GANG] < 1)
            default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
          /* The worker size must not exceed the hardware.  */
          if (default_dims[GOMP_DIM_WORKER] < 1
              || (default_dims[GOMP_DIM_WORKER] > worker && gang))
            default_dims[GOMP_DIM_WORKER] = worker;
          /* The vector size must exactly match the hardware.  */
          if (default_dims[GOMP_DIM_VECTOR] < 1
              || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
            default_dims[GOMP_DIM_VECTOR] = vector;

          GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
                             default_dims[GOMP_DIM_GANG],
                             default_dims[GOMP_DIM_WORKER],
                             default_dims[GOMP_DIM_VECTOR]);
        }
      pthread_mutex_unlock (&ptx_dev_lock);

      for (i = 0; i != GOMP_DIM_MAX; i++)
        if (!dims[i])
          dims[i] = default_dims[i];
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
                    mapnum * sizeof (void *));
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
                     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    dims[GOMP_DIM_GANG], 1, 1,
                    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                    0, dev_str->stream, kargs, 0);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n",
                           cuda_error (r), maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);

      event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
    }
#else
  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
  return (void *) d;
}

static bool
nvptx_free (void *p)
{
  CUdeviceptr pb;
  size_t ps;

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  return true;
}

static bool
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyHtoDAsync,
                 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *)h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);

  return true;
}

static bool
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyDtoHAsync,
                 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *)h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);

  return true;
}

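/* Set the current thread's default stream to the one associated with
   ASYNC, creating that stream if necessary.  */
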
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the
         call to the wait hook.  If that happens, event_gc might not get
         called (which prevents variables from getting unmapped and their
         associated device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && CUDA_CALL_NOCHECK (cuStreamQuery,
                                s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

  event_gc (true);

  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);

  event_add (PTX_EVT_SYNC, e, NULL, 0);

  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

      /* Record an event on the waited-for stream.  */
      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);

      event_add (PTX_EVT_SYNC, e, NULL, 0);

      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in
     stream, and add it to our list, removing the previous entry also (if
     there was one) in order to prevent resource leaks.  Note the potential
     for surprise here: maybe we should keep track of passed-in streams and
     leave it up to the user to tidy those up, but that doesn't work for
     stream handles returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);

      if (!map_fini (oldstream))
        GOMP_PLUGIN_fatal ("error when freeing host memory");

      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
          || !nvptx_close_device (ptx_devices[n]))
        {
          pthread_mutex_unlock (&ptx_dev_lock);
          return false;
        }
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
                                  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
                         sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
                      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
                      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
                      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
                      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}
|
|
1868
|
|
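/* Layout of the table returned above (an illustration, not extra state):
   entries [0, fn_entries) describe kernels, with START pointing at the
   targ_fn_descriptor and END = START + 1 as a dummy one-byte extent;
   entries [fn_entries, fn_entries + var_entries) describe variables, with
   START the device address and END = START + size in bytes.  */
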
/* Unload the program described by TARGET_DATA.  This also frees the
   function descriptors allocated by GOMP_OFFLOAD_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  struct ptx_device *ptx_dev = ptx_devices[ord];
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
	     ptx_dev->null_stream->stream);
  return true;
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

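/* Create an event, record it behind the work currently queued on this
   thread's stream, and register it with queue ASYNC so that TARG_MEM_DESC
   is cleaned up once the event completes.  */
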
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

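/* Set up OpenACC per-thread state for device ORD: record the device and
   its default (NULL) stream, and make sure this host thread has the
   device's CUDA context current before it issues any CUDA work.  */
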
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}

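/* The hooks above back the OpenACC interoperability API in openacc.h.
   For instance, a user program might order a hand-written CUDA driver
   call with work on an OpenACC async queue (a sketch; error checking
   omitted, queue number 1 chosen arbitrarily):

     #include <openacc.h>
     #include <cuda.h>

     void copy_on_queue (CUdeviceptr dst, CUdeviceptr src, size_t n)
     {
       CUstream s = (CUstream) acc_get_cuda_stream (1);
       cuMemcpyDtoDAsync (dst, src, n, s);  // Ordered after queue 1's work.
       acc_wait (1);                        // Wait for the queue, copy included.
     }
*/
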
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that the number of warps does not exceed CUDA limits or GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, which matches the documented limit of all GPUs
     as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     via the "occupancy control" driver interface (available since
     CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

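/* Worked example of the estimate above, with hypothetical numbers: for a
   kernel with regs_per_thread == 32 and the default *threads_p == 8 warps,
   regs_per_block == 32 * 32 * 8 == 8192.  On a device with
   regs_per_sm == 65536 and num_sms == 16, max_blocks == (65536 / 8192) * 16
   == 128, so a non-positive or oversized *teams_p is clamped to 128.  */
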
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for
   OpenMP target regions.  */

static size_t
nvptx_stacks_size (void)
{
  return 128 * 1024;
}

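/* With this fixed 128 KiB per warp, the allocation done for a launch in
   GOMP_OFFLOAD_run below is teams * threads * 128 KiB; e.g. a hypothetical
   launch with teams == 128 and threads == 8 warps reserves
   128 * 8 * 128 KiB == 128 MiB of device memory for soft stacks.  */
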
/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
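  /* Each element of ARGS is either an inline (id, value) pair, with the
     value held in the bits above GOMP_TARGET_ARG_VALUE_SHIFT, or an id
     with GOMP_TARGET_ARG_SUBSEQUENT_PARAM set followed by a separate
     value element.  For example, a NUM_TEAMS request of 4 for all devices
     can be encoded inline as
       (4 << GOMP_TARGET_ARG_VALUE_SHIFT)
       | GOMP_TARGET_ARG_NUM_TEAMS | GOMP_TARGET_ARG_DEVICE_ALL.  */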
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, ptx_dev->null_stream->stream,
			 NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}

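/* Notes on the launch above: the grid is TEAMS x 1 x 1 blocks and each
   block is 32 x THREADS x 1 threads, i.e. the x dimension is one warp
   (32 lanes) wide for SIMD execution and the y dimension counts warps.
   The kernel arguments (tgt_vars plus the soft-stack block and its size)
   travel as a single blob via CU_LAUNCH_PARAM_BUFFER_POINTER and
   CU_LAUNCH_PARAM_BUFFER_SIZE rather than as individual kernel
   parameters.  */
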
void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}