annotate libgomp/plugin/plugin-nvptx.c @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents
children 84e7813d76e9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Plugin for NVPTX execution.
kono
parents:
diff changeset
2
kono
parents:
diff changeset
3 Copyright (C) 2013-2017 Free Software Foundation, Inc.
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 Contributed by Mentor Embedded.
kono
parents:
diff changeset
6
kono
parents:
diff changeset
7 This file is part of the GNU Offloading and Multi Processing Library
kono
parents:
diff changeset
8 (libgomp).
kono
parents:
diff changeset
9
kono
parents:
diff changeset
10 Libgomp is free software; you can redistribute it and/or modify it
kono
parents:
diff changeset
11 under the terms of the GNU General Public License as published by
kono
parents:
diff changeset
12 the Free Software Foundation; either version 3, or (at your option)
kono
parents:
diff changeset
13 any later version.
kono
parents:
diff changeset
14
kono
parents:
diff changeset
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
kono
parents:
diff changeset
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
kono
parents:
diff changeset
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
kono
parents:
diff changeset
18 more details.
kono
parents:
diff changeset
19
kono
parents:
diff changeset
20 Under Section 7 of GPL version 3, you are granted additional
kono
parents:
diff changeset
21 permissions described in the GCC Runtime Library Exception, version
kono
parents:
diff changeset
22 3.1, as published by the Free Software Foundation.
kono
parents:
diff changeset
23
kono
parents:
diff changeset
24 You should have received a copy of the GNU General Public License and
kono
parents:
diff changeset
25 a copy of the GCC Runtime Library Exception along with this program;
kono
parents:
diff changeset
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
kono
parents:
diff changeset
27 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
28
kono
parents:
diff changeset
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
kono
parents:
diff changeset
30 library appears to hold some implicit state, but the documentation
kono
parents:
diff changeset
31 is not clear as to what that state might be. Or how one might
kono
parents:
diff changeset
32 propagate it from one thread to another. */
kono
parents:
diff changeset
33
kono
parents:
diff changeset
34 #include "openacc.h"
kono
parents:
diff changeset
35 #include "config.h"
kono
parents:
diff changeset
36 #include "libgomp-plugin.h"
kono
parents:
diff changeset
37 #include "oacc-plugin.h"
kono
parents:
diff changeset
38 #include "gomp-constants.h"
kono
parents:
diff changeset
39
kono
parents:
diff changeset
40 #include <pthread.h>
kono
parents:
diff changeset
41 #include <cuda.h>
kono
parents:
diff changeset
42 #include <stdbool.h>
kono
parents:
diff changeset
43 #include <stdint.h>
kono
parents:
diff changeset
44 #include <limits.h>
kono
parents:
diff changeset
45 #include <string.h>
kono
parents:
diff changeset
46 #include <stdio.h>
kono
parents:
diff changeset
47 #include <unistd.h>
kono
parents:
diff changeset
48 #include <assert.h>
kono
parents:
diff changeset
49 #include <errno.h>
kono
parents:
diff changeset
50
kono
parents:
diff changeset
51 #if PLUGIN_NVPTX_DYNAMIC
kono
parents:
diff changeset
52 # include <dlfcn.h>
kono
parents:
diff changeset
53
kono
parents:
diff changeset
54 # define CUDA_CALLS \
kono
parents:
diff changeset
55 CUDA_ONE_CALL (cuCtxCreate) \
kono
parents:
diff changeset
56 CUDA_ONE_CALL (cuCtxDestroy) \
kono
parents:
diff changeset
57 CUDA_ONE_CALL (cuCtxGetCurrent) \
kono
parents:
diff changeset
58 CUDA_ONE_CALL (cuCtxGetDevice) \
kono
parents:
diff changeset
59 CUDA_ONE_CALL (cuCtxPopCurrent) \
kono
parents:
diff changeset
60 CUDA_ONE_CALL (cuCtxPushCurrent) \
kono
parents:
diff changeset
61 CUDA_ONE_CALL (cuCtxSynchronize) \
kono
parents:
diff changeset
62 CUDA_ONE_CALL (cuDeviceGet) \
kono
parents:
diff changeset
63 CUDA_ONE_CALL (cuDeviceGetAttribute) \
kono
parents:
diff changeset
64 CUDA_ONE_CALL (cuDeviceGetCount) \
kono
parents:
diff changeset
65 CUDA_ONE_CALL (cuEventCreate) \
kono
parents:
diff changeset
66 CUDA_ONE_CALL (cuEventDestroy) \
kono
parents:
diff changeset
67 CUDA_ONE_CALL (cuEventElapsedTime) \
kono
parents:
diff changeset
68 CUDA_ONE_CALL (cuEventQuery) \
kono
parents:
diff changeset
69 CUDA_ONE_CALL (cuEventRecord) \
kono
parents:
diff changeset
70 CUDA_ONE_CALL (cuEventSynchronize) \
kono
parents:
diff changeset
71 CUDA_ONE_CALL (cuFuncGetAttribute) \
kono
parents:
diff changeset
72 CUDA_ONE_CALL (cuGetErrorString) \
kono
parents:
diff changeset
73 CUDA_ONE_CALL (cuInit) \
kono
parents:
diff changeset
74 CUDA_ONE_CALL (cuLaunchKernel) \
kono
parents:
diff changeset
75 CUDA_ONE_CALL (cuLinkAddData) \
kono
parents:
diff changeset
76 CUDA_ONE_CALL (cuLinkComplete) \
kono
parents:
diff changeset
77 CUDA_ONE_CALL (cuLinkCreate) \
kono
parents:
diff changeset
78 CUDA_ONE_CALL (cuLinkDestroy) \
kono
parents:
diff changeset
79 CUDA_ONE_CALL (cuMemAlloc) \
kono
parents:
diff changeset
80 CUDA_ONE_CALL (cuMemAllocHost) \
kono
parents:
diff changeset
81 CUDA_ONE_CALL (cuMemcpy) \
kono
parents:
diff changeset
82 CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
kono
parents:
diff changeset
83 CUDA_ONE_CALL (cuMemcpyDtoH) \
kono
parents:
diff changeset
84 CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
kono
parents:
diff changeset
85 CUDA_ONE_CALL (cuMemcpyHtoD) \
kono
parents:
diff changeset
86 CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
kono
parents:
diff changeset
87 CUDA_ONE_CALL (cuMemFree) \
kono
parents:
diff changeset
88 CUDA_ONE_CALL (cuMemFreeHost) \
kono
parents:
diff changeset
89 CUDA_ONE_CALL (cuMemGetAddressRange) \
kono
parents:
diff changeset
90 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
kono
parents:
diff changeset
91 CUDA_ONE_CALL (cuModuleGetFunction) \
kono
parents:
diff changeset
92 CUDA_ONE_CALL (cuModuleGetGlobal) \
kono
parents:
diff changeset
93 CUDA_ONE_CALL (cuModuleLoad) \
kono
parents:
diff changeset
94 CUDA_ONE_CALL (cuModuleLoadData) \
kono
parents:
diff changeset
95 CUDA_ONE_CALL (cuModuleUnload) \
kono
parents:
diff changeset
96 CUDA_ONE_CALL (cuStreamCreate) \
kono
parents:
diff changeset
97 CUDA_ONE_CALL (cuStreamDestroy) \
kono
parents:
diff changeset
98 CUDA_ONE_CALL (cuStreamQuery) \
kono
parents:
diff changeset
99 CUDA_ONE_CALL (cuStreamSynchronize) \
kono
parents:
diff changeset
100 CUDA_ONE_CALL (cuStreamWaitEvent)
kono
parents:
diff changeset
101 # define CUDA_ONE_CALL(call) \
kono
parents:
diff changeset
102 __typeof (call) *call;
kono
parents:
diff changeset
103 struct cuda_lib_s {
kono
parents:
diff changeset
104 CUDA_CALLS
kono
parents:
diff changeset
105 } cuda_lib;
kono
parents:
diff changeset
106
kono
parents:
diff changeset
107 /* -1 if init_cuda_lib has not been called yet, false
kono
parents:
diff changeset
108 if it has been and failed, true if it has been and succeeded. */
kono
parents:
diff changeset
109 static signed char cuda_lib_inited = -1;
kono
parents:
diff changeset
110
kono
parents:
diff changeset
111 /* Dynamically load the CUDA runtime library and initialize function
kono
parents:
diff changeset
112 pointers, return false if unsuccessful, true if successful. */
kono
parents:
diff changeset
113 static bool
kono
parents:
diff changeset
114 init_cuda_lib (void)
kono
parents:
diff changeset
115 {
kono
parents:
diff changeset
116 if (cuda_lib_inited != -1)
kono
parents:
diff changeset
117 return cuda_lib_inited;
kono
parents:
diff changeset
118 const char *cuda_runtime_lib = "libcuda.so.1";
kono
parents:
diff changeset
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
kono
parents:
diff changeset
120 cuda_lib_inited = false;
kono
parents:
diff changeset
121 if (h == NULL)
kono
parents:
diff changeset
122 return false;
kono
parents:
diff changeset
123 # undef CUDA_ONE_CALL
kono
parents:
diff changeset
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
kono
parents:
diff changeset
125 # define CUDA_ONE_CALL_1(call) \
kono
parents:
diff changeset
126 cuda_lib.call = dlsym (h, #call); \
kono
parents:
diff changeset
127 if (cuda_lib.call == NULL) \
kono
parents:
diff changeset
128 return false;
kono
parents:
diff changeset
129 CUDA_CALLS
kono
parents:
diff changeset
130 cuda_lib_inited = true;
kono
parents:
diff changeset
131 return true;
kono
parents:
diff changeset
132 }
kono
parents:
diff changeset
133 # undef CUDA_ONE_CALL
kono
parents:
diff changeset
134 # undef CUDA_ONE_CALL_1
kono
parents:
diff changeset
135 # define CUDA_CALL_PREFIX cuda_lib.
kono
parents:
diff changeset
136 #else
kono
parents:
diff changeset
137 # define CUDA_CALL_PREFIX
kono
parents:
diff changeset
138 # define init_cuda_lib() true
kono
parents:
diff changeset
139 #endif
kono
parents:
diff changeset
140
kono
parents:
diff changeset
141 /* Convenience macros for the frequently used CUDA library call and
kono
parents:
diff changeset
142 error handling sequence as well as CUDA library calls that
kono
parents:
diff changeset
143 do the error checking themselves or don't do it at all. */
kono
parents:
diff changeset
144
kono
parents:
diff changeset
/* Call the CUDA function FN with the remaining arguments.  If the result
   is not CUDA_SUCCESS, report it with GOMP_PLUGIN_error and make the
   function in which the macro is expanded return ERET.  */
#define CUDA_CALL_ERET(ERET, FN, ...)                           \
  do                                                            \
    {                                                           \
      unsigned __r = CUDA_CALL_PREFIX FN (__VA_ARGS__);         \
      if (__r != CUDA_SUCCESS)                                  \
        {                                                       \
          GOMP_PLUGIN_error (#FN " error: %s",                  \
                             cuda_error (__r));                 \
          return ERET;                                          \
        }                                                       \
    }                                                           \
  while (0)

/* As CUDA_CALL_ERET, with false as the value returned on failure.  */
#define CUDA_CALL(FN, ...)                                      \
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

/* Call the CUDA function FN; any result other than CUDA_SUCCESS is
   reported through GOMP_PLUGIN_fatal.  */
#define CUDA_CALL_ASSERT(FN, ...)                               \
  do                                                            \
    {                                                           \
      unsigned __r = CUDA_CALL_PREFIX FN (__VA_ARGS__);         \
      if (__r != CUDA_SUCCESS)                                  \
        {                                                       \
          GOMP_PLUGIN_fatal (#FN " error: %s",                  \
                             cuda_error (__r));                 \
        }                                                       \
    }                                                           \
  while (0)

/* Plain call of the CUDA function FN; the expression yields FN's result
   and the caller is responsible for inspecting it.  */
#define CUDA_CALL_NOCHECK(FN, ...)                              \
  CUDA_CALL_PREFIX FN (__VA_ARGS__)
kono
parents:
diff changeset
173
kono
parents:
diff changeset
174 static const char *
kono
parents:
diff changeset
175 cuda_error (CUresult r)
kono
parents:
diff changeset
176 {
kono
parents:
diff changeset
177 #if CUDA_VERSION < 7000
kono
parents:
diff changeset
178 /* Specified in documentation and present in library from at least
kono
parents:
diff changeset
179 5.5. Not declared in header file prior to 7.0. */
kono
parents:
diff changeset
180 extern CUresult cuGetErrorString (CUresult, const char **);
kono
parents:
diff changeset
181 #endif
kono
parents:
diff changeset
182 const char *desc;
kono
parents:
diff changeset
183
kono
parents:
diff changeset
184 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
kono
parents:
diff changeset
185 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
186 desc = "unknown cuda error";
kono
parents:
diff changeset
187
kono
parents:
diff changeset
188 return desc;
kono
parents:
diff changeset
189 }
kono
parents:
diff changeset
190
kono
parents:
diff changeset
191 static unsigned int instantiated_devices = 0;
kono
parents:
diff changeset
192 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
kono
parents:
diff changeset
193
kono
parents:
diff changeset
194 struct ptx_stream
kono
parents:
diff changeset
195 {
kono
parents:
diff changeset
196 CUstream stream;
kono
parents:
diff changeset
197 pthread_t host_thread;
kono
parents:
diff changeset
198 bool multithreaded;
kono
parents:
diff changeset
199
kono
parents:
diff changeset
200 CUdeviceptr d;
kono
parents:
diff changeset
201 void *h;
kono
parents:
diff changeset
202 void *h_begin;
kono
parents:
diff changeset
203 void *h_end;
kono
parents:
diff changeset
204 void *h_next;
kono
parents:
diff changeset
205 void *h_prev;
kono
parents:
diff changeset
206 void *h_tail;
kono
parents:
diff changeset
207
kono
parents:
diff changeset
208 struct ptx_stream *next;
kono
parents:
diff changeset
209 };
kono
parents:
diff changeset
210
kono
parents:
diff changeset
211 /* Thread-specific data for PTX. */
kono
parents:
diff changeset
212
kono
parents:
diff changeset
struct nvptx_thread
{
  /* Stream currently associated with the thread.  */
  struct ptx_stream *current_stream;
  /* Device the thread is using; select_stream_for_async reads it to find
     the per-device stream tables.  */
  struct ptx_device *ptx_dev;
};
kono
parents:
diff changeset
218
kono
parents:
diff changeset
/* Header of one record in a stream's ring buffer (see map_push and
   map_pop).  */
struct map
{
  /* ASYNC argument given to map_push for this record.  */
  int async;
  /* Bytes occupied by the record, header included; map_pop advances the
     tail pointer by this amount.  */
  size_t size;
  /* Start of the payload; the pointers handed out by map_push refer to
     this location.  */
  char mappings[0];
};
kono
parents:
diff changeset
225
kono
parents:
diff changeset
226 static bool
kono
parents:
diff changeset
227 map_init (struct ptx_stream *s)
kono
parents:
diff changeset
228 {
kono
parents:
diff changeset
229 int size = getpagesize ();
kono
parents:
diff changeset
230
kono
parents:
diff changeset
231 assert (s);
kono
parents:
diff changeset
232 assert (!s->d);
kono
parents:
diff changeset
233 assert (!s->h);
kono
parents:
diff changeset
234
kono
parents:
diff changeset
235 CUDA_CALL (cuMemAllocHost, &s->h, size);
kono
parents:
diff changeset
236 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
kono
parents:
diff changeset
237
kono
parents:
diff changeset
238 assert (s->h);
kono
parents:
diff changeset
239
kono
parents:
diff changeset
240 s->h_begin = s->h;
kono
parents:
diff changeset
241 s->h_end = s->h_begin + size;
kono
parents:
diff changeset
242 s->h_next = s->h_prev = s->h_tail = s->h_begin;
kono
parents:
diff changeset
243
kono
parents:
diff changeset
244 assert (s->h_next);
kono
parents:
diff changeset
245 assert (s->h_end);
kono
parents:
diff changeset
246 return true;
kono
parents:
diff changeset
247 }
kono
parents:
diff changeset
248
kono
parents:
diff changeset
249 static bool
kono
parents:
diff changeset
250 map_fini (struct ptx_stream *s)
kono
parents:
diff changeset
251 {
kono
parents:
diff changeset
252 CUDA_CALL (cuMemFreeHost, s->h);
kono
parents:
diff changeset
253 return true;
kono
parents:
diff changeset
254 }
kono
parents:
diff changeset
255
kono
parents:
diff changeset
256 static void
kono
parents:
diff changeset
257 map_pop (struct ptx_stream *s)
kono
parents:
diff changeset
258 {
kono
parents:
diff changeset
259 struct map *m;
kono
parents:
diff changeset
260
kono
parents:
diff changeset
261 assert (s != NULL);
kono
parents:
diff changeset
262 assert (s->h_next);
kono
parents:
diff changeset
263 assert (s->h_prev);
kono
parents:
diff changeset
264 assert (s->h_tail);
kono
parents:
diff changeset
265
kono
parents:
diff changeset
266 m = s->h_tail;
kono
parents:
diff changeset
267
kono
parents:
diff changeset
268 s->h_tail += m->size;
kono
parents:
diff changeset
269
kono
parents:
diff changeset
270 if (s->h_tail >= s->h_end)
kono
parents:
diff changeset
271 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
kono
parents:
diff changeset
272
kono
parents:
diff changeset
273 if (s->h_next == s->h_tail)
kono
parents:
diff changeset
274 s->h_prev = s->h_next;
kono
parents:
diff changeset
275
kono
parents:
diff changeset
276 assert (s->h_next >= s->h_begin);
kono
parents:
diff changeset
277 assert (s->h_tail >= s->h_begin);
kono
parents:
diff changeset
278 assert (s->h_prev >= s->h_begin);
kono
parents:
diff changeset
279
kono
parents:
diff changeset
280 assert (s->h_next <= s->h_end);
kono
parents:
diff changeset
281 assert (s->h_tail <= s->h_end);
kono
parents:
diff changeset
282 assert (s->h_prev <= s->h_end);
kono
parents:
diff changeset
283 }
kono
parents:
diff changeset
284
kono
parents:
diff changeset
285 static void
kono
parents:
diff changeset
286 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
kono
parents:
diff changeset
287 {
kono
parents:
diff changeset
288 int left;
kono
parents:
diff changeset
289 int offset;
kono
parents:
diff changeset
290 struct map *m;
kono
parents:
diff changeset
291
kono
parents:
diff changeset
292 assert (s != NULL);
kono
parents:
diff changeset
293
kono
parents:
diff changeset
294 left = s->h_end - s->h_next;
kono
parents:
diff changeset
295 size += sizeof (struct map);
kono
parents:
diff changeset
296
kono
parents:
diff changeset
297 assert (s->h_prev);
kono
parents:
diff changeset
298 assert (s->h_next);
kono
parents:
diff changeset
299
kono
parents:
diff changeset
300 if (size >= left)
kono
parents:
diff changeset
301 {
kono
parents:
diff changeset
302 m = s->h_prev;
kono
parents:
diff changeset
303 m->size += left;
kono
parents:
diff changeset
304 s->h_next = s->h_begin;
kono
parents:
diff changeset
305
kono
parents:
diff changeset
306 if (s->h_next + size > s->h_end)
kono
parents:
diff changeset
307 GOMP_PLUGIN_fatal ("unable to push map");
kono
parents:
diff changeset
308 }
kono
parents:
diff changeset
309
kono
parents:
diff changeset
310 assert (s->h_next);
kono
parents:
diff changeset
311
kono
parents:
diff changeset
312 m = s->h_next;
kono
parents:
diff changeset
313 m->async = async;
kono
parents:
diff changeset
314 m->size = size;
kono
parents:
diff changeset
315
kono
parents:
diff changeset
316 offset = (void *)&m->mappings[0] - s->h;
kono
parents:
diff changeset
317
kono
parents:
diff changeset
318 *d = (void *)(s->d + offset);
kono
parents:
diff changeset
319 *h = (void *)(s->h + offset);
kono
parents:
diff changeset
320
kono
parents:
diff changeset
321 s->h_prev = s->h_next;
kono
parents:
diff changeset
322 s->h_next += size;
kono
parents:
diff changeset
323
kono
parents:
diff changeset
324 assert (s->h_prev);
kono
parents:
diff changeset
325 assert (s->h_next);
kono
parents:
diff changeset
326
kono
parents:
diff changeset
327 assert (s->h_next >= s->h_begin);
kono
parents:
diff changeset
328 assert (s->h_tail >= s->h_begin);
kono
parents:
diff changeset
329 assert (s->h_prev >= s->h_begin);
kono
parents:
diff changeset
330 assert (s->h_next <= s->h_end);
kono
parents:
diff changeset
331 assert (s->h_tail <= s->h_end);
kono
parents:
diff changeset
332 assert (s->h_prev <= s->h_end);
kono
parents:
diff changeset
333
kono
parents:
diff changeset
334 return;
kono
parents:
diff changeset
335 }
kono
parents:
diff changeset
336
kono
parents:
diff changeset
337 /* Target data function launch information. */
kono
parents:
diff changeset
338
kono
parents:
diff changeset
339 struct targ_fn_launch
kono
parents:
diff changeset
340 {
kono
parents:
diff changeset
341 const char *fn;
kono
parents:
diff changeset
342 unsigned short dim[GOMP_DIM_MAX];
kono
parents:
diff changeset
343 };
kono
parents:
diff changeset
344
kono
parents:
diff changeset
345 /* Target PTX object information. */
kono
parents:
diff changeset
346
kono
parents:
diff changeset
struct targ_ptx_obj
{
  /* The PTX object's code.  */
  const char *code;
  /* Size associated with CODE.  */
  size_t size;
};
kono
parents:
diff changeset
352
kono
parents:
diff changeset
353 /* Target data image information. */
kono
parents:
diff changeset
354
kono
parents:
diff changeset
typedef struct nvptx_tdata
{
  /* PTX objects of the image, PTX_NUM of them.  */
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  /* Variable names, VAR_NUM of them.  */
  const char *const *var_names;
  unsigned var_num;

  /* Function launch descriptions, FN_NUM of them.  */
  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;
kono
parents:
diff changeset
366
kono
parents:
diff changeset
367 /* Descriptor of a loaded function. */
kono
parents:
diff changeset
368
kono
parents:
diff changeset
369 struct targ_fn_descriptor
kono
parents:
diff changeset
370 {
kono
parents:
diff changeset
371 CUfunction fn;
kono
parents:
diff changeset
372 const struct targ_fn_launch *launch;
kono
parents:
diff changeset
373 int regs_per_thread;
kono
parents:
diff changeset
374 int max_threads_per_block;
kono
parents:
diff changeset
375 };
kono
parents:
diff changeset
376
kono
parents:
diff changeset
377 /* A loaded PTX image. */
kono
parents:
diff changeset
378 struct ptx_image_data
kono
parents:
diff changeset
379 {
kono
parents:
diff changeset
380 const void *target_data;
kono
parents:
diff changeset
381 CUmodule module;
kono
parents:
diff changeset
382
kono
parents:
diff changeset
383 struct targ_fn_descriptor *fns; /* Array of functions. */
kono
parents:
diff changeset
384
kono
parents:
diff changeset
385 struct ptx_image_data *next;
kono
parents:
diff changeset
386 };
kono
parents:
diff changeset
387
kono
parents:
diff changeset
388 struct ptx_device
kono
parents:
diff changeset
389 {
kono
parents:
diff changeset
390 CUcontext ctx;
kono
parents:
diff changeset
391 bool ctx_shared;
kono
parents:
diff changeset
392 CUdevice dev;
kono
parents:
diff changeset
393 struct ptx_stream *null_stream;
kono
parents:
diff changeset
394 /* All non-null streams associated with this device (actually context),
kono
parents:
diff changeset
395 either created implicitly or passed in from the user (via
kono
parents:
diff changeset
396 acc_set_cuda_stream). */
kono
parents:
diff changeset
397 struct ptx_stream *active_streams;
kono
parents:
diff changeset
398 struct {
kono
parents:
diff changeset
399 struct ptx_stream **arr;
kono
parents:
diff changeset
400 int size;
kono
parents:
diff changeset
401 } async_streams;
kono
parents:
diff changeset
402 /* A lock for use when manipulating the above stream list and array. */
kono
parents:
diff changeset
403 pthread_mutex_t stream_lock;
kono
parents:
diff changeset
404 int ord;
kono
parents:
diff changeset
405 bool overlap;
kono
parents:
diff changeset
406 bool map;
kono
parents:
diff changeset
407 bool concur;
kono
parents:
diff changeset
408 bool mkern;
kono
parents:
diff changeset
409 int mode;
kono
parents:
diff changeset
410 int clock_khz;
kono
parents:
diff changeset
411 int num_sms;
kono
parents:
diff changeset
412 int regs_per_block;
kono
parents:
diff changeset
413 int regs_per_sm;
kono
parents:
diff changeset
414
kono
parents:
diff changeset
415 struct ptx_image_data *images; /* Images loaded on device. */
kono
parents:
diff changeset
416 pthread_mutex_t image_lock; /* Lock for above list. */
kono
parents:
diff changeset
417
kono
parents:
diff changeset
418 struct ptx_device *next;
kono
parents:
diff changeset
419 };
kono
parents:
diff changeset
420
kono
parents:
diff changeset
/* Kinds of event; the values are consecutive starting from zero.  */
enum ptx_event_type
{
  PTX_EVT_MEM,                  /* 0 */
  PTX_EVT_KNL,                  /* 1 */
  PTX_EVT_SYNC,                 /* 2 */
  PTX_EVT_ASYNC_CLEANUP         /* 3 */
};
kono
parents:
diff changeset
428
kono
parents:
diff changeset
429 struct ptx_event
kono
parents:
diff changeset
430 {
kono
parents:
diff changeset
431 CUevent *evt;
kono
parents:
diff changeset
432 int type;
kono
parents:
diff changeset
433 void *addr;
kono
parents:
diff changeset
434 int ord;
kono
parents:
diff changeset
435 int val;
kono
parents:
diff changeset
436
kono
parents:
diff changeset
437 struct ptx_event *next;
kono
parents:
diff changeset
438 };
kono
parents:
diff changeset
439
kono
parents:
diff changeset
440 static pthread_mutex_t ptx_event_lock;
kono
parents:
diff changeset
441 static struct ptx_event *ptx_events;
kono
parents:
diff changeset
442
kono
parents:
diff changeset
443 static struct ptx_device **ptx_devices;
kono
parents:
diff changeset
444
kono
parents:
diff changeset
/* Return the value of GOMP_PLUGIN_acc_thread viewed as a pointer to the
   calling thread's 'struct nvptx_thread'.  */
static inline struct nvptx_thread *
nvptx_thread (void)
{
  struct nvptx_thread *self
    = (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();

  return self;
}
kono
parents:
diff changeset
450
kono
parents:
diff changeset
451 static bool
kono
parents:
diff changeset
452 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
kono
parents:
diff changeset
453 {
kono
parents:
diff changeset
454 int i;
kono
parents:
diff changeset
455 struct ptx_stream *null_stream
kono
parents:
diff changeset
456 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
kono
parents:
diff changeset
457
kono
parents:
diff changeset
458 null_stream->stream = NULL;
kono
parents:
diff changeset
459 null_stream->host_thread = pthread_self ();
kono
parents:
diff changeset
460 null_stream->multithreaded = true;
kono
parents:
diff changeset
461 null_stream->d = (CUdeviceptr) NULL;
kono
parents:
diff changeset
462 null_stream->h = NULL;
kono
parents:
diff changeset
463 if (!map_init (null_stream))
kono
parents:
diff changeset
464 return false;
kono
parents:
diff changeset
465
kono
parents:
diff changeset
466 ptx_dev->null_stream = null_stream;
kono
parents:
diff changeset
467 ptx_dev->active_streams = NULL;
kono
parents:
diff changeset
468 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
kono
parents:
diff changeset
469
kono
parents:
diff changeset
470 if (concurrency < 1)
kono
parents:
diff changeset
471 concurrency = 1;
kono
parents:
diff changeset
472
kono
parents:
diff changeset
473 /* This is just a guess -- make space for as many async streams as the
kono
parents:
diff changeset
474 current device is capable of concurrently executing. This can grow
kono
parents:
diff changeset
475 later as necessary. No streams are created yet. */
kono
parents:
diff changeset
476 ptx_dev->async_streams.arr
kono
parents:
diff changeset
477 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
kono
parents:
diff changeset
478 ptx_dev->async_streams.size = concurrency;
kono
parents:
diff changeset
479
kono
parents:
diff changeset
480 for (i = 0; i < concurrency; i++)
kono
parents:
diff changeset
481 ptx_dev->async_streams.arr[i] = NULL;
kono
parents:
diff changeset
482
kono
parents:
diff changeset
483 return true;
kono
parents:
diff changeset
484 }
kono
parents:
diff changeset
485
kono
parents:
diff changeset
486 static bool
kono
parents:
diff changeset
487 fini_streams_for_device (struct ptx_device *ptx_dev)
kono
parents:
diff changeset
488 {
kono
parents:
diff changeset
489 free (ptx_dev->async_streams.arr);
kono
parents:
diff changeset
490
kono
parents:
diff changeset
491 bool ret = true;
kono
parents:
diff changeset
492 while (ptx_dev->active_streams != NULL)
kono
parents:
diff changeset
493 {
kono
parents:
diff changeset
494 struct ptx_stream *s = ptx_dev->active_streams;
kono
parents:
diff changeset
495 ptx_dev->active_streams = ptx_dev->active_streams->next;
kono
parents:
diff changeset
496
kono
parents:
diff changeset
497 ret &= map_fini (s);
kono
parents:
diff changeset
498
kono
parents:
diff changeset
499 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
kono
parents:
diff changeset
500 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
501 {
kono
parents:
diff changeset
502 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
kono
parents:
diff changeset
503 ret = false;
kono
parents:
diff changeset
504 }
kono
parents:
diff changeset
505 free (s);
kono
parents:
diff changeset
506 }
kono
parents:
diff changeset
507
kono
parents:
diff changeset
508 ret &= map_fini (ptx_dev->null_stream);
kono
parents:
diff changeset
509 free (ptx_dev->null_stream);
kono
parents:
diff changeset
510 return ret;
kono
parents:
diff changeset
511 }
kono
parents:
diff changeset
512
kono
parents:
diff changeset
513 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
kono
parents:
diff changeset
514 thread THREAD (and also current device/context). If CREATE is true, create
kono
parents:
diff changeset
515 the stream if it does not exist (or use EXISTING if it is non-NULL), and
kono
parents:
diff changeset
516 associate the stream with the same thread argument. Returns stream to use
kono
parents:
diff changeset
517 as result. */
kono
parents:
diff changeset
518
kono
parents:
diff changeset
519 static struct ptx_stream *
kono
parents:
diff changeset
520 select_stream_for_async (int async, pthread_t thread, bool create,
kono
parents:
diff changeset
521 CUstream existing)
kono
parents:
diff changeset
522 {
kono
parents:
diff changeset
523 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
524 /* Local copy of TLS variable. */
kono
parents:
diff changeset
525 struct ptx_device *ptx_dev = nvthd->ptx_dev;
kono
parents:
diff changeset
526 struct ptx_stream *stream = NULL;
kono
parents:
diff changeset
527 int orig_async = async;
kono
parents:
diff changeset
528
kono
parents:
diff changeset
529 /* The special value acc_async_noval (-1) maps (for now) to an
kono
parents:
diff changeset
530 implicitly-created stream, which is then handled the same as any other
kono
parents:
diff changeset
531 numbered async stream. Other options are available, e.g. using the null
kono
parents:
diff changeset
532 stream for anonymous async operations, or choosing an idle stream from an
kono
parents:
diff changeset
533 active set. But, stick with this for now. */
kono
parents:
diff changeset
534 if (async > acc_async_sync)
kono
parents:
diff changeset
535 async++;
kono
parents:
diff changeset
536
kono
parents:
diff changeset
537 if (create)
kono
parents:
diff changeset
538 pthread_mutex_lock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
539
kono
parents:
diff changeset
540 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
kono
parents:
diff changeset
541 null stream, and in fact better performance may be obtainable if it doesn't
kono
parents:
diff changeset
542 (because the null stream enforces overly-strict synchronisation with
kono
parents:
diff changeset
543 respect to other streams for legacy reasons, and that's probably not
kono
parents:
diff changeset
544 needed with OpenACC). Maybe investigate later. */
kono
parents:
diff changeset
545 if (async == acc_async_sync)
kono
parents:
diff changeset
546 stream = ptx_dev->null_stream;
kono
parents:
diff changeset
547 else if (async >= 0 && async < ptx_dev->async_streams.size
kono
parents:
diff changeset
548 && ptx_dev->async_streams.arr[async] && !(create && existing))
kono
parents:
diff changeset
549 stream = ptx_dev->async_streams.arr[async];
kono
parents:
diff changeset
550 else if (async >= 0 && create)
kono
parents:
diff changeset
551 {
kono
parents:
diff changeset
552 if (async >= ptx_dev->async_streams.size)
kono
parents:
diff changeset
553 {
kono
parents:
diff changeset
554 int i, newsize = ptx_dev->async_streams.size * 2;
kono
parents:
diff changeset
555
kono
parents:
diff changeset
556 if (async >= newsize)
kono
parents:
diff changeset
557 newsize = async + 1;
kono
parents:
diff changeset
558
kono
parents:
diff changeset
559 ptx_dev->async_streams.arr
kono
parents:
diff changeset
560 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
kono
parents:
diff changeset
561 newsize * sizeof (struct ptx_stream *));
kono
parents:
diff changeset
562
kono
parents:
diff changeset
563 for (i = ptx_dev->async_streams.size; i < newsize; i++)
kono
parents:
diff changeset
564 ptx_dev->async_streams.arr[i] = NULL;
kono
parents:
diff changeset
565
kono
parents:
diff changeset
566 ptx_dev->async_streams.size = newsize;
kono
parents:
diff changeset
567 }
kono
parents:
diff changeset
568
kono
parents:
diff changeset
569 /* Create a new stream on-demand if there isn't one already, or if we're
kono
parents:
diff changeset
570 setting a particular async value to an existing (externally-provided)
kono
parents:
diff changeset
571 stream. */
kono
parents:
diff changeset
572 if (!ptx_dev->async_streams.arr[async] || existing)
kono
parents:
diff changeset
573 {
kono
parents:
diff changeset
574 CUresult r;
kono
parents:
diff changeset
575 struct ptx_stream *s
kono
parents:
diff changeset
576 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
kono
parents:
diff changeset
577
kono
parents:
diff changeset
578 if (existing)
kono
parents:
diff changeset
579 s->stream = existing;
kono
parents:
diff changeset
580 else
kono
parents:
diff changeset
581 {
kono
parents:
diff changeset
582 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
kono
parents:
diff changeset
583 CU_STREAM_DEFAULT);
kono
parents:
diff changeset
584 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
585 {
kono
parents:
diff changeset
586 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
587 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
kono
parents:
diff changeset
588 cuda_error (r));
kono
parents:
diff changeset
589 }
kono
parents:
diff changeset
590 }
kono
parents:
diff changeset
591
kono
parents:
diff changeset
592 /* If CREATE is true, we're going to be queueing some work on this
kono
parents:
diff changeset
593 stream. Associate it with the current host thread. */
kono
parents:
diff changeset
594 s->host_thread = thread;
kono
parents:
diff changeset
595 s->multithreaded = false;
kono
parents:
diff changeset
596
kono
parents:
diff changeset
597 s->d = (CUdeviceptr) NULL;
kono
parents:
diff changeset
598 s->h = NULL;
kono
parents:
diff changeset
599 if (!map_init (s))
kono
parents:
diff changeset
600 {
kono
parents:
diff changeset
601 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
602 GOMP_PLUGIN_fatal ("map_init fail");
kono
parents:
diff changeset
603 }
kono
parents:
diff changeset
604
kono
parents:
diff changeset
605 s->next = ptx_dev->active_streams;
kono
parents:
diff changeset
606 ptx_dev->active_streams = s;
kono
parents:
diff changeset
607 ptx_dev->async_streams.arr[async] = s;
kono
parents:
diff changeset
608 }
kono
parents:
diff changeset
609
kono
parents:
diff changeset
610 stream = ptx_dev->async_streams.arr[async];
kono
parents:
diff changeset
611 }
kono
parents:
diff changeset
612 else if (async < 0)
kono
parents:
diff changeset
613 {
kono
parents:
diff changeset
614 if (create)
kono
parents:
diff changeset
615 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
616 GOMP_PLUGIN_fatal ("bad async %d", async);
kono
parents:
diff changeset
617 }
kono
parents:
diff changeset
618
kono
parents:
diff changeset
619 if (create)
kono
parents:
diff changeset
620 {
kono
parents:
diff changeset
621 assert (stream != NULL);
kono
parents:
diff changeset
622
kono
parents:
diff changeset
623 /* If we're trying to use the same stream from different threads
kono
parents:
diff changeset
624 simultaneously, set stream->multithreaded to true. This affects the
kono
parents:
diff changeset
625 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
kono
parents:
diff changeset
626 only wait for asynchronous launches from the same host thread they are
kono
parents:
diff changeset
627 invoked on. If multiple threads use the same async value, we make note
kono
parents:
diff changeset
628 of that here and fall back to testing/waiting for all threads in those
kono
parents:
diff changeset
629 functions. */
kono
parents:
diff changeset
630 if (thread != stream->host_thread)
kono
parents:
diff changeset
631 stream->multithreaded = true;
kono
parents:
diff changeset
632
kono
parents:
diff changeset
633 pthread_mutex_unlock (&ptx_dev->stream_lock);
kono
parents:
diff changeset
634 }
kono
parents:
diff changeset
635 else if (stream && !stream->multithreaded
kono
parents:
diff changeset
636 && !pthread_equal (stream->host_thread, thread))
kono
parents:
diff changeset
637 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
kono
parents:
diff changeset
638
kono
parents:
diff changeset
639 return stream;
kono
parents:
diff changeset
640 }
kono
parents:
diff changeset
641
kono
parents:
diff changeset
642 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
kono
parents:
diff changeset
643 should be locked on entry and remains locked on exit. */
kono
parents:
diff changeset
644
kono
parents:
diff changeset
645 static bool
kono
parents:
diff changeset
646 nvptx_init (void)
kono
parents:
diff changeset
647 {
kono
parents:
diff changeset
648 int ndevs;
kono
parents:
diff changeset
649
kono
parents:
diff changeset
650 if (instantiated_devices != 0)
kono
parents:
diff changeset
651 return true;
kono
parents:
diff changeset
652
kono
parents:
diff changeset
653 ptx_events = NULL;
kono
parents:
diff changeset
654 pthread_mutex_init (&ptx_event_lock, NULL);
kono
parents:
diff changeset
655
kono
parents:
diff changeset
656 if (!init_cuda_lib ())
kono
parents:
diff changeset
657 return false;
kono
parents:
diff changeset
658
kono
parents:
diff changeset
659 CUDA_CALL (cuInit, 0);
kono
parents:
diff changeset
660
kono
parents:
diff changeset
661 CUDA_CALL (cuDeviceGetCount, &ndevs);
kono
parents:
diff changeset
662 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
kono
parents:
diff changeset
663 * ndevs);
kono
parents:
diff changeset
664 return true;
kono
parents:
diff changeset
665 }
kono
parents:
diff changeset
666
kono
parents:
diff changeset
667 /* Select the N'th PTX device for the current host thread. The device must
kono
parents:
diff changeset
668 have been previously opened before calling this function. */
kono
parents:
diff changeset
669
kono
parents:
diff changeset
670 static bool
kono
parents:
diff changeset
671 nvptx_attach_host_thread_to_device (int n)
kono
parents:
diff changeset
672 {
kono
parents:
diff changeset
673 CUdevice dev;
kono
parents:
diff changeset
674 CUresult r;
kono
parents:
diff changeset
675 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
676 CUcontext thd_ctx;
kono
parents:
diff changeset
677
kono
parents:
diff changeset
678 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
kono
parents:
diff changeset
679 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
kono
parents:
diff changeset
680 {
kono
parents:
diff changeset
681 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
kono
parents:
diff changeset
682 return false;
kono
parents:
diff changeset
683 }
kono
parents:
diff changeset
684
kono
parents:
diff changeset
685 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
kono
parents:
diff changeset
686 return true;
kono
parents:
diff changeset
687 else
kono
parents:
diff changeset
688 {
kono
parents:
diff changeset
689 CUcontext old_ctx;
kono
parents:
diff changeset
690
kono
parents:
diff changeset
691 ptx_dev = ptx_devices[n];
kono
parents:
diff changeset
692 if (!ptx_dev)
kono
parents:
diff changeset
693 {
kono
parents:
diff changeset
694 GOMP_PLUGIN_error ("device %d not found", n);
kono
parents:
diff changeset
695 return false;
kono
parents:
diff changeset
696 }
kono
parents:
diff changeset
697
kono
parents:
diff changeset
698 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
kono
parents:
diff changeset
699
kono
parents:
diff changeset
700 /* We don't necessarily have a current context (e.g. if it has been
kono
parents:
diff changeset
701 destroyed. Pop it if we do though. */
kono
parents:
diff changeset
702 if (thd_ctx != NULL)
kono
parents:
diff changeset
703 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
kono
parents:
diff changeset
704
kono
parents:
diff changeset
705 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
kono
parents:
diff changeset
706 }
kono
parents:
diff changeset
707 return true;
kono
parents:
diff changeset
708 }
kono
parents:
diff changeset
709
kono
parents:
diff changeset
710 static struct ptx_device *
kono
parents:
diff changeset
711 nvptx_open_device (int n)
kono
parents:
diff changeset
712 {
kono
parents:
diff changeset
713 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
714 CUdevice dev, ctx_dev;
kono
parents:
diff changeset
715 CUresult r;
kono
parents:
diff changeset
716 int async_engines, pi;
kono
parents:
diff changeset
717
kono
parents:
diff changeset
718 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
kono
parents:
diff changeset
719
kono
parents:
diff changeset
720 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
kono
parents:
diff changeset
721
kono
parents:
diff changeset
722 ptx_dev->ord = n;
kono
parents:
diff changeset
723 ptx_dev->dev = dev;
kono
parents:
diff changeset
724 ptx_dev->ctx_shared = false;
kono
parents:
diff changeset
725
kono
parents:
diff changeset
726 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
kono
parents:
diff changeset
727 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
kono
parents:
diff changeset
728 {
kono
parents:
diff changeset
729 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
kono
parents:
diff changeset
730 return NULL;
kono
parents:
diff changeset
731 }
kono
parents:
diff changeset
732
kono
parents:
diff changeset
733 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
kono
parents:
diff changeset
734 {
kono
parents:
diff changeset
735 /* The current host thread has an active context for a different device.
kono
parents:
diff changeset
736 Detach it. */
kono
parents:
diff changeset
737 CUcontext old_ctx;
kono
parents:
diff changeset
738 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
kono
parents:
diff changeset
739 }
kono
parents:
diff changeset
740
kono
parents:
diff changeset
741 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
kono
parents:
diff changeset
742
kono
parents:
diff changeset
743 if (!ptx_dev->ctx)
kono
parents:
diff changeset
744 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
kono
parents:
diff changeset
745 else
kono
parents:
diff changeset
746 ptx_dev->ctx_shared = true;
kono
parents:
diff changeset
747
kono
parents:
diff changeset
748 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
749 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
kono
parents:
diff changeset
750 ptx_dev->overlap = pi;
kono
parents:
diff changeset
751
kono
parents:
diff changeset
752 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
753 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
kono
parents:
diff changeset
754 ptx_dev->map = pi;
kono
parents:
diff changeset
755
kono
parents:
diff changeset
756 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
757 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
kono
parents:
diff changeset
758 ptx_dev->concur = pi;
kono
parents:
diff changeset
759
kono
parents:
diff changeset
760 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
761 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
kono
parents:
diff changeset
762 ptx_dev->mode = pi;
kono
parents:
diff changeset
763
kono
parents:
diff changeset
764 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
765 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
kono
parents:
diff changeset
766 ptx_dev->mkern = pi;
kono
parents:
diff changeset
767
kono
parents:
diff changeset
768 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
769 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
kono
parents:
diff changeset
770 ptx_dev->clock_khz = pi;
kono
parents:
diff changeset
771
kono
parents:
diff changeset
772 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
773 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
kono
parents:
diff changeset
774 ptx_dev->num_sms = pi;
kono
parents:
diff changeset
775
kono
parents:
diff changeset
776 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
777 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
kono
parents:
diff changeset
778 ptx_dev->regs_per_block = pi;
kono
parents:
diff changeset
779
kono
parents:
diff changeset
780 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
kono
parents:
diff changeset
781 in CUDA 6.0 and newer. */
kono
parents:
diff changeset
782 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
kono
parents:
diff changeset
783 /* Fallback: use limit of registers per block, which is usually equal. */
kono
parents:
diff changeset
784 if (r == CUDA_ERROR_INVALID_VALUE)
kono
parents:
diff changeset
785 pi = ptx_dev->regs_per_block;
kono
parents:
diff changeset
786 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
787 {
kono
parents:
diff changeset
788 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
kono
parents:
diff changeset
789 return NULL;
kono
parents:
diff changeset
790 }
kono
parents:
diff changeset
791 ptx_dev->regs_per_sm = pi;
kono
parents:
diff changeset
792
kono
parents:
diff changeset
793 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
kono
parents:
diff changeset
794 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
kono
parents:
diff changeset
795 if (pi != 32)
kono
parents:
diff changeset
796 {
kono
parents:
diff changeset
797 GOMP_PLUGIN_error ("Only warp size 32 is supported");
kono
parents:
diff changeset
798 return NULL;
kono
parents:
diff changeset
799 }
kono
parents:
diff changeset
800
kono
parents:
diff changeset
801 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
kono
parents:
diff changeset
802 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
kono
parents:
diff changeset
803 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
804 async_engines = 1;
kono
parents:
diff changeset
805
kono
parents:
diff changeset
806 ptx_dev->images = NULL;
kono
parents:
diff changeset
807 pthread_mutex_init (&ptx_dev->image_lock, NULL);
kono
parents:
diff changeset
808
kono
parents:
diff changeset
809 if (!init_streams_for_device (ptx_dev, async_engines))
kono
parents:
diff changeset
810 return NULL;
kono
parents:
diff changeset
811
kono
parents:
diff changeset
812 return ptx_dev;
kono
parents:
diff changeset
813 }
kono
parents:
diff changeset
814
kono
parents:
diff changeset
815 static bool
kono
parents:
diff changeset
816 nvptx_close_device (struct ptx_device *ptx_dev)
kono
parents:
diff changeset
817 {
kono
parents:
diff changeset
818 if (!ptx_dev)
kono
parents:
diff changeset
819 return true;
kono
parents:
diff changeset
820
kono
parents:
diff changeset
821 if (!fini_streams_for_device (ptx_dev))
kono
parents:
diff changeset
822 return false;
kono
parents:
diff changeset
823
kono
parents:
diff changeset
824 pthread_mutex_destroy (&ptx_dev->image_lock);
kono
parents:
diff changeset
825
kono
parents:
diff changeset
826 if (!ptx_dev->ctx_shared)
kono
parents:
diff changeset
827 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
kono
parents:
diff changeset
828
kono
parents:
diff changeset
829 free (ptx_dev);
kono
parents:
diff changeset
830 return true;
kono
parents:
diff changeset
831 }
kono
parents:
diff changeset
832
kono
parents:
diff changeset
833 static int
kono
parents:
diff changeset
834 nvptx_get_num_devices (void)
kono
parents:
diff changeset
835 {
kono
parents:
diff changeset
836 int n;
kono
parents:
diff changeset
837
kono
parents:
diff changeset
838 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
kono
parents:
diff changeset
839 configurations. */
kono
parents:
diff changeset
840 if (sizeof (void *) != 8)
kono
parents:
diff changeset
841 {
kono
parents:
diff changeset
842 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
kono
parents:
diff changeset
843 " only 64-bit configurations are supported\n");
kono
parents:
diff changeset
844 return 0;
kono
parents:
diff changeset
845 }
kono
parents:
diff changeset
846
kono
parents:
diff changeset
847 /* This function will be called before the plugin has been initialized in
kono
parents:
diff changeset
848 order to enumerate available devices, but CUDA API routines can't be used
kono
parents:
diff changeset
849 until cuInit has been called. Just call it now (but don't yet do any
kono
parents:
diff changeset
850 further initialization). */
kono
parents:
diff changeset
851 if (instantiated_devices == 0)
kono
parents:
diff changeset
852 {
kono
parents:
diff changeset
853 if (!init_cuda_lib ())
kono
parents:
diff changeset
854 return 0;
kono
parents:
diff changeset
855 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
kono
parents:
diff changeset
856 /* This is not an error: e.g. we may have CUDA libraries installed but
kono
parents:
diff changeset
857 no devices available. */
kono
parents:
diff changeset
858 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
859 {
kono
parents:
diff changeset
860 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
kono
parents:
diff changeset
861 cuda_error (r));
kono
parents:
diff changeset
862 return 0;
kono
parents:
diff changeset
863 }
kono
parents:
diff changeset
864 }
kono
parents:
diff changeset
865
kono
parents:
diff changeset
866 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
kono
parents:
diff changeset
867 return n;
kono
parents:
diff changeset
868 }
kono
parents:
diff changeset
869
kono
parents:
diff changeset
/* Emit a debug message showing the environment variable VAR_NAME together
   with its value ENV_VAR, or a "<Not defined>" marker when ENV_VAR is
   NULL.  */

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var != NULL)
    {
      GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
      return;
    }

  GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
}
kono
parents:
diff changeset
878
kono
parents:
diff changeset
879 static bool
kono
parents:
diff changeset
880 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
kono
parents:
diff changeset
881 unsigned num_objs)
kono
parents:
diff changeset
882 {
kono
parents:
diff changeset
883 CUjit_option opts[6];
kono
parents:
diff changeset
884 void *optvals[6];
kono
parents:
diff changeset
885 float elapsed = 0.0;
kono
parents:
diff changeset
886 char elog[1024];
kono
parents:
diff changeset
887 char ilog[16384];
kono
parents:
diff changeset
888 CUlinkState linkstate;
kono
parents:
diff changeset
889 CUresult r;
kono
parents:
diff changeset
890 void *linkout;
kono
parents:
diff changeset
891 size_t linkoutsize __attribute__ ((unused));
kono
parents:
diff changeset
892
kono
parents:
diff changeset
893 opts[0] = CU_JIT_WALL_TIME;
kono
parents:
diff changeset
894 optvals[0] = &elapsed;
kono
parents:
diff changeset
895
kono
parents:
diff changeset
896 opts[1] = CU_JIT_INFO_LOG_BUFFER;
kono
parents:
diff changeset
897 optvals[1] = &ilog[0];
kono
parents:
diff changeset
898
kono
parents:
diff changeset
899 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
kono
parents:
diff changeset
900 optvals[2] = (void *) sizeof ilog;
kono
parents:
diff changeset
901
kono
parents:
diff changeset
902 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
kono
parents:
diff changeset
903 optvals[3] = &elog[0];
kono
parents:
diff changeset
904
kono
parents:
diff changeset
905 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
kono
parents:
diff changeset
906 optvals[4] = (void *) sizeof elog;
kono
parents:
diff changeset
907
kono
parents:
diff changeset
908 opts[5] = CU_JIT_LOG_VERBOSE;
kono
parents:
diff changeset
909 optvals[5] = (void *) 1;
kono
parents:
diff changeset
910
kono
parents:
diff changeset
911 CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
kono
parents:
diff changeset
912
kono
parents:
diff changeset
913 for (; num_objs--; ptx_objs++)
kono
parents:
diff changeset
914 {
kono
parents:
diff changeset
915 /* cuLinkAddData's 'data' argument erroneously omits the const
kono
parents:
diff changeset
916 qualifier. */
kono
parents:
diff changeset
917 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
kono
parents:
diff changeset
918 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
kono
parents:
diff changeset
919 (char *) ptx_objs->code, ptx_objs->size,
kono
parents:
diff changeset
920 0, 0, 0, 0);
kono
parents:
diff changeset
921 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
922 {
kono
parents:
diff changeset
923 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
kono
parents:
diff changeset
924 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
kono
parents:
diff changeset
925 cuda_error (r));
kono
parents:
diff changeset
926 return false;
kono
parents:
diff changeset
927 }
kono
parents:
diff changeset
928 }
kono
parents:
diff changeset
929
kono
parents:
diff changeset
930 GOMP_PLUGIN_debug (0, "Linking\n");
kono
parents:
diff changeset
931 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
kono
parents:
diff changeset
932
kono
parents:
diff changeset
933 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
kono
parents:
diff changeset
934 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
kono
parents:
diff changeset
935
kono
parents:
diff changeset
936 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
937 {
kono
parents:
diff changeset
938 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
kono
parents:
diff changeset
939 return false;
kono
parents:
diff changeset
940 }
kono
parents:
diff changeset
941
kono
parents:
diff changeset
942 CUDA_CALL (cuModuleLoadData, module, linkout);
kono
parents:
diff changeset
943 CUDA_CALL (cuLinkDestroy, linkstate);
kono
parents:
diff changeset
944 return true;
kono
parents:
diff changeset
945 }
kono
parents:
diff changeset
946
kono
parents:
diff changeset
947 static void
kono
parents:
diff changeset
948 event_gc (bool memmap_lockable)
kono
parents:
diff changeset
949 {
kono
parents:
diff changeset
950 struct ptx_event *ptx_event = ptx_events;
kono
parents:
diff changeset
951 struct ptx_event *async_cleanups = NULL;
kono
parents:
diff changeset
952 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
953
kono
parents:
diff changeset
954 pthread_mutex_lock (&ptx_event_lock);
kono
parents:
diff changeset
955
kono
parents:
diff changeset
956 while (ptx_event != NULL)
kono
parents:
diff changeset
957 {
kono
parents:
diff changeset
958 CUresult r;
kono
parents:
diff changeset
959 struct ptx_event *e = ptx_event;
kono
parents:
diff changeset
960
kono
parents:
diff changeset
961 ptx_event = ptx_event->next;
kono
parents:
diff changeset
962
kono
parents:
diff changeset
963 if (e->ord != nvthd->ptx_dev->ord)
kono
parents:
diff changeset
964 continue;
kono
parents:
diff changeset
965
kono
parents:
diff changeset
966 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
kono
parents:
diff changeset
967 if (r == CUDA_SUCCESS)
kono
parents:
diff changeset
968 {
kono
parents:
diff changeset
969 bool append_async = false;
kono
parents:
diff changeset
970 CUevent *te;
kono
parents:
diff changeset
971
kono
parents:
diff changeset
972 te = e->evt;
kono
parents:
diff changeset
973
kono
parents:
diff changeset
974 switch (e->type)
kono
parents:
diff changeset
975 {
kono
parents:
diff changeset
976 case PTX_EVT_MEM:
kono
parents:
diff changeset
977 case PTX_EVT_SYNC:
kono
parents:
diff changeset
978 break;
kono
parents:
diff changeset
979
kono
parents:
diff changeset
980 case PTX_EVT_KNL:
kono
parents:
diff changeset
981 map_pop (e->addr);
kono
parents:
diff changeset
982 break;
kono
parents:
diff changeset
983
kono
parents:
diff changeset
984 case PTX_EVT_ASYNC_CLEANUP:
kono
parents:
diff changeset
985 {
kono
parents:
diff changeset
986 /* The function gomp_plugin_async_unmap_vars needs to claim the
kono
parents:
diff changeset
987 memory-map splay tree lock for the current device, so we
kono
parents:
diff changeset
988 can't call it when one of our callers has already claimed
kono
parents:
diff changeset
989 the lock. In that case, just delay the GC for this event
kono
parents:
diff changeset
990 until later. */
kono
parents:
diff changeset
991 if (!memmap_lockable)
kono
parents:
diff changeset
992 continue;
kono
parents:
diff changeset
993
kono
parents:
diff changeset
994 append_async = true;
kono
parents:
diff changeset
995 }
kono
parents:
diff changeset
996 break;
kono
parents:
diff changeset
997 }
kono
parents:
diff changeset
998
kono
parents:
diff changeset
999 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
kono
parents:
diff changeset
1000 free ((void *)te);
kono
parents:
diff changeset
1001
kono
parents:
diff changeset
1002 /* Unlink 'e' from ptx_events list. */
kono
parents:
diff changeset
1003 if (ptx_events == e)
kono
parents:
diff changeset
1004 ptx_events = ptx_events->next;
kono
parents:
diff changeset
1005 else
kono
parents:
diff changeset
1006 {
kono
parents:
diff changeset
1007 struct ptx_event *e_ = ptx_events;
kono
parents:
diff changeset
1008 while (e_->next != e)
kono
parents:
diff changeset
1009 e_ = e_->next;
kono
parents:
diff changeset
1010 e_->next = e_->next->next;
kono
parents:
diff changeset
1011 }
kono
parents:
diff changeset
1012
kono
parents:
diff changeset
1013 if (append_async)
kono
parents:
diff changeset
1014 {
kono
parents:
diff changeset
1015 e->next = async_cleanups;
kono
parents:
diff changeset
1016 async_cleanups = e;
kono
parents:
diff changeset
1017 }
kono
parents:
diff changeset
1018 else
kono
parents:
diff changeset
1019 free (e);
kono
parents:
diff changeset
1020 }
kono
parents:
diff changeset
1021 }
kono
parents:
diff changeset
1022
kono
parents:
diff changeset
1023 pthread_mutex_unlock (&ptx_event_lock);
kono
parents:
diff changeset
1024
kono
parents:
diff changeset
1025 /* We have to do these here, after ptx_event_lock is released. */
kono
parents:
diff changeset
1026 while (async_cleanups)
kono
parents:
diff changeset
1027 {
kono
parents:
diff changeset
1028 struct ptx_event *e = async_cleanups;
kono
parents:
diff changeset
1029 async_cleanups = async_cleanups->next;
kono
parents:
diff changeset
1030
kono
parents:
diff changeset
1031 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
kono
parents:
diff changeset
1032 free (e);
kono
parents:
diff changeset
1033 }
kono
parents:
diff changeset
1034 }
kono
parents:
diff changeset
1035
kono
parents:
diff changeset
1036 static void
kono
parents:
diff changeset
1037 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
kono
parents:
diff changeset
1038 {
kono
parents:
diff changeset
1039 struct ptx_event *ptx_event;
kono
parents:
diff changeset
1040 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1041
kono
parents:
diff changeset
1042 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
kono
parents:
diff changeset
1043 || type == PTX_EVT_ASYNC_CLEANUP);
kono
parents:
diff changeset
1044
kono
parents:
diff changeset
1045 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
kono
parents:
diff changeset
1046 ptx_event->type = type;
kono
parents:
diff changeset
1047 ptx_event->evt = e;
kono
parents:
diff changeset
1048 ptx_event->addr = h;
kono
parents:
diff changeset
1049 ptx_event->ord = nvthd->ptx_dev->ord;
kono
parents:
diff changeset
1050 ptx_event->val = val;
kono
parents:
diff changeset
1051
kono
parents:
diff changeset
1052 pthread_mutex_lock (&ptx_event_lock);
kono
parents:
diff changeset
1053
kono
parents:
diff changeset
1054 ptx_event->next = ptx_events;
kono
parents:
diff changeset
1055 ptx_events = ptx_event;
kono
parents:
diff changeset
1056
kono
parents:
diff changeset
1057 pthread_mutex_unlock (&ptx_event_lock);
kono
parents:
diff changeset
1058 }
kono
parents:
diff changeset
1059
kono
parents:
diff changeset
1060 static void
kono
parents:
diff changeset
1061 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
kono
parents:
diff changeset
1062 int async, unsigned *dims, void *targ_mem_desc)
kono
parents:
diff changeset
1063 {
kono
parents:
diff changeset
1064 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
kono
parents:
diff changeset
1065 CUfunction function;
kono
parents:
diff changeset
1066 CUresult r;
kono
parents:
diff changeset
1067 int i;
kono
parents:
diff changeset
1068 struct ptx_stream *dev_str;
kono
parents:
diff changeset
1069 void *kargs[1];
kono
parents:
diff changeset
1070 void *hp, *dp;
kono
parents:
diff changeset
1071 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1072 const char *maybe_abort_msg = "(perhaps abort was called)";
kono
parents:
diff changeset
1073
kono
parents:
diff changeset
1074 function = targ_fn->fn;
kono
parents:
diff changeset
1075
kono
parents:
diff changeset
1076 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1077 assert (dev_str == nvthd->current_stream);
kono
parents:
diff changeset
1078
kono
parents:
diff changeset
1079 /* Initialize the launch dimensions. Typically this is constant,
kono
parents:
diff changeset
1080 provided by the device compiler, but we must permit runtime
kono
parents:
diff changeset
1081 values. */
kono
parents:
diff changeset
1082 int seen_zero = 0;
kono
parents:
diff changeset
1083 for (i = 0; i != GOMP_DIM_MAX; i++)
kono
parents:
diff changeset
1084 {
kono
parents:
diff changeset
1085 if (targ_fn->launch->dim[i])
kono
parents:
diff changeset
1086 dims[i] = targ_fn->launch->dim[i];
kono
parents:
diff changeset
1087 if (!dims[i])
kono
parents:
diff changeset
1088 seen_zero = 1;
kono
parents:
diff changeset
1089 }
kono
parents:
diff changeset
1090
kono
parents:
diff changeset
1091 if (seen_zero)
kono
parents:
diff changeset
1092 {
kono
parents:
diff changeset
1093 /* See if the user provided GOMP_OPENACC_DIM environment
kono
parents:
diff changeset
1094 variable to specify runtime defaults. */
kono
parents:
diff changeset
1095 static int default_dims[GOMP_DIM_MAX];
kono
parents:
diff changeset
1096
kono
parents:
diff changeset
1097 pthread_mutex_lock (&ptx_dev_lock);
kono
parents:
diff changeset
1098 if (!default_dims[0])
kono
parents:
diff changeset
1099 {
kono
parents:
diff changeset
1100 const char *var_name = "GOMP_OPENACC_DIM";
kono
parents:
diff changeset
1101 /* We only read the environment variable once. You can't
kono
parents:
diff changeset
1102 change it in the middle of execution. The syntax is
kono
parents:
diff changeset
1103 the same as for the -fopenacc-dim compilation option. */
kono
parents:
diff changeset
1104 const char *env_var = getenv (var_name);
kono
parents:
diff changeset
1105 notify_var (var_name, env_var);
kono
parents:
diff changeset
1106 if (env_var)
kono
parents:
diff changeset
1107 {
kono
parents:
diff changeset
1108 const char *pos = env_var;
kono
parents:
diff changeset
1109
kono
parents:
diff changeset
1110 for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
kono
parents:
diff changeset
1111 {
kono
parents:
diff changeset
1112 if (i && *pos++ != ':')
kono
parents:
diff changeset
1113 break;
kono
parents:
diff changeset
1114 if (*pos != ':')
kono
parents:
diff changeset
1115 {
kono
parents:
diff changeset
1116 const char *eptr;
kono
parents:
diff changeset
1117
kono
parents:
diff changeset
1118 errno = 0;
kono
parents:
diff changeset
1119 long val = strtol (pos, (char **)&eptr, 10);
kono
parents:
diff changeset
1120 if (errno || val < 0 || (unsigned)val != val)
kono
parents:
diff changeset
1121 break;
kono
parents:
diff changeset
1122 default_dims[i] = (int)val;
kono
parents:
diff changeset
1123 pos = eptr;
kono
parents:
diff changeset
1124 }
kono
parents:
diff changeset
1125 }
kono
parents:
diff changeset
1126 }
kono
parents:
diff changeset
1127
kono
parents:
diff changeset
1128 int warp_size, block_size, dev_size, cpu_size;
kono
parents:
diff changeset
1129 CUdevice dev = nvptx_thread()->ptx_dev->dev;
kono
parents:
diff changeset
1130 /* 32 is the default for known hardware. */
kono
parents:
diff changeset
1131 int gang = 0, worker = 32, vector = 32;
kono
parents:
diff changeset
1132 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
kono
parents:
diff changeset
1133
kono
parents:
diff changeset
1134 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
kono
parents:
diff changeset
1135 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
kono
parents:
diff changeset
1136 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
kono
parents:
diff changeset
1137 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
kono
parents:
diff changeset
1138
kono
parents:
diff changeset
1139 if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
kono
parents:
diff changeset
1140 dev) == CUDA_SUCCESS
kono
parents:
diff changeset
1141 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
kono
parents:
diff changeset
1142 dev) == CUDA_SUCCESS
kono
parents:
diff changeset
1143 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
kono
parents:
diff changeset
1144 dev) == CUDA_SUCCESS
kono
parents:
diff changeset
1145 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
kono
parents:
diff changeset
1146 dev) == CUDA_SUCCESS)
kono
parents:
diff changeset
1147 {
kono
parents:
diff changeset
1148 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
kono
parents:
diff changeset
1149 " dev_size=%d, cpu_size=%d\n",
kono
parents:
diff changeset
1150 warp_size, block_size, dev_size, cpu_size);
kono
parents:
diff changeset
1151 gang = (cpu_size / block_size) * dev_size;
kono
parents:
diff changeset
1152 worker = block_size / warp_size;
kono
parents:
diff changeset
1153 vector = warp_size;
kono
parents:
diff changeset
1154 }
kono
parents:
diff changeset
1155
kono
parents:
diff changeset
1156 /* There is no upper bound on the gang size. The best size
kono
parents:
diff changeset
1157 matches the hardware configuration. Logical gangs are
kono
parents:
diff changeset
1158 scheduled onto physical hardware. To maximize usage, we
kono
parents:
diff changeset
1159 should guess a large number. */
kono
parents:
diff changeset
1160 if (default_dims[GOMP_DIM_GANG] < 1)
kono
parents:
diff changeset
1161 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
kono
parents:
diff changeset
1162 /* The worker size must not exceed the hardware. */
kono
parents:
diff changeset
1163 if (default_dims[GOMP_DIM_WORKER] < 1
kono
parents:
diff changeset
1164 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
kono
parents:
diff changeset
1165 default_dims[GOMP_DIM_WORKER] = worker;
kono
parents:
diff changeset
1166 /* The vector size must exactly match the hardware. */
kono
parents:
diff changeset
1167 if (default_dims[GOMP_DIM_VECTOR] < 1
kono
parents:
diff changeset
1168 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
kono
parents:
diff changeset
1169 default_dims[GOMP_DIM_VECTOR] = vector;
kono
parents:
diff changeset
1170
kono
parents:
diff changeset
1171 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
kono
parents:
diff changeset
1172 default_dims[GOMP_DIM_GANG],
kono
parents:
diff changeset
1173 default_dims[GOMP_DIM_WORKER],
kono
parents:
diff changeset
1174 default_dims[GOMP_DIM_VECTOR]);
kono
parents:
diff changeset
1175 }
kono
parents:
diff changeset
1176 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1177
kono
parents:
diff changeset
1178 for (i = 0; i != GOMP_DIM_MAX; i++)
kono
parents:
diff changeset
1179 if (!dims[i])
kono
parents:
diff changeset
1180 dims[i] = default_dims[i];
kono
parents:
diff changeset
1181 }
kono
parents:
diff changeset
1182
kono
parents:
diff changeset
1183 /* This reserves a chunk of a pre-allocated page of memory mapped on both
kono
parents:
diff changeset
1184 the host and the device. HP is a host pointer to the new chunk, and DP is
kono
parents:
diff changeset
1185 the corresponding device pointer. */
kono
parents:
diff changeset
1186 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
kono
parents:
diff changeset
1187
kono
parents:
diff changeset
1188 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
kono
parents:
diff changeset
1189
kono
parents:
diff changeset
1190 /* Copy the array of arguments to the mapped page. */
kono
parents:
diff changeset
1191 for (i = 0; i < mapnum; i++)
kono
parents:
diff changeset
1192 ((void **) hp)[i] = devaddrs[i];
kono
parents:
diff changeset
1193
kono
parents:
diff changeset
1194 /* Copy the (device) pointers to arguments to the device (dp and hp might in
kono
parents:
diff changeset
1195 fact have the same value on a unified-memory system). */
kono
parents:
diff changeset
1196 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
kono
parents:
diff changeset
1197 mapnum * sizeof (void *));
kono
parents:
diff changeset
1198 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
kono
parents:
diff changeset
1199 " gangs=%u, workers=%u, vectors=%u\n",
kono
parents:
diff changeset
1200 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
kono
parents:
diff changeset
1201 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
kono
parents:
diff changeset
1202
kono
parents:
diff changeset
1203 // OpenACC CUDA
kono
parents:
diff changeset
1204 //
kono
parents:
diff changeset
1205 // num_gangs nctaid.x
kono
parents:
diff changeset
1206 // num_workers ntid.y
kono
parents:
diff changeset
1207 // vector length ntid.x
kono
parents:
diff changeset
1208
kono
parents:
diff changeset
1209 kargs[0] = &dp;
kono
parents:
diff changeset
1210 CUDA_CALL_ASSERT (cuLaunchKernel, function,
kono
parents:
diff changeset
1211 dims[GOMP_DIM_GANG], 1, 1,
kono
parents:
diff changeset
1212 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
kono
parents:
diff changeset
1213 0, dev_str->stream, kargs, 0);
kono
parents:
diff changeset
1214
kono
parents:
diff changeset
1215 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1216 if (async < acc_async_noval)
kono
parents:
diff changeset
1217 {
kono
parents:
diff changeset
1218 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
kono
parents:
diff changeset
1219 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
1220 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
1221 maybe_abort_msg);
kono
parents:
diff changeset
1222 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1223 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
kono
parents:
diff changeset
1224 }
kono
parents:
diff changeset
1225 else
kono
parents:
diff changeset
1226 {
kono
parents:
diff changeset
1227 CUevent *e;
kono
parents:
diff changeset
1228
kono
parents:
diff changeset
1229 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1230
kono
parents:
diff changeset
1231 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1232 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
1233 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
1234 maybe_abort_msg);
kono
parents:
diff changeset
1235 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1236 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
kono
parents:
diff changeset
1237
kono
parents:
diff changeset
1238 event_gc (true);
kono
parents:
diff changeset
1239
kono
parents:
diff changeset
1240 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
kono
parents:
diff changeset
1241
kono
parents:
diff changeset
1242 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
kono
parents:
diff changeset
1243 }
kono
parents:
diff changeset
1244 #else
kono
parents:
diff changeset
1245 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
kono
parents:
diff changeset
1246 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
1247 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
1248 maybe_abort_msg);
kono
parents:
diff changeset
1249 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1250 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
kono
parents:
diff changeset
1251 #endif
kono
parents:
diff changeset
1252
kono
parents:
diff changeset
1253 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
kono
parents:
diff changeset
1254 targ_fn->launch->fn);
kono
parents:
diff changeset
1255
kono
parents:
diff changeset
1256 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1257 if (async < acc_async_noval)
kono
parents:
diff changeset
1258 #endif
kono
parents:
diff changeset
1259 map_pop (dev_str);
kono
parents:
diff changeset
1260 }
kono
parents:
diff changeset
1261
kono
parents:
diff changeset
1262 void * openacc_get_current_cuda_context (void);
kono
parents:
diff changeset
1263
kono
parents:
diff changeset
1264 static void *
kono
parents:
diff changeset
1265 nvptx_alloc (size_t s)
kono
parents:
diff changeset
1266 {
kono
parents:
diff changeset
1267 CUdeviceptr d;
kono
parents:
diff changeset
1268
kono
parents:
diff changeset
1269 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
kono
parents:
diff changeset
1270 return (void *) d;
kono
parents:
diff changeset
1271 }
kono
parents:
diff changeset
1272
kono
parents:
diff changeset
1273 static bool
kono
parents:
diff changeset
1274 nvptx_free (void *p)
kono
parents:
diff changeset
1275 {
kono
parents:
diff changeset
1276 CUdeviceptr pb;
kono
parents:
diff changeset
1277 size_t ps;
kono
parents:
diff changeset
1278
kono
parents:
diff changeset
1279 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
kono
parents:
diff changeset
1280 if ((CUdeviceptr) p != pb)
kono
parents:
diff changeset
1281 {
kono
parents:
diff changeset
1282 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1283 return false;
kono
parents:
diff changeset
1284 }
kono
parents:
diff changeset
1285
kono
parents:
diff changeset
1286 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
kono
parents:
diff changeset
1287 return true;
kono
parents:
diff changeset
1288 }
kono
parents:
diff changeset
1289
kono
parents:
diff changeset
1290
kono
parents:
diff changeset
1291 static bool
kono
parents:
diff changeset
1292 nvptx_host2dev (void *d, const void *h, size_t s)
kono
parents:
diff changeset
1293 {
kono
parents:
diff changeset
1294 CUdeviceptr pb;
kono
parents:
diff changeset
1295 size_t ps;
kono
parents:
diff changeset
1296 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1297
kono
parents:
diff changeset
1298 if (!s)
kono
parents:
diff changeset
1299 return true;
kono
parents:
diff changeset
1300 if (!d)
kono
parents:
diff changeset
1301 {
kono
parents:
diff changeset
1302 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1303 return false;
kono
parents:
diff changeset
1304 }
kono
parents:
diff changeset
1305
kono
parents:
diff changeset
1306 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
kono
parents:
diff changeset
1307
kono
parents:
diff changeset
1308 if (!pb)
kono
parents:
diff changeset
1309 {
kono
parents:
diff changeset
1310 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1311 return false;
kono
parents:
diff changeset
1312 }
kono
parents:
diff changeset
1313 if (!h)
kono
parents:
diff changeset
1314 {
kono
parents:
diff changeset
1315 GOMP_PLUGIN_error ("invalid host address");
kono
parents:
diff changeset
1316 return false;
kono
parents:
diff changeset
1317 }
kono
parents:
diff changeset
1318 if (d == h)
kono
parents:
diff changeset
1319 {
kono
parents:
diff changeset
1320 GOMP_PLUGIN_error ("invalid host or device address");
kono
parents:
diff changeset
1321 return false;
kono
parents:
diff changeset
1322 }
kono
parents:
diff changeset
1323 if ((void *)(d + s) > (void *)(pb + ps))
kono
parents:
diff changeset
1324 {
kono
parents:
diff changeset
1325 GOMP_PLUGIN_error ("invalid size");
kono
parents:
diff changeset
1326 return false;
kono
parents:
diff changeset
1327 }
kono
parents:
diff changeset
1328
kono
parents:
diff changeset
1329 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1330 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
kono
parents:
diff changeset
1331 {
kono
parents:
diff changeset
1332 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1333 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1334 event_gc (false);
kono
parents:
diff changeset
1335 CUDA_CALL (cuMemcpyHtoDAsync,
kono
parents:
diff changeset
1336 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
kono
parents:
diff changeset
1337 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
kono
parents:
diff changeset
1338 event_add (PTX_EVT_MEM, e, (void *)h, 0);
kono
parents:
diff changeset
1339 }
kono
parents:
diff changeset
1340 else
kono
parents:
diff changeset
1341 #endif
kono
parents:
diff changeset
1342 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
kono
parents:
diff changeset
1343
kono
parents:
diff changeset
1344 return true;
kono
parents:
diff changeset
1345 }
kono
parents:
diff changeset
1346
kono
parents:
diff changeset
1347 static bool
kono
parents:
diff changeset
1348 nvptx_dev2host (void *h, const void *d, size_t s)
kono
parents:
diff changeset
1349 {
kono
parents:
diff changeset
1350 CUdeviceptr pb;
kono
parents:
diff changeset
1351 size_t ps;
kono
parents:
diff changeset
1352 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1353
kono
parents:
diff changeset
1354 if (!s)
kono
parents:
diff changeset
1355 return true;
kono
parents:
diff changeset
1356 if (!d)
kono
parents:
diff changeset
1357 {
kono
parents:
diff changeset
1358 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1359 return false;
kono
parents:
diff changeset
1360 }
kono
parents:
diff changeset
1361
kono
parents:
diff changeset
1362 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
kono
parents:
diff changeset
1363
kono
parents:
diff changeset
1364 if (!pb)
kono
parents:
diff changeset
1365 {
kono
parents:
diff changeset
1366 GOMP_PLUGIN_error ("invalid device address");
kono
parents:
diff changeset
1367 return false;
kono
parents:
diff changeset
1368 }
kono
parents:
diff changeset
1369 if (!h)
kono
parents:
diff changeset
1370 {
kono
parents:
diff changeset
1371 GOMP_PLUGIN_error ("invalid host address");
kono
parents:
diff changeset
1372 return false;
kono
parents:
diff changeset
1373 }
kono
parents:
diff changeset
1374 if (d == h)
kono
parents:
diff changeset
1375 {
kono
parents:
diff changeset
1376 GOMP_PLUGIN_error ("invalid host or device address");
kono
parents:
diff changeset
1377 return false;
kono
parents:
diff changeset
1378 }
kono
parents:
diff changeset
1379 if ((void *)(d + s) > (void *)(pb + ps))
kono
parents:
diff changeset
1380 {
kono
parents:
diff changeset
1381 GOMP_PLUGIN_error ("invalid size");
kono
parents:
diff changeset
1382 return false;
kono
parents:
diff changeset
1383 }
kono
parents:
diff changeset
1384
kono
parents:
diff changeset
1385 #ifndef DISABLE_ASYNC
kono
parents:
diff changeset
1386 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
kono
parents:
diff changeset
1387 {
kono
parents:
diff changeset
1388 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1389 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1390 event_gc (false);
kono
parents:
diff changeset
1391 CUDA_CALL (cuMemcpyDtoHAsync,
kono
parents:
diff changeset
1392 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
kono
parents:
diff changeset
1393 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
kono
parents:
diff changeset
1394 event_add (PTX_EVT_MEM, e, (void *)h, 0);
kono
parents:
diff changeset
1395 }
kono
parents:
diff changeset
1396 else
kono
parents:
diff changeset
1397 #endif
kono
parents:
diff changeset
1398 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
kono
parents:
diff changeset
1399
kono
parents:
diff changeset
1400 return true;
kono
parents:
diff changeset
1401 }
kono
parents:
diff changeset
1402
kono
parents:
diff changeset
1403 static void
kono
parents:
diff changeset
1404 nvptx_set_async (int async)
kono
parents:
diff changeset
1405 {
kono
parents:
diff changeset
1406 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1407 nvthd->current_stream
kono
parents:
diff changeset
1408 = select_stream_for_async (async, pthread_self (), true, NULL);
kono
parents:
diff changeset
1409 }
kono
parents:
diff changeset
1410
kono
parents:
diff changeset
1411 static int
kono
parents:
diff changeset
1412 nvptx_async_test (int async)
kono
parents:
diff changeset
1413 {
kono
parents:
diff changeset
1414 CUresult r;
kono
parents:
diff changeset
1415 struct ptx_stream *s;
kono
parents:
diff changeset
1416
kono
parents:
diff changeset
1417 s = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1418
kono
parents:
diff changeset
1419 if (!s)
kono
parents:
diff changeset
1420 GOMP_PLUGIN_fatal ("unknown async %d", async);
kono
parents:
diff changeset
1421
kono
parents:
diff changeset
1422 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
kono
parents:
diff changeset
1423 if (r == CUDA_SUCCESS)
kono
parents:
diff changeset
1424 {
kono
parents:
diff changeset
1425 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
kono
parents:
diff changeset
1426 whether all work has completed on this stream, and if so omits the call
kono
parents:
diff changeset
1427 to the wait hook. If that happens, event_gc might not get called
kono
parents:
diff changeset
1428 (which prevents variables from getting unmapped and their associated
kono
parents:
diff changeset
1429 device storage freed), so call it here. */
kono
parents:
diff changeset
1430 event_gc (true);
kono
parents:
diff changeset
1431 return 1;
kono
parents:
diff changeset
1432 }
kono
parents:
diff changeset
1433 else if (r == CUDA_ERROR_NOT_READY)
kono
parents:
diff changeset
1434 return 0;
kono
parents:
diff changeset
1435
kono
parents:
diff changeset
1436 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
kono
parents:
diff changeset
1437
kono
parents:
diff changeset
1438 return 0;
kono
parents:
diff changeset
1439 }
kono
parents:
diff changeset
1440
kono
parents:
diff changeset
1441 static int
kono
parents:
diff changeset
1442 nvptx_async_test_all (void)
kono
parents:
diff changeset
1443 {
kono
parents:
diff changeset
1444 struct ptx_stream *s;
kono
parents:
diff changeset
1445 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1446 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1447
kono
parents:
diff changeset
1448 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1449
kono
parents:
diff changeset
1450 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
kono
parents:
diff changeset
1451 {
kono
parents:
diff changeset
1452 if ((s->multithreaded || pthread_equal (s->host_thread, self))
kono
parents:
diff changeset
1453 && CUDA_CALL_NOCHECK (cuStreamQuery,
kono
parents:
diff changeset
1454 s->stream) == CUDA_ERROR_NOT_READY)
kono
parents:
diff changeset
1455 {
kono
parents:
diff changeset
1456 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1457 return 0;
kono
parents:
diff changeset
1458 }
kono
parents:
diff changeset
1459 }
kono
parents:
diff changeset
1460
kono
parents:
diff changeset
1461 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1462
kono
parents:
diff changeset
1463 event_gc (true);
kono
parents:
diff changeset
1464
kono
parents:
diff changeset
1465 return 1;
kono
parents:
diff changeset
1466 }
kono
parents:
diff changeset
1467
kono
parents:
diff changeset
1468 static void
kono
parents:
diff changeset
1469 nvptx_wait (int async)
kono
parents:
diff changeset
1470 {
kono
parents:
diff changeset
1471 struct ptx_stream *s;
kono
parents:
diff changeset
1472
kono
parents:
diff changeset
1473 s = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1474 if (!s)
kono
parents:
diff changeset
1475 GOMP_PLUGIN_fatal ("unknown async %d", async);
kono
parents:
diff changeset
1476
kono
parents:
diff changeset
1477 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
kono
parents:
diff changeset
1478
kono
parents:
diff changeset
1479 event_gc (true);
kono
parents:
diff changeset
1480 }
kono
parents:
diff changeset
1481
kono
parents:
diff changeset
1482 static void
kono
parents:
diff changeset
1483 nvptx_wait_async (int async1, int async2)
kono
parents:
diff changeset
1484 {
kono
parents:
diff changeset
1485 CUevent *e;
kono
parents:
diff changeset
1486 struct ptx_stream *s1, *s2;
kono
parents:
diff changeset
1487 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1488
kono
parents:
diff changeset
1489 /* The stream that is waiting (rather than being waited for) doesn't
kono
parents:
diff changeset
1490 necessarily have to exist already. */
kono
parents:
diff changeset
1491 s2 = select_stream_for_async (async2, self, true, NULL);
kono
parents:
diff changeset
1492
kono
parents:
diff changeset
1493 s1 = select_stream_for_async (async1, self, false, NULL);
kono
parents:
diff changeset
1494 if (!s1)
kono
parents:
diff changeset
1495 GOMP_PLUGIN_fatal ("invalid async 1\n");
kono
parents:
diff changeset
1496
kono
parents:
diff changeset
1497 if (s1 == s2)
kono
parents:
diff changeset
1498 GOMP_PLUGIN_fatal ("identical parameters");
kono
parents:
diff changeset
1499
kono
parents:
diff changeset
1500 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1501
kono
parents:
diff changeset
1502 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1503
kono
parents:
diff changeset
1504 event_gc (true);
kono
parents:
diff changeset
1505
kono
parents:
diff changeset
1506 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
kono
parents:
diff changeset
1507
kono
parents:
diff changeset
1508 event_add (PTX_EVT_SYNC, e, NULL, 0);
kono
parents:
diff changeset
1509
kono
parents:
diff changeset
1510 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
kono
parents:
diff changeset
1511 }
kono
parents:
diff changeset
1512
kono
parents:
diff changeset
1513 static void
kono
parents:
diff changeset
1514 nvptx_wait_all (void)
kono
parents:
diff changeset
1515 {
kono
parents:
diff changeset
1516 CUresult r;
kono
parents:
diff changeset
1517 struct ptx_stream *s;
kono
parents:
diff changeset
1518 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1519 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1520
kono
parents:
diff changeset
1521 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1522
kono
parents:
diff changeset
1523 /* Wait for active streams initiated by this thread (or by multiple threads)
kono
parents:
diff changeset
1524 to complete. */
kono
parents:
diff changeset
1525 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
kono
parents:
diff changeset
1526 {
kono
parents:
diff changeset
1527 if (s->multithreaded || pthread_equal (s->host_thread, self))
kono
parents:
diff changeset
1528 {
kono
parents:
diff changeset
1529 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
kono
parents:
diff changeset
1530 if (r == CUDA_SUCCESS)
kono
parents:
diff changeset
1531 continue;
kono
parents:
diff changeset
1532 else if (r != CUDA_ERROR_NOT_READY)
kono
parents:
diff changeset
1533 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
kono
parents:
diff changeset
1534
kono
parents:
diff changeset
1535 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
kono
parents:
diff changeset
1536 }
kono
parents:
diff changeset
1537 }
kono
parents:
diff changeset
1538
kono
parents:
diff changeset
1539 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1540
kono
parents:
diff changeset
1541 event_gc (true);
kono
parents:
diff changeset
1542 }
kono
parents:
diff changeset
1543
kono
parents:
diff changeset
1544 static void
kono
parents:
diff changeset
1545 nvptx_wait_all_async (int async)
kono
parents:
diff changeset
1546 {
kono
parents:
diff changeset
1547 struct ptx_stream *waiting_stream, *other_stream;
kono
parents:
diff changeset
1548 CUevent *e;
kono
parents:
diff changeset
1549 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1550 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1551
kono
parents:
diff changeset
1552 /* The stream doing the waiting. This could be the first mention of the
kono
parents:
diff changeset
1553 stream, so create it if necessary. */
kono
parents:
diff changeset
1554 waiting_stream
kono
parents:
diff changeset
1555 = select_stream_for_async (async, pthread_self (), true, NULL);
kono
parents:
diff changeset
1556
kono
parents:
diff changeset
1557 /* Launches on the null stream already block on other streams in the
kono
parents:
diff changeset
1558 context. */
kono
parents:
diff changeset
1559 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
kono
parents:
diff changeset
1560 return;
kono
parents:
diff changeset
1561
kono
parents:
diff changeset
1562 event_gc (true);
kono
parents:
diff changeset
1563
kono
parents:
diff changeset
1564 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1565
kono
parents:
diff changeset
1566 for (other_stream = nvthd->ptx_dev->active_streams;
kono
parents:
diff changeset
1567 other_stream != NULL;
kono
parents:
diff changeset
1568 other_stream = other_stream->next)
kono
parents:
diff changeset
1569 {
kono
parents:
diff changeset
1570 if (!other_stream->multithreaded
kono
parents:
diff changeset
1571 && !pthread_equal (other_stream->host_thread, self))
kono
parents:
diff changeset
1572 continue;
kono
parents:
diff changeset
1573
kono
parents:
diff changeset
1574 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1575
kono
parents:
diff changeset
1576 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1577
kono
parents:
diff changeset
1578 /* Record an event on the waited-for stream. */
kono
parents:
diff changeset
1579 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
kono
parents:
diff changeset
1580
kono
parents:
diff changeset
1581 event_add (PTX_EVT_SYNC, e, NULL, 0);
kono
parents:
diff changeset
1582
kono
parents:
diff changeset
1583 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
kono
parents:
diff changeset
1584 }
kono
parents:
diff changeset
1585
kono
parents:
diff changeset
1586 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1587 }
kono
parents:
diff changeset
1588
kono
parents:
diff changeset
1589 static void *
kono
parents:
diff changeset
1590 nvptx_get_current_cuda_device (void)
kono
parents:
diff changeset
1591 {
kono
parents:
diff changeset
1592 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1593
kono
parents:
diff changeset
1594 if (!nvthd || !nvthd->ptx_dev)
kono
parents:
diff changeset
1595 return NULL;
kono
parents:
diff changeset
1596
kono
parents:
diff changeset
1597 return &nvthd->ptx_dev->dev;
kono
parents:
diff changeset
1598 }
kono
parents:
diff changeset
1599
kono
parents:
diff changeset
1600 static void *
kono
parents:
diff changeset
1601 nvptx_get_current_cuda_context (void)
kono
parents:
diff changeset
1602 {
kono
parents:
diff changeset
1603 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1604
kono
parents:
diff changeset
1605 if (!nvthd || !nvthd->ptx_dev)
kono
parents:
diff changeset
1606 return NULL;
kono
parents:
diff changeset
1607
kono
parents:
diff changeset
1608 return nvthd->ptx_dev->ctx;
kono
parents:
diff changeset
1609 }
kono
parents:
diff changeset
1610
kono
parents:
diff changeset
1611 static void *
kono
parents:
diff changeset
1612 nvptx_get_cuda_stream (int async)
kono
parents:
diff changeset
1613 {
kono
parents:
diff changeset
1614 struct ptx_stream *s;
kono
parents:
diff changeset
1615 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1616
kono
parents:
diff changeset
1617 if (!nvthd || !nvthd->ptx_dev)
kono
parents:
diff changeset
1618 return NULL;
kono
parents:
diff changeset
1619
kono
parents:
diff changeset
1620 s = select_stream_for_async (async, pthread_self (), false, NULL);
kono
parents:
diff changeset
1621
kono
parents:
diff changeset
1622 return s ? s->stream : NULL;
kono
parents:
diff changeset
1623 }
kono
parents:
diff changeset
1624
kono
parents:
diff changeset
1625 static int
kono
parents:
diff changeset
1626 nvptx_set_cuda_stream (int async, void *stream)
kono
parents:
diff changeset
1627 {
kono
parents:
diff changeset
1628 struct ptx_stream *oldstream;
kono
parents:
diff changeset
1629 pthread_t self = pthread_self ();
kono
parents:
diff changeset
1630 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1631
kono
parents:
diff changeset
1632 if (async < 0)
kono
parents:
diff changeset
1633 GOMP_PLUGIN_fatal ("bad async %d", async);
kono
parents:
diff changeset
1634
kono
parents:
diff changeset
1635 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1636
kono
parents:
diff changeset
1637 /* We have a list of active streams and an array mapping async values to
kono
parents:
diff changeset
1638 entries of that list. We need to take "ownership" of the passed-in stream,
kono
parents:
diff changeset
1639 and add it to our list, removing the previous entry also (if there was one)
kono
parents:
diff changeset
1640 in order to prevent resource leaks. Note the potential for surprise
kono
parents:
diff changeset
1641 here: maybe we should keep track of passed-in streams and leave it up to
kono
parents:
diff changeset
1642 the user to tidy those up, but that doesn't work for stream handles
kono
parents:
diff changeset
1643 returned from acc_get_cuda_stream above... */
kono
parents:
diff changeset
1644
kono
parents:
diff changeset
1645 oldstream = select_stream_for_async (async, self, false, NULL);
kono
parents:
diff changeset
1646
kono
parents:
diff changeset
1647 if (oldstream)
kono
parents:
diff changeset
1648 {
kono
parents:
diff changeset
1649 if (nvthd->ptx_dev->active_streams == oldstream)
kono
parents:
diff changeset
1650 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
kono
parents:
diff changeset
1651 else
kono
parents:
diff changeset
1652 {
kono
parents:
diff changeset
1653 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
kono
parents:
diff changeset
1654 while (s->next != oldstream)
kono
parents:
diff changeset
1655 s = s->next;
kono
parents:
diff changeset
1656 s->next = s->next->next;
kono
parents:
diff changeset
1657 }
kono
parents:
diff changeset
1658
kono
parents:
diff changeset
1659 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
kono
parents:
diff changeset
1660
kono
parents:
diff changeset
1661 if (!map_fini (oldstream))
kono
parents:
diff changeset
1662 GOMP_PLUGIN_fatal ("error when freeing host memory");
kono
parents:
diff changeset
1663
kono
parents:
diff changeset
1664 free (oldstream);
kono
parents:
diff changeset
1665 }
kono
parents:
diff changeset
1666
kono
parents:
diff changeset
1667 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
kono
parents:
diff changeset
1668
kono
parents:
diff changeset
1669 (void) select_stream_for_async (async, self, true, (CUstream) stream);
kono
parents:
diff changeset
1670
kono
parents:
diff changeset
1671 return 1;
kono
parents:
diff changeset
1672 }
kono
parents:
diff changeset
1673
kono
parents:
diff changeset
1674 /* Plugin entry points. */
kono
parents:
diff changeset
1675
kono
parents:
diff changeset
/* Return the name of this offload plugin, "nvptx".  The string has
   static storage duration.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char plugin_name[] = "nvptx";

  return plugin_name;
}
kono
parents:
diff changeset
1681
kono
parents:
diff changeset
1682 unsigned int
kono
parents:
diff changeset
1683 GOMP_OFFLOAD_get_caps (void)
kono
parents:
diff changeset
1684 {
kono
parents:
diff changeset
1685 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
kono
parents:
diff changeset
1686 }
kono
parents:
diff changeset
1687
kono
parents:
diff changeset
1688 int
kono
parents:
diff changeset
1689 GOMP_OFFLOAD_get_type (void)
kono
parents:
diff changeset
1690 {
kono
parents:
diff changeset
1691 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
kono
parents:
diff changeset
1692 }
kono
parents:
diff changeset
1693
kono
parents:
diff changeset
/* Return the device count reported by nvptx_get_num_devices.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  const int count = nvptx_get_num_devices ();

  return count;
}
kono
parents:
diff changeset
1699
kono
parents:
diff changeset
1700 bool
kono
parents:
diff changeset
1701 GOMP_OFFLOAD_init_device (int n)
kono
parents:
diff changeset
1702 {
kono
parents:
diff changeset
1703 struct ptx_device *dev;
kono
parents:
diff changeset
1704
kono
parents:
diff changeset
1705 pthread_mutex_lock (&ptx_dev_lock);
kono
parents:
diff changeset
1706
kono
parents:
diff changeset
1707 if (!nvptx_init () || ptx_devices[n] != NULL)
kono
parents:
diff changeset
1708 {
kono
parents:
diff changeset
1709 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1710 return false;
kono
parents:
diff changeset
1711 }
kono
parents:
diff changeset
1712
kono
parents:
diff changeset
1713 dev = nvptx_open_device (n);
kono
parents:
diff changeset
1714 if (dev)
kono
parents:
diff changeset
1715 {
kono
parents:
diff changeset
1716 ptx_devices[n] = dev;
kono
parents:
diff changeset
1717 instantiated_devices++;
kono
parents:
diff changeset
1718 }
kono
parents:
diff changeset
1719
kono
parents:
diff changeset
1720 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1721
kono
parents:
diff changeset
1722 return dev != NULL;
kono
parents:
diff changeset
1723 }
kono
parents:
diff changeset
1724
kono
parents:
diff changeset
1725 bool
kono
parents:
diff changeset
1726 GOMP_OFFLOAD_fini_device (int n)
kono
parents:
diff changeset
1727 {
kono
parents:
diff changeset
1728 pthread_mutex_lock (&ptx_dev_lock);
kono
parents:
diff changeset
1729
kono
parents:
diff changeset
1730 if (ptx_devices[n] != NULL)
kono
parents:
diff changeset
1731 {
kono
parents:
diff changeset
1732 if (!nvptx_attach_host_thread_to_device (n)
kono
parents:
diff changeset
1733 || !nvptx_close_device (ptx_devices[n]))
kono
parents:
diff changeset
1734 {
kono
parents:
diff changeset
1735 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1736 return false;
kono
parents:
diff changeset
1737 }
kono
parents:
diff changeset
1738 ptx_devices[n] = NULL;
kono
parents:
diff changeset
1739 instantiated_devices--;
kono
parents:
diff changeset
1740 }
kono
parents:
diff changeset
1741
kono
parents:
diff changeset
1742 pthread_mutex_unlock (&ptx_dev_lock);
kono
parents:
diff changeset
1743 return true;
kono
parents:
diff changeset
1744 }
kono
parents:
diff changeset
1745
kono
parents:
diff changeset
1746 /* Return the libgomp version number we're compatible with. There is
kono
parents:
diff changeset
1747 no requirement for cross-version compatibility. */
kono
parents:
diff changeset
1748
kono
parents:
diff changeset
1749 unsigned
kono
parents:
diff changeset
1750 GOMP_OFFLOAD_version (void)
kono
parents:
diff changeset
1751 {
kono
parents:
diff changeset
1752 return GOMP_VERSION;
kono
parents:
diff changeset
1753 }
kono
parents:
diff changeset
1754
kono
parents:
diff changeset
1755 /* Initialize __nvptx_clocktick, if present in MODULE. */
kono
parents:
diff changeset
1756
kono
parents:
diff changeset
1757 static void
kono
parents:
diff changeset
1758 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
kono
parents:
diff changeset
1759 {
kono
parents:
diff changeset
1760 CUdeviceptr dptr;
kono
parents:
diff changeset
1761 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
kono
parents:
diff changeset
1762 module, "__nvptx_clocktick");
kono
parents:
diff changeset
1763 if (r == CUDA_ERROR_NOT_FOUND)
kono
parents:
diff changeset
1764 return;
kono
parents:
diff changeset
1765 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1766 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
kono
parents:
diff changeset
1767 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
kono
parents:
diff changeset
1768 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
kono
parents:
diff changeset
1769 sizeof (__nvptx_clocktick));
kono
parents:
diff changeset
1770 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
1771 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
kono
parents:
diff changeset
1772 }
kono
parents:
diff changeset
1773
kono
parents:
diff changeset
1774 /* Load the (partial) program described by TARGET_DATA to device
kono
parents:
diff changeset
1775 number ORD. Allocate and return TARGET_TABLE. */
kono
parents:
diff changeset
1776
kono
parents:
diff changeset
1777 int
kono
parents:
diff changeset
1778 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
kono
parents:
diff changeset
1779 struct addr_pair **target_table)
kono
parents:
diff changeset
1780 {
kono
parents:
diff changeset
1781 CUmodule module;
kono
parents:
diff changeset
1782 const char *const *var_names;
kono
parents:
diff changeset
1783 const struct targ_fn_launch *fn_descs;
kono
parents:
diff changeset
1784 unsigned int fn_entries, var_entries, i, j;
kono
parents:
diff changeset
1785 struct targ_fn_descriptor *targ_fns;
kono
parents:
diff changeset
1786 struct addr_pair *targ_tbl;
kono
parents:
diff changeset
1787 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
kono
parents:
diff changeset
1788 struct ptx_image_data *new_image;
kono
parents:
diff changeset
1789 struct ptx_device *dev;
kono
parents:
diff changeset
1790
kono
parents:
diff changeset
1791 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
kono
parents:
diff changeset
1792 {
kono
parents:
diff changeset
1793 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
kono
parents:
diff changeset
1794 " (expected %u, received %u)",
kono
parents:
diff changeset
1795 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
kono
parents:
diff changeset
1796 return -1;
kono
parents:
diff changeset
1797 }
kono
parents:
diff changeset
1798
kono
parents:
diff changeset
1799 if (!nvptx_attach_host_thread_to_device (ord)
kono
parents:
diff changeset
1800 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
kono
parents:
diff changeset
1801 return -1;
kono
parents:
diff changeset
1802
kono
parents:
diff changeset
1803 dev = ptx_devices[ord];
kono
parents:
diff changeset
1804
kono
parents:
diff changeset
1805 /* The mkoffload utility emits a struct of pointers/integers at the
kono
parents:
diff changeset
1806 start of each offload image. The array of kernel names and the
kono
parents:
diff changeset
1807 functions addresses form a one-to-one correspondence. */
kono
parents:
diff changeset
1808
kono
parents:
diff changeset
1809 var_entries = img_header->var_num;
kono
parents:
diff changeset
1810 var_names = img_header->var_names;
kono
parents:
diff changeset
1811 fn_entries = img_header->fn_num;
kono
parents:
diff changeset
1812 fn_descs = img_header->fn_descs;
kono
parents:
diff changeset
1813
kono
parents:
diff changeset
1814 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
kono
parents:
diff changeset
1815 * (fn_entries + var_entries));
kono
parents:
diff changeset
1816 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
kono
parents:
diff changeset
1817 * fn_entries);
kono
parents:
diff changeset
1818
kono
parents:
diff changeset
1819 *target_table = targ_tbl;
kono
parents:
diff changeset
1820
kono
parents:
diff changeset
1821 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
kono
parents:
diff changeset
1822 new_image->target_data = target_data;
kono
parents:
diff changeset
1823 new_image->module = module;
kono
parents:
diff changeset
1824 new_image->fns = targ_fns;
kono
parents:
diff changeset
1825
kono
parents:
diff changeset
1826 pthread_mutex_lock (&dev->image_lock);
kono
parents:
diff changeset
1827 new_image->next = dev->images;
kono
parents:
diff changeset
1828 dev->images = new_image;
kono
parents:
diff changeset
1829 pthread_mutex_unlock (&dev->image_lock);
kono
parents:
diff changeset
1830
kono
parents:
diff changeset
1831 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
kono
parents:
diff changeset
1832 {
kono
parents:
diff changeset
1833 CUfunction function;
kono
parents:
diff changeset
1834 int nregs, mthrs;
kono
parents:
diff changeset
1835
kono
parents:
diff changeset
1836 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
kono
parents:
diff changeset
1837 fn_descs[i].fn);
kono
parents:
diff changeset
1838 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
kono
parents:
diff changeset
1839 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
kono
parents:
diff changeset
1840 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
kono
parents:
diff changeset
1841 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
kono
parents:
diff changeset
1842
kono
parents:
diff changeset
1843 targ_fns->fn = function;
kono
parents:
diff changeset
1844 targ_fns->launch = &fn_descs[i];
kono
parents:
diff changeset
1845 targ_fns->regs_per_thread = nregs;
kono
parents:
diff changeset
1846 targ_fns->max_threads_per_block = mthrs;
kono
parents:
diff changeset
1847
kono
parents:
diff changeset
1848 targ_tbl->start = (uintptr_t) targ_fns;
kono
parents:
diff changeset
1849 targ_tbl->end = targ_tbl->start + 1;
kono
parents:
diff changeset
1850 }
kono
parents:
diff changeset
1851
kono
parents:
diff changeset
1852 for (j = 0; j < var_entries; j++, targ_tbl++)
kono
parents:
diff changeset
1853 {
kono
parents:
diff changeset
1854 CUdeviceptr var;
kono
parents:
diff changeset
1855 size_t bytes;
kono
parents:
diff changeset
1856
kono
parents:
diff changeset
1857 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
kono
parents:
diff changeset
1858 &var, &bytes, module, var_names[j]);
kono
parents:
diff changeset
1859
kono
parents:
diff changeset
1860 targ_tbl->start = (uintptr_t) var;
kono
parents:
diff changeset
1861 targ_tbl->end = targ_tbl->start + bytes;
kono
parents:
diff changeset
1862 }
kono
parents:
diff changeset
1863
kono
parents:
diff changeset
1864 nvptx_set_clocktick (module, dev);
kono
parents:
diff changeset
1865
kono
parents:
diff changeset
1866 return fn_entries + var_entries;
kono
parents:
diff changeset
1867 }
kono
parents:
diff changeset
1868
kono
parents:
diff changeset
1869 /* Unload the program described by TARGET_DATA. DEV_DATA is the
kono
parents:
diff changeset
1870 function descriptors allocated by G_O_load_image. */
kono
parents:
diff changeset
1871
kono
parents:
diff changeset
1872 bool
kono
parents:
diff changeset
1873 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
kono
parents:
diff changeset
1874 {
kono
parents:
diff changeset
1875 struct ptx_image_data *image, **prev_p;
kono
parents:
diff changeset
1876 struct ptx_device *dev = ptx_devices[ord];
kono
parents:
diff changeset
1877
kono
parents:
diff changeset
1878 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
kono
parents:
diff changeset
1879 {
kono
parents:
diff changeset
1880 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
kono
parents:
diff changeset
1881 " (expected %u, received %u)",
kono
parents:
diff changeset
1882 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
kono
parents:
diff changeset
1883 return false;
kono
parents:
diff changeset
1884 }
kono
parents:
diff changeset
1885
kono
parents:
diff changeset
1886 bool ret = true;
kono
parents:
diff changeset
1887 pthread_mutex_lock (&dev->image_lock);
kono
parents:
diff changeset
1888 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
kono
parents:
diff changeset
1889 if (image->target_data == target_data)
kono
parents:
diff changeset
1890 {
kono
parents:
diff changeset
1891 *prev_p = image->next;
kono
parents:
diff changeset
1892 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
kono
parents:
diff changeset
1893 ret = false;
kono
parents:
diff changeset
1894 free (image->fns);
kono
parents:
diff changeset
1895 free (image);
kono
parents:
diff changeset
1896 break;
kono
parents:
diff changeset
1897 }
kono
parents:
diff changeset
1898 pthread_mutex_unlock (&dev->image_lock);
kono
parents:
diff changeset
1899 return ret;
kono
parents:
diff changeset
1900 }
kono
parents:
diff changeset
1901
kono
parents:
diff changeset
/* Attach the host thread to device number ORD and return the result of
   nvptx_alloc (SIZE).  Return NULL when the attach fails.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  return nvptx_attach_host_thread_to_device (ord) ? nvptx_alloc (size) : NULL;
}
kono
parents:
diff changeset
1909
kono
parents:
diff changeset
/* Attach the host thread to device number ORD, then pass PTR to
   nvptx_free.  Return true only when both steps report success;
   nvptx_free is not called when the attach fails.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_free (ptr) != 0;
}
kono
parents:
diff changeset
1916
kono
parents:
diff changeset
/* Attach the host thread to device number ORD, then call
   nvptx_dev2host (DST, SRC, N).  Return true only when both steps report
   success; the copy is not attempted when the attach fails.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_dev2host (dst, src, n) != 0;
}
kono
parents:
diff changeset
1923
kono
parents:
diff changeset
/* Attach the host thread to device number ORD, then call
   nvptx_host2dev (DST, SRC, N).  Return true only when both steps report
   success; the copy is not attempted when the attach fails.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_host2dev (dst, src, n) != 0;
}
kono
parents:
diff changeset
1930
kono
parents:
diff changeset
1931 bool
kono
parents:
diff changeset
1932 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
kono
parents:
diff changeset
1933 {
kono
parents:
diff changeset
1934 struct ptx_device *ptx_dev = ptx_devices[ord];
kono
parents:
diff changeset
1935 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
kono
parents:
diff changeset
1936 ptx_dev->null_stream->stream);
kono
parents:
diff changeset
1937 return true;
kono
parents:
diff changeset
1938 }
kono
parents:
diff changeset
1939
kono
parents:
diff changeset
/* Globally visible function pointer taking (int n, void *fn_ptr,
   void *vars), initialized to NULL.
   NOTE(review): nothing in this part of the file assigns or calls it;
   confirm its users before relying on or removing it.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
kono
parents:
diff changeset
1941
kono
parents:
diff changeset
/* OpenACC launch entry point: forward every argument unchanged to
   nvptx_exec.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs,
                           int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
kono
parents:
diff changeset
1949
kono
parents:
diff changeset
1950 void
kono
parents:
diff changeset
1951 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
kono
parents:
diff changeset
1952 {
kono
parents:
diff changeset
1953 struct nvptx_thread *nvthd = nvptx_thread ();
kono
parents:
diff changeset
1954 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
kono
parents:
diff changeset
1955
kono
parents:
diff changeset
1956 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
kono
parents:
diff changeset
1957 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
kono
parents:
diff changeset
1958 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
kono
parents:
diff changeset
1959 }
kono
parents:
diff changeset
1960
kono
parents:
diff changeset
/* Return the result of nvptx_async_test for ASYNC.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  const int result = nvptx_async_test (async);

  return result;
}
kono
parents:
diff changeset
1966
kono
parents:
diff changeset
/* Return the result of nvptx_async_test_all.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  const int result = nvptx_async_test_all ();

  return result;
}
kono
parents:
diff changeset
1972
kono
parents:
diff changeset
/* Plugin entry point: forward ASYNC to nvptx_wait.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
kono
parents:
diff changeset
1978
kono
parents:
diff changeset
/* Plugin entry point: forward ASYNC1 and ASYNC2, in that order, to
   nvptx_wait_async.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
kono
parents:
diff changeset
1984
kono
parents:
diff changeset
/* Plugin entry point: call nvptx_wait_all.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
kono
parents:
diff changeset
1990
kono
parents:
diff changeset
/* Plugin entry point: forward ASYNC to nvptx_wait_all_async.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
kono
parents:
diff changeset
1996
kono
parents:
diff changeset
/* Plugin entry point: forward ASYNC to nvptx_set_async.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
kono
parents:
diff changeset
2002
kono
parents:
diff changeset
2003 void *
kono
parents:
diff changeset
2004 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
kono
parents:
diff changeset
2005 {
kono
parents:
diff changeset
2006 struct ptx_device *ptx_dev;
kono
parents:
diff changeset
2007 struct nvptx_thread *nvthd
kono
parents:
diff changeset
2008 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
kono
parents:
diff changeset
2009 CUcontext thd_ctx;
kono
parents:
diff changeset
2010
kono
parents:
diff changeset
2011 ptx_dev = ptx_devices[ord];
kono
parents:
diff changeset
2012
kono
parents:
diff changeset
2013 assert (ptx_dev);
kono
parents:
diff changeset
2014
kono
parents:
diff changeset
2015 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
kono
parents:
diff changeset
2016
kono
parents:
diff changeset
2017 assert (ptx_dev->ctx);
kono
parents:
diff changeset
2018
kono
parents:
diff changeset
2019 if (!thd_ctx)
kono
parents:
diff changeset
2020 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
kono
parents:
diff changeset
2021
kono
parents:
diff changeset
2022 nvthd->current_stream = ptx_dev->null_stream;
kono
parents:
diff changeset
2023 nvthd->ptx_dev = ptx_dev;
kono
parents:
diff changeset
2024
kono
parents:
diff changeset
2025 return (void *) nvthd;
kono
parents:
diff changeset
2026 }
kono
parents:
diff changeset
2027
kono
parents:
diff changeset
/* Release the per-thread plugin data DATA by passing it to free.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
kono
parents:
diff changeset
2033
kono
parents:
diff changeset
/* Return the result of nvptx_get_current_cuda_device.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  void *device = nvptx_get_current_cuda_device ();

  return device;
}
kono
parents:
diff changeset
2039
kono
parents:
diff changeset
/* Return the result of nvptx_get_current_cuda_context.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  void *context = nvptx_get_current_cuda_context ();

  return context;
}
kono
parents:
diff changeset
2045
kono
parents:
diff changeset
/* Return the result of nvptx_get_cuda_stream for ASYNC.
   NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  void *cuda_stream = nvptx_get_cuda_stream (async);

  return cuda_stream;
}
kono
parents:
diff changeset
2053
kono
parents:
diff changeset
/* Pass ASYNC and STREAM to nvptx_set_cuda_stream and return its result.
   NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  const int result = nvptx_set_cuda_stream (async, stream);

  return result;
}
kono
parents:
diff changeset
2061
kono
parents:
diff changeset
2062 /* Adjust launch dimensions: pick good values for number of blocks and warps
kono
parents:
diff changeset
2063 and ensure that number of warps does not exceed CUDA limits as well as GCC's
kono
parents:
diff changeset
2064 own limits. */
kono
parents:
diff changeset
2065
kono
parents:
diff changeset
2066 static void
kono
parents:
diff changeset
2067 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
kono
parents:
diff changeset
2068 struct ptx_device *ptx_dev,
kono
parents:
diff changeset
2069 int *teams_p, int *threads_p)
kono
parents:
diff changeset
2070 {
kono
parents:
diff changeset
2071 int max_warps_block = fn->max_threads_per_block / 32;
kono
parents:
diff changeset
2072 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
kono
parents:
diff changeset
2073 and libgcc, which matches documented limit of all GPUs as of 2015. */
kono
parents:
diff changeset
2074 if (max_warps_block > 32)
kono
parents:
diff changeset
2075 max_warps_block = 32;
kono
parents:
diff changeset
2076 if (*threads_p <= 0)
kono
parents:
diff changeset
2077 *threads_p = 8;
kono
parents:
diff changeset
2078 if (*threads_p > max_warps_block)
kono
parents:
diff changeset
2079 *threads_p = max_warps_block;
kono
parents:
diff changeset
2080
kono
parents:
diff changeset
2081 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
kono
parents:
diff changeset
2082 /* This is an estimate of how many blocks the device can host simultaneously.
kono
parents:
diff changeset
2083 Actual limit, which may be lower, can be queried with "occupancy control"
kono
parents:
diff changeset
2084 driver interface (since CUDA 6.0). */
kono
parents:
diff changeset
2085 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
kono
parents:
diff changeset
2086 if (*teams_p <= 0 || *teams_p > max_blocks)
kono
parents:
diff changeset
2087 *teams_p = max_blocks;
kono
parents:
diff changeset
2088 }
kono
parents:
diff changeset
2089
kono
parents:
diff changeset
/* Return the size in bytes of the per-warp stacks (see gcc -msoft-stack)
   to use for OpenMP target regions.  Currently a fixed 128 KiB.  */

static size_t
nvptx_stacks_size ()
{
  const size_t kib = 1024;

  return 128 * kib;
}
kono
parents:
diff changeset
2098
kono
parents:
diff changeset
2099 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
kono
parents:
diff changeset
2100
kono
parents:
diff changeset
2101 static void *
kono
parents:
diff changeset
2102 nvptx_stacks_alloc (size_t size, int num)
kono
parents:
diff changeset
2103 {
kono
parents:
diff changeset
2104 CUdeviceptr stacks;
kono
parents:
diff changeset
2105 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
kono
parents:
diff changeset
2106 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2107 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
kono
parents:
diff changeset
2108 return (void *) stacks;
kono
parents:
diff changeset
2109 }
kono
parents:
diff changeset
2110
kono
parents:
diff changeset
2111 /* Release storage previously allocated by nvptx_stacks_alloc. */
kono
parents:
diff changeset
2112
kono
parents:
diff changeset
2113 static void
kono
parents:
diff changeset
2114 nvptx_stacks_free (void *p, int num)
kono
parents:
diff changeset
2115 {
kono
parents:
diff changeset
2116 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
kono
parents:
diff changeset
2117 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2118 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
kono
parents:
diff changeset
2119 }
kono
parents:
diff changeset
2120
kono
parents:
diff changeset
2121 void
kono
parents:
diff changeset
2122 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
kono
parents:
diff changeset
2123 {
kono
parents:
diff changeset
2124 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
kono
parents:
diff changeset
2125 CUresult r;
kono
parents:
diff changeset
2126 struct ptx_device *ptx_dev = ptx_devices[ord];
kono
parents:
diff changeset
2127 const char *maybe_abort_msg = "(perhaps abort was called)";
kono
parents:
diff changeset
2128 int teams = 0, threads = 0;
kono
parents:
diff changeset
2129
kono
parents:
diff changeset
2130 if (!args)
kono
parents:
diff changeset
2131 GOMP_PLUGIN_fatal ("No target arguments provided");
kono
parents:
diff changeset
2132 while (*args)
kono
parents:
diff changeset
2133 {
kono
parents:
diff changeset
2134 intptr_t id = (intptr_t) *args++, val;
kono
parents:
diff changeset
2135 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
kono
parents:
diff changeset
2136 val = (intptr_t) *args++;
kono
parents:
diff changeset
2137 else
kono
parents:
diff changeset
2138 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
kono
parents:
diff changeset
2139 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
kono
parents:
diff changeset
2140 continue;
kono
parents:
diff changeset
2141 val = val > INT_MAX ? INT_MAX : val;
kono
parents:
diff changeset
2142 id &= GOMP_TARGET_ARG_ID_MASK;
kono
parents:
diff changeset
2143 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
kono
parents:
diff changeset
2144 teams = val;
kono
parents:
diff changeset
2145 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
kono
parents:
diff changeset
2146 threads = val;
kono
parents:
diff changeset
2147 }
kono
parents:
diff changeset
2148 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
kono
parents:
diff changeset
2149
kono
parents:
diff changeset
2150 size_t stack_size = nvptx_stacks_size ();
kono
parents:
diff changeset
2151 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
kono
parents:
diff changeset
2152 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
kono
parents:
diff changeset
2153 size_t fn_args_size = sizeof fn_args;
kono
parents:
diff changeset
2154 void *config[] = {
kono
parents:
diff changeset
2155 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
kono
parents:
diff changeset
2156 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
kono
parents:
diff changeset
2157 CU_LAUNCH_PARAM_END
kono
parents:
diff changeset
2158 };
kono
parents:
diff changeset
2159 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
kono
parents:
diff changeset
2160 32, threads, 1, 0, ptx_dev->null_stream->stream,
kono
parents:
diff changeset
2161 NULL, config);
kono
parents:
diff changeset
2162 if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2163 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
kono
parents:
diff changeset
2164
kono
parents:
diff changeset
2165 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
kono
parents:
diff changeset
2166 if (r == CUDA_ERROR_LAUNCH_FAILED)
kono
parents:
diff changeset
2167 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
kono
parents:
diff changeset
2168 maybe_abort_msg);
kono
parents:
diff changeset
2169 else if (r != CUDA_SUCCESS)
kono
parents:
diff changeset
2170 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
kono
parents:
diff changeset
2171 nvptx_stacks_free (stacks, teams * threads);
kono
parents:
diff changeset
2172 }
kono
parents:
diff changeset
2173
kono
parents:
diff changeset
/* Asynchronous counterpart of GOMP_OFFLOAD_run.  Not implemented by this
   plugin: every call reports an error through GOMP_PLUGIN_fatal and none
   of the arguments are examined.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
                        void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}