/*
    Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
|
|
29
|
|
30
|
|
/*! \file
    \brief The parts of the runtime library used only on the host
*/
|
|
34
|
|
35 #ifndef OFFLOAD_HOST_H_INCLUDED
|
|
36 #define OFFLOAD_HOST_H_INCLUDED
|
|
37
|
|
38 #ifndef TARGET_WINNT
|
|
39 #include <unistd.h>
|
|
40 #endif // TARGET_WINNT
|
|
41 #include "offload_common.h"
|
|
42 #include "offload_util.h"
|
|
43 #include "offload_engine.h"
|
|
44 #include "offload_env.h"
|
|
45 #include "offload_orsl.h"
|
|
46 #include "coi/coi_client.h"
|
|
47
|
|
48 // MIC engines.
|
|
49 DLL_LOCAL extern Engine* mic_engines;
|
|
50 DLL_LOCAL extern uint32_t mic_engines_total;
|
|
51
|
|
52 // DMA channel count used by COI and set via
|
|
53 // OFFLOAD_DMA_CHANNEL_COUNT environment variable
|
|
54 DLL_LOCAL extern uint32_t mic_dma_channel_count;
|
|
55
|
|
/*! \brief Packed target image.

    The target image is packed as follows:
      1. 8 bytes containing the size of the target binary;
      2. a null-terminated string which is the binary name;
      3. \<size\> number of bytes that are the contents of the image.

    The address of symbol __offload_target_image is the address of this
    structure.

    NOTE(review): item 1 describes the size as that of the target binary,
    while the field comment below says it covers the name and the contents;
    confirm which one the packer actually writes.
*/
struct Image {
    //! Size in bytes of the target binary name and contents
    int64_t size;

    //! The name and contents of the target image
    char data[];
};
|
|
66
|
|
67 // The offload descriptor.
|
|
68 class OffloadDescriptor
|
|
69 {
|
|
70 public:
|
|
71 enum OmpAsyncLastEventType {
|
|
72 c_last_not, // not last event
|
|
73 c_last_write, // the last event that is write
|
|
74 c_last_read, // the last event that is read
|
|
75 c_last_runfunc // the last event that is runfunction
|
|
76 };
|
|
77
|
|
78 OffloadDescriptor(
|
|
79 int index,
|
|
80 _Offload_status *status,
|
|
81 bool is_mandatory,
|
|
82 bool is_openmp,
|
|
83 OffloadHostTimerData * timer_data
|
|
84 ) :
|
|
85 m_device(mic_engines[index == -1 ? 0 : index % mic_engines_total]),
|
|
86 m_is_mandatory(is_mandatory),
|
|
87 m_is_openmp(is_openmp),
|
|
88 m_inout_buf(0),
|
|
89 m_func_desc(0),
|
|
90 m_func_desc_size(0),
|
|
91 m_num_in_dependencies(0),
|
|
92 m_p_in_dependencies(0),
|
|
93 m_in_deps(0),
|
|
94 m_in_deps_total(0),
|
|
95 m_in_deps_allocated(0),
|
|
96 m_out_deps(0),
|
|
97 m_out_deps_total(0),
|
|
98 m_out_deps_allocated(0),
|
|
99 m_vars(0),
|
|
100 m_vars_extra(0),
|
|
101 m_status(status),
|
|
102 m_timer_data(timer_data),
|
|
103 m_out_with_preallocated(false),
|
|
104 m_preallocated_alloc(false),
|
|
105 m_traceback_called(false),
|
|
106 m_stream(-1),
|
|
107 m_signal(0),
|
|
108 m_has_signal(0),
|
|
109 m_omp_async_last_event_type(c_last_not)
|
|
110 {
|
|
111 m_wait_all_devices = index == -1;
|
|
112 }
|
|
113
|
|
114 ~OffloadDescriptor()
|
|
115 {
|
|
116 if (m_in_deps != 0) {
|
|
117 free(m_in_deps);
|
|
118 }
|
|
119 if (m_out_deps != 0) {
|
|
120 free(m_out_deps);
|
|
121 }
|
|
122 if (m_func_desc != 0) {
|
|
123 free(m_func_desc);
|
|
124 }
|
|
125 if (m_vars != 0) {
|
|
126 free(m_vars);
|
|
127 free(m_vars_extra);
|
|
128 }
|
|
129 }
|
|
130
|
|
131 bool offload(const char *name, bool is_empty,
|
|
132 VarDesc *vars, VarDesc2 *vars2, int vars_total,
|
|
133 const void **waits, int num_waits, const void **signal,
|
|
134 int entry_id, const void *stack_addr,
|
|
135 OffloadFlags offload_flags);
|
|
136
|
|
137 bool offload_finish(bool is_traceback);
|
|
138
|
|
139 bool is_signaled();
|
|
140
|
|
141 OffloadHostTimerData* get_timer_data() const {
|
|
142 return m_timer_data;
|
|
143 }
|
|
144
|
|
145 void set_stream(_Offload_stream stream) {
|
|
146 m_stream = stream;
|
|
147 }
|
|
148
|
|
149 _Offload_stream get_stream() {
|
|
150 return(m_stream);
|
|
151 }
|
|
152
|
|
153 Engine& get_device() {
|
|
154 return m_device;
|
|
155 }
|
|
156
|
|
157 void* get_signal() {
|
|
158 return(m_signal);
|
|
159 }
|
|
160
|
|
161 void set_signal(const void* signal) {
|
|
162 m_has_signal = 1;
|
|
163 m_signal = const_cast<void*>(signal);
|
|
164 }
|
|
165
|
|
166 void cleanup();
|
|
167
|
|
168 uint32_t m_event_count;
|
|
169 bool m_has_signal;
|
|
170
|
|
171 private:
|
|
172 bool offload_wrap(const char *name, bool is_empty,
|
|
173 VarDesc *vars, VarDesc2 *vars2, int vars_total,
|
|
174 const void **waits, int num_waits, const void **signal,
|
|
175 int entry_id, const void *stack_addr,
|
|
176 OffloadFlags offload_flags);
|
|
177 bool wait_dependencies(const void **waits, int num_waits,
|
|
178 _Offload_stream stream);
|
|
179 bool setup_descriptors(VarDesc *vars, VarDesc2 *vars2, int vars_total,
|
|
180 int entry_id, const void *stack_addr);
|
|
181 bool setup_misc_data(const char *name);
|
|
182 bool send_pointer_data(bool is_async, void* info);
|
|
183 bool send_noncontiguous_pointer_data(
|
|
184 int i,
|
|
185 PtrData* src_buf,
|
|
186 PtrData* dst_buf,
|
|
187 COIEVENT *event,
|
|
188 uint64_t &sent_data,
|
|
189 uint32_t in_deps_amount,
|
|
190 COIEVENT *in_deps
|
|
191 );
|
|
192 bool receive_noncontiguous_pointer_data(
|
|
193 int i,
|
|
194 COIBUFFER dst_buf,
|
|
195 COIEVENT *event,
|
|
196 uint64_t &received_data,
|
|
197 uint32_t in_deps_amount,
|
|
198 COIEVENT *in_deps
|
|
199 );
|
|
200
|
|
201 bool gather_copyin_data();
|
|
202
|
|
203 bool compute(void *);
|
|
204
|
|
205 bool receive_pointer_data(bool is_async, bool first_run, void * info);
|
|
206 bool scatter_copyout_data();
|
|
207
|
|
208 bool find_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
|
|
209 int64_t length, bool is_targptr,
|
|
210 bool error_does_not_exist = true);
|
|
211
|
|
212 void find_device_ptr( int64_t* &device_ptr,
|
|
213 void *host_ptr);
|
|
214
|
|
215 bool alloc_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
|
|
216 int64_t length, int64_t alloc_disp, int align,
|
|
217 bool is_targptr, bool is_prealloc, bool pin);
|
|
218 bool create_preallocated_buffer(PtrData* ptr_data, void *base);
|
|
219 bool init_static_ptr_data(PtrData *ptr_data);
|
|
220 bool init_mic_address(PtrData *ptr_data);
|
|
221 bool offload_stack_memory_manager(
|
|
222 const void * stack_begin,
|
|
223 int routine_id,
|
|
224 int buf_size,
|
|
225 int align,
|
|
226 bool thread_specific_function_locals,
|
|
227 bool *is_new);
|
|
228 char *get_this_threads_cpu_stack_addr(
|
|
229 const void * stack_begin,
|
|
230 int routine_id,
|
|
231 bool thread_specific_function_locals);
|
|
232 PtrData *get_this_threads_mic_stack_addr(
|
|
233 const void * stack_begin,
|
|
234 int routine_id,
|
|
235 bool thread_specific_function_locals);
|
|
236 bool nullify_target_stack(COIBUFFER targ_buf, uint64_t size);
|
|
237
|
|
238 bool gen_var_descs_for_pointer_array(int i);
|
|
239
|
|
240 void get_stream_in_dependencies(uint32_t &in_deps_amount,
|
|
241 COIEVENT* &in_deps);
|
|
242
|
|
243 void report_coi_error(error_types msg, COIRESULT res);
|
|
244 _Offload_result translate_coi_error(COIRESULT res) const;
|
|
245
|
|
246 void setup_omp_async_info();
|
|
247
|
|
248 void setup_use_device_ptr(int i);
|
|
249
|
|
250 void register_event_call_back(void (*)(
|
|
251 COIEVENT,
|
|
252 const COIRESULT,
|
|
253 const void*),
|
|
254 const COIEVENT *event,
|
|
255 const void *info);
|
|
256
|
|
257 void register_omp_event_call_back(const COIEVENT *event, const void *info);
|
|
258
|
|
259 private:
|
|
260 typedef std::list<COIBUFFER> BufferList;
|
|
261
|
|
262 // extra data associated with each variable descriptor
|
|
263 struct VarExtra {
|
|
264 PtrData* src_data;
|
|
265 PtrData* dst_data;
|
|
266 AutoData* auto_data;
|
|
267 int64_t cpu_disp;
|
|
268 int64_t cpu_offset;
|
|
269 void *alloc;
|
|
270 union {
|
|
271 CeanReadRanges *read_rng_src;
|
|
272 NonContigDesc *noncont_desc;
|
|
273 };
|
|
274 CeanReadRanges *read_rng_dst;
|
|
275 int64_t ptr_arr_offset;
|
|
276 bool is_arr_ptr_el;
|
|
277 OmpAsyncLastEventType omp_last_event_type;
|
|
278 int64_t pointer_offset;
|
|
279 uint16_t type_src;
|
|
280 uint16_t type_dst;
|
|
281 };
|
|
282
|
|
283 template<typename T> class ReadArrElements {
|
|
284 public:
|
|
285 ReadArrElements():
|
|
286 ranges(NULL),
|
|
287 el_size(sizeof(T)),
|
|
288 offset(0),
|
|
289 count(0),
|
|
290 is_empty(true),
|
|
291 base(NULL)
|
|
292 {}
|
|
293
|
|
294 bool read_next(bool flag)
|
|
295 {
|
|
296 if (flag != 0) {
|
|
297 if (is_empty) {
|
|
298 if (ranges) {
|
|
299 if (!get_next_range(ranges, &offset)) {
|
|
300 // ranges are over
|
|
301 return false;
|
|
302 }
|
|
303 }
|
|
304 // all contiguous elements are over
|
|
305 else if (count != 0) {
|
|
306 return false;
|
|
307 }
|
|
308
|
|
309 length_cur = size;
|
|
310 }
|
|
311 else {
|
|
312 offset += el_size;
|
|
313 }
|
|
314 val = (T)get_el_value(base, offset, el_size);
|
|
315 length_cur -= el_size;
|
|
316 count++;
|
|
317 is_empty = length_cur == 0;
|
|
318 }
|
|
319 return true;
|
|
320 }
|
|
321 public:
|
|
322 CeanReadRanges * ranges;
|
|
323 T val;
|
|
324 int el_size;
|
|
325 int64_t size,
|
|
326 offset,
|
|
327 length_cur;
|
|
328 bool is_empty;
|
|
329 int count;
|
|
330 char *base;
|
|
331 };
|
|
332
|
|
333 // ptr_data for persistent auto objects
|
|
334 PtrData* m_stack_ptr_data;
|
|
335 PtrDataList m_destroy_stack;
|
|
336
|
|
337 // Engine
|
|
338 Engine& m_device;
|
|
339
|
|
340 // true for offload_wait target(mic) stream(0)
|
|
341 bool m_wait_all_devices;
|
|
342
|
|
343 // if true offload is mandatory
|
|
344 bool m_is_mandatory;
|
|
345
|
|
346 // if true offload has openmp origin
|
|
347 const bool m_is_openmp;
|
|
348
|
|
349 // The Marshaller for the inputs of the offloaded region.
|
|
350 Marshaller m_in;
|
|
351
|
|
352 // The Marshaller for the outputs of the offloaded region.
|
|
353 Marshaller m_out;
|
|
354
|
|
355 // List of buffers that are passed to dispatch call
|
|
356 BufferList m_compute_buffers;
|
|
357
|
|
358 // List of buffers that need to be destroyed at the end of offload
|
|
359 BufferList m_destroy_buffers;
|
|
360
|
|
361 // Variable descriptors
|
|
362 VarDesc* m_vars;
|
|
363 VarExtra* m_vars_extra;
|
|
364 int m_vars_total;
|
|
365
|
|
366 // Pointer to a user-specified status variable
|
|
367 _Offload_status *m_status;
|
|
368
|
|
369 // Function descriptor
|
|
370 FunctionDescriptor* m_func_desc;
|
|
371 uint32_t m_func_desc_size;
|
|
372
|
|
373 // Buffer for transferring copyin/copyout data
|
|
374 COIBUFFER m_inout_buf;
|
|
375
|
|
376
|
|
377 // Dependencies
|
|
378 COIEVENT *m_in_deps;
|
|
379 uint32_t m_in_deps_total;
|
|
380 uint32_t m_in_deps_allocated;
|
|
381 COIEVENT *m_out_deps;
|
|
382 uint32_t m_out_deps_total;
|
|
383 uint32_t m_out_deps_allocated;
|
|
384
|
|
385 // 2 variables defines input dependencies for current COI API.
|
|
386 // The calls to routines as BufferWrite/PipelineRunFunction/BufferRead
|
|
387 // is supposed to have input dependencies.
|
|
388 // 2 variables below defines the number and vector of dependencies
|
|
389 // in every current moment of offload.
|
|
390 // So any phase of offload can use its values as input dependencies
|
|
391 // for the COI API that the phase calls.
|
|
392 // It means that all phases (of Write, RunFunction,Read) must keep
|
|
393 // the variables correct to be used by following phase.
|
|
394 // If some consequent offloads are connected (i.e. by the same stream)
|
|
395 // the final 2 variables of the offload is used as initial inputs
|
|
396 // for the next offload.
|
|
397 uint32_t m_num_in_dependencies;
|
|
398 COIEVENT *m_p_in_dependencies;
|
|
399
|
|
400 // Stream
|
|
401 _Offload_stream m_stream;
|
|
402
|
|
403 // Signal
|
|
404 void* m_signal;
|
|
405
|
|
406 // Timer data
|
|
407 OffloadHostTimerData *m_timer_data;
|
|
408
|
|
409 // copyin/copyout data length
|
|
410 uint64_t m_in_datalen;
|
|
411 uint64_t m_out_datalen;
|
|
412
|
|
413 // a boolean value calculated in setup_descriptors. If true we need to do
|
|
414 // a run function on the target. Otherwise it may be optimized away.
|
|
415 bool m_need_runfunction;
|
|
416
|
|
417 // initialized value of m_need_runfunction;
|
|
418 // is used to recognize offload_transfer
|
|
419 bool m_initial_need_runfunction;
|
|
420
|
|
421 // a Boolean value set to true when OUT clauses with preallocated targetptr
|
|
422 // is encountered to indicate that call receive_pointer_data needs to be
|
|
423 // invoked again after call to scatter_copyout_data.
|
|
424 bool m_out_with_preallocated;
|
|
425
|
|
426 // a Boolean value set to true if an alloc_if(1) is used with preallocated
|
|
427 // targetptr to indicate the need to scatter_copyout_data even for
|
|
428 // async offload
|
|
429 bool m_preallocated_alloc;
|
|
430
|
|
431 // a Boolean value set to true if traceback routine is called
|
|
432 bool m_traceback_called;
|
|
433
|
|
434 OmpAsyncLastEventType m_omp_async_last_event_type;
|
|
435 };
|
|
436
|
|
// Initialization types for MIC: which devices are initialized, and when.
enum OffloadInitType {
    // all devices before entering main
    c_init_on_start,

    // single device before starting the first offload
    c_init_on_offload,

    // all devices before starting the first offload
    c_init_on_offload_all
};
|
|
443
|
|
extern "C" {

// Determines if MIC code is an executable or a shared library
bool __offload_target_image_is_executable(const void *target_image);

// Initializes library and registers specified offload image.
bool __offload_register_image(const void *image);
void __offload_unregister_image(const void *image);

// Registers asynchronous task completion callback
void __offload_register_task_callback(void (*cb)(void *));

} // extern "C"
|
|
453
|
|
454 // Initializes offload runtime library.
|
|
455 DLL_LOCAL extern int __offload_init_library(void);
|
|
456
|
|
457 // thread data for associating pipelines with threads
|
|
458 DLL_LOCAL extern pthread_key_t mic_thread_key;
|
|
459
|
|
460 // location of offload_main executable
|
|
461 // To be used if the main application has no offload and is not built
|
|
462 // with -offload but dynamic library linked in has offload pragma
|
|
463 DLL_LOCAL extern char* mic_device_main;
|
|
464
|
|
465 // Environment variables for devices
|
|
466 DLL_LOCAL extern MicEnvVar mic_env_vars;
|
|
467
|
|
468 // CPU frequency
|
|
469 DLL_LOCAL extern uint64_t cpu_frequency;
|
|
470
|
|
471 // LD_LIBRARY_PATH for KNC libraries
|
|
472 DLL_LOCAL extern char* knc_library_path;
|
|
473
|
|
474 // LD_LIBRARY_PATH for KNL libraries
|
|
475 DLL_LOCAL extern char* knl_library_path;
|
|
476
|
|
477 // stack size for target
|
|
478 DLL_LOCAL extern uint32_t mic_stack_size;
|
|
479
|
|
480 // Preallocated memory size for buffers on MIC
|
|
481 DLL_LOCAL extern uint64_t mic_buffer_size;
|
|
482
|
|
483 // Preallocated 4K page memory size for buffers on MIC
|
|
484 DLL_LOCAL extern uint64_t mic_4k_buffer_size;
|
|
485
|
|
486 // Preallocated 2M page memory size for buffers on MIC
|
|
487 DLL_LOCAL extern uint64_t mic_2m_buffer_size;
|
|
488
|
|
489 // Setting controlling inout proxy
|
|
490 DLL_LOCAL extern bool mic_proxy_io;
|
|
491 DLL_LOCAL extern char* mic_proxy_fs_root;
|
|
492
|
|
493 // Threshold for creating buffers with large pages
|
|
494 DLL_LOCAL extern uint64_t __offload_use_2mb_buffers;
|
|
495
|
|
496 // offload initialization type
|
|
497 DLL_LOCAL extern OffloadInitType __offload_init_type;
|
|
498
|
|
499 // Device number to offload to when device is not explicitly specified.
|
|
500 DLL_LOCAL extern int __omp_device_num;
|
|
501
|
|
502 // target executable
|
|
503 DLL_LOCAL extern TargetImage* __target_exe;
|
|
504
|
|
505 // is true if last loaded image is dll
|
|
506 DLL_LOCAL extern bool __current_image_is_dll;
|
|
507 // is true if myo library is loaded when dll is loaded
|
|
508 DLL_LOCAL extern bool __myo_init_in_so;
|
|
509
|
|
// IDB support

// Size of the __dbg_target_exe_name buffer declared below.
#define MAX_TARGET_NAME 512

// Variables inside the linkage block carry an explicit `extern` so that they
// remain declarations rather than definitions.
extern "C" {

// Called by the offload runtime after initialization of offload infrastructure
// has been completed.
void __dbg_target_so_loaded();

// Called by the offload runtime when the offload infrastructure is about to be
// shut down, currently at application exit.
void __dbg_target_so_unloaded();

// Null-terminated string containing path to the process image of the hosting
// application (offload_main)
extern char __dbg_target_exe_name[MAX_TARGET_NAME];

// Integer specifying the process id
extern pid_t __dbg_target_so_pid;

// Integer specifying the 0-based device number
extern int __dbg_target_id;

// Set to non-zero by the host-side debugger to enable offload debugging
// support
extern int __dbg_is_attached;

// Major version of the debugger support API
extern const int __dbg_api_major_version;

// Minor version of the debugger support API
extern const int __dbg_api_minor_version;

} // extern "C"
|
|
540
|
|
541 #endif // OFFLOAD_HOST_H_INCLUDED
|