Mercurial > hg > CbC > CbC_gcc
comparison liboffloadmic/runtime/offload_engine.cpp @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
68:561a7518be6b | 111:04ced10e8804 |
---|---|
1 /* | |
2 Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved. | |
3 | |
4 Redistribution and use in source and binary forms, with or without | |
5 modification, are permitted provided that the following conditions | |
6 are met: | |
7 | |
8 * Redistributions of source code must retain the above copyright | |
9 notice, this list of conditions and the following disclaimer. | |
10 * Redistributions in binary form must reproduce the above copyright | |
11 notice, this list of conditions and the following disclaimer in the | |
12 documentation and/or other materials provided with the distribution. | |
13 * Neither the name of Intel Corporation nor the names of its | |
14 contributors may be used to endorse or promote products derived | |
15 from this software without specific prior written permission. | |
16 | |
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 */ | |
29 | |
30 | |
31 #include "offload_engine.h" | |
32 #include <signal.h> | |
33 #include <errno.h> | |
34 #include <sys/stat.h> | |
35 #include <sys/types.h> | |
36 | |
37 #include <algorithm> | |
38 #include <vector> | |
39 | |
40 #include "offload_host.h" | |
41 #include "offload_table.h" | |
42 #include "offload_iterator.h" | |
43 | |
44 #if defined(HOST_WINNT) | |
45 #define PATH_SEPARATOR ";" | |
46 #else | |
47 #define PATH_SEPARATOR ":" | |
48 #endif | |
49 | |
// Static members of Stream class must be defined in exactly one
// translation unit. These members describe the list of all streams
// created by the program via calls to _Offload_stream_create.
uint64_t Stream::m_streams_count = 0;  // monotonically growing stream counter
StreamMap Stream::all_streams;         // handle -> Stream* registry
mutex_t Stream::m_stream_lock;         // guards all_streams (see find_stream)
// Device-side library search path; selected lazily in
// Engine::init_process based on the device ISA (KNC vs KNL).
char* mic_library_path = 0;
57 | |
// Names of the device-side (sink) functions resolved through
// COI::ProcessGetFunctionHandles in init_process. The order of this
// array must match the c_func_* indices used to index m_funcs
// (c_func_compute, c_func_init, c_func_var_table_size,
// c_func_var_table_copy are all used below in this file).
const char* Engine::m_func_names[Engine::c_funcs_total] =
{
    "server_compute",
#ifdef MYO_SUPPORT
    "server_myoinit",
    "server_myofini",
#endif // MYO_SUPPORT
    "server_init",
    "server_var_table_size",
    "server_var_table_copy",
    "server_set_stream_affinity"
};
70 | |
// Symbolic representation of system signals. Fix for CQ233593.
// Indexed directly by signal number; fini_process clamps out-of-range
// values to slot 0 ("Unknown SIGNAL") before printing.
const char* Engine::c_signal_names[Engine::c_signal_max] =
{
    "Unknown SIGNAL",
    "SIGHUP",    /*  1, Hangup (POSIX).  */
    "SIGINT",    /*  2, Interrupt (ANSI).  */
    "SIGQUIT",   /*  3, Quit (POSIX).  */
    "SIGILL",    /*  4, Illegal instruction (ANSI).  */
    "SIGTRAP",   /*  5, Trace trap (POSIX).  */
    "SIGABRT",   /*  6, Abort (ANSI).  */
    "SIGBUS",    /*  7, BUS error (4.2 BSD).  */
    "SIGFPE",    /*  8, Floating-point exception (ANSI).  */
    "SIGKILL",   /*  9, Kill, unblockable (POSIX).  */
    "SIGUSR1",   /* 10, User-defined signal 1 (POSIX).  */
    "SIGSEGV",   /* 11, Segmentation violation (ANSI).  */
    "SIGUSR2",   /* 12, User-defined signal 2 (POSIX).  */
    "SIGPIPE",   /* 13, Broken pipe (POSIX).  */
    "SIGALRM",   /* 14, Alarm clock (POSIX).  */
    "SIGTERM",   /* 15, Termination (ANSI).  */
    "SIGSTKFLT", /* 16, Stack fault.  */
    "SIGCHLD",   /* 17, Child status has changed (POSIX).  */
    "SIGCONT",   /* 18, Continue (POSIX).  */
    "SIGSTOP",   /* 19, Stop, unblockable (POSIX).  */
    "SIGTSTP",   /* 20, Keyboard stop (POSIX).  */
    "SIGTTIN",   /* 21, Background read from tty (POSIX).  */
    "SIGTTOU",   /* 22, Background write to tty (POSIX).  */
    "SIGURG",    /* 23, Urgent condition on socket (4.2 BSD).  */
    "SIGXCPU",   /* 24, CPU limit exceeded (4.2 BSD).  */
    "SIGXFSZ",   /* 25, File size limit exceeded (4.2 BSD).  */
    "SIGVTALRM", /* 26, Virtual alarm clock (4.2 BSD).  */
    "SIGPROF",   /* 27, Profiling alarm clock (4.2 BSD).  */
    "SIGWINCH",  /* 28, Window size change (4.3 BSD, Sun).  */
    "SIGIO",     /* 29, I/O now possible (4.2 BSD).  */
    "SIGPWR",    /* 30, Power failure restart (System V).  */
    "SIGSYS"     /* 31, Bad system call.  */
};
107 | |
// Lazily brings the engine to a usable state: starts the target
// process, loads pending libraries and (re)builds the pointer table.
// Uses double-checked locking on m_ready so the common, already
// initialized path does not take m_lock.
// NOTE(review): the unsynchronized first read of the plain bool
// m_ready is not formally race-free; presumably it relies on the
// supported platforms' semantics — confirm.
void Engine::init(void)
{
    if (!m_ready) {
        mutex_locker_t locker(m_lock);

        if (!m_ready) {
            // start process if not done yet
            if (m_process == 0) {
                init_process();
            }

            // load pending images
            load_libraries();

            // and (re)build pointer table
            init_ptr_data();

            // it is ready now
            m_ready = true;

            // Inform the debugger only after everything is loaded so it
            // can access all the libraries (see note in init_process).
            if (__dbg_is_attached) {
                __dbg_target_so_loaded();
            }
        }
    }
}
135 | |
136 void Engine::print_stream_cpu_list(const char * str) | |
137 { | |
138 int count = 0; | |
139 char buffer[1024]; | |
140 CpuEl* cpu_el = m_cpu_head; | |
141 | |
142 OFFLOAD_DEBUG_TRACE(3, | |
143 "%s : cpu list as Index(Count) for the streams is :\n", str); | |
144 buffer[0] = 0; | |
145 for (int i = 0; i < m_num_threads; i++) { | |
146 cpu_el = m_cpus + i; | |
147 if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) { | |
148 count++; | |
149 sprintf(buffer + strlen(buffer), "%d(%d) ", CPU_INDEX(cpu_el), cpu_el->count); | |
150 if (count % 20 == 0) { | |
151 OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer); | |
152 buffer[0] = 0; | |
153 } | |
154 } | |
155 } | |
156 if (count % 20 != 0) { | |
157 OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer); | |
158 } | |
159 } | |
160 | |
// Creates and initializes the COI process on the target device:
// builds the target-side environment, obtains the COI engine handle,
// queries core/thread topology, links the m_cpus availability list
// used for stream CPU assignment, configures DMA channels, starts the
// target executable (either a compiler-provided "offload_main" found
// on mic_library_path, or the FAT binary embedded in the host
// executable), optionally reserves page pools, resolves the sink
// function handles and runs device-side initialization.
void Engine::init_process(void)
{
    COIENGINE engine;
    COIRESULT res;
    const char **environ;   // NOTE(review): shadows the POSIX global
                            // `environ` — presumably intentional; confirm.
    char buf[4096]; // For exe path name
    char* mic_device_main = 0;

    // create environment for the target process
    environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
    if (environ != 0) {
        for (const char **p = environ; *p != 0; p++) {
            OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index, *p);
        }
    }

    // Create execution context in the specified device
    OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
                        m_physical_index);
    res = COI::EngineGetHandle(COI_ISA_MIC, m_physical_index, &engine);
    check_result(res, c_get_engine_handle, m_index, res);

    // Get engine info on threads and cores.
    // The values of core number and thread number will be used later at
    // stream creation by call to
    // _Offload_stream_create(device,number_of_cpus).

    COI_ENGINE_INFO engine_info;

    res = COI::EngineGetInfo(engine, sizeof(COI_ENGINE_INFO), &engine_info);
    check_result(res, c_get_engine_info, m_index, res);

    // Pick the device-side library search path matching the device ISA.
    if (mic_library_path == 0 ) {
        if (engine_info.ISA == COI_DEVICE_KNC) {
            mic_library_path = knc_library_path;
        }
        else if (engine_info.ISA == COI_DEVICE_KNL) {
            mic_library_path = knl_library_path;
        }
        else {
            LIBOFFLOAD_ERROR(c_unknown_mic_device_type);
        }
    }

    // m_cpus is the list of all available threads.
    // At the beginning all threads made available through OFFLOAD_DEVICES,
    // or all threads existing at the engine if OFFLOAD_DEVICES isn't set.
    // m_cpu_head points to the head of the m_cpus list.
    // m_cpus is ordered by number of streams using the thread.
    // m_cpu_head points to the least used thread.
    // After creating and destroying a stream the m_cpus list must be fixed
    // to be ordered.

    m_cpus = (CpuEl*)malloc(engine_info.NumThreads * sizeof(CpuEl));
    if (m_cpus == NULL)
        LIBOFFLOAD_ERROR(c_malloc);
    memset(m_cpus, 0, engine_info.NumThreads * sizeof(CpuEl));
    CpuEl* prev_cpu = NULL;

    // Link only the allowed threads into the doubly-linked list;
    // threads excluded by OFFLOAD_DEVICES stay zeroed and unlinked.
    for (int i = 0; i < engine_info.NumThreads; i++) {
        if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
            if (prev_cpu) {
                prev_cpu->next = m_cpus + i;
            }
            else {
                m_cpu_head = m_cpus + i;
            }
            m_cpus[i].prev = prev_cpu;
            m_cpus[i].count = 0;
            prev_cpu = m_cpus + i;
        }
    }

    // The following values will be used at pipeline creation for streams
    m_num_cores = engine_info.NumCores;
    m_num_threads = engine_info.NumThreads;

    print_stream_cpu_list("init_process");

    // Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2
    // Only the value 2 is supported in 16.0
    if (mic_dma_channel_count == 2) {
        if (COI::ProcessConfigureDMA) {
            // Set DMA channels using COI API
            COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE);
        }
        else {
            // Set environment variable COI_DMA_CHANNEL_COUNT.
            // Use putenv instead of setenv as Windows has no setenv.
            // Note: putenv requires its argument can't be freed or modified.
            // So no free after call to putenv or elsewhere.
            char * env_var = strdup("COI_DMA_CHANNEL_COUNT=2");
            if (env_var == NULL)
                LIBOFFLOAD_ERROR(c_malloc);
            putenv(env_var);
        }
    }

    // Target executable is not available; use compiler-provided offload_main
    if (__target_exe == 0) {
        // find target executable to be used if main application is not an
        // offload build application.
        const char *base_name = "offload_main";
        if (mic_library_path != 0) {
            char *buf = strdup(mic_library_path);
            if (buf == NULL)
                LIBOFFLOAD_ERROR(c_malloc);
            char *try_name = (char*) alloca(strlen(mic_library_path) +
                                            strlen(base_name) + 2);
            char *dir, *ptr;

            // Walk each directory of the search path looking for a
            // regular file named offload_main.
            for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0;
                 dir = strtok_r(0, PATH_SEPARATOR, &ptr)) {
                // compose a full path
                sprintf(try_name, "%s/%s", dir, base_name);

                // check if such file exists
                struct stat st;
                if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
                    mic_device_main = strdup(try_name);
                    if (mic_device_main == NULL)
                        LIBOFFLOAD_ERROR(c_malloc);
                    break;
                }
            }
            free(buf);
        }
        if (mic_device_main == 0) {
            LIBOFFLOAD_ERROR(c_report_no_target_exe, "offload_main");
            exit(1);
        }

        OFFLOAD_DEBUG_TRACE(2,
            "Loading target executable %s\n",mic_device_main);

        res = COI::ProcessCreateFromFile(
            engine,             // in_Engine
            mic_device_main,    // in_pBinaryName
            0,                  // in_Argc
            0,                  // in_ppArgv
            environ == 0,       // in_DupEnv
            environ,            // in_ppAdditionalEnv
            mic_proxy_io,       // in_ProxyActive
            mic_proxy_fs_root,  // in_ProxyfsRoot
            mic_buffer_size,    // in_BufferSpace
            mic_library_path,   // in_LibrarySearchPath
            &m_process          // out_pProcess
        );
    }
    else {
        // Target executable should be available by the time when we
        // attempt to initialize the device

        // Need the full path of the FAT exe for VTUNE
        {
#ifndef TARGET_WINNT
            ssize_t len = readlink("/proc/self/exe", buf,1000);
#else
            int len = GetModuleFileName(NULL, buf,1000);
#endif // TARGET_WINNT
            // NOTE(review): GetModuleFileName reports failure by
            // returning 0, not -1; the check below only covers the
            // readlink case — confirm on Windows builds.
            if (len == -1) {
                LIBOFFLOAD_ERROR(c_report_no_host_exe);
                exit(1);
            }
            else if (len > 999) {
                LIBOFFLOAD_ERROR(c_report_path_buff_overflow);
                exit(1);
            }
            // readlink does not NUL-terminate; terminate explicitly.
            buf[len] = '\0';
        }

        OFFLOAD_DEBUG_TRACE(2,
            "Loading target executable \"%s\" from %p, size %lld, host file %s\n",
            __target_exe->name, __target_exe->data, __target_exe->size,
            buf);

        res = COI::ProcessCreateFromMemory(
            engine,             // in_Engine
            __target_exe->name, // in_pBinaryName
            __target_exe->data, // in_pBinaryBuffer
            __target_exe->size, // in_BinaryBufferLength,
            0,                  // in_Argc
            0,                  // in_ppArgv
            environ == 0,       // in_DupEnv
            environ,            // in_ppAdditionalEnv
            mic_proxy_io,       // in_ProxyActive
            mic_proxy_fs_root,  // in_ProxyfsRoot
            mic_buffer_size,    // in_BufferSpace
            mic_library_path,   // in_LibrarySearchPath
            buf,                // in_FileOfOrigin
            -1,                 // in_FileOfOriginOffset use -1 to indicate to
                                // COI that is is a FAT binary
            &m_process          // out_pProcess
        );
    }
    check_result(res, c_process_create, m_index, res);

    // Optionally pre-reserve 4K/2M page pools on the target.
    if ((mic_4k_buffer_size != 0) || (mic_2m_buffer_size !=0)) {
        // available only in MPSS 4.2 and greater
        if (COI::ProcessSetCacheSize != 0 ) {
            int flags;
            // Need compiler to use MPSS 3.2 or greater to get these
            // definitions, so currently hardcoding it:
            // COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC;
            flags = 0x00020002;
            res = COI::ProcessSetCacheSize(
                m_process,           // in_Process
                mic_2m_buffer_size,  // in_HugePagePoolSize
                flags,               // inHugeFlags
                mic_4k_buffer_size,  // in_SmallPagePoolSize
                flags,               // inSmallFlags
                0,                   // in_NumDependencies
                0,                   // in_pDependencies
                0                    // out_PCompletion
            );
            OFFLOAD_DEBUG_TRACE(2,
                "Reserve target buffers 4K pages = %d 2M pages = %d\n",
                mic_4k_buffer_size, mic_2m_buffer_size);
            check_result(res, c_process_set_cache_size, m_index, res);
        }
        else {
            OFFLOAD_DEBUG_TRACE(2,
                "Reserve target buffers not supported in current MPSS\n");
        }
    }

    // get function handles
    res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
                                         m_func_names, m_funcs);
    check_result(res, c_process_get_func_handles, m_index, res);

    // initialize device side
    pid_t pid = init_device();

    // For IDB
    if (__dbg_is_attached) {
        // TODO: we have in-memory executable now.
        // Check with IDB team what should we provide them now?
        if (__target_exe == 0) {
            strcpy(__dbg_target_exe_name, "offload_main");
        }
        else {
            if (strlen(__target_exe->name) < MAX_TARGET_NAME) {
                strcpy(__dbg_target_exe_name, __target_exe->name);
            }
        }
        __dbg_target_so_pid = pid;
        __dbg_target_id = m_physical_index;
        // The call to __dbg_target_so_loaded() is moved
        // to Engine:init so all the libraries are loaded before
        // informing debugger so debugger can access them.
        // __dbg_target_so_loaded();
    }
}
413 | |
// Destroys the target COI process (waiting indefinitely: timeout -1)
// and reports how it terminated.
// @param verbose  when true, report the exit signal/code (or a generic
//                 failure) through LIBOFFLOAD_ERROR.
void Engine::fini_process(bool verbose)
{
    if (m_process != 0) {
        uint32_t sig;
        int8_t ret;

        // destroy target process
        OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
                            m_index);

        COIRESULT res = COI::ProcessDestroy(m_process, -1, 0, &ret, &sig);
        m_process = 0;

        if (res == COI_SUCCESS) {
            OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
                                sig, ret);
            if (verbose) {
                if (sig != 0) {
                    // Out-of-range signal numbers map to slot 0
                    // ("Unknown SIGNAL") of c_signal_names.
                    LIBOFFLOAD_ERROR(
                        c_mic_process_exit_sig, m_index, sig,
                        c_signal_names[sig >= c_signal_max ? 0 : sig]);
                }
                else {
                    LIBOFFLOAD_ERROR(c_mic_process_exit_ret, m_index, ret);
                }
            }

            // for idb
            if (__dbg_is_attached) {
                __dbg_target_so_unloaded();
            }
        }
        else {
            if (verbose) {
                LIBOFFLOAD_ERROR(c_mic_process_exit, m_index);
            }
        }
    }
}
453 | |
// Loads every target image collected in m_images into the device
// process and records each as a DynLib; the pending list is cleared
// at the end. COI_ALREADY_EXISTS is tolerated (library was loaded
// earlier); any other failure goes through check_result.
void Engine::load_libraries()
{
    // load libraries collected so far
    for (TargetImageList::iterator it = m_images.begin();
         it != m_images.end(); it++) {
        OFFLOAD_DEBUG_TRACE(2,
            "Loading library \"%s\" from %p, size %llu, host file %s\n",
            it->name, it->data, it->size, it->origin);

        // load library to the device
        COILIBRARY lib;
        COIRESULT res;
        res = COI::ProcessLoadLibraryFromMemory(m_process,
                                                it->data,
                                                it->size,
                                                it->name,
                                                mic_library_path,
                                                it->origin,
                                                (it->origin) ? -1 : 0,
                                                COI_LOADLIBRARY_V1_FLAGS,
                                                &lib);
        // NOTE(review): the DynLib entry is recorded before the result
        // is checked, so a failed load still leaves a (possibly invalid)
        // handle in m_dyn_libs — presumably harmless because
        // check_result aborts; confirm.
        m_dyn_libs.push_front(DynLib(it->name, it->data, lib));

        if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
            check_result(res, c_load_library, it->origin, m_index, res);
        }
    }
    m_images.clear();
}
483 | |
484 void Engine::unload_library(const void *data, const char *name) | |
485 { | |
486 if (m_process == 0) { | |
487 return; | |
488 } | |
489 for (DynLibList::iterator it = m_dyn_libs.begin(); | |
490 it != m_dyn_libs.end(); it++) { | |
491 if (it->data == data) { | |
492 COIRESULT res; | |
493 OFFLOAD_DEBUG_TRACE(2, | |
494 "Unloading library \"%s\"\n",name); | |
495 res = COI::ProcessUnloadLibrary(m_process,it->lib); | |
496 m_dyn_libs.erase(it); | |
497 if (res != COI_SUCCESS) { | |
498 check_result(res, c_unload_library, m_index, res); | |
499 } | |
500 return; | |
501 } | |
502 } | |
503 } | |
504 | |
505 static bool target_entry_cmp( | |
506 const VarList::BufEntry &l, | |
507 const VarList::BufEntry &r | |
508 ) | |
509 { | |
510 const char *l_name = reinterpret_cast<const char*>(l.name); | |
511 const char *r_name = reinterpret_cast<const char*>(r.name); | |
512 return strcmp(l_name, r_name) < 0; | |
513 } | |
514 | |
515 static bool host_entry_cmp( | |
516 const VarTable::Entry *l, | |
517 const VarTable::Entry *r | |
518 ) | |
519 { | |
520 return strcmp(l->name, r->name) < 0; | |
521 } | |
522 | |
// Builds the host<->target association for statically allocated
// variables. Asks the device for its variable table size
// (server_var_table_size), copies the table into a COI buffer
// (server_var_table_copy), patches the in-buffer name pointers, sorts
// both host and target tables by name, and enters every name present
// on both sides into the pointer-data map.
void Engine::init_ptr_data(void)
{
    COIRESULT res;
    COIEVENT event;

    // Prepare table of host entries
    std::vector<const VarTable::Entry*> host_table(
        Iterator(__offload_vars.get_head()),
        Iterator());

    // no need to do anything further if host table is empty
    if (host_table.size() <= 0) {
        return;
    }

    // Get var table entries from the target.
    // First we need to get size for the buffer to copy data
    struct {
        int64_t nelems;
        int64_t length;
    } params;

    res = COI::PipelineRunFunction(get_pipeline(),
                                   m_funcs[c_func_var_table_size],
                                   0, 0, 0,
                                   0, 0,
                                   0, 0,
                                   &params, sizeof(params),
                                   &event);
    check_result(res, c_pipeline_run_func, m_index, res);

    res = COI::EventWait(1, &event, -1, 1, 0, 0);
    check_result(res, c_event_wait, res);

    // Empty target table: nothing to match against.
    if (params.length == 0) {
        return;
    }

    // create buffer for target entries and copy data to host
    COIBUFFER buffer;
    res = COI::BufferCreate(params.length, COI_BUFFER_NORMAL, 0, 0, 1,
                            &m_process, &buffer);
    check_result(res, c_buf_create, m_index, res);

    COI_ACCESS_FLAGS flags = COI_SINK_WRITE;
    res = COI::PipelineRunFunction(get_pipeline(),
                                   m_funcs[c_func_var_table_copy],
                                   1, &buffer, &flags,
                                   0, 0,
                                   &params.nelems, sizeof(params.nelems),
                                   0, 0,
                                   &event);
    check_result(res, c_pipeline_run_func, m_index, res);

    res = COI::EventWait(1, &event, -1, 1, 0, 0);
    check_result(res, c_event_wait, res);

    // patch names in target data
    VarList::BufEntry *target_table;
    COIMAPINSTANCE map_inst;
    res = COI::BufferMap(buffer, 0, params.length, COI_MAP_READ_ONLY, 0, 0,
                         0, &map_inst,
                         reinterpret_cast<void**>(&target_table));
    check_result(res, c_buf_map, res);

    VarList::table_patch_names(target_table, params.nelems);

    // and sort entries so the two tables can be merged linearly
    std::sort(target_table, target_table + params.nelems, target_entry_cmp);
    std::sort(host_table.begin(), host_table.end(), host_entry_cmp);

    // merge host and target entries and enter matching vars map
    std::vector<const VarTable::Entry*>::const_iterator hi =
        host_table.begin();
    std::vector<const VarTable::Entry*>::const_iterator he =
        host_table.end();
    const VarList::BufEntry *ti = target_table;
    const VarList::BufEntry *te = target_table + params.nelems;

    // Classic sorted-merge: advance whichever side compares smaller,
    // record an association when the names are equal.
    while (hi != he && ti != te) {
        int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
        if (res == 0) {
            bool is_new;
            // add matching entry to var map
            PtrData *ptr = insert_ptr_data((*hi)->addr, (*hi)->size, is_new);

            // store address for new entries
            if (is_new) {
                ptr->mic_addr = ti->addr;
                ptr->is_static = true;
                ptr->var_alloc_type = (*hi)->var_alloc_type;
            }
            // insert_ptr_data returns with the entry locked.
            ptr->alloc_ptr_data_lock.unlock();
            hi++;
            ti++;
        }
        else if (res < 0) {
            hi++;
        }
        else {
            ti++;
        }
    }

    // cleanup
    res = COI::BufferUnmap(map_inst, 0, 0, 0);
    check_result(res, c_buf_unmap, res);

    res = COI::BufferDestroy(buffer);
    check_result(res, c_buf_destroy, res);
}
634 | |
// Launches the device-side "server_compute" function on the pipeline
// selected by `stream` (the per-thread pipeline for no_stream).
// @param stream     stream handle, or no_stream for the default pipeline
// @param buffers    COI buffers to pass to the sink function
// @param data       misc input data blob (by value)
// @param data_size  size of `data` in bytes
// @param ret        output area for the sink's return data
// @param ret_size   size of `ret` in bytes
// @param num_deps   number of input dependency events
// @param deps       input dependency events
// @param event      completion event (out)
// @return COI result of PipelineRunFunction; caller checks it.
COIRESULT Engine::compute(
    _Offload_stream stream,
    const std::list<COIBUFFER> &buffers,
    const void* data,
    uint16_t data_size,
    void* ret,
    uint16_t ret_size,
    uint32_t num_deps,
    const COIEVENT* deps,
    COIEVENT* event
) /* const */
{
    COIBUFFER *bufs;
    COI_ACCESS_FLAGS *flags;
    COIRESULT res;

    // convert buffers list to array (stack-allocated; freed on return)
    int num_bufs = buffers.size();
    if (num_bufs > 0) {
        bufs = (COIBUFFER*) alloca(num_bufs * sizeof(COIBUFFER));
        flags = (COI_ACCESS_FLAGS*) alloca(num_bufs *
                                           sizeof(COI_ACCESS_FLAGS));

        int i = 0;
        for (std::list<COIBUFFER>::const_iterator it = buffers.begin();
             it != buffers.end(); it++) {
            bufs[i] = *it;

            // TODO: this should be fixed — every buffer is marked
            // writable regardless of its actual direction.
            flags[i++] = COI_SINK_WRITE;
        }
    }
    else {
        bufs = 0;
        flags = 0;
    }
    COIPIPELINE pipeline = (stream == no_stream) ?
                               get_pipeline() :
                               get_pipeline(stream);
    // start computation
    res = COI::PipelineRunFunction(pipeline,
                                   m_funcs[c_func_compute],
                                   num_bufs, bufs, flags,
                                   num_deps, deps,
                                   data, data_size,
                                   ret, ret_size,
                                   event);
    return res;
}
684 | |
// Runs device-side initialization (server_init), passing device index,
// total device count and reporting levels, and waits for completion.
// @return the pid of the device-side process (reported by server_init).
pid_t Engine::init_device(void)
{
    // Layout must match what server_init expects on the sink side.
    struct init_data {
        int  device_index;
        int  devices_total;
        int  console_level;
        int  offload_report_level;
    } data;
    COIRESULT res;
    COIEVENT event;
    pid_t pid;

    OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init,
                          "Initializing device with logical index %d "
                          "and physical index %d\n",
                           m_index, m_physical_index);

    // setup misc data
    data.device_index = m_index;
    data.devices_total = mic_engines_total;
    data.console_level = console_enabled;
    data.offload_report_level = offload_report_level;

    res = COI::PipelineRunFunction(get_pipeline(),
                                   m_funcs[c_func_init],
                                   0, 0, 0, 0, 0,
                                   &data, sizeof(data),
                                   &pid, sizeof(pid),
                                   &event);
    check_result(res, c_pipeline_run_func, m_index, res);

    // Block until the device finished initializing (timeout -1).
    res = COI::EventWait(1, &event, -1, 1, 0, 0);
    check_result(res, c_event_wait, res);

    OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid);

    return pid;
}
723 | |
// data associated with each host thread: one COI pipeline per engine,
// created lazily by Engine::get_pipeline. Stored in thread-local
// storage under mic_thread_key; the destructor releases the pipelines
// and decrements the shared pipeline counter it was given.
struct Thread {
    Thread(long* addr_coipipe_counter) {
        m_addr_coipipe_counter = addr_coipipe_counter;
        memset(m_pipelines, 0, sizeof(m_pipelines));
    }

    ~Thread() {
#ifndef TARGET_WINNT
        __sync_sub_and_fetch(m_addr_coipipe_counter, 1);
#else // TARGET_WINNT
        _InterlockedDecrement(m_addr_coipipe_counter);
#endif // TARGET_WINNT
        // Destroy only the pipelines this thread actually created.
        for (int i = 0; i < mic_engines_total; i++) {
            if (m_pipelines[i] != 0) {
                COI::PipelineDestroy(m_pipelines[i]);
            }
        }
    }

    // Pipeline for engine `index`, or 0 if not created yet.
    COIPIPELINE get_pipeline(int index) const {
        return m_pipelines[index];
    }

    void set_pipeline(int index, COIPIPELINE pipeline) {
        m_pipelines[index] = pipeline;
    }

    AutoSet& get_auto_vars() {
        return m_auto_vars;
    }

private:
    long*       m_addr_coipipe_counter;  // shared count of live pipelines
    AutoSet     m_auto_vars;
    COIPIPELINE m_pipelines[MIC_ENGINES_MAX];  // one slot per engine index
};
761 | |
762 COIPIPELINE Engine::get_pipeline(void) | |
763 { | |
764 Thread* thread = (Thread*) thread_getspecific(mic_thread_key); | |
765 if (thread == 0) { | |
766 thread = new Thread(&m_proc_number); | |
767 thread_setspecific(mic_thread_key, thread); | |
768 } | |
769 | |
770 COIPIPELINE pipeline = thread->get_pipeline(m_index); | |
771 if (pipeline == 0) { | |
772 COIRESULT res; | |
773 int proc_num; | |
774 | |
775 #ifndef TARGET_WINNT | |
776 proc_num = __sync_fetch_and_add(&m_proc_number, 1); | |
777 #else // TARGET_WINNT | |
778 proc_num = _InterlockedIncrement(&m_proc_number); | |
779 #endif // TARGET_WINNT | |
780 | |
781 if (proc_num > COI_PIPELINE_MAX_PIPELINES) { | |
782 LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES); | |
783 LIBOFFLOAD_ABORT; | |
784 } | |
785 | |
786 // Create pipeline for this thread | |
787 if (m_assigned_cpus == 0) { | |
788 // If m_assigned_cpus is NULL, it implies all threads | |
789 // Create the pipeline with no CPU mask | |
790 res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline); | |
791 } else { | |
792 // Create COI CPU mask | |
793 COI_CPU_MASK in_Mask; | |
794 res = COI::PipelineClearCPUMask(in_Mask); | |
795 check_result(res, c_clear_cpu_mask, m_index, res); | |
796 | |
797 int threads_per_core = m_num_threads / m_num_cores; | |
798 | |
799 // Available threads are defined by examining of m_assigned_cpus bitset. | |
800 // We skip thread 0. | |
801 for (int i = 1; i < m_num_threads; i++) { | |
802 // For available thread i m_assigned_cpus[i] is equal to 1 | |
803 if ((*m_assigned_cpus)[i]) { | |
804 COI_CPU_MASK_SET(i, in_Mask); | |
805 } | |
806 } | |
807 OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this CPU thread\n" | |
808 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n" | |
809 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n", | |
810 in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3], | |
811 in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7], | |
812 in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11], | |
813 in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]); | |
814 | |
815 // Create the pipeline with allowable CPUs | |
816 res = COI::PipelineCreate(m_process, in_Mask, mic_stack_size, &pipeline); | |
817 } | |
818 check_result(res, c_pipeline_create, m_index, res); | |
819 thread->set_pipeline(m_index, pipeline); | |
820 } | |
821 return pipeline; | |
822 } | |
823 | |
824 Stream* Stream::find_stream(uint64_t handle, bool remove) | |
825 { | |
826 Stream *stream = 0; | |
827 | |
828 m_stream_lock.lock(); | |
829 { | |
830 StreamMap::iterator it = all_streams.find(handle); | |
831 if (it != all_streams.end()) { | |
832 stream = it->second; | |
833 if (remove) { | |
834 all_streams.erase(it); | |
835 } | |
836 } | |
837 } | |
838 m_stream_lock.unlock(); | |
839 return stream; | |
840 } | |
841 | |
// Detaches cpu_what from the doubly-linked m_cpus list and re-inserts
// it immediately after cpu_after. Used to keep the list ordered by
// per-thread stream usage count. No-op when both arguments are the
// same element.
void Engine::move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after)
{
    if (cpu_what == cpu_after) {
        return;
    }
    CpuEl* cpu_prev = cpu_what->prev;

    // remove cpu_what from its current position; a NULL prev means it
    // was the head, so the head pointer must be advanced
    if (!cpu_prev) {
        m_cpu_head = cpu_what->next;
    }
    else {
        cpu_prev->next = cpu_what->next;
    }
    if (cpu_what->next) {
        cpu_what->next->prev = cpu_prev;
    }

    // insert cpu_what after cpu_after
    cpu_what->prev = cpu_after;
    cpu_what->next = cpu_after->next;
    if (cpu_after->next) {
        cpu_after->next->prev = cpu_what;
    }
    cpu_after->next = cpu_what;
}
868 | |
869 COIPIPELINE Engine::get_pipeline(_Offload_stream handle) | |
870 { | |
871 Stream * stream = Stream::find_stream(handle, false); | |
872 | |
873 if (!stream) { | |
874 LIBOFFLOAD_ERROR(c_offload_no_stream, m_index); | |
875 LIBOFFLOAD_ABORT; | |
876 } | |
877 | |
878 COIPIPELINE pipeline = stream->get_pipeline(); | |
879 | |
880 if (pipeline == 0) { | |
881 COIRESULT res; | |
882 int proc_num; | |
883 COI_CPU_MASK in_Mask ; | |
884 | |
885 #ifndef TARGET_WINNT | |
886 proc_num = __sync_fetch_and_add(&m_proc_number, 1); | |
887 #else // TARGET_WINNT | |
888 proc_num = _InterlockedIncrement(&m_proc_number); | |
889 #endif // TARGET_WINNT | |
890 | |
891 if (proc_num > COI_PIPELINE_MAX_PIPELINES) { | |
892 LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES); | |
893 LIBOFFLOAD_ABORT; | |
894 } | |
895 | |
896 m_stream_lock.lock(); | |
897 | |
898 // start process if not done yet | |
899 if (m_process == 0) { | |
900 init_process(); | |
901 } | |
902 | |
903 // create CPUmask | |
904 res = COI::PipelineClearCPUMask(in_Mask); | |
905 check_result(res, c_clear_cpu_mask, m_index, res); | |
906 | |
907 int stream_cpu_num = stream->get_cpu_number(); | |
908 | |
909 stream->m_stream_cpus.reset(); | |
910 | |
911 int threads_per_core = m_num_threads / m_num_cores; | |
912 | |
913 | |
914 // Available threads is taken from m_cpus list. | |
915 // m_cpu_head points to the head of m_cpus. | |
916 // the elements of m_cpus is ordered by the number of usage in streams. | |
917 | |
918 CpuEl *cpu_el = m_cpu_head; | |
919 CpuEl *cpu_used_el, *cpu_used_prev, *cpu_prev; | |
920 | |
921 for (int i = 0; i < stream_cpu_num; i++) { | |
922 COI_CPU_MASK_SET(CPU_INDEX(cpu_el), in_Mask); | |
923 stream->m_stream_cpus.set(CPU_INDEX(cpu_el)); | |
924 //If the number of availabale threads is less than stream_cpu_num, | |
925 // the stream_cpu_num is restricted to this number. | |
926 if (!cpu_el->next) { | |
927 break; | |
928 } | |
929 if (i + 1 < stream_cpu_num) { | |
930 cpu_el = cpu_el->next; | |
931 } | |
932 } | |
933 | |
934 // assertion : cpu_el points to the last used thread | |
935 cpu_used_el = cpu_el; | |
936 while (cpu_used_el) { | |
937 cpu_used_el->count++; | |
938 cpu_el = cpu_prev = cpu_used_el; | |
939 cpu_used_prev = cpu_used_el->prev; | |
940 if (!cpu_el->next) { | |
941 cpu_used_el = cpu_used_prev; | |
942 continue; | |
943 } | |
944 | |
945 while (cpu_el) { | |
946 if (cpu_used_el->count < cpu_el->count) { | |
947 break; | |
948 } | |
949 // Equal used threads are ordered by thread number to | |
950 // assign to a stream as contiguous threads as possible. | |
951 else if (cpu_used_el->count == cpu_el->count && | |
952 CPU_INDEX(cpu_used_el) < CPU_INDEX(cpu_el)) { | |
953 break; | |
954 } | |
955 cpu_prev = cpu_el; | |
956 cpu_el = cpu_el->next; | |
957 } | |
958 if (cpu_used_el != cpu_prev) { | |
959 move_cpu_el_after(cpu_used_el, cpu_prev); | |
960 } | |
961 cpu_used_el = cpu_used_prev; | |
962 } | |
963 print_stream_cpu_list("get_pipeline"); | |
964 | |
965 // create pipeline for this thread | |
966 OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this Stream\n" | |
967 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n" | |
968 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n", | |
969 in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3], | |
970 in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7], | |
971 in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11], | |
972 in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]); | |
973 res = COI::PipelineCreate(m_process, in_Mask, | |
974 mic_stack_size, &pipeline); | |
975 check_result(res, c_pipeline_create, m_index, res); | |
976 | |
977 // Set stream's affinities | |
978 { | |
979 struct affinity_spec affinity_spec; | |
980 char* affinity_type; | |
981 int i; | |
982 | |
983 // "compact" by default | |
984 affinity_spec.affinity_type = affinity_compact; | |
985 | |
986 // Check if user has specified type of affinity | |
987 if ((affinity_type = getenv("OFFLOAD_STREAM_AFFINITY")) != | |
988 NULL) | |
989 { | |
990 char affinity_str[16]; | |
991 int affinity_str_len; | |
992 | |
993 OFFLOAD_DEBUG_TRACE(2, | |
994 "User has specified OFFLOAD_STREAM_AFFINITY=%s\n", | |
995 affinity_type); | |
996 | |
997 // Set type of affinity requested | |
998 affinity_str_len = strlen(affinity_type); | |
999 for (i=0; i<affinity_str_len && i<15; i++) | |
1000 { | |
1001 affinity_str[i] = tolower(affinity_type[i]); | |
1002 } | |
1003 affinity_str[i] = '\0'; | |
1004 if (strcmp(affinity_str, "compact") == 0) { | |
1005 affinity_spec.affinity_type = affinity_compact; | |
1006 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n"); | |
1007 } else if (strcmp(affinity_str, "scatter") == 0) { | |
1008 affinity_spec.affinity_type = affinity_scatter; | |
1009 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n"); | |
1010 } else { | |
1011 LIBOFFLOAD_ERROR(c_incorrect_affinity, affinity_str); | |
1012 affinity_spec.affinity_type = affinity_compact; | |
1013 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n"); | |
1014 } | |
1015 } | |
1016 // Make flat copy of sink mask because COI's mask is opaque | |
1017 for (i=0; i<16; i++) { | |
1018 affinity_spec.sink_mask[i] = in_Mask[i]; | |
1019 } | |
1020 // Set number of cores and threads | |
1021 affinity_spec.num_cores = m_num_cores; | |
1022 affinity_spec.num_threads = m_num_threads; | |
1023 | |
1024 COIEVENT event; | |
1025 res = COI::PipelineRunFunction(pipeline, | |
1026 m_funcs[c_func_set_stream_affinity], | |
1027 0, 0, 0, | |
1028 0, 0, | |
1029 &affinity_spec, sizeof(affinity_spec), | |
1030 0, 0, | |
1031 &event); | |
1032 check_result(res, c_pipeline_run_func, m_index, res); | |
1033 | |
1034 res = COI::EventWait(1, &event, -1, 1, 0, 0); | |
1035 check_result(res, c_event_wait, res); | |
1036 } | |
1037 | |
1038 m_stream_lock.unlock(); | |
1039 stream->set_pipeline(pipeline); | |
1040 } | |
1041 return pipeline; | |
1042 } | |
1043 | |
// Destroy the stream identified by "handle": give the device threads it had
// reserved back to the engine's usage-ordered CPU list and delete the Stream
// object.  Aborts (LIBOFFLOAD_ABORT) if the handle does not name a live
// stream.
void Engine::stream_destroy(_Offload_stream handle)
{
    // get stream
    Stream * stream = Stream::find_stream(handle, true);

    if (stream) {
        // return cpus for future use
        for (int i = 0; i < m_num_threads; i++) {
            if (stream->m_stream_cpus.test(i)) {
                CpuEl *cpu_el = m_cpus + i;
                CpuEl *cpu_first_el = cpu_el;
                // decrease count of thread "i" and move its CpuEl to the
                // proper place into the ordered list
                cpu_el->count--;
                // Walk toward the list head while the predecessor is used at
                // least as often as thread "i" (ties broken by thread index:
                // equally-used threads are kept in ascending index order).
                while (cpu_el->prev) {
                    if (cpu_first_el->count > cpu_el->prev->count) {
                        break;
                    }
                    else if (cpu_first_el->count == cpu_el->prev->count &&
                        CPU_INDEX(cpu_first_el) > CPU_INDEX(cpu_el->prev)) {
                        break;
                    }
                    cpu_el = cpu_el->prev;
                }
                // Step once more: cpu_el is now the element after which
                // thread "i" belongs, or NULL if it must become the new head.
                cpu_el = cpu_el->prev;
                // If cpu_el for thread "i" must be moved in the list
                if (cpu_first_el != cpu_el) {
                    // Thread "i" is used the least times. It must be set as
                    // the m_cpu_head.
                    if (!cpu_el) {
                        // Already the head (no predecessor) — nothing to move.
                        if (!cpu_first_el->prev) {
                            continue;
                        }
                        // remove cpu_el.
                        cpu_first_el->prev->next = cpu_first_el->next;
                        if (cpu_first_el->next) {
                            cpu_first_el->next->prev = cpu_first_el->prev;
                        }
                        // make cpu_first_el as new m_cpu_head
                        cpu_first_el->prev = NULL;
                        cpu_first_el->next = m_cpu_head;
                        m_cpu_head->prev = cpu_first_el;
                        m_cpu_head = cpu_first_el;
                    }
                    else {
                        // Splice thread "i" back in right after cpu_el.
                        move_cpu_el_after(cpu_first_el, cpu_el);
                    }
                }
            }
        }
        print_stream_cpu_list("stream_destroy");
        delete stream;
    }
    else {
        LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
        LIBOFFLOAD_ABORT;
    }
}
1102 | |
1103 uint64_t Engine::get_thread_id(void) | |
1104 { | |
1105 Thread* thread = (Thread*) thread_getspecific(mic_thread_key); | |
1106 if (thread == 0) { | |
1107 thread = new Thread(&m_proc_number); | |
1108 thread_setspecific(mic_thread_key, thread); | |
1109 } | |
1110 | |
1111 return reinterpret_cast<uint64_t>(thread); | |
1112 } | |
1113 | |
1114 AutoSet& Engine::get_auto_vars(void) | |
1115 { | |
1116 Thread* thread = (Thread*) thread_getspecific(mic_thread_key); | |
1117 if (thread == 0) { | |
1118 thread = new Thread(&m_proc_number); | |
1119 thread_setspecific(mic_thread_key, thread); | |
1120 } | |
1121 | |
1122 return thread->get_auto_vars(); | |
1123 } | |
1124 | |
1125 void Engine::destroy_thread_data(void *data) | |
1126 { | |
1127 delete static_cast<Thread*>(data); | |
1128 } |