view liboffloadmic/runtime/offload_target.cpp @ 158:494b0b89df80 default tip

...
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 25 May 2020 18:13:55 +0900
parents 04ced10e8804
children
line wrap: on
line source

/*
    Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#include "offload_target.h"
#include <stdlib.h>
#include <unistd.h>
#ifdef SEP_SUPPORT
#include <fcntl.h>
#include <sys/ioctl.h>
#endif // SEP_SUPPORT
#include <omp.h>
#include <map>

// typedef offload_func_with_parms.
// Pointer to function that represents an offloaded entry point.
// The parameters are a temporary fix for parameters on the stack.
// OffloadDescriptor::offload() invokes the entry with the address of its
// local OffloadDescriptor as the single argument.
typedef void (*offload_func_with_parms)(void *);

// Target console and file logging
// prefix is assigned in __offload_target_init(); console_enabled and
// offload_report_level are refreshed from the function descriptor at the
// start of every OffloadDescriptor::offload() call.
const char *prefix;
int console_enabled = 0;
int offload_report_level = 0;

// Trace information
// Printable names used by the trace output in merge_var_descs().
// Indexed by VarDesc::direction.bits.
static const char* vardesc_direction_as_string[] = {
    "NOCOPY",
    "IN",
    "OUT",
    "INOUT"
};
// Indexed by the variable type stored in VarExtra::type_src / type_dst.
// NOTE(review): the order presumably mirrors the variable-type enumeration
// declared outside this file - confirm when adding entries.
static const char* vardesc_type_as_string[] = {
    "unknown",
    "data",
    "data_ptr",
    "func_ptr",
    "void_ptr",
    "string_ptr",
    "dv",
    "dv_data",
    "dv_data_slice",
    "dv_ptr",
    "dv_ptr_data",
    "dv_ptr_data_slice",
    "cean_var",
    "cean_var_ptr",
    "c_data_ptr_array",
    "c_extended_type",
    "c_func_ptr_array",
    "c_void_ptr_array",
    "c_string_ptr_array",
    "c_data_ptr_ptr",
    "c_func_ptr_ptr",
    "c_void_ptr_ptr",
    "c_string_ptr_ptr",
    "c_cean_var_ptr_ptr",
};

// Values reported by the user-visible API at the end of this file.
// Nothing in this file assigns mic_index or mic_engines_total; -1 is their
// initial value.
int mic_index = -1;
int mic_engines_total = -1;
// Set from COIPerfGetCycleFrequency() in __offload_target_init().
uint64_t mic_frequency = 0;
// Number of the offload currently being processed; copied from the function
// descriptor in OffloadDescriptor::offload().
int offload_number = 0;
// Per-buffer reference bookkeeping, keyed by buffer address.  Entries are
// created/updated by add_ref_count() and consumed by BufReleaseRef(), both
// of which take add_ref_lock.
static std::map<void*, RefInfo*> ref_data;
static mutex_t add_ref_lock;

#ifdef SEP_SUPPORT
static const char*  sep_monitor_env = "SEP_MONITOR";
static bool         sep_monitor = false;
static const char*  sep_device_env = "SEP_DEVICE";
static const char*  sep_device =  "/dev/sep3.8/c";
static int          sep_counter = 0;

#define SEP_API_IOC_MAGIC   99
#define SEP_IOCTL_PAUSE     _IO (SEP_API_IOC_MAGIC, 31)
#define SEP_IOCTL_RESUME    _IO (SEP_API_IOC_MAGIC, 32)

static void add_ref_count(void * buf, bool created)
{
    mutex_locker_t locker(add_ref_lock);
    RefInfo * info = ref_data[buf];

    if (info) {
        info->count++;
    }
    else {
        info = new RefInfo((int)created,(long)1);
    }
    info->is_added |= created;
    ref_data[buf] = info;
}

static void BufReleaseRef(void * buf)
{
    mutex_locker_t locker(add_ref_lock);
    RefInfo * info = ref_data[buf];

    if (info) {
        --info->count;
        if (info->count == 0 && info->is_added) {
            OFFLOAD_TRACE(1, "Calling COIBufferReleaseRef AddRef count = %d\n",
                             ((RefInfo *) ref_data[buf])->count);
            BufferReleaseRef(buf);
            info->is_added = 0;
        }
    }
}

// Issues SEP_IOCTL_PAUSE on the device named by sep_device.
//
// Returns the result of ioctl(), or -1 when the device cannot be opened.
// The descriptor is closed before returning.
static int VTPauseSampling(void)
{
    int ret = -1;
    int handle = open(sep_device, O_RDWR);
    // open() signals failure with -1; every non-negative value, including 0
    // (possible when stdin has been closed), is a valid descriptor that has
    // to be used and closed.
    if (handle >= 0) {
        ret = ioctl(handle, SEP_IOCTL_PAUSE);
        close(handle);
    }
    return ret;
}

// Issues SEP_IOCTL_RESUME on the device named by sep_device.
//
// Returns the result of ioctl(), or -1 when the device cannot be opened.
// The descriptor is closed before returning.
static int VTResumeSampling(void)
{
    int ret = -1;
    int handle = open(sep_device, O_RDWR);
    // open() signals failure with -1; every non-negative value, including 0
    // (possible when stdin has been closed), is a valid descriptor that has
    // to be used and closed.
    if (handle >= 0) {
        ret = ioctl(handle, SEP_IOCTL_RESUME);
        close(handle);
    }
    return ret;
}
#endif // SEP_SUPPORT

// Target-side handler for a single offload request.
//
// Builds a local OffloadDescriptor from the function descriptor found in
// MISC_DATA, looks up the requested entry point by name and runs it.
//
// Parameters:
//   buffer_count    - number of entries in BUFFERS
//   buffers         - buffer addresses for this offload; when the function
//                     descriptor has data_offset == 0 and carries in/out
//                     data, the last entry is the combined in/out data
//                     buffer and is removed from the list
//   misc_data       - FunctionDescriptor; when data_offset != 0 the input
//                     data starts data_offset bytes into this block
//   misc_data_len   - not used by this function
//   return_data     - output data area used when data_offset != 0
//   return_data_len - not used by this function
//
// The process is terminated with exit(1) when the variable descriptor
// arrays cannot be allocated or the entry point is not found.
void OffloadDescriptor::offload(
    uint32_t  buffer_count,
    void**    buffers,
    void*     misc_data,
    uint16_t  misc_data_len,
    void*     return_data,
    uint16_t  return_data_len
)
{
    FunctionDescriptor *func = (FunctionDescriptor*) misc_data;
    const char *name = func->data;
    OffloadDescriptor ofld;
    char *in_data = 0;
    char *out_data = 0;
    char *timer_data = 0;

    // publish the per-offload settings sent with the function descriptor
    console_enabled = func->console_enabled;
    timer_enabled = func->timer_enabled;
    offload_report_level = func->offload_report_level;
    offload_number = func->offload_number;
    ofld.set_offload_number(func->offload_number);

#ifdef SEP_SUPPORT
    if (sep_monitor) {
        // only the first of the concurrently running offloads resumes
        // sampling
        if (__sync_fetch_and_add(&sep_counter, 1) == 0) {
            OFFLOAD_DEBUG_TRACE(2, "VTResumeSampling\n");
            VTResumeSampling();
        }
    }
#endif // SEP_SUPPORT

    OFFLOAD_DEBUG_TRACE_1(2, ofld.get_offload_number(),
                          c_offload_start_target_func,
                          "Offload \"%s\" started\n", name);

    // initialize timer data
    OFFLOAD_TIMER_INIT();

    OFFLOAD_TIMER_START(c_offload_target_total_time);

    OFFLOAD_TIMER_START(c_offload_target_descriptor_setup);

    // get input/output buffer addresses
    if (func->in_datalen > 0 || func->out_datalen > 0) {
        if (func->data_offset != 0) {
            // data travels inline: input follows the function descriptor,
            // output goes to the return data area
            in_data = (char*) misc_data + func->data_offset;
            out_data = (char*) return_data;
        }
        else {
            // data travels in the last buffer, shared by input and output
            char *inout_buf = (char*) buffers[--buffer_count];
            in_data = inout_buf;
            out_data = inout_buf;
        }
    }

    // assign variable descriptors
    ofld.m_vars_total = func->vars_num;
    if (ofld.m_vars_total > 0) {
        uint64_t var_data_len = ofld.m_vars_total * sizeof(VarDesc);

        ofld.m_vars = (VarDesc*) malloc(var_data_len);
        if (ofld.m_vars == NULL) {
            LIBOFFLOAD_ERROR(c_malloc);
            // the array is written right below; continuing is not possible
            exit(1);
        }
        // the variable descriptors are the first thing in the input data
        memcpy(ofld.m_vars, in_data, var_data_len);

        ofld.m_vars_extra =
            (VarExtra*) malloc(ofld.m_vars_total * sizeof(VarExtra));
        // check the array that was just allocated (not m_vars again)
        if (ofld.m_vars_extra == NULL) {
            LIBOFFLOAD_ERROR(c_malloc);
            exit(1);
        }

        in_data += var_data_len;
        func->in_datalen -= var_data_len;
    }

    // timer data
    if (func->timer_enabled) {
        uint64_t timer_data_len = OFFLOAD_TIMER_DATALEN();

        // timer data occupies the head of the output area
        timer_data = out_data;
        out_data += timer_data_len;
        func->out_datalen -= timer_data_len;
    }

    // init Marshallers
    ofld.m_in.init_buffer(in_data, func->in_datalen);
    ofld.m_out.init_buffer(out_data, func->out_datalen);

    // copy buffers to offload descriptor
    std::copy(buffers, buffers + buffer_count,
              std::back_inserter(ofld.m_buffers));

    OFFLOAD_TIMER_STOP(c_offload_target_descriptor_setup);

    // find offload entry address
    OFFLOAD_TIMER_START(c_offload_target_func_lookup);

    offload_func_with_parms entry = (offload_func_with_parms)
        __offload_entries.find_addr(name);

    if (entry == NULL) {
#if OFFLOAD_DEBUG > 0
        if (console_enabled > 2) {
            __offload_entries.dump();
        }
#endif
        LIBOFFLOAD_ERROR(c_offload_descriptor_offload, name);
        exit(1);
    }

    OFFLOAD_TIMER_STOP(c_offload_target_func_lookup);

    OFFLOAD_TIMER_START(c_offload_target_func_time);

    // execute offload entry
    entry(&ofld);

    OFFLOAD_TIMER_STOP(c_offload_target_func_time);

    OFFLOAD_TIMER_STOP(c_offload_target_total_time);

    // copy timer data to the buffer
    OFFLOAD_TIMER_TARGET_DATA(timer_data);

    OFFLOAD_DEBUG_TRACE(2, "Offload \"%s\" finished\n", name);

#ifdef SEP_SUPPORT
    if (sep_monitor) {
        // only the last of the concurrently running offloads pauses
        // sampling
        if (__sync_sub_and_fetch(&sep_counter, 1) == 0) {
            OFFLOAD_DEBUG_TRACE(2, "VTPauseSampling\n");
            VTPauseSampling();
        }
    }
#endif // SEP_SUPPORT
}

// Combines the variable descriptors already stored in this descriptor
// (m_vars) with the descriptors supplied by the caller.
//
//   vars       - caller-supplied descriptors; they provide the ptr/into
//                addresses that are copied into m_vars
//   vars2      - optional (may be NULL) names used only for tracing
//   vars_total - number of entries in VARS (and VARS2)
//
// For every entry the effective source/destination types are stored in
// m_vars_extra; extended types are unwrapped through VarDescExtendedType.
// Terminates the process with exit(1) when VARS has more entries than
// m_vars or when the type bits of a pair of descriptors differ.
void OffloadDescriptor::merge_var_descs(
    VarDesc *vars,
    VarDesc2 *vars2,
    int vars_total
)
{
    // the caller must not supply more descriptors than are stored here
    if (vars_total > m_vars_total) {
        LIBOFFLOAD_ERROR(c_merge_var_descs1);
        exit(1);
    }

    for (int idx = 0; idx < m_vars_total; idx++) {
        VarDesc  &stored = m_vars[idx];
        VarExtra &extra  = m_vars_extra[idx];

        if (idx >= vars_total) {
            // no caller-supplied counterpart: keep the stored types as is
            extra.type_src = stored.type.src;
            extra.type_dst = stored.type.dst;
        }
        else {
            VarDesc &given = vars[idx];

            // both sides have to agree on the variable type
            if (stored.type.bits != given.type.bits) {
                OFFLOAD_TRACE(2,
                    "m_vars[%d].type.bits=%08x, vars[%d].type.bits=%08x\n",
                    idx, stored.type.bits, idx, given.type.bits);
                LIBOFFLOAD_ERROR(c_merge_var_descs2);
                exit(1);
            }

            // source side: m_vars_extra[].type_src replaces m_vars[].type.src
            if (stored.type.src != c_extended_type) {
                extra.type_src = stored.type.src;
                // a dope vector passed by device pointer keeps its own ptr
                if (!stored.flags.use_device_ptr ||
                    stored.type.src != c_dv) {
                    stored.ptr = given.ptr;
                }
            }
            else {
                VarDescExtendedType *ext_src =
                    reinterpret_cast<VarDescExtendedType*>(given.ptr);
                extra.type_src = ext_src->extended_type;
                stored.ptr     = ext_src->ptr;
            }

            // destination side: m_vars_extra[].type_dst replaces
            // m_vars[].type.dst
            if (stored.type.dst != c_extended_type) {
                extra.type_dst = stored.type.dst;
                stored.into = given.into;
            }
            else {
                VarDescExtendedType *ext_dst =
                    reinterpret_cast<VarDescExtendedType*>(given.into);
                extra.type_dst = ext_dst->extended_type;
                stored.into    = ext_dst->ptr;
            }

            // names are available only when the caller passed VARS2
            const char *src_name =
                (vars2 != NULL && vars2[idx].sname != NULL) ?
                vars2[idx].sname : "";

            OFFLOAD_DEBUG_TRACE_1(2, get_offload_number(), c_offload_var,
                "   VarDesc %d, var=%s, %s, %s\n",
                idx, src_name,
                vardesc_direction_as_string[stored.direction.bits],
                vardesc_type_as_string[extra.type_src]);

            if (vars2 != NULL && vars2[idx].dname != NULL) {
                OFFLOAD_TRACE(2, "              into=%s, %s\n",
                    vars2[idx].dname,
                    vardesc_type_as_string[extra.type_dst]);
            }
        }

        // dump the merged descriptor
        OFFLOAD_TRACE(2,
            "              type_src=%d, type_dstn=%d, direction=%d, "
            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p into=%p\n",
            extra.type_src,
            extra.type_dst,
            stored.direction.bits,
            stored.alloc_if,
            stored.free_if,
            stored.align,
            stored.mic_offset,
            stored.flags.bits,
            stored.offset,
            stored.size,
            stored.count,
            stored.ptr,
            stored.into);
    }
}

// Consumes the input stream (m_in) and the buffer list (m_buffers) of this
// offload and distributes their contents to the variables described by
// m_vars / m_vars_extra.
//
// Each variable is processed in two steps:
//   1. "Set pointer values": pointer-like variables get the address of
//      their buffer (taken from m_buffers or read from the stream) written
//      through ptr_addr, and the buffer's reference bookkeeping is updated.
//   2. "Do copyin": by-value data and function pointers are read from the
//      stream into the variable.
// The order of the receive_data() calls has to match the order in which the
// sender wrote the stream, so statements must not be reordered.
//
// Aborts on an unknown variable type.  On return the scatter timer is
// stopped and the compute timer is running (it is stopped again in
// gather_copyout_data()).
void OffloadDescriptor::scatter_copyin_data()
{
    OFFLOAD_TIMER_START(c_offload_target_scatter_inputs);

    OFFLOAD_DEBUG_TRACE(2, "IN  buffer @ %p size %lld\n",
                        m_in.get_buffer_start(),
                        m_in.get_buffer_size());
    OFFLOAD_DEBUG_DUMP_BYTES(2, m_in.get_buffer_start(),
                             m_in.get_buffer_size());

    // receive data
    for (int i = 0; i < m_vars_total; i++) {
        // The location on this side is 'ptr' when the variable is copied
        // out or has no 'into' part; otherwise it is 'into'.  The matching
        // type and static flag are selected the same way.
        bool src_is_for_mic = (m_vars[i].direction.out ||
                               m_vars[i].into == NULL);
        void** ptr_addr = src_is_for_mic ?
                          static_cast<void**>(m_vars[i].ptr) :
                          static_cast<void**>(m_vars[i].into);
        int type = src_is_for_mic ? m_vars_extra[i].type_src :
                                    m_vars_extra[i].type_dst;
        bool is_static = src_is_for_mic ?
                         m_vars[i].flags.is_static :
                         m_vars[i].flags.is_static_dstn;
        void *ptr = NULL;

        // The allocation displacement is present in the stream and has to
        // be consumed, but its value is not used here.
        if (m_vars[i].flags.alloc_disp) {
            int64_t offset = 0;
            m_in.receive_data(&offset, sizeof(offset));
        }
        // For dope-vector data the address to fill in is the Base field of
        // the array descriptor, reached directly or through one more
        // pointer depending on the type.
        if (VAR_TYPE_IS_DV_DATA_SLICE(type) ||
            VAR_TYPE_IS_DV_DATA(type)) {
            ArrDesc *dvp = (type == c_dv_data_slice || type == c_dv_data)?
                  reinterpret_cast<ArrDesc*>(ptr_addr) :
                  *reinterpret_cast<ArrDesc**>(ptr_addr);
            ptr_addr = reinterpret_cast<void**>(&dvp->Base);
        }
        // Set pointer values
        switch (type) {
            case c_data_ptr_array:
                {
                    // This descriptor covers the descriptors with indexes
                    // [ptr_arr_offset, ptr_arr_offset + count); each of
                    // them gets the address of its element of the pointer
                    // array.
                    int j = m_vars[i].ptr_arr_offset;
                    int max_el = j + m_vars[i].count;
                    char *dst_arr_ptr = (src_is_for_mic)?
                        *(reinterpret_cast<char**>(m_vars[i].ptr)) :
                        reinterpret_cast<char*>(m_vars[i].into);

                    // if is_pointer is 1 it means that pointer array itself
                    // is defined either via pointer or as class member.
                    // i.e. arr_ptr[0:5] or this->ARR[0:5]
                    if (m_vars[i].flags.is_pointer) {
                        int64_t offset = 0;
                        m_in.receive_data(&offset, sizeof(offset));
                        dst_arr_ptr = *((char**)dst_arr_ptr) + offset;
                    }
                    for (; j < max_el; j++) {
                        if (src_is_for_mic) {
                            m_vars[j].ptr =
                                dst_arr_ptr + m_vars[j].ptr_arr_offset;
                        }
                        else {
                            m_vars[j].into =
                                dst_arr_ptr + m_vars[j].ptr_arr_offset;
                        }
                    }
                }
                break;
            // nothing to set up for by-value data
            case c_data:
            case c_void_ptr:
            case c_void_ptr_ptr:
            case c_cean_var:
            case c_dv:
                break;

            case c_string_ptr:
            case c_data_ptr:
            case c_string_ptr_ptr:
            case c_data_ptr_ptr:
            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
            case c_dv_ptr:
                // Don't need ptr_addr value for variables from stack buffer.
                // Stack buffer address is set at var_desc with #0.
                if (i != 0 && m_vars[i].flags.is_stack_buf) {
                    break;
                }
                // For pointer-to-pointer types the pointer to fill in lives
                // 'offset' bytes past the address currently stored at
                // ptr_addr; the offset comes from the stream.
                if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
                    TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst)) {
                    int64_t offset;

                    m_in.receive_data(&offset, sizeof(offset));
                    ptr_addr = reinterpret_cast<void**>(
                                 reinterpret_cast<char*>(*ptr_addr) + offset);

                }

                if (m_vars[i].alloc_if && !m_vars[i].flags.preallocated) {
                    // The buffer address is either sent in the stream
                    // (sink_addr) or is the next entry of the buffer list.
                    void *buf = NULL;
                    if (m_vars[i].flags.sink_addr) {
                        m_in.receive_data(&buf, sizeof(buf));
                    }
                    else {
                        buf = m_buffers.front();
                        m_buffers.pop_front();
                    }
                    if (buf) {
                        if (!is_static) {
                            // A reference is taken only for buffers that
                            // came from the buffer list; add_ref_count()
                            // records whether that happened.
                            if (!m_vars[i].flags.sink_addr) {
                                // increment buffer reference
                                OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs);
                                BufferAddRef(buf);
                                OFFLOAD_TRACE(1, "Calling COIBufferAddRef %p\n", buf);
                                OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs);
                            }
                            add_ref_count(buf, 0 == m_vars[i].flags.sink_addr);
                            // NOTE(review): ref_data is read here without
                            // holding add_ref_lock.
                            OFFLOAD_TRACE(1, "    AddRef count = %d\n",
                                              ((RefInfo *) ref_data[buf])->count);
                        }
                        ptr = static_cast<char*>(buf) +
                                  m_vars[i].mic_offset +
                                  (m_vars[i].flags.is_stack_buf ?
                                   0 : m_vars[i].offset);

                    }
                    // ptr is still NULL here when no buffer was supplied
                    *ptr_addr = ptr;
                }
                else if (m_vars[i].flags.sink_addr) {
                    // No allocation requested: the address is sent in the
                    // stream and no reference bookkeeping is done.
                    // (This 'ptr' shadows the loop-level 'ptr'.)
                    void *buf;
                    m_in.receive_data(&buf, sizeof(buf));
                    void *ptr = static_cast<char*>(buf) +
                                    m_vars[i].mic_offset +
                                    (m_vars[i].flags.is_stack_buf ?
                                     0 : m_vars[i].offset);
                    *ptr_addr = ptr;
                }
                break;

            case c_func_ptr:
            case c_func_ptr_ptr:
                break;

            case c_dv_data:
            case c_dv_ptr_data:
            case c_dv_data_slice:
            case c_dv_ptr_data_slice:
                // Same scheme as for the pointer types above; ptr_addr
                // already refers to the Base field of the array descriptor.
                // Unlike the pointer case, flags.preallocated and
                // flags.is_stack_buf are not consulted here.
                if (m_vars[i].alloc_if) {
                    void *buf;
                    if (m_vars[i].flags.sink_addr) {
                        m_in.receive_data(&buf, sizeof(buf));
                    }
                    else {
                        buf = m_buffers.front();
                        m_buffers.pop_front();
                    }
                    if (buf) {
                        if (!is_static) {
                            if (!m_vars[i].flags.sink_addr) {
                                // increment buffer reference
                                OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs);
                                BufferAddRef(buf);
                                OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs);
                            }
                            add_ref_count(buf, 0 == m_vars[i].flags.sink_addr);
                        }
                        ptr = static_cast<char*>(buf) +
                            m_vars[i].mic_offset + m_vars[i].offset;
                    }
                    *ptr_addr = ptr;
                }
                else if (m_vars[i].flags.sink_addr) {
                    void *buf;
                    m_in.receive_data(&buf, sizeof(buf));
                    ptr = static_cast<char*>(buf) +
                          m_vars[i].mic_offset + m_vars[i].offset;
                    *ptr_addr = ptr;
                }
                break;

            default:
                LIBOFFLOAD_ERROR(c_unknown_var_type, type);
                abort();
        }
        // Release obsolete buffers for stack of persistent objects.
        // The vardesc with i==0 and flags.is_stack_buf==TRUE is always for
        // stack buffer pointer.
        // In this case 'size' is used as the number of buffer addresses
        // that follow in the stream.
        // NOTE(review): ref_data is modified here without holding
        // add_ref_lock, and the RefInfo object of an erased entry is not
        // deleted - confirm whether that is intended.
        if (i == 0 &&
            m_vars[i].flags.is_stack_buf &&
            !m_vars[i].direction.bits &&
            m_vars[i].alloc_if &&
            m_vars[i].size != 0) {
                for (int j=0; j < m_vars[i].size; j++) {
                    void *buf;
                    m_in.receive_data(&buf, sizeof(buf));
                    OFFLOAD_TRACE(4, "Releasing stack buffer %p\n", buf);
                    BufferReleaseRef(buf);
                    ref_data.erase(buf);
                }
        }
        // Do copyin
        // This step is always driven by the destination type.
        switch (m_vars_extra[i].type_dst) {
            case c_data_ptr_array:
                break;
            case c_data:
            case c_void_ptr:
            case c_void_ptr_ptr:
            case c_cean_var:
                // By-value data is read straight into the variable ('into'
                // when present, otherwise 'ptr').  For c_cean_var the size
                // and displacement precede the data in the stream.
                if (m_vars[i].direction.in &&
                    !m_vars[i].flags.is_static_dstn) {
                    int64_t size;
                    int64_t disp;
                    char* ptr = m_vars[i].into ?
                                 static_cast<char*>(m_vars[i].into) :
                                 static_cast<char*>(m_vars[i].ptr);
                    if (m_vars_extra[i].type_dst == c_cean_var) {
                        m_in.receive_data((&size), sizeof(int64_t));
                        m_in.receive_data((&disp), sizeof(int64_t));
                    }
                    else {
                        size = m_vars[i].size;
                        disp = 0;
                    }
                    m_in.receive_data(ptr + disp, size);
                }
                break;

            case c_dv:
                // The first sizeof(uint64_t) bytes of the dope vector are
                // left untouched; only the remainder is received.
                if (m_vars[i].direction.bits ||
                    m_vars[i].alloc_if ||
                    m_vars[i].free_if) {
                    char* ptr = m_vars[i].into ?
                                 static_cast<char*>(m_vars[i].into) :
                                 static_cast<char*>(m_vars[i].ptr);
                    m_in.receive_data(ptr + sizeof(uint64_t),
                                      m_vars[i].size - sizeof(uint64_t));
                }
                break;

            // pointer-like variables were fully handled in the first step
            case c_string_ptr:
            case c_data_ptr:
            case c_string_ptr_ptr:
            case c_data_ptr_ptr:
            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
            case c_dv_ptr:
            case c_dv_data:
            case c_dv_ptr_data:
            case c_dv_data_slice:
            case c_dv_ptr_data_slice:
                break;

            case c_func_ptr:
            case c_func_ptr_ptr:
                if (m_vars[i].direction.in) {
                    m_in.receive_func_ptr((const void**) m_vars[i].ptr);
                }
                break;

            default:
                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_dst);
                abort();
        }
    }

    OFFLOAD_TRACE(1, "Total copyin data received from host: [%lld] bytes\n",
                  m_in.get_tfr_size());

    OFFLOAD_TIMER_STOP(c_offload_target_scatter_inputs);

    OFFLOAD_TIMER_START(c_offload_target_compute);
}

// Counterpart of scatter_copyin_data(): writes the results of the offload
// to the output stream (m_out) and drops the buffer uses recorded while the
// inputs were scattered.
//
// For each variable that is not part of the stack buffer:
//   - by-value data and function pointers with direction.out are sent;
//   - for pointer-like variables with free_if set, the buffer address is
//     recomputed from the stored pointer and passed to BufReleaseRef();
//   - for preallocated pointers with alloc_if set, the pointer value itself
//     is sent.
// The source part is handled first (by type_src); when the variable has an
// 'into' part, its buffer is released as well (by type_dst).
//
// Aborts on an unknown variable type.  Stops the compute timer started by
// scatter_copyin_data().
void OffloadDescriptor::gather_copyout_data()
{
    OFFLOAD_TIMER_STOP(c_offload_target_compute);

    OFFLOAD_TIMER_START(c_offload_target_gather_outputs);

    for (int i = 0; i < m_vars_total; i++) {
        // same selection rule as in scatter_copyin_data()
        bool src_is_for_mic = (m_vars[i].direction.out ||
                               m_vars[i].into == NULL);
        if (m_vars[i].flags.is_stack_buf) {
            continue;
        }
        // source part
        switch (m_vars_extra[i].type_src) {
            case c_data_ptr_array:
                break;
            case c_data:
            case c_void_ptr:
            case c_void_ptr_ptr:
            case c_cean_var:
                if (m_vars[i].direction.out &&
                    !m_vars[i].flags.is_static) {
                    m_out.send_data(
                        static_cast<char*>(m_vars[i].ptr) + m_vars[i].disp,
                        m_vars[i].size);
                }
                break;

            case c_dv:
                break;

            case c_string_ptr:
            case c_data_ptr:
            case c_string_ptr_ptr:
            case c_data_ptr_ptr:
            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
            case c_dv_ptr:
                if (m_vars[i].free_if &&
                    src_is_for_mic &&
                    !m_vars[i].flags.preallocated &&
                    !m_vars[i].flags.is_static) {
                    // undo the address arithmetic done when the pointer
                    // was set up to get back to the buffer address
                    void *buf = *static_cast<char**>(m_vars[i].ptr) -
                                    m_vars[i].mic_offset -
                                    (m_vars[i].flags.is_stack_buf?
                                     0 : m_vars[i].offset);
                    if (buf == NULL) {
                        break;
                    }
                    // decrement buffer reference count
                    OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs);
                    BufReleaseRef(buf);
                    OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs);
                }
                if (m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
                    // send the pointer value stored in the variable
                    m_out.send_data((void*) m_vars[i].ptr, sizeof(void*));
                }
                break;

            case c_func_ptr:
            case c_func_ptr_ptr:
                if (m_vars[i].direction.out) {
                    m_out.send_func_ptr(*((void**) m_vars[i].ptr));
                }
                break;

            case c_dv_data:
            case c_dv_ptr_data:
            case c_dv_data_slice:
            case c_dv_ptr_data_slice:
                if (src_is_for_mic &&
                    m_vars[i].free_if &&
                    !m_vars[i].flags.is_static) {
                    // the array descriptor is reached directly or through
                    // one more pointer depending on the type
                    ArrDesc *dvp = (m_vars_extra[i].type_src == c_dv_data ||
                               m_vars_extra[i].type_src == c_dv_data_slice) ?
                               static_cast<ArrDesc*>(m_vars[i].ptr) :
                               *static_cast<ArrDesc**>(m_vars[i].ptr);

                    void *buf = reinterpret_cast<char*>(dvp->Base) -
                                m_vars[i].mic_offset -
                                m_vars[i].offset;

                    if (buf == NULL) {
                        break;
                    }

                    // decrement buffer reference count
                    OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs);
                    BufReleaseRef(buf);
                    OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs);
                }
                break;

            default:
                // report the type this switch was actually dispatching on
                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
                abort();
        }

        // destination part
        if (m_vars[i].into) {
            switch (m_vars_extra[i].type_dst) {
                case c_data_ptr_array:
                    break;
                case c_data:
                case c_void_ptr:
                case c_void_ptr_ptr:
                case c_cean_var:
                case c_dv:
                    break;

                case c_string_ptr:
                case c_data_ptr:
                case c_string_ptr_ptr:
                case c_data_ptr_ptr:
                case c_cean_var_ptr:
                case c_cean_var_ptr_ptr:
                case c_dv_ptr:
                    if (m_vars[i].direction.in &&
                        m_vars[i].free_if &&
                        !m_vars[i].flags.is_static_dstn) {
                        void *buf = *static_cast<char**>(m_vars[i].into) -
                                    m_vars[i].mic_offset -
                                    (m_vars[i].flags.is_stack_buf?
                                     0 : m_vars[i].offset);

                        if (buf == NULL) {
                            break;
                        }
                        // decrement buffer reference count
                        OFFLOAD_TIMER_START(
                            c_offload_target_release_buffer_refs);
                        BufReleaseRef(buf);
                        OFFLOAD_TIMER_STOP(
                            c_offload_target_release_buffer_refs);
                    }
                    break;

                case c_func_ptr:
                case c_func_ptr_ptr:
                    break;

                case c_dv_data:
                case c_dv_ptr_data:
                case c_dv_data_slice:
                case c_dv_ptr_data_slice:
                    if (m_vars[i].free_if &&
                        m_vars[i].direction.in &&
                        !m_vars[i].flags.is_static_dstn) {
                        ArrDesc *dvp =
                            (m_vars_extra[i].type_dst == c_dv_data_slice ||
                             m_vars_extra[i].type_dst == c_dv_data) ?
                            static_cast<ArrDesc*>(m_vars[i].into) :
                            *static_cast<ArrDesc**>(m_vars[i].into);
                        void *buf = reinterpret_cast<char*>(dvp->Base) -
                              m_vars[i].mic_offset -
                              m_vars[i].offset;

                        if (buf == NULL) {
                            break;
                        }
                        // decrement buffer reference count
                        OFFLOAD_TIMER_START(
                            c_offload_target_release_buffer_refs);
                        BufReleaseRef(buf);
                        OFFLOAD_TIMER_STOP(
                            c_offload_target_release_buffer_refs);
                    }
                    break;

                default:
                    LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_dst);
                    abort();
            }
        }
    }

    OFFLOAD_DEBUG_TRACE(2, "OUT buffer @ p %p size %lld\n",
                        m_out.get_buffer_start(),
                        m_out.get_buffer_size());

    OFFLOAD_DEBUG_DUMP_BYTES(2,
                             m_out.get_buffer_start(),
                             m_out.get_buffer_size());

    OFFLOAD_DEBUG_TRACE_1(1, get_offload_number(), c_offload_copyout_data,
                  "Total copyout data sent to host: [%lld] bytes\n",
                  m_out.get_tfr_size());

    OFFLOAD_TIMER_STOP(c_offload_target_gather_outputs);
}

// One-time initialization of the target-side runtime state kept in this
// file: optional sampling-control settings taken from the environment
// (SEP_SUPPORT builds only), the report message prefix and the cycle
// frequency.
void __offload_target_init(void)
{
#ifdef SEP_SUPPORT
    // a non-empty value of the monitor variable is converted with atoi();
    // any non-zero result enables sampling control
    const char* monitor_setting = getenv(sep_monitor_env);
    if (monitor_setting != 0 && monitor_setting[0] != '\0') {
        sep_monitor = atoi(monitor_setting);
    }

    // a non-empty value of the device variable replaces the default
    // device path
    const char* device_setting = getenv(sep_device_env);
    if (device_setting != 0 && device_setting[0] != '\0') {
        sep_device = device_setting;
    }
#endif // SEP_SUPPORT

    // prefix used by console and file logging
    prefix = report_get_message_str(c_report_mic);

    // cycle frequency reported by COI
    mic_frequency = COIPerfGetCycleFrequency();
}

// User-visible offload API

// Returns the current value of mic_engines_total.  The variable starts out
// as -1 and is not assigned anywhere in this file.
int _Offload_number_of_devices(void)
{
    return mic_engines_total;
}

// Returns the current value of mic_index.  The variable starts out as -1
// and is not assigned anywhere in this file.
int _Offload_get_device_number(void)
{
    return mic_index;
}

// Returns the engine index reported by EngineGetIndex().
//
// The local is pre-set to the same "unknown" value (-1) that mic_index
// starts with, so that a call which does not fill in the index produces -1
// instead of an indeterminate value.
// NOTE(review): the failure behaviour of EngineGetIndex() is not visible
// from this file - confirm whether its result should be checked as well.
int _Offload_get_physical_device_number(void)
{
    uint32_t index = static_cast<uint32_t>(-1);
    EngineGetIndex(&index);
    return static_cast<int>(index);
}