changeset 1925:cd5bbd8ec5d6 draft

fix CudaScheduler
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Fri, 31 Jan 2014 05:56:23 +0900
parents 14f9fc88872c
children 777bdbf6c072
files TaskManager/Cell/spe/SpeTaskManagerImpl.h TaskManager/Cuda/CudaError.h TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h TaskManager/Cuda/CudaThreads.cc TaskManager/Cuda/CudaThreads.h TaskManager/Makefile.cuda TaskManager/kernel/schedule/Scheduler.h example/Cuda/main.cc
diffstat 9 files changed, 146 insertions(+), 97 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Cell/spe/SpeTaskManagerImpl.h	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Cell/spe/SpeTaskManagerImpl.h	Fri Jan 31 05:56:23 2014 +0900
@@ -40,6 +40,14 @@
     void append_waitTask(HTask* p);
     
 #endif
+
+#ifdef __CERIUM_CUDA__
+
+    SpeTaskManagerImpl(int i);
+    void append_activeTask(HTask* p);
+    void append_waitTask(HTask* p);
+
+#endif
 }  ;
 
 
--- a/TaskManager/Cuda/CudaError.h	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Cuda/CudaError.h	Fri Jan 31 05:56:23 2014 +0900
@@ -37,7 +37,7 @@
     const char* message;
 
     for(int i=0; Error_Status[i].status_string != NULL; i++) {
-        if (Error_Status[i].status = status) {
+        if (Error_Status[i].status == status) {
             message = Error_Status[i].status_string;
         }
     }
--- a/TaskManager/Cuda/CudaScheduler.cc	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Fri Jan 31 05:56:23 2014 +0900
@@ -1,6 +1,6 @@
 #include "TaskManager.h"
 #include "CudaScheduler.h"
-#include "ReferenceDmaManager.h"
+#include "ReferencedDmaManager.h"
 #include "PreRefDmaManager.h"
 #include "SchedTask.h"
 #include "CudaError.h"
@@ -25,9 +25,9 @@
     cuInit(0);
     cuDeviceGetCount(&ret_num_devices);
     if (ret_num_devices == 0) {
-        exit(EXIT_FILURE);
+        exit(EXIT_FAILURE);
     }
-    cuDeviceGet(&context, 0);
+    cuDeviceGet(&device, 0);
     ret = cuCtxCreate(&context, 0, device);
     if (ret!=0) {
         error(convert_error_status(ret));
@@ -47,9 +47,9 @@
     m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*));
     m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*));
     m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*));
-    error = cuStreamCreate(&m->stream, 0);
-    if (error!=0)
-        error(convert_error_status(error));
+    ret = cuStreamCreate(&m->stream, 0);
+    if (ret!=0)
+        error(convert_error_status(ret));
 }
 
 void
@@ -57,23 +57,28 @@
     free(m->memin);
     free(m->memout);
     free(m->event);
-    m->size = 0;
+    ret = cuStreamDestroy(m->stream);
+    if (ret!=0)
+        error(convert_error_status(ret));
+    m->memin = 0;
+    m->memout = 0;
+    m->in_size = 0;
+    m->out_size = 0;
     m->allcate_size = 0;
-    m->buf = 0;
     m->event = 0;
-    cuStreamDestroy(m->stream);
+    m->stream = 0;
 }
 
 CUdeviceptr
-CudaScheduler::createBuffer(CUdeviceptr* mem, int i, size_t size, int* error) {
-    if (i > m->allcate_size) {
-        m->allcate_size *= 2;
-        m->memin = (CUdeviceptr*)realloc(m->memin, m->allcate_size*sizeof(CUdeviceptr*));
-        m->memout = (CUdeviceptr*)realloc(m->memout, m->allcate_size*sizeof(CUdeviceptr*));
-        m->event = (CUevent*)remalloc(m->event, m->allcate_size*sizeof(CUevent*));
+CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size, int* error) {
+    if (i > cudabuffer->allcate_size) {
+        cudabuffer->allcate_size *= 2;
+        cudabuffer->memin = (CUdeviceptr*)realloc(cudabuffer->memin, cudabuffer->allcate_size*sizeof(CUdeviceptr*));
+        cudabuffer->memout = (CUdeviceptr*)realloc(cudabuffer->memout, cudabuffer->allcate_size*sizeof(CUdeviceptr*));
+        cudabuffer->event = (CUevent*)realloc(cudabuffer->event, cudabuffer->allcate_size*sizeof(CUevent*));
     }
 
-    error = cuMemAlloc(mem[i], size);
+    *error = cuMemAlloc(&mem[i], size);
     
     return mem[i];
 }
@@ -82,23 +87,29 @@
 
 static void
 release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) {
-    for (int i=0; i<mem[1-cur].in_size+mem[]; i++) {
-        if (mem[1-cur].event[i] != 0)
-            cuEventDestroy(mem[1-cur].event[i]);
-        mem[1-cur].event[i] = 0;
-        if (mem[1-cur].buf[i])
-            cuMemFree(mem[1-cur].buf[i]);
-        mem[1-cur].buf[i] = 0;
+    for (int i=0; i<mem[cur-1].in_size; i++) {
+        if (mem[cur-1].memin[i])
+            cuMemFree(mem[cur-1].memin[i]);
+        mem[cur-1].memin[i] = 0;
     }
-    mem[1-cur].size = 0;
+    for (int i=0; i<mem[cur-1].out_size; i++) {
+        if (mem[cur-1].event[i] != 0)
+            cuEventDestroy(mem[cur-1].event[i]);
+        mem[cur-1].event[i] = 0;
+        if (mem[cur-1].memout[i])
+            cuMemFree(mem[cur-1].memout[i]);
+        mem[cur-1].memout[i] = 0;
+    }
+    mem[cur-1].in_size = 0;
+    mem[cur-1].out_size = 0;
 }
 
 void
-CudaBufferPtr::wait_for_event(CUevent* kernel_event, CudaBufferPtr memout, CudaBufferPtr memin, TaskListPtr taskList, int cur) {
-    if (kernel_event[1-cur] == NOP_REPLY) {
-
-    } else {
-        ret = cuEventSynchronize(kernel_event[1-cur]);
+CudaScheduler::wait_for_event(CUevent* kernel_event, CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) {
+    if (kernel_event[cur-1] == NOP_REPLY) {
+        
+    } else if (kernel_event[cur-1] != NULL){
+        ret = cuEventSynchronize(kernel_event[cur-1]);
 
         if (ret!=0) {
             error(convert_error_status(ret));
@@ -108,18 +119,19 @@
             unsigned long end = 0;
             // timestamp 取る方法がない?
         }
-        cuEventDestroy(kernel_event[1-cur]);
-        kernel_event[1-cur] = 0;
-    }
-
-    if (memout[1-cur].size > 0) {
-        ret = cuEventSynchronize(memout[1-cur].event);
-        if (ret!=0) error(convert_error_status(ret));
-        release_buf_event(cur, memout);
-    }        
-
-    if (memin[1-cur].size > 0) {
-        release_buf_event(cur, memin);
+        ret = cuEventDestroy(kernel_event[cur-1]);
+        if (ret!=0) {
+            error(convert_error_status(ret));
+        }
+        kernel_event[cur-1] = 0;
+        
+        if (cudabuffer[cur-1].out_size > 0) {
+            for (int i = 0; i<cudabuffer[cur-1].out_size; i++) {
+                ret = cuEventSynchronize(cudabuffer[cur-1].event[i]);
+                if (ret!=0) error(convert_error_status(ret));
+            }
+        }
+        release_buf_event(cur, cudabuffer);
     }
 
     if(reply) {
@@ -130,18 +142,15 @@
 }
 
 void
-CudaScheduler::CudaTaskError(int cur, TaskListPtr taskList, int ret) {
+CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) {
     error(convert_error_status(ret));
     if (kernel_event[cur] != 0)
         cuEventDestroy(kernel_event[cur]);
     kernel_event[cur] = NOP_REPLY;
-    //    if (kernel[cur] != 0)
-    // kerneldestroy();
     kernel[cur] = 0;
-    release_buf_event(1-cur, memout);
-    release_buf_event(1-cur, memin);
+    release_buf_event(cur+1, cudabuffer);
 
-    wait_for_event(kernel_event, memout, taskList, cur);
+    wait_for_event(kernel_event, cudabuffer, taskList, cur+1);
 }
 
 void
@@ -161,8 +170,11 @@
     for (;;) {
         memaddr param_addr = connector->task_list_mail_read();
 
-        if ((memaddr)param_addr === (memaddr)MY_SPE_COMMAND_EXIT) {
+        if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
             for (int i = 0; i<stage; i++) {
+                ret = cuStreamSynchronize(cudabuffer[i].stream);
+                if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
+                
                 destroyCudaBuffer(&cudabuffer[i]);
             }
             free(cudabuffer);
@@ -170,9 +182,9 @@
         }
 
         (*connector->start_dmawait_profile)(&(connector->start_time));
-        while (params_addr) {
+        while (param_addr) {
             // since we are on the same memory space, we don't has to use dma_load here
-            tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
+            tasklist = (TaskListPtr)connector->dma_load(this, param_addr,
                                                         sizeof(TaskList), DMA_READ_TASKLIST);
             //            tasklist[cur]->task_start_time = gettime();
             tasklist->task_start_time = 0;
@@ -192,32 +204,32 @@
                 if(nextTask->command==StartProfile) {
                     connector->start_profile(); continue;
                 }
-                if (load_kernel(nextTask->command) == 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module;
                 const char *funcname = cuda_task_list[nextTask->command].name;
                 
-                ret = cuModuleGetFunction(kernel[cur], module, funcname);
-                if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                ret = cuModuleGetFunction(&kernel[cur], module, funcname);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
 
                 int param = 0;
 
                 // set arg count
-                CUdeviceptr memparam = createBuffer(cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count, &ret);
-                if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                CUdeviceptr memparam = createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count, &ret);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
 
                 // parameter is passed as first kernel arg 
                 ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream);
-                if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 
                 param++;
                 
                 for(int i=0;i<nextTask->inData_count;i++) {
                     ListElement *input_buf = nextTask->inData(i);
                     if (input_buf->size==0) break;
-                    createBuffer(cudabuffer[cur].memin, param, input_buf->size, &ret);
-                    if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size, &ret);
+                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                     ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
-                    if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                     
                     param++;
                 }
@@ -227,67 +239,86 @@
                     ListElement *output_buf = nextTask->outData(i);
                     if (output_buf->size==0) break;
                     if (!flag[cur].flip) { // flip use memin for output 
-                        createBuffer(cudabuffer[cur].memout, i, output_buf->size, &ret);
-                        if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                        createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size, &ret);
+                        if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                         // enqueue later
                     }
                     param++;
                 }
                 cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
 
-                void* kernelParams;
+                void** kernelParams;
 
-                if (!flag[cur.flip]) {
-                    kernelParams = malloc(sizeof(void*)*param);
-                    kernelParams[0] = memparam;
+                if (!flag[cur].flip) {
+                    kernelParams = (void**)malloc(sizeof(void*)*param);
+                    kernelParams[0] = &memparam;
                     for (int i = 1; i<cudabuffer[cur].in_size; i++) {
-                        kernelParams[i] = cudabuffer[cur].memin[i-1];
+                        kernelParams[i] = &cudabuffer[cur].memin[i-1];
                     }
                     for (int i = 0; i<cudabuffer[cur].out_size; i++) {
-                        kernelParams[i+cudabuffer[cur].in_size] = cudabuffer[cur].memout[i];
+                        kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
                     }
                 } else {
-                    kernelParams = malloc(sizeof(void*)*cudabuffer[cur].in_size);
-                    kernelParams[0] = memparam;
+                    kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size);
+                    kernelParams[0] = &memparam;
                     for (int i = 1; i<cudabuffer[cur].in_size; i++) {
-                        kernelParams[i] = memin[cur].buf[i-1];
+                        kernelParams[i] = &cudabuffer[cur].memin[i-1];
                     }
                 }
-                
+
+                ret = cuEventCreate(&kernel_event[cur], 0);
+                if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
+                    
                 if (tasklist->dim > 0) {
                     ret = cuLaunchKernel(kernel[cur],
                                          tasklist->x*tasklist->y*tasklist->z, 1, 1,
                                          1, 1, 1,
-                                         stream, kernelParams, NULL);
+                                         0, cudabuffer[cur].stream, kernelParams, NULL);
                 } else {
                     ret = cuLaunchKernel(kernel[cur],
                                          1, 1, 1,
                                          1, 1, 1,
-                                         stream, kernelParams, NULL);
+                                         0, cudabuffer[cur].stream, kernelParams, NULL);
                 }
-                if (ret!=0) { cudaTaskError(cur, tasklist, ret); continue; }
+                if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
 
+                ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream);
+                if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
+                
                 for(int i=0;i<nextTask->outData_count;i++) { // read output data
                     ListElement *output_buf = nextTask->outData(i);
                     if (output_buf->size==0) break;
-                    GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
+                    CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ;
                     int i0 = flag[cur].flip ? i+1 : i ;
                     // flip use memin buffer and memout event
-                    ret = cuMemcpyDtoHAsync(mem[cur].buf[i0], output_buf->addr, output_buf->size, stream);
-                    if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
+                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+                    ret = cuEventCreate(&cudabuffer[cur].event[i], 0);
+                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+                    ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream);
+                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 }
                 // wait kernel[1-cur] and write[1-cur]
                 // pipeline    : cur
-                // to stop pipeline set 1-cur
-                wait_for_event(kernel_event, memout, tasklist, cur);
-                cur = 1 - cur;
+                // to stop pipeline set cur+1
+                if (cur == 0) {
+                    wait_for_event(kernel_event, cudabuffer, tasklist, stage); // to stop pipeline comment out this line
+                } else {
+                    wait_for_event(kernel_event, cudabuffer, tasklist, cur);
+                }
+                cur += 1;
+                if (stage <= cur)
+                    cur = 0;
                 free(kernelParams);
+                cuModuleUnload(module);
             }
             reply = (memaddr)tasklist->waiter;
-            params_addr = (memaddr)tasklist->next;
+            param_addr = (memaddr)tasklist->next;
         }
-        wait_for_event(kernel_event, memout, tasklist, cur);
-
+        wait_for_event(kernel_event, cudabuffer, tasklist, cur == 0 ? stage : cur);
+        
         unsigned long long wait = 0;
         (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
         connector->mail_write((memaddr)MY_SPE_STATUS_READY);
--- a/TaskManager/Cuda/CudaScheduler.h	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Fri Jan 31 05:56:23 2014 +0900
@@ -7,8 +7,6 @@
 #include "HTask.h"
 #include "TaskManager.h"
 #include <cuda.h>
-#include <cuda_runtime.h>
-
 
 extern TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -34,7 +32,7 @@
     // Cuda の場合、NVIDIA だけなので必要ない?
     CUdevice device;
     unsigned int ret_num_platforms; // たぶん要らない
-    unsigned int ret_num_devices;
+    int ret_num_devices;
     CUcontext context;
     // command_queue command_queue;
     // Cuda には command_queue に相当するものはない
@@ -47,14 +45,14 @@
     CUfunction kernel[2];
     CUevent kernel_event[2];
     CudaBuffer* cudabuffer;
-    HTask::htask_flag[2];
+    HTask::htask_flag flag[2];
     
- privete:
+ private:
     int load_kernel(int cmd);
-    CUdeviceptr createBuffer(CudaBufferPtr m, int i, CUcontext context, /* mem_flag mem_flag, */size_t size, int* error);
+    CUdeviceptr createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size, int* error);
     void initCudaBuffer(CudaBufferPtr m);
     void destroyCudaBuffer(CudaBufferPtr m);
-    void CudaTaskError(int cur, TaskListPtr taskList, int ret);
+    void CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret);
 };
 
 #define CudaSchedRegister(str, filename, functionname)  \
--- a/TaskManager/Cuda/CudaThreads.cc	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Cuda/CudaThreads.cc	Fri Jan 31 05:56:23 2014 +0900
@@ -11,7 +11,7 @@
     args = new cuda_thread_arg_t;
 }
 
-CuduThreads::~CudaThreads()
+CudaThreads::~CudaThreads()
 {
     memaddr mail = (memaddr)MY_SPE_COMMAND_EXIT;
     send_mail(0,1,&mail);
--- a/TaskManager/Cuda/CudaThreads.h	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Cuda/CudaThreads.h	Fri Jan 31 05:56:23 2014 +0900
@@ -3,7 +3,6 @@
 
 #include <pthread.h>
 #include <cuda.h>
-#include <cuda_runtime.h>
 #include "Threads.h"
 #include "CudaScheduler.h"
 #include "Sem.h"
--- a/TaskManager/Makefile.cuda	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/Makefile.cuda	Fri Jan 31 05:56:23 2014 +0900
@@ -1,7 +1,7 @@
 include ./Makefile.def
 TARGET = libCudaManager.a
 CFLAGS += -DHAS_POSIX_MEMALIGN
-CUDA_PATH = /Developer/NVIDIA/CUDA-5.5/include
+INCLUDE += -I/Developer/NVIDIA/CUDA-5.5/include
 
 VPATH = CUDA_PATH
 
--- a/TaskManager/kernel/schedule/Scheduler.h	Thu Jan 30 16:22:51 2014 +0900
+++ b/TaskManager/kernel/schedule/Scheduler.h	Fri Jan 31 05:56:23 2014 +0900
@@ -20,6 +20,10 @@
 #endif
 #endif
 
+#ifdef __CERIUM_CUDA__
+#include <cuda.h>
+#endif
+
 #define MAX_USER_TASK 100
 #define MAX_SYSTEM_TASK 2
 #define MAX_TASK_OBJECT MAX_USER_TASK + MAX_SYSTEM_TASK
@@ -40,6 +44,11 @@
 #endif
 } GpuTaskObject;
 
+typedef struct cuda_task_object {
+#ifdef __CERIUM_CUDA__
+    CUmodule* module;
+#endif
+} CudaTaskObject;
 // Task Object Table
 //  this is named TaskObjectRun but it is not an object.
 //  It is a pointer to an object creation function
@@ -57,7 +66,7 @@
     void (*wait)(Scheduler *,int);
 
     GpuTaskObject *gputask;
-
+    CudaTaskObject* cudatask;
 }  __attribute__ ((aligned (DEFAULT_ALIGNMENT))) //sizeはどれくらい?
       TaskObject, *TaskObjectPtr;
 
--- a/example/Cuda/main.cc	Thu Jan 30 16:22:51 2014 +0900
+++ b/example/Cuda/main.cc	Fri Jan 31 05:56:23 2014 +0900
@@ -56,11 +56,15 @@
     cuMemcpyHtoDAsync(devB, B, LENGTH*sizeof(float), stream);
     cuMemcpyHtoDAsync(devC, C, LENGTH*sizeof(float), stream);
 
-    void* args[] = {&devA, &devB, &devC};
+    //    void* args[] = {&devA, &devB, &devC};
+    void** args=(void**)malloc(sizeof(void*)*3);
+    args[0] = &devA;
+    args[1] = &devB;
+    args[2] = &devC;
     
     cuLaunchKernel(function,
                    LENGTH, 1, 1,
-                   2, 1, 1,
+                   1, 1, 1,
                    0, stream, args, NULL);
     
     cuMemcpyDtoHAsync(C, devC, LENGTH*sizeof(float), stream);