changeset 1963:6988e5478a8c draft

fix CudaScheduler
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Wed, 12 Feb 2014 17:56:40 +0900
parents fdffcf8feeab
children 33d07fd99291
files TaskManager/Cell/spe/SpeTaskManagerImpl.h TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h TaskManager/kernel/ppe/CpuThreads.cc TaskManager/kernel/ppe/CpuThreads.h example/word_count/main.cc
diffstat 6 files changed, 42 insertions(+), 98 deletions(-)
--- a/TaskManager/Cell/spe/SpeTaskManagerImpl.h	Tue Feb 11 19:58:38 2014 +0900
+++ b/TaskManager/Cell/spe/SpeTaskManagerImpl.h	Wed Feb 12 17:56:40 2014 +0900
@@ -33,19 +33,10 @@
     void free_htask(HTaskPtr htask) {}
     void print_arch();
 
-#ifdef __CERIUM_GPU__
-    
+#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
     SpeTaskManagerImpl(int i);
     void append_activeTask(HTask* p);
     void append_waitTask(HTask* p);
-    
-#endif
-#ifdef __CERIUM_CUDA__
-
-    SpeTaskManagerImpl(int i);
-    void append_activeTask(HTask* p);
-    void append_waitTask(HTask* p);
-
 #endif
 }  ;
 
--- a/TaskManager/Cuda/CudaScheduler.cc	Tue Feb 11 19:58:38 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Wed Feb 12 17:56:40 2014 +0900
@@ -46,17 +46,17 @@
     m->out_size = 0;
     m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*));
     m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*));
-    m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*));
     ret = cuStreamCreate(&(m->stream), 0);
     if (ret!=0)
         error(convert_error_status(ret));
+    m->kernelParams = (void**)malloc(m->allcate_size*2*sizeof(void*));
 }
 
 void
 CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) {
     free(m->memin);
     free(m->memout);
-    free(m->event);
+    free(m->kernelParams);
     ret = cuStreamDestroy(m->stream);
     if (ret!=0)
         error(convert_error_status(ret));
@@ -65,20 +65,20 @@
     m->in_size = 0;
     m->out_size = 0;
     m->allcate_size = 0;
-    m->event = 0;
     m->stream = 0;
+    m->kernelParams = 0;
 }
 
 void
-CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size) {
-    if (i > cudabuffer->allcate_size) {
+CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int param, size_t size) {
+    if (param > cudabuffer->allcate_size) {
         cudabuffer->allcate_size *= 2;
         cudabuffer->memin = (CUdeviceptr*)realloc(cudabuffer->memin, cudabuffer->allcate_size*sizeof(CUdeviceptr*));
         cudabuffer->memout = (CUdeviceptr*)realloc(cudabuffer->memout, cudabuffer->allcate_size*sizeof(CUdeviceptr*));
-        cudabuffer->event = (CUevent*)realloc(cudabuffer->event, cudabuffer->allcate_size*sizeof(CUevent*));
+        cudabuffer->kernelParams = (void**)realloc(cudabuffer->kernelParams, cudabuffer->allcate_size*2*sizeof(void*));
     }
 
-    ret = cuMemAlloc(&mem[i], size);
+    ret = cuMemAlloc(&mem[param], size);
 }
 
 #define NOP_REPLY NULL
@@ -91,9 +91,6 @@
         mem[cur].memin[i] = 0;
     }
     for (int i=0; i<mem[cur].out_size; i++) {
-        if (mem[cur].event[i] != 0)
-            cuEventDestroy(mem[cur].event[i]);
-        mem[cur].event[i] = 0;
         if (mem[cur].memout[i])
             cuMemFree(mem[cur].memout[i]);
         mem[cur].memout[i] = 0;
@@ -104,10 +101,10 @@
 
 void
 CudaScheduler::wait_for_event(CUevent* kernel_event, CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) {
-    if (kernel_event[cur-1] == NOP_REPLY) {
+    if (cuEventQuery(kernel_event[cur]) == CUDA_SUCCESS) {
         
-    } else if (kernel_event[cur-1] != NULL){
-        ret = cuEventSynchronize(kernel_event[cur-1]);
+    } else if (cuEventQuery(kernel_event[cur]) == CUDA_ERROR_NOT_READY){
+        ret = cuEventSynchronize(kernel_event[cur]);
         
         if (ret!=0) {
             error(convert_error_status(ret));
@@ -117,19 +114,15 @@
             unsigned long end = 0;
             // is there no way to get a timestamp?
         }
-        ret = cuEventDestroy(kernel_event[cur-1]);
+        ret = cuEventDestroy(kernel_event[cur]);
         if (ret!=0) {
             error(convert_error_status(ret));
         }
-        kernel_event[cur-1] = 0;
-        
-        if (cudabuffer[cur-1].out_size > 0) {
-            for (int i = 0; i<cudabuffer[cur-1].out_size; i++) {
-                ret = cuEventSynchronize(cudabuffer[cur-1].event[i]);
-                if (ret!=0) error(convert_error_status(ret));
-            }
-        }
-        release_buf_event(cur-1, cudabuffer);
+        ret = cuEventCreate(&kernel_event[cur], CU_EVENT_DISABLE_TIMING);
+        if (ret!=0) {
+            error(convert_error_status(ret));
+        }        
+        release_buf_event(cur, cudabuffer);
     }
 
     if(reply) {
@@ -142,9 +135,10 @@
 void
 CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) {
     error(convert_error_status(ret));
-    if (kernel_event[cur] != 0)
+    if (cuEventQuery(kernel_event[cur]) == CUDA_ERROR_NOT_READY) {
         cuEventDestroy(kernel_event[cur]);
-    kernel_event[cur] = NOP_REPLY;
+        cuEventCreate(&kernel_event[cur], CU_EVENT_DISABLE_TIMING);
+    }
     kernel[cur] = 0;
     release_buf_event(cur, cudabuffer);
 
@@ -160,7 +154,7 @@
     
     for (int i = 0; i<STAGE; i++) {
         initCudaBuffer(&cudabuffer[i]);
-        kernel_event[i]=0;
+        cuEventCreate(&kernel_event[i], CU_EVENT_DISABLE_TIMING);
     }
 
     memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE);
@@ -198,10 +192,8 @@
                     connector->start_profile(); continue;
                 }
                 if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-                CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module;
-                const char *funcname = cuda_task_list[nextTask->command].name; // move to load_kernel.
                 
-                ret = cuModuleGetFunction(&kernel[cur], module, funcname);
+                ret = cuModuleGetFunction(&kernel[cur], *cuda_task_list[nextTask->command].cudatask->module, cuda_task_list[nextTask->command].name);
                 if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 
                 int param = 0;
@@ -240,40 +232,34 @@
                 }
                 cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip uses the memout event
                 
-                void** kernelParams; // move to cudabuffer.
-                
                 if (!flag[cur].flip) {
-                    kernelParams = (void**)malloc(sizeof(void*)*param);
                     for (int i = 0; i<cudabuffer[cur].in_size; i++) {
-                        kernelParams[i] = &cudabuffer[cur].memin[i];
+                        cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
                     }
                     for (int i = 0; i<cudabuffer[cur].out_size; i++) {
-                        kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
+                        cudabuffer[cur].kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
                     }
                 } else {
-                    kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size);
                     for (int i = 0; i<cudabuffer[cur].in_size; i++) {
-                        kernelParams[i] = &cudabuffer[cur].memin[i];
+                        cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
                     }
                 }
 
-                ret = cuEventCreate(&kernel_event[cur], 0);
                 if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
                     
                 if (tasklist->dim > 0) {
                     ret = cuLaunchKernel(kernel[cur],
-                                         tasklist->x*tasklist->y*tasklist->z, 1, 1,
+                                         tasklist->x, tasklist->y, tasklist->z,
                                          1, 1, 1,
-                                         0, cudabuffer[cur].stream, kernelParams, NULL);
+                                         0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL);
                 } else {
                     ret = cuLaunchKernel(kernel[cur],
                                          1, 1, 1,
                                          1, 1, 1,
-                                         0, cudabuffer[cur].stream, kernelParams, NULL);
+                                         0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL);
                 }
                 if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
 
-                ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream);
                 if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
                 
                 for(int i=0;i<nextTask->outData_count;i++) { // read output data
@@ -283,35 +269,22 @@
                     int i0 = flag[cur].flip ? i+1 : i ;
                     // flip uses the memin buffer and the memout event
                     ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
-                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
 
-                    ret = cuEventCreate(&cudabuffer[cur].event[i], 0);
-                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-                    ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream);
+                    ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream);
                     if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 }
-                // wait kernel[cur] and write[cur]
-                // pipeline    : cur
-                // to stop pipeline set cur+1
-                if (cur == 0) {
-                    wait_for_event(kernel_event, cudabuffer, tasklist, STAGE);
-                } else {
-                    wait_for_event(kernel_event, cudabuffer, tasklist, cur);
-                }
                 cur++;
                 if (STAGE <= cur)
                     cur = 0;
-                free(kernelParams);
+                // wait kernel[cur] and write[cur]
+                // pipeline    : cur
+                // to stop pipeline set cur-1
+                wait_for_event(kernel_event, cudabuffer, tasklist, cur);
             }
             reply = (memaddr)tasklist->waiter;
             param_addr = (memaddr)tasklist->next;
         }
-        if (cur == 0) {
-            wait_for_event(kernel_event, cudabuffer, tasklist, STAGE);
-        } else {
-            wait_for_event(kernel_event, cudabuffer, tasklist, cur);
-        }
+        wait_for_event(kernel_event, cudabuffer, tasklist, cur-1);
         for (int i = 0; i<STAGE; i++) {
             ret = cuStreamSynchronize(cudabuffer[i].stream);
             if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
@@ -348,7 +321,7 @@
 
     CUmodule* module = new CUmodule;
     ret = cuModuleLoad(module, cuda_task_list[cmd].cudatask->filename);
-
+    
     if(ret!=0) {
         error(convert_error_status(ret));
     }
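
The heart of the CudaScheduler.cc change is the kernel_event lifecycle: instead of lazily creating one event per launch (plus one per output copy) and using 0/NOP_REPLY as a "no event" marker, each pipeline stage now owns a persistent CUevent that is created once with CU_EVENT_DISABLE_TIMING, recorded after the asynchronous device-to-host copy, and re-created right after it is synchronized and destroyed, so cuEventQuery can always be asked whether the stage is busy. The stand-alone sketch below shows that query/synchronize/re-create pattern against the CUDA driver API; STAGE, the check() helper and the surrounding main() are illustrative and not part of Cerium.

// event_pipeline.cc -- sketch of the per-stage event pattern (illustrative, not Cerium code)
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

#define STAGE 8

static void check(CUresult r, const char* where) {
    if (r != CUDA_SUCCESS) { fprintf(stderr, "%s failed: %d\n", where, (int)r); exit(1); }
}

int main() {
    check(cuInit(0), "cuInit");
    CUdevice dev;   check(cuDeviceGet(&dev, 0), "cuDeviceGet");
    CUcontext ctx;  check(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");

    CUstream stream[STAGE];
    CUevent  kernel_event[STAGE];
    for (int i = 0; i < STAGE; i++) {
        check(cuStreamCreate(&stream[i], 0), "cuStreamCreate");
        // created up front, as CudaScheduler::run now does
        check(cuEventCreate(&kernel_event[i], CU_EVENT_DISABLE_TIMING), "cuEventCreate");
    }

    // ... launch work on stream[cur] and cuEventRecord(kernel_event[cur], stream[cur])
    //     after the asynchronous D->H copy, as the scheduler does ...

    int cur = 0;                                   // stage about to be reused
    CUresult q = cuEventQuery(kernel_event[cur]);
    if (q == CUDA_SUCCESS) {
        // never recorded or already finished: nothing to wait for
    } else if (q == CUDA_ERROR_NOT_READY) {
        check(cuEventSynchronize(kernel_event[cur]), "cuEventSynchronize");
        // destroy and immediately re-create so the slot always holds a valid event
        check(cuEventDestroy(kernel_event[cur]), "cuEventDestroy");
        check(cuEventCreate(&kernel_event[cur], CU_EVENT_DISABLE_TIMING), "cuEventCreate");
        // the scheduler frees the stage's device buffers here (release_buf_event)
    } else {
        check(q, "cuEventQuery");
    }

    for (int i = 0; i < STAGE; i++) {
        cuStreamSynchronize(stream[i]);
        cuEventDestroy(kernel_event[i]);
        cuStreamDestroy(stream[i]);
    }
    cuCtxDestroy(ctx);
    return 0;
}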
--- a/TaskManager/Cuda/CudaScheduler.h	Tue Feb 11 19:58:38 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Wed Feb 12 17:56:40 2014 +0900
@@ -20,8 +20,8 @@
         int out_size;
         CUdeviceptr* memin;
         CUdeviceptr* memout;
-        CUevent* event;
         CUstream stream;
+        void** kernelParams;
     } CudaBuffer, *CudaBufferPtr;
     CudaScheduler();
     virtual ~CudaScheduler();
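
In the header, CudaBuffer drops the per-output CUevent array and gains the kernelParams array that run() used to malloc and free for every task. A simplified stand-in is sketched below to show the layout: kernelParams is allocated for allcate_size*2 pointers, so the in_size input slots are followed by the out_size output slots, and the array is handed directly to cuLaunchKernel. initCudaBuffer and launch here are illustrative helpers, not the scheduler's own methods (which additionally grow the arrays with realloc in createBuffer when a task carries more parameters).

// cudabuffer_params.cc -- sketch of the preallocated kernelParams layout (illustrative)
#include <cuda.h>
#include <cstdlib>

typedef struct CudaBuffer {
    int allcate_size;        // spelling kept from CudaScheduler.h
    int in_size;
    int out_size;
    CUdeviceptr* memin;
    CUdeviceptr* memout;
    CUstream stream;
    void** kernelParams;     // replaces the old per-output CUevent* event
} CudaBuffer, *CudaBufferPtr;

void initCudaBuffer(CudaBufferPtr m, int n) {
    m->allcate_size = n;
    m->in_size = m->out_size = 0;
    m->memin  = (CUdeviceptr*)malloc(n * sizeof(CUdeviceptr));
    m->memout = (CUdeviceptr*)malloc(n * sizeof(CUdeviceptr));
    // one slot per input plus one per output, reused across tasks
    m->kernelParams = (void**)malloc(n * 2 * sizeof(void*));
    cuStreamCreate(&m->stream, 0);
}

// Point kernelParams at the device pointers (inputs first, then outputs) and launch.
CUresult launch(CUfunction f, CudaBufferPtr b, int x, int y, int z) {
    for (int i = 0; i < b->in_size; i++)
        b->kernelParams[i] = &b->memin[i];
    for (int i = 0; i < b->out_size; i++)
        b->kernelParams[b->in_size + i] = &b->memout[i];
    return cuLaunchKernel(f,
                          x, y, z,     // grid is now tasklist->x/y/z, not collapsed into x*y*z
                          1, 1, 1,     // one thread per block, as in the scheduler
                          0, b->stream, b->kernelParams, NULL);
}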
--- a/TaskManager/kernel/ppe/CpuThreads.cc	Tue Feb 11 19:58:38 2014 +0900
+++ b/TaskManager/kernel/ppe/CpuThreads.cc	Wed Feb 12 17:56:40 2014 +0900
@@ -53,10 +53,7 @@
 
     delete [] threads;
     delete [] args;
-#ifdef __CERIUM_GPU__
-    delete gpu;
-#endif
-#ifdef __CERIUM_CUDA__
+#if defined  __CERIUM_GPU__ || defined __CERIUM_CUDA__
     delete gpu;
 #endif
 }
@@ -126,10 +123,7 @@
     for (int i = 0; i < cpu_num+io_num; i++) {
         args[i].scheduler->connector->set_mail_waiter(w);
     }
-#ifdef __CERIUM_GPU__
-    gpu->set_mail_waiter(w);
-#endif
-#ifdef __CERIUM_CUDA__
+#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
     gpu->set_mail_waiter(w);
 #endif
 }
@@ -158,10 +152,7 @@
 int
 CpuThreads::get_mail(int cpuid, int count, memaddr *ret)
 {
-#ifdef __CERIUM_GPU__
-    if (is_gpu(cpuid)) return gpu->get_mail(cpuid, count, ret);
-#endif
-#ifdef __CERIUM_CUDA__
+#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
     if (is_gpu(cpuid)) return gpu->get_mail(cpuid, count, ret);
 #endif
     *ret = args[cpuid-id_offset].scheduler->mail_read_from_host();
@@ -171,10 +162,7 @@
 int
 CpuThreads::has_mail(int cpuid, int count, memaddr *ret)
 {
-#ifdef __CERIUM_GPU__
-    if (is_gpu(cpuid)) return gpu->has_mail(cpuid, count, ret);
-#endif
-#ifdef __CERIUM_CUDA__
+#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
     if (is_gpu(cpuid)) return gpu->has_mail(cpuid, count, ret);
 #endif
     if (args[cpuid-id_offset].scheduler->has_mail_from_host() != 0) {
@@ -201,13 +189,7 @@
 void
 CpuThreads::send_mail(int cpuid, int num, memaddr *data)
 {
-#ifdef __CERIUM_GPU__
-    if (is_gpu(cpuid)){
-        gpu->send_mail(cpuid, num, data);
-        return;
-    }
-#endif
-#ifdef __CERIUM_CUDA__
+#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
     if (is_gpu(cpuid)){
         gpu->send_mail(cpuid, num, data);
         return;
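
The CpuThreads edits are all the same mechanical merge: two identical blocks, one under __CERIUM_GPU__ and one under __CERIUM_CUDA__, become a single #if defined ... || defined ... guard around one copy of the code. A toy, self-contained illustration of that shape (Accel, is_gpu() and the id scheme are invented for the example, not Cerium classes):

// guard_merge.cc -- toy illustration of the merged accelerator guard (not Cerium code)
#include <cstdio>

struct Accel { int get_mail(int id) { return 100 + id; } };

#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
static Accel* gpu = new Accel();   // one object serves either accelerator build
#endif

bool is_gpu(int cpuid) { return cpuid >= 16; }

int get_mail(int cpuid) {
#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__
    if (is_gpu(cpuid)) return gpu->get_mail(cpuid);   // single guard, no duplicated block
#endif
    return cpuid;                                      // plain CPU path
}

int main() {
    printf("%d %d\n", get_mail(3), get_mail(20));
    return 0;
}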
--- a/TaskManager/kernel/ppe/CpuThreads.h	Tue Feb 11 19:58:38 2014 +0900
+++ b/TaskManager/kernel/ppe/CpuThreads.h	Wed Feb 12 17:56:40 2014 +0900
@@ -4,9 +4,7 @@
 #include <pthread.h>
 #include "Threads.h"
 #include "GpuThreads.h"
-#ifdef __CERIUM_CUDA__
 #include "CudaThreads.h"
-#endif
 #include "TaskManagerImpl.h"
 #include "MainScheduler.h"
 #include "Sem.h"
--- a/example/word_count/main.cc	Tue Feb 11 19:58:38 2014 +0900
+++ b/example/word_count/main.cc	Wed Feb 12 17:56:40 2014 +0900
@@ -26,7 +26,7 @@
 int use_compat = 0;
 int use_iterate = 0;
 int use_iterate_all = 0;
-int array_task_num = 16;
+int array_task_num = 11;
 int spe_num = 1;
 CPU_TYPE spe_cpu = SPE_ANY;
 const char *usr_help_str = "Usage: ./word_count [-a -c -s] [-cpu spe_num] [-g] [-file filename]\n";