changeset 319:a15511b1a6e0

separate cuda.c, and USE_CUDA_MAIN_THREAD flag
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Wed, 15 Feb 2017 20:43:55 +0900
parents 054c47e6ca20
children f730761bb044
files src/parallel_execution/CMakeLists.txt src/parallel_execution/CUDAWorker.cbc src/parallel_execution/CUDAtwice.cbc src/parallel_execution/TaskManagerImpl.cbc src/parallel_execution/cuda.c src/parallel_execution/main.cbc
diffstat 6 files changed, 140 insertions(+), 84 deletions(-) [+]
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt	Wed Feb 15 16:45:10 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Wed Feb 15 20:43:55 2017 +0900
@@ -67,9 +67,9 @@
       TARGET
           CUDAtwice
       SOURCES 
-          main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc  CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu
+          main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc  CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu cuda.c
     )
-    set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1")
+    set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1 -DUSE_CUDA_MAIN_THREAD")
 endif()
 
 GearsCommand(
--- a/src/parallel_execution/CUDAWorker.cbc	Wed Feb 15 16:45:10 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Wed Feb 15 20:43:55 2017 +0900
@@ -4,16 +4,11 @@
 #include <stdlib.h>
 #include <libkern/OSAtomic.h>
 
-// includes, project
-#include <driver_types.h>
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include "helper_cuda.h"
-
 #include "../context.h"
 
+extern void cudaInit(struct CUDAWorker *cudaWorker,int phase) ;
+
 static void start_CUDAworker(Worker* worker);
-static void cudaInit(struct CUDAWorker *cudaWorker,int phase) ;
 
 volatile int cuda_initialized = 0;
 
@@ -27,33 +22,18 @@
     worker->tasks = queue;
     cudaWorker->id = id;
     worker->shutdown = C_shutdownCUDAWorker;
-    // pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
+#ifndef USE_CUDA_MAIN_THREAD
+    pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
+#else
     if (im) {
         im->workers[0] = worker;
     }
     cuda_initialized = 1;
     start_CUDAworker(worker);
+#endif
     return worker;
 }
 
-static void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
-    // initialize and load kernel
-    cudaWorker->num_stream = 1; // number of stream
-//    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
-   if (phase==0)
-    checkCudaErrors(cuInit(0));
-   if (phase==0)
-    checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
-   if (phase==0)
-    checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
-//    if (cudaWorker->num_stream) {
-//        for (int i=0;i<cudaWorker->num_stream;i++)
-//            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
-//    }
-    CUdeviceptr devA;
-    checkCudaErrors(cuMemAlloc(&devA, 16));
-
-}
 
 static void start_CUDAworker(Worker* worker) {
     CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
@@ -77,10 +57,6 @@
 __code getTaskCUDA(struct Worker* worker, struct Context* task) {
     if (!task)
         return; // end thread
-//    if (cuda_initialized==0 || 1) {
-//        CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-//        cudaInit(cudaWorker,1);
-//    }
     worker->taskReceive = C_taskReceiveCUDAWorker;
     task->worker = worker;
     enum Code taskCg = task->next;
@@ -134,11 +110,10 @@
                  
 }
 
+extern void cudaShutdown( CUDAWorker *cudaWorker) ;
 
 __code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) {
-//    for (int i=0;i<worker->num_stream;i++)
-//        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
-    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
+    cudaShutdown( worker) ;
 }
 
 __code shutdownCUDAWorker_stub(struct Context* context) {
--- a/src/parallel_execution/CUDAtwice.cbc	Wed Feb 15 16:45:10 2017 +0900
+++ b/src/parallel_execution/CUDAtwice.cbc	Wed Feb 15 20:43:55 2017 +0900
@@ -1,49 +1,8 @@
 #include <stdio.h>
 #include "../context.h"
 
-#include <cuda.h>
 
-#include <cuda_runtime.h>
-#include "helper_cuda.h"
-
-static void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter) {
-    // Worker *worker = context->worker;
-    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-  // memory allocate
-    CUdeviceptr devA;
-    CUdeviceptr devLoopCounter;
-printf("CUdA Exe 1\n");
-
-    checkCudaErrors(cuMemAlloc(&devA, array->size));
-    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
-
-    //twiceカーネルが定義されてなければそれをロードする
-printf("CUdA Exe 2\n");
-    checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx"));
-printf("CUdA Exe 3\n");
-    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
-printf("CUdA Exe 4\n");
-
-    //入力のDataGearをGPUにbuffer経由で送る
-    // Synchronous data transfer(host to device)
-    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
-    checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
-
-  // Asynchronous launch kernel
-     context->num_exec = 1;
-     void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA};
-     checkCudaErrors(cuLaunchKernel(context->function,
-                       1, 1, 1,
-                       1, 1, 1,
-                                 0, NULL , args, NULL));
-
-    //結果を取ってくるコマンドを入力する
-    //コマンドの終了待ちを行う   
-    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
-
-    // wait for stream
-    checkCudaErrors(cuCtxSynchronize());
-}
+extern void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter) ;
 
 __code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
     int i = loopCounter->i;
--- a/src/parallel_execution/TaskManagerImpl.cbc	Wed Feb 15 16:45:10 2017 +0900
+++ b/src/parallel_execution/TaskManagerImpl.cbc	Wed Feb 15 20:43:55 2017 +0900
@@ -35,8 +35,11 @@
     for (;i<taskManager->cpu;i++) {
 #ifdef USE_CUDAWorker
         Queue* queue = createSynchronizedQueue(context);
-        // taskManagerImpl->workers[i] = (Worker*)createCUDAWorker(context, i, queue,0);
+#ifndef USE_CUDA_MAIN_THREAD
+        taskManagerImpl->workers[i] = (Worker*)createCUDAWorker(context, i, queue,0);
+#else
         taskManagerImpl->workers[i] = (Worker*)queue;
+#endif
 #else
         Queue* queue = createSynchronizedQueue(context);
         taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue);
@@ -49,13 +52,9 @@
 }
 
 __code createTask(struct TaskManager* taskManager) {
-    TaskManager *t = (TaskManager *)taskManager->taskManager;
-    TaskManagerImpl *im = (TaskManagerImpl *)t->taskManager;
-
     taskManager->context = NEW(struct Context);
     initContext(taskManager->context);
     taskManager->context->taskManager = taskManager;
-    struct Queue* tasks = im->workers[0]->tasks;
     goto meta(context, C_setWorker);
 }
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/cuda.c	Wed Feb 15 20:43:55 2017 +0900
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <stdlib.h>
+#include <libkern/OSAtomic.h>
+
+// includes, project
+#include <driver_types.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "helper_cuda.h"
+#include "pthread.h"
+
+// #include "context.h"
+
+struct Context {
+    int next;
+    struct Worker* worker;
+    struct TaskManager* taskManager;
+    int codeNum;
+    void  (**code) (struct Context*);
+    void* heapStart;
+    void* heap;
+    long heapLimit;
+    int dataNum;
+    int idgCount; //number of waiting dataGear
+    int odg;
+    int maxOdg;
+    int workerId;
+    int num_exec;
+    CUmodule module;
+    CUfunction function;
+    union Data **data;
+};
+
+    struct CUDAWorker {
+        CUdevice device;
+        CUcontext cuCtx;
+        pthread_t thread;
+        struct Context* context;
+        int id;
+        struct Queue* tasks;
+        int runFlag;
+        int next;
+        int num_stream;
+        CUstream *stream;
+    } CUDAWorker;
+
+    struct LoopCounter {
+        int i;
+    } LoopCounter;
+
+    struct Array {
+        int size;
+        int index;
+        int prefix;
+        int* array;
+    } Array;
+
+
+
+void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
+    // initialize and load kernel
+    cudaWorker->num_stream = 1; // number of stream
+//    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+   if (phase==0)
+    checkCudaErrors(cuInit(0));
+   if (phase==0)
+    checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+   if (phase==0)
+    checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
+//    if (cudaWorker->num_stream) {
+//        for (int i=0;i<cudaWorker->num_stream;i++)
+//            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
+//    }
+}
+
+
+void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) {
+    // Worker *worker = context->worker;
+    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
+  // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devLoopCounter;
+
+    checkCudaErrors(cuMemAlloc(&devA, array->size));
+    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
+
+    //twiceカーネルが定義されてなければそれをロードする
+    checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
+
+    //入力のDataGearをGPUにbuffer経由で送る
+    // Synchronous data transfer(host to device)
+    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
+    checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
+
+  // Asynchronous launch kernel
+     context->num_exec = 1;
+     void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA};
+     checkCudaErrors(cuLaunchKernel(context->function,
+                       1, 1, 1,
+                       1, 1, 1,
+                                 0, NULL , args, NULL));
+
+    //結果を取ってくるコマンドを入力する
+    //コマンドの終了待ちを行う   
+    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
+
+    // wait for stream
+    checkCudaErrors(cuCtxSynchronize());
+}
+
+void cudaShutdown( struct CUDAWorker *worker) {
+//    for (int i=0;i<worker->num_stream;i++)
+//        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
+}
+
--- a/src/parallel_execution/main.cbc	Wed Feb 15 16:45:10 2017 +0900
+++ b/src/parallel_execution/main.cbc	Wed Feb 15 20:43:55 2017 +0900
@@ -34,23 +34,27 @@
 }
 
 #ifdef USE_CUDAWorker
+#ifdef USE_CUDA_MAIN_THREAD
 extern volatile int cuda_initialized;
 #endif
+#endif
 
 __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
     taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0);
 #ifdef USE_CUDAWorker
+#ifdef USE_CUDA_MAIN_THREAD
     while(! cuda_initialized) {};
 #endif
+#endif
     goto meta(context, C_createTask1);
 }
 
 __code initDataGears_stub(struct Context* context) {
     struct TaskManager* taskManager =  Gearef(context, TaskManager);
     taskManager->taskManager = 0;
-#ifndef USE_CUDAWorker
+#if (! defined(USE_CUDAWorker) || ! defined(USE_CUDA_MAIN_THREAD))
     struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
     goto initDataGears(context, loopCounter, taskManager);
 #else
@@ -114,7 +118,7 @@
 
     loopCounter->i = 0;
     taskManager->next = C_code1;
-#ifdef USE_CUDAWorker
+#if ( defined(USE_CUDAWorker) && defined(USE_CUDA_MAIN_THREAD))
 sleep(5);
 #endif
     goto meta(context, taskManager->taskManager->TaskManager.shutdown);