changeset 305:ec0a5b4fba05

CUDAWorker
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 14 Feb 2017 12:15:58 +0900
parents 9755206813cb
children ae4f6aa427f5
files src/parallel_execution/CMakeLists.txt src/parallel_execution/CPUWorker.cbc src/parallel_execution/CUDAWorker.cbc src/parallel_execution/context.h src/parallel_execution/generate_stub.pl src/parallel_execution/helper_cuda.h
diffstat 6 files changed, 81 insertions(+), 270 deletions(-) [+]
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt	Tue Feb 14 11:36:41 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Tue Feb 14 12:15:58 2017 +0900
@@ -13,6 +13,7 @@
     set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") 
     find_package(CUDA REQUIRED)
     add_definitions("-Wall -g -DUSE_CUDAWorker=1")
+    SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" )
 else()                
     add_definitions("-Wall -g")
 endif()
--- a/src/parallel_execution/CPUWorker.cbc	Tue Feb 14 11:36:41 2017 +0900
+++ b/src/parallel_execution/CPUWorker.cbc	Tue Feb 14 12:15:58 2017 +0900
@@ -89,17 +89,5 @@
                  
 }
 
-#ifdef USE_CUDA
-__code twiceGpu() {
-    cuMemcpyHtoDAsync(context,context,context,context->stream);
-    cuLaunchkanel();
-    cuMemcpyDtoHAsync();
-}
-
-__code twiceGpu_stub() {
-}
-
-#endif
-
 __code shutdownWorker(struct CPUWorker* worker) {
 }
--- a/src/parallel_execution/CUDAWorker.cbc	Tue Feb 14 11:36:41 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Tue Feb 14 12:15:58 2017 +0900
@@ -3,8 +3,11 @@
 #include <string.h>
 #include <stdlib.h>
 #include <libkern/OSAtomic.h>
+
+// includes, project
+#include <driver_types.h>
+#include <cuda_runtime.h>
 #include <cuda.h>
-#include <cuda_runtime.h>
 #include "helper_cuda.h"
 
 #include "../context.h"
@@ -13,10 +16,10 @@
 
 Worker* createCUDAWorker(struct Context* context, int id, Queue* queue) {
     struct Worker* worker = ALLOC(context, Worker);
-    struct CUDAWorker* CUDAWorker = ALLOC(context, CUDAWorker);
-    worker->worker = (union Data*)CUDAWorker;
+    struct CUDAWorker* cudaWorker = new CUDAWorker();
+    worker->worker = (union Data*)cudaWorker;
     worker->tasks = queue;
-    cpuWorker->id = id;
+    cudaWorker->id = id;
     worker->taskReceive = C_taskReceiveCUDAWorker;
     worker->shutdown = C_shutdownCUDAWorker;
     pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
@@ -24,75 +27,98 @@
 }
 
 static void start_CUDAworker(Worker* worker) {
-    CUDAWorker* CUDAWorker = (CUDAWorker*)worker->worker;
-    CUDAWorker->context = NEW(struct Context);
-    initContext(CUDAWorker->context);
-    Gearef(CUDAWorker->context, Worker)->worker = (union Data*)worker;
-    int num_stream = 1; // number of stream
-    int num_exec = 16; // number of executed kernel
+    CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
+    cudaWorker->context = NEW(struct Context);
+    initContext(cudaWorker->context);
+    Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
+    cudaWorker->num_stream = 1; // number of stream
 
     // initialize and load kernel
-    CUdevice device;
-    CUcontext context;
-    CUmodule module;
-    CUfunction function;
-    CUstream stream[num_stream];
-
+    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
     checkCudaErrors(cuInit(0));
-    checkCudaErrors(cuDeviceGet(&device, 0));
-    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
-    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
-    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
-    if (num_stream) {
-        for (int i=0;i<num_stream;i++)
-            checkCudaErrors(cuStreamCreate(&stream[i],0));
+    checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+    checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
+    if (cudaWorker->num_stream) {
+        for (int i=0;i<cudaWorker->num_stream;i++)
+            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
     }
 
-    goto meta(CUDAWorker->context, C_taskReceiveCUDAWorker);
+    goto meta(cudaWorker->context, C_taskReceiveCUDAWorker);
 }
 
-__code taskReceiveCUDAWorker(struct Context* context, Worker* worker, Queue* queue) {
+__code taskReceiveCUDAWorker(struct Worker* worker,struct Queue* queue) {
     queue->queue = (union Data*)worker->tasks;
-    queue->next = C_getTask;
+    queue->next = C_getTaskCUDA;
     goto meta(context, worker->tasks->take);
 }
 
 __code taskReceiveCUDAWorker_stub(struct Context* context) {
-    CUDAWorker* CUDAWorker = (CUDAWorker *)GearImpl(context, CUDAWorker, CUDAworker);
-    pthread_cond_wait(&CUDAWorker->cond, &CUDAWorker->mutex);
     goto taskReceiveCUDAWorker(context, &Gearef(context, Worker)->worker->Worker, Gearef(context, Queue));
 }
 
-__code getCUDATask(struct Context* context, Worker* worker, struct Context* task) {
+__code getTaskCUDA(struct Worker* worker, struct Context* task) {
     if (!task)
         return; // end thread
     task->worker = worker;
-    context->next = C_taskReceiveCUDAWorker; // set CG after task exec
-    goto meta(task, task->next);
+    enum Code taskCg = task->next;
+    task->next = C_odgCommitCUDA; // set CG after task exec
+    goto meta(task, taskCg);
 }
 
-__code getCUDATask_stub(struct Context* context) {
+__code getTaskCUDA_stub(struct Context* context) {
     Worker* worker = &Gearef(context,Worker)->worker->Worker;
     struct Context* task = &Gearef(context, Queue)->data->Context;
-    goto getCUDATask(context, worker, task);
+    goto getTaskCUDA(context, worker, task);
+}
+
+__code odgCommitCUDA(struct LoopCounter* loopCounter, struct Queue* queue, struct Context* task) {
+    int i = loopCounter->i ;
+    if(task->odg + i < task->maxOdg) {
+        queue->queue = (union Data*)GET_WAIT_LIST(task->data[task->odg+i]);
+        queue->next = C_odgCommitCUDA1;
+        goto meta(context, queue->queue->Queue.take);
+    }
+    loopCounter->i = 0;
+    goto meta(context, C_taskReceiveCUDAWorker);
 }
 
-#ifdef USE_CUDA
-__code twiceCUDA(struct Context* context) {
-    cuMemcpyHtoDAsync(context,context,context,context->stream);
-    cuLaunchkanel();
-    cuMemcpyDtoHAsync();
+__code odgCommitCUDA_stub(struct Context* context) {
+    struct Context* workerContext = context->worker->worker->CUDAWorker.context;
+    goto odgCommitCUDA(workerContext,
+                   Gearef(workerContext, LoopCounter),
+                   Gearef(workerContext, Queue),
+                   context);
 }
-#endif
+
+__code odgCommitCUDA1(struct TaskManager* taskManager, struct Context* task) {
+    if(__sync_fetch_and_sub(&task->idgCount, 1)) {
+        if(task->idgCount == 0) {
+            taskManager->taskManager = (union Data*)task->taskManager;
+            taskManager->context = task;
+            taskManager->next = C_odgCommitCUDA;
+            goto meta(context, task->taskManager->spawn);
+        }
+    } else {
+        goto meta(context, C_odgCommitCUDA1);
+    }
+}
 
-__code shutdownCUDAWorker(struct Context* context, CPUWorker* worker) {
-    for (int i=0;i<num_stream;i++)
-        checkCudaErrors(cuStreamDestroy(stream[i]));
-    checkCudaErrors(cuModuleUnload(module));
-    checkCudaErrors(cuCtxDestroy(context));
+__code odgCommitCUDA1_stub(struct Context* context) {
+    struct Context* task = &Gearef(context, Queue)->data->Context;
+    goto odgCommitCUDA1(context,
+                    Gearef(context, TaskManager),
+                    task);
+                 
+}
+
+
+__code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) {
+    for (int i=0;i<worker->num_stream;i++)
+        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
 }
 
 __code shutdownCUDAWorker_stub(struct Context* context) {
-    CPUWorker* worker = (CPUWorker *)GearImpl(context, Worker, worker);
+    CUDAWorker* worker = (CUDAWorker *)GearImpl(context, Worker, worker);
     goto shutdownCUDAWorker(context,worker);
 }
--- a/src/parallel_execution/context.h	Tue Feb 14 11:36:41 2017 +0900
+++ b/src/parallel_execution/context.h	Tue Feb 14 12:15:58 2017 +0900
@@ -145,13 +145,12 @@
         enum Code next;
         CUdevice device;
         CUcontext cuCtx;
-        CUfunction code;
-        CUdeviceptr* deviceptr;
-        CUstream stream;
-    } CudaWorker;
+        int num_stream;
+        CUstream *stream;
+    } CUDAWorker;
 #else
     struct CUDAWorker {
-    } CudaWorker;
+    } CUDAWorker;
 #endif
     struct Main {
         enum Code code;
--- a/src/parallel_execution/generate_stub.pl	Tue Feb 14 11:36:41 2017 +0900
+++ b/src/parallel_execution/generate_stub.pl	Tue Feb 14 12:15:58 2017 +0900
@@ -195,6 +195,9 @@
                 $outputVar{$codeGearName} = "";
                 $outputArgs{$codeGearName} = {};
                 my $newArgs = "struct Context *context,";
+                if ($args=~/^struct Context\s*\*\s*context/) {
+                    $newArgs = "";
+                }
                 while($args) {
                     if ($args =~ s/(^\s*,\s*)//) {
                         $newArgs .= $1;
--- a/src/parallel_execution/helper_cuda.h	Tue Feb 14 11:36:41 2017 +0900
+++ b/src/parallel_execution/helper_cuda.h	Tue Feb 14 12:15:58 2017 +0900
@@ -32,7 +32,7 @@
 // on which CUDA functions are used.
 
 // CUDA Runtime error messages
-#ifdef __DRIVER_TYPES_H__
+#ifndef __DRIVER_TYPES_H__
 static const char *_cudaGetErrorEnum(cudaError_t error)
 {
     switch (error)
@@ -979,7 +979,7 @@
 #endif
 #endif
 
-#ifndef __DRIVER_TYPES_H__
+#ifdef __DRIVER_TYPES_H__
 static inline void check(CUresult result, char const *const func, const char *const file, int const line)
 {
     if (result)
@@ -1081,212 +1081,6 @@
 }
 // end of GPU Architecture definitions
 
-#ifdef __CUDA_RUNTIME_H__
-// General GPU Device CUDA Initialization
-inline int gpuDeviceInit(int devID)
-{
-    int device_count;
-    checkCudaErrors(cudaGetDeviceCount(&device_count));
-
-    if (device_count == 0)
-    {
-        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (devID < 0)
-    {
-        devID = 0;
-    }
-
-    if (devID > device_count-1)
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
-        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
-        fprintf(stderr, "\n");
-        return -devID;
-    }
-
-    struct cudaDeviceProp deviceProp;
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
-
-    if (deviceProp.computeMode == cudaComputeModeProhibited)
-    {
-        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
-        return -1;
-    }
-
-    if (deviceProp.major < 1)
-    {
-        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    checkCudaErrors(cudaSetDevice(devID));
-    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
-
-    return devID;
-}
-
-// This function returns the best GPU (with maximum GFLOPS)
-inline int gpuGetMaxGflopsDeviceId()
-{
-    int current_device     = 0, sm_per_multiproc  = 0;
-    int max_perf_device    = 0;
-    int device_count       = 0, best_SM_arch      = 0;
-    int devices_prohibited = 0;
-    
-    unsigned long long max_compute_perf = 0;
-    struct cudaDeviceProp deviceProp;
-    cudaGetDeviceCount(&device_count);
-    
-    checkCudaErrors(cudaGetDeviceCount(&device_count));
-
-    if (device_count == 0)
-    {
-        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    // Find the best major SM Architecture GPU device
-    while (current_device < device_count)
-    {
-        cudaGetDeviceProperties(&deviceProp, current_device);
-
-        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
-        if (deviceProp.computeMode != cudaComputeModeProhibited)
-        {
-            if (deviceProp.major > 0 && deviceProp.major < 9999)
-            {
-                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
-            }
-        }
-        else
-        {
-            devices_prohibited++;
-        }
-
-        current_device++;
-    }
-
-    if (devices_prohibited == device_count)
-    {
-    	fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
-    	exit(EXIT_FAILURE);
-    }
-
-    // Find the best CUDA capable GPU device
-    current_device = 0;
-
-    while (current_device < device_count)
-    {
-        cudaGetDeviceProperties(&deviceProp, current_device);
-
-        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
-        if (deviceProp.computeMode != cudaComputeModeProhibited)
-        {
-            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
-            {
-                sm_per_multiproc = 1;
-            }
-            else
-            {
-                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
-            }
-
-            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
-
-            if (compute_perf  > max_compute_perf)
-            {
-                // If we find GPU with SM major > 2, search only these
-                if (best_SM_arch > 2)
-                {
-                    // If our device==dest_SM_arch, choose this, or else pass
-                    if (deviceProp.major == best_SM_arch)
-                    {
-                        max_compute_perf  = compute_perf;
-                        max_perf_device   = current_device;
-                    }
-                }
-                else
-                {
-                    max_compute_perf  = compute_perf;
-                    max_perf_device   = current_device;
-                }
-            }
-        }
-
-        ++current_device;
-    }
-
-    return max_perf_device;
-}
-
-
-// Initialization code to find the best CUDA Device
-inline int findCudaDevice(int argc, const char **argv)
-{
-    struct cudaDeviceProp deviceProp;
-    int devID = 0;
-
-    // If the command-line has a device number specified, use it
-    if (checkCmdLineFlag(argc, argv, "device"))
-    {
-        devID = getCmdLineArgumentInt(argc, argv, "device=");
-
-        if (devID < 0)
-        {
-            printf("Invalid command line parameter\n ");
-            exit(EXIT_FAILURE);
-        }
-        else
-        {
-            devID = gpuDeviceInit(devID);
-
-            if (devID < 0)
-            {
-                printf("exiting...\n");
-                exit(EXIT_FAILURE);
-            }
-        }
-    }
-    else
-    {
-        // Otherwise pick the device with highest Gflops/s
-        devID = gpuGetMaxGflopsDeviceId();
-        checkCudaErrors(cudaSetDevice(devID));
-        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
-        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
-    }
-
-    return devID;
-}
-
-// General check for CUDA GPU SM Capabilities
-inline bool checkCudaCapabilities(int major_version, int minor_version)
-{
-    struct cudaDeviceProp deviceProp;
-    deviceProp.major = 0;
-    deviceProp.minor = 0;
-    int dev;
-
-    checkCudaErrors(cudaGetDevice(&dev));
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
-
-    if ((deviceProp.major > major_version) ||
-        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
-    {
-        printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
-        return true;
-    }
-    else
-    {
-        printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
-        return false;
-    }
-}
-#endif
 
 // end of CUDA Helper Functions