# HG changeset patch
# User Tatsuki IHA
# Date 1505324120 -32400
# Node ID 85b0ddbf458e89ac7e8b0955b8d675464f651c29
# Parent 4d1e3697a6b89a14edc670d4d2cc4ce3be9d04df
Fix CudaWorker

diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/CMakeLists.txt
--- a/src/parallel_execution/CMakeLists.txt	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Thu Sep 14 02:35:20 2017 +0900
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.8)
 
 set(USE_CUDA,0) # -DUSE_CUDA
 
@@ -7,6 +7,8 @@
 set(CMAKE_C_COMPILER $ENV{CBC_COMPILER})
 
 add_definitions("-Wall -g")
+# -DCMAKE_BUILD_TYPE=Debug
+set(CMAKE_C_FLAGS_DEBUG "-O0")
 
 if (${USE_CUDA})
     include_directories("/usr/local/cuda/include")
@@ -81,9 +83,9 @@
     TARGET CUDAtwice
     SOURCES
-      main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc CUDAWorker.cbc examples/twice/CUDAtwice.cbc examples/twice/CUDAtwice.cu cuda.c
+      examples/twice/main.cbc examples/twice/CUDAtwice.cbc examples/twice/CUDAtwice.cu examples/twice/createArray.cbc CPUWorker.cbc TimeImpl.cbc examples/twice/twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc CUDAWorker.cbc cuda.c MultiDimIterator.cbc
   )
-  set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1 -DUSE_CUDA_MAIN_THREAD")
+  set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1")
 endif()
 
 GearsCommand(
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/CPUWorker.cbc
--- a/src/parallel_execution/CPUWorker.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/CPUWorker.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -1,6 +1,6 @@
 #include "../context.h"
 
-static void start_worker(Worker* worker);
+static void startWorker(Worker* worker);
 
 Worker* createCPUWorker(struct Context* context, int id, Queue* queue) {
     struct Worker* worker = new Worker();
@@ -10,11 +10,11 @@
     cpuWorker->id = id;
     worker->taskReceive = C_taskReceiveWorker;
     worker->shutdown = C_shutdownWorker;
-    pthread_create(&worker->worker->CPUWorker.thread, NULL, (void*)&start_worker, worker);
+    pthread_create(&worker->worker->CPUWorker.thread, NULL, (void*)&startWorker, worker);
     return worker;
 }
 
-static void start_worker(Worker* worker) {
+static void startWorker(Worker* worker) {
     CPUWorker* cpuWorker = (CPUWorker*)worker->worker;
     cpuWorker->context = NEW(struct Context);
     initContext(cpuWorker->context);
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/CUDAWorker.cbc
--- a/src/parallel_execution/CUDAWorker.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -1,14 +1,8 @@
-#include
-#include
-#include
-#include
-#include
-
 #include "../context.h"
 
 extern void cudaInit(struct CUDAWorker *cudaWorker,int phase) ;
 
-static void start_CUDAworker(Worker* worker);
+static void startCUDAWorker(Worker* worker);
 
 #ifndef USE_CUDA_MAIN_THREAD
 volatile
@@ -19,32 +13,31 @@
     struct Worker* worker = ALLOC(context, Worker);
     struct CUDAWorker* cudaWorker = new CUDAWorker();
 
-    cudaInit(cudaWorker,0);
+    cudaInit(cudaWorker, 0);
 
     worker->worker = (union Data*)cudaWorker;
     worker->tasks = queue;
     cudaWorker->id = id;
+    worker->taskReceive = C_taskReceiveWorker;
     worker->shutdown = C_shutdownCUDAWorker;
 #ifndef USE_CUDA_MAIN_THREAD
-    pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
+    pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&startCUDAWorker, worker);
 #else
     if (im) {
         im->workers[0] = worker;
     }
     cuda_initialized = 1;
-    start_CUDAworker(worker);
+    startCUDAWorker(worker);
 #endif
     return worker;
 }
 
-
-static void start_CUDAworker(Worker* worker) {
+static void startCUDAWorker(Worker* worker) {
     CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
     cudaWorker->context = NEW(struct Context);
     initContext(cudaWorker->context);
     Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
-
-    goto meta(cudaWorker->context, C_taskReceiveCUDAWorker);
+    goto meta(cudaWorker->context, worker->taskReceive);
 }
 
 __code taskReceiveCUDAWorker(struct Worker* worker,struct Queue* queue) {
@@ -60,10 +53,13 @@
 
 __code getTaskCUDA(struct Worker* worker, struct Context* task) {
     if (!task) return; // end thread
-    worker->taskReceive = C_taskReceiveCUDAWorker;
     task->worker = worker;
     enum Code taskCg = task->next;
-    task->next = C_odgCommitCUDA; // set CG after task exec
+    if (task->iterate) {
+        task->next = C_iterateCommitCUDA;
+    } else {
+        task->next = C_odgCommitCUDA; // set CG after task exec
+    }
     goto meta(task, taskCg);
 }
 
@@ -73,50 +69,97 @@
     goto getTaskCUDA(context, worker, task);
 }
 
-__code odgCommitCUDA(struct LoopCounter* loopCounter, struct Queue* queue, struct Context* task) {
+__code iterateCommitCUDA(struct Iterator* iterator) {
+    iterator->iterator = (union Data*)context->iterator;
+    iterator->task = context;
+    iterator->next = C_odgCommitCUDA;
+    iterator->whenWait = C_iterateCommitCUDA1;
+    goto meta(context, context->iterator->barrier);
+}
+
+__code iterateCommitCUDA1(struct Context* task) {
+    goto meta(context, C_taskReceiveWorker);
+}
+
+__code iterateCommitCUDA1_stub(struct Context* context) {
+    struct Context* workerContext = context->worker->worker->CUDAWorker.context;
+    goto iterateCommitCUDA1(workerContext, context);
+}
+
+__code odgCommitCUDA(struct LoopCounter* loopCounter, struct Context* task, struct TaskManager* taskManager) {
     int i = loopCounter->i ;
-    if(task->odg + i < task->maxOdg) {
-        queue->queue = (union Data*)GET_WAIT_LIST(task->data[task->odg+i]);
-        queue->next = C_odgCommitCUDA1;
-        goto meta(context, queue->queue->Queue.take);
+    if (task->odg+i < task->maxOdg) {
+        goto meta(task, C_odgCommitCUDA1);
     }
     loopCounter->i = 0;
-    goto meta(context, C_taskReceiveCUDAWorker);
+    taskManager->taskManager = (union Data*)task->taskManager;
+    taskManager->next = C_taskReceiveWorker;
+    goto meta(context, task->taskManager->decrementTaskCount);
 }
 
 __code odgCommitCUDA_stub(struct Context* context) {
     struct Context* workerContext = context->worker->worker->CUDAWorker.context;
     goto odgCommitCUDA(workerContext,
-                       Gearef(workerContext, LoopCounter),
-                       Gearef(workerContext, Queue),
-                       context);
+                       Gearef(context, LoopCounter),
+                       context,
+                       Gearef(workerContext, TaskManager));
 }
 
-__code odgCommitCUDA1(struct TaskManager* taskManager, struct Context* task) {
-    if(__sync_fetch_and_sub(&task->idgCount, 1)) {
-        if(task->idgCount == 0) {
-            taskManager->taskManager = (union Data*)task->taskManager;
-            taskManager->context = task;
-            taskManager->next = C_odgCommitCUDA;
-            goto meta(context, task->taskManager->spawn);
-        }
-    } else {
-        goto meta(context, C_odgCommitCUDA1);
-    }
+__code odgCommitCUDA1(struct LoopCounter* loopCounter, struct Queue* queue) {
+    int i = loopCounter->i ;
+    queue->queue = (union Data*)GET_WAIT_LIST(context->data[context->odg+i]);
+    queue->whenEmpty = C_odgCommitCUDA4;
+    queue->next = C_odgCommitCUDA2;
+    goto meta(context, queue->queue->Queue.isEmpty);
 }
 
 __code odgCommitCUDA1_stub(struct Context* context) {
+    goto odgCommitCUDA1(context,
+                        Gearef(context, LoopCounter),
+                        Gearef(context, Queue));
+}
+
+__code odgCommitCUDA2(struct Queue* queue) {
+    queue->next = C_odgCommitCUDA3;
+    goto meta(context, queue->queue->Queue.take);
+}
+
+__code odgCommitCUDA2_stub(struct Context* context) {
+    goto odgCommitCUDA2(context,
+                        Gearef(context, Queue));
+}
+
+__code odgCommitCUDA3(struct TaskManager* taskManager, struct Context* task) {
+    if (__sync_fetch_and_sub(&task->idgCount, 1) == 1) { // atomic decrement of the idg counter (__sync_fetch_and_sub returns the original value of task->idgCount)
+        taskManager->taskManager = (union Data*)task->taskManager;
+        taskManager->context = task;
+        taskManager->next = C_odgCommitCUDA1;
+        goto meta(context, task->taskManager->spawn);
+    }
+    goto meta(context, C_odgCommitCUDA1);
+}
+
+__code odgCommitCUDA3_stub(struct Context* context) {
     struct Context* task = &Gearef(context, Queue)->data->Context;
-    goto odgCommitCUDA1(context,
-                        Gearef(context, TaskManager),
-                        task);
-
+    goto odgCommitCUDA3(context,
+                        Gearef(context, TaskManager),
+                        task);
+}
+
+__code odgCommitCUDA4(struct LoopCounter* loopCounter) {
+    loopCounter->i++;
+    goto meta(context, C_odgCommitCUDA);
+}
+
+__code odgCommitCUDA4_stub(struct Context* context) {
+    goto odgCommitCUDA4(context,
+                        Gearef(context, LoopCounter));
 }
 
 extern void cudaShutdown( CUDAWorker *cudaWorker) ;
 
 __code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) {
-    cudaShutdown( worker) ;
+    cudaShutdown(worker) ;
 }
 
 __code shutdownCUDAWorker_stub(struct Context* context) {
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/Iterator.cbc
--- a/src/parallel_execution/Iterator.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/Iterator.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -1,7 +1,8 @@
 typedef struct Iterator{
     union Data* iterator;
     struct Context* task;
-    __code exec(Impl* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...));
+    int numGPU;
+    __code exec(Impl* iterator, struct TaskManager* taskManager, struct Context* task, int numGPU, __code next(...));
     __code barrier(Impl* iterator, struct Context* task, __code next(...), __code whenWait(...));
     __code whenWait(...);
     __code next(...);
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/MultiDimIterator.cbc
--- a/src/parallel_execution/MultiDimIterator.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/MultiDimIterator.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -53,14 +53,16 @@
     return task1;
 }
 
-__code execMultiDimIterator(struct MultiDimIterator* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
-    int x = iterator->counterX;
-    int y = iterator->counterY;
-    int z = iterator->counterZ;
-    struct Context* iterateTask = createMultiDimIterateTask(task, x, y, z);
+__code execMultiDimIterator(struct MultiDimIterator* iterator, struct TaskManager* taskManager, struct Context* task, int numGPU, __code next(...)) {
+    // No GPU device
+    if (numGPU == 0) {
+        goto meta(context, C_execMultiDimIterator1);
+    }
+    task->iterate = 1;
+    task->gpu = 1;
     taskManager->taskManager = (union Data*)task->taskManager;
-    taskManager->context = iterateTask;
-    taskManager->next = C_execMultiDimIterator1;
+    taskManager->context = task;
+    taskManager->next = next;
     goto meta(context, task->taskManager->spawn);
 }
 
@@ -68,11 +70,31 @@
     MultiDimIterator* iterator = (MultiDimIterator*)GearImpl(context, Iterator, iterator);
     TaskManager* taskManager = Gearef(context, TaskManager);
     Context* task = Gearef(context, Iterator)->task;
+    int numGPU = Gearef(context, Iterator)->numGPU;
     enum Code next = Gearef(context, Iterator)->next;
-    goto execMultiDimIterator(context, iterator, taskManager, task, next);
+    goto execMultiDimIterator(context, iterator, taskManager, task, numGPU, next);
 }
 
-__code execMultiDimIterator1(struct MultiDimIterator* iterator, struct Context* task, __code next(...)) {
+__code execMultiDimIterator1(struct MultiDimIterator* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
+    int x = iterator->counterX;
+    int y = iterator->counterY;
+    int z = iterator->counterZ;
+    struct Context* iterateTask = createMultiDimIterateTask(task, x, y, z);
+    taskManager->taskManager = (union Data*)task->taskManager;
+    taskManager->context = iterateTask;
+    taskManager->next = C_execMultiDimIterator2;
+    goto meta(context, task->taskManager->spawn);
+}
+
+__code execMultiDimIterator1_stub(struct Context* context) {
+    MultiDimIterator* iterator = (MultiDimIterator*)GearImpl(context, Iterator, iterator);
+    TaskManager* taskManager = Gearef(context, TaskManager);
+    Context* task = Gearef(context, Iterator)->task;
+    enum Code next = Gearef(context, Iterator)->next;
+    goto execMultiDimIterator1(context, iterator, taskManager, task, next);
+}
+
+__code execMultiDimIterator2(struct MultiDimIterator* iterator, struct Context* task, __code next(...)) {
     if (++iterator->counterX >= iterator->x) {
         iterator->counterX = 0;
         if (++iterator->counterY >= iterator->y) {
@@ -83,7 +105,7 @@
             }
         }
     }
-    goto meta(context, C_execMultiDimIterator);
+    goto meta(context, C_execMultiDimIterator1);
 }
 
 __code barrierMultiDimIterator(struct MultiDimIterator* iterator, struct Context* task, __code next(...), __code whenWait(...)) {
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/TaskManagerImpl.cbc
--- a/src/parallel_execution/TaskManagerImpl.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/TaskManagerImpl.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -22,7 +22,8 @@
     struct TaskManagerImpl* taskManagerImpl = new TaskManagerImpl();
     taskManagerImpl->taskQueue = createSingleLinkedQueue(context);
     taskManagerImpl->numWorker = taskManager->maxCPU;
-    taskManagerImpl->sendWorkerIndex = 0;
+    taskManagerImpl->sendGPUWorkerIndex = taskManager->gpu;
+    taskManagerImpl->sendCPUWorkerIndex = taskManager->cpu;
     taskManagerImpl->taskCount = 0;
     taskManagerImpl->loopCounter = new LoopCounter();
     taskManagerImpl->loopCounter -> i = 0;
@@ -162,18 +163,19 @@
     goto next(...);
 }
 
-__code spawnTaskManagerImpl(struct TaskManagerImpl* taskManager, struct Iterator* iterator, struct Context* task, __code next(...)) {
+__code spawnTaskManagerImpl(struct TaskManagerImpl* taskManagerImpl, struct Iterator* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
     if (task->idgCount == 0) {
         if(task->iterator != NULL && task->iterate == 0) {
            iterator->iterator = (union Data*)task->iterator;
            iterator->task = task;
            iterator->next = next;
-           pthread_mutex_unlock(&taskManager->mutex);
+           iterator->numGPU = taskManager->cpu - taskManager->gpu;
+           pthread_mutex_unlock(&taskManagerImpl->mutex);
            goto meta(context, task->iterator->exec);
        }
        goto meta(context, C_taskSend);
    }
-    pthread_mutex_unlock(&taskManager->mutex);
+    pthread_mutex_unlock(&taskManagerImpl->mutex);
     goto next(...);
 }
 
@@ -181,30 +183,40 @@
     TaskManagerImpl* taskManager = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
     pthread_mutex_lock(&taskManager->mutex);
     goto spawnTaskManagerImpl(context,
-                              taskManager,
-                              Gearef(context, Iterator),
-                              Gearef(context, TaskManager)->context,
-                              Gearef(context, TaskManager)->next);
+                              taskManager,
+                              Gearef(context, Iterator),
+                              &Gearef(context, TaskManager)->taskManager->TaskManager,
+                              Gearef(context, TaskManager)->context,
+                              Gearef(context, TaskManager)->next);
 }
 
-__code taskSend(struct TaskManagerImpl* taskManager, struct Queue* queue, struct Context* task, __code next(...)) {
+__code taskSend(struct TaskManagerImpl* taskManagerImpl, struct Queue* queue, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
     // set workerId
-    task->workerId = taskManager->sendWorkerIndex;
-    if(++taskManager->sendWorkerIndex >= taskManager->numWorker) {
-        taskManager->sendWorkerIndex = 0;
+    if (task->gpu) {
+        task->workerId = taskManagerImpl->sendGPUWorkerIndex;
+        if(++taskManagerImpl->sendGPUWorkerIndex >= taskManager->cpu) {
+            taskManagerImpl->sendGPUWorkerIndex = taskManager->gpu;
+        }
+    } else {
+        task->workerId = taskManagerImpl->sendCPUWorkerIndex;
+        if(++taskManagerImpl->sendCPUWorkerIndex >= taskManager->maxCPU) {
+            taskManagerImpl->sendCPUWorkerIndex = taskManager->cpu;
+        }
     }
-    struct Queue* tasks = taskManager->workers[task->workerId]->tasks;
+    struct Queue* tasks = taskManagerImpl->workers[task->workerId]->tasks;
     queue->queue = (union Data*)tasks;
     queue->data = (union Data*)task;
     queue->next = next;
-    pthread_mutex_unlock(&taskManager->mutex);
+    pthread_mutex_unlock(&taskManagerImpl->mutex);
     goto meta(context, tasks->put);
 }
 
 __code taskSend_stub(struct Context* context) {
     TaskManagerImpl* taskManager = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
     goto taskSend(context,
-                  taskManager, Gearef(context, Queue),
+                  taskManager,
+                  Gearef(context, Queue),
+                  &Gearef(context, TaskManager)->taskManager->TaskManager,
                   Gearef(context, TaskManager)->context,
                   Gearef(context, TaskManager)->next);
 }
 
@@ -230,10 +242,10 @@
 __code shutdownTaskManagerImpl_stub(struct Context* context) {
     TaskManagerImpl* taskManagerImpl = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
     goto shutdownTaskManagerImpl(context,
-                                 taskManagerImpl,
-                                 Gearef(context, TaskManager)->next,
-                                 &Gearef(context, TaskManager)->taskManager->TaskManager,
-                                 Gearef(context, Queue));
+                                 taskManagerImpl,
+                                 Gearef(context, TaskManager)->next,
+                                 &Gearef(context, TaskManager)->taskManager->TaskManager,
+                                 Gearef(context, Queue));
 }
 
 __code shutdownTaskManagerImpl1(TaskManagerImpl* taskManager) {
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/context.h
--- a/src/parallel_execution/context.h	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/context.h	Thu Sep 14 02:35:20 2017 +0900
@@ -97,6 +97,7 @@
     int odg;
     int maxOdg;
     int workerId;
+    int gpu; // GPU task
     struct Context* task;
     struct Queue* tasks;
 #ifdef USE_CUDAWorker
@@ -155,7 +156,8 @@
 struct TaskManagerImpl {
     enum Code next;
     int numWorker;
-    int sendWorkerIndex;
+    int sendCPUWorkerIndex;
+    int sendGPUWorkerIndex;
     int taskCount;
     pthread_mutex_t mutex;
     struct Queue* activeQueue;
@@ -316,6 +318,7 @@
 struct Iterator {
     union Data* iterator;
     struct Context* task;
+    int numGPU;
     enum Code exec;
     enum Code barrier;
     enum Code whenWait;
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/cuda.c
--- a/src/parallel_execution/cuda.c	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/cuda.c	Thu Sep 14 02:35:20 2017 +0900
@@ -11,8 +11,9 @@
 #include "helper_cuda.h"
 #include "pthread.h"
 
-// #include "context.h"
+#include "context.h"
 
+/*
 struct Context {
     int next;
     struct Worker* worker;
@@ -24,85 +25,99 @@
     long heapLimit;
     int dataNum;
     int idgCount; //number of waiting dataGear
+    int idg;
+    int maxIdg;
     int odg;
     int maxOdg;
     int workerId;
+    struct Context* task;
+    struct Queue* tasks;
     int num_exec;
     CUmodule module;
     CUfunction function;
     union Data **data;
+
+    // multi dimension parameter
+    int iterate;
+    struct Iterator* iterator;
 };
 
-    struct CUDAWorker {
-        CUdevice device;
-        CUcontext cuCtx;
-        pthread_t thread;
-        struct Context* context;
-        int id;
-        struct Queue* tasks;
-        int runFlag;
-        int next;
-        int num_stream;
-        CUstream *stream;
-    } CUDAWorker;
+struct CUDAWorker {
+    CUdevice device;
+    CUcontext cuCtx;
+    pthread_t thread;
+    struct Context* context;
+    int id;
+    struct Queue* tasks;
+    int runFlag;
+    int next;
+    int num_stream;
+    CUstream *stream;
+} CUDAWorker;
 
-    struct LoopCounter {
-        int i;
-    } LoopCounter;
+struct LoopCounter {
+    int i;
+} LoopCounter;
 
-    struct Array {
-        int size;
-        int index;
-        int prefix;
-        int* array;
-    } Array;
-
-
+struct Array {
+    int size;
+    int index;
+    int prefix;
+    int* array;
+} Array;
+*/
 
 void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
     // initialize and load kernel
     cudaWorker->num_stream = 1; // number of stream
-//    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
-    if (phase==0)
-        checkCudaErrors(cuInit(0));
-    if (phase==0)
-        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
-    if (phase==0)
-        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
-//    if (cudaWorker->num_stream) {
-//        for (int i=0;i<cudaWorker->num_stream;i++)
-//            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
-//    }
+    // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+    if (phase==0)
+        checkCudaErrors(cuInit(0));
+    if (phase==0)
+        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+    if (phase==0)
+        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
+    // if (cudaWorker->num_stream) {
+    //     for (int i=0;i<cudaWorker->num_stream;i++)
+    //         checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
+    // }
+    printf("cuda Init: Done\n");
 }
 
-void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) {
+void CUDAExec(struct Context* context, struct Array* array) {
+    printf("cuda exec start\n");
     //  Worker *worker = context->worker;
     //  CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-    // memory allocate
+    // memory allocate
     CUdeviceptr devA;
-    CUdeviceptr devLoopCounter;
-    checkCudaErrors(cuMemAlloc(&devA, array->size));
-    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
+    checkCudaErrors(cuMemAlloc(&devA, sizeof(int)*array->size));
 
     // load the twice kernel if it has not been defined yet
-    checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx"));
+    checkCudaErrors(cuModuleLoad(&context->module, "c/examples/twice/CUDAtwice.ptx"));
     checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
 
     // send the input DataGear to the GPU via a buffer
     // Synchronous data transfer(host to device)
-    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
     checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
 
-    // Asynchronous launch kernel
-    context->num_exec = 1;
-    void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA};
-    checkCudaErrors(cuLaunchKernel(context->function,
-                                   1, 1, 1,
-                                   1, 1, 1,
-                                   0, NULL, args, NULL));
+    // Asynchronous launch kernel
+    context->num_exec = 1;
+    void* args[] = {&devA};
+    if (context->iterate) {
+        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
+        checkCudaErrors(cuLaunchKernel(context->function,
+                                       iterator->x, iterator->y, iterator->z,
+                                       1, 1, 1,
+                                       0, NULL, args, NULL));
+    } else {
+        checkCudaErrors(cuLaunchKernel(context->function,
+                                       1, 1, 1,
+                                       1, 1, 1,
+                                       0, NULL, args, NULL));
+    }
 
     // issue the command that fetches the result
    // and wait for the command to finish
    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
@@ -112,7 +127,7 @@
 }
 
 void cudaShutdown( struct CUDAWorker *worker) {
-//    for (int i=0;i<worker->num_stream;i++)
-//        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+    // for (int i=0;i<worker->num_stream;i++)
+    //     checkCudaErrors(cuStreamDestroy(worker->stream[i]));
     checkCudaErrors(cuCtxDestroy(worker->cuCtx));
 }
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/examples/twice/CUDAtwice.cbc
--- a/src/parallel_execution/examples/twice/CUDAtwice.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/CUDAtwice.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -2,7 +2,7 @@
 
 #include "../../../context.h"
 
-extern void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter);
+extern void CUDAExec(struct Context* context, Array* array);
 
 __code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
     int i = loopCounter->i;
@@ -18,10 +18,9 @@
 }
 
 __code CUDAtwice_stub(struct Context* context) {
-printf("CUdAtwice stub\n");
-    struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
+printf("CUDAtwice stub\n");
     struct Array* array = &context->data[context->dataNum+1]->Array;
-    CUDAExec(context,array,loopCounter);
+    CUDAExec(context,array);
 
     // return to the GPU worker along the continuation
     goto meta(context, context->next);
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/examples/twice/CUDAtwice.cu
--- a/src/parallel_execution/examples/twice/CUDAtwice.cu	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/CUDAtwice.cu	Thu Sep 14 02:35:20 2017 +0900
@@ -1,33 +1,8 @@
 extern "C" {
 #include
 
-
-//    __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) {
-//        int index = blockIdx.x * blockDim.x + threadIdx.x;
-//        printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n");
-//        int i = 0;
-//        while (i < prefix) {
-//            array[i+index*prefix] = array[i+index*prefix]*2;
-//        }
-//    }
-
-    struct LoopCounter {
-        int i;
-    } LoopCounter;
-
-    __global__ void twice(struct LoopCounter* loopCounter, int index, int prefix, int* array) {
-        printf("array %p, index = %d, prefix = %d loopCounter->i %d\n",array,index,prefix,loopCounter->i);
-C_twice:
-        int i = loopCounter->i;
-        if (i < prefix) {
-            array[i+index*prefix] = array[i+index*prefix]*2;
-            loopCounter->i++;
-
-            goto C_twice;
-        }
-
-        loopCounter->i = 0;
+    __global__ void twice(int* array) {
+        printf("array %p",array);
+        array[blockIdx.x] = array[blockIdx.x]*2;
     }
-
-
 }
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/examples/twice/createArray.cbc
--- a/src/parallel_execution/examples/twice/createArray.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/createArray.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -12,6 +12,7 @@
     output->array = array_ptr;
     output->size = length;
     *O_output = output;
+    printf("created Array\n");
     goto meta(context, context->next);
 }
 
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/examples/twice/main.cbc
--- a/src/parallel_execution/examples/twice/main.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/main.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -98,7 +98,7 @@
 
     par goto createArray(array, __exit);
 
-    par goto twice(array, iterate(split), __exit);
+    par goto CUDAtwice(array, iterate(split), __exit);
     goto code2();
 }
 
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/generate_context.pl
--- a/src/parallel_execution/generate_context.pl	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/generate_context.pl	Thu Sep 14 02:35:20 2017 +0900
@@ -139,6 +139,8 @@
         $code_init .= "    context->code[C_${code}] = ${code}_stub;\n";
     }
 
+    my $data_num = keys(%dataGear);
+    $data_num++;
     my $context_c = << "EOFEOF";
 #include
 
@@ -155,7 +157,7 @@
 $code_init
 
 #include "dataGearInit.c"
-    context->dataNum = D_Worker;
+    context->dataNum = $data_num;
 
     context->tasks = createSingleLinkedQueue(context);
 }
 EOFEOF
diff -r 4d1e3697a6b8 -r 85b0ddbf458e src/parallel_execution/main.cbc
--- a/src/parallel_execution/main.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/main.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -13,21 +13,6 @@
 int CPU_ANY = -1;
 int CPU_CUDA = -1;
 
-void print_queue(struct Element* element) {
-    while (element) {
-        printf("%p\n", ((struct Task *)(element->data)));
-        element = element->next;
-    }
-}
-
-void print_tree(struct Node* node) {
-    if (node != 0) {
-        printf("%d\n", node->value->Array.index);
-        print_tree(node->left);
-        print_tree(node->right);
-    }
-}
-
 void *start_taskManager(struct Context *context) {
     goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
     return 0;
@@ -48,7 +33,7 @@
     while(! cuda_initialized) {};
 #endif
 #endif
-    goto meta(context, C_createTask1);
+    goto meta(context, C_code1);
 }
 
 __code initDataGears_stub(struct Context* context) {
@@ -81,25 +66,25 @@
     /* puts("tree"); */
     /* print_tree(context->data[Tree]->tree.root); */
     /* puts("result"); */
-
-    time->next = C_code2;
-    goto meta(context, C_code2);
-    //goto meta(context, C_exit_code);
-    //goto meta(context, C_start_time);
+    time->time = (union Data*)createTimeImpl(context);
+    time->next = C_createTask1;
+    goto meta(context, time->time->Time.start);
 }
 
-__code code1_stub(struct Context* context) {
-    goto code1(context, Gearef(context, Time));
+__code code2(struct Time* time, struct TaskManager* taskManager) {
+    time->next = C_code3;
+    taskManager->next = time->time->Time.end;
+    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
 }
 
-__code code2(struct LoopCounter* loopCounter) {
+__code code3(struct LoopCounter* loopCounter) {
     int i = loopCounter->i;
 
     if (i < length) {
         //printf("%d\n", array_ptr[i]);
         if (array_ptr[i] == (i*2)) {
             loopCounter->i++;
-            goto meta(context, C_code2);
+            goto meta(context, C_code3);
         } else
             puts("wrong result");
 
@@ -109,57 +94,12 @@
 }
 
 __code createTask1(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
-    int i = loopCounter->i;
-
-    if ((length/split*i) < length) {
-        taskManager->next = C_createTask2;
-        goto meta(context, taskManager->taskManager->TaskManager.createTask);
-    }
-
-    loopCounter->i = 0;
-    taskManager->next = C_code1;
-#if ( defined(USE_CUDAWorker) && defined(USE_CUDA_MAIN_THREAD))
-sleep(5);
-#endif
-    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
-}
+    Array* array = &ALLOCATE_DATA_GEAR(context, Array)->Array;
 
-__code createTask2(LoopCounter* loopCounter, TaskManager* taskManager,struct Context* task, LoopCounter* loopCounter2, Array* array) {
-    int i = loopCounter->i;
-    array->index = i;
-    array->prefix = length/split;
-    array->array = array_ptr;
-    array->size = length;
-    loopCounter2->i = 0;
-    task->idgCount = 0;
-    if (gpu_num) {
-#ifdef USE_CUDAWorker
-        task->next = C_CUDAtwice;
-        task->workerId = CPU_CUDA;
-#else
-        task->next = C_twice;
-#endif
-    } else {
-        task->next = C_twice;
-    }
-    task->data[task->dataNum] = (union Data*)loopCounter2;
-    task->data[task->dataNum+1] = (union Data*)array;
-    task->odg = task->dataNum + 2;
-    task->maxOdg = task->odg;
-    taskManager->next = C_createTask1;
-    loopCounter->i++;
-    goto meta(context, taskManager->taskManager->TaskManager.spawn);
-}
+    par goto createArray(array, __exit);
 
-__code createTask2_stub(struct Context* context) {
-    LoopCounter* loopCounter = &ALLOCATE(context, LoopCounter)->LoopCounter;
-    Array* array = &ALLOCATE(context, Array)->Array;
-    goto createTask2(context,
-                     Gearef(context, LoopCounter),
-                     Gearef(context, TaskManager),
-                     Gearef(context, TaskManager)->context,
-                     loopCounter,
-                     array);
+    par goto twice(array, iterate(split), __exit);
+    goto code2();
 }
 
 void init(int argc, char** argv) {