# HG changeset patch # User Tatsuki IHA # Date 1517861695 -32400 # Node ID b78533641f9b019cd2b529e55cf56bc4e16e2553 # Parent acc80b26156bf1b36f29a74856154635c7504c6e Add calcMaxThread diff -r acc80b26156b -r b78533641f9b src/parallel_execution/CUDAExecutor.cbc --- a/src/parallel_execution/CUDAExecutor.cbc Tue Feb 06 02:04:02 2018 +0900 +++ b/src/parallel_execution/CUDAExecutor.cbc Tue Feb 06 05:14:55 2018 +0900 @@ -2,6 +2,7 @@ #interface "Executor.h" #interface "Timer.h" #include +#include Executor* createCUDAExecutor(struct Context* context, CUdevice device) { struct Executor* executor = new Executor(); @@ -38,14 +39,38 @@ return count < maxThreadPerBlock ? count : maxThreadPerBlock; } +void calcBlockMaxThread(struct MultiDimIterator* iterator, struct CUDAExecutor* executor) { + executor->maxThreadPerBlockX = 1; + executor->maxThreadPerBlockY = 1; + executor->maxThreadPerBlockZ = 1; + if (iterator->x > 1 && iterator->y == 1 && iterator->z == 1) { + executor->maxThreadPerBlockX = executor->maxThreadPerBlock; + executor->maxThreadPerBlockY = 1; + executor->maxThreadPerBlockZ = 1; + } else if (iterator->x > 1 && iterator->y > 1 && iterator->z == 1) { + int ln_2 = log2(executor->maxThreadPerBlock); + int maxThread = 1 << (ln_2/2); + executor->maxThreadPerBlockX = maxThread; + executor->maxThreadPerBlockY = maxThread; + executor->maxThreadPerBlockZ = 1; + } else { + int ln_2 = log2(executor->maxThreadPerBlock); + int maxThread = 1 << (ln_2/3); + executor->maxThreadPerBlockX = maxThread * (1 << (ln_2%3)); + executor->maxThreadPerBlockY = maxThread; + executor->maxThreadPerBlockZ = maxThread; + } +} + __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) { // Asynchronous launch kernel task->num_exec = 1; if (task->iterate) { struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator; - int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock); - int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock); - int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock); + calcBlockMaxThread(iterator, executor); + int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlockX); + int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlockY); + int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlockZ); checkCudaErrors(cuLaunchKernel(task->function, iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ, blockDimX, blockDimY, blockDimZ, diff -r acc80b26156b -r b78533641f9b src/parallel_execution/context.h --- a/src/parallel_execution/context.h Tue Feb 06 02:04:02 2018 +0900 +++ b/src/parallel_execution/context.h Tue Feb 06 05:14:55 2018 +0900 @@ -378,6 +378,9 @@ CUdeviceptr** kernelParams; struct CUDABuffer* buffer; int maxThreadPerBlock; + int maxThreadPerBlockX; + int maxThreadPerBlockY; + int maxThreadPerBlockZ; struct Timer* timer; } CUDAExecutor; struct CUDABuffer {