#include "../context.h"
#interface "Executor.h"
#interface "Timer.h"

// NOTE(review): the names of the two system headers were missing from the
// text under review. <math.h> is required by the log2() calls below;
// <stdio.h> is an assumption -- confirm both against the repository copy.
#include <stdio.h>
#include <math.h>

// Builds an Executor whose read / exec / write entries refer to the CUDA
// code gears defined in this file.
//   context : forwarded to createTimerImpl
//   device  : device queried for CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
// Returns the new Executor; its `executor` field holds the CUDAExecutor state.
Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
    struct Executor* executor = new Executor();
    struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
    cudaExecutor->timer = createTimerImpl(context);
    checkCudaErrors(cuDeviceGetAttribute(&cudaExecutor->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device));
    executor->executor = (union Data*)cudaExecutor;
    executor->read = C_readCUDAExecutor;
    executor->exec = C_execCUDAExecutor;
    executor->write = C_writeCUDAExecutor;
    return executor;
}

// Read stage: allocates one device buffer per input and output of
// executor->buffer, copies the host data into it, and records the device
// pointers in executor->kernelParams (inputs first, then outputs).
// Continues with the timer's start entry, which is handed execCUDAExecutor.
__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    struct CUDABuffer* buffer = executor->buffer;
    int paramLen = buffer->inputLen + buffer->outputLen;
    executor->kernelParams = (CUdeviceptr**)ALLOCATE_PTR_ARRAY(context, CUdeviceptr, paramLen);
    for (int i = 0; i < paramLen; i++) {
        CUdeviceptr* deviceptr = new CUdeviceptr();
        // Indices below inputLen select an input, the remaining ones an output.
        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
        // Allocate device memory of GET_SIZE(data) bytes.
        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
        // Synchronous data transfer (host to device).
        checkCudaErrors(cuMemcpyHtoD(*deviceptr, data, GET_SIZE(data)));
        executor->kernelParams[i] = deviceptr;
    }
    // TODO: Implements pipeline
    // goto next(...);
    struct Timer* timer = executor->timer;
    goto timer->start(execCUDAExecutor);
}

// Returns the block dimension for one axis: `count` capped at
// `maxThreadPerBlock`, and never less than 1.
// The lower bound exists because the caller divides the iteration count by
// this value; a count of 0 would otherwise produce a division by zero. With
// the clamp the resulting grid dimension is 0 and the problem is left to the
// error check on the launch call.
int computeblockDim(int count, int maxThreadPerBlock) {
    if (count < 1) {
        return 1;
    }
    return count < maxThreadPerBlock ? count : maxThreadPerBlock;
}

// Splits executor->maxThreadPerBlock into per-axis limits
// (maxThreadPerBlockX / Y / Z) according to which iterator axes exceed 1.
// Invariant:
//   maxThreadPerBlockX * maxThreadPerBlockY * maxThreadPerBlockZ <= maxThreadPerBlock
void calcBlockMaxThread(struct MultiDimIterator* iterator, struct CUDAExecutor* executor) {
    // Defaults; each case below overrides only the axes that differ from 1.
    executor->maxThreadPerBlockX = 1;
    executor->maxThreadPerBlockY = 1;
    executor->maxThreadPerBlockZ = 1;
    if (iterator->x > 1 && iterator->y == 1 && iterator->z == 1) {
        // Only x iterates: x receives the whole budget.
        executor->maxThreadPerBlockX = executor->maxThreadPerBlock;
    } else if (iterator->x > 1 && iterator->y > 1 && iterator->z == 1) {
        // x and y iterate: each gets 2^(floor(log2(max)) / 2).
        int ln_2 = log2(executor->maxThreadPerBlock);
        int maxThread = 1 << (ln_2/2);
        executor->maxThreadPerBlockX = maxThread;
        executor->maxThreadPerBlockY = maxThread;
    } else {
        // Every other shape: each axis gets 2^(floor(log2(max)) / 3) and the
        // remainder of the exponent is given to x.
        int ln_2 = log2(executor->maxThreadPerBlock);
        int maxThread = 1 << (ln_2/3);
        executor->maxThreadPerBlockX = maxThread * (1 << (ln_2%3));
        executor->maxThreadPerBlockY = maxThread;
        executor->maxThreadPerBlockZ = maxThread;
    }
}

// Exec stage: launches task->function with executor->kernelParams as the
// kernel arguments. For an iterated task the launch geometry is derived from
// the task's MultiDimIterator; otherwise a single thread in a single block is
// launched. Continues with writeCUDAExecutor.
__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    task->num_exec = 1;
    if (task->iterate) {
        struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
        calcBlockMaxThread(iterator, executor);
        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlockX);
        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlockY);
        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlockZ);
        // NOTE(review): the grid size uses truncating division, so an
        // iteration count that is not a multiple of the block dimension leaves
        // the trailing elements without a thread. Rounding up would require
        // the kernels to bounds-check their index -- confirm before changing.
        // launch kernel
        checkCudaErrors(cuLaunchKernel(task->function,
                    iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
                    blockDimX, blockDimY, blockDimZ,
                    0, NULL, (void**)executor->kernelParams, NULL));
    } else {
        checkCudaErrors(cuLaunchKernel(task->function,
                    1, 1, 1,
                    1, 1, 1,
                    0, NULL, (void**)executor->kernelParams, NULL));
    }
    // TODO: Implements pipeline
    // goto next(...);
    goto writeCUDAExecutor();
}

// Write stage, part 1: waits for the context to finish and then continues
// with the timer's end entry, which is handed writeCUDAExecutor1.
__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    // The kernel launch is asynchronous, so wait for it to complete here.
    checkCudaErrors(cuCtxSynchronize());
    struct Timer* timer = executor->timer;
    goto timer->end(writeCUDAExecutor1);
}

// Write stage, part 2: copies every device buffer (inputs as well as
// outputs) back into its host data and releases the device memory, then
// continues with `next`.
__code writeCUDAExecutor1(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    // Issue the commands that fetch the results.
    struct CUDABuffer* buffer = executor->buffer;
    int paramLen = buffer->inputLen + buffer->outputLen;
    for (int i = 0; i < paramLen; i++) {
        CUdeviceptr deviceptr = *(executor->kernelParams[i]);
        // Same input-then-output ordering that the read stage used.
        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
        checkCudaErrors(cuMemcpyDtoH(data, deviceptr, GET_SIZE(data)));
        // The free is checked like every other driver call in this file, so a
        // failure is reported instead of being silently dropped.
        checkCudaErrors(cuMemFree(deviceptr));
    }
    goto next(...);
}