Mercurial > hg > Game > Cerium
view example/Cuda/main.cc @ 2006:f6aa6d6a3fa2 draft
add fft using cuda, not running
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 03 Jun 2014 12:07:00 +0900 |
parents | c3b4083c4467 |
children | 0e2389a5ac4e |
line wrap: on
line source
#include <stdio.h> #include <sys/time.h> #include <string.h> #include <cuda.h> #define LENGTH 10 #define THREAD 10 static double getTime() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + (double)tv.tv_usec*1e-6; } void check_data(float* A, float B, float* C) { for (int i=0; i<LENGTH*THREAD; i++) { if (A[i]*B!=C[i]) { puts("multiply failure."); return; } } puts("success."); } void print_result(float* C) { for (int i=0; i<LENGTH*THREAD; i++) { printf("%f\n",C[i]); } } int main(int args, char* argv[]) { int num_stream = 1; // number of stream int num_exec = 16; // number of executed kernel for (int i=1;argv[i];i++) { if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) { num_stream = atoi(argv[++i]); } } // initialize and load kernel CUdevice device; CUcontext context; CUmodule module; CUfunction function; CUstream stream[num_stream]; cuInit(0); cuDeviceGet(&device, 0); cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); cuModuleLoad(&module, "multiply.ptx"); cuModuleGetFunction(&function, module, "multiply"); for (int i=0;i<num_stream;i++) cuStreamCreate(&stream[i],0); // memory allocate CUdeviceptr devA; CUdeviceptr devB[num_exec]; CUdeviceptr devOut[num_exec]; cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)); for (int i=0;i<num_exec;i++) { cuMemAlloc(&devB[i], sizeof(float)); cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)); } // input buffer float* A = new float[LENGTH*THREAD]; float* B = new float[num_exec]; for (int i=0; i<LENGTH*THREAD; i++) A[i] = (float)(i+1000); // output buffer float** result = new float* [num_exec]; for (int i=0;i<num_exec;i++) result[i] = new float[LENGTH*THREAD]; // Synchronous data transfer(host to device) cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)); // Asynchronous data transfer(host to device) int cur = 0; for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; B[i] = (float)(i+1); cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); } cur = 0; // Asynchronous launch kernel for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur=0; B[i] = (float)(i+1); //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); void* args[] = {&devA, &devB[i], &devOut[i]}; cuLaunchKernel(function, LENGTH, 1, 1, THREAD, 1, 1, 0, stream[cur], args, NULL); //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); } cur = 0; // Asynchronous data transfer(device to host) for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); } // wait for stream for (int i=0;i<num_stream;i++) cuStreamSynchronize(stream[i]); //printf("%0.6f\n",getTime()-start); for (int i=0;i<num_exec;i++) check_data(A,(float)(i+1),result[i]); // memory release cuMemFree(devA); for (int i=0;i<num_exec;i++) { cuMemFree(devB[i]); cuMemFree(devOut[i]); } for (int i=0;i<num_stream;i++) cuStreamDestroy(stream[i]); cuModuleUnload(module); cuCtxDestroy(context); delete[] A; delete[] B; for (int i=0;i<num_exec;i++) delete[] result[i]; delete[] result; return 0; }