# HG changeset patch # User Shohei KOKUBO # Date 1404201841 -32400 # Node ID 8c618e912c88b2be6d7a015e002fda6724bdfe59 # Parent d43c2b7932eac704c2998514105049a46fd66ac3 optimization data transfer. wrong result diff -r d43c2b7932ea -r 8c618e912c88 TaskManager/Cuda/CudaScheduler.cc --- a/TaskManager/Cuda/CudaScheduler.cc Tue Jul 01 11:17:12 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Tue Jul 01 17:04:01 2014 +0900 @@ -13,6 +13,9 @@ #include #include #include +#include + +using namespace std; TaskObject cuda_task_list[MAX_TASK_OBJECT]; @@ -137,6 +140,7 @@ ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } transmitted.insert(make_pair(input_buf, &cudabuffer[cur].memin[param])); + reverse_map.insert(make_pair(&cudabuffer[cur].memin[param], input_buf)); } cudabuffer[cur].kernelParams[param] = transmitted[input_buf]; param++; @@ -151,6 +155,7 @@ createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } transmitted.insert(make_pair(output_buf, &cudabuffer[cur].memout[i])); + reverse_map.insert(make_pair(&cudabuffer[cur].memout[i], output_buf)); } cudabuffer[cur].kernelParams[param] = transmitted[output_buf]; param++; @@ -183,34 +188,36 @@ int cur = 0; for (;nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) { if (STAGE <= cur) break; + // enable flip : not data transfer device to host + if (flag[cur].flip) continue; for(int i=0;ioutData_count;i++) { // read output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; - CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ; - int i0 = flag[cur].flip ? i+1 : i ; - // flip use memin buffer and memout event - ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - transmitted.erase(output_buf); + if (transmitted.count(output_buf)) { + ret = cuMemcpyDtoHAsync(output_buf->addr, *transmitted[output_buf], output_buf->size, cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + reverse_map.erase(transmitted[output_buf]); + transmitted.erase(output_buf); + } } } return nextTask; } static void -release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) { +release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem, map map) { for (int i=0; i 0 || cudabuffer[i].out_size > 0) - release_buf_event(i, cudabuffer); + release_buf_event(i, cudabuffer, reverse_map); } if(reply) { diff -r d43c2b7932ea -r 8c618e912c88 TaskManager/Cuda/CudaScheduler.h --- a/TaskManager/Cuda/CudaScheduler.h Tue Jul 01 11:17:12 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Tue Jul 01 17:04:01 2014 +0900 @@ -48,7 +48,8 @@ CudaBuffer cudabuffer[STAGE]; // record transmitted data. - map transmitted; + map transmitted; + map reverse_map; HTask::htask_flag flag[STAGE]; diff -r d43c2b7932ea -r 8c618e912c88 example/fft/main.cc --- a/example/fft/main.cc Tue Jul 01 11:17:12 2014 +0900 +++ b/example/fft/main.cc Tue Jul 01 17:04:01 2014 +0900 @@ -126,6 +126,7 @@ brev->set_inData(0, src, length_src*sizeof(cl_float2)); brev->set_outData(0, dst, length_dst*sizeof(cl_float2)); brev->set_cpu(spe_cpu); + brev->flip(); brev->wait_for(waitTask); brev->iterate(gws[0],gws[1]); @@ -141,6 +142,7 @@ bfly->set_inData(1, spin, sizeof(cl_float2)*(n/2)); bfly->set_outData(0,dst,length_dst*sizeof(cl_float2)); bfly->set_cpu(spe_cpu); + bfly->flip(); bfly->wait_for(waitTask); bfly->iterate(gws[0],gws[1]); waitTask = bfly; @@ -163,9 +165,9 @@ char * init(int argc, char**argv){ - + char *filename = 0; - + // printf("%s ",argv[4]); for (int i = 1; argv[i]; ++i) { if (strcmp(argv[i], "-file") == 0) { @@ -191,11 +193,11 @@ long m = (cl_int)(log((double)n)/log(2.0)); size_t *gws = new size_t[2]; size_t *lws = new size_t[2]; - + xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); - + HTask* waitTask; /* * [cl_float2] @@ -226,6 +228,7 @@ sfac->set_outData(0, wm, length_w*sizeof(cl_float2)); sfac->set_param(0,n); sfac->set_cpu(spe_cpu); + sfac->flip(); sfac->iterate(gws[0]); // Butterfly Operation @@ -239,6 +242,7 @@ first_trns->set_outData(0,xm,length_r*sizeof(cl_float2)); first_trns->set_param(0,n); first_trns->set_cpu(spe_cpu); + first_trns->flip(); first_trns->wait_for(waitTask); first_trns->iterate(gws[0],gws[1]); @@ -254,6 +258,7 @@ hpfl->set_param(0,n); hpfl->set_param(1,(long)radius); hpfl->set_cpu(spe_cpu); + hpfl->flip(); hpfl->wait_for(waitTask); hpfl->iterate(gws[0],gws[1]); @@ -269,6 +274,7 @@ second_trns->set_outData(0,rm,length_r*sizeof(cl_float2)); second_trns->set_param(0,n); second_trns->set_cpu(spe_cpu); + second_trns->flip(); second_trns->wait_for(waitTask); second_trns->iterate(gws[0],gws[1]);