changeset 1969:a68dbdf9b429 draft

fix GpuScheduler
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 25 Feb 2014 13:43:25 +0900
parents d45b7223515b
children e211424ac950
files TaskManager/Gpu/GpuScheduler.cc TaskManager/kernel/ppe/CpuThreads.h example/Cuda/Makefile example/Cuda/main.cc example/Cuda/multiply.cu example/OpenCL/twice.cc example/word_count/main.cc
diffstat 7 files changed, 282 insertions(+), 157 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Gpu/GpuScheduler.cc	Thu Feb 13 14:57:04 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Tue Feb 25 13:43:25 2014 +0900
@@ -37,7 +37,9 @@
     }
     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
     for (int i=0;i<STAGE;i++) {
-        command_queue[i] = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
+        command_queue[i] = clCreateCommandQueue(context, device_id,
+                                                CL_QUEUE_PROFILING_ENABLE,
+                                                &ret);
         if (ret<0) {
             const char *msg=convert_error_status(ret);
             error(msg);
@@ -105,10 +107,10 @@
  */
 void
 GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, TaskListPtr taskList, int cur) {
-    if (kernel_event[cur-1] == NOP_REPLY) {
+    if (kernel_event[cur] == NOP_REPLY) {
         
-    } else if (kernel_event[cur-1] != NULL) {
-        int ret=clWaitForEvents(1,&kernel_event[cur-1]);
+    } else if (kernel_event[cur] != NULL) {
+        int ret=clWaitForEvents(1,&kernel_event[cur]);
 
         if (ret<0) {
             error(convert_error_status(ret));
@@ -116,25 +118,24 @@
         if (taskList!=NULL){
             cl_ulong start = 0;
             cl_ulong end   = 0;
-            clGetEventProfilingInfo(kernel_event[cur-1],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
-            clGetEventProfilingInfo(kernel_event[cur-1],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+            clGetEventProfilingInfo(kernel_event[cur],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+            clGetEventProfilingInfo(kernel_event[cur],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
             if (taskList->task_start_time == 0)
                 taskList->task_start_time = start;
             taskList->task_end_time   = end;
         }
-        clReleaseEvent(kernel_event[cur-1]);
-        kernel_event[cur-1] = 0;
-
+        clReleaseEvent(kernel_event[cur]);
+        kernel_event[cur] = 0;
     }
     
-    if (memout[cur-1].size > 0) {
-        int ret=clWaitForEvents(memout[cur-1].size, memout[cur-1].event);
+    if (memout[cur].size > 0) {
+        int ret=clWaitForEvents(memout[cur].size, memout[cur].event);
         if (ret<0) error(convert_error_status(ret));
-        release_buf_event(cur-1,memout);
+        release_buf_event(cur,memout);
     }
 
-    if (memin[cur-1].size > 0) {
-        release_buf_event(cur-1,memin);
+    if (memin[cur].size > 0) {
+        release_buf_event(cur,memin);
     }
     if(reply) {
         connector->mail_write(reply);
@@ -189,8 +190,6 @@
 
         if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
             // wait_for_envet was called, so all kernel,buf,event have been released.
-            for (int i=0;i<STAGE;i++) 
-                clFinish(command_queue[i]);
             for (int i=0;i<STAGE;i++) {
                 destroyGpuBuffer(&memin[i]);
                 destroyGpuBuffer(&memout[i]);
@@ -238,8 +237,8 @@
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 
                 // parameter is passed as first kernel arg 
-                ret = clEnqueueWriteBuffer(command_queue[cur], memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
-                                           nextTask->param(0), 0, NULL, &memin[cur].event[0]);
+                ret = clEnqueueWriteBuffer(command_queue[cur], memparam, CL_FALSE, 0,sizeof(memaddr)*nextTask->param_count,
+                                           nextTask->param(0), 0, NULL, NULL);
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 
                 ret = clSetKernelArg(kernel[cur], 0, sizeof(memaddr),(void *)&memin[cur].buf[0]);
@@ -256,7 +255,7 @@
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                     ret = clEnqueueWriteBuffer(command_queue[cur], memin[cur].buf[param], CL_TRUE, 0,
                                                input_buf->size, input_buf->addr, 0, 
-                                               NULL, &memin[cur].event[param]);
+                                               NULL, NULL);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                     ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur].buf[param]);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
@@ -281,10 +280,10 @@
 
                 if (tasklist->dim > 0) {
                     ret = clEnqueueNDRangeKernel(command_queue[cur], kernel[cur], tasklist->dim,
-                                                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
+                                                 NULL, &tasklist->x, 0, 0, NULL, NULL);
                 } else {
-                    ret = clEnqueueTask(command_queue[cur], kernel[cur], memin[cur].size,
-                                        memin[cur].event, &kernel_event[cur]);
+                    ret = clEnqueueTask(command_queue[cur], kernel[cur], 0,
+                                        NULL, NULL);
                 }
                 if (ret<0) { gpuTaskError(cur, tasklist, ret); continue; }
 
@@ -295,29 +294,22 @@
                     int i0 = flag[cur].flip ? i+1 : i ;
                     // flip use memin buffer and memout event
                     ret = clEnqueueReadBuffer(command_queue[cur], mem[cur].buf[i0], CL_FALSE, 0,
-                                              output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]);
+                                              output_buf->size, output_buf->addr, 0, NULL, &memout[cur].event[i]);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 }
-                // wait kernel[1-cur] and write[1-cur]
-                // pipeline    : cur
-                // to stop pipeline set 1-cur
-                if (cur == 0) {
-                    wait_for_event(kernel_event, memout, tasklist, STAGE);
-                } else {
-                    wait_for_event(kernel_event, memout, tasklist, cur);
-                }
-                cur++;
-                if (STAGE <= cur)
-                    cur = 0;
+                cur++;            // wait write[cur+1]
+                if (STAGE <= cur) // to stop pipeline move to after wait_for_event
+                    cur = 0;      //
+                wait_for_event(kernel_event, memout, tasklist, cur);
             }
             reply = (memaddr)tasklist->waiter;
             params_addr = (memaddr)tasklist->next;
         }
-        if (cur == 0) {
-            wait_for_event(kernel_event, memout, tasklist, STAGE);
-        } else {
-            wait_for_event(kernel_event, memout, tasklist, cur);
-        }
+        for (int i=0;i<STAGE;i++) 
+            clFinish(command_queue[i]);
+
+        wait_for_event(kernel_event, memout, tasklist, cur);
+
         unsigned long long wait = 0;
         (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
         connector->mail_write((memaddr)MY_SPE_STATUS_READY);
--- a/TaskManager/kernel/ppe/CpuThreads.h	Thu Feb 13 14:57:04 2014 +0900
+++ b/TaskManager/kernel/ppe/CpuThreads.h	Tue Feb 25 13:43:25 2014 +0900
@@ -4,7 +4,9 @@
 #include <pthread.h>
 #include "Threads.h"
 #include "GpuThreads.h"
+#ifdef __CERIUM_CUDA__
 #include "CudaThreads.h"
+#endif
 #include "TaskManagerImpl.h"
 #include "MainScheduler.h"
 #include "Sem.h"
--- a/example/Cuda/Makefile	Thu Feb 13 14:57:04 2014 +0900
+++ b/example/Cuda/Makefile	Tue Feb 25 13:43:25 2014 +0900
@@ -15,14 +15,15 @@
 LIBS = -F/Library/Frameworks -framework CUDA
 INCLUDE = -I/Developer/NVIDIA/CUDA-5.5/include
 
-.SUFFIXES: .cc .o
+.SUFFIXES: .cc .o .cu .ptx
 .cc.o:
 	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+.cu.ptx:
+	$(NVCC) $(NVCCFLAGS) $<
 
 all: $(TARGET)
 
-$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_SRCS_TMP)
-	$(NVCC) $(NVCCFLAGS) $(CUDA_SRCS_TMP)
+$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
 	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
 
 link:
--- a/example/Cuda/main.cc	Thu Feb 13 14:57:04 2014 +0900
+++ b/example/Cuda/main.cc	Tue Feb 25 13:43:25 2014 +0900
@@ -1,94 +1,145 @@
 #include <stdio.h>
-
+#include <sys/time.h>
+#include <string.h>
 #include <cuda.h>
 
-#define LENGTH 1000
+#define LENGTH 10000
+#define THREAD 1000
 
-void check_data(float* A,float* B,float* C) {
-    for (int i=0; i<LENGTH; i++) {
-        if (A[i]*B[i]!=C[i]) {
-            puts("failure.");
+static double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+void
+check_data(float* A, float B, float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        if (A[i]*B!=C[i]) {
+            puts("multiply failure.");
             return;
         }
     }
     puts("success.");
-    return;
 }
 
 void print_result(float* C) {
-    for (int i=0; i<LENGTH; i++) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
         printf("%f\n",C[i]);
     }
 }
 
-int main() {
+int main(int args, char* argv[]) {
+    int num_stream = 1; // number of stream
+    int num_exec = 16; // number of executed kernel
+    
+    for (int i=1;argv[i];i++) {
+        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
+            num_stream = atoi(argv[++i]);
+        }
+    }
+
+    // initialize and load kernel
     CUdevice device;
     CUcontext context;
     CUmodule module;
     CUfunction function;
+    CUstream stream[num_stream];
 
     cuInit(0);
     cuDeviceGet(&device, 0);
-    cuCtxCreate(&context, 0, device);
+    cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
     cuModuleLoad(&module, "multiply.ptx");
     cuModuleGetFunction(&function, module, "multiply");
-    
-    CUresult ret;
-    int size = 8;
-    CUstream stream1[size];
+    for (int i=0;i<num_stream;i++)
+        cuStreamCreate(&stream[i],0);
 
-    for (int i=0;i<size;i++) {
-        ret=cuStreamCreate(&stream1[i],0);
+    // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devB[num_exec];
+    CUdeviceptr devOut[num_exec];
+
+    cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float));
+    for (int i=0;i<num_exec;i++) {
+        cuMemAlloc(&devB[i], sizeof(float));
+        cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float));
     }
-    
-    printf("%d\n",ret);
+
+    // input buffer
+    float* A = new float[LENGTH*THREAD];
+    float* B = new float[num_exec];
 
+    for (int i=0; i<LENGTH*THREAD; i++)
+        A[i] = (float)(i+1000);
+
+    // output buffer
+    float** result = new float* [num_exec];
 
-    float* A = new float[LENGTH];
-    float* B = new float[LENGTH];
-    float* C = new float[LENGTH];
-    
-    for (int i=0; i<LENGTH; i++) {
-        A[i] = (float)(i+1000);
-        B[i] = (float)(i+1)/10.f;
+    for (int i=0;i<num_exec;i++)
+        result[i] = new float[LENGTH*THREAD];
+
+    // Synchronous data transfer(host to device)
+    cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float));
+
+    // Asynchronous data transfer(host to device)
+    int cur = 0;
+
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        B[i] = (float)(i+1);
+        cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
     }
 
-    CUdeviceptr devA,devB,devC;
+    cur = 0;
 
-    cuMemAlloc(&devA, LENGTH*sizeof(float));
-    cuMemAlloc(&devB, LENGTH*sizeof(float));
-    cuMemAlloc(&devC, LENGTH*sizeof(float));
+    // Asynchronous launch kernel
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur=0;
+        void* args[] = {&devA, &devB[i], &devOut[i]};
+        cuLaunchKernel(function,
+                       LENGTH, 1, 1,
+                       THREAD, 1, 1,
+                       0, 0, args, NULL);
+    }
 
-    cuMemcpyHtoDAsync(devA, A, LENGTH*sizeof(float), stream1[0]);
-    cuMemcpyHtoDAsync(devB, B, LENGTH*sizeof(float), stream1[0]);
+    cur = 0;
+
     
-    //    void* args[] = {&devA, &devB, &devC};
-    void** args=NULL;
-    // args=(void**)malloc(sizeof(void*)*8);
-    // args[0] = &devA;
-    // args[1] = &devB;
-    // args[2] = &devC;
+    // Asynchronous data transfer(device to host)
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+    }
+    
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        cuStreamSynchronize(stream[i]);
     
-    ret=cuLaunchKernel(function,
-                       LENGTH, 1, 1,
-                       1, 1, 1,
-                       0, stream1[0], args, NULL);
-    printf("%d\n",ret);
-    
-    cuMemcpyDtoHAsync(C, devC, LENGTH*sizeof(float), stream1[0]);
+    //printf("%0.6f\n",getTime()-start);
+
+    for (int i=0;i<num_exec;i++)
+        check_data(A,(float)(i+1),result[i]);
 
-    //    print_result(C);
-    check_data(A, B, C);
+    // memory release
+    cuMemFree(devA);
+    for (int i=0;i<num_exec;i++) {
+        cuMemFree(devB[i]);
+        cuMemFree(devOut[i]);
+    }
+    for (int i=0;i<num_stream;i++)
+        cuStreamDestroy(stream[i]);
+    cuModuleUnload(module);
+    cuCtxDestroy(context);
 
     delete[] A;
     delete[] B;
-    delete[] C;
-    cuMemFree(devA);
-    cuMemFree(devB);
-    cuMemFree(devC);
-    cuModuleUnload(module);
-    cuStreamDestroy(stream1[0]);
-    cuCtxDestroy(context);
+    for (int i=0;i<num_exec;i++)
+        delete[] result[i];
+    delete[] result;
 
     return 0;
 }
--- a/example/Cuda/multiply.cu	Thu Feb 13 14:57:04 2014 +0900
+++ b/example/Cuda/multiply.cu	Tue Feb 25 13:43:25 2014 +0900
@@ -1,7 +1,6 @@
 extern "C" {
-    __global__ void multiply(/*float* A, float* B, float* C*/) {
+    __global__ void multiply(float* A, float* B, float* C) {
         int index = blockIdx.x * blockDim.x + threadIdx.x;
-        //C[index] = A[index] * B[index];
-        printf("%d\n",index);
+        C[index] = A[index] * B[0];
     }
 }
--- a/example/OpenCL/twice.cc	Thu Feb 13 14:57:04 2014 +0900
+++ b/example/OpenCL/twice.cc	Tue Feb 25 13:43:25 2014 +0900
@@ -3,46 +3,73 @@
 #include <stdio.h>
 #include <fcntl.h>
 #include <sys/stat.h>
-#define DEFAULT 432
+#include <sys/time.h>
+#include <string.h>
+
+#define WORKS 10000000
+
+static double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
 
 void
-print_data(int *data, int size, const char *title)
+check_data(float* A, float B,float* C) {
+    for (int i=0;i<WORKS;i++) {
+        if (A[i]*B!=C[i]) {
+            puts("multiply failure.");
+            return;
+        }
+    }
+    puts("success.");
+}
+
+void
+print_data(float *data, int size, const char *title)
 {
     printf("%s ---\n", title);
     for ( int i = 0; i < size; i++) {
-        printf("%2d ", data[i]);
+        printf("%2f\n", data[i]);
     }
     printf("\n");
 }
 
 int main(int argc, char *argv[]) {
 
-    // 無効な引数ならデフォルトの値として432を設定
-    int task_array_num = DEFAULT;
-
-    if (argc>1) {
-        if (atoi(argv[1])) {
-            task_array_num = atoi(argv[1]);
+    int num_command_queue = 1; // number of command_queue
+    int num_exec = 16; // number of kernel
+    size_t num_work = WORKS;
+    
+    for (int i=1;argv[i];i++) {
+        if (strcmp(argv[i], "--command_queue") == 0 || strcmp(argv[i], "-c") == 0) {
+            num_command_queue = atoi(argv[++i]);
         }
     }
+    printf("%d\n",num_command_queue);
 
+    // initialize
     cl_platform_id platform_id = NULL;
     cl_uint ret_num_platforms = 0;
     cl_device_id device_id = NULL;
     cl_uint ret_num_devices = 0L;
     cl_int ret;
+    cl_event write_event[num_exec];
+    cl_event exec_event[num_exec];
 
-    clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
-    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
-                   &ret_num_devices);
-
+    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
+    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
+                         &ret_num_devices);
+    
     cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
-    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
-
-    //ファイルオープン
-
-    const char* filename = "twice.cl";
-    const char* functionname = "twice";
+    cl_command_queue command_queue[num_command_queue];
+    for (int i=0;i<num_command_queue;i++)
+        command_queue[i] = clCreateCommandQueue(context, device_id, 0, &ret);
+    
+    // load kernel
+    const char* filename = "multiply.cl";
+    const char* functionname = "multiply";
 
     int fp = open(filename, O_RDONLY);
 
@@ -59,63 +86,116 @@
         fprintf(stderr, "Failed to load kernel.\n");
     }
 
-    char *kernel_src_str = new char[size];
+    char *kernel_src_str = (char*)alloca(size+1);
     size_t kernel_code_size = read(fp, kernel_src_str, size);
     close(fp);
+    kernel_src_str[size] = 0;
 
     cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
-                                                   (const size_t *)&kernel_code_size, &ret);
-    clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
-    cl_kernel kernel = clCreateKernel(program,functionname, &ret);
+                                                   0/*(const size_t *)&kernel_code_size*/, &ret);
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+    if(ret<0) {
+        size_t size;
+        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL ,&size);
+        
+        char* log = new char[size];
+        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
+        printf("%s\n",log);
+        delete[] log;
+        exit(0);
+    }
+    cl_kernel multiply = clCreateKernel(program,functionname, &ret);
 
-    int *data = new int[task_array_num];
-    int *output_data = new int[task_array_num];
-
-    int count = 0;
-    for (int c = 0; c < task_array_num ; count++,c++){
-        data[c] = c;
+    // memory allcate
+    cl_mem memA = clCreateBuffer(context, CL_MEM_READ_WRITE, WORKS*sizeof(float), NULL, &ret);
+    cl_mem memB[num_exec];
+    cl_mem memOut[num_exec];
+    for (int i=0;i<num_exec;i++) {
+        memB[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, &ret);
+        memOut[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, WORKS*sizeof(float), NULL, &ret);
     }
 
-    //メモリバッファの作成
-    cl_mem data_count = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*count, NULL, &ret);
-    cl_mem memobj_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*count, NULL, &ret);
-    cl_mem memobj_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*count, NULL, &ret);
+    // input buffer
+    float* A = new float[WORKS];
+    float* B = new float[num_exec];
+
+    for (int i=0;i<WORKS;i++)
+        A[i] = (float)(i+1000);
 
+    // output buffer
+    float** result = new float* [num_exec];
+    
+    for (int i=0;i<num_exec;i++)
+        result[i] = new float[WORKS];
+    
+    double start = getTime();
 
-    //メモリバッファに入力データを書き込み
-    ret = clEnqueueWriteBuffer(command_queue, data_count, CL_TRUE, 0,
-                               sizeof(count), &count, 0, NULL, NULL);
+    // Synchronous data transfer(host to device)
+    clEnqueueWriteBuffer(command_queue[0], memA, CL_TRUE, 0,
+                         WORKS*sizeof(float), A, 0, NULL, NULL);
+
+    // Asynchronous data transfer(host to device)
+    int cur = 0;
 
-    ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0,
-                               sizeof(int)*count, data, 0, NULL, NULL);
+    for (int i = 0;i<num_exec;i++,cur++){
+        if (num_command_queue <= cur)
+            cur = 0;
+        B[i] = (float)(i+1);
+        clEnqueueWriteBuffer(command_queue[cur], memB[i], CL_FALSE, 0,
+                             sizeof(float), &B[i], 0, NULL, NULL);
+    }
 
-    print_data(data, count, "before");
+    cur = 0;
 
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_count);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_in);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobj_out);
+    // Asynchronous launch kernel
+    for (int i = 0;i<num_exec;i++,cur++){
+        if (num_command_queue <= cur)
+            cur = 0;
+        ret = clSetKernelArg(multiply, 0, sizeof(cl_mem), &memA);
+        ret = clSetKernelArg(multiply, 1, sizeof(cl_mem), &memB[i]);
+        ret = clSetKernelArg(multiply, 2, sizeof(cl_mem), &memOut[i]);
+        ret = clEnqueueNDRangeKernel(command_queue[cur], multiply, 1, NULL,
+                                     &num_work, NULL, 0, NULL, NULL);
+    }
 
+    cur = 0;
 
-    cl_event ev = NULL;
-    ret = clEnqueueTask(command_queue, kernel, 0, NULL, &ev);
+    // Asynchronous data transfer(device to host)
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_command_queue <= cur)
+            cur = 0;
+        clEnqueueReadBuffer(command_queue[cur], memOut[i], CL_FALSE, 0,
+                            WORKS*sizeof(float), result[i], 0, NULL, NULL);
+    }
 
-    //メモリバッファから結果を取得
-    ret = clEnqueueReadBuffer(command_queue, memobj_out, CL_TRUE, 0,
-                              sizeof(int)*count, output_data, 1, &ev, NULL);
+    // wait for command_queue
+    for (int i=0;i<num_command_queue;i++)
+        clFlush(command_queue[i]);
+    
+    printf("%06f\n",getTime()-start);
 
-    print_data(output_data, count, "after");
+    //  for (int i=0;i<num_exec;i++) {
+    // //     //print_data(result[i],WORKS,"hoge");
+    //      check_data(A,(float)(i+1),result[i]);
+    //     }
 
-    clReleaseKernel(kernel);
+    // memory release
+    clReleaseMemObject(memA);
+    for (int i=0;i<num_exec;i++) {
+        clReleaseMemObject(memB[i]);
+        clReleaseMemObject(memOut[i]);
+    }
+    for (int i=0;i<num_command_queue;i++)
+        clReleaseCommandQueue(command_queue[i]);
+    clReleaseKernel(multiply);
     clReleaseProgram(program);
-    clReleaseMemObject(data_count);
-    clReleaseMemObject(memobj_in);
-    clReleaseMemObject(memobj_out);
-    clReleaseCommandQueue(command_queue);
     clReleaseContext(context);
 
-    delete [] kernel_src_str;
-    delete [] data;
-    delete [] output_data;
+    delete[] A;
+    delete[] B;
+    for (int i=0;i<num_exec;i++)
+        delete[] result[i];
+    delete[] result;
 
     return 0;
 }
--- a/example/word_count/main.cc	Thu Feb 13 14:57:04 2014 +0900
+++ b/example/word_count/main.cc	Tue Feb 25 13:43:25 2014 +0900
@@ -63,7 +63,7 @@
     st_mmap_t st_mmap;
     struct stat sb;
 
-    if ((fd=open(filename,O_RDONLY,0666))==0) {
+    if ((fd=open(filename,O_RDWR,0666))==0) {
         fprintf(stderr,"can't open %s\n",filename);
     }
 
@@ -78,7 +78,7 @@
 
     printf("fix 4096byte file size %d\n",(int)st_mmap.size);
     
-    st_mmap.file_mmap = (char*)mmap(NULL,st_mmap.size,PROT_READ,map,fd,(off_t)0);
+    st_mmap.file_mmap = (char*)mmap(NULL,st_mmap.size,PROT_READ|PROT_WRITE,map,fd,(off_t)0);
     if (st_mmap.file_mmap == (caddr_t)-1) {
         fprintf(stderr,"Can't mmap file\n");
         perror(NULL);
@@ -294,7 +294,7 @@
     t_print = manager->create_task(TASK_PRINT,
                                    (memaddr)&w->self,sizeof(memaddr),0,0);
     w->t_print = t_print;    
-    for(int i=0;i<1;i++) {
+    for(int i=0;i<4;i++) {
         /* Task を task_blocks ずつ起動する Task */
         /* serialize されていると仮定する... */
         HTaskPtr t_exec = manager->create_task(RUN_TASK_BLOCKS,