changeset 1960:273de551f726 draft

use multiple command_queue
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 11 Feb 2014 16:28:22 +0900
parents 6d343611bb03
children 7d1afa7aeccd
files TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaTaskManagerFactory.cc TaskManager/Gpu/GpuScheduler.cc TaskManager/Gpu/GpuScheduler.h TaskManager/Makefile.def example/flip/Makefile example/flip/Makefile.def example/flip/main.cc example/flip/task_init.cc example/flip/twice.cl example/many_task/Makefile example/many_task/ppe/task_init.cc example/multiply/cuda/multiply.cu example/multiply/main.cc example/word_count/Makefile.cuda example/word_count/main.cc
diffstat 16 files changed, 148 insertions(+), 114 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -84,21 +84,21 @@
 
 static void
 release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) {
-    for (int i=0; i<mem[cur-1].in_size; i++) {
-        if (mem[cur-1].memin[i])
-            cuMemFree(mem[cur-1].memin[i]);
-        mem[cur-1].memin[i] = 0;
+    for (int i=0; i<mem[cur].in_size; i++) {
+        if (mem[cur].memin[i])
+            cuMemFree(mem[cur].memin[i]);
+        mem[cur].memin[i] = 0;
     }
-    for (int i=0; i<mem[cur-1].out_size; i++) {
-        if (mem[cur-1].event[i] != 0)
-            cuEventDestroy(mem[cur-1].event[i]);
-        mem[cur-1].event[i] = 0;
-        if (mem[cur-1].memout[i])
-            cuMemFree(mem[cur-1].memout[i]);
-        mem[cur-1].memout[i] = 0;
+    for (int i=0; i<mem[cur].out_size; i++) {
+        if (mem[cur].event[i] != 0)
+            cuEventDestroy(mem[cur].event[i]);
+        mem[cur].event[i] = 0;
+        if (mem[cur].memout[i])
+            cuMemFree(mem[cur].memout[i]);
+        mem[cur].memout[i] = 0;
     }
-    mem[cur-1].in_size = 0;
-    mem[cur-1].out_size = 0;
+    mem[cur].in_size = 0;
+    mem[cur].out_size = 0;
 }
 
 void
@@ -128,12 +128,12 @@
                 if (ret!=0) error(convert_error_status(ret));
             }
         }
-        release_buf_event(cur, cudabuffer);
+        release_buf_event(cur-1, cudabuffer);
     }
 
     if(reply) {
         connector->mail_write(reply);
-        __debug(this, "CUDA %d %s\t%lld\n", taskList->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time);
+        __debug(this, "CUDA %d %s\t%lld\n", taskList->self->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time);
         reply = 0;
     }
 }
@@ -145,7 +145,7 @@
         cuEventDestroy(kernel_event[cur]);
     kernel_event[cur] = NOP_REPLY;
     kernel[cur] = 0;
-    release_buf_event(cur+1, cudabuffer);
+    release_buf_event(cur, cudabuffer);
 
     wait_for_event(kernel_event, cudabuffer, taskList, cur);
 }
@@ -291,7 +291,7 @@
                     ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream);
                     if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 }
-                // wait kernel[1-cur] and write[1-cur]
+                // wait kernel[cur] and write[cur]
                 // pipeline    : cur
                 // to stop pipeline set cur+1
                 if (cur == 0) {
--- a/TaskManager/Cuda/CudaTaskManagerFactory.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Cuda/CudaTaskManagerFactory.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -1,4 +1,3 @@
-#define DEBUG
 #include "CellTaskManagerImpl.h"
 #include "CudaThreads.h"
 #include "CpuThreads.h"
--- a/TaskManager/Gpu/GpuScheduler.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -36,17 +36,20 @@
         exit(EXIT_FAILURE);
     }
     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
-    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
-    if (ret<0) {
-        const char *msg=convert_error_status(ret);
-        error(msg);
+    for (int i=0;i<STAGE;i++) {
+        command_queue[i] = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
+        if (ret<0) {
+            const char *msg=convert_error_status(ret);
+            error(msg);
+        }
     }
 
 }
 
 GpuScheduler::~GpuScheduler()
 {
-    clReleaseCommandQueue(command_queue);
+    for (int i=0;i<STAGE;i++)
+        clReleaseCommandQueue(command_queue[i]);
     clReleaseContext(context);
 }
 
@@ -85,15 +88,15 @@
 
 static void
 release_buf_event(int cur, GpuScheduler::GpuBufferPtr memout) {
-    for (int i=0; i < memout[1-cur].size; i++) {
-        if (memout[1-cur].event[i] != 0)
-            clReleaseEvent(memout[1-cur].event[i]);
-        memout[1-cur].event[i] = 0;
-        if (memout[1-cur].buf[i] != 0)
-            clReleaseMemObject(memout[1-cur].buf[i]);
-        memout[1-cur].buf[i]   = 0;
+    for (int i=0; i < memout[cur].size; i++) {
+        if (memout[cur].event[i] != 0)
+            clReleaseEvent(memout[cur].event[i]);
+        memout[cur].event[i] = 0;
+        if (memout[cur].buf[i] != 0)
+            clReleaseMemObject(memout[cur].buf[i]);
+        memout[cur].buf[i]   = 0;
     }
-    memout[1-cur].size = 0;
+    memout[cur].size = 0;
 }
 
 /**
@@ -102,10 +105,10 @@
  */
 void
 GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, TaskListPtr taskList, int cur) {
-    if (kernel_event[1-cur] == NOP_REPLY) {
+    if (kernel_event[cur-1] == NOP_REPLY) {
         
-    } else if (kernel_event[1-cur] != NULL) {
-        int ret=clWaitForEvents(1,&kernel_event[1-cur]);
+    } else if (kernel_event[cur-1] != NULL) {
+        int ret=clWaitForEvents(1,&kernel_event[cur-1]);
 
         if (ret<0) {
             error(convert_error_status(ret));
@@ -113,25 +116,25 @@
         if (taskList!=NULL){
             cl_ulong start = 0;
             cl_ulong end   = 0;
-            clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
-            clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+            clGetEventProfilingInfo(kernel_event[cur-1],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+            clGetEventProfilingInfo(kernel_event[cur-1],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
             if (taskList->task_start_time == 0)
                 taskList->task_start_time = start;
             taskList->task_end_time   = end;
         }
-        clReleaseEvent(kernel_event[1-cur]);
-        kernel_event[1-cur] = 0;
+        clReleaseEvent(kernel_event[cur-1]);
+        kernel_event[cur-1] = 0;
 
     }
     
-    if (memout[1-cur].size > 0) {
-        int ret=clWaitForEvents(memout[1-cur].size, memout[1-cur].event);
+    if (memout[cur-1].size > 0) {
+        int ret=clWaitForEvents(memout[cur-1].size, memout[cur-1].event);
         if (ret<0) error(convert_error_status(ret));
-        release_buf_event(cur,memout);
+        release_buf_event(cur-1,memout);
     }
 
-    if (memin[1-cur].size > 0) {
-        release_buf_event(cur,memin);
+    if (memin[cur-1].size > 0) {
+        release_buf_event(cur-1,memin);
     }
     if(reply) {
         connector->mail_write(reply);
@@ -150,8 +153,8 @@
     if (kernel[cur] != 0)
         clReleaseKernel(kernel[cur]);
     kernel[cur] = 0;
-    release_buf_event(1-cur,memout);
-    release_buf_event(1-cur,memin);
+    release_buf_event(cur,memout);
+    release_buf_event(cur,memin);
 
     // wait kernel[1-cur] and write[1-cur]
     wait_for_event(kernel_event, memout, tasklist, cur);
@@ -172,9 +175,13 @@
     int cur = 0;
     TaskListPtr tasklist = NULL;
     reply = 0;
-    initGpuBuffer(&memin[0]);initGpuBuffer(&memin[1]);
-    initGpuBuffer(&memout[0]);initGpuBuffer(&memout[1]);
-    memset(&flag, 0, sizeof(HTask::htask_flag)*2);
+    for (int i=0;i<STAGE;i++) {
+        initGpuBuffer(&memin[i]);
+        initGpuBuffer(&memout[i]);
+        kernel[i]=0;
+        kernel_event[i]=0;
+    }
+    memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE);
     
     for (;;) {
         memaddr params_addr = connector->task_list_mail_read();
@@ -182,11 +189,12 @@
 
         if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
             // wait_for_envet was called, so all kernel,buf,event have been released.
-            clFinish(command_queue);
-            destroyGpuBuffer(&memout[1-cur]);
-            destroyGpuBuffer(&memout[cur]);
-            destroyGpuBuffer(&memin[cur]);
-            destroyGpuBuffer(&memin[1-cur]);
+            for (int i=0;i<STAGE;i++) 
+                clFinish(command_queue[i]);
+            for (int i=0;i<STAGE;i++) {
+                destroyGpuBuffer(&memin[i]);
+                destroyGpuBuffer(&memout[i]);
+            }
             return ;
         }
 
@@ -194,7 +202,7 @@
         while (params_addr) {
             // since we are on the same memory space, we don't has to use dma_load here
             tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
-                                                                    sizeof(TaskList), DMA_READ_TASKLIST);
+                                                        sizeof(TaskList), DMA_READ_TASKLIST);
             //            tasklist[cur]->task_start_time = gettime();
             tasklist->task_start_time = 0;
             /*
@@ -204,7 +212,7 @@
             if (tasklist->self) {
                 flag[cur] = tasklist->self->flag;
             } else {
-                 memset(&flag[cur], 0, sizeof(HTask::htask_flag));
+                memset(&flag[cur], 0, sizeof(HTask::htask_flag));
             }
             for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
                 if(nextTask->command==ShowTime) {
@@ -223,14 +231,14 @@
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
                 int param = 0;
-
+                
                 // set arg count
                 cl_mem memparam = createBuffer(&memin[cur], 0, context, CL_MEM_READ_ONLY,
                                                sizeof(memaddr)*nextTask->param_count, &ret);
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-
+                
                 // parameter is passed as first kernel arg 
-                ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
+                ret = clEnqueueWriteBuffer(command_queue[cur], memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
                                            nextTask->param(0), 0, NULL, &memin[cur].event[0]);
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 
@@ -246,7 +254,7 @@
                     if (input_buf->size==0) break;
                     createBuffer(&memin[cur], param, context, mem_flag, input_buf->size, &ret);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-                    ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[param], CL_TRUE, 0,
+                    ret = clEnqueueWriteBuffer(command_queue[cur], memin[cur].buf[param], CL_TRUE, 0,
                                                input_buf->size, input_buf->addr, 0, 
                                                NULL, &memin[cur].event[param]);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
@@ -272,10 +280,10 @@
                 memout[cur].size = param - memin[cur].size;  // no buffer on flip, but flip use memout event
 
                 if (tasklist->dim > 0) {
-                    ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim,
+                    ret = clEnqueueNDRangeKernel(command_queue[cur], kernel[cur], tasklist->dim,
                                                  NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
                 } else {
-                    ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size,
+                    ret = clEnqueueTask(command_queue[cur], kernel[cur], memin[cur].size,
                                         memin[cur].event, &kernel_event[cur]);
                 }
                 if (ret<0) { gpuTaskError(cur, tasklist, ret); continue; }
@@ -286,21 +294,30 @@
                     GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
                     int i0 = flag[cur].flip ? i+1 : i ;
                     // flip use memin buffer and memout event
-                    ret = clEnqueueReadBuffer(command_queue, mem[cur].buf[i0], CL_FALSE, 0,
+                    ret = clEnqueueReadBuffer(command_queue[cur], mem[cur].buf[i0], CL_FALSE, 0,
                                               output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 }
                 // wait kernel[1-cur] and write[1-cur]
                 // pipeline    : cur
                 // to stop pipeline set 1-cur
-                wait_for_event(kernel_event, memout, tasklist, cur);
-                cur = 1 - cur;
+                if (cur == 0) {
+                    wait_for_event(kernel_event, memout, tasklist, STAGE);
+                } else {
+                    wait_for_event(kernel_event, memout, tasklist, cur);
+                }
+                cur++;
+                if (STAGE <= cur)
+                    cur = 0;
             }
             reply = (memaddr)tasklist->waiter;
             params_addr = (memaddr)tasklist->next;
         }
-        wait_for_event(kernel_event, memout, tasklist, cur);
-
+        if (cur == 0) {
+            wait_for_event(kernel_event, memout, tasklist, STAGE);
+        } else {
+            wait_for_event(kernel_event, memout, tasklist, cur);
+        }
         unsigned long long wait = 0;
         (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
         connector->mail_write((memaddr)MY_SPE_STATUS_READY);
--- a/TaskManager/Gpu/GpuScheduler.h	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.h	Tue Feb 11 16:28:22 2014 +0900
@@ -15,6 +15,8 @@
 
 extern TaskObject gpu_task_list[MAX_TASK_OBJECT];
 
+#define STAGE 8
+
 class GpuScheduler : public MainScheduler {
  public:
     typedef struct gpubuffer {
@@ -34,14 +36,14 @@
     cl_uint ret_num_platforms;
     cl_uint ret_num_devices;
     cl_context context;
-    cl_command_queue command_queue;
+    cl_command_queue command_queue[STAGE];
     cl_int ret;
     memaddr reply;
-    cl_kernel kernel[2];
-    cl_event kernel_event[2];
-    GpuBuffer memin[2];
-    GpuBuffer memout[2];
-    HTask::htask_flag flag[2];
+    cl_kernel kernel[STAGE];
+    cl_event kernel_event[STAGE];
+    GpuBuffer memin[STAGE];
+    GpuBuffer memout[STAGE];
+    HTask::htask_flag flag[STAGE];
 private:
     int load_kernel(int cmd);
     cl_mem createBuffer(GpuBufferPtr m, int i, cl_context context, cl_mem_flags flags, size_t size, cl_int *error);
--- a/TaskManager/Makefile.def	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Makefile.def	Tue Feb 11 16:28:22 2014 +0900
@@ -48,7 +48,7 @@
 
 CC   = clang++
 CXX     = clang++
-CFLAGS = -Wall `sdl-config --cflags` -m$(ABIBIT)   $(OPT)
+CFLAGS = -Wall `sdl-config --cflags` -m$(ABIBIT)   $(OPT) #-DDEBUG
 CXXFLAGS = $(CFLAGS)
 LIBS   =  -m$(ABIBIT)
 
--- a/example/flip/Makefile	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/Makefile	Tue Feb 11 16:28:22 2014 +0900
@@ -1,29 +1,15 @@
-include ./Makefile.def
+default: gpu
 
-SRCS_TMP = $(wildcard *.cc)
-SRCS_EXCLUDE =   # 除外するファイルを書く
-SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
-OBJS = $(SRCS:.cc=.o)
-
-LIBS += -lGpuManager -framework opencl `sdl-config --libs`
-
-.SUFFIXES: .cc .o
+gpu: FORCE
+	@echo "Make for OpenCL"
+	@$(MAKE) -f Makefile.gpu
 
-.cc.o:
-	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
-
-all: $(TARGET)
-gpu: all
+cuda: FORCE
+	@echo "Make for CUDA"
+	@$(MAKE) -f Makefile.cuda
 
-$(TARGET): $(OBJS)
-	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
-
-link:
-	$(CC) -o $(TARGET) $(OBJS) $(LIBS)
-
-debug: $(TARGET)
-	sudo gdb ./$(TARGET) 
+FORCE:
 
 clean:
-	rm -f $(TARGET) $(OBJS)
-	rm -f *~ \#*
+	@$(MAKE) -f Makefile.gpu clean
+	@$(MAKE) -f Makefile.cuda clean
--- a/example/flip/Makefile.def	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/Makefile.def	Tue Feb 11 16:28:22 2014 +0900
@@ -7,3 +7,6 @@
 
 INCLUDE = -I${CERIUM}/include/TaskManager -I. -I../..
 LIBS = -L${CERIUM}/TaskManager
+
+NVCC = nvcc
+NVCCFLAGS = -ptx -arch=sm_20
\ No newline at end of file
--- a/example/flip/main.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/main.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -11,8 +11,10 @@
 static long int length = DEFAULT;
 static int task = 1;
 int *indata;
+int *data;
 
 extern void task_init(void);
+void TMend(TaskManager*);
 
 char usr_help_str[] = "GpuRun [length]\n";
 
@@ -72,25 +74,28 @@
 void
 test(TaskManager *manager) {
     indata = new int[length];
-
+    data = new int;
+    
     for (int c=0; c < length ;c++) {
         indata[c] = c;
     }
-
+    *data = 2;
     print_data(indata, length, "before");
-
+    
     HTaskPtr twice = manager->create_task(Twice);
-
+    
     twice->set_param(0, (memaddr)length);
     twice->set_inData(0, indata, sizeof (int)*length);
-    //    twice->set_outData(0, indata, sizeof (int)*length);
+    twice->set_inData(1, data, sizeof(int));
+    twice->set_outData(0, indata, sizeof (int)*length);
+    twice->set_outData(1, data, sizeof(int));
     twice->set_cpu(GPU_0);
     twice->flip();
 
     /*
      * set_post() で ppe task を渡せるようにしたい
      */
-    twice->set_post(twice_result, (void*)indata, (void*)length);
+    //twice->set_post(twice_result, (void*)indata, (void*)length);
 
     twice->spawn();
 }
@@ -107,8 +112,15 @@
     for (int i = 0; i < task; ++i) {
         test(manager);
     }
-
+    
+    manager->set_TMend(TMend);
     return 0;
 }
 
+void
+TMend(TaskManager* manager) {
+    print_data(indata, length, "after");
+    delete[] indata;
+    delete data;
+}
 /* end */
--- a/example/flip/task_init.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/task_init.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -1,8 +1,14 @@
 #include "GpuFunc.h"
 #include "GpuScheduler.h"
+#include "CudaScheduler.h" 
 
 void
 task_init(void)
 {
+#ifdef __CERIUM_GPU__
     GpuSchedRegister(Twice, "twice.cl", "twice");
+#endif
+#ifdef __CERIUM_CUDA__
+    CudaSchedRegister(Twice, "twice.ptx", "twice");
+#endif
 }
--- a/example/flip/twice.cl	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/twice.cl	Tue Feb 11 16:28:22 2014 +0900
@@ -1,12 +1,9 @@
 __kernel void
 twice(__constant int *data_count,
       __global int *input_data)
-      //      __global int *output_data)
 {
     long count = (long)data_count[0];
     for (int i = 0; i<count; i++) {
-        // output_data[i] = 2*input_data[i];
         input_data[i] *= 2;
     }
-
 }
--- a/example/many_task/Makefile	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/many_task/Makefile	Tue Feb 11 16:28:22 2014 +0900
@@ -20,6 +20,10 @@
 	@echo "Make for OpenCL"
 	@$(MAKE) -f Makefile.gpu
 
+cuda: FORCE
+	@echo "Make for Cuda"
+	@$(MAKE) -f Makefile.cuda
+
 gpu-test: FORCE
 	@echo "Make for OpenCL"
 	@$(MAKE) -f Makefile.gpu test
@@ -33,3 +37,5 @@
 	@$(MAKE) -f Makefile.macosx clean
 	@$(MAKE) -f Makefile.linux clean
 	@$(MAKE) -f Makefile.cell clean
+	@$(MAKE) -f Makefile.gpu clean
+	@$(MAKE) -f Makefile.cuda clean
--- a/example/many_task/ppe/task_init.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/many_task/ppe/task_init.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -1,5 +1,9 @@
 #include "Func.h"
 #include "Scheduler.h"
+#ifdef __CERIUM_CUDA__
+#include "CudaScheduler.h"
+#endif
+
 
 SchedExternTask(QuickSort);
 SchedExternTask(SortSimple);
@@ -13,4 +17,8 @@
     SchedRegister(SortSimple);
     SchedRegister(SortCompat);
     SchedRegister(SortTaskArray);
+    
+#ifdef __CERIUM_CUDA__
+    CudaSchedRegister(QUICK_SORT, "cuda/QuickSort.ptx", "quick_sort");
+#endif
 }
--- a/example/multiply/cuda/multiply.cu	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/multiply/cuda/multiply.cu	Tue Feb 11 16:28:22 2014 +0900
@@ -2,7 +2,7 @@
     __global__ void multi(long* params, float* A, float* B, float* C) {
         int id = blockIdx.x * blockDim.x + threadIdx.x;
         long length = params[0];
-        for (int id = 0; id < length; id++) 
-            C[id]=A[id]*B[id];
+        //        for (int id = 0; id < length; id++) 
+        C[id]=A[id]*B[id];
     }
 }
--- a/example/multiply/main.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/multiply/main.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -105,7 +105,7 @@
     multiply->set_outData(0,(memaddr)C, sizeof(float)*length);
     multiply->set_param(0,(long)length);
     // param 0に0~length-1をsetしたtaskをlength個spawnする
-    //multiply->iterate(length); 
+    multiply->iterate(length); 
     
     // hoge = manager->create_task(MULTIPLY_TASK);
     // hoge->set_cpu(spe_cpu);
@@ -116,7 +116,7 @@
     // hoge->set_param(0,(long)0);
     // hoge->wait_for(multiply);
     // hoge->iterate(length);
-    multiply->spawn();
+    //multiply->spawn();
     //    }
 }
 
--- a/example/word_count/Makefile.cuda	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/word_count/Makefile.cuda	Tue Feb 11 16:28:22 2014 +0900
@@ -25,7 +25,6 @@
 
 NVCC = /Developer/NVIDIA/CUDA-5.5/bin/nvcc
 NVCCFLAGS = -ptx -arch=sm_20
-INDEX = 0
 
 .SUFFIXES: .cc .o .cu .ptx
 
--- a/example/word_count/main.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/word_count/main.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -231,7 +231,6 @@
     return 0;
 }
 
-
 static int blocks = 48;
 //static int blocks = 31 * 6 * 24;
 static int division = 16; // in Kbyte
@@ -254,7 +253,7 @@
     w->size = w->file_size = st_mmap.size;
     w->file_mmap = st_mmap.file_mmap;
     printf("w %lx\n",(long)w);
-
+    
     /* 1task分のデータサイズ(byte) */
     if (w->size >= 1024*division) {
         w->division_size = 1024 * division;/*16kbyte*/