changeset 1941:f19885ea776d draft

add wordcount for cuda. fix CudaScheduler. add makefile
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 04 Feb 2014 02:18:07 +0900
parents e8ca9cae59fc
children b5d778f00bf1
files TaskManager/Cuda/CudaScheduler.cc example/multiply/Func.h example/multiply/cuda/multiply.cu example/multiply/gpu/Multi.cl example/multiply/main.cc example/word_count/Makefile example/word_count/Makefile.cuda example/word_count/cuda/Exec.cu example/word_count/cuda/Exec_Data_Parallel.cu example/word_count/main.cc example/word_count/task_init.cc
diffstat 11 files changed, 158 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc	Sun Feb 02 18:34:31 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Tue Feb 04 02:18:07 2014 +0900
@@ -93,7 +93,7 @@
         if (mem[cur-1].event[i] != 0)
             cuEventDestroy(mem[cur-1].event[i]);
         mem[cur-1].event[i] = 0;
-        if (mem[1-cur].memout[i])
+        if (mem[cur-1].memout[i])
             cuMemFree(mem[cur-1].memout[i]);
         mem[cur-1].memout[i] = 0;
     }
@@ -107,7 +107,7 @@
         
     } else if (kernel_event[cur-1] != NULL){
         ret = cuEventSynchronize(kernel_event[cur-1]);
-
+        
         if (ret!=0) {
             error(convert_error_status(ret));
         }
@@ -299,16 +299,19 @@
                 } else {
                     wait_for_event(kernel_event, cudabuffer, tasklist, cur);
                 }
-                cur += 1;
+                cur++;
                 if (STAGE <= cur)
                     cur = 0;
                 free(kernelParams);
-                cuModuleUnload(module);
             }
             reply = (memaddr)tasklist->waiter;
             param_addr = (memaddr)tasklist->next;
         }
-        wait_for_event(kernel_event, cudabuffer, tasklist, cur);
+        if (cur == 0) {
+            wait_for_event(kernel_event, cudabuffer, tasklist, STAGE);
+        } else {
+            wait_for_event(kernel_event, cudabuffer, tasklist, cur);
+        }
         for (int i = 0; i<STAGE; i++) {
             ret = cuStreamSynchronize(cudabuffer[i].stream);
             if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
--- a/example/multiply/Func.h	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/multiply/Func.h	Tue Feb 04 02:18:07 2014 +0900
@@ -3,4 +3,4 @@
     MULTIPLY_TASK,
 };
 
-#define DATA_NUM 1000
+#define DATA_NUM 60000
--- a/example/multiply/cuda/multiply.cu	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/multiply/cuda/multiply.cu	Tue Feb 04 02:18:07 2014 +0900
@@ -1,6 +1,8 @@
 extern "C" {
-    __global__ void multi(void* params, float* A, float* B, float* C) {
-        int id = blockIdx.x * blockDim.x + threadIdx.x;
-        C[id]=A[id]*B[id];
+    __global__ void multi(long* params, float* A, float* B, float* C) {
+        //int id = blockIdx.x * blockDim.x + threadIdx.x;
+        long length = params[0];
+        for (int id = 0; id < length; id++) 
+            C[id]=A[id]*B[id];
     }
 }
--- a/example/multiply/gpu/Multi.cl	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/multiply/gpu/Multi.cl	Tue Feb 04 02:18:07 2014 +0900
@@ -2,9 +2,7 @@
 multi(__global const long *params, __global const float* A, __global const float* B, __global float* C)
 {
     //    int i=get_global_id(0);
-    long length = (long)params[0];
     long id = get_global_id(0);
     //    for(int i=0;i<length;i++) {
-    if(length)
-        C[id]=A[id]*B[id];
+    C[id]=A[id]*B[id];
 }
--- a/example/multiply/main.cc	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/multiply/main.cc	Tue Feb 04 02:18:07 2014 +0900
@@ -105,7 +105,7 @@
     multiply->set_outData(0,(memaddr)C, sizeof(float)*length);
     multiply->set_param(0,(long)length);
     // param 0に0~length-1をsetしたtaskをlength個spawnする
-    multiply->iterate(length); 
+    //multiply->iterate(length); 
     
     // hoge = manager->create_task(MULTIPLY_TASK);
     // hoge->set_cpu(spe_cpu);
@@ -116,7 +116,7 @@
     // hoge->set_param(0,(long)0);
     // hoge->wait_for(multiply);
     // hoge->iterate(length);
-    //        multiply->spawn();
+    multiply->spawn();
     //    }
 }
 
--- a/example/word_count/Makefile	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/word_count/Makefile	Tue Feb 04 02:18:07 2014 +0900
@@ -23,9 +23,14 @@
 	@echo "Make for OpenCL"
 	@$(MAKE) -f Makefile.gpu test
 
+cuda: FORCE
+	@echo "Make for Cuda"
+	@$(MAKE) -f Makefile.cuda
+
 FORCE:
 
 clean:
 	@$(MAKE) -f Makefile.macosx clean
 	@$(MAKE) -f Makefile.linux clean
 	@$(MAKE) -f Makefile.cell clean
+	@$(MAKE) -f Makefile.cuda clean
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/word_count/Makefile.cuda	Tue Feb 04 02:18:07 2014 +0900
@@ -0,0 +1,52 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = ppe
+CUDA_TASK_DIR = cuda
+
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP)) $(wildcard $(CUDA_TASK_DIR)/*.cc)
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+CUDA_SRCS_TMP = $(wildcard $(CUDA_TASK_DIR)/*.cu)
+CUDA_SRCS_EXCLUDE = # 除外するファイルを書く
+CUDA_SRCS = $(filter-out $(CUDA_TASK_DIR)/$(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP))
+CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx)
+
+CFLAGS += -D__CERIUM_CUDA__
+LIBS += `sdl-config --libs` -lCudaManager -F/Library/Frameworks -framework CUDA
+
+INCLUDE += -I/Developer/NVIDIA/CUDA-5.5/include
+
+NVCC = /Developer/NVIDIA/CUDA-5.5/bin/nvcc
+NVCCFLAGS = -ptx -arch=sm_20
+INDEX = 0
+
+.SUFFIXES: .cc .o .cu .ptx
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+.cu.ptx:
+	$(NVCC) $(NVCCFLAGS) $< -o $@
+
+all: $(TARGET) $(CUDA_OBJS)
+
+$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo ppu-gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	rm -f *~ \#*
+	rm -f cuda/*~ cuda/\#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/word_count/cuda/Exec.cu	Tue Feb 04 02:18:07 2014 +0900
@@ -0,0 +1,34 @@
+extern "C" {
+    __global__ void
+    wordcount(long *param,
+              char *i_data,
+              unsigned long *o_data)
+    {
+        unsigned long *head_tail_flag = o_data+2;
+        long length = param[0];
+        long word_flag = 0;
+        long word_num = 0;
+        long line_num = 0;
+        long i = 0;
+        
+        head_tail_flag[0] = (i_data[0] != 0x20) && (i_data[0] != 0x0A);
+        word_num -= 1-head_tail_flag[0];
+        for (; i < length; i++) {
+            if (i_data[i] == 0x20) {
+                word_flag = 1;
+            } else if (i_data[i] == 0x0A) {
+                line_num += 1;
+                word_flag = 1;
+            } else {
+                word_num += word_flag;
+                word_flag = 0;
+            }
+        }
+        
+        word_num += word_flag;
+        head_tail_flag[1] = (i_data[i-1] != 0x20) && (i_data[i-1] != 0x0A);
+        
+        o_data[0] = (unsigned long)word_num;
+        o_data[1] = (unsigned long)line_num;
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/word_count/cuda/Exec_Data_Parallel.cu	Tue Feb 04 02:18:07 2014 +0900
@@ -0,0 +1,41 @@
+extern "C" {
+    __global__ void
+    wordcount_parallel(long *param,
+                       char *rbuf,
+                       unsigned long *wbuf)
+    {
+        long task_spwaned = param[0];
+        long division_size = param[1];
+        long length = param[2];
+        long out_size = param[3];
+        int allocation = (int)task_spwaned + (blockIdx.x * blockDim.x + threadIdx.x);
+        char *i_data = rbuf + allocation*division_size;
+        unsigned long *o_data = wbuf + allocation*out_size;
+        unsigned long *head_tail_flag = o_data+2;
+        long word_flag = 0;
+        long word_num = 0;
+        long line_num = 0;
+        long i = 0;
+        
+        head_tail_flag[0] = (i_data[0] != 0x20) && (i_data[0] != 0x0A);
+        word_num -= 1-head_tail_flag[0];
+        
+        for (; i < length; i++) {
+            if (i_data[i] == 0x20) {
+                word_flag = 1;
+            } else if (i_data[i] == 0x0A) {
+                line_num += 1;
+                word_flag = 1;
+            } else {
+                word_num += word_flag;
+                word_flag = 0;
+            }
+        }
+        
+        word_num += word_flag;
+        head_tail_flag[1] = (i_data[i-1] != 0x20) && (i_data[i-1] != 0x0A);
+        
+        o_data[0] = (unsigned long)word_num;
+        o_data[1] = (unsigned long)line_num;
+    }
+}
--- a/example/word_count/main.cc	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/word_count/main.cc	Tue Feb 04 02:18:07 2014 +0900
@@ -339,7 +339,7 @@
             array_task_num = atoi(argv[i+1]);
             i++;
         } else if (strcmp(argv[i], "-g") == 0) {
-            spe_cpu = GPU_ANY;
+            spe_cpu = GPU_0;
         } else if (strcmp(argv[i], "-any") == 0) {
             spe_cpu = ANY_ANY;
         } else if (strcmp(argv[i], "-i") == 0) {
--- a/example/word_count/task_init.cc	Sun Feb 02 18:34:31 2014 +0900
+++ b/example/word_count/task_init.cc	Tue Feb 04 02:18:07 2014 +0900
@@ -3,6 +3,9 @@
 #ifdef __CERIUM_GPU__
 #include "GpuScheduler.h"
 #endif
+#ifdef __CERIUM_CUDA__
+#include "CudaScheduler.h"
+#endif
 
 /* 必ずこの位置に書いて */
 SchedExternTask(Exec);
@@ -22,6 +25,11 @@
     GpuSchedRegister(TASK_EXEC, "gpu/Exec.cl", "wordcount");
     GpuSchedRegister(TASK_EXEC_DATA_PARALLEL, "gpu/Exec_Data_Parallel.cl","wordcount_parallel");
 #endif
+#ifdef __CERIUM_CUDA__
+    CudaSchedRegister(TASK_EXEC, "cuda/Exec.ptx", "wordcount");
+    CudaSchedRegister(TASK_EXEC_DATA_PARALLEL, "cuda/Exec_Data_Parallel.ptx","wordcount_parallel");
+#endif
+    
     SchedRegisterTask(TASK_EXEC, Exec);
     SchedRegisterTask(TASK_EXEC_DATA_PARALLEL, Exec_Data_Parallel);