# HG changeset patch
# User yutaka@henri.cr.ie.u-ryukyu.ac.jp
# Date 1265798007 -32400
# Node ID dc26593f8c4013478695f3363692b07135a442ad
# Parent  987d4cced27974855e218e589da74e3635f1cf40
simd

diff -r 987d4cced279 -r dc26593f8c40 Renderer/Engine/SceneGraphRoot.cc
--- a/Renderer/Engine/SceneGraphRoot.cc	Sun Feb 07 17:52:52 2010 +0900
+++ b/Renderer/Engine/SceneGraphRoot.cc	Wed Feb 10 19:33:27 2010 +0900
@@ -382,6 +382,10 @@
       light_vector[i*4] /= light_vector[i*4+2];
       light_vector[i*4+1] /= light_vector[i*4+2];
 
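+      /* For the SIMD lighting path: store z and w negated (presumably consumed by the spu_sub in DrawSpan.cc; confirm) */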
+      light_vector[i*4+2] *= -1;
+      light_vector[i*4+3] *= -1;
+
     }
 
    
diff -r 987d4cced279 -r dc26593f8c40 Renderer/Engine/spe/DrawSpan.cc
--- a/Renderer/Engine/spe/DrawSpan.cc	Sun Feb 07 17:52:52 2010 +0900
+++ b/Renderer/Engine/spe/DrawSpan.cc	Wed Feb 10 19:33:27 2010 +0900
@@ -10,6 +10,7 @@
 #include "SchedTask.h"
 #include "Tapestry.h"
 #include "SpanPack.h"
+#include <spu_intrinsics.h>
 #include <math.h>
 
 #if (__LITTLE_ENDIAN__)
@@ -57,27 +58,39 @@
 
 
 void
-normalize(float *v0, float *v1)
+normalize(vector float *v0, vector float *v1)
 {
-    float norm, dnorm;
+    float norm;
+    float ret[4] __attribute__((aligned(16)));
+    vector float *vret = (vector float *) ret;
+    *vret = spu_mul(*v0,*v1);
 
-    norm = sqrt(v1[0]*v1[0] + v1[1]*v1[1] + v1[2]*v1[2]);
-    if (norm > 0) {
-	dnorm = 1.0/norm;
-	v0[0] = v1[0]*dnorm;
-	v0[1] = v1[1]*dnorm;
-	v0[2] = v1[2]*dnorm;
-	v0[3] = v1[3]*dnorm;
-    }
+    norm = (ret[0] + ret[1] + ret[2]);
+    
+    *vret = (vector float)spu_splats(norm);
+    *vret = spu_rsqrte(*vret);
+    *v0 = spu_mul(*v1,*vret);
 }
 
 static float
-innerProduct1(float *v0, float *v1)
+innerProduct1(vector float *v0, vector float *v1)
 {
-    return (v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2]);
+
+    float ret[4] __attribute__((aligned(16)));
+    float inner;
+    vector float *vret = (vector float *) ret;
+    *vret = spu_mul(*v0,*v1);
+    
+    inner = (ret[0] + ret[1] + ret[2]);
+    if (inner < 0) {
+      inner = 0;
+    }
+    
+    return inner;
 }
 
 
+
 /**
  * テクスチャは、TEXTURE_SPLIT_PIXEL^2 のブロックに分割する
  *
@@ -403,17 +416,8 @@
 		    int world_x, int world_y, float world_z)
 
 {
+
     unsigned char rgb[4];
-    int light_rgb;
-    int flag;
-    float normal_vector[4] = {normal_x,normal_y,normal_z,0};
-    // 光のベクトル,きめうちしちゃった。どうにかする
-    //float light_vector[4] = {0,0,-1,0};
-    float light_vector[4];
-    float inner_product;
-    float *light_xyz = (float*)smanager->global_get(Light);
-
-    normalize(normal_vector, normal_vector);
 
     // 引数で受け取った color の rgb 情報の抜き出し
 #if LITTLEENDIAN
@@ -428,52 +432,58 @@
     rgb[0] = (color & 0x000000ff);
 #endif
 
-    int tmp_rgb[3] = {0,0,0};
+
+    vector float v_rgb __attribute__((aligned(16))) = {(float)rgb[0],(float)rgb[1],(float)rgb[2],0};
+    int light_rgb;
+    float normal_vector[4] __attribute__((aligned(16))) = {normal_x,normal_y,normal_z,0};
+    vector float *vnormal_vector = (vector float *) normal_vector;
+    float light_vector[4];
+    vector float *vlight_vector = (vector float *) light_vector;
+    float inner_product;
+    float *light_xyz = (float*)smanager->global_get(Light);
+    vector float *vlight_xyz = (vector float *) light_xyz;
+    vector float v_inner __attribute__((aligned(16)));
+
+    vector float v_world[4] __attribute__((aligned(16))) = {{world_x, world_y, -world_z, 0},
+                                                            {world_x, world_y, -world_z, 0},
+                                                            {world_x, world_y, -world_z, 0},
+                                                            {0,       0,        0,       0}};
+
+
+    normalize(vnormal_vector, vnormal_vector);
+
+
+    float tmp_rgb[4] __attribute__((aligned(16))) = {0,0,0,0};
+    vector float *vtmp_rgb = (vector float *) tmp_rgb;
+
     int light_num = 4;
+
     for (int i = 0; i < light_num; i++) {
 
-      //printf("light_xyz[%d] %f\n",i*4,light_xyz[i*4]);
-      //printf("light_xyz[%d] %f\n",i*4+1,light_xyz[i*4+1]);
-      //printf("light_xyz[%d] %f\n",i*4+2,light_xyz[i*4+2]);
-      //printf("light_xyz[%d] %f\n",i*4+3fg,light_xyz[i*4+3]);
-
-      light_vector[0] = world_x - light_xyz[i*4];
-      light_vector[1] = world_y - light_xyz[i*4+1];
-      light_vector[2] = light_xyz[i*4+2] - world_z;
-      light_vector[3] = light_xyz[i*4+3];
-
-      normalize(light_vector, light_vector);
-
-      // 法線ベクトルと光源ベクトルとの内積をとる
-      inner_product = innerProduct1(normal_vector,light_vector);
-
-      //printf("inner_product %f\n",inner_product);
-
-      // 内積がマイナスの場合は色がない。
-      flag = (inner_product > 0);
-
-      // 内積を rgb にかけていく
-      tmp_rgb[0] += (unsigned char)(rgb[0]*inner_product*flag);
-      tmp_rgb[1] += (unsigned char)(rgb[1]*inner_product*flag);
-      tmp_rgb[2] += (unsigned char)(rgb[2]*inner_product*flag);
+      *vlight_vector = spu_sub(v_world[i],vlight_xyz[i]);
+      normalize(vlight_vector, vlight_vector);
+      inner_product = innerProduct1(vnormal_vector,vlight_vector);
+      v_inner = spu_splats(inner_product);
+      *vtmp_rgb = spu_madd(v_rgb,v_inner,*vtmp_rgb);
 
     }
 
-    int rgb_flag[3];
-    for (int i = 0; i < 3; i++) {
-      rgb_flag[i] = (tmp_rgb[i] > 255);
-    }
+    vector unsigned int v_flag __attribute__((aligned(16)));
+    vector float max_rgb __attribute__((aligned(16))) = (vector float)spu_splats((float)255);
 
-    rgb[0] = tmp_rgb[0]*(1 - rgb_flag[0]) + 255*(rgb_flag[0]);
-    rgb[1] = tmp_rgb[1]*(1 - rgb_flag[1]) + 255*(rgb_flag[1]);
-    rgb[2] = tmp_rgb[2]*(1 - rgb_flag[2]) + 255*(rgb_flag[2]);
+    v_flag = spu_cmpgt(max_rgb,*vtmp_rgb);
+    *vtmp_rgb = spu_sel(max_rgb,*vtmp_rgb,v_flag);
+    
+    vector unsigned int vlast_rgb __attribute__((aligned(16)));
+    vlast_rgb = spu_convtu(*vtmp_rgb,0); 
+    unsigned int *last_rgb = (unsigned int*) &vlast_rgb;
 
 
     //計算した rgb を light_rgb にまとめる。
 #if LITTLEENDIAN
-    light_rgb = (rgb[0] << 24) + (rgb[1] << 16) + (rgb[2] << 8) + (rgb[3]);
+    light_rgb = (last_rgb[0] << 24) + (last_rgb[1] << 16) + (last_rgb[2] << 8) + (last_rgb[3]);
 #else
-    light_rgb = (rgb[3] << 24) + (rgb[2] << 16) + (rgb[1] << 8) + (rgb[0]);
+    light_rgb = (last_rgb[3] << 24) + (last_rgb[2] << 16) + (last_rgb[1] << 8) + (last_rgb[0]);
 #endif
 
     return light_rgb;