Mercurial > hg > Papers > 2013 > yuhi-prosym

--- a/presen/images/glaffle/bench_mark.graffle	Wed Jan 08 16:22:05 2014 +0900
+++ b/presen/images/glaffle/bench_mark.graffle	Wed Jan 08 21:43:29 2014 +0900
@@ -152,7 +152,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 26 ms}</string>
+\f0\fs48 \cf0 26 ns}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -193,7 +193,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 117 ms}</string>
+\f0\fs48 \cf0 3 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -234,7 +234,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 178 ms}</string>
+\f0\fs48 \cf0 4 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -275,7 +275,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0  13.9 ms}</string>
+\f0\fs48 \cf0  5 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -461,7 +461,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 117 ms}</string>
+\f0\fs48 \cf0 2 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -502,7 +502,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 178 ms}</string>
+\f0\fs48 \cf0 2 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -543,7 +543,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0  13.9 ms}</string>
+\f0\fs48 \cf0  5 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -729,7 +729,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 117 ms}</string>
+\f0\fs48 \cf0 8 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -770,7 +770,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qr

-\f0\fs48 \cf0 6.0 ms}</string>
+\f0\fs48 \cf0 3 ms}</string>
 			</dict>
 		</dict>
 		<dict>
@@ -1452,7 +1452,7 @@
 	<key>MasterSheets</key>
 	<array/>
 	<key>ModificationDate</key>
-	<string>2014-01-07 07:22:18 +0000</string>
+	<string>2014-01-08 07:27:46 +0000</string>
 	<key>Modifier</key>
 	<string>yuhi</string>
 	<key>NotesVisible</key>
@@ -1533,7 +1533,7 @@
 			</dict>
 		</array>
 		<key>Frame</key>
-		<string>{{861, 256}, {693, 922}}</string>
+		<string>{{876, 256}, {693, 922}}</string>
 		<key>ListView</key>
 		<true/>
 		<key>OutlineWidth</key>
@@ -1547,7 +1547,7 @@
 		<key>SidebarWidth</key>
 		<integer>120</integer>
 		<key>VisibleRegion</key>
-		<string>{{158, 7}, {558, 767}}</string>
+		<string>{{0, 6}, {558, 767}}</string>
 		<key>Zoom</key>
 		<real>1</real>
 		<key>ZoomValues</key>
--- a/presen/index.html	Wed Jan 08 16:22:05 2014 +0900
+++ b/presen/index.html	Wed Jan 08 21:43:29 2014 +0900
@@ -163,7 +163,7 @@
                     よって、OpenCLのAPIを用いてデータを転送する必要がある。するとこの転送がネックになる。
                   </p>
                   <p>
-                    GPGPUは<font color="red">データの転送が頻繁に起こる</font>ようなTaskは並列度が出ない。
+                    GPGPUは<font color="red">データの転送が頻繁に起こる</font>ようなTaskは並列度が出づらい。
                   </p>
                 </td>
               </tr>
@@ -177,7 +177,7 @@
          <h3>データ並列</h3>
        </hgroup>
        <article>
-                   <table  border="0" cellpadding="0" cellspacing="0">
+         <table  border="0" cellpadding="0" cellspacing="0">
             <tbody>
               <tr>
                 <td><img src='images/gpu_data_parallel.png' style="height:400px"></td>
@@ -288,6 +288,7 @@
         </hgroup>
         <article>
           <pre class="prettyprint" data-lang="twice.cc(MultiCore)">
+
             long i = (long)scheduler->x; // (long)scheduler->get_param(0);
             output[i]=input[i]*2;
           </pre>
@@ -363,8 +364,8 @@
         <article>
           <pre class="prettyprint" data-lang="GpuScheduler.cc">

-            clEnqueueNDRangeKernel(command_queue, kernel[cur], task->dim, NULL,
-                                   &task->x, &task->y, &task->z, NULL, NULL);
+clEnqueueNDRangeKernel(command_queue, kernel[cur], task->dim, NULL,
+                       &task->x, &task->y, &task->z, NULL, NULL);
           </pre>
           <table  border="2" style="font-size:18pt;">
             <tbody>
@@ -398,41 +399,6 @@
         <hgroup>
           <h3>ベンチマーク</h3>
         </hgroup>
-        <article>
-        <table >
-          <tbody>
-            <tr>
-              <td> <!--  benchmark -->
-                <img src="images/bench_mark.png" height="300"></img>
-              </td> <!-- /benchmark -->
-              <td>  <!-- system env -->
-                <h3 class="yellow">FFT</h3>
-                <font size="5">
-                  <p>
-                    フーリエ変換と周波数フィルタによる、512*512の画像への処理をデータ並列で行う例題
-                  </p>
-                  <h3 class="yellow">実験環境</h3>
-                  OS : MacOS 10.9.1<br>
-                  CPU : 2*2.66GHz 6-CoreIntel Xeon<br>
-                  Memory : 16GB<br>
-                  Compiler : Apple LLVM version 5.0<br>
-                  　　　　　 (clang-500.2.79)<br>
-                  GPU :  AMD ATI Radeon HD 5870 1024MB<br>
-                <h3 class="yellow">結果</h3>
-                <font size="5">
-                  GPU 実行が1 coreのCPUよりも4.8倍、2 Coreよりも2.7倍の実行速度
-                </font>
-              </td> <!--system env  -->
-            </tr>
-          </tbody>
-        </table>
-        </article>
-      </slide>
-
-     <slide>
-        <hgroup>
-          <h3>ベンチマーク</h3>
-        </hgroup>
         <table >
           <tbody>
             <tr>
@@ -443,12 +409,24 @@
                 <h3 class="yellow">FFT</h3>
                 <font size="5">
                   <p>
-                    Busy Timeを含めて再測定
+                    フーリエ変換と周波数フィルタによる、
+                    512*512の画像への処理をデータ並列で行う例題
                   </p>
-                <h3 class="yellow">結果</h3>
-                <font size="5">
-                  CPUはどのコアもBusy TimeとRun Timeに大きな差は無いが、GPUは大きく差が開いた
-                </font>
+                  <h3 class="yellow">実験環境</h3>
+                  OS : MacOS 10.9.1<br>
+                  CPU : 2*2.66GHz 6-Core Intel Xeon<br>
+                  Memory : 16GB<br>
+                  Compiler : Apple LLVM version 5.0<br>
+                  　　　　　 (clang-500.2.79)<br>
+                  GPU :  AMD ATI Radeon HD 5870 1024MB<br>
+                  <h3 class="yellow">結果</h3>
+                  <font size="5">
+                    <p>
+                      GPU 実行が1 coreのCPUよりも4.8倍、2 Coreよりも2.7倍の実行速度。
+                      CPUはどのコアもBusy TimeとRun Timeに大きな差は無いが、
+                      GPUは大きく差が開いた。
+                    </p>
+                  </font>
               </td> <!--system env  -->
             </tr>
           </tbody>
@@ -530,6 +508,109 @@
         </article>
       </slide>

+
+      <slide>
+        <hgroup>
+          <h3>ベンチマーク</h3>
+        </hgroup>
+        <article>
+          <img src="images/bench_mark_each_task.png" height="350">
+          <p>
+            FFTはSpinFactやButterfly演算等、様々なTaskで構成されている。
+            それぞれのTaskについて、実行時間を計測した。
+          </p>
+          <p>
+            SpinFactのTaskに関しては、CPUの方が実行速度が速い。
+          </p>
+        </article>
+      </slide>
+
+      <slide>
+        <hgroup>
+          <h3>新たに実装するScheduling手法の提案</h3>
+        </hgroup>
+        <article>
+          <table >
+            <tbody>
+              <tr>
+                <td>
+                  <img src="images/decide_weight.png" height="150">
+                </td>
+                <td>
+                  並列実行するTaskをCPUとGPUで事前に一度実行し、実行時間を測定する。
+                  それぞれの実行時間の割合で重みをつける。
+                </td>
+              </tr>
+              <tr>
+                <td>
+                  <img src="images/select_arch.png" height="200">
+                </td>
+                <td>
+                  <p>
+                    それぞれの重みからCPU実行とGPU実行のどちらに適しているか判断する
+                  </p>
+                </td>
+              </tr>
+            </tbody>
+          </table>
+
+        </article>
+      </slide>
+
+      <slide>
+        <hgroup>
+          <h3>新たに実装するScheduling手法の提案</h3>
+        </hgroup>
+        <article>
+          <p>
+            更に、CPUとGPUでTaskが同時に終了するようにSchedulingを行いたい。
+          </p>
+          <table >
+            <tbody>
+              <tr>
+                <td>
+                  <img src="images/decide_weight2.png" height="150">
+                </td>
+                <td>
+                  全てのTaskにおいて、GPUの方がCPUの二倍実行速度が速い場合
+                </td>
+              </tr>
+              <tr>
+                <td>
+                  <img src="images/select_arch2.png" height="180">
+                </td>
+                <td>
+                  <p>
+                    それぞれのTaskを得意とするアーキテクチャに全て割り振るのではなく、
+                    RunTimeが最小になるように割り振る
+                  </p>
+                </td>
+              </tr>
+            </tbody>
+          </table>
+
+        </article>
+      </slide>
+
+      <slide>
+        <hgroup>
+          <h3>まとめ</h3>
+        </hgroup>
+        <article>
+        <ul>
+          <li>Cerium Task Managerをデータ並列に対応</li>
+          <li>FFTによるデータ並列実行のベンチマーク</li>
+          <li>CPUとGPUでのTaskの同時実行に対応</li>
+          <li>同時実行時のTaskのScheduling手法の提案</li>
+        </ul>
+        <h3 class="yellow">今後の課題</h3>
+        <ul>
+          <li>提案したSchedulingの手法を実装・ベンチマーク</li>
+          <li>ベンチマークに使用する例題の追加</li>
+        </ul>
+        </article>
+      </slide>
+
       <!--
       <slide>
         <hgroup>
@@ -634,25 +715,7 @@
 </ul>
         </article>
       </slide>
-
-      <slide>
-        <hgroup>
-          <h3>まとめ</h3>
-        </hgroup>
-        <article>
-        <ul>
-          <li>Cerium Task ManagerをGPGPUに対応</li>
-          <li>マルチコア実行とGPU実行のベンチマーク</li>
-          <li>改善案として、データ並列機構の提案</li>
-        </ul>
-        <h3 class="yellow">今後の課題</h3>
-        <ul>
-          <li>提案したデータ並列機構の実装</li>
-          <li>新たな例題として、iterate APIのベンチマーク</li>
-        </ul>
-        </article>
-      </slide>
-       -->
+      -->

       <slide class="backdrop"></slide>
 </slides>
--- a/presen/theme/css/default.css	Wed Jan 08 16:22:05 2014 +0900
+++ b/presen/theme/css/default.css	Wed Jan 08 21:43:29 2014 +0900
@@ -234,7 +234,7 @@
   display: none;
   font-family: 'Open Sans', Arial, sans-serif;
   font-size: 26px;
-  color: #797979;
+  color: #000000;
   width: 900px;
   height: 700px;
   margin-left: -450px;
@@ -570,7 +570,8 @@
 /* line 373, ../scss/default.scss */
 pre {
   font-family:  "Courier New", monospace;
-  font-size: 20px;
+  font-size: 24px;
+  color: #000000;
   line-height: 28px;
   padding: 10px 0 10px 60px;
   letter-spacing: -1px;
--- a/presen/theme/scss/default.scss	Wed Jan 08 16:22:05 2014 +0900
+++ b/presen/theme/scss/default.scss	Wed Jan 08 21:43:29 2014 +0900
@@ -518,7 +518,7 @@
   }

   tr:nth-child(odd) {
-    background-color: $gray-1;
+    background-color: $gray-4;
   }

   th {