# HG changeset patch # User Daichi TOMA # Date 1343012300 -32400 # Node ID 17c01f69db69d2ca970c2868be33f539aa6c9e79 # Parent 03e644cc3366d434ac765d4891a8956e496ba712 finish diff -r 03e644cc3366 -r 17c01f69db69 Paper/book.bib --- a/Paper/book.bib Mon Jul 23 06:51:08 2012 +0900 +++ b/Paper/book.bib Mon Jul 23 11:58:20 2012 +0900 @@ -3,7 +3,7 @@ author={Clay Breshears}, publisher={O'REILLY}, year={2009}, - month={12}, + month={December}, isbn={9784873114354}, url={http://amazon.co.jp/o/ASIN/4873114357/}, price={¥ 3,360}, diff -r 03e644cc3366 -r 17c01f69db69 Paper/cerium.bib --- a/Paper/cerium.bib Mon Jul 23 06:51:08 2012 +0900 +++ b/Paper/cerium.bib Mon Jul 23 11:58:20 2012 +0900 @@ -54,6 +54,38 @@ year = 2006 } +@misc{cell_wiki, +title = "{Cell}", +howpublished = "{http://en.wikipedia.org/wiki/Cell\_(microprocessor)}" +} + +@manual{cell-ibm, +author = "{IBM}", +title = "{IBM Research - Cell}", +year = 2005 +} + +@article{cell-ieee, + author = {Gschwind, Michael and Hofstee, H. Peter and Flachs, Brian and Hopkins, Martin and Watanabe, Yukio and Yamazaki, Takeshi}, + title = {Synergistic Processing in Cell's Multicore Architecture}, + journal = {IEEE Micro}, + issue_date = {March 2006}, + volume = {26}, + number = {2}, + month = mar, + year = {2006}, + issn = {0272-1732}, + pages = {10--24}, + numpages = {15}, + url = {http://dx.doi.org/10.1109/MM.2006.41}, + doi = {10.1109/MM.2006.41}, + acmid = {1130803}, + publisher = {IEEE Computer Society Press}, + address = {Los Alamitos, CA, USA}, + keywords = {Cell Broadband Engine, multicore architecture, synergistic processing, synergistic processing, Cell Broadband Engine, multicore architecture}, +} + + @manual{cell_sdk, author = "{International Business Machines Corporation}", title = "{Software Development Kit for Multicore Acceleration Version 3.1}", @@ -291,3 +323,21 @@ month = "Sep", year = 2011 } + +@inproceedings{2006:CMC, + author = {Gschwind, Michael}, + title = {Chip multiprocessing and the cell broadband engine}, 
+ booktitle = {Proceedings of the 3rd conference on Computing frontiers}, + series = {CF '06}, + year = {2006}, + isbn = {1-59593-302-6}, + location = {Ischia, Italy}, + pages = {1--8}, + numpages = {8}, + url = {http://doi.acm.org/10.1145/1128022.1128023}, + doi = {10.1145/1128022.1128023}, + acmid = {1128023}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {cell broadband engine, chip multiprocessing, compute-transfer parallelism (CTP), heterogeneous chip multiprocessor, memory-level parallelism (MLP)}, +} diff -r 03e644cc3366 -r 17c01f69db69 Paper/paper.tex --- a/Paper/paper.tex Mon Jul 23 06:51:08 2012 +0900 +++ b/Paper/paper.tex Mon Jul 23 11:58:20 2012 +0900 @@ -1,4 +1,4 @@ -\documentclass[twocolumn,twoside,9.5pt]{article} +\documentclass[twocolumn,twoside,11pt]{article} \usepackage[dvipdfmx]{graphicx} \usepackage{url} \usepackage{picins} @@ -8,13 +8,13 @@ \rhead{} \cfoot{} -\setlength{\topmargin}{-1in \addtolength{\topmargin}{15mm}} +\setlength{\topmargin}{-1in \addtolength{\topmargin}{20mm}} \setlength{\headheight}{0mm} \setlength{\headsep}{5mm} -\setlength{\oddsidemargin}{-1in \addtolength{\oddsidemargin}{15mm}} -\setlength{\evensidemargin}{-1in \addtolength{\evensidemargin}{15mm}} -\setlength{\textwidth}{181mm} -\setlength{\textheight}{261mm} +\setlength{\oddsidemargin}{-1in \addtolength{\oddsidemargin}{20mm}} +\setlength{\evensidemargin}{-1in \addtolength{\evensidemargin}{20mm}} +\setlength{\textwidth}{171mm} +\setlength{\textheight}{256mm} \setlength{\footskip}{0mm} \pagestyle{empty} @@ -78,6 +78,87 @@ \label{fig:cell_arch} \end{figure} +The Cell processor marries the SPEs and the PPE via EIB to give access, +via fully cache coherent DMA (direct memory access), to both main memory and to other external data storage. +To make the best of EIB, and to overlap computation and data transfer, +each of the nine processing elements (PPE and SPEs) is equipped with a DMA engine. 
+Since the SPE's load/store instructions can only access its own local memory, +each SPE entirely depends on DMAs to transfer data to and from the main memory and other SPEs' local memories. +A DMA operation can transfer either a single block area of size up to 16KB, or a list of 2 to 2048 such blocks. +One of the major design decisions in the architecture of Cell is the use of DMAs as a central means of intra-chip data transfer, +with a view to enabling maximal asynchrony and concurrency in data processing inside a chip\cite{2006:CMC}. + +The PPE, which is capable of running a conventional operating system, +has control over the SPEs and can start, stop, interrupt, and schedule processes running on the SPEs. +To this end the PPE has additional instructions relating to control of the SPEs. +Unlike SPEs, the PPE can read and write the main memory and the local memories of SPEs through the standard load/store instructions. +Despite having Turing complete architectures, +the SPEs are not fully autonomous and require the PPE to prime them before they can do any useful work. +Though most of the "horsepower" of the system comes from the synergistic processing elements, +the use of DMA as a method of data transfer and the limited local memory footprint of each SPE pose a major challenge +to software developers who wish to make the most of this horsepower, +demanding careful hand-tuning of programs to extract maximal performance from this CPU. + +The PPE and bus architecture includes various modes of operation giving different levels of memory protection, +allowing areas of memory to be protected from access by specific processes running on the SPEs or the PPE. + +Both the PPE and SPE are RISC architectures with a fixed-width 32-bit instruction format. +The PPE contains a 64-bit general purpose register set (GPR), a 64-bit floating point register set (FPR), +and a 128-bit Altivec register set. The SPE contains 128-bit registers only. 
+These can be used for scalar data types ranging from 8-bits to 128-bits +in size or for SIMD computations on a variety of integer and floating point formats. +System memory addresses for both the PPE and SPE are expressed as 64-bit values +for a theoretical address range of $2^{64}$ bytes (16 exabytes or 16,777,216 terabytes). +In practice, not all of these bits are implemented in hardware. +Local store addresses internal to the SPU processor are expressed as a 32-bit word. +In documentation relating to Cell a word is always taken to mean 32 bits, a doubleword means 64 bits, and a quadword means 128 bits. + + +\subsubsection{Power Processor Element (PPE)} +The PPE(Figure \ref{fig:ppe}) is the Power Architecture based, +two-way multithreaded core acting as the controller for the eight SPEs, +which handle most of the computational workload. The PPE will work +with conventional operating systems due to its similarity to other 64-bit PowerPC processors, +while the SPEs are designed for vectorized floating point code execution. +The PPE contains a 64 KiB level 1 cache (32 KiB instruction and a 32 KiB data) and a 512 KiB Level 2 cache. +The size of a cache line is 128 bytes. +Each PPE can complete two double precision operations per clock cycle using a scalar-fused multiply-add instruction, +which translates to 6.4 GFLOPS at 3.2 GHz; +or eight single precision operations per clock cycle with a vector fused-multiply-add instruction, +which translates to 25.6 GFLOPS at 3.2 GHz. + +\begin{figure}[htb] + \begin{center} + \includegraphics[scale=0.4]{./pic/PPE.pdf} + \end{center} + \caption{PPE (Power Processor Element)} + \label{fig:ppe} +\end{figure} + +\subsubsection{Synergistic Processing Elements (SPE)} +Each SPE(Figure \ref{fig:spe}) is composed of a "Synergistic Processing Unit", SPU, and a "Memory Flow Controller", MFC (DMA, MMU, and bus interface)\cite{cell-ibm}. 
+An SPE is a RISC processor with 128-bit SIMD organization\cite{cell-ieee} for single and double precision instructions. +With the current generation of the Cell, each SPE contains a 256 KiB embedded SRAM for instruction and data, +called "Local Storage" (not to be mistaken for "Local Memory" in Sony's documents that refer to the VRAM) +which is visible to the PPE and can be addressed directly by software. Each SPE can support up to 4 GiB of local store memory. +The local store does not operate like a conventional CPU cache since it is neither transparent +to software nor does it contain hardware structures that predict which data to load. Each SPE contains a 128-bit, +128-entry register file and measures 14.5 mm$^2$ on a 90 nm process. +An SPE can operate on sixteen 8-bit integers, eight 16-bit integers, four 32-bit integers, +or four single-precision floating-point numbers in a single clock cycle, as well as a memory operation. +Note that the SPU cannot directly access system memory; +the 64-bit virtual memory addresses formed by the SPU must be passed from the SPU +to the SPE memory flow controller (MFC) to set up a DMA operation within the system address space. +At 3.2 GHz, each SPE gives a theoretical 25.6 GFLOPS of single precision performance. + +\begin{figure}[htb] + \begin{center} + \includegraphics[scale=0.5]{./pic/SPE.pdf} + \end{center} + \caption{SPE (Synergistic Processing Element)} + \label{fig:spe} +\end{figure} + % Cell の説明いれる % \subsection{Mailbox} @@ -133,11 +214,10 @@ \end{small} -\begin{tiny} \begin{table}[h] \caption{Benchmark} \label{table:benchmark} -\small +{\scriptsize \begin{tabular}[t]{c||r|r|r} \hline & Word Count & Sort & Prime Counter\\ @@ -154,9 +234,8 @@ \hline 24 CPU (Xeon)& 40 ms & 100 ms & 31 ms\\ \hline -\end{tabular} +\end{tabular}} \end{table} -\end{tiny} % Word Count 354 / 70 = 5.0571 % Sort 846 / 163 = 5.1901 @@ -180,7 +259,7 @@ In addition, Cerium Task Manager has many type of task, is a drawback of such description. 
This can be solved by the system description the dependency of the task rather than on the user side. -\nocite{cell_abi, opencl, clay200912} +\nocite{cell_abi, opencl, clay200912, cell_wiki, cell_cpp, cell_sdk, libspe2} % \nocite{yutaka:2010a, cell_abi, cell_cpp, cell_sdk, libspe2, ydl, clay200912, fix200609} \bibliographystyle{junsrt} \bibliography{cerium.bib,book.bib} diff -r 03e644cc3366 -r 17c01f69db69 Paper/pic/PPE.pdf Binary file Paper/pic/PPE.pdf has changed diff -r 03e644cc3366 -r 17c01f69db69 Paper/pic/PPE.xbb --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Paper/pic/PPE.xbb Mon Jul 23 11:58:20 2012 +0900 @@ -0,0 +1,8 @@ +%%Title: ./PPE.pdf +%%Creator: extractbb 20110311 +%%BoundingBox: 0 0 453 454 +%%HiResBoundingBox: 0.000000 0.000000 453.000000 454.000000 +%%PDFVersion: 1.3 +%%Pages: 1 +%%CreationDate: Mon Jul 23 08:57:46 2012 + diff -r 03e644cc3366 -r 17c01f69db69 Paper/pic/SPE.pdf Binary file Paper/pic/SPE.pdf has changed diff -r 03e644cc3366 -r 17c01f69db69 Paper/pic/SPE.xbb --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Paper/pic/SPE.xbb Mon Jul 23 11:58:20 2012 +0900 @@ -0,0 +1,8 @@ +%%Title: ./SPE.pdf +%%Creator: extractbb 20110311 +%%BoundingBox: 0 0 380 340 +%%HiResBoundingBox: 0.000000 0.000000 380.000000 340.000000 +%%PDFVersion: 1.4 +%%Pages: 1 +%%CreationDate: Mon Jul 23 08:57:52 2012 + diff -r 03e644cc3366 -r 17c01f69db69 Paper/pic/cell-main.pdf Binary file Paper/pic/cell-main.pdf has changed diff -r 03e644cc3366 -r 17c01f69db69 Paper/pic/cell-main.xbb --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Paper/pic/cell-main.xbb Mon Jul 23 11:58:20 2012 +0900 @@ -0,0 +1,8 @@ +%%Title: ./cell-main.pdf +%%Creator: extractbb 20110311 +%%BoundingBox: 0 0 423 300 +%%HiResBoundingBox: 0.000000 0.000000 423.000000 300.000000 +%%PDFVersion: 1.3 +%%Pages: 1 +%%CreationDate: Mon Jul 23 06:49:15 2012 +