Mercurial > hg > Papers > 2010 > iDB2010

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DataSegment and CodeSegment .mm	Mon May 31 03:22:18 2010 +0900
@@ -0,0 +1,161 @@
+<map version="0.8.1">
+<!-- To view this file, download free mind mapping software FreeMind from http://freemind.sourceforge.net -->
+<node CREATED="1274963708492" ID="Freemind_Link_1053833782" MODIFIED="1274964503548" TEXT="DataSegment and CodeSegment ">
+<node CREATED="1274963875894" ID="Freemind_Link_495601512" MODIFIED="1274963879570" POSITION="right" TEXT="Key Concept">
+<node CREATED="1274963751326" ID="_" MODIFIED="1274963766502" TEXT="How to build database"/>
+<node CREATED="1274963768330" ID="Freemind_Link_475169321" MODIFIED="1274963784949" TEXT="CAP / BASE"/>
+<node CREATED="1274963788889" ID="Freemind_Link_89404325" MODIFIED="1274963806348" TEXT="DataSegment"/>
+<node CREATED="1274963819928" ID="Freemind_Link_1348831282" MODIFIED="1274963830364" TEXT="CodeSegment"/>
+<node CREATED="1274963812712" ID="Freemind_Link_430777911" MODIFIED="1274963816996" TEXT="ManyCore">
+<node CREATED="1274963844799" ID="Freemind_Link_501607558" MODIFIED="1274963848795" TEXT="Open CL"/>
+</node>
+<node CREATED="1274963854791" ID="Freemind_Link_749030640" MODIFIED="1274963863410" TEXT="Distributed"/>
+<node CREATED="1274963864703" ID="Freemind_Link_309978201" MODIFIED="1274963872618" TEXT="FederatedLinda"/>
+<node CREATED="1274963924805" ID="Freemind_Link_1611750850" MODIFIED="1274963927528" TEXT="Cassandra"/>
+<node CREATED="1274964083535" ID="Freemind_Link_108229779" MODIFIED="1274964089538" TEXT="Object Orientation"/>
+<node CREATED="1274964103510" ID="Freemind_Link_1270678193" MODIFIED="1274964109138" TEXT="Garbage Collection"/>
+</node>
+<node CREATED="1275112578402" ID="Freemind_Link_626322700" MODIFIED="1275112584309" POSITION="right" TEXT="Story">
+<node CREATED="1275112584961" ID="Freemind_Link_1188471331" MODIFIED="1275112599303" TEXT="SEDA is difficult">
+<node CREATED="1275112662381" ID="Freemind_Link_1104617514" MODIFIED="1275112667602" TEXT="poor performance"/>
+<node CREATED="1275112668422" ID="Freemind_Link_404356874" MODIFIED="1275112671882" TEXT="cannsandra"/>
+</node>
+<node CREATED="1275112605204" ID="Freemind_Link_1475121621" MODIFIED="1275112621722" TEXT="Implemenation diffculities">
+<node CREATED="1275112653252" ID="Freemind_Link_28500128" MODIFIED="1275112657519" TEXT="thread pool"/>
+<node CREATED="1275112872117" ID="Freemind_Link_660883007" MODIFIED="1275112879193" TEXT="Garbage collection"/>
+<node CREATED="1275112879702" ID="Freemind_Link_1170322606" MODIFIED="1275112889259" TEXT="Poor Thread implemantion ">
+<node CREATED="1275112891080" ID="Freemind_Link_1005276543" MODIFIED="1275112892772" TEXT="native"/>
+<node CREATED="1275112894000" ID="Freemind_Link_1872356089" MODIFIED="1275112899676" TEXT="user space"/>
+</node>
+<node CREATED="1275113534465" ID="Freemind_Link_1057963141" MODIFIED="1275113544653" TEXT="Thread is no use for Many Core">
+<node CREATED="1275113553291" ID="Freemind_Link_101335497" MODIFIED="1275113557847" TEXT="Data separation"/>
+<node CREATED="1275113558139" ID="Freemind_Link_1380411545" MODIFIED="1275113560383" TEXT="Quick path"/>
+<node CREATED="1275113568829" ID="Freemind_Link_102500206" MODIFIED="1275113579090" TEXT="Direct handling of data management"/>
+<node CREATED="1275113583439" ID="Freemind_Link_326187704" MODIFIED="1275113588204" TEXT="GPGPU"/>
+</node>
+</node>
+<node CREATED="1275112631448" ID="Freemind_Link_1068833839" MODIFIED="1275112640822" TEXT="Open CL as an assembler">
+<node CREATED="1275112643330" ID="Freemind_Link_1746670257" MODIFIED="1275112646206" TEXT="complicated"/>
+<node CREATED="1275112831991" ID="Freemind_Link_735586753" MODIFIED="1275112843828" TEXT="Data Transfer semantics"/>
+<node CREATED="1275112844368" ID="Freemind_Link_767727313" MODIFIED="1275112847861" TEXT="DMA handling"/>
+<node CREATED="1275112852338" ID="Freemind_Link_1971955041" MODIFIED="1275112855822" TEXT="64bit addressing"/>
+</node>
+<node CREATED="1275113609867" ID="Freemind_Link_469948367" MODIFIED="1275113615689" TEXT="Script Language">
+<node CREATED="1275113616132" ID="Freemind_Link_949199189" MODIFIED="1275113623113" TEXT="Very bad thread support">
+<node CREATED="1275113625989" ID="Freemind_Link_1953916950" MODIFIED="1275113631138" TEXT="GIL in python"/>
+</node>
+<node CREATED="1275113642328" ID="Freemind_Link_616018690" MODIFIED="1275113657350" TEXT="Other option of scripting language?"/>
+</node>
+<node CREATED="1275112679215" ID="Freemind_Link_225352853" MODIFIED="1275112683384" TEXT="Our experience">
+<node CREATED="1275112683944" ID="Freemind_Link_1838733481" MODIFIED="1275112690669" TEXT="Federated Linda">
+<node CREATED="1275113771475" ID="Freemind_Link_584745370" MODIFIED="1275113787081" TEXT="single threaded asynchronus server"/>
+</node>
+<node CREATED="1275112691569" ID="Freemind_Link_454593896" MODIFIED="1275112699014" TEXT="Suci Library"/>
+<node CREATED="1275112699491" ID="Freemind_Link_1456118464" MODIFIED="1275112710448" TEXT="Cerium TaskManager">
+<node CREATED="1275113667196" ID="Freemind_Link_170000986" MODIFIED="1275113670664" TEXT="writing task is easy"/>
+<node CREATED="1275113671252" ID="Freemind_Link_770653383" MODIFIED="1275113679377" TEXT="writing task configuration is difficult"/>
+</node>
+<node CREATED="1275112752755" ID="Freemind_Link_1891510360" MODIFIED="1275112775322" TEXT="Module management"/>
+<node CREATED="1275112780087" ID="Freemind_Link_725058035" MODIFIED="1275112783899" TEXT="Debugging"/>
+<node CREATED="1275112784247" ID="Freemind_Link_1580820713" MODIFIED="1275112787228" TEXT="Correctness"/>
+</node>
+<node CREATED="1275113801712" ID="Freemind_Link_1355660255" MODIFIED="1275113809005" TEXT="What we found">
+<node CREATED="1275113813514" ID="Freemind_Link_1498238078" MODIFIED="1275113818966" TEXT="Java is no good">
+<node CREATED="1275113820683" ID="Freemind_Link_933662284" MODIFIED="1275113825911" TEXT="no so easy to write"/>
+</node>
+<node CREATED="1275113827756" ID="Freemind_Link_66377206" MODIFIED="1275113831552" TEXT="OOL is no good">
+<node CREATED="1275113831869" ID="Freemind_Link_191315880" MODIFIED="1275113841778" TEXT="no good for pipeline"/>
+<node CREATED="1275113845255" ID="Freemind_Link_1291903278" MODIFIED="1275113850819" TEXT="sharing base"/>
+</node>
+<node CREATED="1275113854432" ID="Freemind_Link_1891537947" MODIFIED="1275113868854" TEXT="Copy is not so expensive in Many core"/>
+<node CREATED="1275113881148" ID="Freemind_Link_1308117896" MODIFIED="1275113888457" TEXT="Data management is difficult"/>
+<node CREATED="1275113889533" ID="Freemind_Link_1420743321" MODIFIED="1275113895554" TEXT="Task configuration is difficult"/>
+<node CREATED="1275113899663" ID="Freemind_Link_597396736" MODIFIED="1275113911165" TEXT="Proving correctness is difficult">
+<node CREATED="1275113912465" ID="Freemind_Link_1156417806" MODIFIED="1275113921830" TEXT="no in an algorythm base"/>
+<node CREATED="1275113923971" ID="Freemind_Link_1237667597" MODIFIED="1275113929607" TEXT="find stupid error"/>
+</node>
+</node>
+<node CREATED="1275112741985" ID="Freemind_Link_1108698248" MODIFIED="1275112745386" TEXT="New Tools">
+<node CREATED="1275112745826" ID="Freemind_Link_163003817" MODIFIED="1275112748694" TEXT="Code Segment"/>
+<node CREATED="1275112790736" ID="Freemind_Link_326714433" MODIFIED="1275112794693" TEXT="Data Segment"/>
+</node>
+<node CREATED="1275112799714" ID="Freemind_Link_1015918924" MODIFIED="1275112804078" TEXT="Code Segment">
+<node CREATED="1275112804635" ID="Freemind_Link_240796797" MODIFIED="1275112810423" TEXT="Continuation based C"/>
+<node CREATED="1275112813372" ID="Freemind_Link_1481991611" MODIFIED="1275112816792" TEXT="gcc implementation"/>
+<node CREATED="1275113029844" ID="Freemind_Link_165331020" MODIFIED="1275113032609" TEXT="No stack"/>
+<node CREATED="1275113936685" ID="Freemind_Link_731050431" MODIFIED="1275113943953" TEXT="Code segment rearrangement"/>
+<node CREATED="1275113945910" ID="Freemind_Link_1273890617" MODIFIED="1275113954971" TEXT="Simulating Speciial Hardware"/>
+<node CREATED="1275113964449" ID="Freemind_Link_1926395499" MODIFIED="1275113972758" TEXT="Explicit handing of Stack"/>
+<node CREATED="1275113973722" ID="Freemind_Link_1793479549" MODIFIED="1275113980103" TEXT="Conversion from C is easy"/>
+</node>
+<node CREATED="1275112823606" ID="Freemind_Link_921316948" MODIFIED="1275112826698" TEXT="Data Segment">
+<node CREATED="1275112908794" ID="Freemind_Link_486480678" MODIFIED="1275112922552" TEXT="software cache"/>
+<node CREATED="1275112923268" ID="Freemind_Link_351990123" MODIFIED="1275112926625" TEXT="hardware cache"/>
+<node CREATED="1275112926981" ID="Freemind_Link_1866512467" MODIFIED="1275112928337" TEXT="LRU"/>
+<node CREATED="1275112928837" ID="Freemind_Link_224129129" MODIFIED="1275112931497" TEXT="Hash"/>
+<node CREATED="1275112936158" ID="Freemind_Link_470080321" MODIFIED="1275112955421" TEXT="Safety"/>
+<node CREATED="1275112962850" ID="Freemind_Link_1720630435" MODIFIED="1275112971223" TEXT="basic operations">
+<node CREATED="1275112971996" ID="Freemind_Link_1048686989" MODIFIED="1275113004472" TEXT="copy default"/>
+<node CREATED="1275113014922" ID="Freemind_Link_924497944" MODIFIED="1275113017614" TEXT="no write back"/>
+<node CREATED="1275112979077" ID="Freemind_Link_1949069878" MODIFIED="1275112986922" TEXT="write back"/>
+<node CREATED="1275112989287" ID="Freemind_Link_1331615626" MODIFIED="1275113010461" TEXT="write only"/>
+<node CREATED="1275114353581" ID="Freemind_Link_387110297" MODIFIED="1275114363868" TEXT="Working on different storage system"/>
+</node>
+<node CREATED="1275114369126" ID="Freemind_Link_887334998" MODIFIED="1275114375563" TEXT="consistency">
+<node CREATED="1275114386329" ID="Freemind_Link_1515596960" MODIFIED="1275114394101" TEXT="separated by copy"/>
+<node CREATED="1275114397106" ID="Freemind_Link_1335935086" MODIFIED="1275114409904" TEXT="serialized in a thread">
+<node CREATED="1275114418326" ID="Freemind_Link_913046128" MODIFIED="1275114422650" TEXT="by TaskManager"/>
+</node>
+<node CREATED="1275114429400" ID="Freemind_Link_908462274" MODIFIED="1275114433844" TEXT="locked in a site"/>
+</node>
+<node CREATED="1275113704385" ID="Freemind_Link_130229052" MODIFIED="1275113706909" TEXT="allocation"/>
+<node CREATED="1275113711722" ID="Freemind_Link_439331989" MODIFIED="1275113715071" TEXT="copy base">
+<node CREATED="1275113715435" ID="Freemind_Link_665352752" MODIFIED="1275113717263" TEXT="no gc"/>
+<node CREATED="1275113717923" ID="Freemind_Link_287335785" MODIFIED="1275113725392" TEXT="pipelined architecture"/>
+<node CREATED="1275113726012" ID="Freemind_Link_1198591558" MODIFIED="1275113733089" TEXT="new object orientation">
+<node CREATED="1275113748568" ID="Freemind_Link_1135556772" MODIFIED="1275113750796" TEXT="become base"/>
+</node>
+</node>
+</node>
+<node CREATED="1275113046143" ID="Freemind_Link_138338616" MODIFIED="1275113700533" TEXT="Comparison">
+<node CREATED="1275113995358" ID="Freemind_Link_745247811" MODIFIED="1275113999810" TEXT="SEDA"/>
+<node CREATED="1275114000182" ID="Freemind_Link_704604936" MODIFIED="1275114008539" TEXT="Open CL"/>
+<node CREATED="1275114447946" ID="Freemind_Link_1016409443" MODIFIED="1275114450590" TEXT="drawback">
+<node CREATED="1275114450930" ID="Freemind_Link_8472774" MODIFIED="1275114464083" TEXT="different from convestional programming"/>
+<node CREATED="1275114464973" ID="Freemind_Link_230082378" MODIFIED="1275114468466" TEXT="different from SEDA"/>
+<node CREATED="1275114477455" ID="Freemind_Link_1547251035" MODIFIED="1275114481435" TEXT="can be converted">
+<node CREATED="1275114484495" ID="Freemind_Link_1039604627" MODIFIED="1275114500368" TEXT="verified by equality check"/>
+</node>
+</node>
+</node>
+</node>
+<node CREATED="1274963967987" ID="Freemind_Link_1598868572" MODIFIED="1274963987214" POSITION="left" TEXT="Programming with DataSegment and CodeSegment"/>
+<node CREATED="1274964021633" ID="Freemind_Link_1749358525" MODIFIED="1274964077331" POSITION="left" TEXT="Continuation based C"/>
+<node CREATED="1274964119190" ID="Freemind_Link_692615740" MODIFIED="1274964122409" POSITION="left" TEXT="CodeSegment"/>
+<node CREATED="1274964122949" ID="Freemind_Link_1662908742" MODIFIED="1274964126633" POSITION="left" TEXT="DataSegment"/>
+<node CREATED="1274964156093" ID="Freemind_Link_996044851" MODIFIED="1274964170888" POSITION="left" TEXT="TaskManager">
+<node CREATED="1274964931753" ID="Freemind_Link_1470206126" MODIFIED="1274964934325" TEXT="Cerium"/>
+</node>
+<node CREATED="1274964183460" ID="Freemind_Link_1573793876" MODIFIED="1274964195703" POSITION="left" TEXT="DataSegment Operation">
+<node CREATED="1274964209115" ID="Freemind_Link_1807920319" MODIFIED="1274964219374" TEXT="Core movement"/>
+<node CREATED="1274964308519" ID="Freemind_Link_743804722" MODIFIED="1274964314803" TEXT="Cache management"/>
+<node CREATED="1274964220562" ID="Freemind_Link_1256487209" MODIFIED="1274964234613" TEXT="Copy base"/>
+<node CREATED="1274964240393" ID="Freemind_Link_1492961037" MODIFIED="1274964263348" TEXT="Storage movement"/>
+<node CREATED="1274964288936" ID="Freemind_Link_1963933474" MODIFIED="1274964294579" TEXT="Key Value Store"/>
+<node CREATED="1274964742056" ID="Freemind_Link_1733115535" MODIFIED="1274964744348" TEXT="get">
+<node CREATED="1274964748408" ID="Freemind_Link_603709527" MODIFIED="1274964757619" TEXT="new"/>
+<node CREATED="1274964764575" ID="Freemind_Link_800484454" MODIFIED="1274964770563" TEXT="no write back"/>
+</node>
+<node CREATED="1274964745280" ID="Freemind_Link_1717358826" MODIFIED="1274964746923" TEXT="put"/>
+</node>
+<node CREATED="1274964520704" ID="Freemind_Link_335778680" MODIFIED="1274964526667" POSITION="left" TEXT="Implementation">
+<node CREATED="1274964700729" ID="Freemind_Link_1259715582" MODIFIED="1274964712077" TEXT="2^n malloc"/>
+<node CREATED="1274964714833" ID="Freemind_Link_1740094065" MODIFIED="1274964724364" TEXT="Hash and LRU"/>
+</node>
+<node CREATED="1274964639196" ID="Freemind_Link_1361170885" MODIFIED="1274964647039" POSITION="left" TEXT="Comparison">
+<node CREATED="1274964649123" ID="Freemind_Link_144839326" MODIFIED="1274964656894" TEXT="Persitent Programming"/>
+<node CREATED="1274964657427" ID="Freemind_Link_1039631289" MODIFIED="1274964663110" TEXT="Tuple Space"/>
+<node CREATED="1274964663723" ID="Freemind_Link_749613570" MODIFIED="1274964668526" TEXT="Open CL"/>
+</node>
+</node>
+</map>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/db.ind	Mon May 31 03:22:18 2010 +0900
@@ -0,0 +1,546 @@
+-title: Programming Scalable Service in Code segment and Data segment
+
+--author: Shoji TAMAKI and Shinji KONO
+
+-abstract:
+
+--Tools for implementing Distributed Application
+
+On demanding construction of scalable services such as Twitter,
+FaceBook or Network based Books, we need new stage programming
+tools. Based on our experiences, we designed and implemented
+two major tools to build scalable services: Code segments and
+Data Segments.
+
+Not necessary mentioned SEDA \cite{SEDA}, scalable services
+requires highly distributed servers and highly multi-threaded
+program on a server among them. We have successfully implemented
+WWW services using Classical tools such as C++, Java, or even C.
+Script Languages such as Perl, PHP or Python are used in
+design side by non professional programmers.
+
+Now some of the services have more than 10 millions users,
+load balancing among several WWW front-end and many
+memcached servers to replicate Database accesses using
+classical database such as Oracle, mySQL or Postgress, which
+performs so badly, Internet companies have to
+create Key Value Store system by themselves, such as BigTable
+or Cassandra. This situation is sometimes discussed in a context
+of CAP vs BASE database scheme.
+
+Based on our working on Internet programming and PS3 programming, now,
+we are sure that we need more suitable tools to implement various
+components in the scalable services, such as database server,
+web server or HTTP response generator.
+
+We separate difficulties in two point of views: Code and Data.
+This sounds very basic, but since our history is starting from
+a single CPU with few memories, current tools are some how obsolete
+now, so we have to reconsider the situation.
+
+We are working on a combination of Continuation based C and
+Cerium Engine. The former one is a lower language of C implemented
+in GCC, and the later is a Open CL like fine grain task manager on
+PS3 Cell architecture, which supports data segment management on
+SPE (Synergistic Processor Engine). Since SPE has only 256Kbytes
+local memory, careful management is necessary, so we have to invent
+our own memory manager.
+
+In this paper, first we analyze problems in scalable system. Then
+we introduce new concepts: Code Segment and Data Segment. Code
+segment is implemented in Continuation based C here after we call CbC  and Data Segment
+is implemented in a memory management module in many core task manager called
+Cerium Engine.
+
+The basic idea is this. We pass the control among module layer without function
+call. We cannot use conventional language because it has built-in function
+call which which cannot be removed. These module layer segments are
+called code segments.
+All the data are stored in a message
+packed from, which we called data segment, which is controlled uniformly.
+Instead of using direct pointer access,
+data segments are copied among modules and CPU cores, which are carefully
+adopted to the cache or interconnect communication such as DMA.
+All the data segments are hashed in 2^n size memory pool similar to the
+Unix malloc mechanism. This pool is in 64bit address space and
+it makes data segment communication far simpler.
+
+It sounds like completely different from current Internet service scheme,
+we overcome the difference in following way using code transformation.
+First we make entire program in a conventional way. We divide it
+into code segment and explicit stack manipulation. During this stage,
+communications are reformatted into data segment passing among
+code segments. The program still working exactly the same before
+transformation and we may use automatic conversion here.
+We reorganize it using data parallel and pipeline execution. At this
+stage, automatic conversion is not suitable in many cases, so we have
+to make translation by hands, but we can use possible equivalence checker
+for the program correctness. In order to make pipeline execution,
+destructive modification of the content of data segments such as
+classical object oriented programming is not allowed.
+We have to make copies.
+
+At the last section, we give some of our achievement and
+comparison with other tools, such as SEDA or Open CL.
+
+--Problems
+
+Let's think that we are going to make a network game.
+Maybe PS3 (6 SPU and 2 PPU )is used in a client side and 32 CPU ( 16 x 2 hyper thread ). We have to use highly pipelined thread and data parallel execution
+in both client side and server side, something like SEDA architecture.
+We will demonstrate several problems based on our experiences.
+
+---Our Experiences
+
+Our PS3 implementation is SPURS\cite{SPURS} like Pipeline Task Manager which
+is called Cerium. (We had to write software rendering because of SCEI did
+not open GPU information BTW.) Since PS3's SPU has only 256Kbytes memory,
+we have to carefully handle memory usage both in case of code and data.
+
+Data segment is copied from PPU to SPU via DMA, which overhead is hide
+using Task Pipeline. But we have to avoid too much synchronization
+of these copies.
+
+In case of Xeon architecture, CPU cores are shared all the memory,
+but actually it has a local cache which is interconnected using quick path.
+Cache size is 256Kbyte for each core. Implicit copy is done between
+a cache memory and the main memory or between a cache and another cache.
+The situation is basically the same in PS3 and Xeon.
+
+---Module Layer
+
+Complex systems such as Operating systems, Database systems,
+Network Systems or Game libraries have several module layer.
+For examples, Database system has message packing module,
+query analyzing module and execution module. Network system
+may have ISO standard layer such as presentation, transport
+and network. Operating system have v-node file system and
+device drivers.
+
+Each module may have 1-5 nested function calls, so we have
+more then 10-30 nested function calls in Complex systems.
+
+If we have several tasks to do, each
+processing in modules can be executed in a pipelined way.
+In order to implement the pipeline, we assign threads from
+thread pools to each module layer.
+
+Conversion from sequential execution to pipelined execution
+is straight forward, but if it has a race condition, correcting
+the problem is very difficult.
+
+---SEDA
+
+This combination of pipeline staging and data parallel execution
+is the heart of SEDA. But it requires very complex programming.
+At first we have design communication queue among all the pipeline
+stages. In case of C++, we have to managing all the queue manually
+because it lacks garbage collection. It is not so easy and requires
+complex memory pools ( or conservative GC ), which is a bug prone.
+
+---Thread Implementation
+
+Theoretically SEDA architecture works fine, but it assumes very
+fast thread execution with blocking queue. Cassandra key value store
+system use Java to implement this architecture. Java 6 is far better
+thread execution, but sine it is a combination of user level thread and
+kernel level thread, it is not so easy to optimize its execution.
+
+If we use script language to implement this, thing become worse. For
+example, Python thread implementation is very bad currently, and ruby
+does not support kernel thread. And their GC mechanism always interfere
+with thread execution.
+
+---Blocking queue
+
+Each threads are executed in an event driven way. A task is
+put into a blocking queue and it wakes up a thread. The thread
+read the queue atomically.
+
+We can write this operation in following way.
+
+    while(Task p = waitingQueue.get()) {
+	p.run();
+    }
+
+It looks good but {\tt p} is determines just before its execution,
+which is no good in terms of branch prediction.
+It looks like this delay is not so important, but
+it has penalty around 10 clocks. If we have many small task
+this situation is not so good. What we need is 10 to 20 instruction
+cycle before executing the indirect call.
+
+---Scheduling
+
+If cost of blocking queue operation is negligible, simple
+FIFO scheduling is OK in SEDA from the through put point of view.
+But blocking queue requires 10 to 20 instruction cycle under
+no race condition. Thread pool size is heavyly depends on
+the architecture, that is number of CPU, number of requests,
+execution time of tasks.
+
+Sometimes it is better to reduce concurrency and skip these
+synchronization costs.
+
+---Garbage Collection
+
+Basically communication between layers makes no garbage, because it is
+generated and destroyed in fix amount size. But in case of programming
+language with GC, if we use memory pool like technique, it makes many
+references which GC have to take care of. It makes GC very slow. So
+it is better to simply generate and destroy.
+
+Apache Web server features memory pool approach drastically, but it
+is an convention, some module use malloc library call directly.
+
+---Programming Correctness
+
+SEDA architecture or SPURS architecture is very complex to implement
+and the program working on it is very difficult also. It is very difficult
+to test.
+
+For example, message packet between pipeline stages is created and destroyed
+in exact moment. If we lost the correct timing, a bug will occur or not
+if we are unlucky. This is odd, because even if program itself is deterministic,
+it behaves non deterministic dew to pipeline execution timing.
+
+---Many Core Awareness
+
+Open CL \cite{opencl} is a standard library to use Many Core
+architecture, but it has very complex interfaces. We have to
+write a program on a core in a string, which is compiled in
+LLVM \cite{LLVM}. Data transfer API is vary and complex, which
+requires large amount of code.
+
+In case of Java or Scripting language, we cannot directly control
+the copy between cores, which means we cannot hide copy cost
+explicitly. We have to care about SPU's local memory size or
+cache memory size which is 256Kbytes in this time.
+
+The same careful management is necessary for executing code
+which is a data on a core also. We have to transfer code segment
+using copying cost hiding technique such as pipeline execution.
+
+These higher level pipeline optimization is very difficult and is not handle well
+in compiles. Since compiler technique is working well on streaming instructions, it is some how contradict. It should be designed by hands.
+
+--New Tools
+
+We introduce two main tools, one is Continuation based C and the other one
+is Data segment management based on Cerium Task Manager.
+
+--Code Segment
+
+Continuation based C is a C language which all the function is
+forced to do tail call elimination. It is implemented using
+GCC 4.x. Modification is not so large. We also force
+FASTCALL option which assign arguments on registers. This
+makes it faster.
+
+CbC Syntax is very simple.
+
+   struct interface1  { int i; };
+   struct interface2  { int o; };
+
+   __code f(struct interface1 a) {
+       struct interface2 b; b.o=a.i;
+       goto g(b);
+   }
+
+In this example, a code segment
+\verb+f+ has \verb+input a+ and sends \verb+output b+ to a code segment \verb+g+.
+There is no return from code segment \verb+b+, \verb+b+ should call another
+continuation using \verb+goto+. Any control structure in C is allowed in CbC
+language, but we may restrict ourselves to use \verb+if+ statement
+only, because it is sufficient to implement C to CbC translation. In this case,
+code segment has one input interface and several output interfaces (fig.\ref{code
+}).
+
+\verb+__code+ and parameterized global goto statement is an extension of
+Continuation based C. Unlike \verb+C--+ \cite{cminusminus}'s parameterized goto,
+we cannot goto into normal C function because of forced FASTCALL option.
+
+---Continuation
+
+Since code segment has no stack, continuation of code segment is mere
+entry address to the code segment. We can call it a light weight continuation.
+
+We also supports full continuation of normal C function using GCC nested function and statement expression. It is implemented some like this in GCC compiler.
+
+        void (*__return)(int retval_, void *_envp);
+        __return = ({
+          nee_label__ _cbc_exit0;
+            void __return_func(int retval_, void *_envp){
+               retval = retval_;
+               goto exit0;
+            }
+            if (0) {
+              _cbc_exit0:
+               return retval;
+            }
+            __return_func;
+        });
+
+        void *__environment = __builtin_frame_pointer();
+
+We have a environment pointer which is usually the frame pointer, but
+it is not used here, because this is a closure with a hidden environment.
+Since this closure is usually implemented using trampoline, that is executable code on stack, if execution code on stack is prohibited, it will not work, but it works on Linux and Mac OS X. In case of Windows case, we cannot use closure,
+so we have to assign frame pointer explicitly.  Anyway this can be used like this.
+
+    int main() {
+	goto f(1, __environment, __return );
+	  ....
+    }
+    __code f(int, void *env, __code (*continuation)(int retval_,void *fp)) {
+            goto (continuation)(-1, env);
+    }
+
+In this example, \verb+main+ will return -1. When you want to return to
+the middle of the normal function or code segment, put an extra function
+call over it.
+
+--Data Segment
+
+Data segment is a set of doubly linked fix size block which also
+hashed by the 64bit address. It has $2^n$ size, so it is allocated
+efficiently.
+
+Each site, CPU or cores expected to have separated data segment pool.
+Data segment address is unique in all CPUs. In case of PS3, SPU has
+local storage that is 256Kbytes separate addressing space, which
+local address is different from data segment global address.
+
+Code segment will not use global address directly but it will use
+offset in data segments in its input interface. So we can use
+same code segment both for 64 bit Xeon and SPU 256Kbytes memory.
+This means data segment size is normally limited by its hardware,
+Typically 16Kbytes.
+
+---Data Segment operations
+
+Data segment has several operations,
+
+   get with no global address
+   get with copy
+   get with no copy
+   get with copy with write back
+   get with no copy with write
+
+Allocation/deallocation is not directly handled from its code segment,
+because it is handled by the Task Manager in a pipelined way.
+
+Data segment may contains other data segments' global address, but
+it may invalid. It is a kind of key in a key value store. Consistency
+of data segment global address is maintained by the Cerium task manager.
+
+--Typical usage
+
+A code segment is provided input interface, which contains array of
+data segment in local address space or cache. Usually availability
+of data segment is assured by the task manager. If it is not ready,
+the code segment waits and other ready-to-run code segments are executed.
+
+Loading necessary data segments in the input interfaces are done prior
+to the execution, may be in a back ground of other code segments
+execution.
+
+When all data segments are ready, the cod segment is executed. During
+its execution, next input interfaces may be loading.
+
+After the execution, output interfaces are written into the global
+address if necessary. This is also done in a pipelined way.
+
+---Task dependency and Task array
+
+Cerium task manager has very simple FIFO scheduler. It is sufficient
+if only through put is matter, which is a usual case.
+
+We support task dependency some think like this,
+
+    task_a->wait_for(task_b);
+
+All the task is stored in data segments, and connected wait-for link.
+
+After a task is finished, the task manager solve these dependencies,
+which is a rather heavy task. If tasks are grouped in terms of
+dependencies, we can reduce this phase. This is called task array.
+
+All the data is stored in data segments and it is managed in data segment
+pool in each separate CPU, that is we need no lock in its allocation.
+
+---Task execution
+
+After the loading of input interface, if we have a next task,
+we know where to execute it. It can be passed to the current
+task.
+
+    _code task_a(next_task, interface input, interface output) {
+	.... Task processing
+        goto next_task->code(next_task, next_task->input, next_task->output);
+    }
+
+If we have not task to execute more, we can put mail waiting task in
+the next_task.
+In this way, next_task call address is determined well before the call.
+
+---Data segment deallocation timing
+
+There are two types of data segment.
+
+The one is staying in a main memory
+indefinitely, possibly replicated in more reliable storage hierarchy such
+as SSD or Hard Disks. It's global address is persistent. It is basically
+write only and remain forever in the life of the Internet service. In
+other words, it will never be deallocated.  We can call this a persistent
+data segment.
+
+The other data segment is stayed in local cores. This is limited and
+temporally. It is copied from the persistent data segment. After the
+code segment execution, temporal data segment may be copied into
+persistent data segment.
+
+Task itself don't care about reading and writing race conditions. It have
+be controlled in terms of task dependency or be controlled by the task
+manager.
+
+---Where is the synchronization?
+
+In Cerium Task Manager, a core has a single threaded scheduler. It accept
+an array of task as a data segment as a mail. There is a main task manager,
+which waits mails from schedulers in cores. Synchronization only
+happens in mail transfer among main scheduler and sub schedulers in cores.
+This means synchronization itself can be delayed significantly in this scheme.
+
+If some service needs very fast response, dominating special task is necessary.
+For example, it wait some events using spin locks or hardware interrupts.
+
+
+---Hardware support
+
+This architecture requires explicit cache control. But now a day, most
+architecture has this kind of cache control such as memory barrier.
+Unfortunately these are not standardized.
+Using Cerium task manager, we can hide these differences.
+
+---With Conventional Operating System
+
+Task Manager itself is running in a user space. Since tasks are in
+data segments and it can be transferred to other user spaces, for
+example in other clusters.
+Actually we build our task manager in user space.
+
+There are possible operating system supports for this task manager or
+we can provide memory space management for code segments and data segments.
+
+---Object Orientation
+
+There are many object oriented programming style since Smalltalk-80.
+C++ or Java has an object in fixed memory address. When a
+field of an object is updated, fixed memory contents is updated.
+In case of highly pipelined execution, updating memory contents requires
+synchronization when the object is shared.
+
+In our scheme, usually input interface and output interface point
+different data segment to avoid synchronization.
+
+In ACT3, actor has a become operator. An object is replaced by
+newly created object. This means object has multiple memory address
+according to its update history. In Smalltalk-80, it has object
+table and become operator is a replacement of pointer in the list.
+The list should be kept in a data segment and update by a single
+threaded task.
+
+We can build actor like object oriented system on top of data segment
+pools.
+
+---Verification
+
+Basically pipelined tasks are in fact, series of application of
+tasks on requests. We can simply writes this using iterator.
+In case of word count in a file,
+
+    foreach data segment d in ( file ), out in (partial_result) {
+        task_work_count(d,out);
+    }
+    task_sum_up(partial_result);
+
+If pipeline execution is correct, we don't have to verify the pipeline
+execution, but check the correctness of this sequential execution,
+which is much easier.
+
+Once we get verified sequential execution, we can put checking
+stage on each pipeline stage.
+
+-- Comparison
+
+Our architecture is a variant of SEA, but it can reduce
+synchronization cost. Using data segment copying, shared
+data are reduced. Copying costs are hide using DMA or
+cache management instruction.
+
+Open CL is recently introduced but it has very complex
+data transfer operation. It is an assembler level description
+to achieve best performance. Data segment handling makes it
+simple both in syntactically and in memory management efficiency.
+
+Tasks in Open CL is stored in C strings. In our scheme
+tasks are all written in code segments, which can run on
+any architecture. Actually we can run Cerium both on Mac OS X and
+PS3 Linux using the same code. If it contains an optimized code
+for SPU, we can run a code has the same behavior with non optimized code.
+
+Script language is easy to describe, but it works sequentially
+from the beginning. Python cannot achieve parallel processing
+advantage because of interpreter restriction. Our approach is that
+divide problem into code segments and data segments and execute
+it in an iterator.
+
+This is something like FP, but using data segments, its execution
+becomes suitable for Many core architecture.
+
+KVS, key value store is a distributed database which is separated
+from main service program, In our scheme, management of global
+data segment can be done in a KVS. We can also use our architecture
+to implement a KVS.
+
+In Persistent programming, records and transactions are introduced
+in Programming Languages, but at that time, parallel execution is
+not well considered. In our scheme, data segments behave as records,
+which has several versions.
+
+This architecture uses a new language CbC. It is a lower level
+language of C, but still programmers have to learn it. It is
+very different programming style also. We think it is not so
+easyly accepted by every one.
+
+In structured programming, everything has record like structure,
+which is called container. In this architecture, containers has
+common operations and managed in many core architecture. It is
+also executed in CbC.
+
+There is a conversion  algorithm from C to CbC, so we hope
+some kind of half automatic conversion of sequential implementation
+of the Internet service is possible.
+
+--Conclusions
+
+We are developing SEDA like architecture for software service
+architecture. It has code segment system based on Continuation
+based C and data segment system based on
+Cerium Engine.
+
+Combination of data segment and code segment provides a better
+many core programming than Open CL. It is executed in multi
+stage pipeline. Code segment provides good implementation
+technique of pipeline scheduler.
+
+Data segment copying makes garbage collection unnecessary in
+computation pipeline which cannot avoid in case of GC based
+language such as Java.
+
+Cerium task engine and Continuation based C compiler is
+developed openly and working.
+The combination of code segment and data segment is under construction.
+
+