changeset 2:ccea4e6a1945

add OpenCL example
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Jan 2013 23:19:41 +0900
parents b511640282d2
children f3cfea46e585
files fft_Example/AccelerateError.pdf fft_Example/Error.pdf fft_Example/Makefile fft_Example/OpenCLError.pdf fft_Example/OpenCL_FFT.xcodeproj/project.pbxproj fft_Example/ReadMe.txt fft_Example/clFFT.h fft_Example/fft_base_kernels.cl fft_Example/fft_base_kernels.h fft_Example/fft_execute.cc fft_Example/fft_internal.h fft_Example/fft_kernelstring.cc fft_Example/fft_setup.cc fft_Example/main.cc fft_Example/param.txt fft_Example/param_small.txt fft_Example/procs.h hello_World_Example/.DS_Store hello_World_Example/Makefile hello_World_Example/ReadMe.txt hello_World_Example/hello.cc hello_World_Example/hello.xcodeproj/project.pbxproj parallel_Prefix_Sum_Example/.DS_Store parallel_Prefix_Sum_Example/Makefile parallel_Prefix_Sum_Example/ReadMe.txt parallel_Prefix_Sum_Example/scan.cc parallel_Prefix_Sum_Example/scan.xcodeproj/project.pbxproj parallel_Prefix_Sum_Example/scan_kernel.cl
diffstat 28 files changed, 6162 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
Binary file fft_Example/AccelerateError.pdf has changed
Binary file fft_Example/Error.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/Makefile	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,28 @@
+ifdef BUILD_WITH_ATF
+ATF = -framework ATF
+USE_ATF = -DUSE_ATF
+endif
+
+SRCS = fft_execute.cc fft_setup.cc main.cc fft_kernelstring.cc
+HEADERS = procs.h fft_internal.h fft_base_kernels.h clFFT.h
+TARGET = test_clFFT
+COMPILERFLAGS = -c -g -Wall -Werror -O0
+CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF}
+CC = clang++
+LIBRARIES = -framework OpenCL -framework Accelerate -framework AppKit ${RC_CFLAGS} ${ATF}
+
+OBJECTS = fft_execute.o fft_setup.o main.o fft_kernelstring.o
+TARGETOBJECT =
+all: $(TARGET)
+
+$(OBJECTS): $(SRCS) $(HEADERS)
+	$(CC) $(CFLAGS) $(SRCS)
+
+$(TARGET): $(OBJECTS)
+	$(CC) $(OBJECTS) -o $@ $(LIBRARIES)
+
+clean:
+	rm -f $(TARGET) $(OBJECTS)
+
+.DEFAULT:
+	@echo The target \"$@\" does not exist in Makefile.
Binary file fft_Example/OpenCLError.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/OpenCL_FFT.xcodeproj/project.pbxproj	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,237 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 45;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		BE94A7B3108AB33000C1AD87 /* fft_kernelstring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BE94A7B2108AB33000C1AD87 /* fft_kernelstring.cpp */; };
+		BE94A83D108AF8A100C1AD87 /* fft_setup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BE94A83C108AF8A100C1AD87 /* fft_setup.cpp */; };
+		BEE709AF1097B8DD0017B8A5 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BEE709AE1097B8DD0017B8A5 /* main.cpp */; };
+		BEEA39EE108BD89D00729F49 /* fft_execute.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BEEA39ED108BD89D00729F49 /* fft_execute.cpp */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		8DD76F690486A84900D96B5E /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 8;
+			dstPath = /usr/share/man/man1/;
+			dstSubfolderSpec = 0;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 1;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		8DD76F6C0486A84900D96B5E /* OpenCL_FFT */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = OpenCL_FFT; sourceTree = BUILT_PRODUCTS_DIR; };
+		BE94A7B2108AB33000C1AD87 /* fft_kernelstring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fft_kernelstring.cpp; sourceTree = "<group>"; };
+		BE94A7CB108AB8BF00C1AD87 /* clFFT.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clFFT.h; sourceTree = "<group>"; };
+		BE94A7D4108ABFF000C1AD87 /* fft_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fft_internal.h; sourceTree = "<group>"; };
+		BE94A83C108AF8A100C1AD87 /* fft_setup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fft_setup.cpp; sourceTree = "<group>"; };
+		BE9DE3E010923A4E00940D66 /* fft_base_kernels.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fft_base_kernels.h; sourceTree = "<group>"; };
+		BE9DE4741092732C00940D66 /* param.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = param.txt; sourceTree = "<group>"; };
+		BE9DE4761092732C00940D66 /* procs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = procs.h; sourceTree = "<group>"; };
+		BEE709AE1097B8DD0017B8A5 /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = "<group>"; };
+		BEEA39ED108BD89D00729F49 /* fft_execute.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fft_execute.cpp; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		8DD76F660486A84900D96B5E /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		08FB7794FE84155DC02AAC07 /* OpenCL_FFT */ = {
+			isa = PBXGroup;
+			children = (
+				BE9DE4741092732C00940D66 /* param.txt */,
+				08FB7795FE84155DC02AAC07 /* Source */,
+				C6859E8C029090F304C91782 /* Documentation */,
+				1AB674ADFE9D54B511CA2CBB /* Products */,
+			);
+			name = OpenCL_FFT;
+			sourceTree = "<group>";
+		};
+		08FB7795FE84155DC02AAC07 /* Source */ = {
+			isa = PBXGroup;
+			children = (
+				BE9DE4761092732C00940D66 /* procs.h */,
+				BEEA39ED108BD89D00729F49 /* fft_execute.cpp */,
+				BE94A83C108AF8A100C1AD87 /* fft_setup.cpp */,
+				BEE709AE1097B8DD0017B8A5 /* main.cpp */,
+				BE94A7D4108ABFF000C1AD87 /* fft_internal.h */,
+				BE9DE3E010923A4E00940D66 /* fft_base_kernels.h */,
+				BE94A7CB108AB8BF00C1AD87 /* clFFT.h */,
+				BE94A7B2108AB33000C1AD87 /* fft_kernelstring.cpp */,
+			);
+			name = Source;
+			sourceTree = "<group>";
+		};
+		1AB674ADFE9D54B511CA2CBB /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				8DD76F6C0486A84900D96B5E /* OpenCL_FFT */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		C6859E8C029090F304C91782 /* Documentation */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			name = Documentation;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		8DD76F620486A84900D96B5E /* OpenCL_FFT */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "OpenCL_FFT" */;
+			buildPhases = (
+				8DD76F640486A84900D96B5E /* Sources */,
+				8DD76F660486A84900D96B5E /* Frameworks */,
+				8DD76F690486A84900D96B5E /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = OpenCL_FFT;
+			productInstallPath = "$(HOME)/bin";
+			productName = OpenCL_FFT;
+			productReference = 8DD76F6C0486A84900D96B5E /* OpenCL_FFT */;
+			productType = "com.apple.product-type.tool";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		08FB7793FE84155DC02AAC07 /* Project object */ = {
+			isa = PBXProject;
+			buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "OpenCL_FFT" */;
+			compatibilityVersion = "Xcode 3.1";
+			hasScannedForEncodings = 1;
+			mainGroup = 08FB7794FE84155DC02AAC07 /* OpenCL_FFT */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				8DD76F620486A84900D96B5E /* OpenCL_FFT */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		8DD76F640486A84900D96B5E /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				BE94A7B3108AB33000C1AD87 /* fft_kernelstring.cpp in Sources */,
+				BE94A83D108AF8A100C1AD87 /* fft_setup.cpp in Sources */,
+				BEEA39EE108BD89D00729F49 /* fft_execute.cpp in Sources */,
+				BEE709AF1097B8DD0017B8A5 /* main.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		1DEB923208733DC60010E9CD /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				COPY_PHASE_STRIP = NO;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_ENABLE_FIX_AND_CONTINUE = YES;
+				GCC_MODEL_TUNING = G5;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"_GLIBCXX_DEBUG=1",
+					"_GLIBCXX_DEBUG_PEDANTIC=1",
+				);
+				INSTALL_PATH = /usr/local/bin;
+				PRODUCT_NAME = OpenCL_FFT;
+			};
+			name = Debug;
+		};
+		1DEB923308733DC60010E9CD /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				GCC_MODEL_TUNING = G5;
+				INSTALL_PATH = /usr/local/bin;
+				PRODUCT_NAME = OpenCL_FFT;
+			};
+			name = Release;
+		};
+		1DEB923608733DC60010E9CD /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ARCHS = "$(NATIVE_ARCH_ACTUAL)";
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				OTHER_LDFLAGS = (
+					"-framework",
+					OpenCL,
+					"-framework",
+					Accelerate,
+				);
+				PREBINDING = NO;
+				SDKROOT = "";
+			};
+			name = Debug;
+		};
+		1DEB923708733DC60010E9CD /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ARCHS = "$(NATIVE_ARCH_ACTUAL)";
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				OTHER_LDFLAGS = (
+					"-framework",
+					OpenCL,
+					"-framework",
+					Accelerate,
+				);
+				PREBINDING = NO;
+				SDKROOT = "";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "OpenCL_FFT" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1DEB923208733DC60010E9CD /* Debug */,
+				1DEB923308733DC60010E9CD /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "OpenCL_FFT" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1DEB923608733DC60010E9CD /* Debug */,
+				1DEB923708733DC60010E9CD /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/ReadMe.txt	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,1 @@
+### OpenCL FFT (Fast Fourier Transform) ###

===========================================================================
DESCRIPTION:

This example shows how OpenCL can be used to compute FFT. Algorithm implemented
is described in the following references

1) Fitting FFT onto the G80 Architecture
   by Vasily Volkov and Brian Kazian
   University of California, Berkeley, May 19, 2008
   http://www.cs.berkeley.edu/~kubitron/courses/cs258-S08/projects/reports/project6_report.pdf
   
2) High Performance Discrete Fourier Tansforms on Graphics Processors
   by Naga K. Govindaraju, Brandon Lloyd, Yuri Dotsenko, Burton Smith, and John Manferdelli
   Supercomputing 2008.
   http://portal.acm.org/citation.cfm?id=1413373
   
Current version only supports power of two transform sizes however it should be straight forward
to extend the sample to non-power of two but power of base primes like 3, 5, 7. 

Current version supports 1D, 2D, 3D batched transforms. 

Current version supports both in-place and out-of-place transforms.

Current version supports both forward and inverse transform.

Current version supports both plannar and interleaved data format.

Current version only supports complex-to-complex transform. For real 
transform, one can use plannar data format with imaginary array mem set to zero. 

Current version only supports transform on GPU device. Accelerate framework can be used on CPU.

Current version supports sizes that fits in device global memory although "Twist Kernel" is 
included in fft plan if user wants to virtualize (implement sizes larger than what can fit 
in GPU global memory).

Users can dump all the kernels and global, local dimensions with which these kernels are run 
so that they can not only inspect/modify these kernels and understand how FFT is being 
computed on GPU, but also create their own stand along app for executing FFT of size of
their interest.

For any given signal size n, sample crates a clFFT_Plan, that encapsulates the kernel string, 
associated compiled cl_program. Note that kernel string is generated at runtime based on 
input size, dimension (1D, 2D, 3D) and data format (plannar or interleaved) along with some 
device depended parameters encapsulated in the clFFT_Plan. These device dependent parameters 
are set such that kernel is generated for high performance meeting following requirements

   1) Access pattern to global memory (of-chip DRAM) is such that memory transaction 
      coalesceing is achieved if device supports it thus achieving full bandwidth
   2) Local shuffles (matrix transposes or data sharing among work items of a workgroup)
      are band conflict free if local memory is banked.
   3) Kernel is fully optimized for memory hierarcy meaning that it uses GPU's large 
      vector register file, which is fastest, first before reverting to local memory 
      for data sharing among work items to save global DRAM bandwidth and only then 
      reverts to global memory if signal size is such that transform cannnot be computed
      by singal workgroup and thus require global communation among work groups.
      
Users can modify these parameters to get best performance on their particular GPU.     

Users how really want to understand the details of implementation are highly encouraged 
to read above two references but here is a high level description.
At a higher the algorithm decomposes signal of length N into factors as 

                   N = N1 x N2 x N3 x N4 x .... Nn
                   
where the factors (N1, ....., Nn) are sorted such that N1 is largest. It thus decomposes 
N into n-dimensional matrix. It than applies fft along each dimension, multiply by twiddle
factors and transposes the matrix as follow 

                      N2 x N3 x N4 x ............ x Nn x N1   (fft along N1 and transpose)
                      N3 x N4 x N5 x ....    x Nn x N2 x N1   (fft along N2 and transpose)
                      N4 x N5 x N6 x .. x Nn x N3 x N2 x N1   (fft along N3 and transpose)
                      
                      ......
                     Nn x Nn-1 x Nn-2 x ........ N3 x N2 x N1 (fft along Nn and transpose)
                     
 Decomposition is such that algorithm is fully optimized for memory hierarchy. N1 (largest
 base radix) is constrained by maximum register usage by work item (largest size of in-register 
 fft) and product N2 x N3 .... x Nn determine the maximum size of work group which is constrained
 by local memory used by work group (local memory is used to share data among work items i.e.
 local transposes). Togather these two parameters determine the maximum size fft that can be 
 computed by just using register file and local memory without reverting to global memory 
 for transpose (i.e. these sizes do not require global transpose and thus no inter work group 
 communication). However, for larger sizes, global communication among workgroup is required
 and multiple kernel launches are needed depending on the size and the base radix used. 
 
 For details of parameters user can play with, please see the comments in fft_internal.h
 and kernel_string.cpp, which has the main kernel generator functions ... especially
 see the comments preceeding function getRadixArray and getGlobalRadixInfo.
 User can adjust these parameters you achieve best performance on his device. 

Description of API Calls
=========================
clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );

This function creates a plan and returns a handle to it for use with other functions below. 
context    context in which things are happening
n          n.x, n.y, n.z contain the dimension of signal (length along each dimension)
dim        much be one of clFFT_1D, clFFT_2D, clFFT_3D for one, two or three dimensional fft
dataFormat much be either clFFT_InterleavedComplexFormat or clFFT_SplitComplexFormat for either interleaved or plannar data (real and imaginary)
error_code pointer for getting error back in plan creation. In case of error NULL plan is returned
==========================
void clFFT_DestroyPlan( clFFT_Plan plan );

Function to release/free resources
==========================
cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
								 cl_mem data_in, cl_mem data_out,
								 cl_int num_events, cl_event *event_list, cl_event *event );
								 
Function for interleaved fft execution.
queue      command queue for the device on which fft needs to be executed. It should be present in the context for this plan was created
plan       fft plan that was created using clFFT_CreatePlan
batchSize  size of the batch for batched transform
dir        much be either clFFT_Forward or clFFT_Inverse for forward or inverse transform
data_in    input data
data_out   output data. For in-place transform, pass same mem object for both data_in and data_out
num_events, event_list and event are for future use for letting fft fit in other CL based application pipeline through event dependency.
Not implemented in this version yet so these parameters are redundant right now. Just pass NULL.

=========================
cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
							 cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
							 cl_int num_events, cl_event *event_list, cl_event *event );
							 
Same as above but for plannar data type.							 
=========================
cl_int clFFT_1DTwistInterleaved( clFFT_Plan plan, cl_mem mem, size_t numRows, size_t numCols, size_t startRow, clFFT_Direction dir );

Function for applying twist (twiddle factor multiplication) for virtualizing computation of very large ffts that cannot fit into global
memory at once but can be decomposed into many global memory fitting ffts followed by twiddle multiplication (twist) followed by transpose
followed by again many global memory fitting ffts.

=========================
cl_int clFFT_1DTwistPlanner( clFFT_Plan plan, cl_mem mem_real, cl_mem mem_imag, size_t numRows, size_t numCols, size_t startRow, clFFT_Direction dir );

Same fucntion as above but for plannar data
=========================	

void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);	

Function to dump the plan. Passing stdout to file prints out the plan to standard out. It prints out
the kernel string and local, global dimension with which each kernel is executed in this plan.
						
==================================================================================
IMPORTANT NOTE ON PERFORMANCE:

Currently there are a few known performance issues (bug) that this sample has discovered
in rumtime and code generation that are being actively fixed. Hence, for sizes >= 1024, 
performance is much below the expected peak for any particular size. However, we have 
internally verified that once these bugs are fixed, performance should be on par with 
expected peak. Note that these are bugs in OpenCL runtime/compiler and not in this
sample.

===========================================================================
BUILD REQUIREMENTS:

Mac OS X v10.6 or later

If you are running in Xcode, be sure to pass file name "param.txt". You can do that
by double clicking OpenCL_FFT under executable and then click on Argument tab and 
add ./../../param.txt under "Arguments to be passed on launch" section. 

===========================================================================
RUNTIME REQUIREMENTS:

. Mac OS X v10.6 or later with OpenCL 1.0
. For good performance, device should support local memory. 
  FFT performance critically depend on how efficiently local shuffles 
  (matrix transposes) using local memory to reduce external DRAM bandwidth
  requirement.

===========================================================================
PACKAGING LIST:

AccelerateError.pdf
clFFT.h
Error.pdf
fft_base_kernels.h
fft_execute.cpp
fft_internal.h
fft_kernelstring.cpp
fft_setup.cpp
main.cpp
Makefile
OpenCL_FFT.xcodeproj
OpenCLError.pdf
param.txt
procs.h
ReadMe.txt

===========================================================================
CHANGES FROM PREVIOUS VERSIONS:

Version 1.0
- First version.

===========================================================================
Copyright (C) 2008 Apple Inc. All rights reserved.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/clFFT.h	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,129 @@
+
+//
+// File:       clFFT.h
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CLFFT_H
+#define __CLFFT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <OpenCL/opencl.h>
+#include <stdio.h>
+
+// XForm type
+typedef enum 
+{
+	clFFT_Forward	= 	-1,
+	clFFT_Inverse	= 	 1
+	
+}clFFT_Direction;
+
+// XForm dimension
+typedef enum
+{
+	clFFT_1D	= 0,
+	clFFT_2D	= 1,
+	clFFT_3D	= 3
+	
+}clFFT_Dimension;
+
+// XForm Data type
+typedef enum
+{
+	clFFT_SplitComplexFormat       = 0,
+	clFFT_InterleavedComplexFormat = 1
+}clFFT_DataFormat;
+
+typedef struct
+{
+	unsigned int x;
+	unsigned int y;
+	unsigned int z;
+}clFFT_Dim3;	
+	
+typedef struct
+{
+	float *real;
+	float *imag;
+} clFFT_SplitComplex;
+
+typedef struct
+{
+	float real;
+	float imag;
+}clFFT_Complex;
+
+typedef void* clFFT_Plan;	
+
+clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
+
+void clFFT_DestroyPlan( clFFT_Plan plan );
+
+cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
+								 cl_mem data_in, cl_mem data_out,
+								 cl_int num_events, cl_event *event_list, cl_event *event );
+
+cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
+							 cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
+							 cl_int num_events, cl_event *event_list, cl_event *event );
+
+cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, 
+						        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
+	
+
+cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, 
+					        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
+	
+void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);	
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_base_kernels.cl	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,210 @@
+#ifndef M_PI
+#define M_PI 0x1.921fb54442d18p+1
+#endif
+#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))
+#define conj(a) ((float2)((a).x, -(a).y))
+#define conjTransp(a) ((float2)(-(a).y, (a).x))
+
+#define fftKernel2(a,dir)
+{
+    float2 c = (a)[0];
+    (a)[0] = c + (a)[1];
+    (a)[1] = c - (a)[1];
+}
+
+define fftKernel2S(d1,d2,dir)
+{
+    float2 c = (d1);
+    (d1) = c + (d2);
+    (d2) = c - (d2);
+}
+
+#define fftKernel4(a,dir)
+{
+    fftKernel2S((a)[0], (a)[2], dir);
+    fftKernel2S((a)[1], (a)[3], dir);
+    fftKernel2S((a)[0], (a)[1], dir);
+    (a)[3] = (float2)(dir)*(conjTransp((a)[3]));
+    fftKernel2S((a)[2], (a)[3], dir);
+    float2 c = (a)[1];
+    (a)[1] = (a)[2];
+    (a)[2] = c;
+}
+
+#define fftKernel4s(a0,a1,a2,a3,dir)
+{
+    fftKernel2S((a0), (a2), dir);
+    fftKernel2S((a1), (a3), dir);
+    fftKernel2S((a0), (a1), dir);
+    (a3) = (float2)(dir)*(conjTransp((a3)));
+    fftKernel2S((a2), (a3), dir);
+    float2 c = (a1);
+    (a1) = (a2);
+    (a2) = c;
+}
+
+#define bitreverse8(a)
+{
+    float2 c;
+    c = (a)[1];
+    (a)[1] = (a)[4];
+    (a)[4] = c;
+    c = (a)[3];
+    (a)[3] = (a)[6];
+    (a)[6] = c;
+}
+
+#define fftKernel8(a,dir)
+{
+    const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);
+    const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);
+    float2 c;
+    fftKernel2S((a)[0], (a)[4], dir);
+    fftKernel2S((a)[1], (a)[5], dir);
+    fftKernel2S((a)[2], (a)[6], dir);
+    fftKernel2S((a)[3], (a)[7], dir);
+    (a)[5] = complexMul(w1, (a)[5]);
+    (a)[6] = (float2)(dir)*(conjTransp((a)[6]));
+    (a)[7] = complexMul(w3, (a)[7]);
+    fftKernel2S((a)[0], (a)[2], dir);
+    fftKernel2S((a)[1], (a)[3], dir);
+    fftKernel2S((a)[4], (a)[6], dir);
+    fftKernel2S((a)[5], (a)[7], dir);
+    (a)[3] = (float2)(dir)*(conjTransp((a)[3]));
+    (a)[7] = (float2)(dir)*(conjTransp((a)[7]));
+    fftKernel2S((a)[0], (a)[1], dir);
+    fftKernel2S((a)[2], (a)[3], dir);
+    fftKernel2S((a)[4], (a)[5], dir);
+    fftKernel2S((a)[6], (a)[7], dir);
+    bitreverse8((a));
+}
+
+#define bitreverse4x4(a)
+{
+    float2 c;
+    c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c;
+    c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c;
+    c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c;
+    c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c;
+    c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c;
+    c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c;
+}
+
+#define fftKernel16(a,dir)
+{
+    const float w0 = 0x1.d906bcp-1f;
+    const float w1 = 0x1.87de2ap-2f;
+    const float w2 = 0x1.6a09e6p-1f;
+    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir);
+    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir);
+    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir);
+    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir);
+    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1));
+    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2));
+    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0));
+    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2));
+    (a)[10] = (float2)(dir)*(conjTransp((a)[10]));
+    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2));
+    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0));
+    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2));
+    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1));
+    fftKernel4((a), dir);
+    fftKernel4((a) + 4, dir);
+    fftKernel4((a) + 8, dir);
+    fftKernel4((a) + 12, dir);
+    bitreverse4x4((a));
+}
+
+#define bitreverse32(a)
+{
+    float2 c1, c2;
+    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2;
+    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2;
+    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2;
+    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2;
+    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2;
+    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2;
+}
+
+#define fftKernel32(a,dir)
+{
+    fftKernel2S((a)[0],  (a)[16], dir);
+    fftKernel2S((a)[1],  (a)[17], dir);
+    fftKernel2S((a)[2],  (a)[18], dir);
+    fftKernel2S((a)[3],  (a)[19], dir);
+    fftKernel2S((a)[4],  (a)[20], dir);
+    fftKernel2S((a)[5],  (a)[21], dir);
+    fftKernel2S((a)[6],  (a)[22], dir);
+    fftKernel2S((a)[7],  (a)[23], dir);
+    fftKernel2S((a)[8],  (a)[24], dir);
+    fftKernel2S((a)[9],  (a)[25], dir);
+    fftKernel2S((a)[10], (a)[26], dir);
+    fftKernel2S((a)[11], (a)[27], dir);
+    fftKernel2S((a)[12], (a)[28], dir);
+    fftKernel2S((a)[13], (a)[29], dir);
+    fftKernel2S((a)[14], (a)[30], dir);
+    fftKernel2S((a)[15], (a)[31], dir);
+    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f));
+    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f));
+    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f));
+    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f));
+    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f));
+    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f));
+    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f));
+    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f));
+    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f));
+    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f));
+    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f));
+    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f));
+    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f));
+    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f));
+    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f));
+    fftKernel16((a), dir);
+    fftKernel16((a) + 16, dir);
+    bitreverse32((a));
+}
+__kernel void
+clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir)
+{
+    float2 a, w;
+    float ang;
+    unsigned int j;
+    unsigned int i = get_global_id(0);
+    unsigned int startIndex = i;
+
+    if(i < numCols)
+        {
+            for(j = 0; j < numRowsToProcess; j++)
+                {
+                    a = in[startIndex];
+                    ang = 2.0f * M_PI * dir * i * (startRow + j) / N;
+                    w = (float2)(native_cos(ang), native_sin(ang));
+                    a = complexMul(a, w);
+                    in[startIndex] = a;
+                    startIndex += numCols;
+                }
+        }
+}
+__kernel void
+clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir)
+{
+    float2 a, w;
+    float ang;
+    unsigned int j;
+    unsigned int i = get_global_id(0);
+    unsigned int startIndex = i;
+
+    if(i < numCols)
+        {
+            for(j = 0; j < numRowsToProcess; j++)
+                {
+                    a = (float2)(in_real[startIndex], in_imag[startIndex]);
+                    ang = 2.0f * M_PI * dir * i * (startRow + j) / N;
+                    w = (float2)(native_cos(ang), native_sin(ang));
+                    a = complexMul(a, w);
+                    in_real[startIndex] = a.x;
+                    in_imag[startIndex] = a.y;
+                    startIndex += numCols;
+                }
+        }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_base_kernels.h	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,277 @@
+
+//
+// File:       fft_base_kernels.h
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CL_FFT_BASE_KERNELS_
+#define __CL_FFT_BASE_KERNELS_
+
+#include <string>
+
+using namespace std;
+
+static string baseKernels = string(
+                          "#ifndef M_PI\n"
+						  "#define M_PI 0x1.921fb54442d18p+1\n"
+						  "#endif\n"
+						  "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
+						  "#define conj(a) ((float2)((a).x, -(a).y))\n"
+						  "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"		   
+						  "\n"
+						  "#define fftKernel2(a,dir) \\\n"
+						  "{ \\\n"
+						  "    float2 c = (a)[0];    \\\n"
+						  "    (a)[0] = c + (a)[1];  \\\n"
+						  "    (a)[1] = c - (a)[1];  \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define fftKernel2S(d1,d2,dir) \\\n"
+						  "{ \\\n"
+						  "    float2 c = (d1);   \\\n"
+						  "    (d1) = c + (d2);   \\\n"
+						  "    (d2) = c - (d2);   \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define fftKernel4(a,dir) \\\n"
+						  "{ \\\n"
+						  "    fftKernel2S((a)[0], (a)[2], dir); \\\n"
+						  "    fftKernel2S((a)[1], (a)[3], dir); \\\n"
+						  "    fftKernel2S((a)[0], (a)[1], dir); \\\n"
+						  "    (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
+						  "    fftKernel2S((a)[2], (a)[3], dir); \\\n"
+						  "    float2 c = (a)[1]; \\\n"
+						  "    (a)[1] = (a)[2]; \\\n"
+						  "    (a)[2] = c; \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
+						  "{ \\\n"
+						  "    fftKernel2S((a0), (a2), dir); \\\n"
+						  "    fftKernel2S((a1), (a3), dir); \\\n"
+						  "    fftKernel2S((a0), (a1), dir); \\\n"
+						  "    (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
+						  "    fftKernel2S((a2), (a3), dir); \\\n"
+						  "    float2 c = (a1); \\\n"
+						  "    (a1) = (a2); \\\n"
+						  "    (a2) = c; \\\n" 
+						  "}\n"
+						  "\n"						  
+						  "#define bitreverse8(a) \\\n"
+						  "{ \\\n"
+						  "    float2 c; \\\n"
+						  "    c = (a)[1]; \\\n"
+						  "    (a)[1] = (a)[4]; \\\n"
+						  "    (a)[4] = c; \\\n"
+						  "    c = (a)[3]; \\\n"
+						  "    (a)[3] = (a)[6]; \\\n"
+						  "    (a)[6] = c; \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define fftKernel8(a,dir) \\\n"
+						  "{ \\\n"
+						  "	const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \\\n"
+						  "	const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \\\n"
+						  "	float2 c; \\\n"
+						  "	fftKernel2S((a)[0], (a)[4], dir); \\\n"
+						  "	fftKernel2S((a)[1], (a)[5], dir); \\\n"
+						  "	fftKernel2S((a)[2], (a)[6], dir); \\\n"
+						  "	fftKernel2S((a)[3], (a)[7], dir); \\\n"
+						  "	(a)[5] = complexMul(w1, (a)[5]); \\\n"
+						  "	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
+						  "	(a)[7] = complexMul(w3, (a)[7]); \\\n"
+						  "	fftKernel2S((a)[0], (a)[2], dir); \\\n"
+						  "	fftKernel2S((a)[1], (a)[3], dir); \\\n"
+						  "	fftKernel2S((a)[4], (a)[6], dir); \\\n"
+						  "	fftKernel2S((a)[5], (a)[7], dir); \\\n"
+						  "	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
+						  "	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
+						  "	fftKernel2S((a)[0], (a)[1], dir); \\\n"
+						  "	fftKernel2S((a)[2], (a)[3], dir); \\\n"
+						  "	fftKernel2S((a)[4], (a)[5], dir); \\\n"
+						  "	fftKernel2S((a)[6], (a)[7], dir); \\\n"
+						  "	bitreverse8((a)); \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define bitreverse4x4(a) \\\n"
+						  "{ \\\n"
+						  "	float2 c; \\\n"
+						  "	c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \\\n"
+						  "	c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \\\n"
+						  "	c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \\\n"
+						  "	c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \\\n"
+						  "	c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \\\n"
+						  "	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define fftKernel16(a,dir) \\\n"
+						  "{ \\\n"
+						  "    const float w0 = 0x1.d906bcp-1f; \\\n"
+						  "    const float w1 = 0x1.87de2ap-2f; \\\n"
+						  "    const float w2 = 0x1.6a09e6p-1f; \\\n"
+						  "    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \\\n"
+						  "    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \\\n"
+						  "    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
+						  "    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
+						  "    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
+						  "    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
+						  "    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
+						  "    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
+						  "    (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
+						  "    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
+						  "    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
+						  "    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
+						  "    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
+						  "    fftKernel4((a), dir); \\\n"
+						  "    fftKernel4((a) + 4, dir); \\\n"
+						  "    fftKernel4((a) + 8, dir); \\\n"
+						  "    fftKernel4((a) + 12, dir); \\\n"
+						  "    bitreverse4x4((a)); \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define bitreverse32(a) \\\n"
+						  "{ \\\n"
+						  "    float2 c1, c2; \\\n"
+						  "    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \\\n"
+						  "    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \\\n"
+						  "    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \\\n"
+						  "    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \\\n"
+						  "    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \\\n"
+						  "    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \\\n"
+						  "}\n"
+						  "\n"						  
+						  "#define fftKernel32(a,dir) \\\n"
+						  "{ \\\n"
+						  "    fftKernel2S((a)[0],  (a)[16], dir); \\\n"
+						  "    fftKernel2S((a)[1],  (a)[17], dir); \\\n"
+						  "    fftKernel2S((a)[2],  (a)[18], dir); \\\n"
+						  "    fftKernel2S((a)[3],  (a)[19], dir); \\\n"
+						  "    fftKernel2S((a)[4],  (a)[20], dir); \\\n"
+						  "    fftKernel2S((a)[5],  (a)[21], dir); \\\n"
+						  "    fftKernel2S((a)[6],  (a)[22], dir); \\\n"
+						  "    fftKernel2S((a)[7],  (a)[23], dir); \\\n"
+						  "    fftKernel2S((a)[8],  (a)[24], dir); \\\n"
+						  "    fftKernel2S((a)[9],  (a)[25], dir); \\\n"
+						  "    fftKernel2S((a)[10], (a)[26], dir); \\\n"
+						  "    fftKernel2S((a)[11], (a)[27], dir); \\\n"
+						  "    fftKernel2S((a)[12], (a)[28], dir); \\\n"
+						  "    fftKernel2S((a)[13], (a)[29], dir); \\\n"
+						  "    fftKernel2S((a)[14], (a)[30], dir); \\\n"
+						  "    fftKernel2S((a)[15], (a)[31], dir); \\\n"
+						  "    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
+						  "    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
+						  "    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
+						  "    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
+						  "    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
+						  "    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
+						  "    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
+						  "    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
+						  "    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
+						  "    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
+						  "    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
+						  "    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
+						  "    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
+						  "    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
+						  "    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
+						  "    fftKernel16((a), dir); \\\n"
+						  "    fftKernel16((a) + 16, dir); \\\n"
+						  "    bitreverse32((a)); \\\n"
+						  "}\n\n"
+						  );
+
+static string twistKernelInterleaved = string(
+											  "__kernel void \\\n"
+											  "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
+											  "{ \\\n"
+											  "   float2 a, w; \\\n"
+											  "   float ang; \\\n"
+											  "   unsigned int j; \\\n"
+											  "	unsigned int i = get_global_id(0); \\\n"
+											  "	unsigned int startIndex = i; \\\n"
+											  "	 \\\n"
+											  "	if(i < numCols) \\\n"
+											  "	{ \\\n"
+											  "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
+											  "	    { \\\n"
+											  "	        a = in[startIndex]; \\\n"
+											  "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
+											  "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
+											  "	        a = complexMul(a, w); \\\n"
+											  "	        in[startIndex] = a; \\\n"
+											  "	        startIndex += numCols; \\\n"
+											  "	    } \\\n"
+											  "	}	 \\\n"
+											  "} \\\n"
+											  );
+
+static string twistKernelPlannar = string(
+										  "__kernel void \\\n"
+										  "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
+										  "{ \\\n"
+										  "    float2 a, w; \\\n"
+										  "    float ang; \\\n"
+										  "    unsigned int j; \\\n"
+										  "	unsigned int i = get_global_id(0); \\\n"
+										  "	unsigned int startIndex = i; \\\n"
+										  "	 \\\n"
+										  "	if(i < numCols) \\\n"
+										  "	{ \\\n"
+										  "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
+										  "	    { \\\n"
+										  "	        a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
+										  "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
+										  "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
+										  "	        a = complexMul(a, w); \\\n"
+										  "	        in_real[startIndex] = a.x; \\\n"
+										  "	        in_imag[startIndex] = a.y; \\\n"
+										  "	        startIndex += numCols; \\\n"
+										  "	    } \\\n"
+										  "	}	 \\\n"
+										  "} \\\n"
+										  );										  
+
+
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_execute.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,405 @@
+
+//
+// File:       fft_execute.cpp
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.¬
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "fft_internal.h"
+#include "clFFT.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define max(a,b) (((a)>(b)) ? (a) : (b))
+#define min(a,b) (((a)<(b)) ? (a) : (b))
+
+static cl_int
+allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
+{
+    cl_int err = CL_SUCCESS;
+    if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
+    {
+        plan->last_batch_size = batchSize;
+        size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
+
+        if(plan->tempmemobj)
+            clReleaseMemObject(plan->tempmemobj);
+
+        plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
+    }
+    return err;
+}
+
+static cl_int
+allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
+{
+    cl_int err = CL_SUCCESS;
+    cl_int terr;
+    if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
+    {
+        plan->last_batch_size = batchSize;
+        size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
+
+        if(plan->tempmemobj_real)
+            clReleaseMemObject(plan->tempmemobj_real);
+
+        if(plan->tempmemobj_imag)
+            clReleaseMemObject(plan->tempmemobj_imag);
+
+        plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
+        plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
+        err |= terr;
+    }
+    return err;
+}
+
+void
+getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
+{
+    *lWorkItems = kernelInfo->num_workitems_per_workgroup;
+    int numWorkGroups = kernelInfo->num_workgroups;
+    int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
+
+    switch(kernelInfo->dir)
+    {
+        case cl_fft_kernel_x:
+            *batchSize *= (plan->n.y * plan->n.z);
+            numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
+            numWorkGroups *= kernelInfo->num_workgroups;
+            break;
+        case cl_fft_kernel_y:
+            *batchSize *= plan->n.z;
+            numWorkGroups *= *batchSize;
+            break;
+        case cl_fft_kernel_z:
+            numWorkGroups *= *batchSize;
+            break;
+    }
+
+    *gWorkItems = numWorkGroups * *lWorkItems;
+}
+
+cl_int
+clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
+                         cl_mem data_in, cl_mem data_out,
+                         cl_int num_events, cl_event *event_list, cl_event *event )
+{
+    int s;
+    cl_fft_plan *plan = (cl_fft_plan *) Plan;
+    if(plan->format != clFFT_InterleavedComplexFormat)
+        return CL_INVALID_VALUE;
+
+    cl_int err;
+    size_t gWorkItems, lWorkItems;
+    int inPlaceDone;
+
+    cl_int isInPlace = data_in == data_out ? 1 : 0;
+
+    if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
+        return err;
+
+    cl_mem memObj[3];
+    memObj[0] = data_in;
+    memObj[1] = data_out;
+    memObj[2] = plan->tempmemobj;
+    cl_fft_kernel_info *kernelInfo = plan->kernel_info;
+    int numKernels = plan->num_kernels;
+
+    int numKernelsOdd = numKernels & 1;
+    int currRead  = 0;
+    int currWrite = 1;
+
+    // at least one external dram shuffle (transpose) required
+    if(plan->temp_buffer_needed)
+    {
+        // in-place transform
+        if(isInPlace)
+        {
+            inPlaceDone = 0;
+            currRead  = 1;
+            currWrite = 2;
+        }
+        else
+        {
+            currWrite = (numKernels & 1) ? 1 : 2;
+        }
+
+        while(kernelInfo)
+        {
+            if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
+            {
+                currWrite = currRead;
+                inPlaceDone = 1;
+            }
+
+            s = batchSize;
+            getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+            err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
+            err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
+            err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
+            err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
+
+            err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+            if(err)
+                return err;
+
+            currRead  = (currWrite == 1) ? 1 : 2;
+            currWrite = (currWrite == 1) ? 2 : 1;
+
+            kernelInfo = kernelInfo->next;
+        }
+    }
+    // no dram shuffle (transpose required) transform
+    // all kernels can execute in-place.
+    else {
+
+        while(kernelInfo)
+        {
+            s = batchSize;
+            getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+            err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
+            err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
+            err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
+            err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
+
+            err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+            if(err)
+                return err;
+
+            currRead  = 1;
+            currWrite = 1;
+
+            kernelInfo = kernelInfo->next;
+        }
+    }
+
+    return err;
+}
+
+cl_int
+clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
+                      cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
+                      cl_int num_events, cl_event *event_list, cl_event *event)
+{
+    int s;
+    cl_fft_plan *plan = (cl_fft_plan *) Plan;
+
+    if(plan->format != clFFT_SplitComplexFormat)
+        return CL_INVALID_VALUE;
+
+    cl_int err;
+    size_t gWorkItems, lWorkItems;
+    int inPlaceDone;
+
+    cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
+
+    if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
+        return err;
+
+    cl_mem memObj_real[3];
+    cl_mem memObj_imag[3];
+    memObj_real[0] = data_in_real;
+    memObj_real[1] = data_out_real;
+    memObj_real[2] = plan->tempmemobj_real;
+    memObj_imag[0] = data_in_imag;
+    memObj_imag[1] = data_out_imag;
+    memObj_imag[2] = plan->tempmemobj_imag;
+
+    cl_fft_kernel_info *kernelInfo = plan->kernel_info;
+    int numKernels = plan->num_kernels;
+
+    int numKernelsOdd = numKernels & 1;
+    int currRead  = 0;
+    int currWrite = 1;
+
+    // at least one external dram shuffle (transpose) required
+    if(plan->temp_buffer_needed)
+    {
+        // in-place transform
+        if(isInPlace)
+        {
+            inPlaceDone = 0;
+            currRead  = 1;
+            currWrite = 2;
+        }
+        else
+        {
+            currWrite = (numKernels & 1) ? 1 : 2;
+        }
+
+        while(kernelInfo)
+        {
+            if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
+            {
+                currWrite = currRead;
+                inPlaceDone = 1;
+            }
+
+            s = batchSize;
+            getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+            err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
+            err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
+            err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
+            err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
+            err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
+            err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
+
+            err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+            if(err)
+                return err;
+
+            currRead  = (currWrite == 1) ? 1 : 2;
+            currWrite = (currWrite == 1) ? 2 : 1;
+
+            kernelInfo = kernelInfo->next;
+        }
+    }
+    // no dram shuffle (transpose required) transform
+    else {
+
+        while(kernelInfo)
+        {
+            s = batchSize;
+            getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+            err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
+            err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
+            err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
+            err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
+            err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
+            err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
+            err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+            //err |= clEnqueueTask(queue,kernelInfo->kernel,0,NULL,NULL);
+            if(err)
+                return err;
+
+            currRead  = 1;
+            currWrite = 1;
+
+            kernelInfo = kernelInfo->next;
+        }
+    }
+
+    return err;
+}
+
+cl_int
+clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
+                         unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+{
+    cl_fft_plan *plan = (cl_fft_plan *) Plan;
+
+    unsigned int N = numRows*numCols;
+    unsigned int nCols = numCols;
+    unsigned int sRow = startRow;
+    unsigned int rToProcess = rowsToProcess;
+    int d = dir;
+    int err = 0;
+
+    cl_device_id device_id;
+    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
+    if(err)
+        return err;
+
+    size_t gSize;
+    err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
+    if(err)
+        return err;
+
+    gSize = min(128, gSize);
+    size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
+    size_t numLocalThreads[1]  = { gSize };
+
+    err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
+    err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
+    err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
+    err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
+    err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
+    err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
+
+    err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
+
+    return err;
+}
+
+cl_int
+clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
+                     unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+{
+    cl_fft_plan *plan = (cl_fft_plan *) Plan;
+
+    unsigned int N = numRows*numCols;
+    unsigned int nCols = numCols;
+    unsigned int sRow = startRow;
+    unsigned int rToProcess = rowsToProcess;
+    int d = dir;
+    int err = 0;
+
+    cl_device_id device_id;
+    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
+    if(err)
+        return err;
+
+    size_t gSize;
+    err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
+    if(err)
+        return err;
+
+    gSize = min(128, gSize);
+    size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
+    size_t numLocalThreads[1]  = { gSize };
+
+    err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
+    err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
+    err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
+    err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
+    err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
+    err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
+    err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
+
+    err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
+
+    return err;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_internal.h	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,163 @@
+
+//
+// File:       fft_internal.h
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CLFFT_INTERNAL_H
+#define __CLFFT_INTERNAL_H
+
+#include "clFFT.h"
+#include <iostream>
+#include <string>
+#include <sstream>
+
+using namespace std;
+
+typedef enum kernel_dir_t
+{
+	cl_fft_kernel_x,
+	cl_fft_kernel_y,
+	cl_fft_kernel_z
+}cl_fft_kernel_dir;
+
+typedef struct kernel_info_t
+{
+	cl_kernel kernel;
+	char *kernel_name;
+	unsigned lmem_size;
+	unsigned num_workgroups;
+    unsigned num_xforms_per_workgroup;
+	unsigned num_workitems_per_workgroup;
+	cl_fft_kernel_dir dir;
+	int in_place_possible;
+	kernel_info_t *next;
+}cl_fft_kernel_info;
+
+typedef struct 
+{
+	// context in which fft resources are created and kernels are executed
+	cl_context              context;
+	
+	// size of signal
+	clFFT_Dim3              n;
+	
+	// dimension of transform ... must be either 1D, 2D or 3D
+	clFFT_Dimension			dim;
+	
+	// data format ... must be either interleaved or plannar
+	clFFT_DataFormat		format;
+	
+	// string containing kernel source. Generated at runtime based on
+	// n, dim, format and other parameters
+	string                  *kernel_string;
+	
+	// CL program containing source and kernel this particular 
+	// n, dim, data format
+	cl_program				program;
+	
+	// linked list of kernels which needs to be executed for this fft
+	cl_fft_kernel_info		*kernel_info;
+	
+	// number of kernels
+	int                     num_kernels;
+	
+	// twist kernel for virtualizing fft of very large sizes that do not
+	// fit in GPU global memory
+	cl_kernel				twist_kernel;
+	
+	// flag indicating if temporary intermediate buffer is needed or not.
+	// this depends on fft kernels being executed and if transform is 
+	// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... 
+	// one that does not require global transpose do not need temporary buffer)
+	// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
+	// If temp buffer is needed, its allocation is lazy i.e. its not allocated
+	// until its needed
+	cl_int                  temp_buffer_needed;
+	
+	// Batch size is runtime parameter and size of temporary buffer (if needed)
+	// depends on batch size. Allocation of temporary buffer is lazy i.e. its
+	// only created when needed. Once its created at first call of clFFT_Executexxx
+	// it is not allocated next time if next time clFFT_Executexxx is called with 
+	// batch size different than the first call. last_batch_size caches the last
+	// batch size with which this plan is used so that we dont keep allocating/deallocating
+	// temp buffer if same batch size is used again and again.
+	unsigned                  last_batch_size;
+	
+	// temporary buffer for interleaved plan
+	cl_mem   				tempmemobj;
+	
+	// temporary buffer for planner plan. Only one of tempmemobj or 
+	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending 
+	// data format of plan (plannar or interleaved)
+	cl_mem                  tempmemobj_real, tempmemobj_imag;
+	
+	// Maximum size of signal for which local memory transposed based
+	// fft is sufficient i.e. no global mem transpose (communication)
+	// is needed
+	unsigned					max_localmem_fft_size;
+	
+	// Maximum work items per work group allowed. This, along with max_radix below controls 
+	// maximum local memory being used by fft kernels of this plan. Set to 256 by default
+	unsigned                  max_work_item_per_workgroup;
+	
+	// Maximum base radix for local memory fft ... this controls the maximum register 
+	// space used by work items. Currently defaults to 16
+	unsigned                  max_radix;
+	
+	// Device depended parameter that tells how many work-items need to be read consecutive
+	// values to make sure global memory access by work-items of a work-group result in 
+	// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
+	unsigned                  min_mem_coalesce_width;
+	
+	// Number of local memory banks. This is used to geneate kernel with local memory 
+	// transposes with appropriate padding to avoid bank conflicts to local memory
+	// e.g. on NVidia it is 16.
+	unsigned                  num_local_mem_banks;
+}cl_fft_plan;
+
+void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
+
+#endif  
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_kernelstring.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,1256 @@
+
+//
+// File:       fft_kernelstring.cpp
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <assert.h>
+#include "fft_internal.h"
+#include "clFFT.h"
+
+using namespace std;
+
+#define max(A,B) ((A) > (B) ? (A) : (B))
+#define min(A,B) ((A) < (B) ? (A) : (B))
+
+static string 
+num2str(int num)
+{
+	char temp[200];
+	sprintf(temp, "%d", num);
+	return string(temp);
+}
+
+// For any n, this function decomposes n into factors for loacal memory tranpose 
+// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
+// is the largest. This base radix determines the number of registers used by each
+// work item and product of remaining radices determine the size of work group needed.
+// To make things concrete with and example, suppose n = 1024. It is decomposed into
+// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and 
+// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
+// 16 ffts (64 work items working in parallel) following by transpose using local 
+// memory followed by again 64 length 16 ffts followed by transpose using local memory
+// followed by 256 length 4 ffts. For the last step since with size of work group is 
+// 64 and each work item can array for 16 values, 64 work items can compute 256 length
+// 4 ffts by each work item computing 4 length 4 ffts. 
+// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
+// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
+// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
+// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
+// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
+// for the last step, each work item computes two length 4 in-register ffts and thus
+// 256 work items are needed to compute all 512 ffts. 
+// For n = 32 = 8 x 4, 4 work items first compute 4 in-register 
+// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
+// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
+// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, 
+// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). 
+// Users can play with these parameters to figure what gives best performance on
+// their particular device i.e. some device have less register space thus using
+// smaller base radix can avoid spilling ... some has small local memory thus 
+// using smaller work group size may be required etc
+
+static void 
+getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix)
+{
+    if(maxRadix > 1)
+    {
+        maxRadix = min(n, maxRadix);
+        unsigned int cnt = 0;
+        while(n > maxRadix)
+        {
+            radixArray[cnt++] = maxRadix;
+            n /= maxRadix;
+        }
+        radixArray[cnt++] = n;
+        *numRadices = cnt;
+        return;
+    }
+
+	switch(n) 
+	{
+		case 2:
+			*numRadices = 1;
+			radixArray[0] = 2;
+			break;
+			
+		case 4:
+			*numRadices = 1;
+			radixArray[0] = 4;
+			break;
+			
+		case 8:
+			*numRadices = 1;
+			radixArray[0] = 8;
+			break;
+			
+		case 16:
+			*numRadices = 2;
+			radixArray[0] = 8; radixArray[1] = 2; 
+			break;
+			
+		case 32:
+			*numRadices = 2;
+			radixArray[0] = 8; radixArray[1] = 4;
+			break;
+			
+		case 64:
+			*numRadices = 2;
+			radixArray[0] = 8; radixArray[1] = 8;
+			break;
+			
+		case 128:
+			*numRadices = 3;
+			radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4;
+			break;
+			
+		case 256:
+			*numRadices = 4;
+			radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4;
+			break;
+			
+		case 512:
+			*numRadices = 3;
+			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8;
+			break;			
+			
+		case 1024:
+			*numRadices = 3;
+			radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4;
+			break;	
+		case 2048:
+			*numRadices = 4;
+			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4;
+			break;
+		default:
+			*numRadices = 0;
+			return;
+	}
+}
+
+static void
+insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
+{
+	if(dataFormat == clFFT_SplitComplexFormat) 
+		kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
+	else 
+		kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
+}
+
+static void 
+insertVariables(string &kStream, int maxRadix)
+{
+	kStream += string("    int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
+    kStream += string("    int s, ii, jj, offset;\n");
+	kStream += string("    float2 w;\n");
+	kStream += string("    float ang, angf, ang1;\n");
+    kStream += string("    __local float *lMemStore, *lMemLoad;\n");
+    kStream += string("    float2 a[") +  num2str(maxRadix) + string("];\n");
+    kStream += string("    int lId = get_local_id( 0 );\n");
+    kStream += string("    int groupId = get_group_id( 0 );\n");
+}
+
+static void
+formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
+{
+	if(dataFormat == clFFT_InterleavedComplexFormat)
+		kernelString += string("        a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
+	else
+	{
+		kernelString += string("        a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
+		kernelString += string("        a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n");
+	}
+}
+
+static void
+formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
+{
+	if(dataFormat == clFFT_InterleavedComplexFormat)
+		kernelString += string("        out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
+	else
+	{
+		kernelString += string("        out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
+		kernelString += string("        out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n");
+	}
+}
+
+static int
+insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
+{
+	int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
+	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
+	int i, j;
+	int lMemSize = 0;
+	
+	if(numXFormsPerWG > 1)
+	    kernelString += string("        s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
+	
+    if(numWorkItemsPerXForm >= mem_coalesce_width)
+    {   		
+		if(numXFormsPerWG > 1)
+		{
+            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
+            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
+            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
+			kernelString += string("        offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
+			if(dataFormat == clFFT_InterleavedComplexFormat)
+			{
+			    kernelString += string("        in += offset;\n");
+			    kernelString += string("        out += offset;\n");
+			}
+			else
+			{
+			    kernelString += string("        in_real += offset;\n");
+				kernelString += string("        in_imag += offset;\n");
+			    kernelString += string("        out_real += offset;\n");
+				kernelString += string("        out_imag += offset;\n");
+			}
+			for(i = 0; i < R0; i++)
+				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
+			kernelString += string("    }\n");
+		}
+		else
+		{
+			kernelString += string("    ii = lId;\n");
+			kernelString += string("    jj = 0;\n");
+			kernelString += string("    offset =  mad24(groupId, ") + num2str(N) + string(", ii);\n");
+			if(dataFormat == clFFT_InterleavedComplexFormat)
+			{
+			    kernelString += string("        in += offset;\n");
+			    kernelString += string("        out += offset;\n");
+			}
+			else
+			{
+			    kernelString += string("        in_real += offset;\n");
+				kernelString += string("        in_imag += offset;\n");
+			    kernelString += string("        out_real += offset;\n");
+				kernelString += string("        out_imag += offset;\n");
+			}
+			for(i = 0; i < R0; i++)
+				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
+		}
+    }
+    else if( N >= mem_coalesce_width )
+    {
+        int numInnerIter = N / mem_coalesce_width;
+        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
+		
+        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
+        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
+        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+        kernelString += string("    offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
+        kernelString += string("    offset = mad24( offset, ") + num2str(N) + string(", ii );\n");
+		if(dataFormat == clFFT_InterleavedComplexFormat)
+		{
+			kernelString += string("        in += offset;\n");
+			kernelString += string("        out += offset;\n");
+		}
+		else
+		{
+			kernelString += string("        in_real += offset;\n");
+			kernelString += string("        in_imag += offset;\n");
+			kernelString += string("        out_real += offset;\n");
+			kernelString += string("        out_imag += offset;\n");
+		}
+        
+		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+        for(i = 0; i < numOuterIter; i++ )
+        {
+            kernelString += string("    if( jj < s ) {\n");
+			for(j = 0; j < numInnerIter; j++ ) 
+				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
+			kernelString += string("    }\n"); 
+			if(i != numOuterIter - 1)
+			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");			 
+        }
+		kernelString += string("}\n ");
+		kernelString += string("else {\n");
+        for(i = 0; i < numOuterIter; i++ )
+        {
+			for(j = 0; j < numInnerIter; j++ ) 
+				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);			
+        }		
+		kernelString += string("}\n");
+        
+		kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
+		kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
+        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");  
+		
+        for( i = 0; i < numOuterIter; i++ )
+		{
+			for( j = 0; j < numInnerIter; j++ )
+			{	
+				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + 
+				                num2str(i * numInnerIter + j) + string("].x;\n");
+			}
+		}	
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+        
+        for( i = 0; i < R0; i++ )
+			kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");            
+		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");  
+
+	    for( i = 0; i < numOuterIter; i++ )
+		{
+			for( j = 0; j < numInnerIter; j++ )
+			{	
+				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + 
+								num2str(i * numInnerIter + j) + string("].y;\n");
+			}
+	    }	
+		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+																						   
+		for( i = 0; i < R0; i++ )
+			kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");            
+		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");  
+		
+		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+    }  
+    else
+    {
+        kernelString += string("    offset = mad24( groupId,  ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
+		if(dataFormat == clFFT_InterleavedComplexFormat)
+		{
+			kernelString += string("        in += offset;\n");
+			kernelString += string("        out += offset;\n");
+		}
+		else
+		{
+			kernelString += string("        in_real += offset;\n");
+			kernelString += string("        in_imag += offset;\n");
+			kernelString += string("        out_real += offset;\n");
+			kernelString += string("        out_imag += offset;\n");
+		}
+        
+        kernelString += string("    ii = lId & ") + num2str(N-1) + string(";\n");
+        kernelString += string("    jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
+        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+        
+		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+        for( i = 0; i < R0; i++ )
+        {
+            kernelString += string("    if(jj < s )\n");
+			formattedLoad(kernelString, i, i*groupSize, dataFormat);
+			if(i != R0 - 1)
+			    kernelString += string("    jj += ") + num2str(groupSize / N) + string(";\n");
+        }
+		kernelString += string("}\n");
+		kernelString += string("else {\n");
+        for( i = 0; i < R0; i++ )
+        {
+			formattedLoad(kernelString, i, i*groupSize, dataFormat);
+        }		
+		kernelString += string("}\n");
+        
+		if(numWorkItemsPerXForm > 1)
+		{
+            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
+            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
+            kernelString += string("    lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); 
+		}
+		else 
+		{
+            kernelString += string("    ii = 0;\n");
+            kernelString += string("    jj = lId;\n");
+            kernelString += string("    lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); 			
+		}
+
+		
+        for( i = 0; i < R0; i++ )
+            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); 
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
+        
+        for( i = 0; i < R0; i++ )
+            kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
+		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+        
+        for( i = 0; i < R0; i++ )
+            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); 
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
+        
+        for( i = 0; i < R0; i++ )
+            kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
+		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+		
+		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+    }
+	
+	return lMemSize;
+}
+
+static int
+insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
+{
+	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
+	int i, j, k, ind;
+	int lMemSize = 0;
+	int numIter = maxRadix / Nr;
+	string indent = string("");
+	
+    if( numWorkItemsPerXForm >= mem_coalesce_width )
+    {   
+		if(numXFormsPerWG > 1)
+		{
+            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
+			indent = string("    ");
+		}	
+		for(i = 0; i < maxRadix; i++) 
+		{
+			j = i % numIter;
+			k = i / numIter;
+			ind = j * Nr + k;
+			formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat);
+		}
+		if(numXFormsPerWG > 1)
+		    kernelString += string("    }\n");
+    }
+    else if( N >= mem_coalesce_width )
+    {
+        int numInnerIter = N / mem_coalesce_width;
+        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
+		
+        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");  
+        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
+        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
+        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+		
+        for( i = 0; i < maxRadix; i++ )
+		{
+			j = i % numIter;
+			k = i / numIter;
+			ind = j * Nr + k;
+            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");            
+		}	
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");         
+		
+        for( i = 0; i < numOuterIter; i++ )
+			for( j = 0; j < numInnerIter; j++ )
+				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+		
+        for( i = 0; i < maxRadix; i++ )
+		{
+			j = i % numIter;
+			k = i / numIter;
+			ind = j * Nr + k;
+            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");            
+		}	
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");         
+		
+        for( i = 0; i < numOuterIter; i++ )
+			for( j = 0; j < numInnerIter; j++ )
+				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
+		
+		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+		for(i = 0; i < numOuterIter; i++ )
+        {
+            kernelString += string("    if( jj < s ) {\n");
+			for(j = 0; j < numInnerIter; j++ ) 
+				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); 
+			kernelString += string("    }\n"); 
+			if(i != numOuterIter - 1)
+			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");			 
+        }
+		kernelString += string("}\n");
+		kernelString += string("else {\n");
+		for(i = 0; i < numOuterIter; i++ )
+        {
+			for(j = 0; j < numInnerIter; j++ ) 
+				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); 
+        }		
+		kernelString += string("}\n");
+		
+		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+	}   	
+    else
+    {   
+        kernelString += string("    lMemLoad  = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");  
+        
+		kernelString += string("    ii = lId & ") + num2str(N - 1) + string(";\n");
+        kernelString += string("    jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
+        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+        
+        for( i = 0; i < maxRadix; i++ )
+		{
+			j = i % numIter;
+			k = i / numIter;
+			ind = j * Nr + k;
+            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
+		}	
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+        
+        for( i = 0; i < maxRadix; i++ )
+            kernelString += string("    a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); 
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
+        
+        for( i = 0; i < maxRadix; i++ )
+		{
+			j = i % numIter;
+			k = i / numIter;
+			ind = j * Nr + k;
+            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
+		}	
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
+        
+        for( i = 0; i < maxRadix; i++ )
+            kernelString += string("    a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); 
+        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
+        
+		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+		for( i = 0; i < maxRadix; i++ )
+        {
+            kernelString += string("    if(jj < s ) {\n");
+			formattedStore(kernelString, i, i*groupSize, dataFormat);
+			kernelString += string("    }\n");
+			if( i != maxRadix - 1)
+				kernelString += string("    jj +=") + num2str(groupSize / N) + string(";\n");
+        } 
+		kernelString += string("}\n");
+		kernelString += string("else {\n");
+		for( i = 0; i < maxRadix; i++ )
+        {
+			formattedStore(kernelString, i, i*groupSize, dataFormat);
+        } 		
+		kernelString += string("}\n");
+		
+		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+    }
+	
+	return lMemSize;
+}
+
+static void 
+insertfftKernel(string &kernelString, int Nr, int numIter)
+{
+	int i;
+	for(i = 0; i < numIter; i++) 
+	{
+		kernelString += string("    fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n");
+	}
+}
+
+static void
+insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
+{
+	int z, k;
+	int logNPrev = (int)log2(Nprev);
+	
+	for(z = 0; z < numIter; z++) 
+	{
+		if(z == 0)
+		{
+			if(Nprev > 1)
+			    kernelString += string("    angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
+			else
+				kernelString += string("    angf = (float) ii;\n");
+		}	
+		else
+		{
+			if(Nprev > 1)
+			    kernelString += string("    angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); 
+			else
+				kernelString += string("    angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n");
+		}	
+	
+		for(k = 1; k < Nr; k++) {
+			int ind = z*Nr + k;
+			//float fac =  (float) (2.0 * M_PI * (double) k / (double) len);
+			kernelString += string("    ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n");
+			kernelString += string("    w = (float2)(native_cos(ang), native_sin(ang));\n");
+			kernelString += string("    a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n");
+		}
+	}
+}
+
+static int
+getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
+{
+	if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
+		*offset = 0;
+	else {
+		int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
+		int numColsReq = 1;
+		if(numRowsReq > Nr)
+			numColsReq = numRowsReq / Nr;
+		numColsReq = Nprev * numColsReq;
+		*offset = numColsReq;
+	}
+	
+	if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
+		*midPad = 0;
+	else {
+		int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1);
+		if( bankNum >= numWorkItemsPerXForm )
+			*midPad = 0;
+		else
+			*midPad = numWorkItemsPerXForm - bankNum;
+	}
+	
+	int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
+	return lMemSize;
+}
+
+
+static void 
+insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
+{
+	int z, k;
+
+	for(z = 0; z < numIter; z++) {
+		for(k = 0; k < Nr; k++) {
+			int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm;
+			kernelString += string("    lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n");
+		}
+	}
+	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
+}
+
+static void 
+insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
+{
+	int numWorkItemsReqN = n / Nrn;										
+	int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 );			
+	int interBlockHStride = numWorkItemsPerXForm;							
+	int vertWidth = max(numWorkItemsPerXForm / Nprev, 1);					
+	vertWidth = min( vertWidth, Nr);									
+	int vertNum = Nr / vertWidth;										
+	int vertStride = ( n / Nr + offset ) * vertWidth;					
+	int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1);
+	int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1;
+	intraBlockHStride *= Nprev;
+	
+	int stride = numWorkItemsReq / Nrn;									
+	int i;
+	for(i = 0; i < iter; i++) {
+		int ii = i / (interBlockHNum * vertNum);
+		int zz = i % (interBlockHNum * vertNum);
+		int jj = zz % interBlockHNum;
+		int kk = zz / interBlockHNum;
+		int z;
+		for(z = 0; z < Nrn; z++) {
+			int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
+			kernelString += string("    a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
+		}
+	}
+	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
+}
+
+static void
+insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
+{	
+	int Ncurr = Nprev * Nr;
+	int logNcurr = (int)log2(Ncurr);
+	int logNprev = (int)log2(Nprev);
+	int incr = (numWorkItemsReq + offset) * Nr + midPad;
+	
+	if(Ncurr < numWorkItemsPerXForm) 
+	{
+		if(Nprev == 1)
+		    kernelString += string("    j = ii & ") + num2str(Ncurr - 1) + string(";\n");
+		else
+			kernelString += string("    j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
+		
+		if(Nprev == 1) 
+			kernelString += string("    i = ii >> ") + num2str(logNcurr) + string(";\n");
+		else 
+			kernelString += string("    i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); 
+	}	
+	else 
+	{
+		if(Nprev == 1)
+		    kernelString += string("    j = ii;\n");
+		else
+			kernelString += string("    j = ii >> ") + num2str(logNprev) + string(";\n");
+		if(Nprev == 1) 
+			kernelString += string("    i = 0;\n"); 
+		else 
+			kernelString += string("    i = ii & ") + num2str(Nprev - 1) + string(";\n");
+	}
+
+    if(numXFormsPerWG > 1)
+        kernelString += string("    i = mad24(jj, ") + num2str(incr) + string(", i);\n");		
+
+    kernelString += string("    lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); 
+}
+
+static void
+insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
+{
+	if(numXFormsPerWG == 1) {
+		kernelString += string("    lMemStore = sMem + ii;\n");		
+	}
+	else {
+		kernelString += string("    lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n");	
+	}
+}
+
+
+static void
+createLocalMemfftKernelString(cl_fft_plan *plan)
+{
+	unsigned int radixArray[10];
+	unsigned int numRadix;
+	 
+	unsigned int n = plan->n.x;
+	
+	assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n");
+	
+	getRadixArray(n, radixArray, &numRadix, 0);
+	assert(numRadix > 0 && "no radix array supplied\n");
+	
+	if(n/radixArray[0] > plan->max_work_item_per_workgroup)
+	    getRadixArray(n, radixArray, &numRadix, plan->max_radix);
+
+	assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
+	assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
+	
+	unsigned int tmpLen = 1;
+	unsigned int i;
+	for(i = 0; i < numRadix; i++)
+	{	
+		assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) );
+	    tmpLen *= radixArray[i];
+	}
+	assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n");
+	
+	int offset, midPad;
+	string localString(""), kernelName("");
+	
+	clFFT_DataFormat dataFormat = plan->format;
+	string *kernelString = plan->kernel_string;
+	
+	
+	cl_fft_kernel_info **kInfo = &plan->kernel_info;
+	int kCount = 0;
+	
+	while(*kInfo)
+	{
+		kInfo = &(*kInfo)->next;
+		kCount++;
+	}
+	
+	kernelName = string("fft") + num2str(kCount);
+	
+	*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
+	(*kInfo)->kernel = 0;
+	(*kInfo)->lmem_size = 0;
+	(*kInfo)->num_workgroups = 0;
+	(*kInfo)->num_workitems_per_workgroup = 0;
+	(*kInfo)->dir = cl_fft_kernel_x;
+	(*kInfo)->in_place_possible = 1;
+	(*kInfo)->next = NULL;
+	(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
+	strcpy((*kInfo)->kernel_name, kernelName.c_str());
+	
+	unsigned int numWorkItemsPerXForm = n / radixArray[0];
+	unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; 
+	assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup);
+	int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
+	(*kInfo)->num_workgroups = 1;
+    (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG;
+	(*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG;
+	
+	unsigned int *N = radixArray;
+	unsigned int maxRadix = N[0];
+	unsigned int lMemSize = 0;
+		
+	insertVariables(localString, maxRadix);
+	
+	lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
+	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
+	
+	string xcomp = string("x");
+	string ycomp = string("y");
+	
+	unsigned int Nprev = 1;
+	unsigned int len = n;
+	unsigned int r;
+	for(r = 0; r < numRadix; r++) 
+	{
+		int numIter = N[0] / N[r];
+		int numWorkItemsReq = n / N[r];
+		int Ncurr = Nprev * N[r];
+		insertfftKernel(localString, N[r], numIter);
+		
+		if(r < (numRadix - 1)) {
+			insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
+			lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad);
+			(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
+			insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad);
+			insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad);
+			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
+			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
+			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
+			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
+			Nprev = Ncurr;
+			len = len / N[r];
+		}
+	}
+	
+	lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat);
+	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
+	
+	insertHeader(*kernelString, kernelName, dataFormat);
+	*kernelString += string("{\n");
+	if((*kInfo)->lmem_size)
+        *kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
+	*kernelString += localString;
+	*kernelString += string("}\n");
+}
+
+// For n larger than what can be computed using local memory fft, global transposes
+// multiple kernel launces is needed. For these sizes, n can be decomposed using
+// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
+// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
+// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. 
+// Each of these base radices can futher be divided into factors so that each of these 
+// base ffts can be computed within one kernel launch using in-register ffts and local 
+// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length 
+// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length 
+// 16 ffts followed by transpose using local memory followed by each of these eight 
+// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This 
+// means only 8 work items are needed for computing one length 128 fft. If we choose
+// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
+// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this 
+// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each 
+// work group where each work group is computing 8 length 128 ffts where each length
+// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
+// in this example. Users can play with difference base radices and difference 
+// decompositions of base radices to generates different kernels and see which gives
+// best performance. Following function is just fixed to use 128 as base radix
+
+void
+getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
+{
+	int baseRadix = min(n, 128);
+	
+	int numR = 0;
+	int N = n;
+	while(N > baseRadix) 
+	{
+		N /= baseRadix;
+		numR++;
+	}
+	
+	for(int i = 0; i < numR; i++)
+		radix[i] = baseRadix;
+	
+	radix[numR] = N;
+	numR++;
+	*numRadices = numR;
+		
+	for(int i = 0; i < numR; i++)
+	{
+		int B = radix[i];
+		if(B <= 8)
+		{
+			R1[i] = B;
+			R2[i] = 1;
+			continue;
+		}
+		
+		int r1 = 2; 
+		int r2 = B / r1;
+	    while(r2 > r1)
+	    {
+		   r1 *=2;
+		   r2 = B / r1;
+	    }
+		R1[i] = r1;
+		R2[i] = r2;
+	}	
+}
+
+static void
+createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
+{		
+	int i, j, k, t;
+	int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	int radix, R1, R2;
+	int numRadices;
+	
+	int maxThreadsPerBlock = plan->max_work_item_per_workgroup;
+	int maxArrayLen = plan->max_radix;
+	int batchSize = plan->min_mem_coalesce_width;	
+	clFFT_DataFormat dataFormat = plan->format;
+	int vertical = (dir == cl_fft_kernel_x) ? 0 : 1;	
+	
+	getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices);
+		
+	int numPasses = numRadices;
+	
+	string localString(""), kernelName("");
+	string *kernelString = plan->kernel_string;
+	cl_fft_kernel_info **kInfo = &plan->kernel_info; 
+	int kCount = 0;
+	
+	while(*kInfo)
+	{
+		kInfo = &(*kInfo)->next;
+		kCount++;
+	}
+	
+	int N = n;
+	int m = (int)log2(n);
+	int Rinit = vertical ? BS : 1;
+	batchSize = vertical ? min(BS, batchSize) : batchSize;
+	int passNum;
+	
+	for(passNum = 0; passNum < numPasses; passNum++) 
+	{
+		
+		localString.clear();
+		kernelName.clear();
+		
+		radix = radixArr[passNum];
+		R1 = R1Arr[passNum];
+		R2 = R2Arr[passNum];
+				
+		int strideI = Rinit;
+		for(i = 0; i < numPasses; i++)
+			if(i != passNum)
+				strideI *= radixArr[i];
+		
+		int strideO = Rinit;
+		for(i = 0; i < passNum; i++)
+			strideO *= radixArr[i];
+		
+		int threadsPerXForm = R2;
+		batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
+		batchSize = min(batchSize, strideI);
+		int threadsPerBlock = batchSize * threadsPerXForm;
+		threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock);
+		batchSize = threadsPerBlock / threadsPerXForm;
+		assert(R2 <= R1);
+		assert(R1*R2 == radix);
+		assert(R1 <= maxArrayLen);
+		assert(threadsPerBlock <= maxThreadsPerBlock);
+		
+		int numIter = R1 / R2;
+		int gInInc = threadsPerBlock / batchSize;
+		
+		
+		int lgStrideO = (int)log2(strideO);
+		int numBlocksPerXForm = strideI / batchSize;
+		int numBlocks = numBlocksPerXForm;
+		if(!vertical)
+			numBlocks *= BS;
+		else
+			numBlocks *= vertBS;
+		
+		kernelName = string("fft") + num2str(kCount);
+		*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
+		(*kInfo)->kernel = 0;
+		if(R2 == 1)
+			(*kInfo)->lmem_size = 0;
+		else
+		{
+		    if(strideO == 1)
+		        (*kInfo)->lmem_size = (radix + 1)*batchSize;
+		    else
+			    (*kInfo)->lmem_size = threadsPerBlock*R1;
+		}
+		(*kInfo)->num_workgroups = numBlocks;
+        (*kInfo)->num_xforms_per_workgroup = 1;
+		(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
+		(*kInfo)->dir = dir;
+		if( (passNum == (numPasses - 1)) && (numPasses & 1) )
+		    (*kInfo)->in_place_possible = 1;
+		else
+			(*kInfo)->in_place_possible = 0;
+		(*kInfo)->next = NULL;
+		(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
+		strcpy((*kInfo)->kernel_name, kernelName.c_str());
+		
+		insertVariables(localString, R1);
+						
+		if(vertical) 
+		{
+			localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
+			localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
+			localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
+			localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n");
+			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
+			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
+			int stride = radix*Rinit;
+			for(i = 0; i < passNum; i++)
+				stride *= radixArr[i];
+			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
+			localString += string("bNum = groupId;\n");
+		}
+		else 
+		{
+			int lgNumBlocksPerXForm = (int)log2(numBlocksPerXForm);
+			localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
+			localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
+			localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
+			localString += string("tid = indexIn;\n");
+			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
+			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); 
+			int stride = radix*Rinit;
+			for(i = 0; i < passNum; i++)
+				stride *= radixArr[i];
+			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");			
+			localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
+			localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");	
+		}
+		
+		// Load Data
+		int lgBatchSize = (int)log2(batchSize);
+		localString += string("tid = lId;\n");
+		localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
+		localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); 
+		localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");
+
+		if(dataFormat == clFFT_SplitComplexFormat) 
+		{
+			localString += string("in_real += indexIn;\n");
+			localString += string("in_imag += indexIn;\n");			
+			for(j = 0; j < R1; j++)
+				localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
+			for(j = 0; j < R1; j++) 
+				localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
+		}
+		else 
+		{
+			localString += string("in += indexIn;\n");
+			for(j = 0; j < R1; j++)
+				localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
+	    }
+		
+		localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");							  
+		
+		if(R2 > 1)
+		{
+		    // twiddle
+		    for(k = 1; k < R1; k++) 
+		    {
+			    localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
+			    localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
+			    localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); 
+		    }
+		
+		    // shuffle
+		    numIter = R1 / R2;	
+		    localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
+		    localString += string("lMemStore = sMem + tid;\n");
+		    localString += string("lMemLoad = sMem + indexIn;\n");
+		    for(k = 0; k < R1; k++) 
+			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
+		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");	
+		    for(k = 0; k < numIter; k++)
+			    for(t = 0; t < R2; t++)
+				    localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
+		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+		    for(k = 0; k < R1; k++) 
+			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
+		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");	
+		    for(k = 0; k < numIter; k++)
+			    for(t = 0; t < R2; t++)
+				    localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
+		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+		
+		    for(j = 0; j < numIter; j++)
+			    localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n");
+		}
+		
+		// twiddle
+		if(passNum < (numPasses - 1)) 
+		{
+			localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n");
+			localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); 
+			localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n");
+			for(t = 0; t < R1; t++) 
+			{
+				localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
+				localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
+				localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
+			}
+		}
+		
+		// Store Data
+		if(strideO == 1) 
+		{
+			
+			localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
+			localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
+			
+			for(i = 0; i < R1/R2; i++)
+				for(j = 0; j < R2; j++)
+					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
+			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+			if(threadsPerBlock >= radix)
+            {
+                for(i = 0; i < R1; i++)
+                localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
+            }
+            else
+            {
+                int innerIter = radix/threadsPerBlock;
+                int outerIter = R1/innerIter;
+                for(i = 0; i < outerIter; i++)
+                    for(j = 0; j < innerIter; j++)
+                        localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
+            }
+			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+			
+			for(i = 0; i < R1/R2; i++)
+				for(j = 0; j < R2; j++)
+					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
+			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+			if(threadsPerBlock >= radix)
+            {
+                for(i = 0; i < R1; i++)
+                    localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
+            }
+            else
+            {
+                int innerIter = radix/threadsPerBlock;
+                int outerIter = R1/innerIter;
+                for(i = 0; i < outerIter; i++)
+                    for(j = 0; j < innerIter; j++)
+                        localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
+            }
+			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+			
+			localString += string("indexOut += tid;\n");
+			if(dataFormat == clFFT_SplitComplexFormat) {
+				localString += string("out_real += indexOut;\n");
+				localString += string("out_imag += indexOut;\n");
+				for(k = 0; k < R1; k++)
+					localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
+				for(k = 0; k < R1; k++)
+					localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
+			}
+			else {
+				localString += string("out += indexOut;\n");
+				for(k = 0; k < R1; k++)
+					localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");				
+			}
+		 
+		}
+		else 
+		{
+			localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n");
+			if(dataFormat == clFFT_SplitComplexFormat) {
+				localString += string("out_real += indexOut;\n");
+				localString += string("out_imag += indexOut;\n");			
+				for(k = 0; k < R1; k++)
+					localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n");
+				for(k = 0; k < R1; k++)
+					localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n");
+			}
+			else {
+				localString += string("out += indexOut;\n");
+				for(k = 0; k < R1; k++)
+					localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n");
+			}
+		}
+		
+		insertHeader(*kernelString, kernelName, dataFormat);
+		*kernelString += string("{\n");
+		if((*kInfo)->lmem_size)
+			*kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
+		*kernelString += localString;
+		*kernelString += string("}\n");		
+		
+		N /= radix;
+		kInfo = &(*kInfo)->next;
+		kCount++;
+	}
+}
+
+void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
+{	
+    unsigned int radixArray[10];
+    unsigned int numRadix;
+    
+	switch(dir)
+	{
+		case cl_fft_kernel_x:
+		    if(plan->n.x > plan->max_localmem_fft_size)
+		    {
+		        createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
+		    }
+		    else if(plan->n.x > 1)
+		    {
+		        getRadixArray(plan->n.x, radixArray, &numRadix, 0);
+		        if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
+		        {
+				    createLocalMemfftKernelString(plan);
+				}
+			    else
+			    {
+			        getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
+			        if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
+			            createLocalMemfftKernelString(plan);
+			        else
+				        createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
+				}
+		    }
+			break;
+			
+		case cl_fft_kernel_y:
+			if(plan->n.y > 1)
+			    createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
+			break;
+			
+		case cl_fft_kernel_z:
+			if(plan->n.z > 1)
+			    createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1);
+		default:
+			return;
+	}
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/fft_setup.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,401 @@
+
+//
+// File:       fft_setup.cpp
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "fft_internal.h"
+#include "fft_base_kernels.h"
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+using namespace std;
+
+extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);
+
+static void 
+getBlockConfigAndKernelString(cl_fft_plan *plan)
+{
+	plan->temp_buffer_needed = 0;
+	*plan->kernel_string += baseKernels;
+	
+	if(plan->format == clFFT_SplitComplexFormat)
+		*plan->kernel_string += twistKernelPlannar;
+	else
+		*plan->kernel_string += twistKernelInterleaved;
+	
+	switch(plan->dim) 
+	{
+		case clFFT_1D:
+			FFT1D(plan, cl_fft_kernel_x);
+			break;
+			
+		case clFFT_2D:
+			FFT1D(plan, cl_fft_kernel_x); 
+			FFT1D(plan, cl_fft_kernel_y);  
+			break;
+			
+		case clFFT_3D:
+			FFT1D(plan, cl_fft_kernel_x); 
+			FFT1D(plan, cl_fft_kernel_y); 
+			FFT1D(plan, cl_fft_kernel_z); 
+			break;
+			
+		default:
+			return;
+	}
+	
+	plan->temp_buffer_needed = 0;
+	cl_fft_kernel_info *kInfo = plan->kernel_info;
+	while(kInfo)
+	{
+		plan->temp_buffer_needed |= !kInfo->in_place_possible;
+		kInfo = kInfo->next;
+	}
+}
+
+ 
+static void
+deleteKernelInfo(cl_fft_kernel_info *kInfo)
+{
+	if(kInfo)
+	{
+	    if(kInfo->kernel_name)
+		    free(kInfo->kernel_name);
+	    if(kInfo->kernel)
+		    clReleaseKernel(kInfo->kernel);
+		free(kInfo);
+	}	
+}
+
+static void
+destroy_plan(cl_fft_plan *Plan)
+{
+    cl_fft_kernel_info *kernel_info = Plan->kernel_info;
+
+	while(kernel_info)
+	{
+		cl_fft_kernel_info *tmp = kernel_info->next;
+		deleteKernelInfo(kernel_info);
+		kernel_info = tmp;
+	}
+	
+	Plan->kernel_info = NULL;
+		
+	if(Plan->kernel_string)
+	{
+		delete Plan->kernel_string;
+		Plan->kernel_string = NULL;
+	}			
+	if(Plan->twist_kernel)
+	{
+		clReleaseKernel(Plan->twist_kernel);
+		Plan->twist_kernel = NULL;
+	}
+	if(Plan->program)
+	{
+		clReleaseProgram(Plan->program);
+		Plan->program = NULL;
+	}
+	if(Plan->tempmemobj) 
+	{
+		clReleaseMemObject(Plan->tempmemobj);
+		Plan->tempmemobj = NULL;
+	}
+	if(Plan->tempmemobj_real)
+	{
+		clReleaseMemObject(Plan->tempmemobj_real);
+		Plan->tempmemobj_real = NULL;
+	}
+	if(Plan->tempmemobj_imag)
+	{
+		clReleaseMemObject(Plan->tempmemobj_imag);
+		Plan->tempmemobj_imag = NULL;
+	}
+}
+
+static int
+createKernelList(cl_fft_plan *plan) 
+{
+	cl_program program = plan->program;
+	cl_fft_kernel_info *kernel_info = plan->kernel_info;
+	
+	cl_int err;
+	while(kernel_info)
+	{
+		kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
+		if(!kernel_info->kernel || err != CL_SUCCESS)
+			return err;
+		kernel_info = kernel_info->next;		
+	}
+	
+	if(plan->format == clFFT_SplitComplexFormat)
+		plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
+	else
+		plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
+	
+	if(!plan->twist_kernel || err)
+		return err;
+
+	return CL_SUCCESS;
+}
+
+int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
+{	
+    int reg_needed = 0;
+    *max_wg_size = INT_MAX;
+    int err;
+    unsigned wg_size;
+    
+    unsigned int i;
+    for(i = 0; i < num_devices; i++)
+    {
+	    cl_fft_kernel_info *kInfo = plan->kernel_info;
+	    while(kInfo)
+	    {
+		    err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
+		    if(err != CL_SUCCESS)
+		        return -1;
+		        
+		    if(wg_size < kInfo->num_workitems_per_workgroup)
+		        reg_needed |= 1;
+		    
+		    if(*max_wg_size > wg_size)
+		        *max_wg_size = wg_size;
+		        
+		    kInfo = kInfo->next;
+	    }
+	}
+	
+	return reg_needed;
+}	
+
+#define ERR_MACRO(err) { \
+                         if( err != CL_SUCCESS) \
+                         { \
+                           if(error_code) \
+                               *error_code = err; \
+                           clFFT_DestroyPlan((clFFT_Plan) plan); \
+						   return (clFFT_Plan) NULL; \
+                         } \
+					   }
+
+clFFT_Plan
+clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
+{
+	int i;
+	cl_int err;
+	int isPow2 = 1;
+	cl_fft_plan *plan = NULL;
+	ostringstream kString;
+	int num_devices;
+	int gpu_found = 0;
+	cl_device_id devices[16];
+	size_t ret_size;
+	cl_device_type device_type;
+	
+    if(!context)
+		ERR_MACRO(CL_INVALID_VALUE);
+	
+	isPow2 |= n.x && !( (n.x - 1) & n.x );
+	isPow2 |= n.y && !( (n.y - 1) & n.y );
+	isPow2 |= n.z && !( (n.z - 1) & n.z );
+	
+	if(!isPow2)
+		ERR_MACRO(CL_INVALID_VALUE);
+	
+	if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
+		ERR_MACRO(CL_INVALID_VALUE);
+
+	plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
+	if(!plan)
+		ERR_MACRO(CL_OUT_OF_RESOURCES);
+	
+	plan->context = context;
+	clRetainContext(context);
+	plan->n = n;
+	plan->dim = dim;
+	plan->format = dataFormat;
+	plan->kernel_info = 0;
+	plan->num_kernels = 0;
+	plan->twist_kernel = 0;
+	plan->program = 0;
+	plan->temp_buffer_needed = 0;
+	plan->last_batch_size = 0;
+	plan->tempmemobj = 0;
+	plan->tempmemobj_real = 0;
+	plan->tempmemobj_imag = 0;
+	plan->max_localmem_fft_size = 2048;
+	plan->max_work_item_per_workgroup = 256;
+	plan->max_radix = 16;
+	plan->min_mem_coalesce_width = 16;
+	plan->num_local_mem_banks = 16;	
+	
+patch_kernel_source:
+
+	plan->kernel_string = new string("");
+	if(!plan->kernel_string)
+        ERR_MACRO(CL_OUT_OF_RESOURCES);
+
+	getBlockConfigAndKernelString(plan);
+	
+	const char *source_str = plan->kernel_string->c_str();
+	plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
+    ERR_MACRO(err);
+
+	err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
+	ERR_MACRO(err);
+	
+	num_devices = (int)(ret_size / sizeof(cl_device_id));
+	
+	for(i = 0; i < num_devices; i++)
+	{
+		err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
+		ERR_MACRO(err);
+		
+		if(device_type == CL_DEVICE_TYPE_GPU)
+		{	
+			gpu_found = 1;
+	        err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
+	        if (err != CL_SUCCESS)
+	        {
+		        char *build_log;				
+				char devicename[200];
+		        size_t log_size;
+				
+		        err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+				ERR_MACRO(err);
+				
+		        build_log = (char *) malloc(log_size + 1);
+				
+			    err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
+				ERR_MACRO(err);
+				
+				err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
+				ERR_MACRO(err);
+				
+				fprintf(stdout, "FFT program build log on device %s\n", devicename);
+		        fprintf(stdout, "%s\n", build_log);
+		        free(build_log);
+				
+				ERR_MACRO(err);
+			}	
+		}	
+	}
+	
+	if(!gpu_found)
+		ERR_MACRO(CL_INVALID_CONTEXT);
+	
+	err = createKernelList(plan); 
+    ERR_MACRO(err);
+    
+    // we created program and kernels based on "some max work group size (default 256)" ... this work group size
+    // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source 
+    // setting this as limit i.e max group size and rebuild. 
+	unsigned int max_kernel_wg_size; 
+	int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
+	if(patching_req == -1)
+	{
+	    ERR_MACRO(err);
+	}
+	
+	if(patching_req)
+	{
+	    destroy_plan(plan);
+	    plan->max_work_item_per_workgroup = max_kernel_wg_size;
+	    goto patch_kernel_source;
+	}
+	
+	cl_fft_kernel_info *kInfo = plan->kernel_info;
+	while(kInfo)
+	{
+		plan->num_kernels++;
+		kInfo = kInfo->next;
+	}
+	
+	if(error_code)
+		*error_code = CL_SUCCESS;
+			
+	return (clFFT_Plan) plan;
+}
+
+void		 
+clFFT_DestroyPlan(clFFT_Plan plan)
+{
+    cl_fft_plan *Plan = (cl_fft_plan *) plan;
+	if(Plan) 
+	{	
+		destroy_plan(Plan);	
+		clReleaseContext(Plan->context);
+		free(Plan);
+	}		
+}
+
+void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
+{
+	size_t gDim, lDim;
+	FILE *out;
+	if(!file)
+		out = stdout;
+	else 
+		out = file;
+	
+	cl_fft_plan *plan = (cl_fft_plan *) Plan;
+	cl_fft_kernel_info *kInfo = plan->kernel_info;
+	
+	while(kInfo)
+	{
+		cl_int s = 1;
+		getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
+		fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
+		kInfo = kInfo->next;
+	}
+	fprintf(out, "%s\n", plan->kernel_string->c_str());
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/main.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,882 @@
+
+//
+// File:       main.cpp
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <OpenCL/opencl.h>
+#include "clFFT.h"
+#include <mach/mach_time.h>
+#include <Accelerate/Accelerate.h>
+#include "procs.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <float.h>
+
+#define eps_avg 10.0
+
+#define MAX( _a, _b)    ((_a)>(_b)?(_a) : (_b))
+
+typedef enum {
+    clFFT_OUT_OF_PLACE,
+    clFFT_IN_PLACE,
+}clFFT_TestType;
+
+typedef struct
+{
+    double real;
+    double imag;
+}clFFT_ComplexDouble;
+
+typedef struct
+{
+    double *real;
+    double *imag;
+}clFFT_SplitComplexDouble;
+
+cl_device_id     device_id;
+cl_context       context;
+cl_command_queue queue;
+
+typedef unsigned long long ulong;
+
+double subtractTimes( uint64_t endTime, uint64_t startTime )
+{
+    uint64_t difference = endTime - startTime;
+    static double conversion = 0.0;
+
+    if( conversion == 0.0 )
+    {
+        mach_timebase_info_data_t info;
+        kern_return_t err = mach_timebase_info( &info );
+
+        //Convert the timebase into seconds
+        if( err == 0  )
+            conversion = 1e-9 * (double) info.numer / (double) info.denom;
+    }
+
+    return conversion * (double) difference;
+}
+
+void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n,
+                      unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
+{
+    FFTSetup plan_vdsp;
+    DSPSplitComplex out_vdsp;
+    FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
+
+    unsigned int i, j, k;
+    unsigned int stride;
+    unsigned int log2Nx = (unsigned int) log2(n.x);
+    unsigned int log2Ny = (unsigned int) log2(n.y);
+    unsigned int log2Nz = (unsigned int) log2(n.z);
+    unsigned int log2N;
+
+    log2N = log2Nx;
+    log2N = log2N > log2Ny ? log2N : log2Ny;
+    log2N = log2N > log2Nz ? log2N : log2Nz;
+
+    plan_vdsp = vDSP_create_fftsetup(log2N, 2);
+
+    switch(dim)
+    {
+        case clFFT_1D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                stride = i * n.x;
+                out_vdsp.realp  = out->real  + stride;
+                out_vdsp.imagp  = out->imag  + stride;
+
+                vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+            }
+            break;
+
+        case clFFT_2D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    stride = j * n.x + i * n.x * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                }
+            }
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.x; j++)
+                {
+                    stride = j + i * n.x  * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                }
+            }
+            break;
+
+        case clFFT_3D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.y; k++)
+                    {
+                        stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                    }
+                }
+            }
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                    }
+                }
+            }
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
+                    }
+                }
+            }
+            break;
+    }
+
+    vDSP_destroy_fftsetup(plan_vdsp);
+}
+
+void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n,
+                      unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
+{
+    FFTSetupD plan_vdsp;
+    DSPDoubleSplitComplex out_vdsp;
+    FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
+
+    unsigned int i, j, k;
+    unsigned int stride;
+    unsigned int log2Nx = (int) log2(n.x);
+    unsigned int log2Ny = (int) log2(n.y);
+    unsigned int log2Nz = (int) log2(n.z);
+    unsigned int log2N;
+
+    log2N = log2Nx;
+    log2N = log2N > log2Ny ? log2N : log2Ny;
+    log2N = log2N > log2Nz ? log2N : log2Nz;
+
+    plan_vdsp = vDSP_create_fftsetupD(log2N, 2);
+
+    switch(dim)
+    {
+        case clFFT_1D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                stride = i * n.x;
+                out_vdsp.realp  = out->real  + stride;
+                out_vdsp.imagp  = out->imag  + stride;
+
+                vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+            }
+            break;
+
+        case clFFT_2D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    stride = j * n.x + i * n.x * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                }
+            }
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.x; j++)
+                {
+                    stride = j + i * n.x  * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                }
+            }
+            break;
+
+        case clFFT_3D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.y; k++)
+                    {
+                        stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                    }
+                }
+            }
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                    }
+                }
+            }
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
+                    }
+                }
+            }
+            break;
+    }
+
+    vDSP_destroy_fftsetupD(plan_vdsp);
+}
+
+double complexNormSq(clFFT_ComplexDouble a)
+{
+    return (a.real * a.real + a.imag * a.imag);
+}
+
+double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_ref, int n, int batchSize, double *max_diff, double *min_diff)
+{
+    int i, j;
+    double avg_norm = 0.0;
+    *max_diff = 0.0;
+    *min_diff = 0x1.0p1000;
+
+    for(j = 0; j < batchSize; j++)
+    {
+        double norm_ref = 0.0;
+        double norm = 0.0;
+        for(i = 0; i < n; i++)
+        {
+            int index = j * n + i;
+            clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] };
+            double norm_tmp = complexNormSq(diff);
+            norm += norm_tmp;
+            norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]);
+        }
+        double curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON;
+        avg_norm += curr_norm;
+        *max_diff = *max_diff < curr_norm ? curr_norm : *max_diff;
+        *min_diff = *min_diff > curr_norm ? curr_norm : *min_diff;
+    }
+
+    return avg_norm / batchSize;
+}
+
+void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex *data_cl, int length)
+{
+    int i;
+    for(i = 0; i < length; i++) {
+        result_split->real[i] = data_cl[i].real;
+        result_split->imag[i] = data_cl[i].imag;
+    }
+}
+
+int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim,
+            clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
+{
+    cl_int err = CL_SUCCESS;
+    int iter;
+    double t;
+
+    uint64_t t0, t1;
+    int mx = (int)log2(n.x);
+    int my = (int)log2(n.y);
+    int mz = (int)log2(n.z);
+
+    int length = n.x * n.y * n.z * batchSize;
+
+    double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter;
+
+    clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
+    clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
+    clFFT_Complex *data_i = NULL;
+    clFFT_Complex *data_cl = NULL;
+    clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL };
+    clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };
+
+    clFFT_Plan plan = NULL;
+    cl_mem data_in = NULL;
+    cl_mem data_out = NULL;
+    cl_mem data_in_real = NULL;
+    cl_mem data_in_imag = NULL;
+    cl_mem data_out_real = NULL;
+    cl_mem data_out_imag = NULL;
+
+    if(dataFormat == clFFT_SplitComplexFormat) {
+        data_i_split.real     = (float *) malloc(sizeof(float) * length);
+        data_i_split.imag     = (float *) malloc(sizeof(float) * length);
+        data_cl_split.real    = (float *) malloc(sizeof(float) * length);
+        data_cl_split.imag    = (float *) malloc(sizeof(float) * length);
+        if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
+        {
+            err = -1;
+            log_error("Out-of-Resources\n");
+            goto cleanup;
+        }
+    }
+    else {
+        data_i  = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
+        data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
+        if(!data_i || !data_cl)
+        {
+            err = -2;
+            log_error("Out-of-Resouces\n");
+            goto cleanup;
+        }
+    }
+
+    data_iref.real   = (double *) malloc(sizeof(double) * length);
+    data_iref.imag   = (double *) malloc(sizeof(double) * length);
+    data_oref.real   = (double *) malloc(sizeof(double) * length);
+    data_oref.imag   = (double *) malloc(sizeof(double) * length);
+    if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
+    {
+        err = -3;
+        log_error("Out-of-Resources\n");
+        goto cleanup;
+    }
+
+    int i;
+    if(dataFormat == clFFT_SplitComplexFormat) {
+        for(i = 0; i < length; i++)
+        {
+            data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_cl_split.real[i] = 0.0f;
+            data_cl_split.imag[i] = 0.0f;
+            data_iref.real[i] = data_i_split.real[i];
+            data_iref.imag[i] = data_i_split.imag[i];
+            data_oref.real[i] = data_iref.real[i];
+            data_oref.imag[i] = data_iref.imag[i];
+        }
+    }
+    else {
+        for(i = 0; i < length; i++)
+        {
+            data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_cl[i].real = 0.0f;
+            data_cl[i].imag = 0.0f;
+            data_iref.real[i] = data_i[i].real;
+            data_iref.imag[i] = data_i[i].imag;
+            data_oref.real[i] = data_iref.real[i];
+            data_oref.imag[i] = data_iref.imag[i];
+        }
+    }
+
+    plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
+    if(!plan || err)
+    {
+        log_error("clFFT_CreatePlan failed\n");
+        goto cleanup;
+    }
+
+    //clFFT_DumpPlan(plan, stdout);
+
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
+        if(!data_in_real || err)
+        {
+            log_error("clCreateBuffer failed\n");
+            goto cleanup;
+        }
+
+        data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
+        if(!data_in_imag || err)
+        {
+            log_error("clCreateBuffer failed\n");
+            goto cleanup;
+        }
+
+        if(testType == clFFT_OUT_OF_PLACE)
+        {
+            data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
+            if(!data_out_real || err)
+            {
+                log_error("clCreateBuffer failed\n");
+                goto cleanup;
+            }
+
+            data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
+            if(!data_out_imag || err)
+            {
+                log_error("clCreateBuffer failed\n");
+                goto cleanup;
+            }
+        }
+        else
+        {
+            data_out_real = data_in_real;
+            data_out_imag = data_in_imag;
+        }
+    }
+    else
+    {
+        data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
+        if(!data_in)
+        {
+            log_error("clCreateBuffer failed\n");
+            goto cleanup;
+        }
+        if(testType == clFFT_OUT_OF_PLACE)
+        {
+            data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
+            if(!data_out)
+            {
+                log_error("clCreateBuffer failed\n");
+                goto cleanup;
+            }
+        }
+        else
+            data_out = data_in;
+    }
+
+
+    err = CL_SUCCESS;
+
+    t0 = mach_absolute_time();
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        for(iter = 0; iter < numIter; iter++)
+            err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
+    }
+    else
+    {
+        for(iter = 0; iter < numIter; iter++)
+            err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
+    }
+
+    err |= clFinish(queue);
+
+    if(err)
+    {
+        log_error("clFFT_Execute\n");
+        goto cleanup;
+    }
+
+    t1 = mach_absolute_time();
+    t = subtractTimes(t1, t0);
+    char temp[100];
+    sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize);
+    log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp);
+
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
+        err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
+    }
+    else
+    {
+        err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
+    }
+
+    if(err)
+    {
+        log_error("clEnqueueReadBuffer failed\n");
+        goto cleanup;
+    }
+
+    computeReferenceD(&data_oref, n, batchSize, dim, dir);
+
+    double diff_avg, diff_max, diff_min;
+    if(dataFormat == clFFT_SplitComplexFormat) {
+        diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
+        if(diff_avg > eps_avg)
+            log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
+        else
+            log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
+    }
+    else {
+        clFFT_SplitComplex result_split;
+        result_split.real = (float *) malloc(length*sizeof(float));
+        result_split.imag = (float *) malloc(length*sizeof(float));
+        convertInterleavedToSplit(&result_split, data_cl, length);
+        diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
+
+        if(diff_avg > eps_avg)
+            log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
+        else
+            log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
+        free(result_split.real);
+        free(result_split.imag);
+    }
+
+cleanup:
+    clFFT_DestroyPlan(plan);
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        if(data_i_split.real)
+            free(data_i_split.real);
+        if(data_i_split.imag)
+            free(data_i_split.imag);
+        if(data_cl_split.real)
+            free(data_cl_split.real);
+        if(data_cl_split.imag)
+            free(data_cl_split.imag);
+
+        if(data_in_real)
+            clReleaseMemObject(data_in_real);
+        if(data_in_imag)
+            clReleaseMemObject(data_in_imag);
+        if(data_out_real && testType == clFFT_OUT_OF_PLACE)
+            clReleaseMemObject(data_out_real);
+        if(data_out_imag && clFFT_OUT_OF_PLACE)
+            clReleaseMemObject(data_out_imag);
+    }
+    else
+    {
+        if(data_i)
+            free(data_i);
+        if(data_cl)
+            free(data_cl);
+
+        if(data_in)
+            clReleaseMemObject(data_in);
+        if(data_out && testType == clFFT_OUT_OF_PLACE)
+            clReleaseMemObject(data_out);
+    }
+
+    if(data_iref.real)
+        free(data_iref.real);
+    if(data_iref.imag)
+        free(data_iref.imag);
+    if(data_oref.real)
+        free(data_oref.real);
+    if(data_oref.imag)
+        free(data_oref.imag);
+
+    return err;
+}
+
+bool ifLineCommented(const char *line) {
+    const char *Line = line;
+    while(*Line != '\0')
+        if((*Line == '/') && (*(Line + 1) == '/'))
+            return true;
+        else
+            Line++;
+    return false;
+}
+
+cl_device_type getGlobalDeviceType()
+{
+    char *force_cpu = getenv( "CL_DEVICE_TYPE" );
+    if( force_cpu != NULL )
+    {
+        if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 )
+            return CL_DEVICE_TYPE_GPU;
+        else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 )
+            return CL_DEVICE_TYPE_CPU;
+        else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 )
+            return CL_DEVICE_TYPE_ACCELERATOR;
+        else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 )
+            return CL_DEVICE_TYPE_DEFAULT;
+    }
+    // default
+    return CL_DEVICE_TYPE_GPU;
+}
+
+void
+notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
+{
+    log_error( "%s\n", errinfo );
+}
+
+int
+checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize)
+{
+    cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2;
+    memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize;
+    memReq = memReq/1024/1024;
+    if(memReq >= gMemSize)
+        return -1;
+    return 0;
+}
+
+int main (int argc, char * const argv[]) {
+
+    test_start();
+
+    cl_ulong gMemSize;
+    clFFT_Direction dir = clFFT_Forward;
+    int numIter = 1;
+    clFFT_Dim3 n = { 1024, 1, 1 };
+    int batchSize = 1;
+    clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat;
+    clFFT_Dimension dim = clFFT_1D;
+    clFFT_TestType testType = clFFT_OUT_OF_PLACE;
+    cl_device_id device_ids[16];
+
+    FILE *paramFile;
+
+    cl_int err;
+    unsigned int num_devices;
+
+    cl_device_type device_type = getGlobalDeviceType();
+    if(device_type != CL_DEVICE_TYPE_GPU)
+        {
+            log_info("Test only supported on DEVICE_TYPE_GPU\n");
+            test_finish();
+            exit(0);
+        }
+
+    err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices);
+    if(err)
+        {
+            log_error("clGetComputeDevice failed\n");
+            test_finish();
+            return -1;
+        }
+
+    device_id = NULL;
+
+    unsigned int i;
+    for(i = 0; i < num_devices; i++)
+        {
+            cl_bool available;
+            err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
+            if(err)
+                {
+                    log_error("Cannot check device availability of device # %d\n", i);
+                }
+
+            if(available)
+                {
+                    device_id = device_ids[i];
+                    break;
+                }
+            else
+                {
+                    char name[200];
+                    err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
+                    if(err == CL_SUCCESS)
+                        {
+                            log_info("Device %s not available for compute\n", name);
+                        }
+                    else
+                        {
+                            log_info("Device # %d not available for compute\n", i);
+                        }
+                }
+        }
+
+    if(!device_id)
+        {
+            log_error("None of the devices available for compute ... aborting test\n");
+            test_finish();
+            return -1;
+        }
+
+    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
+    if(!context || err)
+        {
+            log_error("clCreateContext failed\n");
+            test_finish();
+            return -1;
+        }
+
+    queue = clCreateCommandQueue(context, device_id, 0, &err);
+    if(!queue || err)
+        {
+            log_error("clCreateCommandQueue() failed.\n");
+            clReleaseContext(context);
+            test_finish();
+            return -1;
+        }
+
+    err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL);
+    if(err)
+        {
+            log_error("Failed to get global mem size\n");
+            clReleaseContext(context);
+            clReleaseCommandQueue(queue);
+            test_finish();
+            return -2;
+        }
+
+    gMemSize /= (1024*1024);
+
+    char delim[] = " \n";
+    char tmpStr[100];
+    char line[200];
+    char *param, *val;
+    int total_errors = 0;
+    if(argc == 1) {
+        log_error("Need file name with list of parameters to run the test\n");
+        test_finish();
+        return -1;
+    }
+
+    if(argc == 2) { // arguments are supplied in a file with arguments for a single run are all on the same line
+        paramFile = fopen(argv[1], "r");
+        if(!paramFile) {
+            log_error("Cannot open the parameter file\n");
+            clReleaseContext(context);
+            clReleaseCommandQueue(queue);
+            test_finish();
+            return -3;
+        }
+        while(fgets(line, 199, paramFile)) {
+            if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line))
+                continue;
+            param = strtok(line, delim);
+            while(param) {
+                val = strtok(NULL, delim);
+                if(!strcmp(param, "-n")) {
+                    sscanf(val, "%d", &n.x);
+                    val = strtok(NULL, delim);
+                    sscanf(val, "%d", &n.y);
+                    val = strtok(NULL, delim);
+                    sscanf(val, "%d", &n.z);
+                }
+                else if(!strcmp(param, "-batchsize"))
+                    sscanf(val, "%d", &batchSize);
+                else if(!strcmp(param, "-dir")) {
+                    sscanf(val, "%s", tmpStr);
+                    if(!strcmp(tmpStr, "forward"))
+                        dir = clFFT_Forward;
+                    else if(!strcmp(tmpStr, "inverse"))
+                        dir = clFFT_Inverse;
+                }
+                else if(!strcmp(param, "-dim")) {
+                    sscanf(val, "%s", tmpStr);
+                    if(!strcmp(tmpStr, "1D"))
+                        dim = clFFT_1D;
+                    else if(!strcmp(tmpStr, "2D"))
+                        dim = clFFT_2D;
+                    else if(!strcmp(tmpStr, "3D"))
+                        dim = clFFT_3D;
+                }
+                else if(!strcmp(param, "-format")) {
+                    sscanf(val, "%s", tmpStr);
+                    if(!strcmp(tmpStr, "plannar"))
+                        dataFormat = clFFT_SplitComplexFormat;
+                    else if(!strcmp(tmpStr, "interleaved"))
+                        dataFormat = clFFT_InterleavedComplexFormat;
+                }
+                else if(!strcmp(param, "-numiter"))
+                    sscanf(val, "%d", &numIter);
+                else if(!strcmp(param, "-testtype")) {
+                    sscanf(val, "%s", tmpStr);
+                    if(!strcmp(tmpStr, "out-of-place"))
+                        testType = clFFT_OUT_OF_PLACE;
+                    else if(!strcmp(tmpStr, "in-place"))
+                        testType = clFFT_IN_PLACE;
+                }
+                param = strtok(NULL, delim);
+            }
+
+            if(checkMemRequirements(n, batchSize, testType, gMemSize)) {
+                log_info("This test cannot run because memory requirements canot be met by the available device\n");
+                continue;
+            }
+
+            err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType);
+            if (err)
+                total_errors++;
+        }
+    }
+
+    clReleaseContext(context);
+    clReleaseCommandQueue(queue);
+
+    test_finish();
+    return total_errors;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/param.txt	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,57 @@
+
+//
+// File:       param.txt
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+-n 64 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
+-n 1024 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
+-n 1048576 1 1 -batchsize 4 -dir inverse -dim 1D -format interleaved -numiter 1000 -testtype out-of-place
+-n 1024 512 1 -batchsize 8 -dir forward -dim 2D -format interleaved -numiter 1000 -testtype out-of-place
+-n 128 128 128 -batchsize 1 -dir inverse -dim 3D -format interleaved -numiter 1000 -testtype out-of-place
+-n 16384 1 1 -batchsize 4 -dir forward -dim 1D -format interleaved -numiter 1 -testtype in-place
+-n 32 2048 1 -batchsize 8 -dir forward -dim 2D -format interleaved -numiter 1 -testtype in-place
+-n 4096 64 1 -batchsize 4 -dir inverse -dim 2D -format plannar -numiter 1 -testtype in-place
+-n 64 32 16 -batchsize 1 -dir inverse -dim 3D -format interleaved -numiter 1 -testtype out-of-place
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/param_small.txt	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,47 @@
+
+//
+// File:       param.txt
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+-n 64 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/procs.h	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,53 @@
+
+//
+// File:       procs.h
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#define test_start()
+#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, _higherBetter?"higher is better":"lower is better" , _number)
+#define log_info printf
+#define log_error printf
+#define test_finish()
Binary file hello_World_Example/.DS_Store has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hello_World_Example/Makefile	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,11 @@
+CFLAGS = -Wall -framework opencl
+CC = clang++
+OPT = -g
+
+TARGET=hello
+
+hello : hello.o
+	$(CC) $(OPT) $(CFLAGS) -o $@ $?
+
+clean:
+	rm -rf *.o $(TARGET)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hello_World_Example/ReadMe.txt	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,1 @@
+### OpenCL Hello World Example ###

===========================================================================
DESCRIPTION:

A simple "Hello World" compute example showing basic usage of OpenCL which
calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
floating point values.

For simplicity, this example is intended to be run from the command line.
If run from within XCode, open the Run Log (Command-Shift-R) to see the 
output.  Alternatively, run the applications from within a Terminal.app 
session to launch from the command line.

===========================================================================
BUILD REQUIREMENTS:

Mac OS X v10.6 or later

===========================================================================
RUNTIME REQUIREMENTS:

Mac OS X v10.6 or later

To use the GPU as a compute device, use one of the following devices:
- MacBook Pro w/NVidia GeForce 8600M 
- Mac Pro w/NVidia GeForce 8800GT

===========================================================================
PACKAGING LIST:

ReadMe.txt
hello.c
hello.xcodeproj

===========================================================================
CHANGES FROM PREVIOUS VERSIONS:

Version 1.0
- First version.

===========================================================================
Copyright (C) 2008 Apple Inc. All rights reserved.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hello_World_Example/hello.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,262 @@
+//
+// File:       hello.c
+//
+// Abstract:   A simple "Hello World" compute example showing basic usage of OpenCL which
+//             calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
+//             floating point values.
+//             
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+
+////////////////////////////////////////////////////////////////////////////////
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <OpenCL/opencl.h>
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Use a static data size for simplicity
+//
+#define DATA_SIZE (1024)
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Simple compute kernel which computes the square of an input array 
+//
+const char *KernelSource = "\n" \
+"__kernel void square(                                                       \n" \
+"   __global float* input,                                              \n" \
+"   __global float* output,                                             \n" \
+"   const unsigned int count)                                           \n" \
+"{                                                                      \n" \
+"   int i = get_global_id(0);                                           \n" \
+"   if(i < count)                                                       \n" \
+"       output[i] = input[i] * input[i];                                \n" \
+"}                                                                      \n" \
+"\n";
+
+////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv)
+{
+    int err;                            // error code returned from api calls
+      
+    float data[DATA_SIZE];              // original data set given to device
+    float results[DATA_SIZE];           // results returned from device
+    unsigned int correct;               // number of correct results returned
+
+    size_t global;                      // global domain size for our calculation
+    size_t local;                       // local domain size for our calculation
+
+    cl_device_id device_id;             // compute device id 
+    cl_context context;                 // compute context
+    cl_command_queue commands;          // compute command queue
+    cl_program program;                 // compute program
+    cl_kernel kernel;                   // compute kernel
+    
+    cl_mem input;                       // device memory used for the input array
+    cl_mem output;                      // device memory used for the output array
+    
+    // Fill our data set with random float values
+    //
+    int i = 0;
+    unsigned int count = DATA_SIZE;
+    for(i = 0; i < count; i++)
+        data[i] = rand() / (float)RAND_MAX;
+    
+    // Connect to a compute device
+    //
+    int gpu = 1;
+    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to create a device group!\n");
+        return EXIT_FAILURE;
+    }
+  
+    // Create a compute context 
+    //
+    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
+    if (!context)
+    {
+        printf("Error: Failed to create a compute context!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Create a command commands
+    //
+    commands = clCreateCommandQueue(context, device_id, 0, &err);
+    if (!commands)
+    {
+        printf("Error: Failed to create a command commands!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Create the compute program from the source buffer
+    //
+    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
+    if (!program)
+    {
+        printf("Error: Failed to create compute program!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Build the program executable
+    //
+    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        size_t len;
+        char buffer[2048];
+
+        printf("Error: Failed to build program executable!\n");
+        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+        printf("%s\n", buffer);
+        exit(1);
+    }
+
+    // Create the compute kernel in the program we wish to run
+    //
+    kernel = clCreateKernel(program, "square", &err);
+    if (!kernel || err != CL_SUCCESS)
+    {
+        printf("Error: Failed to create compute kernel!\n");
+        exit(1);
+    }
+
+    // Create the input and output arrays in device memory for our calculation
+    //
+    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
+    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
+    if (!input || !output)
+    {
+        printf("Error: Failed to allocate device memory!\n");
+        exit(1);
+    }    
+    
+    // Write our data set into the input array in device memory 
+    //
+    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+    }
+
+    // Set the arguments to our compute kernel
+    //
+    err = 0;
+    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
+    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
+    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to set kernel arguments! %d\n", err);
+        exit(1);
+    }
+
+    // Get the maximum work group size for executing the kernel on the device
+    //
+    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
+        exit(1);
+    }
+
+    // Execute the kernel over the entire range of our 1d input data set
+    // using the maximum number of work group items for this device
+    //
+    global = count;
+    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+    if (err)
+    {
+        printf("Error: Failed to execute kernel!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Wait for the command commands to get serviced before reading back results
+    //
+    clFinish(commands);
+
+    // Read back the results from the device to verify the output
+    //
+    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+    }
+    
+    // Validate our results
+    //
+    correct = 0;
+    for(i = 0; i < count; i++)
+    {
+        if(results[i] == data[i] * data[i])
+            correct++;
+    }
+    
+    // Print a brief summary detailing the results
+    //
+    printf("Computed '%d/%d' correct values!\n", correct, count);
+    
+    // Shutdown and cleanup
+    //
+    clReleaseMemObject(input);
+    clReleaseMemObject(output);
+    clReleaseProgram(program);
+    clReleaseKernel(kernel);
+    clReleaseCommandQueue(commands);
+    clReleaseContext(context);
+
+    return 0;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hello_World_Example/hello.xcodeproj/project.pbxproj	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,215 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 42;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		C3770EFB0E6F1121009A5A77 /* hello.c in Sources */ = {isa = PBXBuildFile; fileRef = C3770EFA0E6F1121009A5A77 /* hello.c */; };
+		C3770EFD0E6F1138009A5A77 /* OpenCL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C3770EFC0E6F1138009A5A77 /* OpenCL.framework */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		C39444690DAFF5A0008FFE68 /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 16;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		466E0F5F0C932E1A00ED01DB /* hello */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = hello; sourceTree = BUILT_PRODUCTS_DIR; };
+		C3770EFA0E6F1121009A5A77 /* hello.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = hello.c; sourceTree = "<group>"; };
+		C3770EFC0E6F1138009A5A77 /* OpenCL.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenCL.framework; path = /System/Library/Frameworks/OpenCL.framework; sourceTree = "<absolute>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		466E0F5D0C932E1A00ED01DB /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				C3770EFD0E6F1138009A5A77 /* OpenCL.framework in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		466E0F490C93291B00ED01DB = {
+			isa = PBXGroup;
+			children = (
+				C3770EF30E6F10CF009A5A77 /* Frameworks */,
+				C3770EF10E6F10BB009A5A77 /* Sources */,
+				466E0F600C932E1A00ED01DB /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		466E0F600C932E1A00ED01DB /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				466E0F5F0C932E1A00ED01DB /* hello */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		C3770EF10E6F10BB009A5A77 /* Sources */ = {
+			isa = PBXGroup;
+			children = (
+				C3770EFA0E6F1121009A5A77 /* hello.c */,
+			);
+			name = Sources;
+			sourceTree = "<group>";
+		};
+		C3770EF30E6F10CF009A5A77 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				C3770EFC0E6F1138009A5A77 /* OpenCL.framework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		466E0F5E0C932E1A00ED01DB /* hello */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 466E0F640C932E1A00ED01DB /* Build configuration list for PBXNativeTarget "hello" */;
+			buildPhases = (
+				466E0F5C0C932E1A00ED01DB /* Sources */,
+				466E0F5D0C932E1A00ED01DB /* Frameworks */,
+				C39444690DAFF5A0008FFE68 /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = hello;
+			productName = hello;
+			productReference = 466E0F5F0C932E1A00ED01DB /* hello */;
+			productType = "com.apple.product-type.tool";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		466E0F4B0C93291B00ED01DB /* Project object */ = {
+			isa = PBXProject;
+			buildConfigurationList = 466E0F4E0C93291B00ED01DB /* Build configuration list for PBXProject "hello" */;
+			compatibilityVersion = "Xcode 2.4";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				English,
+				Japanese,
+				French,
+				German,
+			);
+			mainGroup = 466E0F490C93291B00ED01DB;
+			productRefGroup = 466E0F600C932E1A00ED01DB /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				466E0F5E0C932E1A00ED01DB /* hello */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		466E0F5C0C932E1A00ED01DB /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				C3770EFB0E6F1121009A5A77 /* hello.c in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		466E0F4C0C93291B00ED01DB /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = NO;
+				HEADER_SEARCH_PATHS = ../../inc;
+				LIBRARY_SEARCH_PATHS = ../../lib;
+			};
+			name = Debug;
+		};
+		466E0F4D0C93291B00ED01DB /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = YES;
+				HEADER_SEARCH_PATHS = ../../inc;
+				LIBRARY_SEARCH_PATHS = ../../lib;
+			};
+			name = Release;
+		};
+		466E0F620C932E1A00ED01DB /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = NO;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_ENABLE_FIX_AND_CONTINUE = YES;
+				GCC_MODEL_TUNING = G5;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				INSTALL_PATH = /usr/local/bin;
+				LIBRARY_SEARCH_PATHS = (
+					"$(inherited)",
+					"$(LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1)",
+				);
+				LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1 = "\"$(SRCROOT)/../lib\"";
+				PREBINDING = NO;
+				PRODUCT_NAME = hello;
+				ZERO_LINK = YES;
+			};
+			name = Debug;
+		};
+		466E0F630C932E1A00ED01DB /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = YES;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				GCC_ENABLE_FIX_AND_CONTINUE = NO;
+				GCC_MODEL_TUNING = G5;
+				INSTALL_PATH = /usr/local/bin;
+				LIBRARY_SEARCH_PATHS = (
+					"$(inherited)",
+					"$(LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1)",
+				);
+				LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1 = "\"$(SRCROOT)/../lib\"";
+				PREBINDING = NO;
+				PRODUCT_NAME = hello;
+				ZERO_LINK = NO;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		466E0F4E0C93291B00ED01DB /* Build configuration list for PBXProject "hello" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				466E0F4C0C93291B00ED01DB /* Debug */,
+				466E0F4D0C93291B00ED01DB /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		466E0F640C932E1A00ED01DB /* Build configuration list for PBXNativeTarget "hello" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				466E0F620C932E1A00ED01DB /* Debug */,
+				466E0F630C932E1A00ED01DB /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 466E0F4B0C93291B00ED01DB /* Project object */;
+}
Binary file parallel_Prefix_Sum_Example/.DS_Store has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parallel_Prefix_Sum_Example/Makefile	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,11 @@
+CFLAGS = -Wall -framework opencl
+CC = clang++
+OPT = -g
+
+TARGET=scan
+
+scan: scan.o
+	$(CC) $(OPT) $(CFLAGS) -o $@ $?
+
+clean:
+	rm -rf *.o $(TARGET)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parallel_Prefix_Sum_Example/ReadMe.txt	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,1 @@
+### OpenCL Parallel Prefix Sum (aka Scan) Example ###

===========================================================================
DESCRIPTION:

This example shows how to perform an efficient parallel prefix sum (aka Scan)
using OpenCL.  Scan is a common data parallel primitive which can be used for
variety of different operations -- this example uses local memory for storing
partial sums and avoids memory bank conflicts on architectures which serialize
memory operations that are serviced on the same memory bank by offsetting the
loads and stores based on the size of the local group and the number of
memory banks (see appropriate macro definition).  As a result, this example
requires that the local group size > 1.

Note that the .cl compute kernel file(s) are loaded and compiled at
runtime.  The example source assumes that these files are in the same 
path as the built executable.

For simplicity, this example is intended to be run from the command line.
If run from within XCode, open the Run Log (Command-Shift-R) to see the 
output.  Alternatively, run the applications from within a Terminal.app 
session to launch from the command line.

===========================================================================
BUILD REQUIREMENTS:

Mac OS X v10.6 or later

===========================================================================
RUNTIME REQUIREMENTS:

Mac OS X v10.6 or later with OpenCL 1.0

===========================================================================
PACKAGING LIST:

ReadMe.txt
scan.c
scan_kernel.cl
scan.xcodeproj

===========================================================================
CHANGES FROM PREVIOUS VERSIONS:

Version 1.0
- First version.

===========================================================================
Copyright (C) 2008 Apple Inc. All rights reserved.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parallel_Prefix_Sum_Example/scan.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,876 @@
+//
+// File:       scan.c
+//
+// Abstract:   This example shows how to perform an efficient parallel prefix sum (aka Scan)
+//             using OpenCL.  Scan is a common data parallel primitive which can be used for 
+//             variety of different operations -- this example uses local memory for storing
+//             partial sums and avoids memory bank conflicts on architectures which serialize
+//             memory operations that are serviced on the same memory bank by offsetting the
+//             loads and stores based on the size of the local group and the number of
+//             memory banks (see appropriate macro definition).  As a result, this example
+//             requires that the local group size > 1.
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <libc.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <mach/mach_time.h>
+#include <math.h>
+
+#include <OpenCL/opencl.h>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define DEBUG_INFO      (0)
+int		GROUP_SIZE      = 256;
+#define NUM_BANKS       (16)
+#define MAX_ERROR       (1e-7)
+#define SEPARATOR       ("----------------------------------------------------------------------\n")
+
+#define min(A,B) ((A) < (B) ? (A) : (B))
+
+static int iterations = 1000;
+static int count      = 1024 * 1024;    
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+cl_device_id            ComputeDeviceId;
+cl_command_queue        ComputeCommands;
+cl_context              ComputeContext;
+cl_program              ComputeProgram;
+cl_kernel*              ComputeKernels;
+cl_mem*                 ScanPartialSums = 0;
+unsigned int            ElementsAllocated = 0;
+unsigned int            LevelsAllocated = 0;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum KernelMethods
+{
+    PRESCAN                             = 0,
+    PRESCAN_STORE_SUM                   = 1,
+    PRESCAN_STORE_SUM_NON_POWER_OF_TWO  = 2,
+    PRESCAN_NON_POWER_OF_TWO            = 3,
+    UNIFORM_ADD                         = 4
+};
+
+static const char* KernelNames[] =
+{
+    "PreScanKernel",
+    "PreScanStoreSumKernel",
+    "PreScanStoreSumNonPowerOfTwoKernel",
+    "PreScanNonPowerOfTwoKernel",
+    "UniformAddKernel"
+};
+
+static const unsigned int KernelCount = sizeof(KernelNames) / sizeof(char *);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+uint64_t
+GetCurrentTime()
+{
+    return mach_absolute_time();
+}
+	
+double 
+SubtractTimeInSec( uint64_t endtime, uint64_t starttime )
+{    
+	static double conversion = 0.0;
+	uint64_t difference = endtime - starttime;
+	if( 0 == conversion )
+	{
+		mach_timebase_info_data_t timebase;
+		kern_return_t kError = mach_timebase_info( &timebase );
+		if( kError == 0  )
+			conversion = 1e-9 * (double) timebase.numer / (double) timebase.denom;
+    }
+		
+	return conversion * (double) difference; 
+}
+
+static char *
+LoadProgramSourceFromFile(const char *filename)
+{
+    struct stat statbuf;
+    FILE        *fh;
+    char        *source;
+
+    fh = fopen(filename, "r");
+    if (fh == 0)
+        return 0;
+
+    stat(filename, &statbuf);
+    source = (char *) malloc(statbuf.st_size + 1);
+    fread(source, statbuf.st_size, 1, fh);
+    source[statbuf.st_size] = '\0';
+
+    return source;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+bool IsPowerOfTwo(int n)
+{
+    return ((n&(n-1))==0) ;
+}
+
+int floorPow2(int n)
+{
+    int exp;
+    frexp((float)n, &exp);
+    return 1 << (exp - 1);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int 
+CreatePartialSumBuffers(unsigned int count)
+{
+    ElementsAllocated = count;
+
+    unsigned int group_size = GROUP_SIZE;
+    unsigned int element_count = count;
+
+    int level = 0;
+
+    do
+    {       
+        unsigned int group_count = (int)fmax(1, (int)ceil((float)element_count / (2.0f * group_size)));
+        if (group_count > 1)
+        {
+            level++;
+        }
+        element_count = group_count;
+        
+    } while (element_count > 1);
+
+    ScanPartialSums = (cl_mem*) malloc(level * sizeof(cl_mem));
+    LevelsAllocated = level;
+    memset(ScanPartialSums, 0, sizeof(cl_mem) * level);
+    
+    element_count = count;
+    level = 0;
+    
+    do
+    {       
+        unsigned int group_count = (int)fmax(1, (int)ceil((float)element_count / (2.0f * group_size)));
+        if (group_count > 1) 
+        {
+            size_t buffer_size = group_count * sizeof(float);
+            ScanPartialSums[level++] = clCreateBuffer(ComputeContext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
+        }
+
+        element_count = group_count;
+
+    } while (element_count > 1);
+
+    return CL_SUCCESS;
+}
+
+void 
+ReleasePartialSums(void)
+{
+    unsigned int i;
+    for (i = 0; i < LevelsAllocated; i++)
+    {
+        clReleaseMemObject(ScanPartialSums[i]);
+    }    
+    
+    free(ScanPartialSums);
+    ScanPartialSums = 0;
+    ElementsAllocated = 0;
+    LevelsAllocated = 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int
+PreScan(
+    size_t *global, 
+    size_t *local, 
+    size_t shared, 
+    cl_mem output_data, 
+    cl_mem input_data, 
+    unsigned int n,
+    int group_index, 
+    int base_index)
+{
+#if DEBUG_INFO
+    printf("PreScan: Global[%4d] Local[%4d] Shared[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
+        (int)global[0], (int)local[0], (int)shared, group_index, base_index, n);
+#endif
+
+    unsigned int k = PRESCAN;
+    unsigned int a = 0;
+
+    int err = CL_SUCCESS;
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    err = CL_SUCCESS;
+    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    return CL_SUCCESS;
+}
+
+int
+PreScanStoreSum(
+    size_t *global, 
+    size_t *local, 
+    size_t shared, 
+    cl_mem output_data, 
+    cl_mem input_data, 
+    cl_mem partial_sums,
+    unsigned int n,
+    int group_index, 
+    int base_index)
+{
+#if DEBUG_INFO
+    printf("PreScan: Global[%4d] Local[%4d] Shared[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
+        (int)global[0], (int)local[0], (int)shared, group_index, base_index, n);
+#endif
+
+    unsigned int k = PRESCAN_STORE_SUM;
+    unsigned int a = 0;
+
+    int err = CL_SUCCESS;
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &partial_sums);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    err = CL_SUCCESS;
+    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+    
+    return CL_SUCCESS;
+}
+
+int
+PreScanStoreSumNonPowerOfTwo(
+    size_t *global, 
+    size_t *local, 
+    size_t shared, 
+    cl_mem output_data, 
+    cl_mem input_data, 
+    cl_mem partial_sums,
+    unsigned int n,
+    int group_index, 
+    int base_index)
+{
+#if DEBUG_INFO
+    printf("PreScanStoreSumNonPowerOfTwo: Global[%4d] Local[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
+        (int)global[0], (int)local[0], group_index, base_index, n);
+#endif
+
+    unsigned int k = PRESCAN_STORE_SUM_NON_POWER_OF_TWO;
+    unsigned int a = 0;
+
+    int err = CL_SUCCESS;
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &partial_sums);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    err = CL_SUCCESS;
+    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    return CL_SUCCESS;
+}
+
+int
+PreScanNonPowerOfTwo(
+    size_t *global, 
+    size_t *local, 
+    size_t shared, 
+    cl_mem output_data, 
+    cl_mem input_data, 
+    unsigned int n,
+    int group_index, 
+    int base_index)
+{
+#if DEBUG_INFO
+    printf("PreScanNonPowerOfTwo: Global[%4d] Local[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
+        (int)global[0], (int)local[0], group_index, base_index, n);
+#endif
+
+    unsigned int k = PRESCAN_NON_POWER_OF_TWO;
+    unsigned int a = 0;
+
+    int err = CL_SUCCESS;
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    err = CL_SUCCESS;
+    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+    return CL_SUCCESS;
+}
+
+int
+UniformAdd(
+    size_t *global, 
+    size_t *local, 
+    cl_mem output_data, 
+    cl_mem partial_sums, 
+    unsigned int n, 
+    unsigned int group_offset, 
+    unsigned int base_index)
+{
+#if DEBUG_INFO
+    printf("UniformAdd: Global[%4d] Local[%4d] BlockOffset[%4d] BaseIndex[%4d] Entries[%d]\n", 
+        (int)global[0], (int)local[0], group_offset, base_index, n);
+#endif
+
+    unsigned int k = UNIFORM_ADD;
+    unsigned int a = 0;
+
+    int err = CL_SUCCESS;
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &partial_sums);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(float),  0);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_offset);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
+    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    err = CL_SUCCESS;
+    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
+        return EXIT_FAILURE;
+    }
+
+    return CL_SUCCESS;
+}
+
+int 
+PreScanBufferRecursive(
+    cl_mem output_data, 
+    cl_mem input_data, 
+    int max_group_size,
+    int max_work_item_count,
+    int element_count, 
+    int level)
+{
+    unsigned int group_size = max_group_size; 
+    unsigned int group_count = (int)fmax(1.0f, (int)ceil((float)element_count / (2.0f * group_size)));
+    unsigned int work_item_count = 0;
+
+    if (group_count > 1)
+        work_item_count = group_size;
+    else if (IsPowerOfTwo(element_count))
+        work_item_count = element_count / 2;
+    else
+        work_item_count = floorPow2(element_count);
+        
+    work_item_count = (work_item_count > max_work_item_count) ? max_work_item_count : work_item_count;
+
+    unsigned int element_count_per_group = work_item_count * 2;
+    unsigned int last_group_element_count = element_count - (group_count-1) * element_count_per_group;
+    unsigned int remaining_work_item_count = (int)fmax(1.0f, last_group_element_count / 2);
+    remaining_work_item_count = (remaining_work_item_count > max_work_item_count) ? max_work_item_count : remaining_work_item_count;
+    unsigned int remainder = 0;
+    size_t last_shared = 0;
+
+    
+    if (last_group_element_count != element_count_per_group)
+    {
+        remainder = 1;
+
+        if(!IsPowerOfTwo(last_group_element_count))
+            remaining_work_item_count = floorPow2(last_group_element_count);    
+        
+        remaining_work_item_count = (remaining_work_item_count > max_work_item_count) ? max_work_item_count : remaining_work_item_count;
+        unsigned int padding = (2 * remaining_work_item_count) / NUM_BANKS;
+        last_shared = sizeof(float) * (2 * remaining_work_item_count + padding);
+    }
+
+    remaining_work_item_count = (remaining_work_item_count > max_work_item_count) ? max_work_item_count : remaining_work_item_count;
+    size_t global[] = { (int)fmax(1, group_count - remainder) * work_item_count, 1 };
+    size_t local[]  = { work_item_count, 1 };  
+
+    unsigned int padding = element_count_per_group / NUM_BANKS;
+    size_t shared = sizeof(float) * (element_count_per_group + padding);
+    
+    cl_mem partial_sums = ScanPartialSums[level];
+    int err = CL_SUCCESS;
+    
+    if (group_count > 1)
+    {
+        err = PreScanStoreSum(global, local, shared, output_data, input_data, partial_sums, work_item_count * 2, 0, 0);
+        if(err != CL_SUCCESS)
+            return err;
+            
+        if (remainder)
+        {
+            size_t last_global[] = { 1 * remaining_work_item_count, 1 };
+            size_t last_local[]  = { remaining_work_item_count, 1 };  
+
+            err = PreScanStoreSumNonPowerOfTwo(
+                    last_global, last_local, last_shared, 
+                    output_data, input_data, partial_sums,
+                    last_group_element_count, 
+                    group_count - 1, 
+                    element_count - last_group_element_count);    
+        
+            if(err != CL_SUCCESS)
+                return err;			
+			
+        }
+
+        err = PreScanBufferRecursive(partial_sums, partial_sums, max_group_size, max_work_item_count, group_count, level + 1);
+        if(err != CL_SUCCESS)
+            return err;
+            
+        err = UniformAdd(global, local, output_data, partial_sums,  element_count - last_group_element_count, 0, 0);
+        if(err != CL_SUCCESS)
+            return err;
+        
+        if (remainder)
+        {
+            size_t last_global[] = { 1 * remaining_work_item_count, 1 };
+            size_t last_local[]  = { remaining_work_item_count, 1 };  
+
+            err = UniformAdd(
+                    last_global, last_local, 
+                    output_data, partial_sums,
+                    last_group_element_count, 
+                    group_count - 1, 
+                    element_count - last_group_element_count);
+                
+            if(err != CL_SUCCESS)
+                return err;
+        }
+    }
+    else if (IsPowerOfTwo(element_count))
+    {
+        err = PreScan(global, local, shared, output_data, input_data, work_item_count * 2, 0, 0);
+        if(err != CL_SUCCESS)
+            return err;
+    }
+    else
+    {
+        err = PreScanNonPowerOfTwo(global, local, shared, output_data, input_data, element_count, 0, 0);
+        if(err != CL_SUCCESS)
+            return err;
+    }
+
+    return CL_SUCCESS;
+}
+
+void 
+PreScanBuffer(
+    cl_mem output_data, 
+    cl_mem input_data, 
+    unsigned int max_group_size,
+    unsigned int max_work_item_count,
+    unsigned int element_count)
+{
+    PreScanBufferRecursive(output_data, input_data, max_group_size, max_work_item_count, element_count, 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void ScanReference( float* reference, float* input, const unsigned int count) 
+{
+    reference[0] = 0;
+    double total_sum = 0;
+    
+    unsigned int i = 1;
+    for( i = 1; i < count; ++i) 
+    {
+        total_sum += input[i-1];
+        reference[i] = input[i-1] + reference[i-1];
+    }
+    if (total_sum != reference[count-1])
+        printf("Warning: Exceeding single-precision accuracy.  Scan will be inaccurate.\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char **argv)
+{
+    int i;
+    uint64_t         t0 = 0;
+    uint64_t         t1 = 0;
+    uint64_t         t2 = 0;
+    int              err = 0;
+    cl_mem			 output_buffer;
+    cl_mem           input_buffer;
+    
+    // Create some random input data on the host 
+    //
+    float *float_data = (float*)malloc(count * sizeof(float));
+    for (i = 0; i < count; i++)
+    {
+        float_data[i] = (int)(10 * ((float) rand() / (float) RAND_MAX));
+    }
+
+    // Connect to a GPU compute device
+    //
+    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &ComputeDeviceId, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to locate a compute device!\n");
+        return EXIT_FAILURE;
+    }
+
+    size_t returned_size = 0;
+    size_t max_workgroup_size = 0;
+    err = clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to retrieve device info!\n");
+        return EXIT_FAILURE;
+    }
+	
+	GROUP_SIZE = min( GROUP_SIZE, max_workgroup_size );
+
+    cl_char vendor_name[1024] = {0};
+    cl_char device_name[1024] = {0};
+    err = clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
+    err|= clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to retrieve device info!\n");
+        return EXIT_FAILURE;
+    }
+
+    printf(SEPARATOR);
+    printf("Connecting to %s %s...\n", vendor_name, device_name);
+
+    // Load the compute program from disk into a cstring buffer
+    //
+    printf(SEPARATOR);
+    const char* filename = "./scan_kernel.cl";
+    printf("Loading program '%s'...\n", filename);
+    printf(SEPARATOR);
+
+    char *source = LoadProgramSourceFromFile(filename);
+    if(!source)
+    {
+        printf("Error: Failed to load compute program from file!\n");
+        return EXIT_FAILURE;    
+    }
+    
+    /* 
+     * Create a compute ComputeContext 
+     * [Context]
+     * The context is the environment in which 
+     * OpenCL kernels execute. 
+     * The context includes a set of devices,
+     * the memory accessible to those devices, 
+     * and one or more command queues 
+     * used to schedule execution of one or more kernels. 
+     * A context is needed to share memory objects between devices.
+     */
+    ComputeContext = clCreateContext(0, 1, &ComputeDeviceId, NULL, NULL, &err);
+    if (!ComputeContext)
+    {
+        printf("Error: Failed to create a compute ComputeContext!\n");
+        return EXIT_FAILURE;
+    }
+
+    /* 
+     * Create a command queue
+     * [Command Queue]
+     * OpenCL command queues are used for submitting work to a device.
+     * They order the execution of kernels on a device 
+     * and manipulate memory objects. 
+     * OpenCL executes the commands in the order 
+     * that you enqueue them. .
+     */
+    
+    ComputeCommands = clCreateCommandQueue(ComputeContext, ComputeDeviceId, 0, &err);
+    if (!ComputeCommands)
+    {
+        printf("Error: Failed to create a command ComputeCommands!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Create the compute program from the source buffer
+    //
+    ComputeProgram = clCreateProgramWithSource(ComputeContext, 1, (const char **) & source, NULL, &err);
+    if (!ComputeProgram || err != CL_SUCCESS)
+    {
+        printf("%s\n", source);
+        printf("Error: Failed to create compute program!\n");
+        return EXIT_FAILURE;
+    }
+    
+    // Build the program executable
+    //
+    err = clBuildProgram(ComputeProgram, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        size_t length;
+        char build_log[2048];
+        printf("%s\n", source);
+        printf("Error: Failed to build program executable!\n");
+        clGetProgramBuildInfo(ComputeProgram, ComputeDeviceId, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
+        printf("%s\n", build_log);
+        return EXIT_FAILURE;
+    }
+
+    ComputeKernels = (cl_kernel*) malloc(KernelCount * sizeof(cl_kernel));
+    for(i = 0; i < KernelCount; i++)
+    {    
+        // Create each compute kernel from within the program
+        //
+        ComputeKernels[i] = clCreateKernel(ComputeProgram, KernelNames[i], &err);
+        if (!ComputeKernels[i] || err != CL_SUCCESS)
+        {
+            printf("Error: Failed to create compute kernel!\n");
+            return EXIT_FAILURE;
+        }
+		
+		size_t wgSize;
+		err = clGetKernelWorkGroupInfo(ComputeKernels[i], ComputeDeviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); 
+		if(err)
+		{
+			printf("Error: Failed to get kernel work group size\n");
+			return EXIT_FAILURE;
+		}
+		GROUP_SIZE = min( GROUP_SIZE, wgSize );
+		
+    }
+    /*
+     * KernelNames[i]
+     * (gdb) p KernelNames[0]
+     * $3 = 0x100004740 "PreScanKernel"
+     * (gdb) p KernelNames[1]
+     * $4 = 0x100004740 "PreScanKernel"
+     * (gdb) p KernelNames[2]
+     * $5 = 0x10000474e "PreScanStoreSumKernel"
+     * (gdb) p KernelNames[3]
+     * $6 = 0x100004768 "PreScanStoreSumNonPowerOfTwoKernel"
+     * (gdb) p KernelNames[4]
+     * $7 = 0x10000478b "PreScanNonPowerOfTwoKernel"
+     * (gdb) p KernelNames[5]
+     * $8 = 0x1000047a6 "UniformAddKernel"
+     */
+
+    free(source);
+
+    // Create the input buffer on the device
+    //
+    size_t buffer_size = sizeof(float) * count;
+    input_buffer = clCreateBuffer(ComputeContext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
+    if (!input_buffer)
+    {
+        printf("Error: Failed to allocate input buffer on device!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Fill the input buffer with the host allocated random data
+    //
+    err = clEnqueueWriteBuffer(ComputeCommands, input_buffer, CL_TRUE, 0, buffer_size, float_data, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to write to source array!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Create the output buffer on the device
+    //
+    output_buffer = clCreateBuffer(ComputeContext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
+    if (!output_buffer)
+    {
+        printf("Error: Failed to allocate result buffer on device!\n");
+        return EXIT_FAILURE;
+    }
+
+    float* result = (float*)malloc(buffer_size);
+    memset(result, 0, buffer_size);
+	
+    err = clEnqueueWriteBuffer(ComputeCommands, output_buffer, CL_TRUE, 0, buffer_size, result, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to write to source array!\n");
+        return EXIT_FAILURE;
+    }
+		
+    CreatePartialSumBuffers(count);
+    PreScanBuffer(output_buffer, input_buffer, GROUP_SIZE, GROUP_SIZE, count);
+
+    printf("Starting timing run of '%d' iterations...\n", iterations);
+
+    t0 = t1 = GetCurrentTime();
+    for (i = 0; i < iterations; i++)
+    {
+        PreScanBuffer(output_buffer, input_buffer, GROUP_SIZE, GROUP_SIZE, count);
+    }
+
+    err = clFinish(ComputeCommands);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to wait for command queue to finish! %d\n", err);
+        return EXIT_FAILURE;
+    }
+    t2 = GetCurrentTime();
+
+    
+    // Calculate the statistics for execution time and throughput
+    //
+    double t = SubtractTimeInSec(t2, t1);
+    printf("Exec Time:  %.2f ms\n", 1000.0 * t / (double)(iterations));
+    printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
+    printf(SEPARATOR);
+
+    // Read back the results that were computed on the device
+    //
+    err = clEnqueueReadBuffer(ComputeCommands, output_buffer, CL_TRUE, 0, buffer_size, result, 0, NULL, NULL);
+    if (err)
+    {
+        printf("Error: Failed to read back results from the device!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Verify the results are correct
+    //
+    float* reference = (float*) malloc( buffer_size); 
+    ScanReference(reference, float_data, count);
+
+    float error = 0.0f;
+    float diff = 0.0f;
+    for(i = 0; i < count; i++)
+    {
+        diff = fabs(reference[i] - result[i]);
+        error = diff > error ? diff : error;
+    }
+
+    if (error > MAX_ERROR)
+    {
+        printf("Error:   Incorrect results obtained! Max error = %f\n", error);
+        return EXIT_FAILURE;
+    }
+    else
+    {
+        printf("Results Validated!\n");
+        printf(SEPARATOR);
+    }
+    
+    // Shutdown and cleanup
+    //
+    ReleasePartialSums();    
+    for(i = 0; i < KernelCount; i++)
+        clReleaseKernel(ComputeKernels[i]);
+    clReleaseProgram(ComputeProgram);
+    clReleaseMemObject(input_buffer);
+    clReleaseMemObject(output_buffer);
+    clReleaseCommandQueue(ComputeCommands);
+    clReleaseContext(ComputeContext);
+    
+    free(ComputeKernels);
+    free(float_data);
+    free(reference);
+    free(result);
+    
+        
+    return 0;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parallel_Prefix_Sum_Example/scan.xcodeproj/project.pbxproj	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,220 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 42;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		466E0F660C932ED500ED01DB /* OpenCL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 466E0F650C932ED500ED01DB /* OpenCL.framework */; };
+		466E0F6D0C932F0F00ED01DB /* scan.c in Sources */ = {isa = PBXBuildFile; fileRef = 466E0F5A0C93299100ED01DB /* scan.c */; };
+		C394446C0DAFF5B2008FFE68 /* scan_kernel.cl in CopyFiles */ = {isa = PBXBuildFile; fileRef = C394446B0DAFF5AE008FFE68 /* scan_kernel.cl */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		C39444690DAFF5A0008FFE68 /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 16;
+			files = (
+				C394446C0DAFF5B2008FFE68 /* scan_kernel.cl in CopyFiles */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		466E0F5A0C93299100ED01DB /* scan.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = scan.c; sourceTree = "<group>"; };
+		466E0F5F0C932E1A00ED01DB /* scan */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = scan; sourceTree = BUILT_PRODUCTS_DIR; };
+		466E0F650C932ED500ED01DB /* OpenCL.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenCL.framework; path = /System/Library/Frameworks/OpenCL.framework; sourceTree = "<absolute>"; };
+		C394446B0DAFF5AE008FFE68 /* scan_kernel.cl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = scan_kernel.cl; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		466E0F5D0C932E1A00ED01DB /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				466E0F660C932ED500ED01DB /* OpenCL.framework in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		466E0F490C93291B00ED01DB = {
+			isa = PBXGroup;
+			children = (
+				C382C3610ED6042D00C4C6E1 /* Source Files */,
+				C382C35E0ED6041B00C4C6E1 /* Compute Kernels */,
+				C382C30C0ED6041300C4C6E1 /* Frameworks */,
+				466E0F600C932E1A00ED01DB /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		466E0F600C932E1A00ED01DB /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				466E0F5F0C932E1A00ED01DB /* scan */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		C382C30C0ED6041300C4C6E1 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				466E0F650C932ED500ED01DB /* OpenCL.framework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		C382C35E0ED6041B00C4C6E1 /* Compute Kernels */ = {
+			isa = PBXGroup;
+			children = (
+				C394446B0DAFF5AE008FFE68 /* scan_kernel.cl */,
+			);
+			name = "Compute Kernels";
+			sourceTree = "<group>";
+		};
+		C382C3610ED6042D00C4C6E1 /* Source Files */ = {
+			isa = PBXGroup;
+			children = (
+				466E0F5A0C93299100ED01DB /* scan.c */,
+			);
+			name = "Source Files";
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		466E0F5E0C932E1A00ED01DB /* scan */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 466E0F640C932E1A00ED01DB /* Build configuration list for PBXNativeTarget "scan" */;
+			buildPhases = (
+				466E0F5C0C932E1A00ED01DB /* Sources */,
+				466E0F5D0C932E1A00ED01DB /* Frameworks */,
+				C39444690DAFF5A0008FFE68 /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = scan;
+			productName = scan;
+			productReference = 466E0F5F0C932E1A00ED01DB /* scan */;
+			productType = "com.apple.product-type.tool";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		466E0F4B0C93291B00ED01DB /* Project object */ = {
+			isa = PBXProject;
+			buildConfigurationList = 466E0F4E0C93291B00ED01DB /* Build configuration list for PBXProject "scan" */;
+			compatibilityVersion = "Xcode 2.4";
+			hasScannedForEncodings = 0;
+			mainGroup = 466E0F490C93291B00ED01DB;
+			productRefGroup = 466E0F600C932E1A00ED01DB /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				466E0F5E0C932E1A00ED01DB /* scan */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		466E0F5C0C932E1A00ED01DB /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				466E0F6D0C932F0F00ED01DB /* scan.c in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		466E0F4C0C93291B00ED01DB /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = NO;
+				HEADER_SEARCH_PATHS = ../../inc;
+				LIBRARY_SEARCH_PATHS = ../../lib;
+			};
+			name = Debug;
+		};
+		466E0F4D0C93291B00ED01DB /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = YES;
+				HEADER_SEARCH_PATHS = ../../inc;
+				LIBRARY_SEARCH_PATHS = ../../lib;
+			};
+			name = Release;
+		};
+		466E0F620C932E1A00ED01DB /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = NO;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_ENABLE_FIX_AND_CONTINUE = YES;
+				GCC_MODEL_TUNING = G5;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				INSTALL_PATH = /usr/local/bin;
+				LIBRARY_SEARCH_PATHS = (
+					"$(inherited)",
+					"$(LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1)",
+				);
+				LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1 = "\"$(SRCROOT)/../lib\"";
+				PREBINDING = NO;
+				PRODUCT_NAME = scan;
+				ZERO_LINK = YES;
+			};
+			name = Debug;
+		};
+		466E0F630C932E1A00ED01DB /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = YES;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				GCC_ENABLE_FIX_AND_CONTINUE = NO;
+				GCC_MODEL_TUNING = G5;
+				INSTALL_PATH = /usr/local/bin;
+				LIBRARY_SEARCH_PATHS = (
+					"$(inherited)",
+					"$(LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1)",
+				);
+				LIBRARY_SEARCH_PATHS_QUOTED_FOR_TARGET_1 = "\"$(SRCROOT)/../lib\"";
+				PREBINDING = NO;
+				PRODUCT_NAME = scan;
+				ZERO_LINK = NO;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		466E0F4E0C93291B00ED01DB /* Build configuration list for PBXProject "scan" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				466E0F4C0C93291B00ED01DB /* Debug */,
+				466E0F4D0C93291B00ED01DB /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		466E0F640C932E1A00ED01DB /* Build configuration list for PBXNativeTarget "scan" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				466E0F620C932E1A00ED01DB /* Debug */,
+				466E0F630C932E1A00ED01DB /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 466E0F4B0C93291B00ED01DB /* Project object */;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parallel_Prefix_Sum_Example/scan_kernel.cl	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,419 @@
+//
+// File:       scan_kernel.cl
+//
+// Abstract:   This example shows how to perform an efficient parallel prefix sum (aka Scan
+//             using OpenCL.  Scan is a common data parallel primitive which can be used for 
+//             variety of different operations -- this example uses local memory for storing
+//             partial sums and avoids memory bank conflicts on architectures which serialize
+//             memory operations that are serviced on the same memory bank by offsetting the
+//             loads and stores based on the size of the local group and the number of
+//             memory banks (see appropriate macro definition).  As a result, this example
+//             requires that the local group size > 1.
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define MEMORY_BANK_COUNT       (16)  // Adjust to your architecture
+#define LOG2_MEMORY_BANK_COUNT   (4)  // Set to log2(MEMORY_BANK_COUNT)
+#define ELIMINATE_CONFLICTS      (0)  // Enable for slow address calculation, but zero bank conflicts
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if (ELIMINATE_CONFLICTS)
+#define MEMORY_BANK_OFFSET(index) ((index) >> LOG2_MEMORY_BANK_COUNT + (index) >> (2*LOG2_MEMORY_BANK_COUNT))
+#else
+#define MEMORY_BANK_OFFSET(index) ((index) >> LOG2_MEMORY_BANK_COUNT)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+uint4 
+GetAddressMapping(int index)
+{
+    const uint local_id = get_local_id(0);
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+
+    uint2 global_index;
+    global_index.x = index + local_id;
+    global_index.y = global_index.x + group_size;
+
+    uint2 local_index;
+    local_index.x = local_id;
+    local_index.y = local_id + group_size;
+
+    return (uint4)(global_index.x, global_index.y, local_index.x, local_index.y);
+}
+
+void 
+LoadLocalFromGlobal(
+    __local float *shared_data,
+    __global const float *input_data, 
+    const uint4 address_pair,
+    const uint n)
+{
+    const uint global_index_a = address_pair.x; 
+    const uint global_index_b = address_pair.y; 
+
+    const uint local_index_a = address_pair.z; 
+    const uint local_index_b = address_pair.w; 
+
+    const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
+    const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
+
+    shared_data[local_index_a + bank_offset_a] = input_data[global_index_a]; 
+    shared_data[local_index_b + bank_offset_b] = input_data[global_index_b]; 
+}
+
+void 
+LoadLocalFromGlobalNonPowerOfTwo(
+    __local float *shared_data,
+    __global const float *input_data, 
+    const uint4 address_pair,
+    const uint n)
+{
+    const uint global_index_a = address_pair.x; 
+    const uint global_index_b = address_pair.y; 
+
+    const uint local_index_a = address_pair.z; 
+    const uint local_index_b = address_pair.w; 
+
+    const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
+    const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
+
+    shared_data[local_index_a + bank_offset_a] = input_data[global_index_a]; 
+    shared_data[local_index_b + bank_offset_b] = (local_index_b < n) ? input_data[global_index_b] : 0; 
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+void 
+StoreLocalToGlobal(
+    __global float* output_data, 
+    __local const float* shared_data,
+    const uint4 address_pair,
+    const uint n)
+{
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const uint global_index_a = address_pair.x; 
+    const uint global_index_b = address_pair.y; 
+
+    const uint local_index_a = address_pair.z; 
+    const uint local_index_b = address_pair.w; 
+
+    const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
+    const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
+
+    output_data[global_index_a] = shared_data[local_index_a + bank_offset_a]; 
+    output_data[global_index_b] = shared_data[local_index_b + bank_offset_b]; 
+}
+
+void 
+StoreLocalToGlobalNonPowerOfTwo(
+    __global float* output_data, 
+    __local const float* shared_data,
+    const uint4 address_pair,
+    const uint n)
+{
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const uint global_index_a = address_pair.x; 
+    const uint global_index_b = address_pair.y; 
+
+    const uint local_index_a = address_pair.z; 
+    const uint local_index_b = address_pair.w; 
+
+    const uint bank_offset_a = MEMORY_BANK_OFFSET(local_index_a);
+    const uint bank_offset_b = MEMORY_BANK_OFFSET(local_index_b);
+
+    output_data[global_index_a] = shared_data[local_index_a + bank_offset_a]; 
+    if(local_index_b < n)
+        output_data[global_index_b] = shared_data[local_index_b + bank_offset_b]; 
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void 
+ClearLastElement(
+    __local float* shared_data, 
+    int group_index)
+{
+    const uint local_id = get_local_id(0);
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+
+    if (local_id == 0)
+    {
+        int index = (group_size << 1) - 1;
+        index += MEMORY_BANK_OFFSET(index);
+        shared_data[index] = 0;
+    }
+}
+
+void 
+ClearLastElementStoreSum(
+    __local float* shared_data, 
+    __global float *partial_sums, 
+    int group_index)
+{
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+    const uint local_id = get_local_id(0); 
+
+    if (local_id == 0)
+    {
+        int index = (group_size << 1) - 1;
+        index += MEMORY_BANK_OFFSET(index);
+        partial_sums[group_index] = shared_data[index];
+        shared_data[index] = 0;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+uint 
+BuildPartialSum(
+    __local float *shared_data)
+{
+    const uint local_id = get_local_id(0);
+    const uint group_size = get_local_size(0);
+    const uint two = 2;
+    uint stride = 1;
+    
+    for (uint j = group_size; j > 0; j >>= 1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if (local_id < j)      
+        {
+            int i  = mul24(mul24(two, stride), local_id);
+
+            uint local_index_a = i + stride - 1;
+            uint local_index_b = local_index_a + stride;
+
+            local_index_a += MEMORY_BANK_OFFSET(local_index_a);
+            local_index_b += MEMORY_BANK_OFFSET(local_index_b);
+
+            shared_data[local_index_b] += shared_data[local_index_a];
+        }
+
+        stride *= two;
+    }
+
+    return stride;
+}
+
+void 
+ScanRootToLeaves(
+    __local float *shared_data, 
+    uint stride)
+{
+    const uint local_id = get_local_id(0);
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+    const uint two = 2;
+    
+    for (uint j = 1; j <= group_size; j *= two)
+    {
+        stride >>= 1;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if (local_id < j)
+        {
+            int i  = mul24(mul24(two, stride), local_id);
+
+            uint local_index_a = i + stride - 1;
+            uint local_index_b = local_index_a + stride;
+
+            local_index_a += MEMORY_BANK_OFFSET(local_index_a);
+            local_index_b += MEMORY_BANK_OFFSET(local_index_b);
+
+            float t = shared_data[local_index_a];
+            shared_data[local_index_a] = shared_data[local_index_b];
+            shared_data[local_index_b] += t;
+        }
+    }
+}
+
+void 
+PreScanGroup(
+    __local float *shared_data, 
+    int group_index)
+{
+    const uint group_id = get_global_id(0) / get_local_size(0);
+
+    int stride = BuildPartialSum(shared_data);               
+    ClearLastElement(shared_data, (group_index == 0) ? group_id : group_index);
+    ScanRootToLeaves(shared_data, stride);             
+}
+
+void 
+PreScanGroupStoreSum(
+    __global float *partial_sums,
+    __local float *shared_data, 
+    int group_index) 
+{
+    const uint group_id = get_global_id(0) / get_local_size(0);
+
+    int stride = BuildPartialSum(shared_data);               
+    ClearLastElementStoreSum(shared_data, partial_sums, (group_index == 0) ? group_id : group_index);
+    ScanRootToLeaves(shared_data, stride);             
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__kernel void 
+PreScanKernel(
+    __global float *output_data, 
+    __global const float *input_data, 
+    __local float* shared_data,
+    const uint  group_index, 
+    const uint  base_index,
+    const uint  n)
+{
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+    
+    uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
+    uint4 address_pair = GetAddressMapping(local_index);
+    
+    LoadLocalFromGlobal(shared_data, input_data, address_pair, n); 
+    PreScanGroup(shared_data, group_index); 
+    StoreLocalToGlobal(output_data, shared_data, address_pair, n);
+}
+
+__kernel void 
+PreScanStoreSumKernel(
+    __global float *output_data, 
+    __global const float *input_data, 
+    __global float *partial_sums, 
+    __local float* shared_data,
+    const uint group_index, 
+    const uint base_index,
+    const uint n)
+{
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+
+    uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
+    uint4 address_pair = GetAddressMapping(local_index);
+    
+    LoadLocalFromGlobal(shared_data, input_data, address_pair, n); 
+    PreScanGroupStoreSum(partial_sums, shared_data, group_index); 
+    StoreLocalToGlobal(output_data, shared_data, address_pair, n);
+}
+
+__kernel void 
+PreScanStoreSumNonPowerOfTwoKernel(
+    __global float *output_data, 
+    __global const float *input_data, 
+    __global float *partial_sums, 
+    __local float* shared_data,
+    const uint group_index, 
+    const uint base_index,
+    const uint n) 
+{
+    const uint local_id = get_local_id(0);
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+
+    uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
+    uint4 address_pair = GetAddressMapping(local_index);
+    
+    LoadLocalFromGlobalNonPowerOfTwo(shared_data, input_data, address_pair, n); 
+    PreScanGroupStoreSum(partial_sums, shared_data, group_index); 
+    StoreLocalToGlobalNonPowerOfTwo(output_data, shared_data, address_pair, n);
+}
+
+__kernel void 
+PreScanNonPowerOfTwoKernel(
+    __global float *output_data, 
+    __global const float *input_data, 
+    __local float* shared_data,
+    const uint group_index, 
+    const uint base_index,
+    const uint n)
+{
+    const uint local_id = get_local_id(0);
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+
+    uint local_index = (base_index == 0) ? mul24(group_id, (group_size << 1)) : base_index;
+    uint4 address_pair = GetAddressMapping(local_index);
+    
+    LoadLocalFromGlobalNonPowerOfTwo(shared_data, input_data, address_pair, n); 
+    PreScanGroup(shared_data, group_index); 
+    StoreLocalToGlobalNonPowerOfTwo(output_data, shared_data, address_pair, n);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__kernel void UniformAddKernel(
+    __global float *output_data, 
+    __global float *input_data, 
+    __local float *shared_data,
+    const uint group_offset, 
+    const uint base_index,
+    const uint n)
+{
+    const uint local_id = get_local_id(0);
+    const uint group_id = get_global_id(0) / get_local_size(0);
+    const uint group_size = get_local_size(0);
+
+    if (local_id == 0)
+        shared_data[0] = input_data[group_id + group_offset];
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    uint address = mul24(group_id, (group_size << 1)) + base_index + local_id;
+    
+    output_data[address]              += shared_data[0];
+	if( (local_id + group_size) < n)
+		output_data[address + group_size] += shared_data[0];
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+