view parallel_Prefix_Sum_Example/scan.cc @ 12:a664602e1819 default tip

add sample
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 12 Feb 2013 17:13:26 +0900
parents ccea4e6a1945
children
line wrap: on
line source

//
// File:       scan.c
//
// Abstract:   This example shows how to perform an efficient parallel prefix sum (aka Scan)
//             using OpenCL.  Scan is a common data parallel primitive which can be used for 
//             variety of different operations -- this example uses local memory for storing
//             partial sums and avoids memory bank conflicts on architectures which serialize
//             memory operations that are serviced on the same memory bank by offsetting the
//             loads and stores based on the size of the local group and the number of
//             memory banks (see appropriate macro definition).  As a result, this example
//             requires that the local group size > 1.
//
// Version:    <1.0>
//
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
//             in consideration of your agreement to the following terms, and your use,
//             installation, modification or redistribution of this Apple software
//             constitutes acceptance of these terms.  If you do not agree with these
//             terms, please do not use, install, modify or redistribute this Apple
//             software.
//
//             In consideration of your agreement to abide by the following terms, and
//             subject to these terms, Apple grants you a personal, non - exclusive
//             license, under Apple's copyrights in this original Apple software ( the
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
//             Software, with or without modifications, in source and / or binary forms;
//             provided that if you redistribute the Apple Software in its entirety and
//             without modifications, you must retain this notice and the following text
//             and disclaimers in all such redistributions of the Apple Software. Neither
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
//             endorse or promote products derived from the Apple Software without specific
//             prior written permission from Apple.  Except as expressly stated in this
//             notice, no other rights or licenses, express or implied, are granted by
//             Apple herein, including but not limited to any patent rights that may be
//             infringed by your derivative works or by other works in which the Apple
//             Software may be incorporated.
//
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <libc.h>
#include <stdbool.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <mach/mach_time.h>
#include <math.h>

#include <OpenCL/opencl.h>

////////////////////////////////////////////////////////////////////////////////////////////////////

#define DEBUG_INFO      (0)
int		GROUP_SIZE      = 256;
#define NUM_BANKS       (16)
#define MAX_ERROR       (1e-7)
#define SEPARATOR       ("----------------------------------------------------------------------\n")

#define min(A,B) ((A) < (B) ? (A) : (B))

static int iterations = 1000;
static int count      = 1024 * 1024;    

////////////////////////////////////////////////////////////////////////////////////////////////////

cl_device_id            ComputeDeviceId;
cl_command_queue        ComputeCommands;
cl_context              ComputeContext;
cl_program              ComputeProgram;
cl_kernel*              ComputeKernels;
cl_mem*                 ScanPartialSums = 0;
unsigned int            ElementsAllocated = 0;
unsigned int            LevelsAllocated = 0;

////////////////////////////////////////////////////////////////////////////////////////////////////

enum KernelMethods
{
    PRESCAN                             = 0,
    PRESCAN_STORE_SUM                   = 1,
    PRESCAN_STORE_SUM_NON_POWER_OF_TWO  = 2,
    PRESCAN_NON_POWER_OF_TWO            = 3,
    UNIFORM_ADD                         = 4
};

static const char* KernelNames[] =
{
    "PreScanKernel",
    "PreScanStoreSumKernel",
    "PreScanStoreSumNonPowerOfTwoKernel",
    "PreScanNonPowerOfTwoKernel",
    "UniformAddKernel"
};

static const unsigned int KernelCount = sizeof(KernelNames) / sizeof(char *);

////////////////////////////////////////////////////////////////////////////////////////////////////

uint64_t
GetCurrentTime()
{
    return mach_absolute_time();
}
	
double 
SubtractTimeInSec( uint64_t endtime, uint64_t starttime )
{    
	static double conversion = 0.0;
	uint64_t difference = endtime - starttime;
	if( 0 == conversion )
	{
		mach_timebase_info_data_t timebase;
		kern_return_t kError = mach_timebase_info( &timebase );
		if( kError == 0  )
			conversion = 1e-9 * (double) timebase.numer / (double) timebase.denom;
    }
		
	return conversion * (double) difference; 
}

static char *
LoadProgramSourceFromFile(const char *filename)
{
    struct stat statbuf;
    FILE        *fh;
    char        *source;

    fh = fopen(filename, "r");
    if (fh == 0)
        return 0;

    stat(filename, &statbuf);
    source = (char *) malloc(statbuf.st_size + 1);
    fread(source, statbuf.st_size, 1, fh);
    source[statbuf.st_size] = '\0';

    return source;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

bool IsPowerOfTwo(int n)
{
    return ((n&(n-1))==0) ;
}

int floorPow2(int n)
{
    int exp;
    frexp((float)n, &exp);
    return 1 << (exp - 1);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

int 
CreatePartialSumBuffers(unsigned int count)
{
    ElementsAllocated = count;

    unsigned int group_size = GROUP_SIZE;
    unsigned int element_count = count;

    int level = 0;

    do
    {       
        unsigned int group_count = (int)fmax(1, (int)ceil((float)element_count / (2.0f * group_size)));
        if (group_count > 1)
        {
            level++;
        }
        element_count = group_count;
        
    } while (element_count > 1);

    ScanPartialSums = (cl_mem*) malloc(level * sizeof(cl_mem));
    LevelsAllocated = level;
    memset(ScanPartialSums, 0, sizeof(cl_mem) * level);
    
    element_count = count;
    level = 0;
    
    do
    {       
        unsigned int group_count = (int)fmax(1, (int)ceil((float)element_count / (2.0f * group_size)));
        if (group_count > 1) 
        {
            size_t buffer_size = group_count * sizeof(float);
            ScanPartialSums[level++] = clCreateBuffer(ComputeContext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
        }

        element_count = group_count;

    } while (element_count > 1);

    return CL_SUCCESS;
}

void 
ReleasePartialSums(void)
{
    unsigned int i;
    for (i = 0; i < LevelsAllocated; i++)
    {
        clReleaseMemObject(ScanPartialSums[i]);
    }    
    
    free(ScanPartialSums);
    ScanPartialSums = 0;
    ElementsAllocated = 0;
    LevelsAllocated = 0;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

int
PreScan(
    size_t *global, 
    size_t *local, 
    size_t shared, 
    cl_mem output_data, 
    cl_mem input_data, 
    unsigned int n,
    int group_index, 
    int base_index)
{
#if DEBUG_INFO
    printf("PreScan: Global[%4d] Local[%4d] Shared[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
        (int)global[0], (int)local[0], (int)shared, group_index, base_index, n);
#endif

    unsigned int k = PRESCAN;
    unsigned int a = 0;

    int err = CL_SUCCESS;
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    err = CL_SUCCESS;
    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    return CL_SUCCESS;
}

int
PreScanStoreSum(
    size_t *global, 
    size_t *local, 
    size_t shared, 
    cl_mem output_data, 
    cl_mem input_data, 
    cl_mem partial_sums,
    unsigned int n,
    int group_index, 
    int base_index)
{
#if DEBUG_INFO
    printf("PreScan: Global[%4d] Local[%4d] Shared[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
        (int)global[0], (int)local[0], (int)shared, group_index, base_index, n);
#endif

    unsigned int k = PRESCAN_STORE_SUM;
    unsigned int a = 0;

    int err = CL_SUCCESS;
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &partial_sums);
    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    err = CL_SUCCESS;
    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }
    
    return CL_SUCCESS;
}

int
PreScanStoreSumNonPowerOfTwo(
    size_t *global, 
    size_t *local, 
    size_t shared, 
    cl_mem output_data, 
    cl_mem input_data, 
    cl_mem partial_sums,
    unsigned int n,
    int group_index, 
    int base_index)
{
#if DEBUG_INFO
    printf("PreScanStoreSumNonPowerOfTwo: Global[%4d] Local[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
        (int)global[0], (int)local[0], group_index, base_index, n);
#endif

    unsigned int k = PRESCAN_STORE_SUM_NON_POWER_OF_TWO;
    unsigned int a = 0;

    int err = CL_SUCCESS;
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &partial_sums);
    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    err = CL_SUCCESS;
    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    return CL_SUCCESS;
}

int
PreScanNonPowerOfTwo(
    size_t *global, 
    size_t *local, 
    size_t shared, 
    cl_mem output_data, 
    cl_mem input_data, 
    unsigned int n,
    int group_index, 
    int base_index)
{
#if DEBUG_INFO
    printf("PreScanNonPowerOfTwo: Global[%4d] Local[%4d] BlockIndex[%4d] BaseIndex[%4d] Entries[%d]\n", 
        (int)global[0], (int)local[0], group_index, base_index, n);
#endif

    unsigned int k = PRESCAN_NON_POWER_OF_TWO;
    unsigned int a = 0;

    int err = CL_SUCCESS;
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &input_data);
    err |= clSetKernelArg(ComputeKernels[k],  a++, shared,         0);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    err = CL_SUCCESS;
    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }
    return CL_SUCCESS;
}

int
UniformAdd(
    size_t *global, 
    size_t *local, 
    cl_mem output_data, 
    cl_mem partial_sums, 
    unsigned int n, 
    unsigned int group_offset, 
    unsigned int base_index)
{
#if DEBUG_INFO
    printf("UniformAdd: Global[%4d] Local[%4d] BlockOffset[%4d] BaseIndex[%4d] Entries[%d]\n", 
        (int)global[0], (int)local[0], group_offset, base_index, n);
#endif

    unsigned int k = UNIFORM_ADD;
    unsigned int a = 0;

    int err = CL_SUCCESS;
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &output_data);  
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_mem), &partial_sums);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(float),  0);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &group_offset);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &base_index);
    err |= clSetKernelArg(ComputeKernels[k],  a++, sizeof(cl_int), &n);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to set kernel arguments!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    err = CL_SUCCESS;
    err |= clEnqueueNDRangeKernel(ComputeCommands, ComputeKernels[k], 1, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: %s: Failed to execute kernel!\n", KernelNames[k]);
        return EXIT_FAILURE;
    }

    return CL_SUCCESS;
}

int 
PreScanBufferRecursive(
    cl_mem output_data, 
    cl_mem input_data, 
    int max_group_size,
    int max_work_item_count,
    int element_count, 
    int level)
{
    unsigned int group_size = max_group_size; 
    unsigned int group_count = (int)fmax(1.0f, (int)ceil((float)element_count / (2.0f * group_size)));
    unsigned int work_item_count = 0;

    if (group_count > 1)
        work_item_count = group_size;
    else if (IsPowerOfTwo(element_count))
        work_item_count = element_count / 2;
    else
        work_item_count = floorPow2(element_count);
        
    work_item_count = (work_item_count > max_work_item_count) ? max_work_item_count : work_item_count;

    unsigned int element_count_per_group = work_item_count * 2;
    unsigned int last_group_element_count = element_count - (group_count-1) * element_count_per_group;
    unsigned int remaining_work_item_count = (int)fmax(1.0f, last_group_element_count / 2);
    remaining_work_item_count = (remaining_work_item_count > max_work_item_count) ? max_work_item_count : remaining_work_item_count;
    unsigned int remainder = 0;
    size_t last_shared = 0;

    
    if (last_group_element_count != element_count_per_group)
    {
        remainder = 1;

        if(!IsPowerOfTwo(last_group_element_count))
            remaining_work_item_count = floorPow2(last_group_element_count);    
        
        remaining_work_item_count = (remaining_work_item_count > max_work_item_count) ? max_work_item_count : remaining_work_item_count;
        unsigned int padding = (2 * remaining_work_item_count) / NUM_BANKS;
        last_shared = sizeof(float) * (2 * remaining_work_item_count + padding);
    }

    remaining_work_item_count = (remaining_work_item_count > max_work_item_count) ? max_work_item_count : remaining_work_item_count;
    size_t global[] = { (int)fmax(1, group_count - remainder) * work_item_count, 1 };
    size_t local[]  = { work_item_count, 1 };  

    unsigned int padding = element_count_per_group / NUM_BANKS;
    size_t shared = sizeof(float) * (element_count_per_group + padding);
    
    cl_mem partial_sums = ScanPartialSums[level];
    int err = CL_SUCCESS;
    
    if (group_count > 1)
    {
        err = PreScanStoreSum(global, local, shared, output_data, input_data, partial_sums, work_item_count * 2, 0, 0);
        if(err != CL_SUCCESS)
            return err;
            
        if (remainder)
        {
            size_t last_global[] = { 1 * remaining_work_item_count, 1 };
            size_t last_local[]  = { remaining_work_item_count, 1 };  

            err = PreScanStoreSumNonPowerOfTwo(
                    last_global, last_local, last_shared, 
                    output_data, input_data, partial_sums,
                    last_group_element_count, 
                    group_count - 1, 
                    element_count - last_group_element_count);    
        
            if(err != CL_SUCCESS)
                return err;			
			
        }

        err = PreScanBufferRecursive(partial_sums, partial_sums, max_group_size, max_work_item_count, group_count, level + 1);
        if(err != CL_SUCCESS)
            return err;
            
        err = UniformAdd(global, local, output_data, partial_sums,  element_count - last_group_element_count, 0, 0);
        if(err != CL_SUCCESS)
            return err;
        
        if (remainder)
        {
            size_t last_global[] = { 1 * remaining_work_item_count, 1 };
            size_t last_local[]  = { remaining_work_item_count, 1 };  

            err = UniformAdd(
                    last_global, last_local, 
                    output_data, partial_sums,
                    last_group_element_count, 
                    group_count - 1, 
                    element_count - last_group_element_count);
                
            if(err != CL_SUCCESS)
                return err;
        }
    }
    else if (IsPowerOfTwo(element_count))
    {
        err = PreScan(global, local, shared, output_data, input_data, work_item_count * 2, 0, 0);
        if(err != CL_SUCCESS)
            return err;
    }
    else
    {
        err = PreScanNonPowerOfTwo(global, local, shared, output_data, input_data, element_count, 0, 0);
        if(err != CL_SUCCESS)
            return err;
    }

    return CL_SUCCESS;
}

void 
PreScanBuffer(
    cl_mem output_data, 
    cl_mem input_data, 
    unsigned int max_group_size,
    unsigned int max_work_item_count,
    unsigned int element_count)
{
    PreScanBufferRecursive(output_data, input_data, max_group_size, max_work_item_count, element_count, 0);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

void ScanReference( float* reference, float* input, const unsigned int count) 
{
    reference[0] = 0;
    double total_sum = 0;
    
    unsigned int i = 1;
    for( i = 1; i < count; ++i) 
    {
        total_sum += input[i-1];
        reference[i] = input[i-1] + reference[i-1];
    }
    if (total_sum != reference[count-1])
        printf("Warning: Exceeding single-precision accuracy.  Scan will be inaccurate.\n");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    int i;
    uint64_t         t0 = 0;
    uint64_t         t1 = 0;
    uint64_t         t2 = 0;
    int              err = 0;
    cl_mem			 output_buffer;
    cl_mem           input_buffer;
    
    // Create some random input data on the host 
    //
    float *float_data = (float*)malloc(count * sizeof(float));
    for (i = 0; i < count; i++)
    {
        float_data[i] = (int)(10 * ((float) rand() / (float) RAND_MAX));
    }

    // Connect to a GPU compute device
    //
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &ComputeDeviceId, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate a compute device!\n");
        return EXIT_FAILURE;
    }

    size_t returned_size = 0;
    size_t max_workgroup_size = 0;
    err = clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }
	
	GROUP_SIZE = min( GROUP_SIZE, max_workgroup_size );

    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err|= clGetDeviceInfo(ComputeDeviceId, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve device info!\n");
        return EXIT_FAILURE;
    }

    printf(SEPARATOR);
    printf("Connecting to %s %s...\n", vendor_name, device_name);

    // Load the compute program from disk into a cstring buffer
    //
    printf(SEPARATOR);
    const char* filename = "./scan_kernel.cl";
    printf("Loading program '%s'...\n", filename);
    printf(SEPARATOR);

    char *source = LoadProgramSourceFromFile(filename);
    if(!source)
    {
        printf("Error: Failed to load compute program from file!\n");
        return EXIT_FAILURE;    
    }
    
    /* 
     * Create a compute ComputeContext 
     * [Context]
     * The context is the environment in which 
     * OpenCL kernels execute. 
     * The context includes a set of devices,
     * the memory accessible to those devices, 
     * and one or more command queues 
     * used to schedule execution of one or more kernels. 
     * A context is needed to share memory objects between devices.
     */
    ComputeContext = clCreateContext(0, 1, &ComputeDeviceId, NULL, NULL, &err);
    if (!ComputeContext)
    {
        printf("Error: Failed to create a compute ComputeContext!\n");
        return EXIT_FAILURE;
    }

    /* 
     * Create a command queue
     * [Command Queue]
     * OpenCL command queues are used for submitting work to a device.
     * They order the execution of kernels on a device 
     * and manipulate memory objects. 
     * OpenCL executes the commands in the order 
     * that you enqueue them. .
     */
    
    ComputeCommands = clCreateCommandQueue(ComputeContext, ComputeDeviceId, 0, &err);
    if (!ComputeCommands)
    {
        printf("Error: Failed to create a command ComputeCommands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    //
    ComputeProgram = clCreateProgramWithSource(ComputeContext, 1, (const char **) & source, NULL, &err);
    if (!ComputeProgram || err != CL_SUCCESS)
    {
        printf("%s\n", source);
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    
    // Build the program executable
    //
    err = clBuildProgram(ComputeProgram, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t length;
        char build_log[2048];
        printf("%s\n", source);
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(ComputeProgram, ComputeDeviceId, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
        printf("%s\n", build_log);
        return EXIT_FAILURE;
    }

    ComputeKernels = (cl_kernel*) malloc(KernelCount * sizeof(cl_kernel));
    for(i = 0; i < KernelCount; i++)
    {    
        // Create each compute kernel from within the program
        //
        ComputeKernels[i] = clCreateKernel(ComputeProgram, KernelNames[i], &err);
        if (!ComputeKernels[i] || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            return EXIT_FAILURE;
        }
		
		size_t wgSize;
		err = clGetKernelWorkGroupInfo(ComputeKernels[i], ComputeDeviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); 
		if(err)
		{
			printf("Error: Failed to get kernel work group size\n");
			return EXIT_FAILURE;
		}
		GROUP_SIZE = min( GROUP_SIZE, wgSize );
		
    }
    /*
     * KernelNames[i]
     * (gdb) p KernelNames[0]
     * $3 = 0x100004740 "PreScanKernel"
     * (gdb) p KernelNames[1]
     * $4 = 0x100004740 "PreScanKernel"
     * (gdb) p KernelNames[2]
     * $5 = 0x10000474e "PreScanStoreSumKernel"
     * (gdb) p KernelNames[3]
     * $6 = 0x100004768 "PreScanStoreSumNonPowerOfTwoKernel"
     * (gdb) p KernelNames[4]
     * $7 = 0x10000478b "PreScanNonPowerOfTwoKernel"
     * (gdb) p KernelNames[5]
     * $8 = 0x1000047a6 "UniformAddKernel"
     */

    free(source);

    // Create the input buffer on the device
    //
    size_t buffer_size = sizeof(float) * count;
    input_buffer = clCreateBuffer(ComputeContext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!input_buffer)
    {
        printf("Error: Failed to allocate input buffer on device!\n");
        return EXIT_FAILURE;
    }

    // Fill the input buffer with the host allocated random data
    //
    err = clEnqueueWriteBuffer(ComputeCommands, input_buffer, CL_TRUE, 0, buffer_size, float_data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return EXIT_FAILURE;
    }

    // Create the output buffer on the device
    //
    output_buffer = clCreateBuffer(ComputeContext, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    if (!output_buffer)
    {
        printf("Error: Failed to allocate result buffer on device!\n");
        return EXIT_FAILURE;
    }

    float* result = (float*)malloc(buffer_size);
    memset(result, 0, buffer_size);
	
    err = clEnqueueWriteBuffer(ComputeCommands, output_buffer, CL_TRUE, 0, buffer_size, result, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return EXIT_FAILURE;
    }
		
    CreatePartialSumBuffers(count);
    PreScanBuffer(output_buffer, input_buffer, GROUP_SIZE, GROUP_SIZE, count);

    printf("Starting timing run of '%d' iterations...\n", iterations);

    t0 = t1 = GetCurrentTime();
    for (i = 0; i < iterations; i++)
    {
        PreScanBuffer(output_buffer, input_buffer, GROUP_SIZE, GROUP_SIZE, count);
    }

    err = clFinish(ComputeCommands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to wait for command queue to finish! %d\n", err);
        return EXIT_FAILURE;
    }
    t2 = GetCurrentTime();

    
    // Calculate the statistics for execution time and throughput
    //
    double t = SubtractTimeInSec(t2, t1);
    printf("Exec Time:  %.2f ms\n", 1000.0 * t / (double)(iterations));
    printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
    printf(SEPARATOR);

    // Read back the results that were computed on the device
    //
    err = clEnqueueReadBuffer(ComputeCommands, output_buffer, CL_TRUE, 0, buffer_size, result, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to read back results from the device!\n");
        return EXIT_FAILURE;
    }

    // Verify the results are correct
    //
    float* reference = (float*) malloc( buffer_size); 
    ScanReference(reference, float_data, count);

    float error = 0.0f;
    float diff = 0.0f;
    for(i = 0; i < count; i++)
    {
        diff = fabs(reference[i] - result[i]);
        error = diff > error ? diff : error;
    }

    if (error > MAX_ERROR)
    {
        printf("Error:   Incorrect results obtained! Max error = %f\n", error);
        return EXIT_FAILURE;
    }
    else
    {
        printf("Results Validated!\n");
        printf(SEPARATOR);
    }
    
    // Shutdown and cleanup
    //
    ReleasePartialSums();    
    for(i = 0; i < KernelCount; i++)
        clReleaseKernel(ComputeKernels[i]);
    clReleaseProgram(ComputeProgram);
    clReleaseMemObject(input_buffer);
    clReleaseMemObject(output_buffer);
    clReleaseCommandQueue(ComputeCommands);
    clReleaseContext(ComputeContext);
    
    free(ComputeKernels);
    free(float_data);
    free(reference);
    free(result);
    
        
    return 0;
}