changeset 293:198affea1be1

merge
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Fri, 10 Feb 2017 08:48:51 +0900
parents 3d70e21a3902 (current diff) 2bc63a22dd21 (diff)
children f6770c0a24c2
files
diffstat 7 files changed, 2374 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/CMakeLists.txt	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 2.8)
+
+# add_definitions("-Wall -g -O0")
+
+set(CMAKE_C_COMPILER $ENV{CBC_COMPILER})
+set(CUDA_LINK_FLAGS "-Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") 
+SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" )
+
+find_package(CUDA REQUIRED)
+
+add_custom_command(OUTPUT main.o
+   DEPENDS main.cu                          
+   COMMAND nvcc -O -c  main.cu 
+)
+
+add_executable(cudaExample main.o test.c)
+
+add_custom_command(OUTPUT twice.o
+   DEPENDS twice.cu                          
+   COMMAND nvcc -O -c  twice.cu 
+)
+
+add_custom_command(OUTPUT multiply.o
+   DEPENDS multiply.cu                          
+   COMMAND nvcc -O -c  multiply.cu 
+)
+
+add_executable(twiceExample twice.o multiply.o test.c)
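+
+# Illustrative note (not part of the original build): each add_custom_command above
+# precompiles a .cu file with nvcc into an object file, and the resulting objects are
+# linked with the CbC-compiled test.c.  Assuming the FindCUDA module located by
+# find_package(CUDA), a roughly equivalent sketch could instead let CMake invoke nvcc:
+#
+#   cuda_add_executable(twiceExample twice.cu multiply.cu test.c)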
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/helper_cuda.h	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,1283 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef HELPER_CUDA_H
+#define HELPER_CUDA_H
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: your SDK sample must include the proper CUDA header files; please refer to the
+// CUDA examples for the headers that are needed, which may change depending on which
+// CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+    switch (error)
+    {
+        case cudaSuccess:
+            return "cudaSuccess";
+
+        case cudaErrorMissingConfiguration:
+            return "cudaErrorMissingConfiguration";
+
+        case cudaErrorMemoryAllocation:
+            return "cudaErrorMemoryAllocation";
+
+        case cudaErrorInitializationError:
+            return "cudaErrorInitializationError";
+
+        case cudaErrorLaunchFailure:
+            return "cudaErrorLaunchFailure";
+
+        case cudaErrorPriorLaunchFailure:
+            return "cudaErrorPriorLaunchFailure";
+
+        case cudaErrorLaunchTimeout:
+            return "cudaErrorLaunchTimeout";
+
+        case cudaErrorLaunchOutOfResources:
+            return "cudaErrorLaunchOutOfResources";
+
+        case cudaErrorInvalidDeviceFunction:
+            return "cudaErrorInvalidDeviceFunction";
+
+        case cudaErrorInvalidConfiguration:
+            return "cudaErrorInvalidConfiguration";
+
+        case cudaErrorInvalidDevice:
+            return "cudaErrorInvalidDevice";
+
+        case cudaErrorInvalidValue:
+            return "cudaErrorInvalidValue";
+
+        case cudaErrorInvalidPitchValue:
+            return "cudaErrorInvalidPitchValue";
+
+        case cudaErrorInvalidSymbol:
+            return "cudaErrorInvalidSymbol";
+
+        case cudaErrorMapBufferObjectFailed:
+            return "cudaErrorMapBufferObjectFailed";
+
+        case cudaErrorUnmapBufferObjectFailed:
+            return "cudaErrorUnmapBufferObjectFailed";
+
+        case cudaErrorInvalidHostPointer:
+            return "cudaErrorInvalidHostPointer";
+
+        case cudaErrorInvalidDevicePointer:
+            return "cudaErrorInvalidDevicePointer";
+
+        case cudaErrorInvalidTexture:
+            return "cudaErrorInvalidTexture";
+
+        case cudaErrorInvalidTextureBinding:
+            return "cudaErrorInvalidTextureBinding";
+
+        case cudaErrorInvalidChannelDescriptor:
+            return "cudaErrorInvalidChannelDescriptor";
+
+        case cudaErrorInvalidMemcpyDirection:
+            return "cudaErrorInvalidMemcpyDirection";
+
+        case cudaErrorAddressOfConstant:
+            return "cudaErrorAddressOfConstant";
+
+        case cudaErrorTextureFetchFailed:
+            return "cudaErrorTextureFetchFailed";
+
+        case cudaErrorTextureNotBound:
+            return "cudaErrorTextureNotBound";
+
+        case cudaErrorSynchronizationError:
+            return "cudaErrorSynchronizationError";
+
+        case cudaErrorInvalidFilterSetting:
+            return "cudaErrorInvalidFilterSetting";
+
+        case cudaErrorInvalidNormSetting:
+            return "cudaErrorInvalidNormSetting";
+
+        case cudaErrorMixedDeviceExecution:
+            return "cudaErrorMixedDeviceExecution";
+
+        case cudaErrorCudartUnloading:
+            return "cudaErrorCudartUnloading";
+
+        case cudaErrorUnknown:
+            return "cudaErrorUnknown";
+
+        case cudaErrorNotYetImplemented:
+            return "cudaErrorNotYetImplemented";
+
+        case cudaErrorMemoryValueTooLarge:
+            return "cudaErrorMemoryValueTooLarge";
+
+        case cudaErrorInvalidResourceHandle:
+            return "cudaErrorInvalidResourceHandle";
+
+        case cudaErrorNotReady:
+            return "cudaErrorNotReady";
+
+        case cudaErrorInsufficientDriver:
+            return "cudaErrorInsufficientDriver";
+
+        case cudaErrorSetOnActiveProcess:
+            return "cudaErrorSetOnActiveProcess";
+
+        case cudaErrorInvalidSurface:
+            return "cudaErrorInvalidSurface";
+
+        case cudaErrorNoDevice:
+            return "cudaErrorNoDevice";
+
+        case cudaErrorECCUncorrectable:
+            return "cudaErrorECCUncorrectable";
+
+        case cudaErrorSharedObjectSymbolNotFound:
+            return "cudaErrorSharedObjectSymbolNotFound";
+
+        case cudaErrorSharedObjectInitFailed:
+            return "cudaErrorSharedObjectInitFailed";
+
+        case cudaErrorUnsupportedLimit:
+            return "cudaErrorUnsupportedLimit";
+
+        case cudaErrorDuplicateVariableName:
+            return "cudaErrorDuplicateVariableName";
+
+        case cudaErrorDuplicateTextureName:
+            return "cudaErrorDuplicateTextureName";
+
+        case cudaErrorDuplicateSurfaceName:
+            return "cudaErrorDuplicateSurfaceName";
+
+        case cudaErrorDevicesUnavailable:
+            return "cudaErrorDevicesUnavailable";
+
+        case cudaErrorInvalidKernelImage:
+            return "cudaErrorInvalidKernelImage";
+
+        case cudaErrorNoKernelImageForDevice:
+            return "cudaErrorNoKernelImageForDevice";
+
+        case cudaErrorIncompatibleDriverContext:
+            return "cudaErrorIncompatibleDriverContext";
+
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return "cudaErrorPeerAccessAlreadyEnabled";
+
+        case cudaErrorPeerAccessNotEnabled:
+            return "cudaErrorPeerAccessNotEnabled";
+
+        case cudaErrorDeviceAlreadyInUse:
+            return "cudaErrorDeviceAlreadyInUse";
+
+        case cudaErrorProfilerDisabled:
+            return "cudaErrorProfilerDisabled";
+
+        case cudaErrorProfilerNotInitialized:
+            return "cudaErrorProfilerNotInitialized";
+
+        case cudaErrorProfilerAlreadyStarted:
+            return "cudaErrorProfilerAlreadyStarted";
+
+        case cudaErrorProfilerAlreadyStopped:
+            return "cudaErrorProfilerAlreadyStopped";
+
+        /* Since CUDA 4.0*/
+        case cudaErrorAssert:
+            return "cudaErrorAssert";
+
+        case cudaErrorTooManyPeers:
+            return "cudaErrorTooManyPeers";
+
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return "cudaErrorHostMemoryAlreadyRegistered";
+
+        case cudaErrorHostMemoryNotRegistered:
+            return "cudaErrorHostMemoryNotRegistered";
+
+        /* Since CUDA 5.0 */
+        case cudaErrorOperatingSystem:
+            return "cudaErrorOperatingSystem";
+
+        case cudaErrorPeerAccessUnsupported:
+            return "cudaErrorPeerAccessUnsupported";
+
+        case cudaErrorLaunchMaxDepthExceeded:
+            return "cudaErrorLaunchMaxDepthExceeded";
+
+        case cudaErrorLaunchFileScopedTex:
+            return "cudaErrorLaunchFileScopedTex";
+
+        case cudaErrorLaunchFileScopedSurf:
+            return "cudaErrorLaunchFileScopedSurf";
+
+        case cudaErrorSyncDepthExceeded:
+            return "cudaErrorSyncDepthExceeded";
+
+        case cudaErrorLaunchPendingCountExceeded:
+            return "cudaErrorLaunchPendingCountExceeded";
+
+        case cudaErrorNotPermitted:
+            return "cudaErrorNotPermitted";
+
+        case cudaErrorNotSupported:
+            return "cudaErrorNotSupported";
+
+        /* Since CUDA 6.0 */
+        case cudaErrorHardwareStackError:
+            return "cudaErrorHardwareStackError";
+
+        case cudaErrorIllegalInstruction:
+            return "cudaErrorIllegalInstruction";
+
+        case cudaErrorMisalignedAddress:
+            return "cudaErrorMisalignedAddress";
+
+        case cudaErrorInvalidAddressSpace:
+            return "cudaErrorInvalidAddressSpace";
+
+        case cudaErrorInvalidPc:
+            return "cudaErrorInvalidPc";
+
+        case cudaErrorIllegalAddress:
+            return "cudaErrorIllegalAddress";
+
+        /* Since CUDA 6.5*/
+        case cudaErrorInvalidPtx:
+            return "cudaErrorInvalidPtx";
+
+        case cudaErrorInvalidGraphicsContext:
+            return "cudaErrorInvalidGraphicsContext";
+
+        case cudaErrorStartupFailure:
+            return "cudaErrorStartupFailure";
+
+        case cudaErrorApiFailureBase:
+            return "cudaErrorApiFailureBase";
+
+        /* Since CUDA 8.0 */
+        case cudaErrorNvlinkUncorrectable:
+            return "cudaErrorNvlinkUncorrectable";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef __cuda_cuda_h__
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error)
+{
+    switch (error)
+    {
+        case CUDA_SUCCESS:
+            return "CUDA_SUCCESS";
+
+        case CUDA_ERROR_INVALID_VALUE:
+            return "CUDA_ERROR_INVALID_VALUE";
+
+        case CUDA_ERROR_OUT_OF_MEMORY:
+            return "CUDA_ERROR_OUT_OF_MEMORY";
+
+        case CUDA_ERROR_NOT_INITIALIZED:
+            return "CUDA_ERROR_NOT_INITIALIZED";
+
+        case CUDA_ERROR_DEINITIALIZED:
+            return "CUDA_ERROR_DEINITIALIZED";
+
+        case CUDA_ERROR_PROFILER_DISABLED:
+            return "CUDA_ERROR_PROFILER_DISABLED";
+
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+            return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+            return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+            return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+
+        case CUDA_ERROR_NO_DEVICE:
+            return "CUDA_ERROR_NO_DEVICE";
+
+        case CUDA_ERROR_INVALID_DEVICE:
+            return "CUDA_ERROR_INVALID_DEVICE";
+
+        case CUDA_ERROR_INVALID_IMAGE:
+            return "CUDA_ERROR_INVALID_IMAGE";
+
+        case CUDA_ERROR_INVALID_CONTEXT:
+            return "CUDA_ERROR_INVALID_CONTEXT";
+
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+            return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+
+        case CUDA_ERROR_MAP_FAILED:
+            return "CUDA_ERROR_MAP_FAILED";
+
+        case CUDA_ERROR_UNMAP_FAILED:
+            return "CUDA_ERROR_UNMAP_FAILED";
+
+        case CUDA_ERROR_ARRAY_IS_MAPPED:
+            return "CUDA_ERROR_ARRAY_IS_MAPPED";
+
+        case CUDA_ERROR_ALREADY_MAPPED:
+            return "CUDA_ERROR_ALREADY_MAPPED";
+
+        case CUDA_ERROR_NO_BINARY_FOR_GPU:
+            return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+
+        case CUDA_ERROR_ALREADY_ACQUIRED:
+            return "CUDA_ERROR_ALREADY_ACQUIRED";
+
+        case CUDA_ERROR_NOT_MAPPED:
+            return "CUDA_ERROR_NOT_MAPPED";
+
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+            return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+            return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+
+        case CUDA_ERROR_ECC_UNCORRECTABLE:
+            return "CUDA_ERROR_ECC_UNCORRECTABLE";
+
+        case CUDA_ERROR_UNSUPPORTED_LIMIT:
+            return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+            return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+
+        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
+            return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
+
+        case CUDA_ERROR_INVALID_PTX:
+            return "CUDA_ERROR_INVALID_PTX";
+
+        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
+            return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
+
+        case CUDA_ERROR_NVLINK_UNCORRECTABLE:
+            return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
+
+        case CUDA_ERROR_INVALID_SOURCE:
+            return "CUDA_ERROR_INVALID_SOURCE";
+
+        case CUDA_ERROR_FILE_NOT_FOUND:
+            return "CUDA_ERROR_FILE_NOT_FOUND";
+
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+            return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+            return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+
+        case CUDA_ERROR_OPERATING_SYSTEM:
+            return "CUDA_ERROR_OPERATING_SYSTEM";
+
+        case CUDA_ERROR_INVALID_HANDLE:
+            return "CUDA_ERROR_INVALID_HANDLE";
+
+        case CUDA_ERROR_NOT_FOUND:
+            return "CUDA_ERROR_NOT_FOUND";
+
+        case CUDA_ERROR_NOT_READY:
+            return "CUDA_ERROR_NOT_READY";
+
+        case CUDA_ERROR_ILLEGAL_ADDRESS:
+            return "CUDA_ERROR_ILLEGAL_ADDRESS";
+
+        case CUDA_ERROR_LAUNCH_FAILED:
+            return "CUDA_ERROR_LAUNCH_FAILED";
+
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+            return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+        case CUDA_ERROR_LAUNCH_TIMEOUT:
+            return "CUDA_ERROR_LAUNCH_TIMEOUT";
+
+        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+            return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+            return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+            return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+            return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+
+        case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+            return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+
+        case CUDA_ERROR_ASSERT:
+            return "CUDA_ERROR_ASSERT";
+
+        case CUDA_ERROR_TOO_MANY_PEERS:
+            return "CUDA_ERROR_TOO_MANY_PEERS";
+
+        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+            return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+
+        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+            return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+
+        case CUDA_ERROR_HARDWARE_STACK_ERROR:
+            return "CUDA_ERROR_HARDWARE_STACK_ERROR";
+
+        case CUDA_ERROR_ILLEGAL_INSTRUCTION:
+            return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
+
+        case CUDA_ERROR_MISALIGNED_ADDRESS:
+            return "CUDA_ERROR_MISALIGNED_ADDRESS";
+
+        case CUDA_ERROR_INVALID_ADDRESS_SPACE:
+            return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
+
+        case CUDA_ERROR_INVALID_PC:
+            return "CUDA_ERROR_INVALID_PC";
+
+        case CUDA_ERROR_NOT_PERMITTED:
+            return "CUDA_ERROR_NOT_PERMITTED";
+
+        case CUDA_ERROR_NOT_SUPPORTED:
+            return "CUDA_ERROR_NOT_SUPPORTED";
+
+        case CUDA_ERROR_UNKNOWN:
+            return "CUDA_ERROR_UNKNOWN";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error)
+{
+    switch (error)
+    {
+        case CUFFT_SUCCESS:
+            return "CUFFT_SUCCESS";
+
+        case CUFFT_INVALID_PLAN:
+            return "CUFFT_INVALID_PLAN";
+
+        case CUFFT_ALLOC_FAILED:
+            return "CUFFT_ALLOC_FAILED";
+
+        case CUFFT_INVALID_TYPE:
+            return "CUFFT_INVALID_TYPE";
+
+        case CUFFT_INVALID_VALUE:
+            return "CUFFT_INVALID_VALUE";
+
+        case CUFFT_INTERNAL_ERROR:
+            return "CUFFT_INTERNAL_ERROR";
+
+        case CUFFT_EXEC_FAILED:
+            return "CUFFT_EXEC_FAILED";
+
+        case CUFFT_SETUP_FAILED:
+            return "CUFFT_SETUP_FAILED";
+
+        case CUFFT_INVALID_SIZE:
+            return "CUFFT_INVALID_SIZE";
+
+        case CUFFT_UNALIGNED_DATA:
+            return "CUFFT_UNALIGNED_DATA";
+
+        case CUFFT_INCOMPLETE_PARAMETER_LIST:
+            return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+        case CUFFT_INVALID_DEVICE:
+            return "CUFFT_INVALID_DEVICE";
+
+        case CUFFT_PARSE_ERROR:
+            return "CUFFT_PARSE_ERROR";
+
+        case CUFFT_NO_WORKSPACE:
+            return "CUFFT_NO_WORKSPACE";
+
+        case CUFFT_NOT_IMPLEMENTED:
+            return "CUFFT_NOT_IMPLEMENTED";
+
+        case CUFFT_LICENSE_ERROR:
+            return "CUFFT_LICENSE_ERROR";
+
+        case CUFFT_NOT_SUPPORTED:
+            return "CUFFT_NOT_SUPPORTED";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+//cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error)
+{
+   switch(error)
+   {
+       case CUSOLVER_STATUS_SUCCESS:
+           return "CUSOLVER_STATUS_SUCCESS";
+       case CUSOLVER_STATUS_NOT_INITIALIZED:
+           return "CUSOLVER_STATUS_NOT_INITIALIZED";
+       case CUSOLVER_STATUS_ALLOC_FAILED:
+           return "CUSOLVER_STATUS_ALLOC_FAILED";
+       case CUSOLVER_STATUS_INVALID_VALUE:
+           return "CUSOLVER_STATUS_INVALID_VALUE";
+       case CUSOLVER_STATUS_ARCH_MISMATCH:
+           return "CUSOLVER_STATUS_ARCH_MISMATCH";
+       case CUSOLVER_STATUS_MAPPING_ERROR:
+           return "CUSOLVER_STATUS_MAPPING_ERROR";
+       case CUSOLVER_STATUS_EXECUTION_FAILED:
+           return "CUSOLVER_STATUS_EXECUTION_FAILED";
+       case CUSOLVER_STATUS_INTERNAL_ERROR:
+           return "CUSOLVER_STATUS_INTERNAL_ERROR";
+       case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+           return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+       case CUSOLVER_STATUS_NOT_SUPPORTED:
+           return "CUSOLVER_STATUS_NOT_SUPPORTED";
+       case CUSOLVER_STATUS_ZERO_PIVOT:
+           return "CUSOLVER_STATUS_ZERO_PIVOT";
+       case CUSOLVER_STATUS_INVALID_LICENSE:
+           return "CUSOLVER_STATUS_INVALID_LICENSE";
+    }
+
+    return "<unknown>";
+
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error)
+{
+    switch (error)
+    {
+        case CURAND_STATUS_SUCCESS:
+            return "CURAND_STATUS_SUCCESS";
+
+        case CURAND_STATUS_VERSION_MISMATCH:
+            return "CURAND_STATUS_VERSION_MISMATCH";
+
+        case CURAND_STATUS_NOT_INITIALIZED:
+            return "CURAND_STATUS_NOT_INITIALIZED";
+
+        case CURAND_STATUS_ALLOCATION_FAILED:
+            return "CURAND_STATUS_ALLOCATION_FAILED";
+
+        case CURAND_STATUS_TYPE_ERROR:
+            return "CURAND_STATUS_TYPE_ERROR";
+
+        case CURAND_STATUS_OUT_OF_RANGE:
+            return "CURAND_STATUS_OUT_OF_RANGE";
+
+        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+            return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+        case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+            return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+        case CURAND_STATUS_LAUNCH_FAILURE:
+            return "CURAND_STATUS_LAUNCH_FAILURE";
+
+        case CURAND_STATUS_PREEXISTING_FAILURE:
+            return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+        case CURAND_STATUS_INITIALIZATION_FAILED:
+            return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+        case CURAND_STATUS_ARCH_MISMATCH:
+            return "CURAND_STATUS_ARCH_MISMATCH";
+
+        case CURAND_STATUS_INTERNAL_ERROR:
+            return "CURAND_STATUS_INTERNAL_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error)
+{
+    switch (error)
+    {
+        case NPP_NOT_SUPPORTED_MODE_ERROR:
+            return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+        case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+            return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+        case NPP_RESIZE_NO_OPERATION_ERROR:
+            return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+        case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+            return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+        case NPP_BAD_ARG_ERROR:
+            return "NPP_BAD_ARGUMENT_ERROR";
+
+        case NPP_COEFF_ERROR:
+            return "NPP_COEFFICIENT_ERROR";
+
+        case NPP_RECT_ERROR:
+            return "NPP_RECTANGLE_ERROR";
+
+        case NPP_QUAD_ERROR:
+            return "NPP_QUADRANGLE_ERROR";
+
+        case NPP_MEM_ALLOC_ERR:
+            return "NPP_MEMORY_ALLOCATION_ERROR";
+
+        case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_INVALID_INPUT:
+            return "NPP_INVALID_INPUT";
+
+        case NPP_POINTER_ERROR:
+            return "NPP_POINTER_ERROR";
+
+        case NPP_WARNING:
+            return "NPP_WARNING";
+
+        case NPP_ODD_ROI_WARNING:
+            return "NPP_ODD_ROI_WARNING";
+#else
+
+            // These are for CUDA 5.5 or higher
+        case NPP_BAD_ARGUMENT_ERROR:
+            return "NPP_BAD_ARGUMENT_ERROR";
+
+        case NPP_COEFFICIENT_ERROR:
+            return "NPP_COEFFICIENT_ERROR";
+
+        case NPP_RECTANGLE_ERROR:
+            return "NPP_RECTANGLE_ERROR";
+
+        case NPP_QUADRANGLE_ERROR:
+            return "NPP_QUADRANGLE_ERROR";
+
+        case NPP_MEMORY_ALLOCATION_ERR:
+            return "NPP_MEMORY_ALLOCATION_ERROR";
+
+        case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_INVALID_HOST_POINTER_ERROR:
+            return "NPP_INVALID_HOST_POINTER_ERROR";
+
+        case NPP_INVALID_DEVICE_POINTER_ERROR:
+            return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+        case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_TEXTURE_BIND_ERROR:
+            return "NPP_TEXTURE_BIND_ERROR";
+
+        case NPP_WRONG_INTERSECTION_ROI_ERROR:
+            return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+        case NPP_NOT_EVEN_STEP_ERROR:
+            return "NPP_NOT_EVEN_STEP_ERROR";
+
+        case NPP_INTERPOLATION_ERROR:
+            return "NPP_INTERPOLATION_ERROR";
+
+        case NPP_RESIZE_FACTOR_ERROR:
+            return "NPP_RESIZE_FACTOR_ERROR";
+
+        case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+            return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+        case NPP_MEMFREE_ERR:
+            return "NPP_MEMFREE_ERR";
+
+        case NPP_MEMSET_ERR:
+            return "NPP_MEMSET_ERR";
+
+        case NPP_MEMCPY_ERR:
+            return "NPP_MEMCPY_ERROR";
+
+        case NPP_MIRROR_FLIP_ERR:
+            return "NPP_MIRROR_FLIP_ERR";
+#else
+
+        case NPP_MEMFREE_ERROR:
+            return "NPP_MEMFREE_ERROR";
+
+        case NPP_MEMSET_ERROR:
+            return "NPP_MEMSET_ERROR";
+
+        case NPP_MEMCPY_ERROR:
+            return "NPP_MEMCPY_ERROR";
+
+        case NPP_MIRROR_FLIP_ERROR:
+            return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+        case NPP_ALIGNMENT_ERROR:
+            return "NPP_ALIGNMENT_ERROR";
+
+        case NPP_STEP_ERROR:
+            return "NPP_STEP_ERROR";
+
+        case NPP_SIZE_ERROR:
+            return "NPP_SIZE_ERROR";
+
+        case NPP_NULL_POINTER_ERROR:
+            return "NPP_NULL_POINTER_ERROR";
+
+        case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+            return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+        case NPP_NOT_IMPLEMENTED_ERROR:
+            return "NPP_NOT_IMPLEMENTED_ERROR";
+
+        case NPP_ERROR:
+            return "NPP_ERROR";
+
+        case NPP_SUCCESS:
+            return "NPP_SUCCESS";
+
+        case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+            return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+        case NPP_MISALIGNED_DST_ROI_WARNING:
+            return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+        case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+            return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+        case NPP_DOUBLE_SIZE_WARNING:
+            return "NPP_DOUBLE_SIZE_WARNING";
+
+        case NPP_WRONG_INTERSECTION_ROI_WARNING:
+            return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+        /* These are 6.0 or higher */
+        case NPP_LUT_PALETTE_BITSIZE_ERROR:
+            return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+        case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+            return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+        case NPP_QUALITY_INDEX_ERROR:
+            return "NPP_QUALITY_INDEX_ERROR";
+
+        case NPP_CHANNEL_ORDER_ERROR:
+            return "NPP_CHANNEL_ORDER_ERROR";
+
+        case NPP_ZERO_MASK_VALUE_ERROR:
+            return "NPP_ZERO_MASK_VALUE_ERROR";
+
+        case NPP_NUMBER_OF_CHANNELS_ERROR:
+            return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+        case NPP_COI_ERROR:
+            return "NPP_COI_ERROR";
+
+        case NPP_DIVISOR_ERROR:
+            return "NPP_DIVISOR_ERROR";
+
+        case NPP_CHANNEL_ERROR:
+            return "NPP_CHANNEL_ERROR";
+
+        case NPP_STRIDE_ERROR:
+            return "NPP_STRIDE_ERROR";
+
+        case NPP_ANCHOR_ERROR:
+            return "NPP_ANCHOR_ERROR";
+
+        case NPP_MASK_SIZE_ERROR:
+            return "NPP_MASK_SIZE_ERROR";
+
+        case NPP_MOMENT_00_ZERO_ERROR:
+            return "NPP_MOMENT_00_ZERO_ERROR";
+
+        case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+            return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+        case NPP_THRESHOLD_ERROR:
+            return "NPP_THRESHOLD_ERROR";
+
+        case NPP_CONTEXT_MATCH_ERROR:
+            return "NPP_CONTEXT_MATCH_ERROR";
+
+        case NPP_FFT_FLAG_ERROR:
+            return "NPP_FFT_FLAG_ERROR";
+
+        case NPP_FFT_ORDER_ERROR:
+            return "NPP_FFT_ORDER_ERROR";
+
+        case NPP_SCALE_RANGE_ERROR:
+            return "NPP_SCALE_RANGE_ERROR";
+
+        case NPP_DATA_TYPE_ERROR:
+            return "NPP_DATA_TYPE_ERROR";
+
+        case NPP_OUT_OFF_RANGE_ERROR:
+            return "NPP_OUT_OFF_RANGE_ERROR";
+
+        case NPP_DIVIDE_BY_ZERO_ERROR:
+            return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+        case NPP_RANGE_ERROR:
+            return "NPP_RANGE_ERROR";
+
+        case NPP_NO_MEMORY_ERROR:
+            return "NPP_NO_MEMORY_ERROR";
+
+        case NPP_ERROR_RESERVED:
+            return "NPP_ERROR_RESERVED";
+
+        case NPP_NO_OPERATION_WARNING:
+            return "NPP_NO_OPERATION_WARNING";
+
+        case NPP_DIVIDE_BY_ZERO_WARNING:
+            return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+        /* These are 7.0 or higher */
+        case NPP_OVERFLOW_ERROR:
+            return "NPP_OVERFLOW_ERROR";
+
+        case NPP_CORRUPTED_DATA_ERROR:
+            return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset();
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+
+template< typename T >
+void check(T result, char const *const func, const char *const file, int const line)
+{
+    if (result)
+    {
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+                file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+        DEVICE_RESET
+        // Make sure we call CUDA Device Reset before exiting
+        exit(EXIT_FAILURE);
+    }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
+{
+    cudaError_t err = cudaGetLastError();
+
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+                file, line, errorMessage, (int)err, cudaGetErrorString(err));
+        DEVICE_RESET
+        exit(EXIT_FAILURE);
+    }
+}
+#endif
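+
+// Illustrative usage sketch (not part of the original header): runtime API calls are
+// wrapped in checkCudaErrors(), and kernel launches are followed by getLastCudaError().
+// "myKernel" below is a hypothetical kernel used only for this example.
+//
+//     float *d_buf = NULL;
+//     checkCudaErrors(cudaMalloc((void **)&d_buf, 1024 * sizeof(float)));
+//     myKernel<<<1, 256>>>(d_buf);
+//     getLastCudaError("myKernel launch failed");
+//     checkCudaErrors(cudaFree(d_buf));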
+
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value)
+{
+    return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
+    typedef struct
+    {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] =
+    {
+        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+        { 0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
+        { 0x60, 64 }, // Pascal Generation (SM 6.0) GP100 class
+        { 0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
+        { 0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
+        {   -1, -1 }
+    };
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1)
+    {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // If we don't find the value, default to the last known entry so the sample can still run
+    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
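+
+// Example (illustrative): for a Pascal GP10x device reporting compute capability 6.1,
+// _ConvertSMVer2Cores(6, 1) looks up 0x61 in the table above and returns 128 cores/SM.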
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID)
+{
+    int device_count;
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (devID < 0)
+    {
+        devID = 0;
+    }
+
+    if (devID > device_count-1)
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
+        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
+        fprintf(stderr, "\n");
+        return -devID;
+    }
+
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+
+    if (deviceProp.computeMode == cudaComputeModeProhibited)
+    {
+        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+        return -1;
+    }
+
+    if (deviceProp.major < 1)
+    {
+        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    checkCudaErrors(cudaSetDevice(devID));
+    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
+
+    return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId()
+{
+    int current_device     = 0, sm_per_multiproc  = 0;
+    int max_perf_device    = 0;
+    int device_count       = 0, best_SM_arch      = 0;
+    int devices_prohibited = 0;
+
+    unsigned long long max_compute_perf = 0;
+    cudaDeviceProp deviceProp;
+
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best major SM Architecture GPU device
+    while (current_device < device_count)
+    {
+        cudaGetDeviceProperties(&deviceProp, current_device);
+
+        // If this GPU is not running in Compute Mode Prohibited, then we can add it to the list
+        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        {
+            if (deviceProp.major > 0 && deviceProp.major < 9999)
+            {
+                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
+            }
+        }
+        else
+        {
+            devices_prohibited++;
+        }
+
+        current_device++;
+    }
+
+    if (devices_prohibited == device_count)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best CUDA capable GPU device
+    current_device = 0;
+
+    while (current_device < device_count)
+    {
+        cudaGetDeviceProperties(&deviceProp, current_device);
+
+        // If this GPU is not running in Compute Mode Prohibited, then we can add it to the list
+        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        {
+            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
+            {
+                sm_per_multiproc = 1;
+            }
+            else
+            {
+                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
+            }
+
+            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+
+            if (compute_perf  > max_compute_perf)
+            {
+                // If we find GPU with SM major > 2, search only these
+                if (best_SM_arch > 2)
+                {
+                    // If our device == best_SM_arch, choose this, or else pass
+                    if (deviceProp.major == best_SM_arch)
+                    {
+                        max_compute_perf  = compute_perf;
+                        max_perf_device   = current_device;
+                    }
+                }
+                else
+                {
+                    max_compute_perf  = compute_perf;
+                    max_perf_device   = current_device;
+                }
+            }
+        }
+
+        ++current_device;
+    }
+
+    return max_perf_device;
+}
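+
+// Note (illustrative): the selection above ranks each device by an approximate peak
+// throughput, multiProcessorCount * coresPerSM * clockRate, and prefers devices whose
+// SM major version matches the best architecture found in the first pass.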
+
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv)
+{
+    cudaDeviceProp deviceProp;
+    int devID = 0;
+
+    // If the command-line has a device number specified, use it
+    if (checkCmdLineFlag(argc, argv, "device"))
+    {
+        devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+        if (devID < 0)
+        {
+            printf("Invalid command line parameter\n ");
+            exit(EXIT_FAILURE);
+        }
+        else
+        {
+            devID = gpuDeviceInit(devID);
+
+            if (devID < 0)
+            {
+                printf("exiting...\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+    else
+    {
+        // Otherwise pick the device with highest Gflops/s
+        devID = gpuGetMaxGflopsDeviceId();
+        checkCudaErrors(cudaSetDevice(devID));
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
+    }
+
+    return devID;
+}
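+
+// Illustrative usage sketch (not part of the original header): a typical sample's
+// main() selects a device once at startup, honoring an optional "-device=N" flag:
+//
+//     int main(int argc, char **argv)
+//     {
+//         int devID = findCudaDevice(argc, (const char **)argv);
+//         ...
+//     }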
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version)
+{
+    cudaDeviceProp deviceProp;
+    deviceProp.major = 0;
+    deviceProp.minor = 0;
+    int dev;
+
+    checkCudaErrors(cudaGetDevice(&dev));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+    if ((deviceProp.major > major_version) ||
+        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
+    {
+        printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
+        return true;
+    }
+    else
+    {
+        printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
+        return false;
+    }
+}
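+
+// Example (illustrative): a sample that needs at least SM 3.5 could guard itself with
+//
+//     if (!checkCudaCapabilities(3, 5))
+//     {
+//         exit(EXIT_WAIVED);
+//     }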
+#endif
+
+// end of CUDA Helper Functions
+
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/helper_string.h	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,527 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef STRING_HELPER_H
+#define STRING_HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP  _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP  strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string)
+{
+    int string_start = 0;
+
+    while (string[string_start] == delimiter)
+    {
+        string_start++;
+    }
+
+    if (string_start >= (int)strlen(string)-1)
+    {
+        return 0;
+    }
+
+    return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension)
+{
+    int string_length = (int)strlen(filename);
+
+    while (filename[string_length--] != '.')
+    {
+        if (string_length == 0)
+            break;
+    }
+
+    if (string_length > 0) string_length += 2;
+
+    if (string_length == 0)
+        *extension = NULL;
+    else
+        *extension = &filename[string_length];
+
+    return string_length;
+}
+
+
+inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+
+            const char *equal_pos = strchr(string_argv, '=');
+            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+            int length = (int)strlen(string_ref);
+
+            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
+            {
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    return bFound;
+}
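+
+// Example (illustrative): checkCmdLineFlag(argc, argv, "device") returns true when the
+// program was started with "-device" or "--device" (with or without an "=value" suffix).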
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    *value = (T)atoi(&string_argv[length + auto_inc]);
+                }
+
+                bFound = true;
+                i=argc;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    int value = -1;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = atoi(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
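+
+// Example (illustrative): with argv containing "-device=2",
+// getCmdLineArgumentInt(argc, argv, "device=") returns 2; if the flag is absent it returns 0.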
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    float value = -1;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = (float)atof(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0.f;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref, char **string_retval)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            char *string_argv = (char *)&argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                *string_retval = &string_argv[length+1];
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (!bFound)
+    {
+        *string_retval = NULL;
+    }
+
+    return bFound;
+}
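+
+// Example (illustrative): with argv containing "-file=input.dat",
+// getCmdLineArgumentString(argc, argv, "file", &name) sets name to "input.dat" and returns true.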
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename, const char *executable_path)
+{
+    // <executable_name> defines a variable that is replaced with the name of the executable
+
+    // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
+    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
+    const char *searchPath[] =
+    {
+        "./",                                       // same dir
+        "./<executable_name>_data_files/",
+        "./common/",                                // "/common/" subdir
+        "./common/data/",                           // "/common/data/" subdir
+        "./data/",                                  // "/data/" subdir
+        "./src/",                                   // "/src/" subdir
+        "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
+        "./inc/",                                   // "/inc/" subdir
+        "./0_Simple/",                              // "/0_Simple/" subdir
+        "./1_Utilities/",                           // "/1_Utilities/" subdir
+        "./2_Graphics/",                            // "/2_Graphics/" subdir
+        "./3_Imaging/",                             // "/3_Imaging/" subdir
+        "./4_Finance/",                             // "/4_Finance/" subdir
+        "./5_Simulations/",                         // "/5_Simulations/" subdir
+        "./6_Advanced/",                            // "/6_Advanced/" subdir
+        "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
+        "./8_Android/",                             // "/8_Android/" subdir
+        "./samples/",                               // "/samples/" subdir
+
+        "./0_Simple/<executable_name>/data/",        // "/0_Simple/<executable_name>/data/" subdir
+        "./1_Utilities/<executable_name>/data/",     // "/1_Utilities/<executable_name>/data/" subdir
+        "./2_Graphics/<executable_name>/data/",      // "/2_Graphics/<executable_name>/data/" subdir
+        "./3_Imaging/<executable_name>/data/",       // "/3_Imaging/<executable_name>/data/" subdir
+        "./4_Finance/<executable_name>/data/",       // "/4_Finance/<executable_name>/data/" subdir
+        "./5_Simulations/<executable_name>/data/",   // "/5_Simulations/<executable_name>/data/" subdir
+        "./6_Advanced/<executable_name>/data/",      // "/6_Advanced/<executable_name>/data/" subdir
+        "./7_CUDALibraries/<executable_name>/",      // "/7_CUDALibraries/<executable_name>/" subdir
+        "./7_CUDALibraries/<executable_name>/data/", // "/7_CUDALibraries/<executable_name>/data/" subdir
+
+        "../",                                      // up 1 in tree
+        "../common/",                               // up 1 in tree, "/common/" subdir
+        "../common/data/",                          // up 1 in tree, "/common/data/" subdir
+        "../data/",                                 // up 1 in tree, "/data/" subdir
+        "../src/",                                  // up 1 in tree, "/src/" subdir
+        "../inc/",                                  // up 1 in tree, "/inc/" subdir
+
+        "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
+        "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../4_Finance/<executable_name>/data/",      // up 1 in tree, "/4_Finance/<executable_name>/" subdir
+        "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../8_Android/<executable_name>/data/",      // up 1 in tree, "/8_Android/<executable_name>/" subdir
+        "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
+        "../../",                                        // up 2 in tree
+        "../../common/",                                 // up 2 in tree, "/common/" subdir
+        "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
+        "../../data/",                                   // up 2 in tree, "/data/" subdir
+        "../../src/",                                    // up 2 in tree, "/src/" subdir
+        "../../inc/",                                    // up 2 in tree, "/inc/" subdir
+        "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
+        "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../4_Finance/<executable_name>/data/",       // up 2 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../8_Android/<executable_name>/data/",       // up 2 in tree, "/8_Android/<executable_name>/" subdir
+        "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
+        "../../../",                                        // up 3 in tree
+        "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
+        "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
+        "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
+        "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Finance/<executable_name>/data/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/data/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../0_Simple/<executable_name>/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Finance/<executable_name>/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
+        "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
+        "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
+        "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
+        "../../../../",                                // up 4 in tree
+        "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
+        "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Finance/<executable_name>/data/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/data/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../0_Simple/<executable_name>/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Finance/<executable_name>/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
+        "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
+        "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
+        "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
+        "../../../../../",                                // up 5 in tree
+        "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
+        "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../../4_Finance/<executable_name>/data/",    // up 5 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../../8_Android/<executable_name>/data/",    // up 5 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
+        "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
+        "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
+    };
+
+    // Extract the executable name
+    std::string executable_name;
+
+    if (executable_path != 0)
+    {
+        executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('\\');
+        executable_name.erase(0, delimiter_pos + 1);
+
+        if (executable_name.rfind(".exe") != std::string::npos)
+        {
+            // strip the .exe suffix, but only if it is present
+            executable_name.resize(executable_name.size() - 4);
+        }
+
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0,delimiter_pos+1);
+#endif
+    }
+
+    // Loop over all search paths and return the first hit
+    for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
+    {
+        std::string path(searchPath[i]);
+        size_t executable_name_pos = path.find("<executable_name>");
+
+        // If the search path contains the <executable_name> placeholder,
+        // replace it with the actual executable name
+        if (executable_name_pos != std::string::npos)
+        {
+            if (executable_path != 0)
+            {
+                path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
+            }
+            else
+            {
+                // Skip this path entry if no executable argument is given
+                continue;
+            }
+        }
+
+#ifdef _DEBUG
+        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+        // Test if the file exists
+        path.append(filename);
+        FILE *fp;
+        FOPEN(fp, path.c_str(), "rb");
+
+        if (fp != NULL)
+        {
+            fclose(fp);
+            // File found
+            // returning an allocated array here for backwards compatibility reasons
+            char *file_path = (char *) malloc(path.length() + 1);
+            STRCPY(file_path, path.length() + 1, path.c_str());
+            return file_path;
+        }
+
+    }
+
+    // File not found
+    return 0;
+}
+
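+// A minimal usage sketch (a hypothetical helper, not part of the original
+// header): sdkFindFilePath() above returns a heap-allocated copy of the first
+// matching path (or 0 if nothing was found), so the caller must free() it.
+static inline FILE *sdkOpenDataFile(const char *filename, const char *executable_path)
+{
+    FILE *fp = NULL;
+    char *path = sdkFindFilePath(filename, executable_path);
+
+    if (path != NULL)
+    {
+        FOPEN(fp, path, "rb");   // FOPEN is the portable fopen wrapper used above
+        free(path);
+    }
+
+    return fp;
+}
+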
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/main.cu	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,368 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+ * Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two memcopies
+ * with kernel execution. This sample illustrates the usage of CUDA streams to
+ * achieve overlapping of kernel execution with copying data to and from the device.
+ *
+ * Additionally, this sample uses CUDA events to measure elapsed time for
+ * CUDA calls.  Events are a part of CUDA API and provide a system independent
+ * way to measure execution times on CUDA devices with approximately 0.5
+ * microsecond precision.
+ *
+ * Elapsed times are averaged over nreps repetitions (10 by default).
+ *
+*/
+
+const char *sSDKname = "simpleMultiCopy";
+
+// includes, system
+#include <stdio.h>
+
+extern "C" {
+extern void test1();
+}
+// include CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// includes, project
+//#include <helper_cuda.h>
+//#include <helper_functions.h>  // helper for shared that are common to CUDA Samples
+
+#include "helper_cuda.h"
+
+// includes, kernels
+// Declare the CUDA kernel here, together with the main() code that launches
+// the compute workload on the system
+__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < N)
+    {
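+        // The inner_reps loop repeats the same read-add-store so the kernel
+        // runs long enough to overlap with the memory copies.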
+        for (int i=0; i<inner_reps; ++i)
+        {
+            g_out[idx] = g_in[idx] + 1;
+        }
+    }
+}
+
+#define STREAM_COUNT 4
+
+// Uncomment to simulate data source/sink IO times
+//#define SIMULATE_IO
+
+int *h_data_source;
+int *h_data_sink;
+
+int *h_data_in[STREAM_COUNT];
+int *d_data_in[STREAM_COUNT];
+
+int *h_data_out[STREAM_COUNT];
+int *d_data_out[STREAM_COUNT];
+
+
+cudaEvent_t cycleDone[STREAM_COUNT];
+cudaStream_t stream[STREAM_COUNT];
+
+cudaEvent_t start, stop;
+
+int N = 1 << 22;
+int nreps = 10;                 // number of times each experiment is repeated
+int inner_reps = 5;
+
+int memsize;
+
+dim3 block(512);
+dim3 grid;
+
+int thread_blocks;
+
+float processWithStreams(int streams_used);
+void init();
+bool test();
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char *argv[])
+{
+    int cuda_device = 0;
+    float scale_factor;
+    cudaDeviceProp deviceProp;
+
+    test1();
+    printf("[%s] - Starting...\n", sSDKname);
+
+    // Use the default CUDA device (device 0)
+    cuda_device = 0;
+    checkCudaErrors(cudaSetDevice(cuda_device));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+    printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
+
+    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
+           deviceProp.name, deviceProp.multiProcessorCount,
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
+
+    // Scale down the workload on devices with fewer than 32 cores
+    scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
+    N = (int)((float)N / scale_factor);
+
+    printf("> Device name: %s\n", deviceProp.name);
+    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
+           deviceProp.major, deviceProp.minor,
+           deviceProp.multiProcessorCount);
+    printf("> scale_factor = %.2f\n", 1.0f/scale_factor);
+    printf("> array_size   = %d\n\n", N);
+
+    memsize = N * sizeof(int);
+
+    thread_blocks = N / block.x;
+
+    grid.x = thread_blocks % 65535;
+    grid.y = (thread_blocks / 65535 + 1);
+
+
+    // Allocate resources
+
+    h_data_source = (int *) malloc(memsize);
+    h_data_sink = (int *) malloc(memsize);
+
+    for (int i =0; i<STREAM_COUNT; ++i)
+    {
+
+        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize,
+                                      cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
+
+        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize,
+                                      cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
+
+        checkCudaErrors(cudaStreamCreate(&stream[i]));
+        checkCudaErrors(cudaEventCreate(&cycleDone[i]));
+
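+        // Record each event once up front so the first cudaEventSynchronize()
+        // on it in processWithStreams() completes immediately.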
+        cudaEventRecord(cycleDone[i], stream[i]);
+    }
+
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    init();
+
+    // Kernel warmup
+    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+
+
+    // Time copies and kernel
+    cudaEventRecord(start,0);
+    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
+                                    cudaMemcpyHostToDevice,0));
+    cudaEventRecord(stop,0);
+    cudaEventSynchronize(stop);
+
+    float memcpy_h2d_time;
+    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
+
+    cudaEventRecord(start,0);
+    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
+                                    cudaMemcpyDeviceToHost, 0));
+    cudaEventRecord(stop,0);
+    cudaEventSynchronize(stop);
+
+    float memcpy_d2h_time;
+    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
+
+    cudaEventRecord(start,0);
+    incKernel<<<grid, block,0,0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    cudaEventRecord(stop,0);
+    cudaEventSynchronize(stop);
+
+    float kernel_time;
+    cudaEventElapsedTime(&kernel_time, start, stop);
+
+    printf("\n");
+    printf("Relevant properties of this CUDA device\n");
+    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution (device property \"deviceOverlap\")\n", deviceProp.deviceOverlap ? "X" : " ");
+    //printf("(%s) Can execute several GPU kernels simultaneously (compute capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
+    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
+           "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro 4000/5000/6000/K5000)\n",
+           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1)
+           ? "X" : " ");
+
+    printf("\n");
+    printf("Measured timings (throughput):\n");
+    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
+           memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time);
+    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
+           memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);
+    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n",
+           kernel_time, (inner_reps *memsize * 2e-6)/ kernel_time);
+
+    printf("\n");
+    printf("Theoretical limits for speedup gained from overlapped data transfers:\n");
+    printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
+           memcpy_h2d_time + memcpy_d2h_time + kernel_time);
+    printf("Compute can overlap with one transfer: %f ms\n",
+           max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
+    printf("Compute can overlap with both data transfers: %f ms\n",
+           max(max(memcpy_h2d_time,memcpy_d2h_time), kernel_time));
+
+    // Process pipelined work
+    float serial_time = processWithStreams(1);
+    float overlap_time = processWithStreams(STREAM_COUNT);
+
+    printf("\nAverage measured timings over %d repetitions:\n", nreps);
+    printf(" Avg. time when execution fully serialized\t: %f ms\n",
+           serial_time / nreps);
+    printf(" Avg. time when overlapped using %d streams\t: %f ms\n",
+           STREAM_COUNT, overlap_time / nreps);
+    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
+           (serial_time - overlap_time) / nreps);
+
+    printf("\nMeasured throughput:\n");
+    printf(" Fully serialized execution\t\t: %f GB/s\n",
+           (nreps * (memsize * 2e-6))/ serial_time);
+    printf(" Overlapped using %d streams\t\t: %f GB/s\n",
+           STREAM_COUNT, (nreps * (memsize * 2e-6))/ overlap_time);
+
+    // Verify the results; they determine the final exit status
+    bool bResults = test();
+
+    // Free resources
+
+    free(h_data_source);
+    free(h_data_sink);
+
+    for (int i =0; i<STREAM_COUNT; ++i)
+    {
+
+        cudaFreeHost(h_data_in[i]);
+        cudaFree(d_data_in[i]);
+
+        cudaFreeHost(h_data_out[i]);
+        cudaFree(d_data_out[i]);
+
+        cudaStreamDestroy(stream[i]);
+        cudaEventDestroy(cycleDone[i]);
+    }
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    // Test result
+    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
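+// A minimal sketch of the event-timing idiom used in main() above, pulled out
+// into a hypothetical helper for illustration (it is not called by the sample):
+// record events around the work on stream 0, wait for the stop event, then read
+// the elapsed time in milliseconds.
+static float timeHostToDeviceCopy(void *dst, const void *src, size_t bytes)
+{
+    float ms = 0.0f;
+    cudaEvent_t t0, t1;
+    cudaEventCreate(&t0);
+    cudaEventCreate(&t1);
+
+    cudaEventRecord(t0, 0);                                       // enqueue start marker on stream 0
+    cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, 0);  // the work being timed
+    cudaEventRecord(t1, 0);                                       // enqueue stop marker on stream 0
+    cudaEventSynchronize(t1);                                     // wait until the copy and t1 have completed
+
+    cudaEventElapsedTime(&ms, t0, t1);                            // elapsed time in ms (~0.5 us resolution)
+    cudaEventDestroy(t0);
+    cudaEventDestroy(t1);
+    return ms;
+}
+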
+float processWithStreams(int streams_used)
+{
+
+    int current_stream = 0;
+
+    float time;
+
+    // Do processing in a loop
+    //
+    // Note: All memory commands are processed in the order they are issued,
+    // independent of the stream they are enqueued in. Hence the pattern by
+    // which the copy and kernel commands are enqueued in the stream
+    // has an influence on the achieved overlap.
+
+    cudaEventRecord(start, 0);
+
+    for (int i=0; i<nreps; ++i)
+    {
+        int next_stream = (current_stream + 1) % streams_used;
+
+#ifdef SIMULATE_IO
+        // Store the result
+        memcpy(h_data_sink, h_data_out[current_stream],memsize);
+
+        // Read new input
+        memcpy(h_data_in[next_stream], h_data_source, memsize);
+#endif
+
+        // Ensure that processing and copying of the last cycle have finished
+        cudaEventSynchronize(cycleDone[next_stream]);
+
+        // Process current frame
+        incKernel<<<grid, block, 0, stream[current_stream]>>>(
+            d_data_out[current_stream],
+            d_data_in[current_stream],
+            N,
+            inner_reps);
+
+        // Upload next frame
+        checkCudaErrors(cudaMemcpyAsync(
+                            d_data_in[next_stream],
+                            h_data_in[next_stream],
+                            memsize,
+                            cudaMemcpyHostToDevice,
+                            stream[next_stream]));
+
+        // Download current frame
+        checkCudaErrors(cudaMemcpyAsync(
+                            h_data_out[current_stream],
+                            d_data_out[current_stream],
+                            memsize,
+                            cudaMemcpyDeviceToHost,
+                            stream[current_stream]));
+
+        checkCudaErrors(cudaEventRecord(
+                            cycleDone[current_stream],
+                            stream[current_stream]));
+
+        current_stream = next_stream;
+    }
+
+    cudaEventRecord(stop, 0);
+
+    cudaDeviceSynchronize();
+
+    cudaEventElapsedTime(&time, start, stop);
+
+    return time;
+
+}
+
+void init()
+{
+    for (int i=0; i<N; ++i)
+    {
+        h_data_source[i] = 0;
+    }
+
+    for (int i =0; i<STREAM_COUNT; ++i)
+    {
+        memcpy(h_data_in[i], h_data_source, memsize);
+    }
+}
+
+
+bool test()
+{
+
+    bool passed = true;
+
+    for (int j =0; j<STREAM_COUNT; ++j)
+    {
+        for (int i =0; i<N; ++i)
+        {
+            passed &= (h_data_out[j][i] == 1);
+        }
+    }
+
+    return passed;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/multiply.cu	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,8 @@
+extern "C" {
+    __global__ void multiply(float* A, float* B, float* C) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        C[index] = A[index] * B[0];
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/test.c	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,5 @@
+#include <stdio.h>
+
+void test1() {
+    printf("test\n");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/twice.cu	Fri Feb 10 08:48:51 2017 +0900
@@ -0,0 +1,154 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <cuda.h>
+
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+
+#define LENGTH (10)
+#define THREAD (10)
+
+double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+void
+check_data(float* A, float B, float* C) {
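+    // Exact float comparison is acceptable here: A holds small integers and B
+    // is a small integer, so every product is exactly representable in single
+    // precision.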
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        if (A[i]*B!=C[i]) {
+            puts("multiply failure.");
+            return;
+        }
+    }
+    puts("success.");
+}
+
+void print_result(float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        printf("%f\n",C[i]);
+    }
+}
+
+int main(int argc, char* argv[]) {
+    int num_stream = 1; // number of streams
+    int num_exec = 16; // number of kernel executions
+
+    for (int i=1;argv[i];i++) {
+        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
+            num_stream = atoi(argv[++i]);
+        }
+    }
+
+    // initialize and load kernel
+    CUdevice device;
+    CUcontext context;
+    CUmodule module;
+    CUfunction function;
+    CUstream stream[num_stream];
+
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGet(&device, 0));
+    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
+    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamCreate(&stream[i],0));
+
+    // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devB[num_exec];
+    CUdeviceptr devOut[num_exec];
+
+    checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
+        checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
+    }
+
+    // input buffer
+    float* A = new float[LENGTH*THREAD];
+    float* B = new float[num_exec];
+
+    for (int i=0; i<LENGTH*THREAD; i++)
+        A[i] = (float)(i+1000);
+
+    // output buffer
+    float** result = new float* [num_exec];
+
+    for (int i=0;i<num_exec;i++)
+        result[i] = new float[LENGTH*THREAD];
+
+    // Synchronous data transfer (host to device)
+    checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
+    
+    // Asynchronous data transfer (host to device)
+    int cur = 0;
+
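+    // Distribute the copies (and, below, the kernel launches) round-robin over
+    // the num_stream streams.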
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        B[i] = (float)(i+1);
+        checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
+    }
+
+    cur = 0;
+
+    // Asynchronous launch kernel
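+    // Each launch uses LENGTH blocks of THREAD threads, i.e. one thread per
+    // element of the LENGTH*THREAD input array.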
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        void* args[] = {&devA, &devB[i], &devOut[i]};
+        checkCudaErrors(cuLaunchKernel(function,
+                                       LENGTH, 1, 1,
+                                       THREAD, 1, 1,
+                                       0, stream[cur], args, NULL));
+        //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+    }
+
+    cur = 0;
+
+    
+    // Asynchronous data transfer (device to host)
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
+    }
+    
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamSynchronize(stream[i]));
+    
+    //printf("%0.6f\n",getTime()-start);
+
+    for (int i=0;i<num_exec;i++)
+        check_data(A,(float)(i+1),result[i]);
+
+    // memory release
+    checkCudaErrors(cuMemFree(devA));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemFree(devB[i]));
+        checkCudaErrors(cuMemFree(devOut[i]));
+    }
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamDestroy(stream[i]));
+    checkCudaErrors(cuModuleUnload(module));
+    checkCudaErrors(cuCtxDestroy(context));
+
+    delete[] A;
+    delete[] B;
+    for (int i=0;i<num_exec;i++)
+        delete[] result[i];
+    delete[] result;
+
+    return 0;
+}
+//
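+// Note: cuModuleLoad("multiply.ptx") above expects a PTX image of multiply.cu
+// next to the executable. The CMake rules in this changeset only build object
+// files, so the PTX presumably has to be generated separately, for example
+// (assumed command line):
+//
+//     nvcc -ptx multiply.cu -o multiply.ptx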