Bolt  1.1
C++ template library with support for OpenCL
Classes | Typedefs | Enumerations | Enumerator | Functions | Variables | Friends
CL-control

Classes

class  bolt::cl::control
 
struct  bolt::cl::control::debug
 

Typedefs

typedef boost::shared_ptr
< ::cl::Buffer > 
bolt::cl::control::buffPointer
 Buffer pool support functions.
 

Enumerations

enum  e_UseHostMode { NoUseHost, UseHost }
 
enum  e_RunMode { Automatic, SerialCpu, MultiCoreCpu, OpenCL }
 
enum  e_AutoTuneMode { NoAutoTune =0x0, AutoTuneDevice =0x1, AutoTuneWorkShape =0x2, AutoTuneAll =0x3 }
 
enum  e_WaitMode { BalancedWait, NiceWait, BusyWait, ClFinish }
 

Functions

 bolt::cl::control::control (const ::cl::CommandQueue &commandQueue=getDefault().getCommandQueue(), e_UseHostMode useHost=getDefault().getUseHost(), unsigned debug=getDefault().getDebugMode())
 
 bolt::cl::control::control (const control &ref)
 
void bolt::cl::control::setCommandQueue (::cl::CommandQueue commandQueue)
 
void bolt::cl::control::setUseHost (e_UseHostMode useHost)
 
void bolt::cl::control::setForceRunMode (e_RunMode forceRunMode)
 
void bolt::cl::control::setDebugMode (unsigned debug)
 
void bolt::cl::control::setWGPerComputeUnit (int wgPerComputeUnit)
 
void bolt::cl::control::setWaitMode (e_WaitMode waitMode)
 
void bolt::cl::control::setUnroll (int unroll)
 
void bolt::cl::control::setCompileOptions (std::string &compileOptions)
 
::cl::CommandQueue & bolt::cl::control::getCommandQueue ()
 
const ::cl::CommandQueue & bolt::cl::control::getCommandQueue () const
 
::cl::Context bolt::cl::control::getContext () const
 
::cl::Device bolt::cl::control::getDevice () const
 
e_UseHostMode bolt::cl::control::getUseHost () const
 
e_RunMode bolt::cl::control::getForceRunMode () const
 
e_RunMode bolt::cl::control::getDefaultPathToRun () const
 
unsigned bolt::cl::control::getDebugMode () const
 
int const bolt::cl::control::getWGPerComputeUnit () const
 
const ::std::string bolt::cl::control::getCompileOptions () const
 
e_WaitMode bolt::cl::control::getWaitMode () const
 
int bolt::cl::control::getUnroll () const
 
bool bolt::cl::control::getCompileForAllDevices () const
 
static control & bolt::cl::control::getDefault ()
 
static void bolt::cl::control::printPlatforms (bool printDevices=true, cl_device_type deviceType=CL_DEVICE_TYPE_ALL)
 
static void bolt::cl::control::printPlatformsRange (std::vector< ::cl::Platform >::iterator begin, std::vector< ::cl::Platform >::iterator end, bool printDevices=true, cl_device_type deviceType=CL_DEVICE_TYPE_ALL)
 
::cl::CommandQueue bolt::cl::control::getDefaultCommandQueue ()
 Convenience method to help users create and initialize an OpenCL CommandQueue.
 
size_t bolt::cl::control::totalBufferSize ()
 
buffPointer bolt::cl::control::acquireBuffer (size_t reqSize, cl_mem_flags flags=CL_MEM_READ_WRITE, const void *host_ptr=NULL)
 
void bolt::cl::control::freeBuffers ()
 
bool bolt::cl::control::descBufferComp::operator() (const descBufferKey &lhs, const descBufferKey &rhs) const
 
 bolt::cl::control::UnlockBuffer::UnlockBuffer (control &p_control, mapBufferType::iterator it)
 
void bolt::cl::control::UnlockBuffer::operator() (const void *pBuff)
 

Variables

static const unsigned bolt::cl::control::debug::None =0
 
static const unsigned bolt::cl::control::debug::Compile = 0x1
 
static const unsigned bolt::cl::control::debug::ShowCode = 0x2
 
static const unsigned bolt::cl::control::debug::SaveCompilerTemps = 0x4
 
static const unsigned bolt::cl::control::debug::DebugKernelRun = 0x8
 
static const unsigned bolt::cl::control::debug::AutoTune = 0x10
 
::cl::Context bolt::cl::control::descBufferKey::buffContext
 
cl_mem_flags bolt::cl::control::descBufferKey::memFlags
 
const void * bolt::cl::control::descBufferKey::host_ptr
 
size_t bolt::cl::control::descBufferValue::buffSize
 
bool bolt::cl::control::descBufferValue::inUse
 
::cl::Buffer bolt::cl::control::descBufferValue::buffBuff
 

Friends

class bolt::cl::control::UnlockBuffer
 

Detailed Description

Function Documentation

buffPointer bolt::cl::control::acquireBuffer ( size_t  reqSize,
cl_mem_flags  flags = CL_MEM_READ_WRITE,
const void *  host_ptr = NULL 
)

Return a pointer to memory from the pre-allocated memory pool.

void bolt::cl::control::freeBuffers ( )

Freeing memory

static control& bolt::cl::control::getDefault ( )
inlinestatic

Return the default control structure. This is used for Bolt API calls when the user does not explicitly specify a control structure. Also, newly created control structures copy the default structure for their initial values. Note that changes to the default control structure are not automatically copied to already-created control structures. Typically, the default control structure is modified as part of the application initialization; then, as other control structures are created, they pick up the modified defaults. Some examples:

bolt::cl::control myControl = bolt::cl::control::getDefault(); // copy existing default control.
bolt::cl::control myControl; // same as last line - the constructor also copies values from the default control
// Modify a setting in the default \p control
bolt::cl::control::getDefault().compileOptions("-g");
::cl::CommandQueue bolt::cl::control::getDefaultCommandQueue ( )
static

Convenience method to help users create and initialize an OpenCL CommandQueue.

Todo:
The default commandqueue is created with a context that contains all GPU devices in the platform. Since kernels are only compiled on first invocation, switching between GPU devices is OK, but switching to a CPU device afterwards causes an exception because the kernel was not compiled for CPU. Should we provide more options and expose more interfaces to the user?
void bolt::cl::control::setCommandQueue ( ::cl::CommandQueue  commandQueue)
inline

Set the OpenCL command queue (and associated device) for Bolt algorithms to use. Only one command-queue can be specified for each call; Bolt does not load-balance across multiple command queues. Bolt also uses the specified command queue to determine the OpenCL context and device.

void bolt::cl::control::setCompileOptions ( std::string &  compileOptions)
inline

Specify the compile options passed to the OpenCL(TM) compiler.

void bolt::cl::control::setDebugMode ( unsigned  debug)
inline

Enable debug messages to be printed to stdout as the algorithm is compiled, run, and tuned. See the debug struct for a list of values. Multiple debug options can be combined with the + sign, as in the following example. Use this technique rather than separate calls to the setDebugMode() API; each call resets the debug level, rather than merging with the existing debug setting.

// Show example of combining two debug options with the '+' sign.
myControl.setDebugMode(bolt::cl::control::debug::Compile + bolt::cl::control::debug::SaveCompilerTemps);
void bolt::cl::control::setForceRunMode ( e_RunMode  forceRunMode)
inline

Force the Bolt command to run on the specified device. Default is "Automatic," in which case the Bolt runtime selects the device. Forcing the mode to SerialCpu can be useful for debugging the algorithm. Forcing the mode can also be useful for performance comparisons, or for direct control over the run location (perhaps due to knowledge that the algorithm is best-suited for GPU). Please note that forcing the run modes will not change the OpenCL device in the control object. This API is designed to simplify the process of choosing the appropriate path in the Bolt API.

void bolt::cl::control::setUnroll ( int  unroll)
inline

unroll assignment

void bolt::cl::control::setUseHost ( e_UseHostMode  useHost)
inline

If enabled, Bolt can use the host CPU to run parts of the algorithm. If disabled, Bolt runs the entire algorithm using the device specified by the command-queue. This can be appropriate on a discrete GPU, where the input data is located on the device memory.

void bolt::cl::control::setWaitMode ( e_WaitMode  waitMode)
inline

Set the method used to detect completion at the end of a Bolt routine.

void bolt::cl::control::setWGPerComputeUnit ( int  wgPerComputeUnit)
inline

Set the work-groups-per-compute unit that will be used for reduction-style operations (reduce, transform_reduce). Higher numbers can hide latency by improving the occupancy but will increase the amount of data that has to be reduced in the final, less efficient step. Experimentation may be required to find the optimal point for a given algorithm and device; typically 8-12 will deliver good results.

size_t bolt::cl::control::totalBufferSize ( )

Return device memory size