Bolt  1.1
C++ template library with support for OpenCL
control.h
Go to the documentation of this file.
1 /***************************************************************************
2 * Copyright 2012 - 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 
16 ***************************************************************************/
17 
18 
23 #pragma once
24 #if !defined( BOLT_CL_CONTROL_H )
25 #define BOLT_CL_CONTROL_H
26 
27 
#include <bolt/cl/bolt.h>

#include <functional>
#include <map>
#include <string>
#include <vector>

#include <boost/thread/mutex.hpp>
#include <boost/shared_ptr.hpp>
34 
39 namespace bolt {
40  namespace cl {
41 
101  class control {
102  public:
103  enum e_UseHostMode {NoUseHost, UseHost};
104  enum e_RunMode {Automatic,
105  SerialCpu,
106  MultiCoreCpu,
107  OpenCL };
108 
109  enum e_AutoTuneMode{NoAutoTune=0x0,
110  AutoTuneDevice=0x1,
111  AutoTuneWorkShape=0x2,
112  AutoTuneAll=0x3}; // FIXME, experimental
113  struct debug {
114  static const unsigned None=0;
115  static const unsigned Compile = 0x1;
116  static const unsigned ShowCode = 0x2;
117  static const unsigned SaveCompilerTemps = 0x4;
118  static const unsigned DebugKernelRun = 0x8;
119  static const unsigned AutoTune = 0x10;
120  };
121 
122  enum e_WaitMode {BalancedWait, // Balance of Busy and Nice: tries to use Busy for short-running kernels. \todo: Balanced currently maps to nice.
123  NiceWait, // Use an OS semaphore to detect completion status.
124  BusyWait, // Busy a CPU core continuously monitoring results. Lowest-latency, but requires a dedicated core.
125  ClFinish, // Call clFinish on the queue.
126  };
127 
128  public:
129 
130  // Construct a new control structure, copying from default control for arguments that are not overridden.
131  control(
132  const ::cl::CommandQueue& commandQueue = getDefault().getCommandQueue(),
133  e_UseHostMode useHost=getDefault().getUseHost(),
134  unsigned debug=getDefault().getDebugMode()
135  ) :
136  m_commandQueue(commandQueue),
137  m_useHost(useHost),
138  m_forceRunMode(OpenCL), //Replaced this with automatic because the default is not MultiCoreCPU if no GPU is found
139  m_defaultRunMode(OpenCL),
140  m_debug(debug),
141  m_autoTune(getDefault().m_autoTune),
142  m_wgPerComputeUnit(getDefault().m_wgPerComputeUnit),
143  m_compileOptions(getDefault().m_compileOptions),
144  m_compileForAllDevices(getDefault().m_compileForAllDevices),
145  m_waitMode(getDefault().m_waitMode),
146  m_unroll(getDefault().m_unroll)
147  {};
148 
149 
150  control( const control& ref) :
151  m_commandQueue(ref.m_commandQueue),
152  m_useHost(ref.m_useHost),
153  m_forceRunMode(ref.m_forceRunMode),
154  m_defaultRunMode(ref.m_defaultRunMode),
155  m_debug(ref.m_debug),
156  m_autoTune(ref.m_autoTune),
157  m_wgPerComputeUnit(ref.m_wgPerComputeUnit),
158  m_compileOptions(ref.m_compileOptions),
159  m_compileForAllDevices(ref.m_compileForAllDevices),
160  m_waitMode(ref.m_waitMode),
161  m_unroll(ref.m_unroll)
162  {
163  //printf("control::copy construcor\n");
164  };
165 
166  //setters:
171  void setCommandQueue(::cl::CommandQueue commandQueue) { m_commandQueue = commandQueue; };
172 
176  void setUseHost(e_UseHostMode useHost) { m_useHost = useHost; };
177 
178 
185  void setForceRunMode(e_RunMode forceRunMode) { m_forceRunMode = forceRunMode; };
186 
197  void setDebugMode(unsigned debug) { m_debug = debug; };
198 
203  void setWGPerComputeUnit(int wgPerComputeUnit) { m_wgPerComputeUnit = wgPerComputeUnit; };
204 
206  void setWaitMode(e_WaitMode waitMode) { m_waitMode = waitMode; };
207 
209  void setUnroll(int unroll) { m_unroll = unroll; };
210 
213  void setCompileOptions(std::string &compileOptions) { m_compileOptions = compileOptions; };
214 
215  // getters:
216  ::cl::CommandQueue& getCommandQueue( ) { return m_commandQueue; };
217  const ::cl::CommandQueue& getCommandQueue( ) const { return m_commandQueue; };
218  ::cl::Context getContext() const { return m_commandQueue.getInfo<CL_QUEUE_CONTEXT>();};
219  ::cl::Device getDevice() const { return m_commandQueue.getInfo<CL_QUEUE_DEVICE>();};
220  e_UseHostMode getUseHost() const { return m_useHost; };
221  e_RunMode getForceRunMode() const { return m_forceRunMode; };
222  e_RunMode getDefaultPathToRun() const { return m_defaultRunMode; };
223  unsigned getDebugMode() const { return m_debug;};
224  int const getWGPerComputeUnit() const { return m_wgPerComputeUnit; };
225  const ::std::string getCompileOptions() const { return m_compileOptions; };
226  e_WaitMode getWaitMode() const { return m_waitMode; };
227  int getUnroll() const { return m_unroll; };
228  bool getCompileForAllDevices() const { return m_compileForAllDevices; };
229 
245  static control &getDefault()
246  {
247  // Default control structure; this can be accessed by the bolt::cl::control::getDefault()
248  static control _defaultControl( true );
249  return _defaultControl;
250  };
251 
252  static void printPlatforms( bool printDevices = true, cl_device_type deviceType = CL_DEVICE_TYPE_ALL );
253  static void printPlatformsRange( std::vector< ::cl::Platform >::iterator begin, std::vector< ::cl::Platform >::iterator end,
254  bool printDevices = true, cl_device_type deviceType = CL_DEVICE_TYPE_ALL );
255 
262  static ::cl::CommandQueue getDefaultCommandQueue( );
263 
266  typedef boost::shared_ptr< ::cl::Buffer > buffPointer;
267 
269  size_t totalBufferSize( );
271  buffPointer acquireBuffer( size_t reqSize, cl_mem_flags flags = CL_MEM_READ_WRITE, const void* host_ptr = NULL );
273  void freeBuffers( );
274 
275  private:
276 
277  // This is the private constructor is only used to create the initial default control structure.
278  control(bool createGlobal) :
279  m_commandQueue( getDefaultCommandQueue( ) ),
280  m_useHost(UseHost),
281  m_debug(debug::None),
282  m_autoTune(AutoTuneAll),
283  m_wgPerComputeUnit(8),
284  m_compileForAllDevices(true),
285  m_waitMode(BusyWait),
286  m_unroll(1)
287  {
288  ::cl_device_type dType = CL_DEVICE_TYPE_CPU;
289  if(m_commandQueue() != NULL)
290  {
291  ::cl::Device device = m_commandQueue.getInfo<CL_QUEUE_DEVICE>();
292  dType = device.getInfo<CL_DEVICE_TYPE>();
293  }
294  if(dType == CL_DEVICE_TYPE_CPU || m_commandQueue() == NULL)
295  {
296  //m_commandQueue will be NULL if no platforms are found and
297  //if a non AMD paltform is found but cound not enumerate any CPU device
298 #ifdef ENABLE_TBB
299  m_forceRunMode = MultiCoreCpu;
300  m_defaultRunMode = MultiCoreCpu;
301 #else
302  m_forceRunMode = SerialCpu;
303  m_defaultRunMode = SerialCpu;
304 #endif
305  }
306  else
307  {
308  //If dType = CL_DEVICE_TYPE_GPU
309  m_forceRunMode = OpenCL;
310  m_defaultRunMode = OpenCL;
311  }
312  };
313 
314  ::cl::CommandQueue m_commandQueue;
315  e_UseHostMode m_useHost;
316  e_RunMode m_forceRunMode;
317  e_RunMode m_defaultRunMode;
318  e_AutoTuneMode m_autoTune; /* auto-tune the choice of device CPU/GPU and workgroup shape */
319  unsigned m_debug;
320  int m_wgPerComputeUnit;
321  ::std::string m_compileOptions; // extra options to pass to OpenCL compiler.
322  bool m_compileForAllDevices; // compile for all devices in the context. False means to only compile for specified device.
323  e_WaitMode m_waitMode;
324  int m_unroll;
325 
326  struct descBufferKey
327  {
328  ::cl::Context buffContext;
329  cl_mem_flags memFlags;
330  const void* host_ptr;
331  };
332 
333  struct descBufferValue
334  {
335  size_t buffSize;
336  bool inUse;
337  ::cl::Buffer buffBuff;
338  };
339 
340  struct descBufferComp
341  {
342  bool operator( )( const descBufferKey& lhs, const descBufferKey& rhs ) const
343  {
344  if( lhs.memFlags < rhs.memFlags )
345  {
346  return true;
347  }
348  else if( lhs.memFlags == rhs.memFlags )
349  {
350  if( lhs.buffContext( ) < rhs.buffContext( ) )
351  {
352  return true;
353  }
354  else if( lhs.buffContext( ) == rhs.buffContext( ) )
355  {
356  if( lhs.host_ptr < rhs.host_ptr )
357  {
358  return true;
359  }
360  else
361  {
362  return false;
363  }
364  }
365  else
366  {
367  return false;
368  }
369  }
370  else
371  {
372  return false;
373  }
374  }
375  };
376 
377  typedef std::multimap< descBufferKey, descBufferValue, descBufferComp > mapBufferType;
378 
387  class UnlockBuffer
388  {
389  mapBufferType::iterator m_iter;
390  control& m_control;
391 
392  public:
393  // Basic constructor requires a reference to the container and a positional element
394  UnlockBuffer( control& p_control, mapBufferType::iterator it ): m_iter( it ), m_control( p_control )
395  {}
396 
397  void operator( )( const void* pBuff )
398  {
399  // TODO: I think a general mutex is overkill here; we should try to use an interlocked instruction to modify the
400  // inUse flag
401  boost::lock_guard< boost::mutex > lock( m_control.mapGuard );
402  m_iter->second.inUse = false;
403  }
404  };
405 
406  friend class UnlockBuffer;
407  mapBufferType mapBuffer;
408  boost::mutex mapGuard;
409 
410  }; // end class control
411 
412  };
413 };
414 
415 
// Implementor note:
// When adding a new field to this structure, don't forget to:
// * Add the new field, e.g. "int m_foo;".
// * Add a setter function and a getter function, e.g. "void setFoo(int foo)" and "int getFoo() const { return m_foo; }".
// * Add the field to the private constructor. This is used to set the global default "_defaultControl".
// * Add the field to the public constructor, copying from the _defaultControl.
// * Add the field to the copy constructor.
422 
// Sample usage:
// bolt::cl::control c(myCmdQueue);
// c.setDebugMode(bolt::cl::control::debug::ShowCode);
// bolt::cl::reduce(c, a.begin(), a.end(), std::plus<int>());
//
// A temporary control can also be passed directly as the first argument:
// bolt::cl::reduce(bolt::cl::control(myCmdQueue), ...);
430 
431 #endif