Bolt 1.1
C++ template library with support for OpenCL
device_vector.h
1 /***************************************************************************
2 * Copyright 2012 - 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 
16 ***************************************************************************/
17 
18 
19 
20 
21 #pragma once
22 #if !defined( BOLT_CL_DEVICE_VECTOR_H )
23 #define BOLT_CL_DEVICE_VECTOR_H
24 
25 #include <iterator>
26 #include <type_traits>
27 #include <numeric>
28 #include "bolt/cl/bolt.h"
29 #include "bolt/cl/control.h"
30 #include <iostream>
31 #include <boost/iterator/iterator_facade.hpp>
32 #include <boost/iterator/reverse_iterator.hpp>
33 #include <boost/shared_array.hpp>
34 
42 namespace bolt
43 {
47 namespace cl
48 {
58  struct device_vector_tag
59  : public std::random_access_iterator_tag
60  { // identifying tag for random-access iterators
61  };
62 
71  template< typename T >
72  class device_vector
73  {
77  template< typename Container >
78  class UnMapBufferFunctor
79  {
80  Container& m_Container;
81 
82  public:
83  // Basic constructor requires a reference to the container
84  UnMapBufferFunctor( Container& rhs ): m_Container( rhs )
85  {}
86 
87  void operator( )( const void* pBuff )
88  {
89  ::cl::Event unmapEvent;
90 
91  V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, const_cast< void* >( pBuff ), NULL, &unmapEvent ),
92  "shared_ptr failed to unmap host memory back to device memory" );
93  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
94  }
95  };
96 
97  typedef T* naked_pointer;
98  typedef const T* const_naked_pointer;
99 
100  public:
101 
102  // Useful typedefs specific to this container
103  typedef T value_type;
104  typedef ptrdiff_t difference_type;
105  typedef difference_type distance_type;
106  typedef size_t size_type;
107 
108  typedef boost::shared_array< value_type > pointer;
109  typedef boost::shared_array< const value_type > const_pointer;
110 
118  template< typename Container >
119  class reference_base
120  {
121  public:
122  reference_base(Container &rhs, size_type index ): m_Container( rhs ), m_Index( index )
123  {}
124 
125 
126 
127  // Automatic type conversion operator to turn the reference object into a value_type
128  operator value_type( ) const
129  {
130  cl_int l_Error = CL_SUCCESS;
131  naked_pointer result = reinterpret_cast< naked_pointer >( m_Container.m_commQueue.enqueueMapBuffer(
132  m_Container.m_devMemory, true, CL_MAP_READ, m_Index * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
133  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );
134 
135  value_type valTmp = *result;
136 
137  ::cl::Event unmapEvent;
138  V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, result, NULL, &unmapEvent ), "device_vector failed to unmap host memory back to device memory" );
139  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
140 
141  return valTmp;
142  }
143 
144  reference_base< Container >& operator=( const value_type& rhs )
145  {
146  cl_int l_Error = CL_SUCCESS;
147  naked_pointer result = reinterpret_cast< naked_pointer >( m_Container.m_commQueue.enqueueMapBuffer(
148  m_Container.m_devMemory, true, CL_MAP_WRITE_INVALIDATE_REGION, m_Index * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
149  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );
150 
151  *result = rhs;
152 
153  ::cl::Event unmapEvent;
154  V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, result, NULL, &unmapEvent ), "device_vector failed to unmap host memory back to device memory" );
155  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
156 
157  return *this;
158  }
159 
162  Container& getContainer( ) const
163  {
164  return m_Container;
165  }
166 
167  size_type getIndex() const
168  {
169  return m_Index;
170  }
171 
172  private:
173  Container& m_Container;
174  size_type m_Index;
175  };
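  // Example ( editor's sketch, not part of the original source ): reference_base is the proxy
  // returned by device_vector::operator[]. Reads map the element with CL_MAP_READ; writes map it
  // with CL_MAP_WRITE_INVALIDATE_REGION. Each access costs one synchronous map/unmap round trip:
  //
  //   bolt::cl::device_vector< int > dv( 4 );  // hypothetical vector of 4 ints
  //   dv[ 0 ] = 42;                            // reference::operator=( ): map, write, unmap
  //   int x = dv[ 0 ];                         // reference::operator value_type( ): map, read, unmap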
176 
180  typedef reference_base< device_vector< value_type > > reference;
186  typedef const value_type const_reference;
187 
188  // Handy for the reference class to get at the wrapped ::cl objects
189  //friend class reference;
190 
201  template< typename Container >
202  class iterator_base: public boost::iterator_facade< iterator_base< Container >, value_type, device_vector_tag,
203  typename device_vector::reference, int >
204  {
205  public:
206  typedef typename boost::iterator_facade< iterator_base< Container >, value_type, device_vector_tag,
207  typename device_vector::reference, int >::difference_type difference_type;
208 
209 
210  //typedef iterator_facade::difference_type difference_type;
211 
212  // This struct represents the iterator data transferred to the OpenCL device. Transferring pointers is tricky;
213  // the only reason we allocate space for a pointer in this payload is that the OpenCL clSetKernelArg() checks the
214  // size ( bytes ) of the argument passed in, and the corresponding GPU iterator has a pointer member.
215  // The value of the pointer is not relevant on the host side; it is initialized on the device side with the init method.
216  // The size of the payload needs to be able to encapsulate both 32-bit and 64-bit devices:
217  // sizeof( 32bit device payload ) = 32bit index & 32bit pointer = 8 bytes
218  // sizeof( 64bit device payload ) = 32bit index & 64bit aligned pointer = 16 bytes
219  struct Payload
220  {
221  difference_type m_Index;
222  difference_type m_Ptr1[ 3 ]; // Represents device pointer, big enough for 32 or 64bit
223  };
224 
225 
226  // Default constructor: creates a degenerate iterator that must be assigned before use
227  iterator_base( ): m_Container( getContainer() ), m_Index( 0 )
228  {}
229 
230  // Basic constructor requires a reference to the container and a positional element
231  iterator_base( Container& rhs, difference_type index ): m_Container( rhs ), m_Index( index )
232  {}
233 
234  // This copy constructor allows an iterator to convert into a const_iterator, but not vice versa
235  template< typename OtherContainer >
236  iterator_base( const iterator_base< OtherContainer >& rhs ): m_Container( rhs.m_Container ), m_Index( rhs.m_Index )
237  {}
238 
239  // Assignment operator; both iterators must be over the same container type
240  //template< typename Container >
241  iterator_base< Container >& operator = ( const iterator_base< Container >& rhs )
242  {
243  m_Container = rhs.m_Container;
244  m_Index = rhs.m_Index;
245  return *this;
246  }
247 
248  iterator_base< Container > & operator+= ( const difference_type & n )
249  {
250  advance( n );
251  return *this;
252  }
253 
254  iterator_base< Container >& operator = ( const difference_type & n )
255  {
256  advance( n );
257  return *this;
258  }
259 
260 
261 
262  const iterator_base< Container > operator + ( const difference_type & n ) const
263  {
264  iterator_base< Container > result(*this);
265  result.advance(n);
266  return result;
267  }
268 
269  Container& getContainer( ) const
270  {
271  return m_Container;
272  }
273 
274  // This method initializes the payload of the iterator for the cl device; the content of the pointer is 0, as it has no
275  // relevance on the host
276  const Payload gpuPayload( ) const
277  {
278  Payload payload = { m_Index, { 0, 0, 0 } };
279  return payload;
280  }
281 
282  // Calculates the size of the payload for the cl device. The bitness of the device is independent of the host and must be
283  // queried. The bitness of the device determines the size of the pointer contained in the payload. 64-bit pointers must
284  // be 8-byte aligned, so the payload is padded with an extra 4 bytes on 64-bit devices.
285  const difference_type gpuPayloadSize( ) const
286  {
287  cl_int l_Error = CL_SUCCESS;
288  ::cl::Device which_device;
289  l_Error = m_Container.m_commQueue.getInfo(CL_QUEUE_DEVICE,&which_device );
290 
291  cl_uint deviceBits = which_device.getInfo< CL_DEVICE_ADDRESS_BITS >( );
292 
293  // Size of index and pointer
294  difference_type payloadSize = sizeof( difference_type ) + ( deviceBits >> 3 );
295 
296  // 64bit devices need to add padding for 8 byte aligned pointer
297  if( deviceBits == 64 )
298  payloadSize += 4;
299 
300  return payloadSize;
301 
302  }
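  // Worked example ( editor's note ): per the formula above, a device reporting
  // CL_DEVICE_ADDRESS_BITS == 32 yields sizeof( difference_type ) + 4 bytes, while a
  // 64-bit device yields sizeof( difference_type ) + 8 + 4 bytes of padding, keeping
  // the device-side pointer member 8-byte aligned as described in the Payload comment.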
303 
304  difference_type m_Index;
305  difference_type distance_to( const iterator_base< Container >& rhs ) const
306  {
307  return static_cast< difference_type >( rhs.m_Index - m_Index );
308  }
309  private:
310 
311 
312  // Payload payload;
313  // Implementation detail of boost.iterator
314  friend class boost::iterator_core_access;
315 
316  // Handy for the device_vector erase methods
317  friend class device_vector< value_type >;
318 
319  // Used for templatized copy constructor and the templatized equal operator
320  template < typename > friend class iterator_base;
321 
322  void advance( difference_type n )
323  {
324  m_Index += n;
325  }
326 
327  void increment( )
328  {
329  advance( 1 );
330  }
331 
332  void decrement( )
333  {
334  advance( -1 );
335  }
336 
337  template< typename OtherContainer >
338  bool equal( const iterator_base< OtherContainer >& rhs ) const
339  {
340  bool sameIndex = rhs.m_Index == m_Index;
341  bool sameContainer = (&m_Container == &rhs.m_Container );
342 
343  return ( sameIndex && sameContainer );
344  }
345 
346  reference dereference( ) const
347  {
348  return m_Container[ m_Index ];
349  }
350 
351  Container& m_Container;
352  };
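  // Example ( editor's sketch, not part of the original source ): the iterators are
  // random access, so they support the usual arithmetic; host-side dereferences go
  // through the reference proxy, one map/unmap per element:
  //
  //   bolt::cl::device_vector< float > dv( 8, 1.0f );  // hypothetical
  //   bolt::cl::device_vector< float >::iterator it = dv.begin( );
  //   float third = *( it + 2 );                       // maps element 2, reads, unmaps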
353 
363  template< typename Container >
364  class reverse_iterator_base: public boost::iterator_facade< reverse_iterator_base< Container >, value_type, std::random_access_iterator_tag, typename device_vector::reference, int >
365  {
366  public:
367 
368  // Basic constructor requires a reference to the container and a positional element
369  reverse_iterator_base( Container& lhs, size_type index ): m_Container( lhs ), m_Index( index-1 )
370  {}
371 
372  // This copy constructor allows an iterator to convert into a const_iterator, but not vice versa
373  template< typename OtherContainer >
374  reverse_iterator_base( const reverse_iterator_base< OtherContainer >& lhs ): m_Container( lhs.m_Container ), m_Index( lhs.m_Index ) // note: lhs.m_Index is already adjusted; do not subtract again
375  {}
376 
377  // Assignment operator; both reverse iterators must be over the same container type
378  //template< typename Container >
379  reverse_iterator_base< Container >& operator= ( const reverse_iterator_base< Container >& lhs )
380  {
381  m_Container = lhs.m_Container;
382  m_Index = lhs.m_Index;
383  return *this;
384  }
385 
386  reverse_iterator_base< Container >& operator+= ( const difference_type & n )
387  {
388  advance( -n );
389  return *this;
390  }
391 
392  const reverse_iterator_base< Container > operator+ ( const difference_type & n ) const
393  {
394  reverse_iterator_base< Container > result( *this );
395  result.advance(-n);
396  return result;
397  }
398 #if !defined(_WIN32) && defined(__x86_64__)
399  const reverse_iterator_base< Container > operator+ ( const int & n ) const
400  {
401  reverse_iterator_base< Container > result( *this );
402  result.advance(-n);
403  return result;
404  }
405 #endif
406 
407 
408 
409  int getIndex() const
410  {
411  return m_Index;
412  }
413 
414  //iterator_base<Container> base()
415  //{
416  // iterator_base<Container>(m_Container,m_Index-1);
417  //}
418 
419  difference_type distance_to( const reverse_iterator_base< Container >& lhs ) const
420  {
421  return static_cast< difference_type >( m_Index - lhs.m_Index );
422  }
423 
424  private:
425  // Implementation detail of boost.iterator
426  friend class boost::iterator_core_access;
427 
428  // Handy for the device_vector erase methods
429  friend class device_vector< value_type >;
430 
431  // Used for templatized copy constructor and the templatized equal operator
432  template < typename > friend class reverse_iterator_base;
433 
434  void advance( difference_type n )
435  {
436  m_Index += n;
437  }
438 
439  void increment( )
440  {
441  advance( -1 );
442  }
443 
444  void decrement( )
445  {
446  advance( 1 );
447  }
448 
449 
450  template< typename OtherContainer >
451  bool equal( const reverse_iterator_base< OtherContainer >& lhs ) const
452  {
453  bool sameIndex = lhs.m_Index == m_Index;
454  bool sameContainer = (&m_Container == &lhs.m_Container );
455 
456  return ( sameIndex && sameContainer );
457  }
458 
459  reference dereference( ) const
460  {
461  return m_Container[ m_Index ];
462  }
463 
464  Container& m_Container;
465  size_type m_Index;
466  };
467 
471  typedef iterator_base< device_vector< value_type > > iterator;
475  typedef iterator_base< const device_vector< value_type > > const_iterator;
479  typedef reverse_iterator_base< device_vector< value_type > > reverse_iterator;
483  typedef reverse_iterator_base< const device_vector< value_type > > const_reverse_iterator;
484 
490  device_vector( /* cl_mem_flags flags = CL_MEM_READ_WRITE,*/ const control& ctl = control::getDefault( ) ): m_Size( 0 ), m_commQueue( ctl.getCommandQueue( ) ), m_Flags( CL_MEM_READ_WRITE )
491  {
492  static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );
493  m_devMemory = NULL;
494  }
495 
505  device_vector( size_type newSize, const value_type& value = value_type( ), cl_mem_flags flags = CL_MEM_READ_WRITE,
506  bool init = true, const control& ctl = control::getDefault( ) ): m_Size( newSize ), m_commQueue( ctl.getCommandQueue( ) ), m_Flags( flags )
507  {
508  static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );
509 
510  // We want to use the context from the passed-in command queue to initialize our buffer
511  cl_int l_Error = CL_SUCCESS;
512  ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
513  V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );
514 
515  if( m_Size > 0 )
516  {
517  m_devMemory = ::cl::Buffer( l_Context, m_Flags, m_Size * sizeof( value_type ) );
518 
519  if( init )
520  {
521  std::vector< ::cl::Event > fillEvent( 1 );
522 
523  //
524  // Note: If the size of value_type is not a power of two, we fill serially. Another approach is to
525  // launch a templatized fill kernel, but it leads to complications.
526 
527  try
528  {
529  size_t sizeDS = sizeof(value_type);
530 
531  if( !( sizeDS & (sizeDS - 1 ) ) ) // 2^n data types
532  {
533  V_OPENCL( m_commQueue.enqueueFillBuffer< value_type >( m_devMemory, value, 0,
534  newSize * sizeof( value_type ), NULL, &fillEvent.front( ) ),
535  "device_vector failed to fill the internal buffer with the requested pattern");
536  }
537  else // non 2^n data types
538  {
539  // Map the buffer to host
540  ::cl::Event fill_mapEvent;
541  value_type *host_buffer = ( value_type* )ctl.getCommandQueue( ).enqueueMapBuffer (
542  m_devMemory,
543  false,
544  CL_MAP_READ | CL_MAP_WRITE,
545  0,
546  sizeof( value_type )*newSize,
547  NULL,
548  &fill_mapEvent,
549  &l_Error );
550 
551  V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
552  bolt::cl::wait( ctl, fill_mapEvent );
553 
554  // Use serial fill_n to fill the device_vector with value
555 #if defined(_WIN32)
556  std::fill_n( stdext::make_checked_array_iterator( host_buffer, newSize ),
557  newSize,
558  value );
559 #else
560  std::fill_n( host_buffer,
561  newSize,
562  value );
563 #endif
564 
565 
566  // Unmap the buffer
567  l_Error = ctl.getCommandQueue( ).enqueueUnmapMemObject( m_devMemory,
568  host_buffer,
569  NULL,
570  &fillEvent.front( ) );
571  V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
572 
573  }
574  }
575  catch( std::exception& e )
576  {
577  std::cout << "device_vector enqueueFillBuffer error condition reported:" << std::endl << e.what() << std::endl;
578  //return 1;
579  }
580 
581  try
582  {
583  // Not allowed to return until the fill operation is finished
584  V_OPENCL( m_commQueue.enqueueWaitForEvents( fillEvent ), "device_vector failed to wait for an event" );
585  }
586  catch( std::exception& e )
587  {
588  std::cout << "device_vector enqueueFillBuffer enqueueWaitForEvents error condition reported:" << std::endl << e.what() << std::endl;
589  //return 1;
590  }
591  }
592  }
593  else
594  {
595  m_devMemory=NULL;
596  }
597  }
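  // Example ( editor's sketch, not part of the original source ): size-and-value
  // construction; power-of-two sized types are filled with enqueueFillBuffer, other
  // types are filled serially through a mapped host pointer as shown above:
  //
  //   bolt::cl::device_vector< cl_int > dv( 1024, 7 );  // 1024 ints, each set to 7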
598 
607  template< typename InputIterator >
608  device_vector( const InputIterator begin, size_type newSize, cl_mem_flags flags = CL_MEM_READ_WRITE,
609  bool init = true, const control& ctl = control::getDefault( ),
610  typename std::enable_if< !std::is_integral< InputIterator >::value >::type* = 0 ): m_Size( newSize ),
611  m_commQueue( ctl.getCommandQueue( ) ), m_Flags( flags )
612  {
613  static_assert( std::is_convertible< value_type, typename std::iterator_traits< InputIterator >::value_type >::value,
614  "iterator value_type does not convert to device_vector value_type" );
615  static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );
616 
617  if ( m_Size == 0 )
618  {
619  m_devMemory=NULL;
620  return;
621  }
622  // We want to use the context from the passed-in command queue to initialize our buffer
623  cl_int l_Error = CL_SUCCESS;
624  ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
625  V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );
626 
627  if( m_Flags & CL_MEM_USE_HOST_PTR )
628  {
629  m_devMemory = ::cl::Buffer( l_Context, m_Flags, m_Size * sizeof( value_type ),
630  reinterpret_cast< value_type* >( const_cast< value_type* >( &*begin ) ) );
631  }
632  else
633  {
634  m_devMemory = ::cl::Buffer( l_Context, m_Flags, m_Size * sizeof( value_type ) );
635 
636  if( init )
637  {
638  size_t byteSize = m_Size * sizeof( value_type );
639 
640  // Note: The Copy API doesn't work because it uses the concept of a 'default' accelerator
641  // ::cl::copy( begin, begin+m_Size, m_devMemory );
642  naked_pointer pointer = static_cast< naked_pointer >( m_commQueue.enqueueMapBuffer(
643  m_devMemory, CL_TRUE, CL_MEM_WRITE_ONLY, 0, byteSize, 0, 0, &l_Error) );
644  V_OPENCL( l_Error, "enqueueMapBuffer failed in device_vector constructor" );
645 #if defined( _WIN32 )
646  std::copy( begin, begin + m_Size, stdext::checked_array_iterator< naked_pointer >( pointer, m_Size ) );
647 #else
648  std::copy( begin, begin + m_Size, pointer );
649 #endif
650  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, pointer, 0, 0 );
651  V_OPENCL( l_Error, "enqueueUnmapMemObject failed in device_vector constructor" );
652  }
653  }
654  };
655 
663  template< typename InputIterator >
664  device_vector( const InputIterator begin, const InputIterator end, cl_mem_flags flags = CL_MEM_READ_WRITE, const control& ctl = control::getDefault( ),
665  typename std::enable_if< !std::is_integral< InputIterator >::value >::type* = 0 ): m_commQueue( ctl.getCommandQueue( ) ), m_Flags( flags )
666  {
667  static_assert( std::is_convertible< value_type, typename std::iterator_traits< InputIterator >::value_type >::value,
668  "iterator value_type does not convert to device_vector value_type" );
669  static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );
670 
671  // We want to use the context from the passed-in command queue to initialize our buffer
672  cl_int l_Error = CL_SUCCESS;
673  ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
674  V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );
675 
676  m_Size = std::distance( begin, end );
677  if ( m_Size == 0 )
678  {
679  m_devMemory=NULL;
680  return;
681  }
682  size_t byteSize = m_Size * sizeof( value_type );
683 
684  if( m_Flags & CL_MEM_USE_HOST_PTR )
685  {
686  m_devMemory = ::cl::Buffer( l_Context, m_Flags, byteSize,
687  reinterpret_cast< value_type* >( const_cast< value_type* >( &*begin ) ) );
688  }
689  else
690  {
691  m_devMemory = ::cl::Buffer( l_Context, m_Flags, byteSize );
692 
693  // Note: The Copy API doesn't work because it uses the concept of a 'default' accelerator
694  //::cl::copy( begin, end, m_devMemory );
695  naked_pointer pointer = static_cast< naked_pointer >( m_commQueue.enqueueMapBuffer(
696  m_devMemory, CL_TRUE, CL_MEM_WRITE_ONLY, 0, byteSize, 0, 0, &l_Error) );
697  V_OPENCL( l_Error, "enqueueMapBuffer failed in device_vector constructor" );
698 #if defined( _WIN32 )
699  std::copy( begin, end, stdext::checked_array_iterator< naked_pointer >( pointer, m_Size ) );
700 #else
701  std::copy( begin, end, pointer );
702 #endif
703  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, pointer, 0, 0 );
704  V_OPENCL( l_Error, "enqueueUnmapMemObject failed in device_vector constructor" );
705  }
706  };
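  // Example ( editor's sketch, not part of the original source ): constructing from a
  // host iterator range copies std::distance( begin, end ) elements into the buffer:
  //
  //   std::vector< float > host( 256, 2.5f );  // hypothetical host data
  //   bolt::cl::device_vector< float > dv( host.begin( ), host.end( ) );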
707 
712  device_vector( const ::cl::Buffer& rhs, const control& ctl = control::getDefault( ) ): m_devMemory( rhs ), m_commQueue( ctl.getCommandQueue( ) )
713  {
714  static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );
715 
716  m_Size = capacity( );
717 
718  cl_int l_Error = CL_SUCCESS;
719  m_Flags = m_devMemory.getInfo< CL_MEM_FLAGS >( &l_Error );
720  V_OPENCL( l_Error, "device_vector failed to query for the memory flags of the ::cl::Buffer object" );
721  };
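  // Example ( editor's sketch, not part of the original source ): wrapping an existing
  // ::cl::Buffer; size( ) is deduced from CL_MEM_SIZE and the flags from CL_MEM_FLAGS:
  //
  //   ::cl::Buffer buf( myContext, CL_MEM_READ_WRITE, 64 * sizeof( cl_int ) );  // hypothetical context
  //   bolt::cl::device_vector< cl_int > dv( buf );  // size( ) == 64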
722 
723  // Copying methods
724  device_vector( const device_vector& rhs ): m_Flags( rhs.m_Flags ), m_Size( 0 ), m_commQueue( rhs.m_commQueue )
725  {
726  // This method will set the m_Size member variable upon successful completion
727  resize( rhs.m_Size );
728 
729  if( m_Size == 0 )
730  return;
731 
732  size_type l_srcSize = m_Size * sizeof( value_type );
733  ::cl::Event copyEvent;
734 
735  cl_int l_Error = CL_SUCCESS;
736  l_Error = m_commQueue.enqueueCopyBuffer( rhs.m_devMemory, m_devMemory, 0, 0, l_srcSize, NULL, &copyEvent );
737  V_OPENCL( l_Error, "device_vector failed to copy data inside of operator=()" );
738  V_OPENCL( copyEvent.wait( ), "device_vector failed to wait for copy event" );
739  }
740 
741  device_vector& operator=( const device_vector& rhs )
742  {
743  if( this == &rhs )
744  return *this;
745 
746  m_Flags = rhs.m_Flags;
747  m_commQueue = rhs.m_commQueue;
748  m_Size = capacity( );
749 
750  // This method will set the m_Size member variable upon successful completion
751  resize( rhs.m_Size );
752 
753  if( m_Size == 0 )
754  return *this;
755 
756  size_type l_srcSize = m_Size * sizeof( value_type );
757  ::cl::Event copyEvent;
758 
759  cl_int l_Error = CL_SUCCESS;
760  l_Error = m_commQueue.enqueueCopyBuffer( rhs.m_devMemory, m_devMemory, 0, 0, l_srcSize, NULL, &copyEvent );
761  V_OPENCL( l_Error, "device_vector failed to copy data inside of operator=()" );
762  V_OPENCL( copyEvent.wait( ), "device_vector failed to wait for copy event" );
763 
764  return *this;
765  }
766 
767  // Member functions
768 
781  void resize( size_type reqSize, const value_type& val = value_type( ) )
782  {
783  if( (m_Flags & CL_MEM_USE_HOST_PTR) != 0 )
784  {
785  throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE ,
786  "A device_vector can not resize() memory not under its direct control" );
787  }
788 
789  size_type cap = capacity( );
790 
791  if( reqSize == cap )
792  return;
793 
794  if( reqSize > max_size( ) )
795  throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE ,
796  "The amount of memory requested exceeds what is available" );
797 
798  cl_int l_Error = CL_SUCCESS;
799 
800  ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
801  V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::Buffer object" );
802 
803  size_type l_reqSize = reqSize * sizeof( value_type );
804  ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, l_reqSize, NULL, &l_Error );
805 
806  size_type l_srcSize = m_Size * sizeof( value_type );
807 
808  if( l_srcSize > 0 )
809  {
810  // If the new buffer size is greater than the old, the new elements must be initialized to the value specified in the
811  // function parameter
812  if( l_reqSize > l_srcSize )
813  {
814  std::vector< ::cl::Event > copyEvent( 1 );
815  l_Error = m_commQueue.enqueueCopyBuffer( m_devMemory,
816  l_tmpBuffer,
817  0,
818  0,
819  l_srcSize,
820  NULL,
821  &copyEvent.front( ) );
822  V_OPENCL( l_Error, "device_vector failed to copy data to the new ::cl::Buffer object" );
823  ::cl::Event fillEvent;
824 
825  size_t sizeDS = sizeof(value_type);
826  if( !( sizeDS & (sizeDS - 1 ) ) ) // 2^n data types
827  {
828  l_Error = m_commQueue.enqueueFillBuffer< value_type >( l_tmpBuffer,
829  val,
830  l_srcSize,
831  (l_reqSize - l_srcSize),
832  &copyEvent,
833  &fillEvent );
834  V_OPENCL( l_Error, "device_vector failed to fill the new data with the provided pattern" );
835  // Not allowed to return until the copy operation is finished
836  }
837  else // non 2^n data types
838  {
839  // Map the buffer to host
840  ::cl::Event fill_mapEvent;
841  value_type *host_buffer = ( value_type* )m_commQueue.enqueueMapBuffer (
842  l_tmpBuffer,
843  false,
844  CL_MAP_READ | CL_MAP_WRITE,
845  l_srcSize,
846  (l_reqSize - l_srcSize),
847  NULL,
848  &fill_mapEvent,
849  &l_Error );
850 
851  V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
852  fill_mapEvent.wait( );
853 
854  // Use serial fill_n to fill the device_vector with value
855 #if defined(_WIN32)
856  std::fill_n( stdext::make_checked_array_iterator( host_buffer, (reqSize - m_Size) ),
857  (reqSize - m_Size),
858  val );
859 #else
860  std::fill_n( host_buffer,
861  (reqSize - m_Size),
862  val );
863 #endif
864 
865 
866  // Unmap the buffer
867  l_Error = m_commQueue.enqueueUnmapMemObject( l_tmpBuffer,
868  host_buffer,
869  NULL,
870  &fillEvent );
871  V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
872  }
873 
874  l_Error = fillEvent.wait( );
875  V_OPENCL( l_Error, "device_vector failed to wait for fill event" );
876  }
877  else
878  {
879  std::vector< ::cl::Event > copyEvent( 1 );
880  l_Error = m_commQueue.enqueueCopyBuffer( m_devMemory, l_tmpBuffer, 0, 0, l_reqSize, NULL, &copyEvent.front( ) );
881  V_OPENCL( l_Error, "device_vector failed to copy data to the new ::cl::Buffer object" );
882  // Not allowed to return until the copy operation is finished
883  l_Error = m_commQueue.enqueueWaitForEvents( copyEvent );
884  V_OPENCL( l_Error, "device_vector failed to wait for copy event" );
885  }
886  }
887  else
888  {
889  ::cl::Event fillEvent;
890  size_t sizeDS = sizeof(value_type);
891  if( !( sizeDS & (sizeDS - 1 ) ) ) // 2^n data types
892  {
893  l_Error = m_commQueue.enqueueFillBuffer< value_type >( l_tmpBuffer, val, 0, l_reqSize, NULL, &fillEvent );
894  V_OPENCL( l_Error, "device_vector failed to fill the new data with the provided pattern" );
895 
896  }
897  else // non 2^n data types
898  {
899  // Map the buffer to host
900  ::cl::Event fill_mapEvent;
901  value_type *host_buffer = ( value_type* )m_commQueue.enqueueMapBuffer (
902  l_tmpBuffer,
903  false,
904  CL_MAP_READ | CL_MAP_WRITE,
905  0,
906  l_reqSize,
907  NULL,
908  &fill_mapEvent,
909  &l_Error );
910 
911  V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
912  fill_mapEvent.wait( );
913 
914  // Use serial fill_n to fill the device_vector with value
915 #if defined(_WIN32)
916  std::fill_n( stdext::make_checked_array_iterator( host_buffer , reqSize ),
917  reqSize,
918  val );
919 #else
920  std::fill_n( host_buffer,
921  reqSize,
922  val );
923 #endif
924 
925  // Unmap the buffer
926  l_Error = m_commQueue.enqueueUnmapMemObject( l_tmpBuffer,
927  host_buffer,
928  NULL,
929  &fillEvent );
930  V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
931  }
932 
933  // Not allowed to return until the fill operation is finished
934  l_Error = fillEvent.wait( );
935  V_OPENCL( l_Error, "device_vector failed to wait for fill event" );
936  }
937 
938  // Remember the new size
939  m_Size = reqSize;
940 
941  // Operator= should call retain/release appropriately
942  m_devMemory = l_tmpBuffer;
943  }
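  // Example ( editor's sketch, not part of the original source ): resize( ) allocates a
  // new buffer, copies the old contents, and fills any new tail elements with val:
  //
  //   bolt::cl::device_vector< cl_int > dv( 4, 1 );  // { 1, 1, 1, 1 }
  //   dv.resize( 6, 9 );                             // { 1, 1, 1, 1, 9, 9 }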
944 
949  size_type size( void ) const
950  {
951  return m_Size;
952  }
953 
957  size_type max_size( void ) const
958  {
959  cl_int l_Error = CL_SUCCESS;
960 
961  ::cl::Device l_Device = m_commQueue.getInfo< CL_QUEUE_DEVICE >( &l_Error );
962  V_OPENCL( l_Error, "device_vector failed to query for the device of the command queue" );
963 
964  cl_ulong l_MaxSize = l_Device.getInfo< CL_DEVICE_MAX_MEM_ALLOC_SIZE >( &l_Error );
965  V_OPENCL( l_Error, "device_vector failed to query device for the maximum memory size" );
966 
967  return static_cast< size_type >( l_MaxSize / sizeof( value_type ) );
968  }
969 
979  void reserve( size_type reqSize )
980  {
981  if( reqSize <= capacity( ) )
982  return;
983 
984  if( reqSize > max_size( ) )
985  throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE , "The amount of memory requested exceeds what is available" );
986 
987  // We want to use the context from the passed-in command queue to initialize our buffer
988  cl_int l_Error = CL_SUCCESS;
989  ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
990  V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );
991 
992  if( m_Size == 0 )
993  {
994  ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, reqSize * sizeof( value_type ) );
995  m_devMemory = l_tmpBuffer;
996  return;
997  }
998 
999  size_type l_size = reqSize * sizeof( value_type );
1000  // Can't use host_ptr because l_size is guaranteed to be bigger
1001  ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, l_size, NULL, &l_Error );
1002  V_OPENCL( l_Error, "device_vector cannot create a temporary internal OpenCL buffer" );
1003 
1004  size_type l_srcSize = m_devMemory.getInfo< CL_MEM_SIZE >( &l_Error );
1005  V_OPENCL( l_Error, "device_vector failed to request the size of the ::cl::Buffer object" );
1006 
1007  ::cl::Event copyEvent;
1008  V_OPENCL( m_commQueue.enqueueCopyBuffer( m_devMemory, l_tmpBuffer, 0, 0, l_srcSize, NULL, &copyEvent ),
1009  "device_vector failed to copy from buffer to buffer " );
1010 
1011  // Not allowed to return until the copy operation is finished
1012  V_OPENCL( copyEvent.wait( ), "device_vector failed to wait on an event object" );
1013 
1014  // Operator= should call retain/release appropriately
1015  m_devMemory = l_tmpBuffer;
1016  }
1017 
1023  size_type capacity( void ) const
1024  {
1025  size_t l_memSize = 0;
1026  cl_int l_Error = CL_SUCCESS;
1027 
1028  // Note: an early return for m_Size == 0 would be a bug; pop_back can empty the vector while the buffer keeps its capacity:
1029  // if( m_Size == 0 )
1030  // return m_Size;
1031  if(m_devMemory() == NULL)
1032  return 0;
1033 
1034  l_memSize = m_devMemory.getInfo< CL_MEM_SIZE >( &l_Error );
1035  V_OPENCL( l_Error, "device_vector failed to request the size of the ::cl::Buffer object" );
1036  return static_cast< size_type >( l_memSize / sizeof( value_type ) );
1037 
1038  }
1039 
1046  void shrink_to_fit( )
1047  {
1048  if( m_Size > capacity( ) )
1049  throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE , "device_vector size cannot be greater than capacity( )" );
1050 
1051  if( m_Size == capacity( ) )
1052  return;
1053 
1054  // We want to use the context from the passed-in command queue to initialize our buffer
1055  cl_int l_Error = CL_SUCCESS;
1056  ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
1057  V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );
1058 
1059  size_type l_newSize = m_Size * sizeof( value_type );
1060  ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, l_newSize, NULL, &l_Error );
1061  V_OPENCL( l_Error, "device_vector cannot create a temporary internal OpenCL buffer" );
1062 
1063  //TODO - this is equal to the capacity()
1064  size_type l_srcSize = m_devMemory.getInfo< CL_MEM_SIZE >( &l_Error );
1065  V_OPENCL( l_Error, "device_vector failed to request the size of the ::cl::Buffer object" );
1066 
1067  std::vector< ::cl::Event > copyEvent( 1 );
1068  l_Error = m_commQueue.enqueueCopyBuffer( m_devMemory, l_tmpBuffer, 0, 0, l_newSize, NULL, &copyEvent.front( ) );
1069  V_OPENCL( l_Error, "device_vector failed to copy data to the new ::cl::Buffer object" );
1070 
1071  // Not allowed to return until the copy operation is finished
1072  l_Error = m_commQueue.enqueueWaitForEvents( copyEvent );
1073  V_OPENCL( l_Error, "device_vector failed to wait for copy event" );
1074 
1075  // Operator= should call retain/release appropriately
1076  m_devMemory = l_tmpBuffer;
1077  }
1078 
1082  reference operator[]( size_type n )
1083  {
1084 
1085  return reference( *this, n );
1086  }
1087 
1091  const_reference operator[]( size_type n ) const
1092  {
1093  cl_int l_Error = CL_SUCCESS;
1094 
1095  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ, n * sizeof( value_type), sizeof( value_type), NULL, NULL, &l_Error ) );
1096  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );
1097 
1098  const_reference tmpRef = *ptrBuff;
1099 
1100  ::cl::Event unmapEvent;
1101  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
1102  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1103  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1104 
1105  return tmpRef;
1106  }
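  // Performance note ( editor's addition ): each operator[] call maps and unmaps a single
  // element synchronously; for bulk host-side access, one map via data( ) is cheaper
  // ( see the data( ) example further below ).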
1107 
1111  iterator begin( void )
1112  {
1113  return iterator( *this, 0 );
1114  }
1115 
1120  const_iterator begin( void ) const
1121  {
1122  return const_iterator( *this, 0 );
1123  }
1124 
1130  const_iterator cbegin( void ) const
1131  {
1132  return const_iterator( *this, 0 );
1133  }
1134 
1138  reverse_iterator rbegin( void )
1139  {
1140  //static_assert( false, "Reverse iterators are not yet implemented" );
1141  return reverse_iterator( *this, m_Size );
1142  }
1143 
1149  const_reverse_iterator rbegin( void ) const
1150  {
1151  //static_assert( false, "Reverse iterators are not yet implemented" );
1152  return const_reverse_iterator( *this, m_Size );
1153  }
1154 
1161  const_reverse_iterator crbegin( void ) const
1162  {
1163  //static_assert( false, "Reverse iterators are not yet implemented" );
1164  return const_reverse_iterator( *this, m_Size );
1165  }
1166 
1170  iterator end( void )
1171  {
1172  return iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
1173  }
1174 
1179  const_iterator end( void ) const
1180  {
1181  return const_iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
1182  }
1183 
1189  const_iterator cend( void ) const
1190  {
1191  return const_iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
1192  }
1193 
1198  reverse_iterator rend( void )
1199  {
1200  return reverse_iterator( *this, 0 );
1201  }
1202 
1208  const_reverse_iterator rend( void ) const
1209  {
1210  //static_assert( false, "Reverse iterators are not yet implemented" );
1211  return const_reverse_iterator( *this, 0 );
1212  }
1213 
1220  const_reverse_iterator crend( void ) const
1221  {
1222  return const_reverse_iterator( *this, 0 );
1223  }
1224 
1228  reference front( void )
1229  {
1230  return (*begin());
1231  }
1232 
1236  const_reference front( void ) const
1237  {
1238  return (*begin());
1239  }
1240 
1244  reference back( void )
1245  {
1246  return ( *(end() - 1) );
1247  }
1248 
1252  const_reference back( void ) const
1253  {
1254  return ( *(end() - 1) );
1255  }
1256 
1257  pointer data( void )
1258  {
1259  if(0 == size())
1260  {
1261  pointer sp;
1262  return sp;
1263  }
1264  cl_int l_Error = CL_SUCCESS;
1265 
1266  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
1267  0, capacity() * sizeof( value_type ), NULL, NULL, &l_Error ) );
1268 
1269  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for data( )" );
1270 
1271  pointer sp( ptrBuff, UnMapBufferFunctor< device_vector< value_type > >( *this ) );
1272 
1273  return sp;
1274  }
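  // Example ( editor's sketch, not part of the original source ): data( ) maps the whole
  // buffer once and returns a boost::shared_array whose deleter unmaps it when the last
  // reference is released:
  //
  //   bolt::cl::device_vector< cl_int > dv( 16, 0 );    // hypothetical
  //   {
  //       boost::shared_array< cl_int > p = dv.data( ); // single map of all 16 elements
  //       p[ 5 ] = 42;                                  // plain host read/write
  //   }                                                 // shared_array destroyed: buffer unmapped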
1275 
1276  const_pointer data( void ) const
1277  {
1278  cl_int l_Error = CL_SUCCESS;
1279 
1280  const_naked_pointer ptrBuff = reinterpret_cast< const_naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ,
1281  0, capacity() * sizeof( value_type ), NULL, NULL, &l_Error ) );
1282  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for data( )" );
1283 
1284  const_pointer sp( ptrBuff, UnMapBufferFunctor< const device_vector< value_type > >( *this ) );
1285  return sp;
1286  }
1287 
1291  void clear( void )
1292  {
1293  // Only way to release the Buffer resource is to explicitly call the destructor
1294  // m_devMemory.~Buffer( );
1295 
1296  // Allocate a temp empty buffer on the stack, because of a double release problem with explicitly
1297  // calling the Wrapper destructor with cl.hpp version 1.2.
1298  ::cl::Buffer tmp;
1299  m_devMemory = tmp;
1300 
1301  m_Size = 0;
1302  }
1303 
1307  bool empty( void ) const
1308  {
1309  return ( m_Size == 0 );
1310  }
1311 
1315  void push_back( const value_type& value )
1316  {
1317  if( m_Size > capacity( ) )
1318  throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE , "device_vector size cannot be greater than capacity( )" );
1319 
1320  // Need to grow the vector to push new value.
1321  // Vectors double their capacity on push_back if the array is not big enough.
1322  if( m_Size == capacity( ) )
1323  {
1324  m_Size ? reserve( m_Size * 2 ) : reserve( 1 );
1325  }
1326 
1327  cl_int l_Error = CL_SUCCESS;
1328 
1329  naked_pointer result = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_WRITE_INVALIDATE_REGION,
1330  m_Size * sizeof( value_type), sizeof( value_type ), NULL, NULL, &l_Error ) );
1331  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for push_back" );
1332  *result = value;
1333 
1334  ::cl::Event unmapEvent;
1335  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, result, NULL, &unmapEvent );
1336  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1337  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1338 
1339  ++m_Size;
1340  }
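  // Example ( editor's sketch, not part of the original source ): push_back doubles the
  // capacity when the buffer is full, then maps only the new element for writing:
  //
  //   bolt::cl::device_vector< cl_int > dv;  // hypothetical, size( ) == 0
  //   for( cl_int i = 0; i < 3; ++i )
  //       dv.push_back( i );                 // capacity grows 1, 2, 4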
1341 
1344  void pop_back( void )
1345  {
1346  if( m_Size > 0 )
1347  {
1348  --m_Size;
1349  }
1350  }
1351 
1355  void swap( device_vector& vec )
1356  {
1357  if( this == &vec )
1358  return;
1359 
1360  ::cl::Buffer swapBuffer( m_devMemory );
1361  m_devMemory = vec.m_devMemory;
1362  vec.m_devMemory = swapBuffer;
1363 
1364  ::cl::CommandQueue swapQueue( m_commQueue );
1365  m_commQueue = vec.m_commQueue;
1366  vec.m_commQueue = swapQueue;
1367 
1368  size_type sizeTmp = m_Size;
1369  m_Size = vec.m_Size;
1370  vec.m_Size = sizeTmp;
1371 
1372  cl_mem_flags flagsTmp = m_Flags;
1373  m_Flags = vec.m_Flags;
1374  vec.m_Flags = flagsTmp;
1375  }
1376 
1381  iterator erase( const_iterator index )
1382  {
1383  if( &index.m_Container != this )
1384  throw ::cl::Error( CL_INVALID_ARG_VALUE , "Iterator is not from this container" );
1385 
1386  iterator l_End = end( );
1387  if( index.m_Index >= l_End.m_Index )
1388  throw ::cl::Error( CL_INVALID_ARG_INDEX , "Iterator is pointing past the end of this container" );
1389 
1390  size_type sizeRegion = l_End.m_Index - index.m_Index;
1391 
1392  cl_int l_Error = CL_SUCCESS;
1393  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
1394  index.m_Index * sizeof( value_type ), sizeRegion * sizeof( value_type ), NULL, NULL, &l_Error ) );
1395  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for erase" );
1396 
1397  ::memmove( ptrBuff, ptrBuff + 1, (sizeRegion - 1)*sizeof( value_type ) );
1398 
1399  ::cl::Event unmapEvent;
1400  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
1401  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1402  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1403 
1404  --m_Size;
1405 
1406  size_type newIndex = (m_Size < index.m_Index) ? m_Size : index.m_Index;
1407  return iterator( *this, static_cast< difference_type >( newIndex ) );
1408  }
1409 
1415  iterator erase( const_iterator first, const_iterator last )
1416  {
1417  if(( &first.m_Container != this ) || ( &last.m_Container != this ) )
1418  throw ::cl::Error( CL_INVALID_ARG_VALUE , "Iterator is not from this container" );
1419 
1420  if( last.m_Index > m_Size )
1421  throw ::cl::Error( CL_INVALID_ARG_INDEX , "Iterator is pointing past the end of this container" );
1422 
1423  if( (first == begin( )) && (last == end( )) )
1424  {
1425  clear( );
1426  return iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
1427  }
1428 
1429  iterator l_End = end( );
1430  size_type sizeMap = l_End.m_Index - first.m_Index;
1431 
1432  cl_int l_Error = CL_SUCCESS;
1433  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
1434  first.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
1435  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for erase" );
1436 
1437  size_type sizeErase = last.m_Index - first.m_Index;
1438  ::memmove( ptrBuff, ptrBuff + sizeErase, (sizeMap - sizeErase)*sizeof( value_type ) );
1439 
1440  ::cl::Event unmapEvent;
1441  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
1442  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1443  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1444 
1445  m_Size -= sizeErase;
1446 
1447  size_type newIndex = (m_Size < last.m_Index) ? m_Size : last.m_Index;
1448  return iterator( *this, static_cast< typename iterator::difference_type >( newIndex ) );
1449  }
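  // Example ( editor's sketch, not part of the original source ): erase shifts the tail
  // toward the front with memmove through a mapped pointer and shrinks size( ), leaving
  // capacity( ) unchanged:
  //
  //   // assuming dv holds { 0, 1, 2, 3, 4 }
  //   dv.erase( dv.begin( ) + 1, dv.begin( ) + 3 );  // dv now holds { 0, 3, 4 }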
1450 
1458  iterator insert( const_iterator index, const value_type& value )
1459  {
1460  if( &index.m_Container != this )
1461  throw ::cl::Error( CL_INVALID_ARG_VALUE , "Iterator is not from this container" );
1462 
1463  if( index.m_Index > m_Size )
1464  throw ::cl::Error( CL_INVALID_ARG_INDEX , "Iterator is pointing past the end of this container" );
1465 
1466  if( index.m_Index == m_Size )
1467  {
1468  push_back( value );
1469  return iterator( *this, index.m_Index );
1470  }
1471 
1472  // Need to grow the vector to insert a new value.
1473  // TODO: What is an appropriate growth strategy for GPU memory allocation? Exponential growth does not seem
1474  // right at first blush.
1475  if( m_Size == capacity( ) )
1476  {
1477  reserve( m_Size + 10 );
1478  }
1479 
1480  size_type sizeMap = (m_Size - index.m_Index) + 1;
1481 
1482  cl_int l_Error = CL_SUCCESS;
1483  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
1484  index.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
1485  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for insert" );
1486 
1487  // Shift the old values 1 element toward the end
1488  ::memmove( ptrBuff + 1, ptrBuff, (sizeMap - 1)*sizeof( value_type ) );
1489 
1490  // Write the new value in its place
1491  *ptrBuff = value;
1492 
1493  ::cl::Event unmapEvent;
1494  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
1495  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1496  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1497 
1498  ++m_Size;
1499 
1500  return iterator( *this, index.m_Index );
1501  }
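  // Example ( editor's sketch, not part of the original source ): single-element insert
  // shifts the tail one slot toward the end and writes the new value in place:
  //
  //   // assuming dv holds { 0, 1, 2 }
  //   dv.insert( dv.begin( ) + 1, 9 );  // dv now holds { 0, 9, 1, 2 }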
1502 
1510  void insert( const_iterator index, size_type n, const value_type& value )
1511  {
1512  if( &index.m_Container != this )
1513  throw ::cl::Error( CL_INVALID_ARG_VALUE , "Iterator is not from this container" );
1514 
1515  if( index.m_Index > m_Size )
1516  throw ::cl::Error( CL_INVALID_ARG_INDEX , "Iterator is pointing past the end of this container" );
1517 
1518  // Need to grow the vector to insert a new value.
1519  // TODO: What is an appropriate growth strategy for GPU memory allocation? Exponential growth does not seem
1520  // right at first blush.
1521  if( ( m_Size + n ) > capacity( ) )
1522  {
1523  reserve( m_Size + n );
1524  }
1525 
1526  size_type sizeMap = (m_Size - index.m_Index) + n;
1527 
1528  cl_int l_Error = CL_SUCCESS;
1529  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
1530  index.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
1531  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for insert" );
1532 
1533  // Shift the old values n elements toward the end.
1534  ::memmove( ptrBuff + n, ptrBuff, (sizeMap - n)*sizeof( value_type ) );
1535 
1536  // Copy the new value n times in the buffer.
1537  for( size_type i = 0; i < n; ++i )
1538  {
1539  ptrBuff[ i ] = value;
1540  }
1541 
1542  ::cl::Event unmapEvent;
1543  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
1544  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1545  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1546 
1547  m_Size += n;
1548  }
1549 
1550  template< typename InputIterator >
1551  void insert( const_iterator index, InputIterator begin, InputIterator end )
1552  {
1553  if( &index.m_Container != this )
1554  throw ::cl::Error( CL_INVALID_ARG_VALUE , "Iterator is not from this container" );
1555 
1556  if( index.m_Index > m_Size )
1557  throw ::cl::Error( CL_INVALID_ARG_INDEX , "Iterator is pointing past the end of this container" );
1558 
1559  // Need to grow the vector to insert a new value.
1560  // TODO: What is an appropriate growth strategy for GPU memory allocation? Exponential growth does not seem
1561  // right at first blush.
1562  size_type n = std::distance( begin, end );
1563  if( ( m_Size + n ) > capacity( ) )
1564  {
1565  reserve( m_Size + n );
1566  }
1567  size_type sizeMap = (m_Size - index.m_Index) + n;
1568 
1569  cl_int l_Error = CL_SUCCESS;
1570  naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
1571  index.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
1572  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for iterator insert" );
1573 
1574  // Shift the old values n elements toward the end.
1575  ::memmove( ptrBuff + n, ptrBuff, (sizeMap - n)*sizeof( value_type ) );
1576 
1577 #if defined( _WIN32 )
1578  std::copy( begin, end, stdext::checked_array_iterator< naked_pointer >( ptrBuff, n ) );
1579 #else
1580  std::copy( begin, end, ptrBuff );
1581 #endif
1582 
1583  ::cl::Event unmapEvent;
1584  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
1585  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1586  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1587 
1588  m_Size += n;
1589  }
1590 
1598  void assign( size_type newSize, const value_type& value )
1599  {
1600  if( newSize > m_Size )
1601  {
1602  reserve( newSize );
1603  }
1604  m_Size = newSize;
1605 
1606  cl_int l_Error = CL_SUCCESS;
1607 
1608  ::cl::Event fillEvent;
1609  size_t sizeDS = sizeof(value_type);
1610 
1611  if( !( sizeDS & (sizeDS - 1 ) ) ) // 2^n data types
1612  {
1613  l_Error = m_commQueue.enqueueFillBuffer< value_type >( m_devMemory,
1614  value,
1615  0,
1616  m_Size * sizeof( value_type ),
1617  NULL,
1618  &fillEvent );
1619  V_OPENCL( l_Error, "device_vector failed to fill the new data with the provided pattern" );
1620  }
1621  else
1622  {
1623  // Map the buffer to host
1624  ::cl::Event fill_mapEvent;
1625  value_type *host_buffer = ( value_type* )m_commQueue.enqueueMapBuffer ( m_devMemory,
1626  false,
1627  CL_MAP_READ | CL_MAP_WRITE,
1628  0,
1629  sizeof( value_type )*newSize,
1630  NULL,
1631  &fill_mapEvent,
1632  &l_Error );
1633 
1634  V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
1635  fill_mapEvent.wait( );
1636 
1637  // Use serial fill_n to fill the device_vector with value
1638  std::fill_n( host_buffer ,
1639  newSize,
1640  value );
1641 
1642  // Unmap the buffer
1643  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory,
1644  host_buffer,
1645  NULL,
1646  &fillEvent );
1647  V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
1648  }
1649 
1650  // Not allowed to return until the copy operation is finished.
1651  l_Error = fillEvent.wait( );
1652  V_OPENCL( l_Error, "device_vector failed to wait for fill event" );
1653  }
1654 
1660 #if _MSC_VER == 1700
1661  template<typename InputIterator>
1662  typename std::enable_if< std::_Is_iterator<InputIterator>::value, void>::type
1663  assign( InputIterator begin, InputIterator end )
1664 #else
1665  template<typename InputIterator>
1666  typename std::enable_if< !std::is_same< typename std::iterator_traits<InputIterator >::value_type,
1667  size_type >::value, void>::type
1668  assign( InputIterator begin, InputIterator end )
1669 #endif
1670  {
1671  size_type l_Count = std::distance( begin, end );
1672 
1673  if( l_Count > m_Size )
1674  {
1675  reserve( l_Count );
1676  }
1677  m_Size = l_Count;
1678 
1679  cl_int l_Error = CL_SUCCESS;
1680 
1681  naked_pointer ptrBuffer = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0 , m_Size * sizeof( value_type ), NULL, NULL, &l_Error ) );
1682  V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for assign" );
1683 
1684 #if defined( _WIN32 )
1685  std::copy( begin, end, stdext::checked_array_iterator< naked_pointer >( ptrBuffer, m_Size ) );
1686 #else
1687  std::copy( begin, end, ptrBuffer );
1688 #endif
1689  ::cl::Event unmapEvent;
1690  l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuffer, NULL, &unmapEvent );
1691  V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
1692  V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
1693  }
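  // Example ( editor's sketch, not part of the original source ): assign( ) replaces the
  // entire contents, reserving more space first if the source range is larger:
  //
  //   std::vector< cl_int > src( 100, 5 );     // hypothetical host data
  //   dv.assign( src.begin( ), src.end( ) );   // dv.size( ) == 100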
1694 
1695 
1702  const ::cl::Buffer& getBuffer( ) const
1703  {
1704  return m_devMemory;
1705  }
1706 
1713  ::cl::Buffer& getBuffer( )
1714  {
1715  return m_devMemory;
1716  }
1717 
1718  private:
1719  ::cl::Buffer m_devMemory;
1720  ::cl::CommandQueue m_commQueue;
1721  size_type m_Size;
1722  cl_mem_flags m_Flags;
1723  };
1724 
1725  // This string represents the device-side definition of the device_vector iterator template
1726  static std::string deviceVectorIteratorTemplate = STRINGIFY_CODE(
1727  namespace bolt { namespace cl { \n
1728  template< typename T > \n
1729  class device_vector \n
1730  { \n
1731  public: \n
1732  class iterator \n
1733  { \n
1734  public:
1735  typedef int iterator_category; // device code does not understand std:: tags \n
1736  typedef T value_type; \n
1737  typedef int difference_type; \n
1738  typedef int size_type; \n
1739  typedef T* pointer; \n
1740  typedef T& reference; \n
1741 
1742  iterator( value_type init ): m_StartIndex( init ), m_Ptr( 0 ) \n
1743  {}; \n
1744 
1745  void init( global value_type* ptr )\n
1746  { \n
1747  m_Ptr = ptr; \n
1748  }; \n
1749 
1750  global value_type& operator[]( size_type threadID ) const \n
1751  { \n
1752  return m_Ptr[ m_StartIndex + threadID ]; \n
1753  } \n
1754 
1755  value_type operator*( ) const \n
1756  { \n
1757  return m_Ptr[ m_StartIndex ]; \n
1758  } \n
1759 
1760  size_type m_StartIndex; \n
1761  global value_type* m_Ptr; \n
1762  }; \n
1763  }; \n
1764  } } \n
1765  );
1766 }
1767 }
1768 
1770 BOLT_CREATE_CLCODE( bolt::cl::device_vector< cl_int >::iterator, bolt::cl::deviceVectorIteratorTemplate );
1771 /*Now derive each of the OpenCL Application data types from cl_int data type.*/
1772 //Visual Studio 2012 is not able to map char to cl_char. Hence this typename is added.
1774 
1784 
1785 #endif