Bolt  1.1
C++ template library with support for OpenCL
transform_reduce_range.h
Go to the documentation of this file.
1 /***************************************************************************
2 * Copyright 2012 - 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 
16 ***************************************************************************/
17 
22 #if !defined( BOLT_AMP_TRANSFORM_REDUCE_RANGE_H )
23 #define BOLT_AMP_TRANSFORM_REDUCE_RANGE_H
24 
25 #pragma once
26 
#include <vector>

#include <bolt/transform_reduce.h>
28 
29 
30 #ifdef BOLT_POOL_ALLOC
31 #include <bolt/pool_alloc.h>
32 // hacky global var, move to bolt.cpp?
34 #endif
35 
36 namespace bolt {
37  namespace amp {
38 
// Vector width for the alternative explicit-iteration path below (the
// "#if 0" loop in transform_reduce_range is the only user of VW).
#define VW 1
// Intra-tile synchronization placeholder: expands to nothing today (see
// FIXME), so the tree reduction below relies on implicit lockstep execution
// within a wavefront -- presumably waveSize-wide; confirm before widening tiles.
#define BARRIER(W) // FIXME - placeholder for future barrier insertions

// One step of a tree reduction in tile_static memory: lane _IDX folds in the
// partial result of lane _IDX+_W, halving the number of active lanes.
// NOTE: expands against names from the enclosing scope (tiled_data, reduce_op).
#define REDUCE_STEP(_IDX, _W) \
 if (_IDX < _W) tiled_data[_IDX] = reduce_op(tiled_data[_IDX], tiled_data[_IDX+_W]); \
 BARRIER(_W)
45 
46 
47  //=======================
48  // This version takes a start index and extent as the range to iterate.
49  // The tranform_op is called with topLeft, bottomRight, and stride for each section that to be processed by
50  // the function. Function must do iteration over the specified range with specified stride, and also reduce results.
51  // May avoid copies on some compilers and deliver higher performance than the cleaner transform_reduce function,
52  // where transform op only takes a single index point.
53  // Useful for indexing over all points in an image or array
54  template<typename outputT, int Rank, typename UnaryFunction, typename BinaryFunction>
55  outputT transform_reduce_range(concurrency::accelerator_view av,
56  concurrency::index<Rank> origin, concurrency::extent<Rank> ext,
57  UnaryFunction transform_op,
58  outputT init, BinaryFunction reduce_op)
59  {
60  using namespace concurrency;
61 
62  int wgPerComputeUnit = p_wgPerComputeUnit; // remove me.
63  int computeUnits = p_computeUnits;
64  int resultCnt = computeUnits * wgPerComputeUnit;
65 
66  // FIXME: implement a more clever algorithm for setting the shape of the calculation.
67  int globalH = wgPerComputeUnit * localH;
68  int globalW = computeUnits * localW;
69 
70  globalH = (ext[0] < globalH) ? ext[0] : globalH; //FIXME, this is not a multiple of localSize.
71  globalW = (ext[1] < globalW) ? ext[1] : globalW;
72 
73 
74  extent<2> launchExt(globalH, globalW);
75 #ifdef BOLT_POOL_ALLOC
76  bolt::ArrayPool<outputT>::PoolEntry &entry = arrayPool.alloc(av, resultCnt);
77  array<outputT,1> &results1 = *(entry._dBuffer);
78 #else
79  array<outputT,1> results1(resultCnt, av); // Output after reducing through LDS.
80 #endif
81  index<2> bottomRight(origin[0]+ext[0], origin[1]+ext[1]);
82 
83  // FIXME - support actual BARRIER operations.
84  // FIXME - support checks on local memory usage
85  // FIXME - reduce size of work for small problems.
86  concurrency::parallel_for_each(av, launchExt.tile<localH, localW>(), [=,&results1](concurrency::tiled_index<localH, localW> idx) mutable restrict(amp)
87  {
88  tile_static outputT tiled_data[waveSize];
89 
90 #if 1
91  init = reduce_op(init, transform_op(index<Rank>(origin[0]+idx.global[0], origin[1]+idx.global[1]), //top/left
92  bottomRight, // bottomRight
93  launchExt)); //stride
94 #endif
95 
96 #if 0
97  for (int y=origin[0]+idx.global[0]; y<origin[0]+ext[0]; y+=launchExt[0]) {
98  for (int x=origin[1]+idx.global[1]*VW; x<origin[1]+ext[1]; x+=launchExt[1]*VW) {
99  init = reduce_op(init, transform_op(concurrency::index<Rank>(y,x)));
100  };
101  };
102 #endif
103 
104  //---
105  // Reduce through LDS across wavefront:
106  int lx = localW * idx.local[0] + idx.local[1];
107  tiled_data[lx] = init;
108  BARRIER(waveSize);
109 
110  REDUCE_STEP(lx, 32);
111  REDUCE_STEP(lx, 16);
112  REDUCE_STEP(lx, 8);
113  REDUCE_STEP(lx, 4);
114  REDUCE_STEP(lx, 2);
115  REDUCE_STEP(lx, 1);
116 
117  //---
118  // Save result of this tile to global mem
119  if (lx== 0) {
120  results1[idx.tile[0]*computeUnits + idx.tile[1]] = tiled_data[0];
121  };
122 
123  } ); //end parallel_for_each
124  // results1[] now contains intermediate results which need to be combined together.
125 
126  //---
127  //Copy partial array back to host
128  // FIXME - we'd really like to use ZC memory for this final step
129  //std::vector<outputT> h_data(resultCnt);
130  //h_data = results1;
131  concurrency::copy(*entry._dBuffer, *entry._stagingBuffer);
132 
133 
134 
135  outputT finalReduction = init;
136  for (int i=0; i<results1.extent[0]; i++) {
137  finalReduction = reduce_op(finalReduction, (*entry._stagingBuffer)[i]);
138  };
139 
140 #ifdef BOLT_POOL_ALLOC
141  arrayPool.free(entry);
142 #endif
143 
144  return finalReduction;
145 
146  };
147 
148  template<typename outputT, int Rank, typename UnaryFunction, typename BinaryFunction>
149  outputT transform_reduce_range(
150  concurrency::index<Rank> origin, concurrency::extent<Rank> ext,
151  UnaryFunction transform_op,
152  outputT init, BinaryFunction reduce_op)
153  {
154  return transform_reduce_range(concurrency::accelerator().default_view, origin, ext, transform_op, init, reduce_op);
155  };
156 
157  }; // end namespace amp
158 
159 }; // end namespace bolt
160 
161 
162 #endif