ViennaCL - The Vienna Computing Library  1.5.1
vector_reduction.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_GENERATOR_GENERATE_VECTOR_REDUCTION_HPP
2 #define VIENNACL_GENERATOR_GENERATE_VECTOR_REDUCTION_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
27 #include <vector>
28 
30 
34 
36 
37 #include "viennacl/tools/tools.hpp"
38 
39 namespace viennacl{
40 
41  namespace generator{
42 
45 
46  vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
47  return m_*(k_+1)*scalartype_size;
48  }
49 
50  public:
52  vector_reduction(unsigned int vectorization, unsigned int m, unsigned int k, unsigned int num_groups) : profile_base(vectorization, m, k, 1), m_(m), k_(k), num_groups_(num_groups){ }
53 
54 
55  static std::string csv_format() {
56  return "Vec,M,K,NumGroups";
57  }
58 
59  std::string csv_representation() const{
60  std::ostringstream oss;
61  oss << vector_size_
62  << "," << m_
63  << "," << k_
64  << "," << num_groups_;
65  return oss.str();
66  }
67 
68  unsigned int m() const { return m_; }
69 
70  unsigned int k() const { return k_; }
71 
72  unsigned int num_groups() const { return num_groups_; }
73 
74  void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & kernel, unsigned int & n_arg) const{
75 
76  configure_local_sizes(kernel, kernel_id);
77  kernel.global_work_size(0,m_*num_groups_);
78  kernel.global_work_size(1,k_);
79 
80 
81  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
82  scheduler::statement::container_type exprs = it->first.array();
83  for(scheduler::statement::container_type::iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
85  scheduler::statement_node const * current_node = &(*iit);
86  //The LHS of the prod is a matrix
88  {
89  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
90  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
91  return;
92  }
93  else{
94  //The LHS of the prod is a matrix expression
95  current_node = &exprs[current_node->lhs.node_index];
97  {
98  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
99  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
100  return;
101  }
102  else if(current_node->rhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
103  {
104  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
105  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
106  return;
107  }
108  else{
109  assert(false && bool("unexpected expression tree"));
110  }
111  }
112  return;
113  }
114  }
115  }
116  }
117 
118  void kernel_arguments(statements_type const & /*statements*/, std::string & arguments_string) const{
119  arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
120  arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
121  }
122 
123  private:
124  void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
125 
126  std::vector<detail::mapped_vector_reduction*> exprs;
127  for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it){
128  for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit){
129  if(detail::mapped_vector_reduction * p = dynamic_cast<detail::mapped_vector_reduction*>(iit->second.get()))
130  exprs.push_back(p);
131  if(detail::mapped_matrix * p = dynamic_cast<detail::mapped_matrix*>(iit->second.get()))
132  p->bind_sizes("M","N");
133  }
134  }
135 
136  vcl_size_t lsize1 = m_;
137  vcl_size_t lsize2 = k_+1;
138  std::string scalartype = "float";
139  bool is_lhs_transposed = false;
140  if(exprs.front()->root_node().lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
141  if(exprs.front()->statement().array()[exprs.front()->root_node().lhs.node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
142  is_lhs_transposed = true;
143 
144  std::string size1 = "M", size2 = "N";
145  if(is_lhs_transposed)
146  std::swap(size1, size2);
147 
148  for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
149  stream << "__local " << (*it)->scalartype() << " buf" << std::distance(exprs.begin(), it) << '[' << lsize1*lsize2 << "];" << std::endl;
150  }
151 
152  stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
153  stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
154 
155 
156  stream << "for(unsigned int r = get_global_id(0) ; r < " << size1 << " ; r += get_global_size(0)){" << std::endl;
157  stream.inc_tab();
158 
159  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
160  stream << scalartype << " sum" << k << " = 0;" << std::endl;
161 
162  stream << "for(unsigned int c = get_local_id(1) ; c < " << size2 << " ; c += get_local_size(1)){" << std::endl;
163  stream.inc_tab();
164 
165  std::set<std::string> fetched;
166 
167  for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
168  viennacl::scheduler::statement const & statement = (*it)->statement();
169  viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
170  if(is_lhs_transposed)
171  detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("c", "r"),vector_size_,stream,(*it)->mapping());
172  else
173  detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("r", "c"),vector_size_,stream,(*it)->mapping());
174 
175  detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("c", "0"),vector_size_,stream,(*it)->mapping());
176  }
177 
178 
179  //Update sums;
180  for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
181  viennacl::scheduler::statement const & statement = (*it)->statement();
182  viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
183  std::string str;
184  detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
185  str += "*";
186  detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
187  stream << " sum" << std::distance(exprs.begin(),it) << " += " << str << ";" << std::endl;
188  }
189 
190 
191  stream.dec_tab();
192  stream << "}" << std::endl;
193 
194  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
195  stream << "buf" << k << "[lid0*" << lsize2 << "+ lid1] = sum" << k << ";" << std::endl;
196  }
197 
198  for(unsigned int stride = k_/2 ; stride>1 ; stride /=2){
199  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
200  stream << "if(lid1 < " << stride << ")" ;
201  stream << "{" << std::endl;
202  stream.inc_tab();
203 
204  for(vcl_size_t i = 0 ; i < exprs.size() ; ++i)
205  stream << "buf" << i << "[lid0*" << lsize2 << "+ lid1] += buf" << i << "[lid0*" << lsize2 << "+ lid1 + " << stride << "];" << std::endl;
206 
207  stream.dec_tab();
208  stream << "}" << std::endl;
209  }
210 
211 
212  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
213  stream << "if(lid1 == 0)" ;
214  stream << "{" << std::endl;
215  stream.inc_tab();
216  for(vcl_size_t i = 0 ; i < exprs.size() ; ++i){
217  stream << "buf" << i << "[lid0*" << lsize2 << "] += buf" << i << "[lid0*" << lsize2 << "+ 1];" << std::endl;
218  exprs[i]->access_name("buf"+utils::to_string(i)+"[lid0*"+utils::to_string(lsize2)+"]");
219  }
220  vcl_size_t i = 0;
221  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
222  std::string str;
223  detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("r","0"), -1, str, mapping[i++]), false);
224  stream << str << ";" << std::endl;
225  }
226  stream.dec_tab();
227  stream << "}" << std::endl;
228 
229 
230  stream.dec_tab();
231  stream << "}" << std::endl;
232 
233  }
234 
235  private:
236  unsigned int m_;
237  unsigned int k_;
238  unsigned int num_groups_;
239  };
240  }
241 }
242 
243 #endif
A stream class where the kernel sources are streamed to. Takes care of indentation of the sources...
Definition: utils.hpp:233
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Definition: kernel.hpp:124
std::size_t vcl_size_t
Definition: forwards.h:58
vcl_size_t node_index
Definition: forwards.h:276
statement(container_type const &custom_array)
Definition: forwards.h:454
Internal utils for a dynamic OpenCL kernel generation.
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:59
Base class for an operation profile.
Definition: profile_base.hpp:47
lhs_rhs_element lhs
Definition: forwards.h:422
Various little tools used here and there in ViennaCL.
Definition: forwards.h:176
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
unsigned int k() const
Definition: vector_reduction.hpp:70
Mapping of a matrix to a generator class.
Definition: mapped_objects.hpp:236
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
Functor for obtaining the internal number of columns of a ViennaCL matrix.
Definition: utils.hpp:188
std::list< std::pair< scheduler::statement, scheduler::statement_node > > statements_type
Definition: profile_base.hpp:49
vector_reduction(unsigned int vectorization, unsigned int m, unsigned int k, unsigned int num_groups)
The user constructor.
Definition: vector_reduction.hpp:52
lhs_rhs_element rhs
Definition: forwards.h:424
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
unsigned int m() const
Definition: vector_reduction.hpp:68
unsigned int num_groups() const
Definition: vector_reduction.hpp:72
several code generation helpers
void kernel_arguments(statements_type const &, std::string &arguments_string) const
Definition: vector_reduction.hpp:118
Base classes for the profiles.
Map ViennaCL objects to generator wrappers.
Functor for obtaining the internal number of rows of a ViennaCL matrix.
Definition: utils.hpp:181
void configure_local_sizes(viennacl::ocl::kernel &k, vcl_size_t) const
Definition: profile_base.hpp:59
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
std::vector< value_type > container_type
Definition: forwards.h:452
std::string csv_representation() const
csv representation of an operation
Definition: vector_reduction.hpp:59
OpenCL kernel template for reductions resulting in a vector. Example: Computing the row norms of a ma...
Definition: vector_reduction.hpp:44
unsigned int vector_size_
Definition: profile_base.hpp:178
void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const &statements, viennacl::ocl::kernel &kernel, unsigned int &n_arg) const
Configures the range and enqueues the arguments associated with the profile.
Definition: vector_reduction.hpp:74
std::string to_string(T const t)
Definition: utils.hpp:204
Mapping of a vector reduction (based on matrix-vector product)
Definition: mapped_objects.hpp:109
static std::string csv_format()
Definition: vector_reduction.hpp:55
statement_node_type_family type_family
Definition: forwards.h:269
viennacl::enable_if< viennacl::is_scalar< S1 >::value &&viennacl::is_scalar< S2 >::value >::type swap(S1 &s1, S2 &s2)
Swaps the contents of two scalars, data is copied.
Definition: scalar_operations.hpp:366
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
Definition: forwards.h:447
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:759
Main datastructure for an node in the statement tree.
Definition: forwards.h:420